Sunday, July 21, 2013

The KMP algorithm for string pattern matching proceeds like this:

//
#include "stdafx.h"
#include <iostream>
#include <cstdio>
#include <cstring>
#include <vector>
using namespace std;
int* PreProcess( string pattern) {
 int patternLength = pattern.length();
 if (patternLength == 0) return 0;
    int * next = new int[patternLength + 1];
    if (next == 0) return 0;
    next[0] = -1;  // set up for loop below; unused by KMP

    int i = 0;
    int j = -1;
    while (i < patternLength) {
  next[i + 1] = next[i] + 1;
  while ( next[i+1] > 0 &&
    pattern[i] != pattern[next[i + 1] - 1])
   next[i + 1] = next[next[i + 1] - 1] + 1;
  i++;
 }
    return next;
}
void KMP(string pattern, string text, vector<int> *positions) {
    int patternLength = pattern.length();
    int textLength = text.length();
    int* next =  PreProcess(pattern);
 if (next == 0) return;
    int i = 0;
    int j = 0;
    while ( j < textLength )
 {
  while(true)
   if (text[j] == pattern[i]) //matches
   {
    i++;   // yes, move on to the next state
    if (i == patternLength)  // maybe that was the last state
    {
     // found a match;
     positions->push_back(j-(i-1));
     i = next[i];
    }
    break;
   }
   else if (i == 0) break; // no match in state j = 0, give up
   else i = next[i];
  j++;
 }
}
char CharCode(char chr) {
    if ( 'a' <= chr && chr <= 'z' )
        return 'a';
    if ( 'A' <= chr && chr <= 'Z' )
        return 'A';
    if ( '0' <= chr && chr <= '9' )
        return '0';
    if ( chr == '.' || chr == '?' || chr == '!' || chr == ',' || chr == ':' || chr == ';' || chr == '-' )
        return '.';
}
string CodeText(string text) {
    string ret = text;
    for (int i = 0; i < text.length(); ++i) {
        ret[i] = CharCode(text[i]);
    }
    return ret;
}
void FancyOutput(string pattern, string code, vector<int> *positions) {
    cout << "Matched positions: ";
    for (int i = 0; i < positions->size()-1; ++i)
        cout << (*positions)[i] + 1 << ", ";
    cout << (*positions)[positions->size()-1] + 1 << "." << endl;

    std::cerr << "Text: " << code.c_str() << endl;
    for (int i = 0; i < positions->size(); ++i) {
        printf("%5d ", i+1);
        for (int j = 0; j < (*positions)[i]; ++j) cout << " ";
        cout << pattern.c_str() << endl;
    }
}
int _tmain(int argc, _TCHAR* argv[])
{
    string pattern, text, code;
 char pattext[1024];
 char txttext[1024];
    cout << "Input pattern:" << endl;
    cin.getline(pattext, 1024, '\n');
 pattern.assign(pattext);

    cout << "Input text:" << endl;
    cin.getline(txttext, 1024, '\n');
 text.assign(txttext);
    cout << endl;

    code = CodeText(text);
    cout << "Processed text:" << endl;
    cout << code.c_str() << endl;
    cout << endl;

    vector<int> *positions = new vector<int>();
    KMP(pattern, code, positions);

    if ( positions->size() )
        cout << "Y" << endl;
    else
        cout << "N" << endl;

    if ( positions->size() )
        FancyOutput(pattern, code, positions);
    return 0;
}
 

No comments:

Post a Comment