快速字符串检索算法

创建日期: 2024-09-10 16:56 | 作者: 风波 | 浏览次数: 12 | 分类: 算法

1. the Rabin-Karp algorithm

资料:https://parsing-techniques.duguying.net/ebook/5/10/index.html (5.10 使用有限状态自动机快速文本搜索)《解析技术》(《Parsing Techniques》)

来源:https://www.geeksforgeeks.org/rabin-karp-algorithm-for-pattern-searching/

/* Following program is a C++ implementation of Rabin Karp
Algorithm given in the CLRS book */
#include <bits/stdc++.h>
using namespace std;

// d is the number of characters in the input alphabet
#define d 256

/* pat -> pattern
    txt -> text
    q -> A prime number
*/
void search(char pat[], char txt[], int q)
{
    int M = strlen(pat);
    int N = strlen(txt);
    int i, j;
    int p = 0; // hash value for pattern
    int t = 0; // hash value for txt
    int h = 1;

    // The value of h would be "pow(d, M-1)%q"
    for (i = 0; i < M - 1; i++)
        h = (h * d) % q;

    // Calculate the hash value of pattern and first
    // window of text
    for (i = 0; i < M; i++) {
        p = (d * p + pat[i]) % q;
        t = (d * t + txt[i]) % q;
    }

    // Slide the pattern over text one by one
    for (i = 0; i <= N - M; i++) {

        // Check the hash values of current window of text
        // and pattern. If the hash values match then only
        // check for characters one by one
        if (p == t) {
            /* Check for characters one by one */
            for (j = 0; j < M; j++) {
                if (txt[i + j] != pat[j]) {
                    break;
                }
            }

            // if p == t and pat[0...M-1] = txt[i, i+1,
            // ...i+M-1]

            if (j == M)
                cout << "Pattern found at index " << i
                    << endl;
        }

        // Calculate hash value for next window of text:
        // Remove leading digit, add trailing digit
        if (i < N - M) {
            t = (d * (t - txt[i] * h) + txt[i + M]) % q;

            // We might get negative value of t, converting
            // it to positive
            if (t < 0)
                t = (t + q);
        }
    }
}

/* Driver code */
int main()
{
    char txt[] = "GEEKS FOR GEEKS";
    char pat[] = "GEEK";

    // we mod to avoid overflowing of value but we should
    // take as big q as possible to avoid the collison
    int q = INT_MAX;

    // Function Call
    search(pat, txt, q);
    return 0;
}

// This is code is contributed by rathbhupendra

2. The Aho and Corasick

来源:https://www.geeksforgeeks.org/aho-corasick-algorithm-pattern-searching/

// C++ program for implementation of Aho Corasick algorithm 
// for string matching 
using namespace std; 
#include <bits/stdc++.h> 

// Max number of states in the matching machine. 
// Should be equal to the sum of the length of all keywords. 
const int MAXS = 500; 

// Maximum number of characters in input alphabet 
const int MAXC = 26; 

// OUTPUT FUNCTION IS IMPLEMENTED USING out[] 
// Bit i in this mask is one if the word with index i 
// appears when the machine enters this state. 
int out[MAXS]; 

// FAILURE FUNCTION IS IMPLEMENTED USING f[] 
int f[MAXS]; 

// GOTO FUNCTION (OR TRIE) IS IMPLEMENTED USING g[][] 
int g[MAXS][MAXC]; 

// Builds the string matching machine. 
// arr - array of words. The index of each keyword is important: 
//       "out[state] & (1 << i)" is > 0 if we just found word[i] 
//       in the text. 
// Returns the number of states that the built machine has. 
// States are numbered 0 up to the return value - 1, inclusive. 
int buildMatchingMachine(string arr[], int k) 
{ 
    // Initialize all values in output function as 0. 
    memset(out, 0, sizeof out); 

    // Initialize all values in goto function as -1. 
    memset(g, -1, sizeof g); 

    // Initially, we just have the 0 state 
    int states = 1; 

    // Construct values for goto function, i.e., fill g[][] 
    // This is same as building a Trie for arr[] 
    for (int i = 0; i < k; ++i) 
    { 
        const string &word = arr[i]; 
        int currentState = 0; 

        // Insert all characters of current word in arr[] 
        for (int j = 0; j < word.size(); ++j) 
        { 
            int ch = word[j] - 'a'; 

            // Allocate a new node (create a new state) if a 
            // node for ch doesn't exist. 
            if (g[currentState][ch] == -1) 
                g[currentState][ch] = states++; 

            currentState = g[currentState][ch]; 
        } 

        // Add current word in output function 
        out[currentState] |= (1 << i); 
    } 

    // For all characters which don't have an edge from 
    // root (or state 0) in Trie, add a goto edge to state 
    // 0 itself 
    for (int ch = 0; ch < MAXC; ++ch) 
        if (g[0][ch] == -1) 
            g[0][ch] = 0; 

    // Now, let's build the failure function 

    // Initialize values in fail function 
    memset(f, -1, sizeof f); 

    // Failure function is computed in breadth first order 
    // using a queue 
    queue<int> q; 

    // Iterate over every possible input 
    for (int ch = 0; ch < MAXC; ++ch) 
    { 
        // All nodes of depth 1 have failure function value 
        // as 0. For example, in above diagram we move to 0 
        // from states 1 and 3. 
        if (g[0][ch] != 0) 
        { 
            f[g[0][ch]] = 0; 
            q.push(g[0][ch]); 
        } 
    } 

    // Now queue has states 1 and 3 
    while (q.size()) 
    { 
        // Remove the front state from queue 
        int state = q.front(); 
        q.pop(); 

        // For the removed state, find failure function for 
        // all those characters for which goto function is 
        // not defined. 
        for (int ch = 0; ch <= MAXC; ++ch) 
        { 
            // If goto function is defined for character 'ch' 
            // and 'state' 
            if (g[state][ch] != -1) 
            { 
                // Find failure state of removed state 
                int failure = f[state]; 

                // Find the deepest node labeled by proper 
                // suffix of string from root to current 
                // state. 
                while (g[failure][ch] == -1) 
                    failure = f[failure]; 

                failure = g[failure][ch]; 
                f[g[state][ch]] = failure; 

                // Merge output values 
                out[g[state][ch]] |= out[failure]; 

                // Insert the next level node (of Trie) in Queue 
                q.push(g[state][ch]); 
            } 
        } 
    } 

    return states; 
} 

// Returns the next state the machine will transition to using goto 
// and failure functions. 
// currentState - The current state of the machine. Must be between 
//           0 and the number of states - 1, inclusive. 
// nextInput - The next character that enters into the machine. 
int findNextState(int currentState, char nextInput) 
{ 
    int answer = currentState; 
    int ch = nextInput - 'a'; 

    // If goto is not defined, use failure function 
    while (g[answer][ch] == -1) 
        answer = f[answer]; 

    return g[answer][ch]; 
} 

// This function finds all occurrences of all array words 
// in text. 
void searchWords(string arr[], int k, string text) 
{ 
    // Preprocess patterns. 
    // Build machine with goto, failure and output functions 
    buildMatchingMachine(arr, k); 

    // Initialize current state 
    int currentState = 0; 

    // Traverse the text through the built machine to find 
    // all occurrences of words in arr[] 
    for (int i = 0; i < text.size(); ++i) 
    { 
        currentState = findNextState(currentState, text[i]); 

        // If match not found, move to next state 
        if (out[currentState] == 0) 
            continue; 

        // Match found, print all matching words of arr[] 
        // using output function. 
        for (int j = 0; j < k; ++j) 
        { 
            if (out[currentState] & (1 << j)) 
            { 
                cout << "Word " << arr[j] << " appears from "
                    << i - arr[j].size() + 1 << " to " << i << endl; 
            } 
        } 
    } 
} 

// Driver program to test above 
int main() 
{ 
    string arr[] = {"he", "she", "hers", "his"}; 
    string text = "ahishers"; 
    int k = sizeof(arr)/sizeof(arr[0]); 

    searchWords(arr, k, text); 

    return 0; 
} 
12 浏览
0 评论