Google字符串模糊匹配算法，字典樹模糊查詢

轉載自：http://blog.csdn.net/shifuwawa/article/details/5595514

好吧，我承認我又裝13標題黨了。其實是G查詢關鍵詞過程中匹配的一點大概的算法框架，G的模糊匹配大家都知道，比如你輸入64什麼的，G會自動列出你心裏可能要找到的東西，如下圖:

那這個算法是怎麼實現的呢，用到了一種高級數據結構--字典樹，或者說是字典樹思想，因爲字典樹不規定你具體怎麼實現，可以二維數組，可以map……也可以通常的結構體+next指針。可以通過一個題來講述，就是2009 ACM/ICPC 哈爾濱 regional現場賽G題：Fuzzy Google Suggest（http://acm.hit.edu.cn/judge/show.php?Proid=2888&Contestid=0）。當時我搞這題，不知道字典樹，然後一直模擬，結果……(— —|||)。先用輸入的單詞構造一棵字典樹，節點數據包括：cnt，表示節點上的字母被多少個單詞經過；vis，0表示經過此節點不能繼續匹配，1表示經過此節點可繼續匹配，2表示此節點就是恰好用於匹配的前綴的最後一個字符；然後一個next數組，大小26，但不是node指針，而是int數組，表示當前節點的兒子，next[i]==-1表示當前節點沒有第i個兒子，否則有，並將此兒子結點進行編號，其編號就是它在字典樹中的編號。然後根據編輯距離進行dfs遍歷；函數設計爲dfs(int x,int pos,int edit,char* key)，x是trie樹中第x個節點，pos表示匹配到了前綴字符串key的第pos個字符，edit表示剩餘可用的編輯距離。假如某個字符符合當前前綴的匹配條件，則trie節點向兒子結點遞歸，pos加一，edit不變：dfs(root[x].next[key[pos]-'a'],pos+1,edit,key)；否則嘗試使用編輯距離：1，增加一個字符，此時要遍歷26個字符，看增加哪個合法（即此字符在trie中出現了並且是當前節點x的兒子節點並且此字符不跟key[pos]相同），然後繼續dfs，此時編輯距離少一個，key的位置不變，trie走向兒子節點，假設增加的字符編號爲i，則dfs(root[x].next[i],pos,edit-1,key)；2，替換一個字符，此時edit減一，pos向前走一個，dfs(root[x].next[i],pos+1,edit-1,key)；3，刪除一個字符，刪除表示爲trie節點不變，但是前綴字符串key往下走一個，相當於把沒匹配上的字符忽略，dfs(x,pos+1,edit-1,key)，若能遍歷下去，且x節點之前不可通行，則將x標記爲可通行。到達匹配終點的條件有三個：1，前綴串key一路匹配到了末尾，此時的結點x被標記爲root[x].vis=2，表示它是某個前綴串的終結者。2，在trie中一路通行突然edit用完透支了，那這個前綴串沒有找到匹配的單詞，回溯。3，碰到了某個節點x，root[x].vis=2，說明到x這個前綴串已經能夠匹配，返回可以匹配。然後再利用dfs_calc函數計數符合匹配的單詞數量：vis=2的結點。最後用dfs_clear()函數清理trie樹。關於銷燬trie樹，見有人用一個for循環搞定的，那樣只是把和根節點直接相連的結點進行了delete，但是其他的都變成懸空狀態，並未被銷燬。壞習慣(但對ACM題來說不失爲一種銷燬的捷徑)。不過用struct寫的交上去老是RE，極度掣肘，只好參看某牛的改作數組實現的trie：
RE的：

[cpp] view
plaincopy

#include<pzjay>  

#<一坨頭文件>  

const int sup=500005;  // number of entries in the root[] node array

int tot;// index that the next newly created trie node will receive

int len;// length of the prefix currently being matched

int ans;// number of words matching the current prefix

// One trie node. Nodes are stored in the global root[] array and refer to
// their children by array index rather than by pointer.
struct node  

{  

    int cnt;// number of inserted words whose path passes through this node

    int vis;// 0: no match can be reached through this node; 1: a match can be reached through it; 2: the prefix match ends exactly at this node

    int next[26];// next[i]: index of the child for letter 'a'+i, or -1 when there is no such child

}root[sup];  

void creat(char key[])  

{  

    int i=0,index;  

    int k=1;//root下標  

    while(key[i])  

    {  

        index=key[i]-'a';  

        if(-1==root[k].next[index])  

        {  

            root[k].next[index]=tot;//將root[tot]的地址賦給tmp->next[index]  

            root[tot].cnt=1;  

            root[tot].vis=0;  

            ++tot;  

        }  

        else  

            ++root[root[k].next[index]].cnt;  

        k=root[k].next[index];  

        ++i;  

    }  

}  

int dfs(int x,int pos,int edit,char* key)//返回是否成功匹配  

{  

    if(2==root[x].vis)//到達一個匹配的結束點  

        return 1;  

    if(edit<0)  

        return 0;  

    if(pos==len)//到達前綴的末尾  

    {  

        root[x].vis=2;//該節點是前綴的結束字母，x之前的單詞串被成功匹配  

        return 1;  

    }  

    int index=key[pos]-'a';  

    if(-1!=root[x].next[index])//還有兒子結點  

        if(dfs(root[x].next[index],pos+1,edit,key))  

            root[x].vis=1;  

    for(int i=0;i<26;++i)  

    {  

        index=key[pos]-'a';  

        if(index==i || -1==root[x].next[i])//在樹中找可替換的字符  

            continue;  

        if(dfs(root[x].next[i],pos+1,edit-1,key))//將pos處的字母嘗試用i+'a'代替  

            root[x].vis=1;  

        if(dfs(root[x].next[i],pos,edit-1,key))//插入一個字母  

            root[x].vis=1;  

    }  

    if(dfs(x,pos+1,edit-1,key))//delete  

        if(0==root[x].vis)  

            root[x].vis=1;  

    return root[x].vis;  

}  

void dfs_calc(int x)  

{  

    if(2==root[x].vis)  

    {  

        ans+=root[x].cnt;  

        return;  

    }  

    for(int i=0;i<26;++i)  

        if(root[root[x].next[i]].vis > 0)  

            dfs_calc(root[x].next[i]);  

}  

void dfs_clear(int x)  

{  

    root[x].vis=0;  

    for(int i=0;i<26;++i)  

        if(root[root[x].next[i]].vis > 0)  

            dfs_clear(root[x].next[i]);  

}  

int main()  

{  

    int n;  

    //freopen("1.txt","r",stdin);  

    while(scanf("%d",&n)!=EOF)  

    {  

        tot=2;  

        char key[25];  

        int m;  

        int edit;//編輯距離  

        for(int i=0;i<sup;++i)  

            memset(root[i].next,-1,sizeof(root[i].next));  

        //fill(root[i].next,root[i].next+26,-1);  

        while(n--)  

        {  

            scanf("%s",key);  

            creat(key);  

        }  

        scanf("%d",&m);//m個前綴  

        while(m--)  

        {  

            ans=0;  

            scanf("%s %d",key,&edit);  

            len=strlen(key);  

            dfs(1,0,edit,key);  

            //1是x的起始遍歷位置，0是前綴key的起始位置，edit是剩餘的編輯距離  

            dfs_calc(1);//計數符合匹配的單詞個數  

            dfs_clear(1);//清空x  

            printf("%d/n",ans);  

        }  

    }  

    return 0;  

}  

AC：  

const int sup=700005;  // number of entries in the root/cnt/vis arrays

int tot;// index that the next newly created trie node will receive

int len;// length of the prefix currently being matched

int ans;// number of words matching the current prefix

int root[sup][26];// root[x][i]: index of node x's child for letter 'a'+i; main fills it with -1 (no child)

int cnt[sup],vis[sup];// cnt[x]: number of inserted words passing through node x; vis[x]: match mark of node x written by dfs (0, 1 or 2)

void creat(char key[])  

{  

    int k=1,index,i=0;  

    while(key[i])  

    {  

        index=key[i]-'a';  

        if(-1==root[k][index])  

            root[k][index]=tot++;  

        k=root[k][index];  

        ++cnt[k];  

        ++i;  

    }  

}  

int dfs(int x,int pos,int edit,char key[])  

{  

    if(2==vis[x])  

        return 1;  

    if(edit<0)  

        return 0;  

    if(pos==len)//匹配完畢，節點x成爲前綴詞key的結尾字母  

    {  

        vis[x]=2;  

        return  1;  

    }//以上可以直接return的，都是最終的結果:匹配成功或者失敗  

    //下面的只是遞歸到最重結果的過程，故是對vis賦值  

    int index=key[pos]-'a';  

    if(-1!=root[x][index])//可以繼續往深層遍歷  

        if(dfs(root[x][index],pos+1,edit,key))  

            vis[x]=1;//從x往下可以走到目標節點  

    for(int i=0;i<26;++i)  

    {  

        index=key[pos]-'a';  

        if(index==i || -1==root[x][i])//篩選掉跟要替換的字母相同的字母和未在trie樹中出現的字母  

            continue;  

        if(dfs(root[x][i],pos+1,edit-1,key))//pos++，遍歷下一個字母，表示替換一個trie樹中存在的字母  

                vis[x]=1;  

        if(dfs(root[x][i],pos,edit-1,key))//pos不變.表示增加一個字母  

                vis[x]=1;  

    }  

    if(dfs(x,pos+1,edit-1,key))//刪除一個字母  

        if(0==vis[x])  

            vis[x]=1;  

    return vis[x];  

}  

void dfs_calc(int x)  

{  

    if(2==vis[x])  

    {  

        ans+=cnt[x];  

        return;  

    }  

    for(int i=0;i<26;++i)  

        if(vis[root[x][i]])  

            dfs_calc(root[x][i]);  

}  

void dfs_clear(int x)  

{  

    vis[x]=0;  

    for(int i=0;i<26;++i)  

        if(vis[root[x][i]])  

            dfs_clear(root[x][i]);  

}  

int main()  

{  

    int n;  

    char key[16];  

    while(scanf("%d",&n)!=EOF)  

    {  

        int edit,m;  

        memset(root,-1,sizeof(root));  

        memset(vis,0,sizeof(vis));  

        memset(cnt,0,sizeof(cnt));  

        tot=2;  

        while(n--)  

        {  

            scanf("%s",key);  

            creat(key);  

        }  

        scanf("%d",&m);  

        while(m--)  

        {  

            ans=0;  

            scanf("%s %d",key,&edit);  

            len=strlen(key);  

            dfs(1,0,edit,key);  

            dfs_calc(1);  

            printf("%d/n",ans);  

            dfs_clear(1);  

        }  

    }  

    return 0;  

}參看：http://acmicpc.org.cn/wiki/index.php?title=2009_Harbin_Fuzzy_Google_Suggest_Solution  

ps:轉載註明出處：pzjay！

除了模糊匹配外還有精確匹配，金山詞霸手機版E文輸入，T9輸入法等許多優秀的手機E文輸入軟件都採用了精確匹配。以T9輸入法爲例，它摒棄傳統的輸入按鍵模式，假如你想輸入ccc，傳統的是要摁3*3=9下2鍵，但是假如ccc是經常使用的高頻詞彙的話，T9輸入法只摁三下即可。牽扯到頻率，肯定又是字典樹的應用了，題目相關：HDOJ1298
本題先輸入一個單詞表，包括單詞以及該單詞的權值。然後輸入一些數字串，要求模擬手機輸入的過程，每輸入一個數字，就輸出對應的單詞（如果沒有對應的就輸出MANUALLY），如果輸入的數字會對應不同的單詞的前綴，就輸出權值之和最高的前綴（如果權值一樣就按字母表順序）。用Sample來說明，輸入了hell，hello，idea這3個單詞，權值對應分別爲3,4,8，開始輸入數字：輸入4，4可以對應i和h，i是idea的前綴，權值之和爲8，h是hell和hello的前綴，權值之和是3+4=7，輸出權值較大的i；繼續輸入3,43對應的可以是he和id，同樣因爲id的權值大於he，就輸出id；接下來輸入5,435就只能對應hel了……依此類推，每次輸出的都是權值之和最高的詞
思想：trie+BFS
算法流程：
1。根據輸入的單詞建樹
2。根據輸入的按鍵序列依次轉化爲可能的字符序列，維護一個雙端隊列，將樹中出現過(通過查找字典樹實現)的字符序列入列，用於下次增加字符序列
3。若當前枚舉到的按鍵序列遍歷完所有可能後最大權值還是-1，說明該按鍵序列沒有匹配的字符串；否則輸出權值最大的字符串即可。注意若字符序列中間出現不匹配，那麼以後的都不匹配，但此時仍然要繼續遍歷，對其後每個按鍵依次輸出MANUALLY，不能退出。見過HH大神map實現trie樹的代碼，很好很強大。(map <string,int>表示string出現的頻率int)

[cpp] view
plaincopy

#include<iostream>  

#include<一坨頭文件>  

#include<轉載註明pzjay原創>  

const int sup=100;  // NOTE(review): not referenced by the code visible in this listing

int num[10];// num[i]: how many letters sit on keypad key i

char T9[10][4];// T9[i][j]: the j-th letter on keypad key i

deque <string> dq;// work queue of candidate letter strings, filled and drained by main

int n;// read twice in main: first the number of dictionary words, then the number of key sequences

// One trie node of the T9 dictionary.
struct node
{
    int count;      // accumulated weight of all words passing through this node
    node* next[26]; // child per letter 'a'..'z'; NULL when absent

    // Create a node with initial weight fre and no children.
    node(int fre) : count(fre)
    {
        // Assign null pointers directly; memset's fill argument is an int,
        // so passing NULL to it is not portable.
        for (int i = 0; i < 26; ++i)
            next[i] = NULL;
    }
};

node* root;// root of the trie; main allocates it per scenario and dele() deletes it

void creat(char key[],int freq)  

{  

    int i=0,index;  

    node* tmp=root;  

    while(key[i])  

    {  

        index=key[i]-'a';  

        if(NULL==tmp->next[index])  

            tmp->next[index]=new node(freq);  

        else  

            tmp->next[index]->count+=freq;  

        tmp=tmp->next[index];  

        ++i;  

    }  

}  

int find(string key)  

{  

    int i=0,index;  

    node* tmp=root;  

    while(i<key.length())  

    {  

        index=key[i]-'a';  

        if(NULL==tmp->next[index])  

            return -1;  

        tmp=tmp->next[index];  

        ++i;  

    }  

    return tmp->count;//返回權值  

}  

void init()  

{  

    int i,j;  

    char tmp='a';  

    for(i=2;i<10;++i)  

        num[i]=3;  

    ++num[7];  

    ++num[9];//第7和9個按鍵上各4個字母  

    for(i=2;i<10;++i)  

        for(j=0;j<num[i];++j)  

            T9[i][j]=tmp++;  

}  

void dele()//刪除字典樹  

{  

    for(int i=0;i<26;++i)  

        if(root->next[i])  

            delete root->next[i];  

    delete root;  

}  

int main()  

{  

    init();//初始化數組  

    char key[110];  

    int Case;  

    scanf("%d",&Case);  

    char tmp;  

    int frequency;  

    string str;  

    for(int pzjay=1;pzjay<=Case;++pzjay)  

    {  

        root=new node(0);  

        scanf("%d",&n);  

        while(n--)  

        {  

            scanf("%s %d",key,&frequency);  

            creat(key,frequency);  

        }  

        scanf("%d",&n);  

        int id;  

        string head;  

        string ans;  

        int max_frequency;  

        printf("Scenario #%d:/n",pzjay);  

        int increment,size;  

        while(n--)  

        {  

            scanf("%s",key);  

            size=1;//初始隊列中一個元素  

            while(!dq.empty())  

                dq.pop_back();  

            dq.push_back("");//首先壓入雙端隊列一個空字符串  

            //轉載註明出處:pzjay  

            for(int i=0;key[i]!='1';++i)  

            {  

                id=key[i]-'0';//將按鍵轉化爲數字  

                increment=0;  

                max_frequency=-1;  

                for(int k=0;k<size;++k)  

                {  

                    head=dq.front();//或者dq[0]也可  

                    dq.pop_front();  

                    for(int j=0;j<num[id];++j)  

                    {  

                        str=head+T9[id][j];  

                        int value=find(str);  

                        if(-1!=value)//找到了  

                        {  

                            dq.push_back(str);  

                            ++increment;//記錄本次新增了多少個元素，本次新增的元素就是下次拓展的起點  

                            if(value > max_frequency)  

                            {  

                                max_frequency=value;  

                                ans=str;  

                            }  

                        }     

                    }  

                }  

                size=increment;  

                if(max_frequency!=-1)  

                    printf("%s/n",ans.c_str());  

                else  

                    printf("MANUALLY/n");//其實這時可以退出for了，不過繼續遍歷也無妨，因爲中間斷掉，後面的肯定都不行  

            }  

            printf("/n");  

        }  

        printf("/n");  

        dele();  

    }  

    return pzjay;  

}  

字典樹容易理解，用處廣泛並且本文pzjay原創，— —|||

Google字符串模糊匹配算法，字典樹模糊查詢

面試總結之-哈希算法分析

面試總結之-查找算法分析

面試總結之-遞歸算法分析

new/delete 和malloc/free 的區別

leetcode代碼分類彙總之-排序

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結