字符串搜索的操作是比較+移步(shift)古今,如何高效的移步成為了這個算法的關(guān)鍵所在
最壞的移步就是一次移一步,即暴力解法(Brute Force)嘲玫。那么優(yōu)化的解法就是一次移多步雏门。
字符串搜索算法匯總:傳送門
里面詳細介紹各種解法,真是有點多啊哪雕,囧
比較操作的時候算法中分為兩類一類是從左向右比較即從開始字符到結(jié)束字符,另一類是從右向左比較即從結(jié)束字符到開始字符
本文主要學習Boyer Moore算法
維基百科上說鲫趁,Boyer Moore算法是1977年Robert S. Boyer和J Strother Moore發(fā)明的斯嚎。
阮一峰有篇很好的對BoyerMoore算法的科普文:傳送門
其中詳述了一個Moore教授的例子
BoyerMoore關(guān)于移步最核心的就是兩個概念
壞字符移步(bad-character shift)和好后綴移步(good-suffix shift),弄明白這兩個概念就明白了Boyer Moore.
在需要移步的時候考察這兩個移步的步數(shù)挨厚,誰的值更大就選擇誰來作為移步的步數(shù)
為了生成壞字符移步和好后綴移步堡僻,我們只需要對搜索的子(短)字符串進行處理,而不需要對目標(長)字符串進行處理疫剃。
glibc中關(guān)于子串搜索的函數(shù)strstr的這兩個字符串的參數(shù)名字很有意境钉疫,前者叫needle,后者叫haystack巢价,大海撈針啊牲阁,哈哈。
壞字符移步的生成比較容易理解:
先定義一個包含所有字符(考慮ASCII字符集壤躲,以及ASCII字符集擴展)城菊,一共256個字符
#define ALPHABET_LEN 256
int badCharacter[ALPHABET_LEN];
然后掃描needle串,生成壞字符移步碉克。未在needle串中出現(xiàn)的壞字符的移步都是needle串長度即strlen(needle)
for (i = 0; i < ALPHABET_LEN; i++) {
badcharacter[i] = needlelen;
}
在needle串中出現(xiàn)過的壞字符的移步利用公式:needlelen - 1 - idx求得凌唬,idx代表從左到右對needle串進行掃描的索引,范圍是從0到(needlelen - 1)漏麦。
badcharacter[needle[i]] = needlelen - 1 - i;
考慮到在haystack串中移步時容易理解客税,移步時以最后一個字符作為移步的起點位置况褪,所以上面公式求得的移步值,在移步時要減去字符相比最后字符位置的相對位置即減去(needlelen + 1 + idx)霎挟,idx代表從右到左對needle串進行掃描的索引,范圍是從(needlelen - 1)到0麻掸。
badcharacter[haystack[j+i]] - needlelen + 1 + i
好后綴移步生成:
參考維基百科的C語言實現(xiàn)
經(jīng)過2次掃描needle串
第一次掃描是找是否有needle開始位置的前綴字符串能夠與后綴字符串匹配的情況酥夭,并設(shè)置相應(yīng)字符的好后綴移步
匹配類似下面這種場景
ABCDAB
444461
// true if the suffix of word starting from word[pos] is a prefix
// of word
int is_prefix(uint8_t *word, int wordlen, int pos) {
int i;
int suffixlen = wordlen - pos;
// could also use the strncmp() library function here
for (i = 0; i < suffixlen; i++) {
if (word[i] != word[pos+i]) {
return 0;
}
}
return 1;
}
for (p = needlelen - 1; p >= 0; p--) {
if (is_prefix(needle, needlelen, p + 1)) {
last_prefix_index = p + 1;
}
goodsuffix[p] = last_prefix_index + needlelen - 1 - p;
}
第二次掃描是找是否有needle中的字符串能夠與字符串后綴最大值長度字符匹配的情況,并設(shè)置相應(yīng)字符位置的好后綴移步
匹配類似下面這種場景
BABCDAB
6666461
int suffix_length(uint8_t *word, int wordlen, int pos) {
int i;
// increment suffix length i to the first mismatch or beginning
// of the word
for (i = 0; (word[pos-i] == word[wordlen-1-i]) && (i < pos); i++);
return i;
}
for (p = 0; p < needlelen - 1; p++) {
int slen = suffix_length(needle, needlelen, p);
if (needle[p - slen] != needle[needlelen - 1 - slen]) {
goodsuffix[needlelen - 1 - slen] = needlelen - 1 - p + slen;
}
}
最后用教授的例子作為結(jié)束
HERE IS A SIMPLE EXAMPLE
EXAMPLE
GsMoveStep=1
BcMoveStep=7
HERE IS A SIMPLE EXAMPLE
EXAMPLE
GsMoveStep=1
BcMoveStep=2
HERE IS A SIMPLE EXAMPLE
EXAMPLE
GsMoveStep=6
BcMoveStep=3
HERE IS A SIMPLE EXAMPLE
EXAMPLE
GsMoveStep=1
BcMoveStep=2
HERE IS A SIMPLE EXAMPLE
EXAMPLE
代碼:
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define ALPHABET_LEN 256
#define NOT_FOUND needlelen
#define max(a, b) ((a < b) ? b : a)
#define PRINT_TABLE(table, len) \
do {\
for(int i = 0; i < len; i++)\
printf("%d,", table[i]);\
printf("\n");\
}while(0)
void make_badcharacter(int *badcharacter, uint8_t *needle, int32_t needlelen) {
int i;
for (i = 0; i < ALPHABET_LEN; i++) {
badcharacter[i] = NOT_FOUND;
}
for (i = 0; i < needlelen - 1; i++) {
badcharacter[needle[i]] = needlelen - 1 - i;
}
}
// true if the suffix of word starting from word[pos] is a prefix
// of word
int is_prefix(uint8_t *word, int wordlen, int pos) {
int i;
int suffixlen = wordlen - pos;
// could also use the strncmp() library function here
for (i = 0; i < suffixlen; i++) {
if (word[i] != word[pos+i]) {
return 0;
}
}
return 1;
}
// length of the longest suffix of word ending on word[pos].
// suffix_length("dddbcabc", 8, 4) = 2
int suffix_length(uint8_t *word, int wordlen, int pos) {
int i;
// increment suffix length i to the first mismatch or beginning
// of the word
for (i = 0; (word[pos-i] == word[wordlen-1-i]) && (i < pos); i++);
return i;
}
void make_goodsuffix(int *goodsuffix, uint8_t *needle, int32_t needlelen) {
int p;
int last_prefix_index = needlelen - 1;
// first loop
for (p = needlelen - 1; p >= 0; p--) {
if (is_prefix(needle, needlelen, p + 1)) {
last_prefix_index = p + 1;
}
goodsuffix[p] = last_prefix_index + needlelen - 1 - p;
}
PRINT_TABLE(goodsuffix, needlelen);
// second loop
for (p = 0; p < needlelen - 1; p++) {
int slen = suffix_length(needle, needlelen, p);
if (needle[p - slen] != needle[needlelen - 1 - slen]) {
goodsuffix[needlelen - 1 - slen] = needlelen - 1 - p + slen;
}
}
PRINT_TABLE(goodsuffix, needlelen);
}
uint8_t* boyer_moore (uint8_t *haystack, uint32_t haystacklen, uint8_t *needle, uint32_t needlelen) {
int i,j;
int badcharacter[ALPHABET_LEN];
int *goodsuffix = (int *)malloc(needlelen * sizeof(int));
make_badcharacter(badcharacter, needle, needlelen);
make_goodsuffix(goodsuffix, needle, needlelen);
// The empty needletern must be considered specially
if (needlelen == 0) return haystack;
j = 0;
while (j <= haystacklen - needlelen) {
printf("%s\n", haystack);
for(int s = 0; s < j; s++) printf(" ");
printf("%s\n", needle);
for (i = needlelen - 1; i >= 0 && haystack[j+i] == needle[i]; --i);
if (i < 0) {
free(goodsuffix);
return (haystack + j);
}
else {
printf("GsMoveStep=%d\n", goodsuffix[i] - needlelen + 1 + i);
printf("BcMoveStep=%d\n", badcharacter[haystack[j+i]] - needlelen + 1 + i);
int k = max(badcharacter[haystack[j+i]] - needlelen + 1 + i, goodsuffix[i] - needlelen + 1 + i);
j += k;
}
}
free(goodsuffix);
return NULL;
}
int main(int argc, char const *argv[])
{
char *s1 = "HERE IS A SIMPLE EXAMPLE";
//char *s2 = "BABCDAB"
char *s3 = "EXAMPLE";
//char *idx1 = (char *)boyer_moore((uint8_t*)s1, (uint32_t)strlen(s1), (uint8_t*)s2, (uint32_t)strlen(s2));
char *idx2 = (char *)boyer_moore((uint8_t*)s1, (uint32_t)strlen(s1), (uint8_t*)s3, (uint32_t)strlen(s3));
//printf("%s\n", idx1);
printf("%s\n", idx2);
return 0;
}