一、單模式串匹配
1.BF和RK算法
(1)BF(暴力匹配算法)
/**
 * Brute-force substring search.
 *
 * <p>Tries every possible alignment of {@code pattern} against {@code target}
 * from left to right and compares character by character.
 *
 * @param target  the text to search in
 * @param pattern the text to search for
 * @return the index of the first occurrence of {@code pattern} in {@code target};
 *         {@code 0} for an empty pattern; {@code -1} if there is no occurrence,
 *         if either argument is {@code null}, or if the pattern is longer than the target
 */
public int BF(char[] target, char[] pattern) {
    if (target == null || pattern == null) {
        return -1;
    }
    final int n = target.length;
    final int m = pattern.length;
    if (n < m) {
        return -1;
    }
    final int lastStart = n - m;
    nextAlignment:
    for (int start = 0; start <= lastStart; start++) {
        for (int offset = 0; offset < m; offset++) {
            if (pattern[offset] != target[start + offset]) {
                // Mismatch: give up on this alignment and slide by one.
                continue nextAlignment;
            }
        }
        // Every pattern character matched at this alignment.
        return start;
    }
    return -1;
}
(2)RK算法
選擇的hash算法是將字符串按26進制轉換為一個數（假設字符集僅為 a~z），在不溢出的前提下不會存在沖突；但可能會存在溢出，代碼並未處理。
代碼中注釋的部分為優化前的寫法。
/**
 * Rabin-Karp substring search using a base-26 rolling hash.
 *
 * <p>Each character contributes {@code (c - 'a')} as one base-26 digit. The hash
 * is computed in {@code int} arithmetic, so it silently wraps on overflow
 * (pattern length of about 7 or more), and characters outside {@code 'a'..'z'}
 * produce out-of-range digits. Either situation can make two different strings
 * hash to the same value, so a hash hit is always confirmed with a
 * character-by-character comparison before it is reported.
 *
 * @param target  the text to search in
 * @param pattern the text to search for
 * @return the index of the first occurrence of {@code pattern} in {@code target};
 *         {@code 0} for an empty pattern; {@code -1} if there is no occurrence,
 *         if either argument is {@code null}, or if the pattern is longer than the target
 */
public int RK(char[] target, char[] pattern) {
    if (target == null || pattern == null || target.length < pattern.length) {
        return -1;
    }
    int tLen = target.length;
    int pLen = pattern.length;
    if (pLen == 0) {
        // An empty pattern matches at the very start (same answer as the brute-force search).
        return 0;
    }

    // Weight of the leading digit of a window: 26^(pLen - 1).
    int highPower = 1;
    for (int i = 1; i < pLen; i++) {
        highPower *= 26;
    }

    // Hash of the pattern and of the first window, via Horner's rule.
    int pHash = 0;
    int windowHash = 0;
    for (int i = 0; i < pLen; i++) {
        pHash = pHash * 26 + (pattern[i] - 'a');
        windowHash = windowHash * 26 + (target[i] - 'a');
    }

    int lastStart = tLen - pLen;
    for (int start = 0; start <= lastStart; start++) {
        if (start > 0) {
            // Pre-optimization form: recompute the window hash from scratch in O(pLen):
            //   hash = 0;
            //   for (j = 0; j < pLen; j++) hash = hash * 26 + (target[start + j] - 'a');
            // Optimized form: drop the leading digit, shift left by one base-26
            // place, and append the newly entered character, in O(1).
            windowHash = (windowHash - highPower * (target[start - 1] - 'a')) * 26
                    + (target[start + pLen - 1] - 'a');
        }
        if (windowHash != pHash) {
            continue;
        }
        // Equal hashes do not prove equality (overflow / out-of-range characters), so verify.
        int k = 0;
        while (k < pLen && target[start + k] == pattern[k]) {
            k++;
        }
        if (k == pLen) {
            return start;
        }
    }
    return -1;
}
關於優化部分講解：
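假設模式串長度 m=3。主串中相鄰兩個子串 s[i-1] 和 s[i]（i 表示子串在主串中的起始位置，子串的長度都為 m），對應的哈希值計算公式是有交集的：
$$h[i-1] = 26^{2}\,(s[i-1]-\text{'a'}) + 26^{1}\,(s[i]-\text{'a'}) + 26^{0}\,(s[i+1]-\text{'a'})$$
$$h[i] = 26^{2}\,(s[i]-\text{'a'}) + 26^{1}\,(s[i+1]-\text{'a'}) + 26^{0}\,(s[i+2]-\text{'a'})$$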
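優化公式推導：
$$h[i] = \bigl(h[i-1] - 26^{\,m-1}\,(s[i-1]-\text{'a'})\bigr)\times 26 + 26^{0}\,(s[i+m-1]-\text{'a'})$$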
2.BM算法
/**
 * Boyer-Moore substring search using both the bad-character rule and the
 * good-suffix rule.
 */
public class BM {
    /** Size of the bad-character table; characters with code >= SIZE are not indexed. */
    private static final int SIZE = 256;

    /**
     * Finds the first occurrence of {@code pattern} in {@code target}.
     *
     * @param target  the text to search in
     * @param pattern the text to search for
     * @return the index of the first occurrence; {@code 0} for an empty pattern;
     *         {@code -1} if there is no occurrence, if either argument is
     *         {@code null}, or if the pattern is longer than the target
     */
    public static int bm(char[] target, char[] pattern) {
        if (target == null || pattern == null || target.length < pattern.length) {
            return -1;
        }
        int[] bc = new int[SIZE];
        generateBC(bc, pattern);
        int tLen = target.length;
        int pLen = pattern.length;
        int[] suffix = new int[pLen];
        boolean[] prefix = new boolean[pLen];
        generateGS(suffix, prefix, pattern, pLen);

        int i = 0; // current alignment of pattern[0] within target
        while (i <= tLen - pLen) {
            // Compare right-to-left; on exit j is the pattern index of the bad character.
            int j = pLen - 1;
            for (; j >= 0; j--) {
                if (target[i + j] != pattern[j]) {
                    break;
                }
            }
            if (j < 0) {
                return i; // full match
            }

            // 1. Bad-character rule: align the bad character with its last
            //    occurrence in the pattern. Characters that do not fit in the
            //    table fall back to the always-safe shift of 1.
            int badChar = target[i + j];
            int x = badChar < SIZE ? j - bc[badChar] : 1;

            // 2. Good-suffix rule: only applies when at least one character
            //    matched, i.e. pattern[j+1 .. pLen-1] is the good suffix.
            int y = 0;
            if (j < pLen - 1) {
                y = getGS(j, suffix, prefix, pLen);
            }

            // x may be <= 0 (bad character occurs to the right of j); in that
            // case a good suffix exists and y >= 1, so progress is guaranteed.
            i = i + Math.max(x, y);
        }
        return -1;
    }

    /**
     * Computes the shift dictated by the good-suffix rule.
     *
     * @param j      pattern index of the bad character
     * @param suffix {@code suffix[k]} = start index of another occurrence of the
     *               length-{@code k} pattern suffix, or {@code -1}
     * @param prefix {@code prefix[k]} = whether the length-{@code k} suffix is also a prefix
     * @param pLen   pattern length
     * @return the number of positions to slide the pattern to the right (always >= 1)
     */
    private static int getGS(int j, int[] suffix, boolean[] prefix, int pLen) {
        int k = pLen - j - 1; // length of the good suffix
        // Case 1: the whole good suffix occurs again earlier in the pattern.
        if (suffix[k] != -1) {
            return j - suffix[k] + 1;
        }
        // Case 2: some proper suffix of the good suffix (length r) is a prefix
        // of the pattern. Longest first, so the shift is the smallest safe one.
        // Aligning that prefix with the suffix means sliding by pLen - r.
        for (int r = k - 1; r >= 1; r--) {
            if (prefix[r]) {
                return pLen - r;
            }
        }
        // Case 3: nothing reusable; slide past the whole window.
        return pLen;
    }

    /**
     * Fills the good-suffix tables.
     *
     * @param suffix  output; see {@link #getGS}
     * @param prefix  output; see {@link #getGS}
     * @param pattern the pattern
     * @param pLen    pattern length
     */
    private static void generateGS(int[] suffix, boolean[] prefix, char[] pattern, int pLen) {
        for (int i = 0; i < pLen; i++) {
            suffix[i] = -1;
        }
        // For every proper prefix pattern[0..i], find its longest common suffix
        // with the whole pattern and record each common-suffix length on the way.
        for (int i = 0; i < pLen - 1; i++) {
            int j = i;
            int k = 0; // length of the common suffix found so far
            while (j >= 0 && pattern[j] == pattern[pLen - k - 1]) {
                ++k;
                suffix[k] = j; // later i overwrite earlier ones: keeps the right-most occurrence
                --j;
            }
            if (j == -1) {
                prefix[k] = true; // the common suffix reaches index 0, so it is a prefix
            }
        }
    }

    /**
     * Builds the bad-character table.
     *
     * <p>{@code bc[c]} is set to the last index at which character {@code c}
     * occurs in {@code pattern}, or {@code -1} if it does not occur. Pattern
     * characters whose code does not fit in {@code bc} are skipped.
     *
     * @param bc      output table, indexed by character code
     * @param pattern the pattern
     */
    public static void generateBC(int[] bc, char[] pattern) {
        for (int i = 0; i < bc.length; i++) {
            bc[i] = -1;
        }
        // Left-to-right so that the right-most occurrence wins.
        for (int i = 0; i < pattern.length; i++) {
            int index = (int) pattern[i];
            if (index < bc.length) {
                bc[index] = i;
            }
        }
    }

    public static void main(String[] args) {
        String t = "abababc";
        String p = "bc";
        System.out.println(bm(t.toCharArray(), p.toCharArray()));
    }
}
3.KMP算法
/**
 * Knuth-Morris-Pratt substring search.
 */
public class KMP {
    /**
     * Finds the first occurrence of {@code pattern} in {@code target}.
     *
     * @param target  the text to search in
     * @param pattern the text to search for
     * @return the index of the first occurrence; {@code 0} for an empty pattern;
     *         {@code -1} if there is no occurrence, if either argument is
     *         {@code null}, or if the pattern is longer than the target
     */
    public static int kmp(char[] target, char[] pattern) {
        if (target == null || pattern == null || target.length < pattern.length) {
            return -1;
        }
        int pLen = pattern.length;
        int tLen = target.length;
        if (pLen == 0) {
            // Nothing to build a failure table for; an empty pattern matches at 0.
            return 0;
        }
        int[] next = getNexts(pattern, pLen);
        int j = 0; // number of pattern characters currently matched
        for (int i = 0; i < tLen; i++) {
            // On a mismatch, fall back to the longest border of the matched
            // prefix pattern[0..j-1] and retry against the same text character.
            while (j > 0 && target[i] != pattern[j]) {
                j = next[j - 1] + 1;
            }
            if (target[i] == pattern[j]) {
                ++j;
            }
            if (j == pLen) {
                return i - pLen + 1;
            }
        }
        return -1;
    }

    /**
     * Builds the failure table.
     *
     * @param pattern the pattern (must be non-empty)
     * @param pLen    pattern length
     * @return {@code next[i]} = index of the last character of the longest
     *         proper prefix of {@code pattern[0..i]} that is also its suffix,
     *         or {@code -1} if there is none
     */
    private static int[] getNexts(char[] pattern, int pLen) {
        int[] next = new int[pLen];
        next[0] = -1; // a single character has no proper border
        int k = -1;   // end index of the current longest border
        for (int i = 1; i < pLen; i++) {
            // Cannot extend the current border: fall back to the next shorter one.
            while (k != -1 && pattern[k + 1] != pattern[i]) {
                k = next[k];
            }
            if (pattern[k + 1] == pattern[i]) {
                ++k;
            }
            next[i] = k;
        }
        return next;
    }
}
二、多模式串匹配
1.Trie樹
/**
 * A trie (prefix tree) over the 26 lowercase letters {@code 'a'..'z'}.
 */
public class Trie {
    private TreeNode root = new TreeNode('/'); // the root node carries no character

    /**
     * Adds a word to the trie.
     *
     * <p>Only characters in {@code 'a'..'z'} are supported; any other character
     * indexes outside the child array and raises
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param text the word to add
     */
    public void insert(char[] text) {
        TreeNode p = root;
        for (int i = 0; i < text.length; i++) {
            int index = text[i] - 'a';
            if (p.children[index] == null) {
                p.children[index] = new TreeNode(text[i]);
            }
            p = p.children[index];
        }
        p.isEndingChar = true;
    }

    /**
     * Tests whether a word was previously inserted.
     *
     * @param text the word to look up
     * @return {@code true} only if exactly this word was inserted; {@code false}
     *         if it is merely a prefix of an inserted word, is absent, or
     *         contains a character outside {@code 'a'..'z'}
     */
    public boolean find(char[] text) {
        TreeNode p = root;
        for (char c : text) {
            int index = c - 'a';
            // A character outside the alphabet cannot be part of any stored word.
            if (index < 0 || index >= p.children.length) {
                return false;
            }
            p = p.children[index];
            if (p == null) {
                return false;
            }
        }
        // Reaching a node is not enough: it must mark the end of a word.
        return p.isEndingChar;
    }

    /** One trie node: its character, its 26 child slots, and an end-of-word flag. */
    class TreeNode {
        public char data;
        public TreeNode[] children = new TreeNode[26];
        public boolean isEndingChar = false;

        public TreeNode(char data) {
            this.data = data;
        }
    }
}
2.AC自動機
/**
 * Aho-Corasick automaton for multi-pattern matching over the 26 lowercase
 * letters {@code 'a'..'z'}.
 */
public class AC {
    private AcNode root = new AcNode('/');

    /**
     * Builds the automaton from a set of patterns.
     *
     * @param pattern the patterns to search for; each must consist only of
     *                characters in {@code 'a'..'z'}
     */
    public AC(String[] pattern) {
        // 1. Build a trie from all patterns.
        for (String p : pattern) {
            insert(p.toCharArray());
        }
        // 2. Add the failure pointers on top of the trie.
        buildFailurePointer();
    }

    /**
     * Computes the failure pointer of every node with a breadth-first traversal,
     * so that a node's parent (and everything shallower) is finished before the
     * node itself is processed.
     */
    private void buildFailurePointer() {
        Queue<AcNode> queue = new LinkedList<>();
        queue.add(root);
        while (!queue.isEmpty()) {
            AcNode p = queue.poll();
            for (int i = 0; i < 26; i++) {
                AcNode pc = p.children[i];
                if (pc == null) continue;
                if (p == root) {
                    // Depth-1 nodes have no proper suffix other than the empty string.
                    pc.fail = root;
                } else {
                    // Walk the parent's failure chain until a node with the same
                    // outgoing character is found.
                    AcNode q = p.fail;
                    while (q != null) {
                        AcNode qc = q.children[i];
                        if (qc != null) {
                            pc.fail = qc;
                            break;
                        }
                        q = q.fail;
                    }
                    if (q == null) {
                        // Fell off the chain (root.fail is null): no suffix matches.
                        pc.fail = root;
                    }
                }
                queue.add(pc);
            }
        }
    }

    /**
     * Adds one pattern to the underlying trie.
     *
     * <p>Only characters in {@code 'a'..'z'} are supported; any other character
     * indexes outside the child array and raises
     * {@link ArrayIndexOutOfBoundsException}. Failure pointers are not updated
     * by this method.
     *
     * @param data the pattern to add
     */
    public void insert(char[] data) {
        AcNode p = root;
        for (char c : data) {
            int index = c - 'a';
            if (p.children[index] == null) {
                p.children[index] = new AcNode(c);
            }
            p = p.children[index];
        }
        p.isEndingChar = true;
        p.length = data.length;
    }

    /**
     * Scans {@code target} and prints the start index and length of every
     * pattern occurrence to standard output.
     *
     * @param target the text to scan
     */
    private void match(char[] target) {
        AcNode p = root;
        for (int i = 0; i < target.length; i++) {
            int index = target[i] - 'a';
            if (index < 0 || index >= root.children.length) {
                // No pattern contains this character, so no match can span it.
                p = root;
                continue;
            }
            // Follow failure pointers until a state with a transition on this
            // character is found, or the root is reached. A single step is not
            // enough: the first fallback state may lack the transition too.
            while (p.children[index] == null && p != root) {
                p = p.fail;
            }
            p = p.children[index];
            if (p == null) { // not even the root has this transition; restart from root
                p = root;
            }
            // Report every pattern that ends at position i: the current state
            // and all states reachable through its failure chain.
            AcNode tmpNode = p;
            while (tmpNode != root) {
                if (tmpNode.isEndingChar) {
                    int pos = i - tmpNode.length + 1;
                    System.out.println("匹配起始下標" + pos + "; 長度" + tmpNode.length);
                }
                tmpNode = tmpNode.fail;
            }
        }
    }

    /** One automaton state. */
    class AcNode {
        public char data;
        public AcNode[] children = new AcNode[26]; // alphabet is limited to 'a'..'z'
        public boolean isEndingChar = false;
        public AcNode fail = null;
        public int length = -1; // pattern length, set when isEndingChar is true

        public AcNode(char data) {
            this.data = data;
        }
    }

    public static void main(String[] args) {
        String[] pattern = {"abce", "bcd", "ce"};
        AC ac = new AC(pattern);
        String target = "cdbcdklce";
        ac.match(target.toCharArray());
    }
}
參考:
[1]32 | 字符串匹配基礎(上):如何借助哈希算法實現高效字符串匹配?-極客時間
[2]33 | 字符串匹配基礎(中):如何實現文本編輯器中的查找功能?-極客時間
[3]34 | 字符串匹配基礎(下):如何借助BM算法輕松理解KMP算法?-極客時間
[4]35 | Trie樹:如何實現搜索引擎的搜索關鍵詞提示功能?-極客時間
[5]36 | AC自動機:如何用多模式串匹配實現敏感詞過濾功能?-極客時間