fix:优化AC自动机

This commit is contained in:
sunwenhaopro
2024-01-13 13:36:30 +08:00
parent 8e6841ba9f
commit 96a3c47a49
7 changed files with 34 additions and 47 deletions

View File

@@ -1,7 +1,7 @@
package com.abin.mallchat.common.common.algorithm.sensitiveWord; package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import com.abin.mallchat.common.common.algorithm.ac.ACTrie; import com.abin.mallchat.common.common.algorithm.sensitiveWord.ac.ACTrie;
import com.abin.mallchat.common.common.algorithm.ac.MatchResult; import com.abin.mallchat.common.common.algorithm.sensitiveWord.ac.MatchResult;
import org.HdrHistogram.ConcurrentHistogram; import org.HdrHistogram.ConcurrentHistogram;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;

View File

@@ -1,7 +1,6 @@
package com.abin.mallchat.common.common.algorithm.sensitiveWord; package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import com.abin.mallchat.common.common.algorithm.ac.ACTrie; import com.abin.mallchat.common.common.algorithm.sensitiveWord.acpro.ACProTrie;
import com.abin.mallchat.common.common.algorithm.acpro.ACProTrie;
import io.micrometer.core.instrument.util.StringUtils; import io.micrometer.core.instrument.util.StringUtils;
import java.util.List; import java.util.List;
@@ -13,6 +12,7 @@ import java.util.Objects;
*@description: 基于ACFilter的优化增强版本 *@description: 基于ACFilter的优化增强版本
*/ */
public class ACProFilter implements SensitiveWordFilter{ public class ACProFilter implements SensitiveWordFilter{
private ACProTrie acProTrie; private ACProTrie acProTrie;
@Override @Override

View File

@@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.algorithm.ac; package com.abin.mallchat.common.common.algorithm.sensitiveWord.ac;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;

View File

@@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.algorithm.ac; package com.abin.mallchat.common.common.algorithm.sensitiveWord.ac;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import lombok.Getter; import lombok.Getter;

View File

@@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.algorithm.ac; package com.abin.mallchat.common.common.algorithm.sensitiveWord.ac;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Getter; import lombok.Getter;

View File

@@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.algorithm.acpro; package com.abin.mallchat.common.common.algorithm.sensitiveWord.acpro;
import java.util.*; import java.util.*;
@@ -87,52 +87,42 @@ public class ACProTrie {
// 匹配 // 匹配
public String match(String matchWord) public String match(String matchWord)
{ {
Word walkNode=root; Word walkNode = root;
char[] wordArray=matchWord.toCharArray(); char[] wordArray = matchWord.toCharArray();
for(int i=0;i<wordArray.length;i++) for (int i = 0; i < wordArray.length; i++) {
{ // 失败"回溯"
// 失败回调状态 while (!walkNode.hasChild(wordArray[i]) && walkNode.failOver != null) {
while(!walkNode.hasChild(wordArray[i]) && walkNode.failOver!=null) walkNode = walkNode.failOver;
{
walkNode=walkNode.failOver;
} }
if(walkNode.hasChild(wordArray[i])) { if (walkNode.hasChild(wordArray[i])) {
walkNode=walkNode.next.get(wordArray[i]); walkNode = walkNode.next.get(wordArray[i]);
if(walkNode.end){ if (walkNode.end) {
// sentinelA和sentinelB作为哨兵节点去后面探测是否仍存在end // sentinelA和sentinelB作为哨兵节点去后面探测是否仍存在end
Word sentinelA = walkNode; // 记录当前节点 Word sentinelA = walkNode; // 记录当前节点
Word sentinelB = walkNode; //记录end节点 Word sentinelB = walkNode; //记录end节点
int k = i+1; int k = i + 1;
boolean flag=false; boolean flag = false;
//判断end是不是最终end即敏感词是否存在包含关系(abc,abcd) //判断end是不是最终end即敏感词是否存在包含关系(abc,abcd)
while(k < wordArray.length && sentinelA.hasChild(wordArray[k])) { while (k < wordArray.length && sentinelA.hasChild(wordArray[k])) {
sentinelA = sentinelA.next.get(wordArray[k]); sentinelA = sentinelA.next.get(wordArray[k]);
k++; k++;
if(sentinelA.end) if (sentinelA.end) {
{ sentinelB = sentinelA;
sentinelB=sentinelA; flag = true;
flag=true;
} }
} }
// 根据结果去替换* // 根据结果去替换*
if(flag){ // 计算替换长度
int length=sentinelB.depth; int len = flag ? sentinelB.depth : walkNode.depth;
while(length>0) while (len > 0) {
{ len--;
length--; int index = flag ? i - walkNode.depth + 1 + len : i - len;
wordArray[i+length]=MASK; wordArray[index] = MASK;
}
// 直接跳到最后的end节点failOver
i=i+length;
walkNode = sentinelB.failOver;
}else{
int length=walkNode.depth;
while (length>0){
length--;
wordArray[i-length]=MASK;
}
walkNode = walkNode.failOver;
} }
// 更新i
i += flag ? sentinelB.depth : 0;
// 更新node
walkNode = flag ? sentinelB.failOver : walkNode.failOver;
} }
} }
} }

View File

@@ -4,9 +4,6 @@ import com.abin.mallchat.common.common.algorithm.sensitiveWord.ACFilter;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.ACProFilter; import com.abin.mallchat.common.common.algorithm.sensitiveWord.ACProFilter;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.DFAFilter; import com.abin.mallchat.common.common.algorithm.sensitiveWord.DFAFilter;
import org.junit.Test; import org.junit.Test;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.*; import java.util.*;
/** /**
@@ -50,7 +47,7 @@ public class SensitiveTest {
@Test @Test
public void ACMulti() { public void ACMulti() {
List<String> sensitiveList = Arrays.asList("白痴", "你是白痴", "白痴吗"); List<String> sensitiveList = Arrays.asList("你是白痴","你是");
ACFilter instance = new ACFilter(); ACFilter instance = new ACFilter();
instance.loadWord(sensitiveList); instance.loadWord(sensitiveList);
System.out.println(instance.filter("你是白痴吗")); System.out.println(instance.filter("你是白痴吗"));