Merge pull request #163 from sunwenhaopro/sunwenhao

【优化】优化AC自动机敏感词过滤
This commit is contained in:
zongzibinbin
2024-01-12 22:53:26 +08:00
committed by GitHub
12 changed files with 201 additions and 14 deletions

View File

@@ -14,7 +14,7 @@ import com.abin.mallchat.common.common.domain.enums.YesOrNoEnum;
import com.abin.mallchat.common.common.utils.AssertUtil;
import com.abin.mallchat.common.common.utils.discover.PrioritizedUrlDiscover;
import com.abin.mallchat.common.common.utils.discover.domain.UrlInfo;
import com.abin.mallchat.common.common.utils.sensitiveWord.SensitiveWordBs;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.SensitiveWordBs;
import com.abin.mallchat.common.user.domain.entity.User;
import com.abin.mallchat.common.user.domain.enums.RoleEnum;
import com.abin.mallchat.common.user.service.IRoleService;

View File

@@ -0,0 +1,141 @@
package com.abin.mallchat.common.common.algorithm.acpro;
import java.util.*;
/**
 * Aho-Corasick automaton used to mask sensitive words inside a text.
 *
 * <p>Usage: call {@link #createACTrie(List)} once with the word list, then call
 * {@link #match(String)} for every text to filter. Every character that belongs to an
 * occurrence of any loaded word (overlapping occurrences included) is replaced by {@code '*'}.
 *
 * @author CtrlCver
 * @date 2024/1/12
 */
public class ACProTrie {

    /** Replacement character written over every character of a matched word. */
    private static final char MASK = '*';

    /** Root state of the automaton; {@code null} until {@link #createACTrie(List)} has run. */
    private Word root;

    /** One automaton state (a trie node). */
    static class Word {
        /** True when the path from the root to this node spells a complete sensitive word. */
        boolean end = false;
        /** Failure link: the state to fall back to when no transition exists for the next character. */
        Word failOver = null;
        /** Number of characters on the path from the root to this node. */
        int depth = 0;
        /**
         * Length of the longest sensitive word that ends at this state, i.e. that is a suffix of
         * the path from the root to this node; 0 when there is none. Filled by initFailOver().
         */
        int matchLength = 0;
        /** Outgoing transitions keyed by character. */
        Map<Character, Word> next = new HashMap<>();

        public boolean hasChild(char c) {
            return next.containsKey(c);
        }
    }

    /**
     * Builds the automaton from scratch for the given words and computes the failure links.
     *
     * @param list sensitive words; a {@code null} list and {@code null}/empty entries are ignored
     */
    public void createACTrie(List<String> list) {
        root = new Word();
        if (list != null) {
            for (String key : list) {
                if (key == null || key.isEmpty()) {
                    continue;
                }
                Word node = root;
                for (int j = 0; j < key.length(); j++) {
                    char c = key.charAt(j);
                    Word child = node.next.get(c);
                    if (child == null) {
                        child = new Word();
                        child.depth = j + 1;
                        node.next.put(c, child);
                    }
                    node = child;
                }
                // Only the last node of a word is flagged, so insertion order
                // (e.g. "dadac" before "da") never clears or misplaces a flag.
                node.end = true;
            }
        }
        initFailOver();
    }

    /**
     * Computes the failure link and the longest-match length of every node by a breadth-first
     * traversal. Does nothing when no trie has been built yet.
     */
    public void initFailOver() {
        if (root == null) {
            return;
        }
        Queue<Word> queue = new ArrayDeque<>();
        for (Word child : root.next.values()) {
            child.failOver = root;
            child.matchLength = child.end ? child.depth : 0;
            queue.offer(child);
        }
        while (!queue.isEmpty()) {
            Word parent = queue.poll();
            for (Map.Entry<Character, Word> entry : parent.next.entrySet()) {
                Character c = entry.getKey();
                Word child = entry.getValue();
                // Follow the parent's failure chain until a state with a transition on c is found.
                Word fallback = parent.failOver;
                while (fallback != null && !fallback.next.containsKey(c)) {
                    fallback = fallback.failOver;
                }
                child.failOver = (fallback == null) ? root : fallback.next.get(c);
                // The failure target is strictly shallower than the child, so the breadth-first
                // order guarantees its matchLength has already been computed.
                child.matchLength = child.end ? child.depth : child.failOver.matchLength;
                queue.offer(child);
            }
        }
    }

    /**
     * Replaces every character that is part of a sensitive-word occurrence with {@code '*'}.
     *
     * @param matchWord text to filter
     * @return the masked text; the input itself when it is {@code null}, empty, contains no
     *         sensitive word, or when no trie has been built
     */
    public String match(String matchWord) {
        if (matchWord == null || matchWord.isEmpty() || root == null) {
            return matchWord;
        }
        char[] chars = matchWord.toCharArray();
        // matchLengthAt[i] = length of the longest sensitive word ending exactly at index i.
        int[] matchLengthAt = new int[chars.length];
        boolean found = false;
        Word state = root;
        for (int i = 0; i < chars.length; i++) {
            char c = chars[i];
            // Fall back along failure links until a transition on c exists or the root is reached.
            while (!state.hasChild(c) && state.failOver != null) {
                state = state.failOver;
            }
            Word nextState = state.next.get(c);
            if (nextState != null) {
                state = nextState;
                if (state.matchLength > 0) {
                    matchLengthAt[i] = state.matchLength;
                    found = true;
                }
            }
        }
        if (!found) {
            return matchWord;
        }
        // Backward sweep: 'pending' is how many characters, counting leftwards from the current
        // index, are still covered by a match that ends at or after this index.
        int pending = 0;
        for (int i = chars.length - 1; i >= 0; i--) {
            pending = Math.max(pending, matchLengthAt[i]);
            if (pending > 0) {
                chars[i] = MASK;
                pending--;
            }
        }
        return new String(chars);
    }
}

View File

@@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.utils.sensitiveWord;
package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import com.abin.mallchat.common.common.algorithm.ac.ACTrie;
import com.abin.mallchat.common.common.algorithm.ac.MatchResult;

View File

@@ -0,0 +1,35 @@
package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import com.abin.mallchat.common.common.algorithm.ac.ACTrie;
import com.abin.mallchat.common.common.algorithm.acpro.ACProTrie;
import io.micrometer.core.instrument.util.StringUtils;
import java.util.List;
import java.util.Objects;
/**
 * Sensitive-word filter backed by {@link ACProTrie}, an enhanced variant of the ACFilter.
 *
 * <p>Until {@link #loadWord(List)} has been called with a non-null list, no words are loaded:
 * {@link #filter(String)} returns its input unchanged and {@link #hasSensitiveWord(String)}
 * returns {@code false}.
 *
 * @author CtrlCver
 * @date 2024/1/12
 */
public class ACProFilter implements SensitiveWordFilter {

    /** Automaton built by loadWord; null until a non-null word list has been loaded. */
    private ACProTrie acProTrie;

    /**
     * Tells whether filtering the text would change it.
     *
     * @param text text to inspect
     * @return {@code true} when {@link #filter(String)} returns something different from
     *         {@code text}; {@code false} for blank text
     */
    @Override
    public boolean hasSensitiveWord(String text) {
        if (StringUtils.isBlank(text)) {
            return false;
        }
        return !Objects.equals(filter(text), text);
    }

    /**
     * Runs the text through the loaded automaton.
     *
     * @param text text to filter
     * @return the result of {@link ACProTrie#match(String)}; {@code text} itself when it is
     *         {@code null} or empty, or when no word list has been loaded yet
     */
    @Override
    public String filter(String text) {
        ACProTrie trie = acProTrie;
        if (trie == null || text == null || text.isEmpty()) {
            return text;
        }
        return trie.match(text);
    }

    /**
     * Replaces the current word list with the given one.
     *
     * @param words sensitive words; a {@code null} list leaves the current state untouched
     */
    @Override
    public void loadWord(List<String> words) {
        if (words == null) {
            return;
        }
        // Assign the field only after the trie is fully built, so filter() never
        // reads a trie whose root has not been created yet.
        ACProTrie trie = new ACProTrie();
        trie.createACTrie(words);
        acProTrie = trie;
    }
}

View File

@@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.utils.sensitiveWord;
package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;

View File

@@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.utils.sensitiveWord;
package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import java.util.List;

View File

@@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.utils.sensitiveWord;
package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import java.util.List;

View File

@@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.utils.sensitiveWord;
package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import java.util.List;

View File

@@ -1,7 +1,7 @@
package com.abin.mallchat.common.common.config;
import com.abin.mallchat.common.common.utils.sensitiveWord.DFAFilter;
import com.abin.mallchat.common.common.utils.sensitiveWord.SensitiveWordBs;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.DFAFilter;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.SensitiveWordBs;
import com.abin.mallchat.common.sensitive.MyWordFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;

View File

@@ -1,6 +1,6 @@
package com.abin.mallchat.common.sensitive;
import com.abin.mallchat.common.common.utils.sensitiveWord.IWordFactory;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.IWordFactory;
import com.abin.mallchat.common.sensitive.dao.SensitiveWordDao;
import com.abin.mallchat.common.sensitive.domain.SensitiveWord;
import org.springframework.beans.factory.annotation.Autowired;

View File

@@ -4,7 +4,7 @@ import cn.hutool.core.util.StrUtil;
import com.abin.mallchat.common.common.event.UserBlackEvent;
import com.abin.mallchat.common.common.event.UserRegisterEvent;
import com.abin.mallchat.common.common.utils.AssertUtil;
import com.abin.mallchat.common.common.utils.sensitiveWord.SensitiveWordBs;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.SensitiveWordBs;
import com.abin.mallchat.common.user.dao.BlackDao;
import com.abin.mallchat.common.user.dao.ItemConfigDao;
import com.abin.mallchat.common.user.dao.UserBackpackDao;

View File

@@ -1,11 +1,13 @@
package com.abin.mallchat.common;
import com.abin.mallchat.common.common.utils.sensitiveWord.ACFilter;
import com.abin.mallchat.common.common.utils.sensitiveWord.DFAFilter;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.ACFilter;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.ACProFilter;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.DFAFilter;
import org.junit.Test;
import java.util.Arrays;
import java.util.List;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.*;
/**
* Description:
@@ -21,6 +23,7 @@ public class SensitiveTest {
System.out.println(instance.hasSensitiveWord("adabcd"));
}
@Test
public void AC() {
List<String> sensitiveList = Arrays.asList("abcd", "abcbba", "adabca");
@@ -29,6 +32,14 @@ public class SensitiveTest {
instance.hasSensitiveWord("adabcd");
}
@Test
public void ACPro() {
    // Load three overlapping words, then print the filtered form of a
    // sentence that contains all of them.
    ACProFilter filter = new ACProFilter();
    filter.loadWord(Arrays.asList("白痴", "你是白痴", "白痴吗"));
    String masked = filter.filter("你是白痴吗");
    System.out.println(masked);
}
@Test
public void DFAMulti() {
List<String> sensitiveList = Arrays.asList("白痴", "你是白痴", "白痴吗");