mirror of
https://github.com/zongzibinbin/MallChat.git
synced 2026-03-13 21:53:41 +08:00
feat;添加ac自动机算法进行敏感词过滤
This commit is contained in:
@@ -114,6 +114,14 @@
|
||||
<artifactId>redisson-spring-boot-starter</artifactId>
|
||||
<version>3.17.1</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Used for unit testing -->
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>${junit.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
|
||||
@@ -0,0 +1,95 @@
|
||||
package com.abin.mallchat.common.common.algorithm.ac;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* aho-corasick算法(又称AC自动机算法)
|
||||
* Created by berg on 2023/6/18.
|
||||
*/
|
||||
public class ACTrie {
|
||||
|
||||
// 根节点
|
||||
private ACTrieNode root;
|
||||
|
||||
public ACTrie(List<String> words) {
|
||||
words = words.stream().distinct().collect(Collectors.toList()); // 去重
|
||||
root = new ACTrieNode();
|
||||
for (String word : words) {
|
||||
addWord(word);
|
||||
}
|
||||
initTrieFailover();
|
||||
}
|
||||
|
||||
public void addWord(String word) {
|
||||
ACTrieNode walkNode = root;
|
||||
char[] chars = word.toCharArray();
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
walkNode.addChildrenIfAbsent(chars[i]);
|
||||
walkNode = walkNode.childOf(chars[i]);
|
||||
walkNode.setDepth(i + 1);
|
||||
}
|
||||
walkNode.setLeaf(true);
|
||||
}
|
||||
|
||||
public void initTrieFailover() {
|
||||
//第一层的fail指针指向root
|
||||
Queue<ACTrieNode> queue = new LinkedList<>();
|
||||
Map<Character, ACTrieNode> childrens = root.getChildren();
|
||||
for (ACTrieNode node : childrens.values()) {
|
||||
node.setFailover(root);
|
||||
queue.offer(node);
|
||||
}
|
||||
//构建剩余层数节点的fail指针,利用层次遍历
|
||||
while (!queue.isEmpty()) {
|
||||
ACTrieNode parentNode = queue.poll();
|
||||
for (Map.Entry<Character, ACTrieNode> entry : parentNode.getChildren().entrySet()) {
|
||||
ACTrieNode childNode = entry.getValue();
|
||||
ACTrieNode failover = parentNode.getFailover();
|
||||
// 在树中找到以childNode为结尾的字符串的最长前缀匹配,failover指向了这个最长前缀匹配的父节点
|
||||
while (failover != null && (!failover.hasChild(entry.getKey()))) {
|
||||
failover = failover.getFailover();
|
||||
}
|
||||
//回溯到了root节点
|
||||
if (failover == null) {
|
||||
childNode.setFailover(root);
|
||||
} else {
|
||||
// 更新当前节点的回退指针
|
||||
childNode.setFailover(failover.childOf(entry.getKey()));
|
||||
}
|
||||
queue.offer(childNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 查询句子中包含的敏感词的起始位置和结束位置
|
||||
*
|
||||
* @param text
|
||||
*/
|
||||
public List<MatchResult> matches(String text) {
|
||||
List<MatchResult> result = Lists.newArrayList();
|
||||
ACTrieNode walkNode = root;
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
while (!walkNode.hasChild(c) && walkNode.getFailover() != null) {
|
||||
walkNode = walkNode.getFailover();
|
||||
}
|
||||
//如果因为当前节点的孩子节点有这个字符,则将temp替换为下面的孩子节点
|
||||
if (walkNode.hasChild(c)) {
|
||||
walkNode = walkNode.childOf(c);
|
||||
// 检索到了敏感词
|
||||
if (walkNode.isLeaf()) {
|
||||
result.add(new MatchResult(i - walkNode.getDepth() + 1, i + 1));
|
||||
// 模式串回退到最长可匹配前缀位置并开启新一轮的匹配
|
||||
// 这种回退方式将一个不漏的匹配到所有的敏感词,匹配结果的区间可能会有重叠的部分
|
||||
walkNode = walkNode.getFailover();
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
package com.abin.mallchat.common.common.algorithm.ac;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by berg on 2023/6/18.
|
||||
*/
|
||||
@Getter
|
||||
@Setter
|
||||
public class ACTrieNode {
|
||||
|
||||
// 子节点
|
||||
private Map<Character, ACTrieNode> children = Maps.newHashMap();
|
||||
|
||||
// 匹配过程中,如果模式串不匹配,模式串指针会回退到failover继续进行匹配
|
||||
private ACTrieNode failover = null;
|
||||
|
||||
private int depth;
|
||||
|
||||
private boolean isLeaf = false;
|
||||
|
||||
public void addChildrenIfAbsent(char c) {
|
||||
children.computeIfAbsent(c, (key) -> new ACTrieNode());
|
||||
}
|
||||
|
||||
public ACTrieNode childOf(char c) {
|
||||
return children.get(c);
|
||||
}
|
||||
|
||||
public boolean hasChild(char c) {
|
||||
return children.containsKey(c);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ACTrieNode{" +
|
||||
"failover=" + failover +
|
||||
", depth=" + depth +
|
||||
", isLeaf=" + isLeaf +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
package com.abin.mallchat.common.common.algorithm.ac;
|
||||
|
||||
import lombok.*;
|
||||
|
||||
/**
|
||||
* Created by berg on 2023/6/18.
|
||||
*/
|
||||
@Getter
|
||||
@Setter
|
||||
@AllArgsConstructor
|
||||
public class MatchResult {
|
||||
|
||||
private int startIndex;
|
||||
|
||||
private int endIndex;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "MatchResult{" +
|
||||
"startIndex=" + startIndex +
|
||||
", endIndex=" + endIndex +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,4 @@
|
||||
/**
|
||||
* Created by berg on 2023/6/18.
|
||||
*/
|
||||
package com.abin.mallchat.common.common.algorithm;
|
||||
@@ -1,14 +1,17 @@
|
||||
package com.abin.mallchat.common.common.utils;
|
||||
|
||||
import com.abin.mallchat.common.common.algorithm.ac.ACTrie;
|
||||
import com.abin.mallchat.common.common.algorithm.ac.MatchResult;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.file.Files;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
/**
|
||||
@@ -18,11 +21,9 @@ import java.util.stream.Collectors;
|
||||
* @since 2023/06/11
|
||||
*/
|
||||
public final class SensitiveWordUtils {
|
||||
private static SensitiveWordList wordList;
|
||||
private final static char replace = '*'; // 替代字符
|
||||
private final static char[] skip = new char[]{ // 遇到这些字符就会跳过
|
||||
' ', '!', '*', '-', '+', '_', '=', ',', ',', '.', '@', ';', ':', ';', ':'
|
||||
};
|
||||
private final static char mask_char = '*'; // 替代字符
|
||||
|
||||
private static ACTrie ac_trie = null;
|
||||
|
||||
/**
|
||||
* 有敏感词
|
||||
@@ -42,56 +43,20 @@ public final class SensitiveWordUtils {
|
||||
* @return 替换后的文本
|
||||
*/
|
||||
public static String filter(String text) {
|
||||
if (wordList == null || wordList.size() == 0 || StringUtils.isBlank(text)) return text;
|
||||
char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历
|
||||
int i, j;
|
||||
Word word;
|
||||
boolean flag; // 是否需要替换
|
||||
for (i = 0; i < __char__.length; i++) { // 遍历所有字符
|
||||
char c = __char__[i];
|
||||
word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率
|
||||
if (word != null) { // word != null说明找到了
|
||||
flag = false;
|
||||
j = i + 1;
|
||||
while (j < __char__.length) { // 开始逐个比较后面的字符
|
||||
if (skip(__char__[j])) { // 跳过空格之类的无关字符
|
||||
j++;
|
||||
continue;
|
||||
if (StringUtils.isBlank(text)) return text;
|
||||
List<MatchResult> matchResults = ac_trie.matches(text);
|
||||
StringBuffer result = new StringBuffer(text);
|
||||
for (MatchResult matchResult : matchResults) {
|
||||
replaceBetween(result, matchResult.getStartIndex(), matchResult.getEndIndex());
|
||||
}
|
||||
if (word.next != null) { // 字符串尚未结束,不确定是否存在敏感词
|
||||
/*
|
||||
以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少
|
||||
例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个
|
||||
假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找
|
||||
*/
|
||||
word = word.next.get(__char__[j]);
|
||||
if (word == null) {
|
||||
break;
|
||||
return result.toString();
|
||||
}
|
||||
j++;
|
||||
} else { // 字符串已结束,存在敏感词汇
|
||||
flag = true;
|
||||
break;
|
||||
|
||||
private static void replaceBetween(StringBuffer buffer, int startIndex, int endIndex) {
|
||||
for (int i = startIndex; i < endIndex; i++) {
|
||||
buffer.setCharAt(i, mask_char);
|
||||
}
|
||||
}
|
||||
if (word != null && word.next == null) {
|
||||
flag = true;
|
||||
}
|
||||
if (flag) { // 如果flag==true,说明检测出敏感粗,需要替换
|
||||
while (i < j) {
|
||||
// if(skip(__char__[i])){ // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句
|
||||
// i++;
|
||||
// continue;
|
||||
// }
|
||||
__char__[i] = replace;
|
||||
i++;
|
||||
}
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
return new String(__char__);
|
||||
}
|
||||
|
||||
/**
|
||||
* 加载敏感词列表
|
||||
@@ -100,26 +65,7 @@ public final class SensitiveWordUtils {
|
||||
*/
|
||||
public static void loadWord(List<String> words) {
|
||||
if (words == null) return;
|
||||
words = words.stream().distinct().collect(Collectors.toList()); // 去重
|
||||
char[] chars;
|
||||
SensitiveWordList now;
|
||||
Word word;
|
||||
wordList = new SensitiveWordList();
|
||||
for (String __word__ : words) {
|
||||
if (__word__ == null) continue;
|
||||
chars = __word__.toCharArray();
|
||||
now = wordList;
|
||||
word = null;
|
||||
for (char c : chars) {
|
||||
if (word != null) {
|
||||
if (word.next == null) word.next = new SensitiveWordList();
|
||||
now = word.next;
|
||||
}
|
||||
word = now.get(c);
|
||||
if (word == null) word = now.add(c);
|
||||
}
|
||||
}
|
||||
sort(wordList);
|
||||
ac_trie = new ACTrie(words);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -150,102 +96,8 @@ public final class SensitiveWordUtils {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 对敏感词多叉树递增排序
|
||||
*
|
||||
* @param list 待排序List
|
||||
*/
|
||||
private static void sort(SensitiveWordList list) {
|
||||
if (list == null) return;
|
||||
Collections.sort(list); // 递增排序
|
||||
for (Word word : list) {
|
||||
sort(word.next);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断是否跳过当前字符
|
||||
*
|
||||
* @param c 待检测字符
|
||||
* @return true:需要跳过 false:不需要跳过
|
||||
*/
|
||||
private static boolean skip(char c) {
|
||||
for (char c1 : skip) {
|
||||
if (c1 == c) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* 敏感词列表
|
||||
*
|
||||
* @author zhaoyuhang
|
||||
* @since 2023/06/11
|
||||
*/
|
||||
public static class SensitiveWordList extends ArrayList<Word> {
|
||||
public Word get(char c) {
|
||||
for (Word w : this) {
|
||||
if (w.c == c) return w;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 二分查找,必须先升序排序
|
||||
*
|
||||
* @param c 需要查找的字符
|
||||
* @return Word对象:如果找到 null:如果没找到
|
||||
*/
|
||||
public Word binaryGet(char c) {
|
||||
int left, right, key;
|
||||
Word word;
|
||||
left = 0;
|
||||
right = this.size() - 1;
|
||||
while (left <= right) {
|
||||
key = (left + right) / 2;
|
||||
word = get(key);
|
||||
if (word.c == c) {
|
||||
return word;
|
||||
} else if (word.c > c) {
|
||||
right = key - 1;
|
||||
} else {
|
||||
left = key + 1;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public Word add(char c) {
|
||||
Word word = new Word(c);
|
||||
super.add(word);
|
||||
return word;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 敏感词
|
||||
*
|
||||
* @author zhaoyuhang
|
||||
* @since 2023/06/11
|
||||
*/
|
||||
public static class Word implements Comparable<Word> {
|
||||
public char c;
|
||||
public SensitiveWordList next = null;
|
||||
|
||||
public Word(char c) {
|
||||
this.c = c;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(Word word) {
|
||||
return c - word.c;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return c + "(" + (next == null ? null : next.size()) + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
package com.abin.mallchat.common.common.algorithm.ac;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
/**
|
||||
* Created by berg on 2023/6/18.
|
||||
*/
|
||||
public class ACTrieTest {
|
||||
|
||||
private final static List<String> ALPHABET = Lists.newArrayList("abc", "bcd", "cde");
|
||||
|
||||
private static ACTrie trie(List<String> keywords) {
|
||||
return new ACTrie(keywords);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_TextIsLongerThanKeyword() {
|
||||
final ACTrie trie = trie(ALPHABET);
|
||||
final String text = " " + ALPHABET.get(0);
|
||||
List<MatchResult> matchResults = trie.matches(text);
|
||||
checkResult(matchResults.get(0), 1, 4, ALPHABET.get(0), text);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_VariousKeywordsOneMatch() {
|
||||
final ACTrie trie = trie(ALPHABET);
|
||||
final String text = "bcd";
|
||||
List<MatchResult> matchResults = trie.matches(text);
|
||||
checkResult(matchResults.get(0), 0, 3, ALPHABET.get(1), text);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_VariousKeywordsMultiMatch() {
|
||||
final ACTrie trie = trie(ALPHABET);
|
||||
final String text = "abcd";
|
||||
List<MatchResult> matchResults = trie.matches(text);
|
||||
assertEquals(2, matchResults.size());
|
||||
checkResult(matchResults.get(0), 0, 3, ALPHABET.get(0), text);
|
||||
checkResult(matchResults.get(1), 1, 4, ALPHABET.get(1), text);
|
||||
}
|
||||
|
||||
private void checkResult(MatchResult matchResult, int expectedStart, int expectedEnd, String expectedKeyword, String text) {
|
||||
assertEquals("Start of match should have been " + expectedStart, expectedStart, matchResult.getStartIndex());
|
||||
assertEquals("End of match should have been " + expectedEnd, expectedEnd, matchResult.getEndIndex());
|
||||
assertEquals(expectedKeyword, text.substring(expectedStart, expectedEnd));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user