diff --git a/mallchat-common/pom.xml b/mallchat-common/pom.xml
index aefa9d6..f97a2e2 100644
--- a/mallchat-common/pom.xml
+++ b/mallchat-common/pom.xml
@@ -114,6 +114,14 @@
redisson-spring-boot-starter
3.17.1
+
+
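+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>${junit.version}</version>
+            <scope>test</scope>
+        </dependency>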
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java
new file mode 100644
index 0000000..b8ab1fb
--- /dev/null
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java
@@ -0,0 +1,95 @@
+package com.abin.mallchat.common.common.algorithm.ac;
+
+import com.google.common.collect.Lists;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Aho-Corasick automaton (AC automaton): a keyword trie whose nodes carry failover
+ * links, so that every occurrence of every keyword is found in one pass over a text.
+ * <p>
+ * Created by berg on 2023/6/18.
+ */
+public class ACTrie {
+
+    // Root node; it represents the empty prefix and its failover is never assigned.
+    private ACTrieNode root;
+
+    /**
+     * Builds the trie from the given keywords and then computes the failover links.
+     *
+     * @param words keywords to search for; a {@code null} list is treated as empty,
+     *              {@code null} entries and duplicates are ignored
+     */
+    public ACTrie(List<String> words) {
+        root = new ACTrieNode();
+        if (words != null) {
+            List<String> uniqueWords = words.stream()
+                    .filter(Objects::nonNull)
+                    .distinct() // de-duplicate
+                    .collect(Collectors.toList());
+            for (String word : uniqueWords) {
+                addWord(word);
+            }
+        }
+        initTrieFailover();
+    }
+
+    /**
+     * Inserts one keyword into the trie and marks its last node as a keyword end.
+     * <p>
+     * Failover links are not recomputed here; call {@link #initTrieFailover()} after
+     * adding words to a trie that has already been built.
+     *
+     * @param word keyword to insert; {@code null} and empty strings are ignored
+     */
+    public void addWord(String word) {
+        if (word == null || word.isEmpty()) {
+            return;
+        }
+        ACTrieNode walkNode = root;
+        for (int i = 0; i < word.length(); i++) {
+            char c = word.charAt(i);
+            walkNode.addChildrenIfAbsent(c);
+            walkNode = walkNode.childOf(c);
+            // depth == number of characters on the path from root to this node
+            walkNode.setDepth(i + 1);
+        }
+        walkNode.setLeaf(true);
+    }
+
+    /**
+     * Computes the failover link of every node with a level-order (breadth-first)
+     * traversal, so that a node's link is derived from links of shallower nodes.
+     */
+    public void initTrieFailover() {
+        Queue<ACTrieNode> queue = new ArrayDeque<>();
+        // Nodes on the first level always fall back to root.
+        Map<Character, ACTrieNode> rootChildren = root.getChildren();
+        for (ACTrieNode node : rootChildren.values()) {
+            node.setFailover(root);
+            queue.offer(node);
+        }
+        // Remaining levels.
+        while (!queue.isEmpty()) {
+            ACTrieNode parentNode = queue.poll();
+            Map<Character, ACTrieNode> children = parentNode.getChildren();
+            for (Map.Entry<Character, ACTrieNode> entry : children.entrySet()) {
+                char c = entry.getKey();
+                ACTrieNode childNode = entry.getValue();
+                // Climb the parent's failover chain until a node that can be extended
+                // with c is found; its c-child is the longest proper suffix of
+                // childNode's string that is also a path in the trie.
+                ACTrieNode failover = parentNode.getFailover();
+                while (failover != null && !failover.hasChild(c)) {
+                    failover = failover.getFailover();
+                }
+                // null means the climb went past root: no such suffix exists.
+                childNode.setFailover(failover == null ? root : failover.childOf(c));
+                queue.offer(childNode);
+            }
+        }
+    }
+
+    /**
+     * Finds every occurrence of every keyword in the text, including occurrences that
+     * overlap or are nested inside a longer keyword.
+     *
+     * @param text text to scan; {@code null} yields an empty result
+     * @return one {@link MatchResult} per occurrence with an inclusive start index and an
+     * exclusive end index, ordered by end index; matches ending at the same index are
+     * listed longest first
+     */
+    public List<MatchResult> matches(String text) {
+        List<MatchResult> result = Lists.newArrayList();
+        if (text == null) {
+            return result;
+        }
+        ACTrieNode walkNode = root;
+        for (int i = 0; i < text.length(); i++) {
+            char c = text.charAt(i);
+            // Fall back until the current state can consume c or root is reached.
+            while (walkNode != root && !walkNode.hasChild(c)) {
+                ACTrieNode fallback = walkNode.getFailover();
+                walkNode = fallback != null ? fallback : root;
+            }
+            if (!walkNode.hasChild(c)) {
+                continue; // still at root: no keyword can continue with c
+            }
+            walkNode = walkNode.childOf(c);
+            // The state is kept after a hit so longer keywords sharing this prefix can
+            // still complete. Every node on the failover chain is a suffix of the text
+            // read so far, so each keyword end on the chain is a match ending at i.
+            for (ACTrieNode hit = walkNode; hit != null && hit != root; hit = hit.getFailover()) {
+                if (hit.isLeaf()) {
+                    result.add(new MatchResult(i - hit.getDepth() + 1, i + 1));
+                }
+            }
+        }
+        return result;
+    }
+
+}
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java
new file mode 100644
index 0000000..1068524
--- /dev/null
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java
@@ -0,0 +1,46 @@
+package com.abin.mallchat.common.common.algorithm.ac;
+
+import com.google.common.collect.Maps;
+import lombok.Getter;
+import lombok.Setter;
+
+import java.util.Map;
+
+/**
+ * A single node of the {@link ACTrie} keyword tree.
+ * <p>
+ * Created by berg on 2023/6/18.
+ */
+@Getter
+@Setter
+public class ACTrieNode {
+
+    // Child nodes, keyed by the character on the edge leading to each child.
+    private Map<Character, ACTrieNode> children = Maps.newHashMap();
+
+    // Node the matcher falls back to when it cannot continue from this node;
+    // null until a failover has been assigned.
+    private ACTrieNode failover = null;
+
+    // Set by ACTrie.addWord to the number of characters on the path from the root.
+    private int depth;
+
+    // True when a keyword ends at this node (the node may still have children).
+    private boolean isLeaf = false;
+
+    /**
+     * Creates an empty child for the given character unless one already exists.
+     *
+     * @param c character on the edge to the child
+     */
+    public void addChildrenIfAbsent(char c) {
+        children.computeIfAbsent(c, (key) -> new ACTrieNode());
+    }
+
+    /**
+     * @param c character on the edge to the child
+     * @return the child reached via {@code c}, or {@code null} if there is none
+     */
+    public ACTrieNode childOf(char c) {
+        return children.get(c);
+    }
+
+    /**
+     * @param c character on the edge to the child
+     * @return {@code true} if a child exists for {@code c}
+     */
+    public boolean hasChild(char c) {
+        return children.containsKey(c);
+    }
+
+    /**
+     * Renders this node without its children. The failover node is rendered through
+     * its own {@code toString()}, so the output nests the whole failover chain.
+     */
+    @Override
+    public String toString() {
+        return "ACTrieNode{" +
+                "failover=" + failover +
+                ", depth=" + depth +
+                ", isLeaf=" + isLeaf +
+                '}';
+    }
+}
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java
new file mode 100644
index 0000000..36087e4
--- /dev/null
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java
@@ -0,0 +1,24 @@
+package com.abin.mallchat.common.common.algorithm.ac;
+
+import lombok.*;
+
+/**
+ * Position of one keyword occurrence inside a searched text.
+ * <p>
+ * Created by berg on 2023/6/18.
+ */
+@AllArgsConstructor
+@Setter
+@Getter
+public class MatchResult {
+
+    // Index of the first matched character (as produced by ACTrie.matches).
+    private int startIndex;
+
+    // Index just past the last matched character (as produced by ACTrie.matches).
+    private int endIndex;
+
+    /**
+     * @return a debug representation listing both indices
+     */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder("MatchResult{");
+        text.append("startIndex=").append(startIndex);
+        text.append(", endIndex=").append(endIndex);
+        text.append('}');
+        return text.toString();
+    }
+}
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java
new file mode 100644
index 0000000..3b8d300
--- /dev/null
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java
@@ -0,0 +1,4 @@
+/**
+ * Created by berg on 2023/6/18.
+ */
+package com.abin.mallchat.common.common.algorithm;
\ No newline at end of file
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
index 1f38183..1213034 100644
--- a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
@@ -1,14 +1,17 @@
package com.abin.mallchat.common.common.utils;
+import com.abin.mallchat.common.common.algorithm.ac.ACTrie;
+import com.abin.mallchat.common.common.algorithm.ac.MatchResult;
import org.apache.commons.lang3.StringUtils;
-import java.io.*;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
import java.nio.file.Files;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
import java.util.Objects;
-import java.util.stream.Collectors;
/**
@@ -18,11 +21,9 @@ import java.util.stream.Collectors;
* @since 2023/06/11
*/
public final class SensitiveWordUtils {
- private static SensitiveWordList wordList;
- private final static char replace = '*'; // 替代字符
- private final static char[] skip = new char[]{ // 遇到这些字符就会跳过
- ' ', '!', '*', '-', '+', '_', '=', ',', ',', '.', '@', ';', ':', ';', ':'
- };
+ // Character written over every character of a matched sensitive word.
+ private final static char mask_char = '*'; // replacement character
+
+ // Matcher built by loadWord(...); starts out null.
+ private static ACTrie ac_trie = null;
/**
* 有敏感词
@@ -42,55 +43,19 @@ public final class SensitiveWordUtils {
* @return 替换后的文本
*/
public static String filter(String text) {
- if (wordList == null || wordList.size() == 0 || StringUtils.isBlank(text)) return text;
- char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历
- int i, j;
- Word word;
- boolean flag; // 是否需要替换
- for (i = 0; i < __char__.length; i++) { // 遍历所有字符
- char c = __char__[i];
- word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率
- if (word != null) { // word != null说明找到了
- flag = false;
- j = i + 1;
- while (j < __char__.length) { // 开始逐个比较后面的字符
- if (skip(__char__[j])) { // 跳过空格之类的无关字符
- j++;
- continue;
- }
- if (word.next != null) { // 字符串尚未结束,不确定是否存在敏感词
- /*
- 以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少
- 例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个
- 假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找
- */
- word = word.next.get(__char__[j]);
- if (word == null) {
- break;
- }
- j++;
- } else { // 字符串已结束,存在敏感词汇
- flag = true;
- break;
- }
- }
- if (word != null && word.next == null) {
- flag = true;
- }
- if (flag) { // 如果flag==true,说明检测出敏感粗,需要替换
- while (i < j) {
- // if(skip(__char__[i])){ // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句
- // i++;
- // continue;
- // }
- __char__[i] = replace;
- i++;
- }
- i--;
- }
- }
+        // Read the matcher once; it is still null when no word list has been loaded,
+        // in which case there is nothing to mask.
+        ACTrie trie = ac_trie;
+        if (trie == null || StringUtils.isBlank(text)) return text;
+        List<MatchResult> matchResults = trie.matches(text);
+        // The buffer never leaves this call, so an unsynchronized StringBuilder suffices.
+        StringBuilder result = new StringBuilder(text);
+        for (MatchResult matchResult : matchResults) {
+            replaceBetween(result, matchResult.getStartIndex(), matchResult.getEndIndex());
+        }
+        return result.toString();
+    }
+
+    /**
+     * Overwrites the characters in {@code [startIndex, endIndex)} with the mask character.
+     *
+     * @param buffer     text being masked, modified in place
+     * @param startIndex index of the first character to mask (inclusive)
+     * @param endIndex   index after the last character to mask (exclusive)
+     */
+    private static void replaceBetween(StringBuilder buffer, int startIndex, int endIndex) {
+        for (int i = startIndex; i < endIndex; i++) {
+            buffer.setCharAt(i, mask_char);
+        }
- return new String(__char__);
}
/**
@@ -100,26 +65,7 @@ public final class SensitiveWordUtils {
*/
public static void loadWord(List words) {
if (words == null) return;
- words = words.stream().distinct().collect(Collectors.toList()); // 去重
- char[] chars;
- SensitiveWordList now;
- Word word;
- wordList = new SensitiveWordList();
- for (String __word__ : words) {
- if (__word__ == null) continue;
- chars = __word__.toCharArray();
- now = wordList;
- word = null;
- for (char c : chars) {
- if (word != null) {
- if (word.next == null) word.next = new SensitiveWordList();
- now = word.next;
- }
- word = now.get(c);
- if (word == null) word = now.add(c);
- }
- }
- sort(wordList);
+        // Drop null entries before building the matcher so that a single bad entry
+        // cannot fail the whole load with a NullPointerException.
+        List<String> nonNullWords = new ArrayList<>(words.size());
+        for (String word : words) {
+            if (word != null) nonNullWords.add(word);
+        }
+        ac_trie = new ACTrie(nonNullWords);
}
/**
@@ -150,102 +96,8 @@ public final class SensitiveWordUtils {
}
}
- /**
- * 对敏感词多叉树递增排序
- *
- * @param list 待排序List
- */
- private static void sort(SensitiveWordList list) {
- if (list == null) return;
- Collections.sort(list); // 递增排序
- for (Word word : list) {
- sort(word.next);
- }
- }
- /**
- * 判断是否跳过当前字符
- *
- * @param c 待检测字符
- * @return true:需要跳过 false:不需要跳过
- */
- private static boolean skip(char c) {
- for (char c1 : skip) {
- if (c1 == c) return true;
- }
- return false;
- }
- /**
- * 敏感词列表
- *
- * @author zhaoyuhang
- * @since 2023/06/11
- */
- public static class SensitiveWordList extends ArrayList {
- public Word get(char c) {
- for (Word w : this) {
- if (w.c == c) return w;
- }
- return null;
- }
-
- /**
- * 二分查找,必须先升序排序
- *
- * @param c 需要查找的字符
- * @return Word对象:如果找到 null:如果没找到
- */
- public Word binaryGet(char c) {
- int left, right, key;
- Word word;
- left = 0;
- right = this.size() - 1;
- while (left <= right) {
- key = (left + right) / 2;
- word = get(key);
- if (word.c == c) {
- return word;
- } else if (word.c > c) {
- right = key - 1;
- } else {
- left = key + 1;
- }
- }
- return null;
- }
-
- public Word add(char c) {
- Word word = new Word(c);
- super.add(word);
- return word;
- }
-
- }
-
- /**
- * 敏感词
- *
- * @author zhaoyuhang
- * @since 2023/06/11
- */
- public static class Word implements Comparable {
- public char c;
- public SensitiveWordList next = null;
-
- public Word(char c) {
- this.c = c;
- }
-
- @Override
- public int compareTo(Word word) {
- return c - word.c;
- }
-
- public String toString() {
- return c + "(" + (next == null ? null : next.size()) + ")";
- }
- }
}
diff --git a/mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java b/mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java
new file mode 100644
index 0000000..04da0f7
--- /dev/null
+++ b/mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java
@@ -0,0 +1,52 @@
+package com.abin.mallchat.common.common.algorithm.ac;
+
+import com.google.common.collect.Lists;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Unit tests for {@link ACTrie#matches(String)}.
+ * <p>
+ * Created by berg on 2023/6/18.
+ */
+public class ACTrieTest {
+
+    // Keywords shared by all tests; consecutive entries overlap by two characters.
+    private final static List<String> ALPHABET = Lists.newArrayList("abc", "bcd", "cde");
+
+    private static ACTrie trie(List<String> keywords) {
+        return new ACTrie(keywords);
+    }
+
+    /** A keyword preceded by an unrelated character is found at the shifted position. */
+    @Test
+    public void test_TextIsLongerThanKeyword() {
+        final ACTrie trie = trie(ALPHABET);
+        final String text = " " + ALPHABET.get(0);
+        List<MatchResult> matchResults = trie.matches(text);
+        // Assert the size first so a missing match fails as an assertion, not an exception.
+        assertEquals(1, matchResults.size());
+        checkResult(matchResults.get(0), 1, 4, ALPHABET.get(0), text);
+    }
+
+    /** Only the keyword that equals the text is reported. */
+    @Test
+    public void test_VariousKeywordsOneMatch() {
+        final ACTrie trie = trie(ALPHABET);
+        final String text = "bcd";
+        List<MatchResult> matchResults = trie.matches(text);
+        assertEquals(1, matchResults.size());
+        checkResult(matchResults.get(0), 0, 3, ALPHABET.get(1), text);
+    }
+
+    /** Overlapping keywords are both reported, ordered by where they end. */
+    @Test
+    public void test_VariousKeywordsMultiMatch() {
+        final ACTrie trie = trie(ALPHABET);
+        final String text = "abcd";
+        List<MatchResult> matchResults = trie.matches(text);
+        assertEquals(2, matchResults.size());
+        checkResult(matchResults.get(0), 0, 3, ALPHABET.get(0), text);
+        checkResult(matchResults.get(1), 1, 4, ALPHABET.get(1), text);
+    }
+
+    /**
+     * Asserts the reported indices and that the expected range of the text spells the keyword.
+     */
+    private void checkResult(MatchResult matchResult, int expectedStart, int expectedEnd, String expectedKeyword, String text) {
+        assertEquals("Start of match should have been " + expectedStart, expectedStart, matchResult.getStartIndex());
+        assertEquals("End of match should have been " + expectedEnd, expectedEnd, matchResult.getEndIndex());
+        assertEquals(expectedKeyword, text.substring(expectedStart, expectedEnd));
+    }
+}