NOTE(review): this patch was re-formatted from a copy in which line breaks and all
angle-bracketed text (XML tags, Java generics) had been stripped. In the pom.xml hunk the
XML comment and the tag-only context lines are best-effort reconstructions; verify them
against mallchat-common/pom.xml before applying.

diff --git a/mallchat-common/pom.xml b/mallchat-common/pom.xml
index aefa9d6..f97a2e2 100644
--- a/mallchat-common/pom.xml
+++ b/mallchat-common/pom.xml
@@ -114,6 +114,14 @@
             <artifactId>redisson-spring-boot-starter</artifactId>
             <version>3.17.1</version>
         </dependency>
+
+        <!-- JUnit (test scope only) -->
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>${junit.version}</version>
+            <scope>test</scope>
+        </dependency>
     </dependencies>
 
 </project>
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java
new file mode 100644
--- /dev/null
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java
@@ -0,0 +1,134 @@
+package com.abin.mallchat.common.common.algorithm.ac;
+
+import com.google.common.collect.Lists;
+
+import javax.annotation.concurrent.NotThreadSafe;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Aho-Corasick automaton (AC automaton): finds every occurrence of a set of words
+ * in a text in a single pass over the text.
+ * <p>
+ * Created by berg on 2023/6/18.
+ */
+@NotThreadSafe
+public class ACTrie {
+
+    // Root of the trie; it stands for the empty prefix and is the only node whose failover is null.
+    private final ACTrieNode root;
+
+    /**
+     * Builds the automaton for the given words.
+     *
+     * @param words words to search for; a {@code null} list, null elements, empty strings and duplicates are ignored
+     */
+    public ACTrie(List<String> words) {
+        root = new ACTrieNode();
+        if (words != null) {
+            // De-duplicate so each word is inserted only once.
+            for (String word : words.stream().distinct().collect(Collectors.toList())) {
+                insert(word);
+            }
+        }
+        initFailover();
+    }
+
+    /**
+     * Adds one word and rebuilds every failover link, so the word is matched correctly
+     * even when it is added after construction. The rebuild visits the whole trie.
+     *
+     * @param word word to add; {@code null} or empty input is ignored
+     */
+    public void addWord(String word) {
+        insert(word);
+        initFailover();
+    }
+
+    // Inserts the characters of word into the trie; failover links are not updated here.
+    private void insert(String word) {
+        if (word == null || word.isEmpty()) {
+            // An empty word would mark the root itself as a match.
+            return;
+        }
+        ACTrieNode walkNode = root;
+        for (int i = 0; i < word.length(); i++) {
+            char c = word.charAt(i);
+            walkNode.addChildrenIfAbsent(c);
+            walkNode = walkNode.childOf(c);
+            walkNode.setDepth(i + 1);
+        }
+        walkNode.setLeaf(true);
+    }
+
+    /**
+     * (Re)computes the failover link of every node with a breadth-first traversal.
+     */
+    private void initFailover() {
+        // First level: failover always points to the root.
+        Queue<ACTrieNode> queue = new ArrayDeque<>();
+        for (ACTrieNode node : root.getChildren().values()) {
+            node.setFailover(root);
+            queue.offer(node);
+        }
+        // Remaining levels, in level order so a parent's failover is set before its children are visited.
+        while (!queue.isEmpty()) {
+            ACTrieNode parentNode = queue.poll();
+            for (Map.Entry<Character, ACTrieNode> entry : parentNode.getChildren().entrySet()) {
+                ACTrieNode childNode = entry.getValue();
+                ACTrieNode failover = parentNode.getFailover();
+                // Walk the parent's failover chain until a node that can be extended by this
+                // character is found; its child is the longest proper suffix of childNode's
+                // string that is also a prefix stored in the trie.
+                while (failover != null && !failover.hasChild(entry.getKey())) {
+                    failover = failover.getFailover();
+                }
+                if (failover == null) {
+                    // Walked past the root: no such suffix exists.
+                    childNode.setFailover(root);
+                } else {
+                    childNode.setFailover(failover.childOf(entry.getKey()));
+                }
+                queue.offer(childNode);
+            }
+        }
+    }
+
+    /**
+     * Finds every occurrence of every word in {@code text}, including overlapping
+     * occurrences and words that are contained in, or are prefixes of, other words.
+     *
+     * @param text text to scan; {@code null} or empty yields an empty list
+     * @return matches as half-open ranges {@code [startIndex, endIndex)}, ordered by
+     * ascending {@code endIndex} and, for equal {@code endIndex}, by ascending {@code startIndex}
+     */
+    public List<MatchResult> matches(String text) {
+        List<MatchResult> result = Lists.newArrayList();
+        if (text == null || text.isEmpty()) {
+            return result;
+        }
+        ACTrieNode walkNode = root;
+        for (int i = 0; i < text.length(); i++) {
+            char c = text.charAt(i);
+            // Fall back along failover links until the character can be consumed or the root is reached.
+            while (!walkNode.hasChild(c) && walkNode.getFailover() != null) {
+                walkNode = walkNode.getFailover();
+            }
+            if (!walkNode.hasChild(c)) {
+                // At the root with no transition for c: no word passes through this character.
+                continue;
+            }
+            walkNode = walkNode.childOf(c);
+            // Every node on the failover chain spells a suffix of the text read so far, so each
+            // leaf on the chain is a word ending at i. walkNode itself is left untouched so that
+            // longer words sharing this prefix can still be completed.
+            for (ACTrieNode hit = walkNode; hit != null && hit != root; hit = hit.getFailover()) {
+                if (hit.isLeaf()) {
+                    result.add(new MatchResult(i - hit.getDepth() + 1, i + 1));
+                }
+            }
+        }
+        return result;
+    }
+
+}
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java
new file mode 100644
--- /dev/null
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java
@@ -0,0 +1,56 @@
+package com.abin.mallchat.common.common.algorithm.ac;
+
+import com.google.common.collect.Maps;
+import lombok.Getter;
+import lombok.Setter;
+
+import java.util.Map;
+
+/**
+ * A node of {@link ACTrie}; getters and setters are generated by Lombok.
+ * <p>
+ * Created by berg on 2023/6/18.
+ */
+@Getter
+@Setter
+public class ACTrieNode {
+
+    // Child nodes, keyed by the character on the edge leading to them.
+    private Map<Character, ACTrieNode> children = Maps.newHashMap();
+
+    // Node the matcher falls back to when the next character has no child here.
+    private ACTrieNode failover = null;
+
+    // Number of characters on the path from the root to this node (set by ACTrie on insert).
+    private int depth;
+
+    // True when the path from the root to this node spells a complete word.
+    private boolean isLeaf = false;
+
+    /**
+     * Creates the child for {@code c} if it does not exist yet.
+     */
+    public void addChildrenIfAbsent(char c) {
+        children.computeIfAbsent(c, (key) -> new ACTrieNode());
+    }
+
+    /**
+     * @return the child reached by {@code c}, or {@code null} if there is none
+     */
+    public ACTrieNode childOf(char c) {
+        return children.get(c);
+    }
+
+    public boolean hasChild(char c) {
+        return children.containsKey(c);
+    }
+
+    @Override
+    public String toString() {
+        return "ACTrieNode{" +
+                "failover=" + failover +
+                ", depth=" + depth +
+                ", isLeaf=" + isLeaf +
+                '}';
+    }
+}
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java
new file mode 100644
--- /dev/null
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java
@@ -0,0 +1,28 @@
+package com.abin.mallchat.common.common.algorithm.ac;
+
+import lombok.*;
+
+/**
+ * Position of one matched word inside the scanned text.
+ * <p>
+ * Created by berg on 2023/6/18.
+ */
+@Getter
+@Setter
+@AllArgsConstructor
+public class MatchResult {
+
+    // Index of the first matched character (inclusive).
+    private int startIndex;
+
+    // Index just past the last matched character (exclusive).
+    private int endIndex;
+
+    @Override
+    public String toString() {
+        return "MatchResult{" +
+                "startIndex=" + startIndex +
+                ", endIndex=" + endIndex +
+                '}';
+    }
+}
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java
new file mode 100644
index 0000000..3b8d300
--- /dev/null
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java
@@ -0,0 +1,4 @@
+/**
+ * Created by berg on 2023/6/18.
+ */
+package com.abin.mallchat.common.common.algorithm;
\ No newline at end of file
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils0.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils0.java
new file mode 100644
--- /dev/null
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils0.java
@@ -0,0 +1,82 @@
+package com.abin.mallchat.common.common.utils;
+
+import com.abin.mallchat.common.common.algorithm.ac.ACTrie;
+import com.abin.mallchat.common.common.algorithm.ac.MatchResult;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.List;
+
+/**
+ * Sensitive-word filtering utility backed by an Aho-Corasick automaton ({@link ACTrie}).
+ * Per the original note it can replace the project's existing sensitive-word utility and
+ * keeps the same {@code hasSensitiveWord} API for compatibility.
+ * <p>
+ * Created by berg on 2023/6/18.
+ */
+public class SensitiveWordUtils0 {
+
+    private static final char MASK_CHAR = '*'; // replacement character
+
+    // Replaced wholesale by loadWord; volatile so readers on other threads see a fully built trie.
+    private static volatile ACTrie acTrie = null;
+
+    /**
+     * Tells whether the text contains at least one loaded sensitive word.
+     *
+     * @param text text to check
+     * @return {@code true} if a sensitive word occurs in {@code text}; {@code false} for blank
+     * text or when no word list has been loaded
+     */
+    public static boolean hasSensitiveWord(String text) {
+        ACTrie trie = acTrie;
+        if (trie == null || StringUtils.isBlank(text)) {
+            return false;
+        }
+        // Ask the matcher directly: comparing filter(text) with text would miss a word made only of '*'.
+        return !trie.matches(text).isEmpty();
+    }
+
+    /**
+     * Replaces every character of every sensitive word in the text with {@code '*'}.
+     *
+     * @param text text to filter
+     * @return the masked text; {@code text} itself when it is blank, contains no sensitive
+     * word, or no word list has been loaded
+     */
+    public static String filter(String text) {
+        ACTrie trie = acTrie;
+        if (trie == null || StringUtils.isBlank(text)) {
+            return text;
+        }
+        List<MatchResult> matchResults = trie.matches(text);
+        if (matchResults.isEmpty()) {
+            return text;
+        }
+        StringBuilder result = new StringBuilder(text);
+        // Matches may overlap; masking each range on its own is idempotent and order-independent.
+        for (MatchResult matchResult : matchResults) {
+            replaceBetween(result, matchResult.getStartIndex(), matchResult.getEndIndex());
+        }
+        return result.toString();
+    }
+
+    // Overwrites buffer[startIndex, endIndex) with the mask character.
+    private static void replaceBetween(StringBuilder buffer, int startIndex, int endIndex) {
+        for (int i = startIndex; i < endIndex; i++) {
+            buffer.setCharAt(i, MASK_CHAR);
+        }
+    }
+
+    /**
+     * Loads the sensitive-word list, replacing any previously loaded list.
+     *
+     * @param words sensitive words; {@code null} leaves the current list unchanged
+     */
+    public static void loadWord(List<String> words) {
+        if (words == null) {
+            return;
+        }
+        acTrie = new ACTrie(words);
+    }
+
+}
diff --git a/mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java b/mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java
new file mode 100644
--- /dev/null
+++ b/mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java
@@ -0,0 +1,95 @@
+package com.abin.mallchat.common.common.algorithm.ac;
+
+import com.google.common.collect.Lists;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Unit tests for {@link ACTrie}.
+ * <p>
+ * Created by berg on 2023/6/18.
+ */
+public class ACTrieTest {
+
+    private final static List<String> ALPHABET = Lists.newArrayList("abc", "bcd", "cde");
+
+    private static ACTrie trie(List<String> keywords) {
+        return new ACTrie(keywords);
+    }
+
+    @Test
+    public void test_TextIsLongerThanKeyword() {
+        final ACTrie trie = trie(ALPHABET);
+        final String text = " " + ALPHABET.get(0);
+        List<MatchResult> matchResults = trie.matches(text);
+        checkResult(matchResults.get(0), 1, 4, ALPHABET.get(0), text);
+    }
+
+    @Test
+    public void test_VariousKeywordsOneMatch() {
+        final ACTrie trie = trie(ALPHABET);
+        final String text = "bcd";
+        List<MatchResult> matchResults = trie.matches(text);
+        checkResult(matchResults.get(0), 0, 3, ALPHABET.get(1), text);
+    }
+
+    @Test
+    public void test_VariousKeywordsMultiMatch() {
+        final ACTrie trie = trie(ALPHABET);
+        final String text = "abcd";
+        List<MatchResult> matchResults = trie.matches(text);
+        assertEquals(2, matchResults.size());
+        checkResult(matchResults.get(0), 0, 3, ALPHABET.get(0), text);
+        checkResult(matchResults.get(1), 1, 4, ALPHABET.get(1), text);
+    }
+
+    @Test
+    public void test_VariousKeywordsMultiMatch2() {
+        final ACTrie trie = trie(ALPHABET);
+        final String text = "abcde";
+        List<MatchResult> matchResults = trie.matches(text);
+        assertEquals(3, matchResults.size());
+        checkResult(matchResults.get(0), 0, 3, ALPHABET.get(0), text);
+        checkResult(matchResults.get(1), 1, 4, ALPHABET.get(1), text);
+        checkResult(matchResults.get(2), 2, 5, ALPHABET.get(2), text);
+    }
+
+    @Test
+    public void test_KeywordIsPrefixOfAnotherKeyword() {
+        final ACTrie trie = trie(Lists.newArrayList("ab", "abc"));
+        final String text = "abc";
+        List<MatchResult> matchResults = trie.matches(text);
+        assertEquals(2, matchResults.size());
+        checkResult(matchResults.get(0), 0, 2, "ab", text);
+        checkResult(matchResults.get(1), 0, 3, "abc", text);
+    }
+
+    @Test
+    public void test_KeywordIsInsideAnotherKeyword() {
+        final ACTrie trie = trie(Lists.newArrayList("abcd", "bc"));
+        final String text = "abcd";
+        List<MatchResult> matchResults = trie.matches(text);
+        assertEquals(2, matchResults.size());
+        checkResult(matchResults.get(0), 1, 3, "bc", text);
+        checkResult(matchResults.get(1), 0, 4, "abcd", text);
+    }
+
+    @Test
+    public void test_AddWordAfterConstruction() {
+        final ACTrie trie = trie(Lists.newArrayList("abc"));
+        trie.addWord("bcd");
+        List<MatchResult> matchResults = trie.matches("abcd");
+        assertEquals(2, matchResults.size());
+        checkResult(matchResults.get(0), 0, 3, "abc", "abcd");
+        checkResult(matchResults.get(1), 1, 4, "bcd", "abcd");
+    }
+
+    private void checkResult(MatchResult matchResult, int expectedStart, int expectedEnd, String expectedKeyword, String text) {
+        assertEquals("Start of match should have been " + expectedStart, expectedStart, matchResult.getStartIndex());
+        assertEquals("End of match should have been " + expectedEnd, expectedEnd, matchResult.getEndIndex());
+        assertEquals(expectedKeyword, text.substring(expectedStart, expectedEnd));
+    }
+}