From da024f08e96beda1362e9124183c243e0673336e Mon Sep 17 00:00:00 2001 From: xiaocairush Date: Sun, 18 Jun 2023 10:40:25 +0800 Subject: [PATCH] =?UTF-8?q?feat;=E6=B7=BB=E5=8A=A0ac=E8=87=AA=E5=8A=A8?= =?UTF-8?q?=E6=9C=BA=E7=AE=97=E6=B3=95=E8=BF=9B=E8=A1=8C=E6=95=8F=E6=84=9F?= =?UTF-8?q?=E8=AF=8D=E8=BF=87=E6=BB=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mallchat-common/pom.xml | 8 + .../common/common/algorithm/ac/ACTrie.java | 95 +++++++++ .../common/algorithm/ac/ACTrieNode.java | 46 +++++ .../common/algorithm/ac/MatchResult.java | 24 +++ .../common/common/algorithm/package-info.java | 4 + .../common/utils/SensitiveWordUtils.java | 192 ++---------------- .../common/algorithm/ac/ACTrieTest.java | 52 +++++ 7 files changed, 251 insertions(+), 170 deletions(-) create mode 100644 mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java create mode 100644 mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java create mode 100644 mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java create mode 100644 mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java create mode 100644 mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java diff --git a/mallchat-common/pom.xml b/mallchat-common/pom.xml index aefa9d6..f97a2e2 100644 --- a/mallchat-common/pom.xml +++ b/mallchat-common/pom.xml @@ -114,6 +114,14 @@ redisson-spring-boot-starter 3.17.1 + + + + junit + junit + ${junit.version} + test + diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java new file mode 100644 index 0000000..b8ab1fb --- /dev/null +++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrie.java @@ -0,0 +1,95 @@ +package com.abin.mallchat.common.common.algorithm.ac; + +import com.google.common.collect.Lists; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * aho-corasick算法(又称AC自动机算法) + * Created by berg on 2023/6/18. + */ +public class ACTrie { + + // 根节点 + private ACTrieNode root; + + public ACTrie(List words) { + words = words.stream().distinct().collect(Collectors.toList()); // 去重 + root = new ACTrieNode(); + for (String word : words) { + addWord(word); + } + initTrieFailover(); + } + + public void addWord(String word) { + ACTrieNode walkNode = root; + char[] chars = word.toCharArray(); + for (int i = 0; i < word.length(); i++) { + walkNode.addChildrenIfAbsent(chars[i]); + walkNode = walkNode.childOf(chars[i]); + walkNode.setDepth(i + 1); + } + walkNode.setLeaf(true); + } + + public void initTrieFailover() { + //第一层的fail指针指向root + Queue queue = new LinkedList<>(); + Map childrens = root.getChildren(); + for (ACTrieNode node : childrens.values()) { + node.setFailover(root); + queue.offer(node); + } + //构建剩余层数节点的fail指针,利用层次遍历 + while (!queue.isEmpty()) { + ACTrieNode parentNode = queue.poll(); + for (Map.Entry entry : parentNode.getChildren().entrySet()) { + ACTrieNode childNode = entry.getValue(); + ACTrieNode failover = parentNode.getFailover(); + // 在树中找到以childNode为结尾的字符串的最长前缀匹配,failover指向了这个最长前缀匹配的父节点 + while (failover != null && (!failover.hasChild(entry.getKey()))) { + failover = failover.getFailover(); + } + //回溯到了root节点 + if (failover == null) { + childNode.setFailover(root); + } else { + // 更新当前节点的回退指针 + childNode.setFailover(failover.childOf(entry.getKey())); + } + queue.offer(childNode); + } + } + } + + /** + * 查询句子中包含的敏感词的起始位置和结束位置 + * + * @param text + */ + public List matches(String text) { + List result = Lists.newArrayList(); + ACTrieNode walkNode = root; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + while (!walkNode.hasChild(c) && walkNode.getFailover() != null) { + walkNode = walkNode.getFailover(); + } + //如果因为当前节点的孩子节点有这个字符,则将temp替换为下面的孩子节点 + if (walkNode.hasChild(c)) { + walkNode = walkNode.childOf(c); + // 检索到了敏感词 + if (walkNode.isLeaf()) { + result.add(new MatchResult(i - walkNode.getDepth() + 1, i + 1)); + // 模式串回退到最长可匹配前缀位置并开启新一轮的匹配 + // 这种回退方式将一个不漏的匹配到所有的敏感词,匹配结果的区间可能会有重叠的部分 + walkNode = walkNode.getFailover(); + } + } + } + return result; + } + +} diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java new file mode 100644 index 0000000..1068524 --- /dev/null +++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieNode.java @@ -0,0 +1,46 @@ +package com.abin.mallchat.common.common.algorithm.ac; + +import com.google.common.collect.Maps; +import lombok.Getter; +import lombok.Setter; + +import java.util.Map; + +/** + * Created by berg on 2023/6/18. + */ +@Getter +@Setter +public class ACTrieNode { + + // 子节点 + private Map children = Maps.newHashMap(); + + // 匹配过程中,如果模式串不匹配,模式串指针会回退到failover继续进行匹配 + private ACTrieNode failover = null; + + private int depth; + + private boolean isLeaf = false; + + public void addChildrenIfAbsent(char c) { + children.computeIfAbsent(c, (key) -> new ACTrieNode()); + } + + public ACTrieNode childOf(char c) { + return children.get(c); + } + + public boolean hasChild(char c) { + return children.containsKey(c); + } + + @Override + public String toString() { + return "ACTrieNode{" + + "failover=" + failover + + ", depth=" + depth + + ", isLeaf=" + isLeaf + + '}'; + } +} diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java new file mode 100644 index 0000000..36087e4 --- /dev/null +++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/ac/MatchResult.java @@ -0,0 +1,24 @@ +package com.abin.mallchat.common.common.algorithm.ac; + +import lombok.*; + +/** + * Created by berg on 2023/6/18. + */ +@Getter +@Setter +@AllArgsConstructor +public class MatchResult { + + private int startIndex; + + private int endIndex; + + @Override + public String toString() { + return "MatchResult{" + + "startIndex=" + startIndex + + ", endIndex=" + endIndex + + '}'; + } +} diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java new file mode 100644 index 0000000..3b8d300 --- /dev/null +++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/algorithm/package-info.java @@ -0,0 +1,4 @@ +/** + * Created by berg on 2023/6/18. + */ +package com.abin.mallchat.common.common.algorithm; \ No newline at end of file diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java index 1f38183..1213034 100644 --- a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java +++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java @@ -1,14 +1,17 @@ package com.abin.mallchat.common.common.utils; +import com.abin.mallchat.common.common.algorithm.ac.ACTrie; +import com.abin.mallchat.common.common.algorithm.ac.MatchResult; import org.apache.commons.lang3.StringUtils; -import java.io.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; import java.nio.file.Files; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import java.util.Objects; -import java.util.stream.Collectors; /** @@ -18,11 +21,9 @@ import java.util.stream.Collectors; * @since 2023/06/11 */ public final class SensitiveWordUtils { - private static SensitiveWordList wordList; - private final static char replace = '*'; // 替代字符 - private final static char[] skip = new char[]{ // 遇到这些字符就会跳过 - ' ', '!', '*', '-', '+', '_', '=', ',', ',', '.', '@', ';', ':', ';', ':' - }; + private final static char mask_char = '*'; // 替代字符 + + private static ACTrie ac_trie = null; /** * 有敏感词 @@ -42,55 +43,19 @@ public final class SensitiveWordUtils { * @return 替换后的文本 */ public static String filter(String text) { - if (wordList == null || wordList.size() == 0 || StringUtils.isBlank(text)) return text; - char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历 - int i, j; - Word word; - boolean flag; // 是否需要替换 - for (i = 0; i < __char__.length; i++) { // 遍历所有字符 - char c = __char__[i]; - word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率 - if (word != null) { // word != null说明找到了 - flag = false; - j = i + 1; - while (j < __char__.length) { // 开始逐个比较后面的字符 - if (skip(__char__[j])) { // 跳过空格之类的无关字符 - j++; - continue; - } - if (word.next != null) { // 字符串尚未结束,不确定是否存在敏感词 - /* - 以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少 - 例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个 - 假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找 - */ - word = word.next.get(__char__[j]); - if (word == null) { - break; - } - j++; - } else { // 字符串已结束,存在敏感词汇 - flag = true; - break; - } - } - if (word != null && word.next == null) { - flag = true; - } - if (flag) { // 如果flag==true,说明检测出敏感粗,需要替换 - while (i < j) { - // if(skip(__char__[i])){ // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句 - // i++; - // continue; - // } - __char__[i] = replace; - i++; - } - i--; - } - } + if (StringUtils.isBlank(text)) return text; + List matchResults = ac_trie.matches(text); + StringBuffer result = new StringBuffer(text); + for (MatchResult matchResult : matchResults) { + replaceBetween(result, matchResult.getStartIndex(), matchResult.getEndIndex()); + } + return result.toString(); + } + + private static void replaceBetween(StringBuffer buffer, int startIndex, int endIndex) { + for (int i = startIndex; i < endIndex; i++) { + buffer.setCharAt(i, mask_char); } - return new String(__char__); } /** @@ -100,26 +65,7 @@ public final class SensitiveWordUtils { */ public static void loadWord(List words) { if (words == null) return; - words = words.stream().distinct().collect(Collectors.toList()); // 去重 - char[] chars; - SensitiveWordList now; - Word word; - wordList = new SensitiveWordList(); - for (String __word__ : words) { - if (__word__ == null) continue; - chars = __word__.toCharArray(); - now = wordList; - word = null; - for (char c : chars) { - if (word != null) { - if (word.next == null) word.next = new SensitiveWordList(); - now = word.next; - } - word = now.get(c); - if (word == null) word = now.add(c); - } - } - sort(wordList); + ac_trie = new ACTrie(words); } /** @@ -150,102 +96,8 @@ public final class SensitiveWordUtils { } } - /** - * 对敏感词多叉树递增排序 - * - * @param list 待排序List - */ - private static void sort(SensitiveWordList list) { - if (list == null) return; - Collections.sort(list); // 递增排序 - for (Word word : list) { - sort(word.next); - } - } - /** - * 判断是否跳过当前字符 - * - * @param c 待检测字符 - * @return true:需要跳过 false:不需要跳过 - */ - private static boolean skip(char c) { - for (char c1 : skip) { - if (c1 == c) return true; - } - return false; - } - /** - * 敏感词列表 - * - * @author zhaoyuhang - * @since 2023/06/11 - */ - public static class SensitiveWordList extends ArrayList { - public Word get(char c) { - for (Word w : this) { - if (w.c == c) return w; - } - return null; - } - - /** - * 二分查找,必须先升序排序 - * - * @param c 需要查找的字符 - * @return Word对象:如果找到 null:如果没找到 - */ - public Word binaryGet(char c) { - int left, right, key; - Word word; - left = 0; - right = this.size() - 1; - while (left <= right) { - key = (left + right) / 2; - word = get(key); - if (word.c == c) { - return word; - } else if (word.c > c) { - right = key - 1; - } else { - left = key + 1; - } - } - return null; - } - - public Word add(char c) { - Word word = new Word(c); - super.add(word); - return word; - } - - } - - /** - * 敏感词 - * - * @author zhaoyuhang - * @since 2023/06/11 - */ - public static class Word implements Comparable { - public char c; - public SensitiveWordList next = null; - - public Word(char c) { - this.c = c; - } - - @Override - public int compareTo(Word word) { - return c - word.c; - } - - public String toString() { - return c + "(" + (next == null ? null : next.size()) + ")"; - } - } } diff --git a/mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java b/mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java new file mode 100644 index 0000000..04da0f7 --- /dev/null +++ b/mallchat-common/src/test/java/com/abin/mallchat/common/common/algorithm/ac/ACTrieTest.java @@ -0,0 +1,52 @@ +package com.abin.mallchat.common.common.algorithm.ac; + +import com.google.common.collect.Lists; +import org.junit.Test; + +import java.util.List; + +import static org.junit.Assert.assertEquals; + +/** + * Created by berg on 2023/6/18. + */ +public class ACTrieTest { + + private final static List ALPHABET = Lists.newArrayList("abc", "bcd", "cde"); + + private static ACTrie trie(List keywords) { + return new ACTrie(keywords); + } + + @Test + public void test_TextIsLongerThanKeyword() { + final ACTrie trie = trie(ALPHABET); + final String text = " " + ALPHABET.get(0); + List matchResults = trie.matches(text); + checkResult(matchResults.get(0), 1, 4, ALPHABET.get(0), text); + } + + @Test + public void test_VariousKeywordsOneMatch() { + final ACTrie trie = trie(ALPHABET); + final String text = "bcd"; + List matchResults = trie.matches(text); + checkResult(matchResults.get(0), 0, 3, ALPHABET.get(1), text); + } + + @Test + public void test_VariousKeywordsMultiMatch() { + final ACTrie trie = trie(ALPHABET); + final String text = "abcd"; + List matchResults = trie.matches(text); + assertEquals(2, matchResults.size()); + checkResult(matchResults.get(0), 0, 3, ALPHABET.get(0), text); + checkResult(matchResults.get(1), 1, 4, ALPHABET.get(1), text); + } + + private void checkResult(MatchResult matchResult, int expectedStart, int expectedEnd, String expectedKeyword, String text) { + assertEquals("Start of match should have been " + expectedStart, expectedStart, matchResult.getStartIndex()); + assertEquals("End of match should have been " + expectedEnd, expectedEnd, matchResult.getEndIndex()); + assertEquals(expectedKeyword, text.substring(expectedStart, expectedEnd)); + } +}