mirror of
https://github.com/zongzibinbin/MallChat.git
synced 2026-03-13 21:53:41 +08:00
Merge remote-tracking branch 'origin/main'
This commit is contained in:
@@ -114,6 +114,14 @@
|
|||||||
<artifactId>redisson-spring-boot-starter</artifactId>
|
<artifactId>redisson-spring-boot-starter</artifactId>
|
||||||
<version>3.17.1</version>
|
<version>3.17.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<!-- Used for unit testing -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<version>${junit.version}</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
|
|||||||
@@ -0,0 +1,100 @@
|
|||||||
|
package com.abin.mallchat.common.common.algorithm.ac;
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
import javax.annotation.concurrent.NotThreadSafe;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* aho-corasick算法(又称AC自动机算法)
|
||||||
|
* Created by berg on 2023/6/18.
|
||||||
|
*/
|
||||||
|
@NotThreadSafe
|
||||||
|
public class ACTrie {
|
||||||
|
|
||||||
|
// 根节点
|
||||||
|
private ACTrieNode root;
|
||||||
|
|
||||||
|
public ACTrie(List<String> words) {
|
||||||
|
words = words.stream().distinct().collect(Collectors.toList()); // 去重
|
||||||
|
root = new ACTrieNode();
|
||||||
|
for (String word : words) {
|
||||||
|
addWord(word);
|
||||||
|
}
|
||||||
|
initFailover();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addWord(String word) {
|
||||||
|
ACTrieNode walkNode = root;
|
||||||
|
char[] chars = word.toCharArray();
|
||||||
|
for (int i = 0; i < word.length(); i++) {
|
||||||
|
walkNode.addChildrenIfAbsent(chars[i]);
|
||||||
|
walkNode = walkNode.childOf(chars[i]);
|
||||||
|
walkNode.setDepth(i + 1);
|
||||||
|
}
|
||||||
|
walkNode.setLeaf(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 初始化节点中的回退指针
|
||||||
|
*/
|
||||||
|
private void initFailover() {
|
||||||
|
//第一层的fail指针指向root
|
||||||
|
Queue<ACTrieNode> queue = new LinkedList<>();
|
||||||
|
Map<Character, ACTrieNode> children = root.getChildren();
|
||||||
|
for (ACTrieNode node : children.values()) {
|
||||||
|
node.setFailover(root);
|
||||||
|
queue.offer(node);
|
||||||
|
}
|
||||||
|
//构建剩余层数节点的fail指针,利用层次遍历
|
||||||
|
while (!queue.isEmpty()) {
|
||||||
|
ACTrieNode parentNode = queue.poll();
|
||||||
|
for (Map.Entry<Character, ACTrieNode> entry : parentNode.getChildren().entrySet()) {
|
||||||
|
ACTrieNode childNode = entry.getValue();
|
||||||
|
ACTrieNode failover = parentNode.getFailover();
|
||||||
|
// 在树中找到以childNode为结尾的字符串的最长前缀匹配,failover指向了这个最长前缀匹配的父节点
|
||||||
|
while (failover != null && (!failover.hasChild(entry.getKey()))) {
|
||||||
|
failover = failover.getFailover();
|
||||||
|
}
|
||||||
|
//回溯到了root节点
|
||||||
|
if (failover == null) {
|
||||||
|
childNode.setFailover(root);
|
||||||
|
} else {
|
||||||
|
// 更新当前节点的回退指针
|
||||||
|
childNode.setFailover(failover.childOf(entry.getKey()));
|
||||||
|
}
|
||||||
|
queue.offer(childNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 查询句子中包含的敏感词的起始位置和结束位置
|
||||||
|
*
|
||||||
|
* @param text
|
||||||
|
*/
|
||||||
|
public List<MatchResult> matches(String text) {
|
||||||
|
List<MatchResult> result = Lists.newArrayList();
|
||||||
|
ACTrieNode walkNode = root;
|
||||||
|
for (int i = 0; i < text.length(); i++) {
|
||||||
|
char c = text.charAt(i);
|
||||||
|
while (!walkNode.hasChild(c) && walkNode.getFailover() != null) {
|
||||||
|
walkNode = walkNode.getFailover();
|
||||||
|
}
|
||||||
|
//如果因为当前节点的孩子节点有这个字符,则将walkNode替换为下面的孩子节点
|
||||||
|
if (walkNode.hasChild(c)) {
|
||||||
|
walkNode = walkNode.childOf(c);
|
||||||
|
// 检索到了敏感词
|
||||||
|
if (walkNode.isLeaf()) {
|
||||||
|
result.add(new MatchResult(i - walkNode.getDepth() + 1, i + 1));
|
||||||
|
// 模式串回退到最长可匹配前缀位置并开启新一轮的匹配
|
||||||
|
// 这种回退方式将一个不漏的匹配到所有的敏感词,匹配结果的区间可能会有重叠的部分
|
||||||
|
walkNode = walkNode.getFailover();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@@ -0,0 +1,46 @@
|
|||||||
|
package com.abin.mallchat.common.common.algorithm.ac;
|
||||||
|
|
||||||
|
import com.google.common.collect.Maps;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by berg on 2023/6/18.
|
||||||
|
*/
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
public class ACTrieNode {
|
||||||
|
|
||||||
|
// 子节点
|
||||||
|
private Map<Character, ACTrieNode> children = Maps.newHashMap();
|
||||||
|
|
||||||
|
// 匹配过程中,如果模式串不匹配,模式串指针会回退到failover继续进行匹配
|
||||||
|
private ACTrieNode failover = null;
|
||||||
|
|
||||||
|
private int depth;
|
||||||
|
|
||||||
|
private boolean isLeaf = false;
|
||||||
|
|
||||||
|
public void addChildrenIfAbsent(char c) {
|
||||||
|
children.computeIfAbsent(c, (key) -> new ACTrieNode());
|
||||||
|
}
|
||||||
|
|
||||||
|
public ACTrieNode childOf(char c) {
|
||||||
|
return children.get(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasChild(char c) {
|
||||||
|
return children.containsKey(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "ACTrieNode{" +
|
||||||
|
"failover=" + failover +
|
||||||
|
", depth=" + depth +
|
||||||
|
", isLeaf=" + isLeaf +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
package com.abin.mallchat.common.common.algorithm.ac;
|
||||||
|
|
||||||
|
import lombok.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by berg on 2023/6/18.
|
||||||
|
*/
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class MatchResult {
|
||||||
|
|
||||||
|
private int startIndex;
|
||||||
|
|
||||||
|
private int endIndex;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "MatchResult{" +
|
||||||
|
"startIndex=" + startIndex +
|
||||||
|
", endIndex=" + endIndex +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
/**
|
||||||
|
* Created by berg on 2023/6/18.
|
||||||
|
*/
|
||||||
|
package com.abin.mallchat.common.common.algorithm;
|
||||||
@@ -0,0 +1,70 @@
|
|||||||
|
package com.abin.mallchat.common.common.utils;
|
||||||
|
|
||||||
|
import com.abin.mallchat.common.common.algorithm.ac.ACTrie;
|
||||||
|
import com.abin.mallchat.common.common.algorithm.ac.MatchResult;
|
||||||
|
import org.HdrHistogram.ConcurrentHistogram;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 基于ac自动机实现的敏感词过滤工具类
|
||||||
|
* 可以用来替代{@link ConcurrentHistogram}
|
||||||
|
* 为了兼容提供了相同的api接口 {@code hasSensitiveWord}
|
||||||
|
*
|
||||||
|
* Created by berg on 2023/6/18.
|
||||||
|
*/
|
||||||
|
public class SensitiveWordUtils0 {
|
||||||
|
|
||||||
|
private final static char mask_char = '*'; // 替代字符
|
||||||
|
|
||||||
|
private static ACTrie ac_trie = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 有敏感词
|
||||||
|
*
|
||||||
|
* @param text 文本
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
|
public static boolean hasSensitiveWord(String text) {
|
||||||
|
if (StringUtils.isBlank(text)) return false;
|
||||||
|
return !Objects.equals(filter(text), text);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 敏感词替换
|
||||||
|
*
|
||||||
|
* @param text 待替换文本
|
||||||
|
* @return 替换后的文本
|
||||||
|
*/
|
||||||
|
public static String filter(String text) {
|
||||||
|
if (StringUtils.isBlank(text)) return text;
|
||||||
|
List<MatchResult> matchResults = ac_trie.matches(text);
|
||||||
|
StringBuffer result = new StringBuffer(text);
|
||||||
|
// matchResults是按照startIndex排序的,因此可以通过不断更新endIndex最大值的方式算出尚未被替代部分
|
||||||
|
int endIndex = 0;
|
||||||
|
for (MatchResult matchResult : matchResults) {
|
||||||
|
endIndex = Math.max(endIndex, matchResult.getEndIndex());
|
||||||
|
replaceBetween(result, matchResult.getStartIndex(), endIndex);
|
||||||
|
}
|
||||||
|
return result.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void replaceBetween(StringBuffer buffer, int startIndex, int endIndex) {
|
||||||
|
for (int i = startIndex; i < endIndex; i++) {
|
||||||
|
buffer.setCharAt(i, mask_char);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 加载敏感词列表
|
||||||
|
*
|
||||||
|
* @param words 敏感词数组
|
||||||
|
*/
|
||||||
|
public static void loadWord(List<String> words) {
|
||||||
|
if (words == null) return;
|
||||||
|
ac_trie = new ACTrie(words);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@@ -0,0 +1,63 @@
|
|||||||
|
package com.abin.mallchat.common.common.algorithm.ac;
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by berg on 2023/6/18.
|
||||||
|
*/
|
||||||
|
public class ACTrieTest {
|
||||||
|
|
||||||
|
private final static List<String> ALPHABET = Lists.newArrayList("abc", "bcd", "cde");
|
||||||
|
|
||||||
|
private static ACTrie trie(List<String> keywords) {
|
||||||
|
return new ACTrie(keywords);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_TextIsLongerThanKeyword() {
|
||||||
|
final ACTrie trie = trie(ALPHABET);
|
||||||
|
final String text = " " + ALPHABET.get(0);
|
||||||
|
List<MatchResult> matchResults = trie.matches(text);
|
||||||
|
checkResult(matchResults.get(0), 1, 4, ALPHABET.get(0), text);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_VariousKeywordsOneMatch() {
|
||||||
|
final ACTrie trie = trie(ALPHABET);
|
||||||
|
final String text = "bcd";
|
||||||
|
List<MatchResult> matchResults = trie.matches(text);
|
||||||
|
checkResult(matchResults.get(0), 0, 3, ALPHABET.get(1), text);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_VariousKeywordsMultiMatch() {
|
||||||
|
final ACTrie trie = trie(ALPHABET);
|
||||||
|
final String text = "abcd";
|
||||||
|
List<MatchResult> matchResults = trie.matches(text);
|
||||||
|
assertEquals(2, matchResults.size());
|
||||||
|
checkResult(matchResults.get(0), 0, 3, ALPHABET.get(0), text);
|
||||||
|
checkResult(matchResults.get(1), 1, 4, ALPHABET.get(1), text);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test_VariousKeywordsMultiMatch2() {
|
||||||
|
final ACTrie trie = trie(ALPHABET);
|
||||||
|
final String text = "abcde";
|
||||||
|
List<MatchResult> matchResults = trie.matches(text);
|
||||||
|
assertEquals(3, matchResults.size());
|
||||||
|
checkResult(matchResults.get(0), 0, 3, ALPHABET.get(0), text);
|
||||||
|
checkResult(matchResults.get(1), 1, 4, ALPHABET.get(1), text);
|
||||||
|
checkResult(matchResults.get(2), 2, 5, ALPHABET.get(2), text);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkResult(MatchResult matchResult, int expectedStart, int expectedEnd, String expectedKeyword, String text) {
|
||||||
|
assertEquals("Start of match should have been " + expectedStart, expectedStart, matchResult.getStartIndex());
|
||||||
|
assertEquals("End of match should have been " + expectedEnd, expectedEnd, matchResult.getEndIndex());
|
||||||
|
assertEquals(expectedKeyword, text.substring(expectedStart, expectedEnd));
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user