mirror of
https://github.com/zongzibinbin/MallChat.git
synced 2026-03-18 09:03:43 +08:00
@@ -2,39 +2,38 @@ package com.abin.mallchat.common.common.utils;
|
|||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 敏感词过滤
|
|
||||||
*
|
|
||||||
* @author zhaoyuhang
|
|
||||||
* @since 2023/06/11
|
|
||||||
*/
|
|
||||||
public final class SensitiveWordUtils {
|
public final class SensitiveWordUtils {
|
||||||
private static SensitiveWordList wordList;
|
private static Map<Character, Word> wordMap; // 敏感词Map
|
||||||
private final static char replace = '*'; // 替代字符
|
private final static char replace = '*'; // 替代字符
|
||||||
private final static char[] skip = new char[]{ // 遇到这些字符就会跳过
|
private final static char[] skip = new char[]{ // 遇到这些字符就会跳过
|
||||||
' ', '!', '*', '-', '+', '_', '=', ',', ',', '.', '@', ';', ':', ';', ':'
|
' ', '!', '*', '-', '+', '_', '=', ',', ',', '.', '@', ';', ':', ';', ':'
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 有敏感词
|
* 判断文本中是否存在敏感词
|
||||||
*
|
*
|
||||||
* @param text 文本
|
* @param text 文本
|
||||||
* @return boolean
|
* @return true: 存在敏感词, false: 不存在敏感词
|
||||||
*/
|
*/
|
||||||
public static boolean hasSensitiveWord(String text) {
|
public static boolean hasSensitiveWord(String text) {
|
||||||
if (StringUtils.isBlank(text)) return false;
|
if (StringUtils.isBlank(text)) return false;
|
||||||
return !Objects.equals(filter(text), text);
|
return !Objects.equals(filter(text), text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 过滤敏感词并替换为指定字符
|
||||||
|
*
|
||||||
|
* @param text 待替换文本
|
||||||
|
* @return 替换后的文本
|
||||||
|
*/
|
||||||
/**
|
/**
|
||||||
* 敏感词替换
|
* 敏感词替换
|
||||||
*
|
*
|
||||||
@@ -42,57 +41,49 @@ public final class SensitiveWordUtils {
|
|||||||
* @return 替换后的文本
|
* @return 替换后的文本
|
||||||
*/
|
*/
|
||||||
public static String filter(String text) {
|
public static String filter(String text) {
|
||||||
if (wordList == null || wordList.size() == 0 || StringUtils.isBlank(text)) return text;
|
if (wordMap == null || wordMap.isEmpty() || StringUtils.isBlank(text)) return text;
|
||||||
char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历
|
char[] chars = text.toCharArray(); // 将文本转换为字符数组
|
||||||
int i, j;
|
int length = chars.length; // 文本长度
|
||||||
Word word;
|
StringBuilder result = new StringBuilder(length); // 存储替换后的结果
|
||||||
boolean flag; // 是否需要替换
|
int i = 0; // 当前遍历的字符索引
|
||||||
for (i = 0; i < __char__.length; i++) { // 遍历所有字符
|
while (i < length) {
|
||||||
char c = __char__[i];
|
char c = chars[i]; // 当前字符
|
||||||
word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率
|
if (skip(c)) { // 如果是需要跳过的字符,则直接追加到结果中
|
||||||
if (word != null) { // word != null说明找到了
|
i++;
|
||||||
flag = false;
|
continue;
|
||||||
j = i + 1;
|
}
|
||||||
while (j < __char__.length) { // 开始逐个比较后面的字符
|
int startIndex = i; // 敏感词匹配的起始索引
|
||||||
if (skip(__char__[j])) { // 跳过空格之类的无关字符
|
Map<Character, Word> currentMap = wordMap; // 当前层级的敏感词字典
|
||||||
j++;
|
int matchLength = 0; // 匹配到的敏感词长度
|
||||||
continue;
|
for (int j = i; j < length; j++) {
|
||||||
}
|
char ch = chars[j]; // 当前遍历的字符
|
||||||
if (word.next != null) { // 字符串尚未结束,不确定是否存在敏感词
|
if (skip(ch)) { // 如果是需要跳过的字符,则直接追加到结果中
|
||||||
/*
|
continue;
|
||||||
以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少
|
|
||||||
例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个
|
|
||||||
假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找
|
|
||||||
*/
|
|
||||||
word = word.next.get(__char__[j]);
|
|
||||||
if (word == null) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
j++;
|
|
||||||
} else { // 字符串已结束,存在敏感词汇
|
|
||||||
flag = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (word != null && word.next == null) {
|
Word word = currentMap.get(ch); // 获取当前字符在当前层级的敏感词字典中对应的敏感词节点
|
||||||
flag = true;
|
if (word == null) { // 如果未匹配到敏感词节点,则终止循环
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
if (flag) { // 如果flag==true,说明检测出敏感粗,需要替换
|
if (word.end) { // 如果当前节点是敏感词的最后一个节点,则记录匹配长度
|
||||||
while (i < j) {
|
matchLength = j - startIndex + 1;
|
||||||
// if(skip(__char__[i])){ // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句
|
}
|
||||||
// i++;
|
currentMap = word.next; // 进入下一层级的敏感词字典
|
||||||
// continue;
|
if (word.next == null) { // 如果当前节点是敏感词的最后一个节点,则记录匹配长度
|
||||||
// }
|
matchLength = j - startIndex + 1;
|
||||||
__char__[i] = replace;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
i--;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (matchLength > 0) { // 如果匹配到敏感词,则将对应的字符替换为指定替代字符
|
||||||
|
for (int j = startIndex; j < startIndex + matchLength; j++) {
|
||||||
|
chars[j] = replace;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i += matchLength > 0 ? matchLength : 1; // 更新当前索引,跳过匹配到的敏感词
|
||||||
}
|
}
|
||||||
return new String(__char__);
|
result.append(chars); // 将匹配到的敏感词追加到结果中
|
||||||
|
return result.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 加载敏感词列表
|
* 加载敏感词列表
|
||||||
*
|
*
|
||||||
@@ -101,31 +92,33 @@ public final class SensitiveWordUtils {
|
|||||||
public static void loadWord(List<String> words) {
|
public static void loadWord(List<String> words) {
|
||||||
if (words == null) return;
|
if (words == null) return;
|
||||||
words = words.stream().distinct().collect(Collectors.toList()); // 去重
|
words = words.stream().distinct().collect(Collectors.toList()); // 去重
|
||||||
char[] chars;
|
wordMap = new HashMap<>(); // 创建敏感词字典的根节点
|
||||||
SensitiveWordList now;
|
for (String word : words) {
|
||||||
Word word;
|
if (word == null) continue;
|
||||||
wordList = new SensitiveWordList();
|
char[] chars = word.toCharArray();
|
||||||
for (String __word__ : words) {
|
Map<Character, Word> currentMap = wordMap; // 当前层级的敏感词字典
|
||||||
if (__word__ == null) continue;
|
for (int i = 0; i < chars.length; i++) {
|
||||||
chars = __word__.toCharArray();
|
char c = chars[i];
|
||||||
now = wordList;
|
Word currentWord = currentMap.get(c);
|
||||||
word = null;
|
if (currentWord == null) {
|
||||||
for (char c : chars) {
|
Word newWord = new Word(c); // 创建新的敏感词节点
|
||||||
if (word != null) {
|
currentMap.put(c, newWord); // 将节点添加到当前层级的敏感词字典中
|
||||||
if (word.next == null) word.next = new SensitiveWordList();
|
if (i == chars.length - 1) {
|
||||||
now = word.next;
|
newWord.end = true; // 添加结束标志
|
||||||
|
}
|
||||||
|
currentMap = newWord.next = new HashMap<>(); // 进入下一层级
|
||||||
|
} else {
|
||||||
|
currentMap = currentWord.next; // 存在该字符的节点,则进入下一层级
|
||||||
}
|
}
|
||||||
word = now.get(c);
|
|
||||||
if (word == null) word = now.add(c);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sort(wordList);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 加载敏感词txt文件,每个敏感词独占一行,不可出现空格,空行,逗号等非文字内容,必须使用UTF-8编码
|
* 从文本文件中加载敏感词列表
|
||||||
*
|
*
|
||||||
* @param path txt文件的绝对地址
|
* @param path 文本文件的绝对路径
|
||||||
*/
|
*/
|
||||||
public static void loadWordFromFile(String path) {
|
public static void loadWordFromFile(String path) {
|
||||||
String encoding = "UTF-8";
|
String encoding = "UTF-8";
|
||||||
@@ -151,101 +144,39 @@ public final class SensitiveWordUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 对敏感词多叉树递增排序
|
* 判断是否需要跳过当前字符
|
||||||
*
|
|
||||||
* @param list 待排序List
|
|
||||||
*/
|
|
||||||
private static void sort(SensitiveWordList list) {
|
|
||||||
if (list == null) return;
|
|
||||||
Collections.sort(list); // 递增排序
|
|
||||||
for (Word word : list) {
|
|
||||||
sort(word.next);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 判断是否跳过当前字符
|
|
||||||
*
|
*
|
||||||
* @param c 待检测字符
|
* @param c 待检测字符
|
||||||
* @return true:需要跳过 false:不需要跳过
|
* @return true: 需要跳过, false: 不需要跳过
|
||||||
*/
|
*/
|
||||||
private static boolean skip(char c) {
|
private static boolean skip(char c) {
|
||||||
for (char c1 : skip) {
|
for (char skipChar : skip) {
|
||||||
if (c1 == c) return true;
|
if (skipChar == c) return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 敏感词列表
|
* 敏感词类
|
||||||
*
|
|
||||||
* @author zhaoyuhang
|
|
||||||
* @since 2023/06/11
|
|
||||||
*/
|
*/
|
||||||
public static class SensitiveWordList extends ArrayList<Word> {
|
private static class Word {
|
||||||
public Word get(char c) {
|
// 当前字符
|
||||||
for (Word w : this) {
|
private char c;
|
||||||
if (w.c == c) return w;
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
// 结束标识
|
||||||
* 二分查找,必须先升序排序
|
private boolean end;
|
||||||
*
|
|
||||||
* @param c 需要查找的字符
|
|
||||||
* @return Word对象:如果找到 null:如果没找到
|
|
||||||
*/
|
|
||||||
public Word binaryGet(char c) {
|
|
||||||
int left, right, key;
|
|
||||||
Word word;
|
|
||||||
left = 0;
|
|
||||||
right = this.size() - 1;
|
|
||||||
while (left <= right) {
|
|
||||||
key = (left + right) / 2;
|
|
||||||
word = get(key);
|
|
||||||
if (word.c == c) {
|
|
||||||
return word;
|
|
||||||
} else if (word.c > c) {
|
|
||||||
right = key - 1;
|
|
||||||
} else {
|
|
||||||
left = key + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Word add(char c) {
|
// 下一层级的敏感词字典
|
||||||
Word word = new Word(c);
|
private Map<Character, Word> next;
|
||||||
super.add(word);
|
|
||||||
return word;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 敏感词
|
|
||||||
*
|
|
||||||
* @author zhaoyuhang
|
|
||||||
* @since 2023/06/11
|
|
||||||
*/
|
|
||||||
public static class Word implements Comparable<Word> {
|
|
||||||
public char c;
|
|
||||||
public SensitiveWordList next = null;
|
|
||||||
|
|
||||||
public Word(char c) {
|
public Word(char c) {
|
||||||
this.c = c;
|
this.c = c;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
public static void main(String[] args) {
|
||||||
public int compareTo(Word word) {
|
List<String> strings = Arrays.asList("白日梦", "白痴", "白痴是你","TMD");
|
||||||
return c - word.c;
|
loadWord(strings);
|
||||||
}
|
System.out.println(filter("TMD,白痴是你吗"));
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return c + "(" + (next == null ? null : next.size()) + ")";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,8 +4,10 @@ import com.abin.mallchat.common.common.utils.SensitiveWordUtils;
|
|||||||
import com.abin.mallchat.common.sensitive.dao.SensitiveWordDao;
|
import com.abin.mallchat.common.sensitive.dao.SensitiveWordDao;
|
||||||
import com.abin.mallchat.common.sensitive.domain.SensitiveWord;
|
import com.abin.mallchat.common.sensitive.domain.SensitiveWord;
|
||||||
import com.abin.mallchat.common.sensitive.service.ISensitiveWordService;
|
import com.abin.mallchat.common.sensitive.service.ISensitiveWordService;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import javax.annotation.PostConstruct;
|
import javax.annotation.PostConstruct;
|
||||||
@@ -13,18 +15,25 @@ import java.util.List;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
|
@Slf4j
|
||||||
public class SensitiveWordServiceImpl implements ISensitiveWordService {
|
public class SensitiveWordServiceImpl implements ISensitiveWordService {
|
||||||
@Autowired
|
@Autowired
|
||||||
private SensitiveWordDao sensitiveWordDao;
|
private SensitiveWordDao sensitiveWordDao;
|
||||||
|
@Autowired
|
||||||
|
private ThreadPoolTaskExecutor threadPoolTaskExecutor;
|
||||||
|
|
||||||
@PostConstruct
|
@PostConstruct
|
||||||
public void initSensitiveWord() {
|
public void initSensitiveWord() {
|
||||||
List<SensitiveWord> list = sensitiveWordDao.list();
|
threadPoolTaskExecutor.execute(() -> {
|
||||||
if (!CollectionUtils.isEmpty(list)) {
|
log.info("[initSensitiveWord] start");
|
||||||
List<String> wordList = list.stream()
|
List<SensitiveWord> list = sensitiveWordDao.list();
|
||||||
.map(SensitiveWord::getWord)
|
if (!CollectionUtils.isEmpty(list)) {
|
||||||
.collect(Collectors.toList());
|
List<String> wordList = list.stream()
|
||||||
SensitiveWordUtils.loadWord(wordList);
|
.map(SensitiveWord::getWord)
|
||||||
}
|
.collect(Collectors.toList());
|
||||||
|
SensitiveWordUtils.loadWord(wordList);
|
||||||
|
}
|
||||||
|
log.info("[initSensitiveWord] end; loading sensitiveWords num:{}", list.size());
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user