mirror of
https://github.com/zongzibinbin/MallChat.git
synced 2026-03-13 21:53:41 +08:00
@@ -1,21 +1,35 @@
|
|||||||
package com.abin.mallchat.common.common.utils;
|
package com.abin.mallchat.common.common.utils;
|
||||||
|
|
||||||
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Paths;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 敏感词工具类
|
||||||
|
*
|
||||||
|
* @author zhaoyuhang
|
||||||
|
* @date 2023/06/19
|
||||||
|
*/
|
||||||
public final class SensitiveWordUtils {
|
public final class SensitiveWordUtils {
|
||||||
private static Map<Character, Word> wordMap; // 敏感词Map
|
private static Word root = new Word(' '); // 敏感词字典的根节点
|
||||||
private final static char replace = '*'; // 替代字符
|
private final static char replace = '*'; // 替代字符
|
||||||
private final static char[] skip = new char[]{ // 遇到这些字符就会跳过
|
private final static String skipChars = " !*-+_=,,.@;:;:。、??()()【】[]《》<>“”\"‘’"; // 遇到这些字符就会跳过
|
||||||
' ', '!', '*', '-', '+', '_', '=', ',', ',', '.', '@', ';', ':', ';', ':'
|
private final static Set<Character> skipSet = new HashSet<>(); // 遇到这些字符就会跳过
|
||||||
};
|
|
||||||
|
static {
|
||||||
|
for (char c : skipChars.toCharArray()) {
|
||||||
|
skipSet.add(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 判断文本中是否存在敏感词
|
* 判断文本中是否存在敏感词
|
||||||
@@ -28,12 +42,6 @@ public final class SensitiveWordUtils {
|
|||||||
return !Objects.equals(filter(text), text);
|
return !Objects.equals(filter(text), text);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* 过滤敏感词并替换为指定字符
|
|
||||||
*
|
|
||||||
* @param text 待替换文本
|
|
||||||
* @return 替换后的文本
|
|
||||||
*/
|
|
||||||
/**
|
/**
|
||||||
* 敏感词替换
|
* 敏感词替换
|
||||||
*
|
*
|
||||||
@@ -41,45 +49,41 @@ public final class SensitiveWordUtils {
|
|||||||
* @return 替换后的文本
|
* @return 替换后的文本
|
||||||
*/
|
*/
|
||||||
public static String filter(String text) {
|
public static String filter(String text) {
|
||||||
if (wordMap == null || wordMap.isEmpty() || StringUtils.isBlank(text)) return text;
|
StringBuilder result = new StringBuilder(text);
|
||||||
char[] chars = text.toCharArray(); // 将文本转换为字符数组
|
int index = 0;
|
||||||
int length = chars.length; // 文本长度
|
while (index < result.length()) {
|
||||||
StringBuilder result = new StringBuilder(length); // 存储替换后的结果
|
char c = result.charAt(index);
|
||||||
int i = 0; // 当前遍历的字符索引
|
if (skip(c)) {
|
||||||
while (i < length) {
|
index++;
|
||||||
char c = chars[i]; // 当前字符
|
|
||||||
if (skip(c)) { // 如果是需要跳过的字符,则直接追加到结果中
|
|
||||||
i++;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
int startIndex = i; // 敏感词匹配的起始索引
|
Word word = root;
|
||||||
Map<Character, Word> currentMap = wordMap; // 当前层级的敏感词字典
|
int start = index;
|
||||||
int matchLength = 0; // 匹配到的敏感词长度
|
boolean found = false;
|
||||||
for (int j = i; j < length; j++) {
|
for (int i = index; i < result.length(); i++) {
|
||||||
char ch = chars[j]; // 当前遍历的字符
|
c = result.charAt(i);
|
||||||
if (skip(ch)) { // 如果是需要跳过的字符,则直接追加到结果中
|
if (skip(c)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
Word word = currentMap.get(ch); // 获取当前字符在当前层级的敏感词字典中对应的敏感词节点
|
if (c >= 'A' && c <= 'Z') {
|
||||||
if (word == null) { // 如果未匹配到敏感词节点,则终止循环
|
c += 32;
|
||||||
|
}
|
||||||
|
word = word.next.get(c);
|
||||||
|
if (word == null) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (word.end) { // 如果当前节点是敏感词的最后一个节点,则记录匹配长度
|
if (word.end) {
|
||||||
matchLength = j - startIndex + 1;
|
found = true;
|
||||||
|
for (int j = start; j <= i; j++) {
|
||||||
|
result.setCharAt(j, replace);
|
||||||
}
|
}
|
||||||
currentMap = word.next; // 进入下一层级的敏感词字典
|
index = i;
|
||||||
if (word.next == null) { // 如果当前节点是敏感词的最后一个节点,则记录匹配长度
|
|
||||||
matchLength = j - startIndex + 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (matchLength > 0) { // 如果匹配到敏感词,则将对应的字符替换为指定替代字符
|
if (!found) {
|
||||||
for (int j = startIndex; j < startIndex + matchLength; j++) {
|
index++;
|
||||||
chars[j] = replace;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
i += matchLength > 0 ? matchLength : 1; // 更新当前索引,跳过匹配到的敏感词
|
|
||||||
}
|
|
||||||
result.append(chars); // 将匹配到的敏感词追加到结果中
|
|
||||||
return result.toString();
|
return result.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -90,28 +94,40 @@ public final class SensitiveWordUtils {
|
|||||||
* @param words 敏感词数组
|
* @param words 敏感词数组
|
||||||
*/
|
*/
|
||||||
public static void loadWord(List<String> words) {
|
public static void loadWord(List<String> words) {
|
||||||
if (words == null) return;
|
if (!CollectionUtils.isEmpty(words)) {
|
||||||
words = words.stream().distinct().collect(Collectors.toList()); // 去重
|
Word newRoot = new Word(' ');
|
||||||
wordMap = new HashMap<>(); // 创建敏感词字典的根节点
|
words.forEach(word -> loadWord(word, newRoot));
|
||||||
for (String word : words) {
|
root = newRoot;
|
||||||
if (word == null) continue;
|
|
||||||
char[] chars = word.toCharArray();
|
|
||||||
Map<Character, Word> currentMap = wordMap; // 当前层级的敏感词字典
|
|
||||||
for (int i = 0; i < chars.length; i++) {
|
|
||||||
char c = chars[i];
|
|
||||||
Word currentWord = currentMap.get(c);
|
|
||||||
if (currentWord == null) {
|
|
||||||
Word newWord = new Word(c); // 创建新的敏感词节点
|
|
||||||
currentMap.put(c, newWord); // 将节点添加到当前层级的敏感词字典中
|
|
||||||
if (i == chars.length - 1) {
|
|
||||||
newWord.end = true; // 添加结束标志
|
|
||||||
}
|
|
||||||
currentMap = newWord.next = new HashMap<>(); // 进入下一层级
|
|
||||||
} else {
|
|
||||||
currentMap = currentWord.next; // 存在该字符的节点,则进入下一层级
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 加载敏感词
|
||||||
|
*
|
||||||
|
* @param word 词
|
||||||
|
*/
|
||||||
|
public static void loadWord(String word, Word root) {
|
||||||
|
if (StringUtils.isBlank(word)) {
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
Word current = root;
|
||||||
|
for (int i = 0; i < word.length(); i++) {
|
||||||
|
char c = word.charAt(i);
|
||||||
|
// 如果是大写字母, 转换为小写
|
||||||
|
if (c >= 'A' && c <= 'Z') {
|
||||||
|
c += 32;
|
||||||
|
}
|
||||||
|
if (skip(c)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Word next = current.next.get(c);
|
||||||
|
if (next == null) {
|
||||||
|
next = new Word(c);
|
||||||
|
current.next.put(c, next);
|
||||||
|
}
|
||||||
|
current = next;
|
||||||
|
}
|
||||||
|
current.end = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -121,28 +137,30 @@ public final class SensitiveWordUtils {
|
|||||||
* @param path 文本文件的绝对路径
|
* @param path 文本文件的绝对路径
|
||||||
*/
|
*/
|
||||||
public static void loadWordFromFile(String path) {
|
public static void loadWordFromFile(String path) {
|
||||||
String encoding = "UTF-8";
|
try (InputStream inputStream = Files.newInputStream(Paths.get(path))) {
|
||||||
File file = new File(path);
|
loadWord(inputStream);
|
||||||
try {
|
|
||||||
if (file.isFile() && file.exists()) {
|
|
||||||
InputStreamReader inputStreamReader = new InputStreamReader(
|
|
||||||
Files.newInputStream(file.toPath()), encoding
|
|
||||||
);
|
|
||||||
BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
|
|
||||||
String line;
|
|
||||||
ArrayList<String> list = new ArrayList<>();
|
|
||||||
while ((line = bufferedReader.readLine()) != null) {
|
|
||||||
list.add(line);
|
|
||||||
}
|
|
||||||
bufferedReader.close();
|
|
||||||
inputStreamReader.close();
|
|
||||||
loadWord(list);
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 从流中加载敏感词列表
|
||||||
|
*
|
||||||
|
* @param inputStream 文本文件输入流
|
||||||
|
* @throws IOException IO异常
|
||||||
|
*/
|
||||||
|
public static void loadWord(InputStream inputStream) throws IOException {
|
||||||
|
try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||||
|
String line;
|
||||||
|
ArrayList<String> list = new ArrayList<>();
|
||||||
|
while ((line = reader.readLine()) != null) {
|
||||||
|
list.add(line);
|
||||||
|
}
|
||||||
|
loadWord(list);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 判断是否需要跳过当前字符
|
* 判断是否需要跳过当前字符
|
||||||
*
|
*
|
||||||
@@ -150,10 +168,7 @@ public final class SensitiveWordUtils {
|
|||||||
* @return true: 需要跳过, false: 不需要跳过
|
* @return true: 需要跳过, false: 不需要跳过
|
||||||
*/
|
*/
|
||||||
private static boolean skip(char c) {
|
private static boolean skip(char c) {
|
||||||
for (char skipChar : skip) {
|
return skipSet.contains(c);
|
||||||
if (skipChar == c) return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -161,7 +176,7 @@ public final class SensitiveWordUtils {
|
|||||||
*/
|
*/
|
||||||
private static class Word {
|
private static class Word {
|
||||||
// 当前字符
|
// 当前字符
|
||||||
private char c;
|
private final char c;
|
||||||
|
|
||||||
// 结束标识
|
// 结束标识
|
||||||
private boolean end;
|
private boolean end;
|
||||||
@@ -171,12 +186,17 @@ public final class SensitiveWordUtils {
|
|||||||
|
|
||||||
public Word(char c) {
|
public Word(char c) {
|
||||||
this.c = c;
|
this.c = c;
|
||||||
|
this.end = false;
|
||||||
|
this.next = new HashMap<>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
List<String> strings = Arrays.asList("白日梦", "白痴", "白痴是你","TMD");
|
String text = "白日,梦";
|
||||||
loadWord(strings);
|
String filter = filter(text);
|
||||||
System.out.println(filter("TMD,白痴是你吗"));
|
System.out.println(filter);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user