From 744e1243a2fd881a70cc55f19c7ff1f72de75098 Mon Sep 17 00:00:00 2001 From: zhaoyuhang <1045078399@qq.com> Date: Mon, 19 Jun 2023 18:00:19 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=95=8F=E6=84=9F=E8=AF=8Dbu?= =?UTF-8?q?g?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../common/utils/SensitiveWordUtils.java | 194 ++++++++++-------- 1 file changed, 107 insertions(+), 87 deletions(-) diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java index 63b4e24..336da93 100644 --- a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java +++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java @@ -1,21 +1,35 @@ package com.abin.mallchat.common.common.utils; +import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.StringUtils; import java.io.BufferedReader; -import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.Paths; import java.util.*; -import java.util.stream.Collectors; +/** + * 敏感词工具类 + * + * @author zhaoyuhang + * @date 2023/06/19 + */ public final class SensitiveWordUtils { - private static Map wordMap; // 敏感词Map + private static Word root = new Word(' '); // 敏感词字典的根节点 private final static char replace = '*'; // 替代字符 - private final static char[] skip = new char[]{ // 遇到这些字符就会跳过 - ' ', '!', '*', '-', '+', '_', '=', ',', ',', '.', '@', ';', ':', ';', ':' - }; + private final static String skipChars = " !*-+_=,,.@;:;:。、??()()【】[]《》<>“”\"‘’"; // 遇到这些字符就会跳过 + private final static Set skipSet = new HashSet<>(); // 遇到这些字符就会跳过 + + static { + for (char c : skipChars.toCharArray()) { + skipSet.add(c); + } + } + /** * 判断文本中是否存在敏感词 @@ -28,12 +42,6 @@ public final class SensitiveWordUtils { return !Objects.equals(filter(text), text); } - /** - * 过滤敏感词并替换为指定字符 - * - * @param text 待替换文本 - * @return 替换后的文本 - */ /** * 敏感词替换 * @@ -41,45 +49,41 @@ public final class SensitiveWordUtils { * @return 替换后的文本 */ public static String filter(String text) { - if (wordMap == null || wordMap.isEmpty() || StringUtils.isBlank(text)) return text; - char[] chars = text.toCharArray(); // 将文本转换为字符数组 - int length = chars.length; // 文本长度 - StringBuilder result = new StringBuilder(length); // 存储替换后的结果 - int i = 0; // 当前遍历的字符索引 - while (i < length) { - char c = chars[i]; // 当前字符 - if (skip(c)) { // 如果是需要跳过的字符,则直接追加到结果中 - i++; + StringBuilder result = new StringBuilder(text); + int index = 0; + while (index < result.length()) { + char c = result.charAt(index); + if (skip(c)) { + index++; continue; } - int startIndex = i; // 敏感词匹配的起始索引 - Map currentMap = wordMap; // 当前层级的敏感词字典 - int matchLength = 0; // 匹配到的敏感词长度 - for (int j = i; j < length; j++) { - char ch = chars[j]; // 当前遍历的字符 - if (skip(ch)) { // 如果是需要跳过的字符,则直接追加到结果中 + Word word = root; + int start = index; + boolean found = false; + for (int i = index; i < result.length(); i++) { + c = result.charAt(i); + if (skip(c)) { continue; } - Word word = currentMap.get(ch); // 获取当前字符在当前层级的敏感词字典中对应的敏感词节点 - if (word == null) { // 如果未匹配到敏感词节点,则终止循环 + if (c >= 'A' && c <= 'Z') { + c += 32; + } + word = word.next.get(c); + if (word == null) { break; } - if (word.end) { // 如果当前节点是敏感词的最后一个节点,则记录匹配长度 - matchLength = j - startIndex + 1; - } - currentMap = word.next; // 进入下一层级的敏感词字典 - if (word.next == null) { // 如果当前节点是敏感词的最后一个节点,则记录匹配长度 - matchLength = j - startIndex + 1; + if (word.end) { + found = true; + for (int j = start; j <= i; j++) { + result.setCharAt(j, replace); + } + index = i; } } - if (matchLength > 0) { // 如果匹配到敏感词,则将对应的字符替换为指定替代字符 - for (int j = startIndex; j < startIndex + matchLength; j++) { - chars[j] = replace; - } + if (!found) { + index++; } - i += matchLength > 0 ? matchLength : 1; // 更新当前索引,跳过匹配到的敏感词 } - result.append(chars); // 将匹配到的敏感词追加到结果中 return result.toString(); } @@ -90,30 +94,42 @@ public final class SensitiveWordUtils { * @param words 敏感词数组 */ public static void loadWord(List words) { - if (words == null) return; - words = words.stream().distinct().collect(Collectors.toList()); // 去重 - wordMap = new HashMap<>(); // 创建敏感词字典的根节点 - for (String word : words) { - if (word == null) continue; - char[] chars = word.toCharArray(); - Map currentMap = wordMap; // 当前层级的敏感词字典 - for (int i = 0; i < chars.length; i++) { - char c = chars[i]; - Word currentWord = currentMap.get(c); - if (currentWord == null) { - Word newWord = new Word(c); // 创建新的敏感词节点 - currentMap.put(c, newWord); // 将节点添加到当前层级的敏感词字典中 - if (i == chars.length - 1) { - newWord.end = true; // 添加结束标志 - } - currentMap = newWord.next = new HashMap<>(); // 进入下一层级 - } else { - currentMap = currentWord.next; // 存在该字符的节点,则进入下一层级 - } - } + if (!CollectionUtils.isEmpty(words)) { + Word newRoot = new Word(' '); + words.forEach(word -> loadWord(word, newRoot)); + root = newRoot; } } + /** + * 加载敏感词 + * + * @param word 词 + */ + public static void loadWord(String word, Word root) { + if (StringUtils.isBlank(word)) { + return; + } + Word current = root; + for (int i = 0; i < word.length(); i++) { + char c = word.charAt(i); + // 如果是大写字母, 转换为小写 + if (c >= 'A' && c <= 'Z') { + c += 32; + } + if (skip(c)) { + continue; + } + Word next = current.next.get(c); + if (next == null) { + next = new Word(c); + current.next.put(c, next); + } + current = next; + } + current.end = true; + } + /** * 从文本文件中加载敏感词列表 @@ -121,28 +137,30 @@ public final class SensitiveWordUtils { * @param path 文本文件的绝对路径 */ public static void loadWordFromFile(String path) { - String encoding = "UTF-8"; - File file = new File(path); - try { - if (file.isFile() && file.exists()) { - InputStreamReader inputStreamReader = new InputStreamReader( - Files.newInputStream(file.toPath()), encoding - ); - BufferedReader bufferedReader = new BufferedReader(inputStreamReader); - String line; - ArrayList list = new ArrayList<>(); - while ((line = bufferedReader.readLine()) != null) { - list.add(line); - } - bufferedReader.close(); - inputStreamReader.close(); - loadWord(list); - } + try (InputStream inputStream = Files.newInputStream(Paths.get(path))) { + loadWord(inputStream); } catch (IOException e) { e.printStackTrace(); } } + /** + * 从流中加载敏感词列表 + * + * @param inputStream 文本文件输入流 + * @throws IOException IO异常 + */ + public static void loadWord(InputStream inputStream) throws IOException { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { + String line; + ArrayList list = new ArrayList<>(); + while ((line = reader.readLine()) != null) { + list.add(line); + } + loadWord(list); + } + } + /** * 判断是否需要跳过当前字符 * @@ -150,10 +168,7 @@ public final class SensitiveWordUtils { * @return true: 需要跳过, false: 不需要跳过 */ private static boolean skip(char c) { - for (char skipChar : skip) { - if (skipChar == c) return true; - } - return false; + return skipSet.contains(c); } /** @@ -161,7 +176,7 @@ public final class SensitiveWordUtils { */ private static class Word { // 当前字符 - private char c; + private final char c; // 结束标识 private boolean end; @@ -171,12 +186,17 @@ public final class SensitiveWordUtils { public Word(char c) { this.c = c; + this.end = false; + this.next = new HashMap<>(); } } public static void main(String[] args) { - List strings = Arrays.asList("白日梦", "白痴", "白痴是你","TMD"); - loadWord(strings); - System.out.println(filter("TMD,白痴是你吗")); + String text = "白日,梦"; + String filter = filter(text); + System.out.println(filter); + + } + }