Merge pull request #47 from 1045078399/main

fix: 修改敏感词bug
2026-06-16 02:25:43 +00:00 · 2023-06-22 21:34:36 +08:00
parent 99f06004f8 744e1243a2
commit db1ffc40ff
1 changed files with 107 additions and 87 deletions
--- a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
@@ -1,21 +1,35 @@
 package com.abin.mallchat.common.common.utils;
 import org.apache.commons.collections.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.util.*;
 import java.util.stream.Collectors;
 /**
 * 敏感词工具类
 *
 * @author zhaoyuhang
 * @date 2023/06/19
 */
 public final class SensitiveWordUtils {
-    private static Map<Character, Word> wordMap; // sensitive-word map
+    private static Word root = new Word(' '); // root node of the sensitive-word dictionary
    private final static char replace = '*'; // replacement character
-    private final static char[] skip = new char[]{ // characters that are skipped when encountered
+    private final static String skipChars = " !*-+_=,，.@;:；：。、？?（）()【】[]《》<>“”\"‘’"; // characters that are skipped when encountered
-            ' ', '!', '*', '-', '+', '_', '=', ',', '，', '.', '@', ';', ':', '；', '：'
+    private final static Set<Character> skipSet = new HashSet<>(); // characters that are skipped when encountered
-    };
+
    static {
        for (char c : skipChars.toCharArray()) {
            skipSet.add(c);
        }
    }
    /**
     * 判断文本中是否存在敏感词
@@ -28,12 +42,6 @@ public final class SensitiveWordUtils {
        return !Objects.equals(filter(text), text);
    }
    /**
     * 过滤敏感词并替换为指定字符
     *
     * @param text 待替换文本
     * @return 替换后的文本
     */
    /**
     * 敏感词替换
     *
@@ -41,45 +49,41 @@ public final class SensitiveWordUtils {
     * @return 替换后的文本
     */
    public static String filter(String text) {
-        if (wordMap == null || wordMap.isEmpty() || StringUtils.isBlank(text)) return text;
+        StringBuilder result = new StringBuilder(text);
-        char[] chars = text.toCharArray(); // convert the text to a char array
+        int index = 0;
-        int length = chars.length; // text length
+        while (index < result.length()) {
-        StringBuilder result = new StringBuilder(length); // holds the result after replacement
+            char c = result.charAt(index);
-        int i = 0; // index of the character currently being visited
+            if (skip(c)) {
-        while (i < length) {
+                index++;
            char c = chars[i]; // current character
            if (skip(c)) { // characters in the skip list are passed over unchanged
                i++;
                continue;
            }
-            int startIndex = i; // start index of the sensitive-word match
+            Word word = root;
-            Map<Character, Word> currentMap = wordMap; // dictionary for the current level
+            int start = index;
-            int matchLength = 0; // length of the matched sensitive word
+            boolean found = false;
-            for (int j = i; j < length; j++) {
+            for (int i = index; i < result.length(); i++) {
-                char ch = chars[j]; // character currently being visited
+                c = result.charAt(i);
-                if (skip(ch)) { // characters in the skip list are passed over
+                if (skip(c)) {
                    continue;
                }
-                Word word = currentMap.get(ch); // look up the node for this character in the current level's dictionary
+                if (c >= 'A' && c <= 'Z') {
-                if (word == null) { // stop the loop when no node matches
+                    c += 32; // fold ASCII upper case to lower case ('A' + 32 == 'a')
                }
                word = word.next.get(c);
                if (word == null) {
                    break;
                }
-                if (word.end) { // the node ends a sensitive word: record the match length
+                if (word.end) {
-                    matchLength = j - startIndex + 1;
+                    found = true;
                    for (int j = start; j <= i; j++) {
                        result.setCharAt(j, replace);
                    }
-                currentMap = word.next; // descend into the next level's dictionary
+                    index = i; // the outer scan resumes from the last masked position
                if (word.next == null) { // node has no next level: record the match length
                    matchLength = j - startIndex + 1;
                }
            }
-            if (matchLength > 0) { // on a match, overwrite the matched characters with the replacement character
+            if (!found) {
-                for (int j = startIndex; j < startIndex + matchLength; j++) {
+                index++;
                    chars[j] = replace;
            }
        }
            i += matchLength > 0 ? matchLength : 1; // advance the index, jumping past a matched word
        }
        result.append(chars); // append the (possibly masked) characters to the result
        return result.toString();
    }
@@ -90,28 +94,40 @@ public final class SensitiveWordUtils {
     * @param words 敏感词数组
     */
    public static void loadWord(List<String> words) {
-        if (words == null) return;
+        if (!CollectionUtils.isEmpty(words)) {
-        words = words.stream().distinct().collect(Collectors.toList()); // remove duplicates
+            Word newRoot = new Word(' ');
-        wordMap = new HashMap<>(); // create the root of the sensitive-word dictionary
+            words.forEach(word -> loadWord(word, newRoot));
-        for (String word : words) {
+            root = newRoot;
            if (word == null) continue;
            char[] chars = word.toCharArray();
            Map<Character, Word> currentMap = wordMap; // dictionary for the current level
            for (int i = 0; i < chars.length; i++) {
                char c = chars[i];
                Word currentWord = currentMap.get(c);
                if (currentWord == null) {
                    Word newWord = new Word(c); // create a new node
                    currentMap.put(c, newWord); // register the node in the current level's dictionary
                    if (i == chars.length - 1) {
                        newWord.end = true; // mark end of word
                    }
                    currentMap = newWord.next = new HashMap<>(); // descend to the next level
                } else {
                    currentMap = currentWord.next; // a node for this character exists: descend to its next level
        }
    }
    /**
     * Inserts a single sensitive word into the dictionary tree rooted at {@code root}.
     *
     * <p>ASCII upper-case letters are folded to lower case and characters for which
     * {@code skip} returns {@code true} are ignored, so the stored path contains only
     * the characters that take part in matching. Blank or {@code null} words, and words
     * made up solely of skipped characters, leave the tree unchanged.
     *
     * @param word the word to add
     * @param root the root node of the tree the word is added to
     */
    public static void loadWord(String word, Word root) {
        if (StringUtils.isBlank(word)) {
            return;
        }
        Word current = root;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            // Fold ASCII upper-case letters to lower case ('A' + 32 == 'a').
            if (c >= 'A' && c <= 'Z') {
                c += 32;
            }
            if (skip(c)) {
                continue;
            }
            // Reuse the child node for this character, creating it on first use.
            current = current.next.computeIfAbsent(c, Word::new);
        }
        // A word consisting only of skipped characters (e.g. "***") never leaves the
        // root; marking the root itself as a word end would be meaningless, so only
        // real nodes are flagged.
        if (current != root) {
            current.end = true;
        }
    }
@@ -121,28 +137,30 @@ public final class SensitiveWordUtils {
     * @param path 文本文件的绝对路径
     */
    public static void loadWordFromFile(String path) {
-        String encoding = "UTF-8";
+        try (InputStream inputStream = Files.newInputStream(Paths.get(path))) { // stream is closed automatically by try-with-resources
-        File file = new File(path);
+            loadWord(inputStream); // delegate reading to the stream-based overload
        try {
            if (file.isFile() && file.exists()) { // only an existing regular file is read
                InputStreamReader inputStreamReader = new InputStreamReader(
                        Files.newInputStream(file.toPath()), encoding
                );
                BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
                String line;
                ArrayList<String> list = new ArrayList<>();
                // collect every line of the file
                while ((line = bufferedReader.readLine()) != null) {
                    list.add(line);
                }
                bufferedReader.close();
                inputStreamReader.close();
                loadWord(list);
            }
        } catch (IOException e) {
            // the I/O failure is only printed; nothing is rethrown to the caller
            e.printStackTrace();
        }
    }
    /**
     * Reads sensitive words from a stream, one word per line, and loads them.
     *
     * <p>The stream is decoded as UTF-8 and is closed before this method returns.
     *
     * @param inputStream stream over the word-list text
     * @throws IOException if reading from the stream fails
     */
    public static void loadWord(InputStream inputStream) throws IOException {
        InputStreamReader decoder = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
        try (BufferedReader lines = new BufferedReader(decoder)) {
            List<String> words = new ArrayList<>();
            for (String entry = lines.readLine(); entry != null; entry = lines.readLine()) {
                words.add(entry);
            }
            // Hand the collected lines to the list-based loader.
            loadWord(words);
        }
    }
    /**
     * 判断是否需要跳过当前字符
     *
@@ -150,10 +168,7 @@ public final class SensitiveWordUtils {
     * @return true: 需要跳过, false: 不需要跳过
     */
    private static boolean skip(char c) {
-        for (char skipChar : skip) { // scan the skip array for c
+        return skipSet.contains(c); // membership test against the skip set
            if (skipChar == c) return true;
        }
        return false;
    }
    /**
@@ -161,7 +176,7 @@ public final class SensitiveWordUtils {
     */
    private static class Word {
        // 当前字符
-        private char c;
+        private final char c;
        // 结束标识
        private boolean end;
@@ -171,12 +186,17 @@ public final class SensitiveWordUtils {
        public Word(char c) {
            // A fresh node has no children yet and does not terminate a word.
            this.next = new HashMap<>();
            this.end = false;
            // Character this node stands for.
            this.c = c;
        }
    }
    public static void main(String[] args) {
-        List<String> strings = Arrays.asList("白日梦", "白痴", "白痴是你","TMD");
+        String text = "白日,梦";
-        loadWord(strings); // load the sample words
+        String filter = filter(text); // run the sample text through the filter
-        System.out.println(filter("TMD,白痴是你吗"));
+        System.out.println(filter); // print the filtered text
    }
 }