Merge pull request #47 from 1045078399/main

fix: 修改敏感词bug
2026-06-15 17:57:07 +00:00 · 2023-06-22 21:34:36 +08:00
parent 99f06004f8 744e1243a2
commit db1ffc40ff
1 changed files with 107 additions and 87 deletions
--- a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
@@ -1,21 +1,35 @@
 package com.abin.mallchat.common.common.utils;

+import org.apache.commons.collections.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;

 import java.io.BufferedReader;
-import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.*;
-import java.util.stream.Collectors;

+/**
+ * 敏感词工具类
+ *
+ * @author zhaoyuhang
+ * @date 2023/06/19
+ */
 public final class SensitiveWordUtils {
-    private static Map<Character, Word> wordMap; // 敏感词Map
+    private static Word root = new Word(' '); // 敏感词字典的根节点
    private final static char replace = '*'; // 替代字符
-    private final static char[] skip = new char[]{ // 遇到这些字符就会跳过
-            ' ', '!', '*', '-', '+', '_', '=', ',', '，', '.', '@', ';', ':', '；', '：'
-    };
+    private final static String skipChars = " !*-+_=,，.@;:；：。、？?（）()【】[]《》<>“”\"‘’"; // 遇到这些字符就会跳过
+    private final static Set<Character> skipSet = new HashSet<>(); // lookup set holding each character of skipChars; filled by the static initializer below
+
+    static {
+        for (char c : skipChars.toCharArray()) {
+            skipSet.add(c);
+        }
+    }
+

    /**
     * 判断文本中是否存在敏感词
@@ -28,12 +42,6 @@ public final class SensitiveWordUtils {
        return !Objects.equals(filter(text), text);
    }

-    /**
-     * 过滤敏感词并替换为指定字符
-     *
-     * @param text 待替换文本
-     * @return 替换后的文本
-     */
    /**
     * 敏感词替换
     *
@@ -41,45 +49,41 @@ public final class SensitiveWordUtils {
     * @return 替换后的文本
     */
    public static String filter(String text) {
-        if (wordMap == null || wordMap.isEmpty() || StringUtils.isBlank(text)) return text;
-        char[] chars = text.toCharArray(); // 将文本转换为字符数组
-        int length = chars.length; // 文本长度
-        StringBuilder result = new StringBuilder(length); // 存储替换后的结果
-        int i = 0; // 当前遍历的字符索引
-        while (i < length) {
-            char c = chars[i]; // 当前字符
-            if (skip(c)) { // 如果是需要跳过的字符，则直接追加到结果中
-                i++;
+        StringBuilder result = new StringBuilder(text);
+        int index = 0;
+        while (index < result.length()) {
+            char c = result.charAt(index);
+            if (skip(c)) {
+                index++;
                continue;
            }
-            int startIndex = i; // 敏感词匹配的起始索引
-            Map<Character, Word> currentMap = wordMap; // 当前层级的敏感词字典
-            int matchLength = 0; // 匹配到的敏感词长度
-            for (int j = i; j < length; j++) {
-                char ch = chars[j]; // 当前遍历的字符
-                if (skip(ch)) { // 如果是需要跳过的字符，则直接追加到结果中
+            Word word = root;
+            int start = index;
+            boolean found = false;
+            for (int i = index; i < result.length(); i++) {
+                c = result.charAt(i);
+                if (skip(c)) {
                    continue;
                }
-                Word word = currentMap.get(ch); // 获取当前字符在当前层级的敏感词字典中对应的敏感词节点
-                if (word == null) { // 如果未匹配到敏感词节点，则终止循环
+                if (c >= 'A' && c <= 'Z') {
+                    c += 32;
+                }
+                word = word.next.get(c);
+                if (word == null) {
                    break;
                }
-                if (word.end) { // 如果当前节点是敏感词的最后一个节点，则记录匹配长度
-                    matchLength = j - startIndex + 1;
+                if (word.end) {
+                    found = true;
+                    for (int j = start; j <= i; j++) {
+                        result.setCharAt(j, replace);
                    }
-                currentMap = word.next; // 进入下一层级的敏感词字典
-                if (word.next == null) { // 如果当前节点是敏感词的最后一个节点，则记录匹配长度
-                    matchLength = j - startIndex + 1;
+                    index = i;
                }
            }
-            if (matchLength > 0) { // 如果匹配到敏感词，则将对应的字符替换为指定替代字符
-                for (int j = startIndex; j < startIndex + matchLength; j++) {
-                    chars[j] = replace;
+            if (!found) {
+                index++;
            }
        }
-            i += matchLength > 0 ? matchLength : 1; // 更新当前索引，跳过匹配到的敏感词
-        }
-        result.append(chars); // 将匹配到的敏感词追加到结果中
        return result.toString();
    }

@@ -90,28 +94,40 @@ public final class SensitiveWordUtils {
     * @param words 敏感词数组
     */
    public static void loadWord(List<String> words) {
-        if (words == null) return;
-        words = words.stream().distinct().collect(Collectors.toList()); // 去重
-        wordMap = new HashMap<>(); // 创建敏感词字典的根节点
-        for (String word : words) {
-            if (word == null) continue;
-            char[] chars = word.toCharArray();
-            Map<Character, Word> currentMap = wordMap; // 当前层级的敏感词字典
-            for (int i = 0; i < chars.length; i++) {
-                char c = chars[i];
-                Word currentWord = currentMap.get(c);
-                if (currentWord == null) {
-                    Word newWord = new Word(c); // 创建新的敏感词节点
-                    currentMap.put(c, newWord); // 将节点添加到当前层级的敏感词字典中
-                    if (i == chars.length - 1) {
-                        newWord.end = true; // 添加结束标志
-                    }
-                    currentMap = newWord.next = new HashMap<>(); // 进入下一层级
-                } else {
-                    currentMap = currentWord.next; // 存在该字符的节点，则进入下一层级
+        if (!CollectionUtils.isEmpty(words)) {
+            Word newRoot = new Word(' ');
+            words.forEach(word -> loadWord(word, newRoot));
+            root = newRoot;
        }
    }
+
+    /**
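+     * Inserts one sensitive word into the word tree under {@code root}; A-Z is lower-cased and skip characters are dropped.
+     * @param word word to insert; a blank word is ignored
+     * @param root root node of the word tree that receives the word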
+     */
+    public static void loadWord(String word, Word root) {
+        if (StringUtils.isBlank(word)) {
+            return;
        }
+        Word current = root;
+        for (int i = 0; i < word.length(); i++) {
+            char c = word.charAt(i);
+            // 如果是大写字母, 转换为小写
+            if (c >= 'A' && c <= 'Z') {
+                c += 32;
+            }
+            if (skip(c)) {
+                continue;
+            }
+            Word next = current.next.get(c);
+            if (next == null) {
+                next = new Word(c);
+                current.next.put(c, next);
+            }
+            current = next;
+        }
+        current.end = true;
    }


@@ -121,28 +137,30 @@ public final class SensitiveWordUtils {
     * @param path 文本文件的绝对路径
     */
    public static void loadWordFromFile(String path) {
-        String encoding = "UTF-8";
-        File file = new File(path);
-        try {
-            if (file.isFile() && file.exists()) {
-                InputStreamReader inputStreamReader = new InputStreamReader(
-                        Files.newInputStream(file.toPath()), encoding
-                );
-                BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
-                String line;
-                ArrayList<String> list = new ArrayList<>();
-                while ((line = bufferedReader.readLine()) != null) {
-                    list.add(line);
-                }
-                bufferedReader.close();
-                inputStreamReader.close();
-                loadWord(list);
-            }
+        try (InputStream inputStream = Files.newInputStream(Paths.get(path))) {
+            loadWord(inputStream);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

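+    /**
+     * Reads sensitive words from a UTF-8 text stream, one word per line, and passes them to loadWord(List).
+     *
+     * @param inputStream text input stream; it is closed together with the reader that wraps it
+     * @throws IOException if reading a line from the stream fails
+     */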
+    public static void loadWord(InputStream inputStream) throws IOException {
+        try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
+            String line;
+            ArrayList<String> list = new ArrayList<>();
+            while ((line = reader.readLine()) != null) {
+                list.add(line);
+            }
+            loadWord(list);
+        }
+    }
+
    /**
     * 判断是否需要跳过当前字符
     *
@@ -150,10 +168,7 @@ public final class SensitiveWordUtils {
     * @return true: 需要跳过, false: 不需要跳过
     */
    private static boolean skip(char c) {
-        for (char skipChar : skip) {
-            if (skipChar == c) return true;
-        }
-        return false;
+        return skipSet.contains(c);
    }

    /**
@@ -161,7 +176,7 @@ public final class SensitiveWordUtils {
     */
    private static class Word {
        // 当前字符
-        private char c;
+        private final char c;

        // 结束标识
        private boolean end;
@@ -171,12 +186,17 @@ public final class SensitiveWordUtils {

        public Word(char c) {
            this.c = c;
+            this.end = false;
+            this.next = new HashMap<>();
        }
    }

    public static void main(String[] args) {
-        List<String> strings = Arrays.asList("白日梦", "白痴", "白痴是你","TMD");
-        loadWord(strings);
-        System.out.println(filter("TMD,白痴是你吗"));
+        String text = "白日,梦";
+        String filter = filter(text);
+        System.out.println(filter);
+
+
    }
+
 }