From 744e1243a2fd881a70cc55f19c7ff1f72de75098 Mon Sep 17 00:00:00 2001
From: zhaoyuhang <1045078399@qq.com>
Date: Mon, 19 Jun 2023 18:00:19 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=95=8F=E6=84=9F=E8=AF=8Dbu?=
 =?UTF-8?q?g?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../common/utils/SensitiveWordUtils.java      | 194 ++++++++++--------
 1 file changed, 107 insertions(+), 87 deletions(-)
diff --git a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
index 63b4e24..336da93 100644
--- a/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
+++ b/mallchat-common/src/main/java/com/abin/mallchat/common/common/utils/SensitiveWordUtils.java
@@ -1,21 +1,35 @@
 package com.abin.mallchat.common.common.utils;
 
+import org.apache.commons.collections.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;
 
 import java.io.BufferedReader;
-import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.*;
-import java.util.stream.Collectors;
 
+/**
+ * 敏感词工具类
+ *
+ * @author zhaoyuhang
+ * @date 2023/06/19
+ */
 public final class SensitiveWordUtils {
-    private static Map<Character, Word> wordMap; // 敏感词Map
+    private static Word root = new Word(' '); // root node of the sensitive-word trie; loadWord(List) replaces it with a freshly built root
     private final static char replace = '*'; // 替代字符
-    private final static char[] skip = new char[]{ // 遇到这些字符就会跳过
-            ' ', '!', '*', '-', '+', '_', '=', ',', '，', '.', '@', ';', ':', '；', '：'
-    };
+    private final static String skipChars = " !*-+_=,，.@;:；：。、？?（）()【】[]《》<>“”\"‘’"; // characters that are ignored while matching (space, punctuation, brackets, quotes)
+    private final static Set<Character> skipSet = new HashSet<>(); // set form of skipChars, consulted by skip()
+
+    static { // copies every character of skipChars into skipSet when the class is initialized
+        for (char c : skipChars.toCharArray()) {
+            skipSet.add(c);
+        }
+    }
+
 
     /**
      * 判断文本中是否存在敏感词
@@ -28,12 +42,6 @@ public final class SensitiveWordUtils {
         return !Objects.equals(filter(text), text);
     }
 
-    /**
-     * 过滤敏感词并替换为指定字符
-     *
-     * @param text 待替换文本
-     * @return 替换后的文本
-     */
     /**
      * 敏感词替换
      *
@@ -41,45 +49,41 @@ public final class SensitiveWordUtils {
      * @return 替换后的文本
      */
     public static String filter(String text) {
-        if (wordMap == null || wordMap.isEmpty() || StringUtils.isBlank(text)) return text;
-        char[] chars = text.toCharArray(); // 将文本转换为字符数组
-        int length = chars.length; // 文本长度
-        StringBuilder result = new StringBuilder(length); // 存储替换后的结果
-        int i = 0; // 当前遍历的字符索引
-        while (i < length) {
-            char c = chars[i]; // 当前字符
-            if (skip(c)) { // 如果是需要跳过的字符，则直接追加到结果中
-                i++;
+        if (text == null || text.isEmpty()) return text; // nothing to scan; also avoids the NPE from new StringBuilder(null)
+        StringBuilder result = new StringBuilder(text); // working copy; matched spans are masked in place
+        int index = 0; // position where the next match attempt starts
+        while (index < result.length()) {
+            char c = result.charAt(index);
+            if (skip(c)) { // a match never starts on a skip character
+                index++;
                 continue;
             }
-            int startIndex = i; // 敏感词匹配的起始索引
-            Map<Character, Word> currentMap = wordMap; // 当前层级的敏感词字典
-            int matchLength = 0; // 匹配到的敏感词长度
-            for (int j = i; j < length; j++) {
-                char ch = chars[j]; // 当前遍历的字符
-                if (skip(ch)) { // 如果是需要跳过的字符，则直接追加到结果中
+            Word word = root; // restart at the top of the trie for every start position
+            int start = index;
+            boolean found = false;
+            for (int i = index; i < result.length(); i++) {
+                c = result.charAt(i);
+                if (skip(c)) { // skip characters inside a word are stepped over but stay inside the masked span
                     continue;
                 }
-                Word word = currentMap.get(ch); // 获取当前字符在当前层级的敏感词字典中对应的敏感词节点
-                if (word == null) { // 如果未匹配到敏感词节点，则终止循环
+                // Fold ASCII upper case to lower case, the same folding loadWord applies to dictionary keys.
+                c = (c >= 'A' && c <= 'Z') ? (char) (c + 32) : c;
+                word = word.next.get(c);
+                if (word == null) { // no dictionary word continues with this character
                     break;
                 }
-                if (word.end) { // 如果当前节点是敏感词的最后一个节点，则记录匹配长度
-                    matchLength = j - startIndex + 1;
-                }
-                currentMap = word.next; // 进入下一层级的敏感词字典
-                if (word.next == null) { // 如果当前节点是敏感词的最后一个节点，则记录匹配长度
-                    matchLength = j - startIndex + 1;
+                if (word.end) { // a complete word ends here; keep scanning in case a longer word also matches
+                    found = true;
+                    for (int j = start; j <= i; j++) {
+                        result.setCharAt(j, replace);
+                    }
+                    index = i + 1; // resume after the masked span
                 }
             }
-            if (matchLength > 0) { // 如果匹配到敏感词，则将对应的字符替换为指定替代字符
-                for (int j = startIndex; j < startIndex + matchLength; j++) {
-                    chars[j] = replace;
-                }
+            if (!found) { // nothing matched from this start; move on by one character
+                index++;
             }
-            i += matchLength > 0 ? matchLength : 1; // 更新当前索引，跳过匹配到的敏感词
         }
-        result.append(chars); // 将匹配到的敏感词追加到结果中
         return result.toString();
     }
 
@@ -90,30 +94,42 @@ public final class SensitiveWordUtils {
      * @param words 敏感词数组
      */
     public static void loadWord(List<String> words) {
-        if (words == null) return;
-        words = words.stream().distinct().collect(Collectors.toList()); // 去重
-        wordMap = new HashMap<>(); // 创建敏感词字典的根节点
-        for (String word : words) {
-            if (word == null) continue;
-            char[] chars = word.toCharArray();
-            Map<Character, Word> currentMap = wordMap; // 当前层级的敏感词字典
-            for (int i = 0; i < chars.length; i++) {
-                char c = chars[i];
-                Word currentWord = currentMap.get(c);
-                if (currentWord == null) {
-                    Word newWord = new Word(c); // 创建新的敏感词节点
-                    currentMap.put(c, newWord); // 将节点添加到当前层级的敏感词字典中
-                    if (i == chars.length - 1) {
-                        newWord.end = true; // 添加结束标志
-                    }
-                    currentMap = newWord.next = new HashMap<>(); // 进入下一层级
-                } else {
-                    currentMap = currentWord.next; // 存在该字符的节点，则进入下一层级
-                }
-            }
+        if (!CollectionUtils.isEmpty(words)) { // a null or empty list leaves the current dictionary untouched
+            Word newRoot = new Word(' '); // build the replacement trie separately from the one in use
+            words.forEach(word -> loadWord(word, newRoot)); // insert each word via the (String, Word) overload
+            root = newRoot; // publish the fully built trie with a single assignment
         }
     }
 
+    /**
+     * Adds one sensitive word to the trie under {@code root}; letters A-Z are stored as a-z.
+     *
+     * @param word word to add; blank words and words made only of skip characters are ignored
+     * @param root trie node to insert under
+     */
+    public static void loadWord(String word, Word root) {
+        if (StringUtils.isBlank(word)) {
+            return;
+        }
+        Word current = root;
+        for (int i = 0; i < word.length(); i++) {
+            char c = word.charAt(i);
+            c = (c >= 'A' && c <= 'Z') ? (char) (c + 32) : c; // ASCII case folding, mirrored by filter()
+            if (skip(c)) { // filter() steps over these characters, so they must not become trie keys
+                continue;
+            }
+            Word next = current.next.get(c);
+            if (next == null) {
+                next = new Word(c);
+                current.next.put(c, next);
+            }
+            current = next;
+        }
+        if (current != root) { // a word made only of skip characters adds no node and must not flag the root
+            current.end = true;
+        }
+    }
+
 
     /**
      * 从文本文件中加载敏感词列表
@@ -121,28 +137,30 @@ public final class SensitiveWordUtils {
      * @param path 文本文件的绝对路径
      */
     public static void loadWordFromFile(String path) {
-        String encoding = "UTF-8";
-        File file = new File(path);
-        try {
-            if (file.isFile() && file.exists()) {
-                InputStreamReader inputStreamReader = new InputStreamReader(
-                        Files.newInputStream(file.toPath()), encoding
-                );
-                BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
-                String line;
-                ArrayList<String> list = new ArrayList<>();
-                while ((line = bufferedReader.readLine()) != null) {
-                    list.add(line);
-                }
-                bufferedReader.close();
-                inputStreamReader.close();
-                loadWord(list);
-            }
+        try (InputStream inputStream = Files.newInputStream(Paths.get(path))) { // try-with-resources closes the stream on every exit path
+            loadWord(inputStream); // reading and parsing are delegated to the InputStream overload
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
 
+    /**
+     * Reads sensitive words from a UTF-8 text stream, one word per line, and loads them.
+     *
+     * @param inputStream source of the word list; it is closed before this method returns
+     * @throws IOException if reading from the stream fails
+     */
+    public static void loadWord(InputStream inputStream) throws IOException {
+        InputStreamReader decoder = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
+        try (BufferedReader lineReader = new BufferedReader(decoder)) {
+            List<String> words = new ArrayList<>();
+            for (String line = lineReader.readLine(); line != null; line = lineReader.readLine()) {
+                words.add(line);
+            }
+            loadWord(words);
+        }
+    }
+
     /**
      * 判断是否需要跳过当前字符
      *
@@ -150,10 +168,7 @@ public final class SensitiveWordUtils {
      * @return true: 需要跳过, false: 不需要跳过
      */
     private static boolean skip(char c) {
-        for (char skipChar : skip) {
-            if (skipChar == c) return true;
-        }
-        return false;
+        return skipSet.contains(c); // set lookup replaces the linear scan over the former skip array
     }
 
     /**
@@ -161,7 +176,7 @@ public final class SensitiveWordUtils {
      */
     private static class Word {
         // 当前字符
-        private char c;
+        private final char c;
 
         // 结束标识
         private boolean end;
@@ -171,12 +186,17 @@ public final class SensitiveWordUtils {
 
         public Word(char c) {
             this.c = c;
+            this.end = false; // a freshly created node does not terminate any word yet
+            this.next = new HashMap<>(); // start with an empty child map so callers can call next.get(...) directly
         }
     }
 
     public static void main(String[] args) {
-        List<String> strings = Arrays.asList("白日梦", "白痴", "白痴是你","TMD");
-        loadWord(strings);
-        System.out.println(filter("TMD,白痴是你吗"));
+        String text = "白日,梦"; // sample input with a skip character (',') in the middle
+        String filter = filter(text); // NOTE(review): no loadWord(...) call is visible in this method, so the dictionary may be empty here
+        System.out.println(filter); // prints the filtered text
+
+
     }
+
 }