From 11696a016d29312e0e44819a693875154b8eb33e Mon Sep 17 00:00:00 2001 From: ageerle Date: Wed, 25 Mar 2026 18:51:01 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E7=B1=BB=E5=9E=8B=E5=8C=B9=E9=85=8D=E5=92=8C=E7=9F=A5=E8=AF=86?= =?UTF-8?q?=E5=BA=93=E5=88=87=E5=89=B2=E9=85=8D=E7=BD=AE=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 修复 ResourceLoaderFactory 文件扩展名匹配问题 - 去除扩展名前导点,确保 .pdf 能正确匹配 PDF 解析器 - 修复 PDF/Word/Excel 等文件走错解析逻辑的问题 2. 优化文本切割器动态配置 - CharacterTextSplitter 和 ExcelTextSplitter 支持从知识库读取配置 - 根据 kid 查询 separator、textBlockSize、overlapChar - 查询失败时降级使用默认配置 Co-Authored-By: Claude Opus 4.6 --- .../ruoyi/factory/ResourceLoaderFactory.java | 2 ++ .../impl/split/CharacterTextSplitter.java | 32 +++++++++++++++++-- .../impl/split/ExcelTextSplitter.java | 29 +++++++++++++++-- 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/factory/ResourceLoaderFactory.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/factory/ResourceLoaderFactory.java index a03b4fd1..de293426 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/factory/ResourceLoaderFactory.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/factory/ResourceLoaderFactory.java @@ -1,6 +1,7 @@ package org.ruoyi.factory; import lombok.AllArgsConstructor; +import org.apache.commons.lang3.StringUtils; import org.ruoyi.constant.FileTypeConstants; import org.ruoyi.service.knowledge.ResourceLoader; import org.ruoyi.service.knowledge.impl.loader.*; @@ -16,6 +17,7 @@ public class ResourceLoaderFactory { private final ExcelTextSplitter excelTextSplitter; public ResourceLoader getLoaderByFileType(String fileType) { + fileType = StringUtils.removeStart(fileType, "."); if (FileTypeConstants.isTextFile(fileType)) { return new TextFileLoader(characterTextSplitter); } else if (FileTypeConstants.isWord(fileType)) { diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/knowledge/impl/split/CharacterTextSplitter.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/knowledge/impl/split/CharacterTextSplitter.java index a345b865..107f0dc1 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/knowledge/impl/split/CharacterTextSplitter.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/knowledge/impl/split/CharacterTextSplitter.java @@ -1,7 +1,10 @@ package org.ruoyi.service.knowledge.impl.split; +import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.ruoyi.common.core.utils.StringUtils; +import org.ruoyi.domain.vo.knowledge.KnowledgeInfoVo; +import org.ruoyi.service.knowledge.IKnowledgeInfoService; import org.ruoyi.service.knowledge.TextSplitter; import org.springframework.context.annotation.Primary; import org.springframework.stereotype.Component; @@ -13,14 +16,37 @@ import java.util.List; @Component @Slf4j @Primary +@AllArgsConstructor public class CharacterTextSplitter implements TextSplitter { + private final IKnowledgeInfoService knowledgeInfoService; + @Override public List split(String content, String kid) { - // 使用默认配置 + // 默认配置值 String knowledgeSeparator = "#"; - int textBlockSize = 10000; - int overlapChar = 500; + int textBlockSize = 1000; + int overlapChar = 50; + + // 根据知识库ID查询配置,覆盖默认值 + if (StringUtils.isNotBlank(kid)) { + try { + KnowledgeInfoVo info = knowledgeInfoService.queryById(Long.parseLong(kid)); + if (info != null) { + if (StringUtils.isNotBlank(info.getSeparator())) { + knowledgeSeparator = info.getSeparator(); + } + if (info.getTextBlockSize() != null && info.getTextBlockSize() > 0) { + textBlockSize = info.getTextBlockSize().intValue(); + } + if (info.getOverlapChar() != null && info.getOverlapChar() > 0) { + overlapChar = info.getOverlapChar().intValue(); + } + } + } catch (Exception e) { + log.warn("查询知识库配置失败,使用默认配置, kid={}", kid, e); + } + } List chunkList = new ArrayList<>(); if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) { diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/knowledge/impl/split/ExcelTextSplitter.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/knowledge/impl/split/ExcelTextSplitter.java index 0c242b20..1b610107 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/knowledge/impl/split/ExcelTextSplitter.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/knowledge/impl/split/ExcelTextSplitter.java @@ -3,6 +3,8 @@ package org.ruoyi.service.knowledge.impl.split; import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.ruoyi.common.core.utils.StringUtils; +import org.ruoyi.domain.vo.knowledge.KnowledgeInfoVo; +import org.ruoyi.service.knowledge.IKnowledgeInfoService; import org.ruoyi.service.knowledge.TextSplitter; import org.springframework.stereotype.Component; @@ -15,13 +17,34 @@ import java.util.List; @Slf4j public class ExcelTextSplitter implements TextSplitter { + private final IKnowledgeInfoService knowledgeInfoService; @Override public List split(String content, String kid) { - // 使用默认配置 + // 默认配置 String knowledgeSeparator = "#"; - int textBlockSize = 10000; - int overlapChar = 500; + int textBlockSize = 1000; + int overlapChar = 50; + + // 根据知识库ID查询配置,覆盖默认值 + if (StringUtils.isNotBlank(kid)) { + try { + KnowledgeInfoVo info = knowledgeInfoService.queryById(Long.parseLong(kid)); + if (info != null) { + if (StringUtils.isNotBlank(info.getSeparator())) { + knowledgeSeparator = info.getSeparator(); + } + if (info.getTextBlockSize() != null && info.getTextBlockSize() > 0) { + textBlockSize = info.getTextBlockSize().intValue(); + } + if (info.getOverlapChar() != null && info.getOverlapChar() > 0) { + overlapChar = info.getOverlapChar().intValue(); + } + } + } catch (Exception e) { + log.warn("查询知识库配置失败,使用默认配置, kid={}", kid, e); + } + } List chunkList = new ArrayList<>(); if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) { // 按自定义分隔符切分