fix: 修复文件类型匹配和知识库切割配置问题

1. 修复 ResourceLoaderFactory 文件扩展名匹配问题
   - 去除扩展名前导点，确保 .pdf 能正确匹配 PDF 解析器
   - 修复 PDF/Word/Excel 等文件走错解析逻辑的问题

2. 优化文本切割器动态配置
   - CharacterTextSplitter 和 ExcelTextSplitter 支持从知识库读取配置
   - 根据 kid 查询 separator、textBlockSize、overlapChar
   - 查询失败时降级使用默认配置

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
ageerle
2026-03-25 18:51:01 +08:00
parent 1a10104751
commit 11696a016d
3 changed files with 57 additions and 6 deletions

View File

@@ -1,6 +1,7 @@
package org.ruoyi.factory;
import lombok.AllArgsConstructor;
import org.apache.commons.lang3.StringUtils;
import org.ruoyi.constant.FileTypeConstants;
import org.ruoyi.service.knowledge.ResourceLoader;
import org.ruoyi.service.knowledge.impl.loader.*;
@@ -16,6 +17,7 @@ public class ResourceLoaderFactory {
private final ExcelTextSplitter excelTextSplitter;
public ResourceLoader getLoaderByFileType(String fileType) {
fileType = StringUtils.removeStart(fileType, ".");
if (FileTypeConstants.isTextFile(fileType)) {
return new TextFileLoader(characterTextSplitter);
} else if (FileTypeConstants.isWord(fileType)) {

View File

@@ -1,7 +1,10 @@
package org.ruoyi.service.knowledge.impl.split;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.ruoyi.common.core.utils.StringUtils;
import org.ruoyi.domain.vo.knowledge.KnowledgeInfoVo;
import org.ruoyi.service.knowledge.IKnowledgeInfoService;
import org.ruoyi.service.knowledge.TextSplitter;
import org.springframework.context.annotation.Primary;
import org.springframework.stereotype.Component;
@@ -13,14 +16,37 @@ import java.util.List;
@Component
@Slf4j
@Primary
@AllArgsConstructor
public class CharacterTextSplitter implements TextSplitter {
private final IKnowledgeInfoService knowledgeInfoService;
@Override
public List<String> split(String content, String kid) {
// 使用默认配置
// 默认配置
String knowledgeSeparator = "#";
int textBlockSize = 10000;
int overlapChar = 500;
int textBlockSize = 1000;
int overlapChar = 50;
// 根据知识库ID查询配置覆盖默认值
if (StringUtils.isNotBlank(kid)) {
try {
KnowledgeInfoVo info = knowledgeInfoService.queryById(Long.parseLong(kid));
if (info != null) {
if (StringUtils.isNotBlank(info.getSeparator())) {
knowledgeSeparator = info.getSeparator();
}
if (info.getTextBlockSize() != null && info.getTextBlockSize() > 0) {
textBlockSize = info.getTextBlockSize().intValue();
}
if (info.getOverlapChar() != null && info.getOverlapChar() > 0) {
overlapChar = info.getOverlapChar().intValue();
}
}
} catch (Exception e) {
log.warn("查询知识库配置失败,使用默认配置, kid={}", kid, e);
}
}
List<String> chunkList = new ArrayList<>();
if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) {

View File

@@ -3,6 +3,8 @@ package org.ruoyi.service.knowledge.impl.split;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.ruoyi.common.core.utils.StringUtils;
import org.ruoyi.domain.vo.knowledge.KnowledgeInfoVo;
import org.ruoyi.service.knowledge.IKnowledgeInfoService;
import org.ruoyi.service.knowledge.TextSplitter;
import org.springframework.stereotype.Component;
@@ -15,13 +17,34 @@ import java.util.List;
@Slf4j
public class ExcelTextSplitter implements TextSplitter {
private final IKnowledgeInfoService knowledgeInfoService;
@Override
public List<String> split(String content, String kid) {
// 使用默认配置
// 默认配置
String knowledgeSeparator = "#";
int textBlockSize = 10000;
int overlapChar = 500;
int textBlockSize = 1000;
int overlapChar = 50;
// 根据知识库ID查询配置覆盖默认值
if (StringUtils.isNotBlank(kid)) {
try {
KnowledgeInfoVo info = knowledgeInfoService.queryById(Long.parseLong(kid));
if (info != null) {
if (StringUtils.isNotBlank(info.getSeparator())) {
knowledgeSeparator = info.getSeparator();
}
if (info.getTextBlockSize() != null && info.getTextBlockSize() > 0) {
textBlockSize = info.getTextBlockSize().intValue();
}
if (info.getOverlapChar() != null && info.getOverlapChar() > 0) {
overlapChar = info.getOverlapChar().intValue();
}
}
} catch (Exception e) {
log.warn("查询知识库配置失败,使用默认配置, kid={}", kid, e);
}
}
List<String> chunkList = new ArrayList<>();
if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) {
// 按自定义分隔符切分