mirror of
https://gitcode.com/ageerle/ruoyi-ai.git
synced 2026-04-16 13:23:42 +00:00
fix: 修复文件类型匹配和知识库切割配置问题
1. 修复 ResourceLoaderFactory 文件扩展名匹配问题
   - 去除扩展名前导点,确保 .pdf 能正确匹配 PDF 解析器
   - 修复 PDF/Word/Excel 等文件走错解析逻辑的问题
2. 优化文本切割器动态配置
   - CharacterTextSplitter 和 ExcelTextSplitter 支持从知识库读取配置
   - 根据 kid 查询 separator、textBlockSize、overlapChar
   - 查询失败时降级使用默认配置

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
package org.ruoyi.factory;
|
package org.ruoyi.factory;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.ruoyi.constant.FileTypeConstants;
|
import org.ruoyi.constant.FileTypeConstants;
|
||||||
import org.ruoyi.service.knowledge.ResourceLoader;
|
import org.ruoyi.service.knowledge.ResourceLoader;
|
||||||
import org.ruoyi.service.knowledge.impl.loader.*;
|
import org.ruoyi.service.knowledge.impl.loader.*;
|
||||||
@@ -16,6 +17,7 @@ public class ResourceLoaderFactory {
|
|||||||
private final ExcelTextSplitter excelTextSplitter;
|
private final ExcelTextSplitter excelTextSplitter;
|
||||||
|
|
||||||
public ResourceLoader getLoaderByFileType(String fileType) {
|
public ResourceLoader getLoaderByFileType(String fileType) {
|
||||||
|
fileType = StringUtils.removeStart(fileType, ".");
|
||||||
if (FileTypeConstants.isTextFile(fileType)) {
|
if (FileTypeConstants.isTextFile(fileType)) {
|
||||||
return new TextFileLoader(characterTextSplitter);
|
return new TextFileLoader(characterTextSplitter);
|
||||||
} else if (FileTypeConstants.isWord(fileType)) {
|
} else if (FileTypeConstants.isWord(fileType)) {
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
package org.ruoyi.service.knowledge.impl.split;
|
package org.ruoyi.service.knowledge.impl.split;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.ruoyi.common.core.utils.StringUtils;
|
import org.ruoyi.common.core.utils.StringUtils;
|
||||||
|
import org.ruoyi.domain.vo.knowledge.KnowledgeInfoVo;
|
||||||
|
import org.ruoyi.service.knowledge.IKnowledgeInfoService;
|
||||||
import org.ruoyi.service.knowledge.TextSplitter;
|
import org.ruoyi.service.knowledge.TextSplitter;
|
||||||
import org.springframework.context.annotation.Primary;
|
import org.springframework.context.annotation.Primary;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
@@ -13,14 +16,37 @@ import java.util.List;
|
|||||||
@Component
|
@Component
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Primary
|
@Primary
|
||||||
|
@AllArgsConstructor
|
||||||
public class CharacterTextSplitter implements TextSplitter {
|
public class CharacterTextSplitter implements TextSplitter {
|
||||||
|
|
||||||
|
private final IKnowledgeInfoService knowledgeInfoService;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> split(String content, String kid) {
|
public List<String> split(String content, String kid) {
|
||||||
// 使用默认配置
|
// 默认配置值
|
||||||
String knowledgeSeparator = "#";
|
String knowledgeSeparator = "#";
|
||||||
int textBlockSize = 10000;
|
int textBlockSize = 1000;
|
||||||
int overlapChar = 500;
|
int overlapChar = 50;
|
||||||
|
|
||||||
|
// 根据知识库ID查询配置,覆盖默认值
|
||||||
|
if (StringUtils.isNotBlank(kid)) {
|
||||||
|
try {
|
||||||
|
KnowledgeInfoVo info = knowledgeInfoService.queryById(Long.parseLong(kid));
|
||||||
|
if (info != null) {
|
||||||
|
if (StringUtils.isNotBlank(info.getSeparator())) {
|
||||||
|
knowledgeSeparator = info.getSeparator();
|
||||||
|
}
|
||||||
|
if (info.getTextBlockSize() != null && info.getTextBlockSize() > 0) {
|
||||||
|
textBlockSize = info.getTextBlockSize().intValue();
|
||||||
|
}
|
||||||
|
if (info.getOverlapChar() != null && info.getOverlapChar() > 0) {
|
||||||
|
overlapChar = info.getOverlapChar().intValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("查询知识库配置失败,使用默认配置, kid={}", kid, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
List<String> chunkList = new ArrayList<>();
|
List<String> chunkList = new ArrayList<>();
|
||||||
if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) {
|
if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) {
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package org.ruoyi.service.knowledge.impl.split;
|
|||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.ruoyi.common.core.utils.StringUtils;
|
import org.ruoyi.common.core.utils.StringUtils;
|
||||||
|
import org.ruoyi.domain.vo.knowledge.KnowledgeInfoVo;
|
||||||
|
import org.ruoyi.service.knowledge.IKnowledgeInfoService;
|
||||||
import org.ruoyi.service.knowledge.TextSplitter;
|
import org.ruoyi.service.knowledge.TextSplitter;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
@@ -15,13 +17,34 @@ import java.util.List;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class ExcelTextSplitter implements TextSplitter {
|
public class ExcelTextSplitter implements TextSplitter {
|
||||||
|
|
||||||
|
private final IKnowledgeInfoService knowledgeInfoService;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> split(String content, String kid) {
|
public List<String> split(String content, String kid) {
|
||||||
// 使用默认配置
|
// 默认配置
|
||||||
String knowledgeSeparator = "#";
|
String knowledgeSeparator = "#";
|
||||||
int textBlockSize = 10000;
|
int textBlockSize = 1000;
|
||||||
int overlapChar = 500;
|
int overlapChar = 50;
|
||||||
|
|
||||||
|
// 根据知识库ID查询配置,覆盖默认值
|
||||||
|
if (StringUtils.isNotBlank(kid)) {
|
||||||
|
try {
|
||||||
|
KnowledgeInfoVo info = knowledgeInfoService.queryById(Long.parseLong(kid));
|
||||||
|
if (info != null) {
|
||||||
|
if (StringUtils.isNotBlank(info.getSeparator())) {
|
||||||
|
knowledgeSeparator = info.getSeparator();
|
||||||
|
}
|
||||||
|
if (info.getTextBlockSize() != null && info.getTextBlockSize() > 0) {
|
||||||
|
textBlockSize = info.getTextBlockSize().intValue();
|
||||||
|
}
|
||||||
|
if (info.getOverlapChar() != null && info.getOverlapChar() > 0) {
|
||||||
|
overlapChar = info.getOverlapChar().intValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("查询知识库配置失败,使用默认配置, kid={}", kid, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
List<String> chunkList = new ArrayList<>();
|
List<String> chunkList = new ArrayList<>();
|
||||||
if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) {
|
if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) {
|
||||||
// 按自定义分隔符切分
|
// 按自定义分隔符切分
|
||||||
|
|||||||
Reference in New Issue
Block a user