feat: 调整知识库模块

This commit is contained in:
ageerle
2025-04-09 17:41:29 +08:00
parent be6d027cad
commit 3be9005f95
424 changed files with 1584 additions and 10005 deletions

View File

@@ -0,0 +1,37 @@
package org.ruoyi.chain.loader;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.ruoyi.chain.split.TextSplitter;
import org.springframework.stereotype.Component;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;
@Component
@AllArgsConstructor
@Slf4j
public class CodeFileLoader implements ResourceLoader {
private final TextSplitter textSplitter;
@Override
public String getContent(InputStream inputStream) {
StringBuffer stringBuffer = new StringBuffer();
try (InputStreamReader reader = new InputStreamReader(inputStream);
BufferedReader bufferedReader = new BufferedReader(reader)){
String line;
while ((line = bufferedReader.readLine()) != null) {
stringBuffer.append(line).append("\n");
}
} catch (IOException e) {
e.printStackTrace();
}
return stringBuffer.toString();
}
@Override
public List<String> getChunkList(String content, String kid){
return textSplitter.split(content, kid);
}
}

View File

@@ -0,0 +1,18 @@
package org.ruoyi.chain.loader;
import org.ruoyi.chain.loader.ResourceLoader;
import java.io.InputStream;
import java.util.List;
/**
 * Placeholder {@link ResourceLoader}; neither method is implemented yet and both return
 * {@code null}.
 */
public class CsvFileLoader implements ResourceLoader {

    /**
     * Not implemented.
     *
     * @param inputStream ignored
     * @return always {@code null}
     */
    @Override
    public String getContent(InputStream inputStream) {
        return null;
    }

    /**
     * Not implemented.
     *
     * @param content ignored
     * @param kid     ignored
     * @return always {@code null}
     */
    @Override
    public List<String> getChunkList(String content, String kid) {
        return null;
    }
}

View File

@@ -0,0 +1,16 @@
package org.ruoyi.chain.loader;
import java.io.InputStream;
import java.util.List;
/**
 * Placeholder {@link ResourceLoader}; neither method is implemented yet and both return
 * {@code null}.
 */
public class FolderLoader implements ResourceLoader {

    /**
     * Not implemented.
     *
     * @param inputStream ignored
     * @return always {@code null}
     */
    @Override
    public String getContent(InputStream inputStream) {
        return null;
    }

    /**
     * Not implemented.
     *
     * @param content ignored
     * @param kid     ignored
     * @return always {@code null}
     */
    @Override
    public List<String> getChunkList(String content, String kid) {
        return null;
    }
}

View File

@@ -0,0 +1,18 @@
package org.ruoyi.chain.loader;
import org.ruoyi.chain.loader.ResourceLoader;
import java.io.InputStream;
import java.util.List;
/**
 * Placeholder {@link ResourceLoader}; neither method is implemented yet and both return
 * {@code null}.
 */
public class GithubLoader implements ResourceLoader {

    /**
     * Not implemented.
     *
     * @param inputStream ignored
     * @return always {@code null}
     */
    @Override
    public String getContent(InputStream inputStream) {
        return null;
    }

    /**
     * Not implemented.
     *
     * @param content ignored
     * @param kid     ignored
     * @return always {@code null}
     */
    @Override
    public List<String> getChunkList(String content, String kid) {
        return null;
    }
}

View File

@@ -0,0 +1,18 @@
package org.ruoyi.chain.loader;
import org.ruoyi.chain.loader.ResourceLoader;
import java.io.InputStream;
import java.util.List;
/**
 * Placeholder {@link ResourceLoader}; neither method is implemented yet and both return
 * {@code null}.
 */
public class JsonFileLoader implements ResourceLoader {

    /**
     * Not implemented.
     *
     * @param inputStream ignored
     * @return always {@code null}
     */
    @Override
    public String getContent(InputStream inputStream) {
        return null;
    }

    /**
     * Not implemented.
     *
     * @param content ignored
     * @param kid     ignored
     * @return always {@code null}
     */
    @Override
    public List<String> getChunkList(String content, String kid) {
        return null;
    }
}

View File

@@ -0,0 +1,38 @@
package org.ruoyi.chain.loader;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.ruoyi.chain.loader.ResourceLoader;
import org.ruoyi.chain.split.TextSplitter;
import org.springframework.stereotype.Component;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;
@Component
@AllArgsConstructor
@Slf4j
public class MarkDownFileLoader implements ResourceLoader {
private final TextSplitter textSplitter;
@Override
public String getContent(InputStream inputStream) {
StringBuffer stringBuffer = new StringBuffer();
try (InputStreamReader reader = new InputStreamReader(inputStream);
BufferedReader bufferedReader = new BufferedReader(reader)){
String line;
while ((line = bufferedReader.readLine()) != null) {
stringBuffer.append(line).append("\n");
}
} catch (IOException e) {
e.printStackTrace();
}
return stringBuffer.toString();
}
@Override
public List<String> getChunkList(String content, String kid){
return textSplitter.split(content, kid);
}
}

View File

@@ -0,0 +1,35 @@
package org.ruoyi.chain.loader;
import lombok.AllArgsConstructor;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.ruoyi.chain.loader.ResourceLoader;
import org.ruoyi.chain.split.TextSplitter;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
/**
 * {@link ResourceLoader} that extracts text from a PDF stream with PDFBox and delegates
 * chunking to the injected {@link TextSplitter}.
 */
@Component
@AllArgsConstructor
public class PdfFileLoader implements ResourceLoader {

    /** Splitter used to break the extracted text into chunks. */
    private final TextSplitter characterTextSplitter;

    /**
     * Loads the PDF from the stream and returns its text as produced by {@link PDFTextStripper}.
     *
     * <p>The {@link PDDocument} is closed before returning so its underlying resources are
     * released even when text extraction fails.
     *
     * @param inputStream stream containing the PDF
     * @return the extracted text
     * @throws RuntimeException wrapping the {@link IOException} if the document cannot be loaded,
     *                          read or closed
     */
    @Override
    public String getContent(InputStream inputStream) {
        try (PDDocument document = PDDocument.load(inputStream)) {
            PDFTextStripper textStripper = new PDFTextStripper();
            return textStripper.getText(document);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Splits the given content into chunks using the injected splitter.
     *
     * @param content text to split
     * @param kid     knowledge base id, passed through to the splitter
     * @return whatever the splitter returns for this content
     */
    @Override
    public List<String> getChunkList(String content, String kid) {
        return characterTextSplitter.split(content, kid);
    }
}

View File

@@ -0,0 +1,14 @@
package org.ruoyi.chain.loader;
import java.io.InputStream;
import java.util.List;
/**
 * Resource loading: turns a raw input stream into text and that text into chunks.
 */
public interface ResourceLoader {

    /**
     * Produces the textual content of a resource.
     *
     * @param inputStream stream holding the resource
     * @return the content as a string
     */
    String getContent(InputStream inputStream);

    /**
     * Produces the list of chunks for a piece of content.
     *
     * @param content text to split
     * @param kid     knowledge base id
     * @return the chunks
     */
    List<String> getChunkList(String content, String kid);
}

View File

@@ -0,0 +1,35 @@
package org.ruoyi.chain.loader;
import lombok.AllArgsConstructor;
import org.ruoyi.chain.loader.ResourceLoader;
import org.ruoyi.chain.loader.TextFileLoader;
import org.ruoyi.chain.split.CharacterTextSplitter;
import org.ruoyi.chain.split.CodeTextSplitter;
import org.ruoyi.chain.split.MarkdownTextSplitter;
import org.ruoyi.chain.split.TokenTextSplitter;
import org.ruoyi.knowledge.constant.FileType;
import org.springframework.stereotype.Component;
/**
 * Creates the {@link ResourceLoader} matching a file type, wiring each loader with one of the
 * injected text splitters.
 */
@AllArgsConstructor
@Component
public class ResourceLoaderFactory {

    private final CharacterTextSplitter characterTextSplitter;
    private final CodeTextSplitter codeTextSplitter;
    private final MarkdownTextSplitter markdownTextSplitter;
    // Injected but not referenced by getLoaderByFileType in this class.
    private final TokenTextSplitter tokenTextSplitter;

    /**
     * Returns a new loader instance for the given file type.
     *
     * <p>The {@link FileType} checks run in order: text, Word, PDF, Markdown, code. Any type
     * matching none of them falls back to a {@link TextFileLoader}.
     *
     * @param fileType file type identifier, as understood by {@link FileType}
     * @return a freshly constructed loader, never {@code null}
     */
    public ResourceLoader getLoaderByFileType(String fileType) {
        if (FileType.isTextFile(fileType)) {
            return new TextFileLoader(characterTextSplitter);
        }
        if (FileType.isWord(fileType)) {
            return new WordLoader(characterTextSplitter);
        }
        if (FileType.isPdf(fileType)) {
            return new PdfFileLoader(characterTextSplitter);
        }
        if (FileType.isMdFile(fileType)) {
            return new MarkDownFileLoader(markdownTextSplitter);
        }
        if (FileType.isCodeFile(fileType)) {
            return new CodeFileLoader(codeTextSplitter);
        }
        // Unknown types are treated as plain text.
        return new TextFileLoader(characterTextSplitter);
    }
}

View File

@@ -0,0 +1,37 @@
package org.ruoyi.chain.loader;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.ruoyi.chain.split.TextSplitter;
import org.springframework.stereotype.Component;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;
@Component
@AllArgsConstructor
@Slf4j
public class TextFileLoader implements ResourceLoader{
private final TextSplitter textSplitter;
@Override
public String getContent(InputStream inputStream) {
StringBuffer stringBuffer = new StringBuffer();
try (InputStreamReader reader = new InputStreamReader(inputStream, "UTF-8");
BufferedReader bufferedReader = new BufferedReader(reader)){
String line;
while ((line = bufferedReader.readLine()) != null) {
stringBuffer.append(line).append("\n");
}
} catch (IOException e) {
e.printStackTrace();
}
return stringBuffer.toString();
}
@Override
public List<String> getChunkList(String content, String kid){
return textSplitter.split(content, kid);
}
}

View File

@@ -0,0 +1,38 @@
package org.ruoyi.chain.loader;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.ruoyi.chain.loader.ResourceLoader;
import org.ruoyi.chain.split.TextSplitter;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
/**
 * {@link ResourceLoader} that extracts text from a Word document stream with Apache POI
 * ({@link XWPFDocument}) and delegates chunking to the injected {@link TextSplitter}.
 */
@Component
@AllArgsConstructor
@Slf4j
public class WordLoader implements ResourceLoader {

    /** Splitter used to break the extracted text into chunks. */
    private final TextSplitter textSplitter;

    /**
     * Opens the document from the stream and returns the text produced by
     * {@link XWPFWordExtractor}.
     *
     * <p>The {@link XWPFDocument} is closed before returning so the resources it holds are
     * released even when extraction fails.
     *
     * @param inputStream stream containing the Word document
     * @return the extracted text
     * @throws RuntimeException wrapping the {@link IOException} if the document cannot be opened,
     *                          read or closed
     */
    @Override
    public String getContent(InputStream inputStream) {
        try (XWPFDocument document = new XWPFDocument(inputStream)) {
            // The extractor only wraps the document; closing the document is sufficient.
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            return extractor.getText();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Splits the given content into chunks using the injected splitter.
     *
     * @param content text to split
     * @param kid     knowledge base id, passed through to the splitter
     * @return whatever the splitter returns for this content
     */
    @Override
    public List<String> getChunkList(String content, String kid) {
        return textSplitter.split(content, kid);
    }
}

View File

@@ -0,0 +1,64 @@
package org.ruoyi.chain.split;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.ruoyi.common.core.utils.StringUtils;
import org.ruoyi.chain.split.TextSplitter;
import org.ruoyi.domain.vo.KnowledgeInfoVo;
import org.ruoyi.service.IKnowledgeInfoService;
import org.springframework.context.annotation.Lazy;
import org.springframework.context.annotation.Primary;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Primary {@link TextSplitter} that chunks text according to the settings stored on a
 * knowledge base.
 *
 * <p>The settings (separator, block size, overlap) are loaded through
 * {@link IKnowledgeInfoService#queryById}. If a non-blank separator is configured and occurs in
 * the content, the content is split on that separator; otherwise it is cut into fixed-size
 * windows that are widened by the overlap on both sides.
 */
@Component
@Slf4j
@Primary
public class CharacterTextSplitter implements TextSplitter {

    // NOTE(review): injected lazily, presumably to break a circular bean dependency — confirm.
    @Lazy
    @Resource
    private IKnowledgeInfoService knowledgeInfoService;

    /**
     * Splits the content using the configuration of the knowledge base identified by {@code kid}.
     *
     * @param content text to split; {@code null} or empty yields an empty list
     * @param kid     knowledge base id, must be parseable as a {@code long}
     * @return a mutable list of chunks, never {@code null}
     * @throws NumberFormatException    if {@code kid} is not a valid long
     * @throws IllegalArgumentException if the knowledge base does not exist, or fixed-size
     *                                  splitting is required but the block size is not positive
     */
    @Override
    public List<String> split(String content, String kid) {
        // Load the splitting configuration from the knowledge base table.
        KnowledgeInfoVo knowledgeInfoVo = knowledgeInfoService.queryById(Long.valueOf(kid));
        if (knowledgeInfoVo == null) {
            throw new IllegalArgumentException("Knowledge base not found, kid=" + kid);
        }

        List<String> chunkList = new ArrayList<>();
        if (content == null || content.isEmpty()) {
            return chunkList;
        }

        String knowledgeSeparator = knowledgeInfoVo.getKnowledgeSeparator();
        // Check for blank first: String.contains(null) would throw a NullPointerException.
        if (StringUtils.isNotBlank(knowledgeSeparator) && content.contains(knowledgeSeparator)) {
            // contains() matches literally, so quote the separator to stop split() from
            // interpreting characters such as '.', '|' or '*' as regex syntax.
            String[] chunks = content.split(java.util.regex.Pattern.quote(knowledgeSeparator));
            chunkList.addAll(Arrays.asList(chunks));
            return chunkList;
        }

        Integer textBlockSize = knowledgeInfoVo.getTextBlockSize();
        Integer overlapChar = knowledgeInfoVo.getOverlapChar();
        if (textBlockSize == null || textBlockSize <= 0) {
            // A non-positive block size would never advance through the content.
            throw new IllegalArgumentException(
                "textBlockSize must be positive for knowledge base " + kid + ", got " + textBlockSize);
        }
        // A missing or negative overlap is treated as "no overlap".
        int overlap = overlapChar == null ? 0 : Math.max(0, overlapChar);
        chunkList.addAll(splitBySize(content, textBlockSize, overlap));
        return chunkList;
    }

    /**
     * Cuts the content into windows of {@code blockSize} characters, each extended by
     * {@code overlap} characters on both sides and clamped to the content bounds.
     */
    private static List<String> splitBySize(String content, int blockSize, int overlap) {
        List<String> chunks = new ArrayList<>();
        int length = content.length();
        // long arithmetic keeps the window bounds from overflowing for very large settings.
        for (long start = 0; start < length; start += blockSize) {
            int begin = (int) Math.max(0L, start - overlap);
            int end = (int) Math.min((long) length, start + blockSize + overlap);
            chunks.add(content.substring(begin, end));
        }
        return chunks;
    }
}

View File

@@ -0,0 +1,18 @@
package org.ruoyi.chain.split;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.ruoyi.chain.split.TextSplitter;
import org.springframework.stereotype.Component;
import java.util.List;
/**
 * Placeholder {@link TextSplitter}; splitting is not implemented yet and {@link #split} returns
 * {@code null}.
 */
@Component
@AllArgsConstructor
@Slf4j
public class CodeTextSplitter implements TextSplitter {

    /**
     * Not implemented.
     *
     * @param content ignored
     * @param kid     ignored
     * @return always {@code null}
     */
    @Override
    public List<String> split(String content, String kid) {
        return null;
    }
}

View File

@@ -0,0 +1,17 @@
package org.ruoyi.chain.split;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import java.util.List;
/**
 * Placeholder {@link TextSplitter}; splitting is not implemented yet and {@link #split} returns
 * {@code null}.
 */
@Component
@AllArgsConstructor
@Slf4j
public class MarkdownTextSplitter implements TextSplitter {

    /**
     * Not implemented.
     *
     * @param content ignored
     * @param kid     ignored
     * @return always {@code null}
     */
    @Override
    public List<String> split(String content, String kid) {
        return null;
    }
}

View File

@@ -0,0 +1,18 @@
package org.ruoyi.chain.split;
import java.util.List;
/**
 * Text splitting.
 */
public interface TextSplitter {

    /**
     * Splits text into chunks.
     *
     * @param content the text content
     * @param kid     the knowledge base id
     * @return the list of text chunks after splitting
     */
    List<String> split(String content, String kid);
}

View File

@@ -0,0 +1,18 @@
package org.ruoyi.chain.split;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.ruoyi.chain.split.TextSplitter;
import org.springframework.stereotype.Component;
import java.util.List;
/**
 * Placeholder {@link TextSplitter}; splitting is not implemented yet and {@link #split} returns
 * {@code null}.
 */
@Component
@AllArgsConstructor
@Slf4j
public class TokenTextSplitter implements TextSplitter {

    /**
     * Not implemented.
     *
     * @param content ignored
     * @param kid     ignored
     * @return always {@code null}
     */
    @Override
    public List<String> split(String content, String kid) {
        return null;
    }
}