mirror of
https://gitcode.com/ageerle/ruoyi-ai.git
synced 2026-03-26 02:53:46 +08:00
feat: 调整知识库模块
This commit is contained in:
@@ -0,0 +1,37 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.ruoyi.chain.split.TextSplitter;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@AllArgsConstructor
|
||||
@Slf4j
|
||||
public class CodeFileLoader implements ResourceLoader {
|
||||
private final TextSplitter textSplitter;
|
||||
@Override
|
||||
public String getContent(InputStream inputStream) {
|
||||
StringBuffer stringBuffer = new StringBuffer();
|
||||
try (InputStreamReader reader = new InputStreamReader(inputStream);
|
||||
BufferedReader bufferedReader = new BufferedReader(reader)){
|
||||
String line;
|
||||
while ((line = bufferedReader.readLine()) != null) {
|
||||
stringBuffer.append(line).append("\n");
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return stringBuffer.toString();
|
||||
}
|
||||
@Override
|
||||
public List<String> getChunkList(String content, String kid){
|
||||
return textSplitter.split(content, kid);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import org.ruoyi.chain.loader.ResourceLoader;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
public class CsvFileLoader implements ResourceLoader {
|
||||
@Override
|
||||
public String getContent(InputStream inputStream) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getChunkList(String content, String kid) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
public class FolderLoader implements ResourceLoader{
|
||||
@Override
|
||||
public String getContent(InputStream inputStream) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getChunkList(String content, String kid) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import org.ruoyi.chain.loader.ResourceLoader;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
public class GithubLoader implements ResourceLoader {
|
||||
@Override
|
||||
public String getContent(InputStream inputStream) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getChunkList(String content, String kid) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import org.ruoyi.chain.loader.ResourceLoader;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
public class JsonFileLoader implements ResourceLoader {
|
||||
@Override
|
||||
public String getContent(InputStream inputStream) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getChunkList(String content, String kid) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.ruoyi.chain.loader.ResourceLoader;
|
||||
import org.ruoyi.chain.split.TextSplitter;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@AllArgsConstructor
|
||||
@Slf4j
|
||||
public class MarkDownFileLoader implements ResourceLoader {
|
||||
private final TextSplitter textSplitter;
|
||||
@Override
|
||||
public String getContent(InputStream inputStream) {
|
||||
StringBuffer stringBuffer = new StringBuffer();
|
||||
try (InputStreamReader reader = new InputStreamReader(inputStream);
|
||||
BufferedReader bufferedReader = new BufferedReader(reader)){
|
||||
String line;
|
||||
while ((line = bufferedReader.readLine()) != null) {
|
||||
stringBuffer.append(line).append("\n");
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return stringBuffer.toString();
|
||||
}
|
||||
@Override
|
||||
public List<String> getChunkList(String content, String kid){
|
||||
return textSplitter.split(content, kid);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.ruoyi.chain.loader.ResourceLoader;
|
||||
import org.ruoyi.chain.split.TextSplitter;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@AllArgsConstructor
|
||||
public class PdfFileLoader implements ResourceLoader {
|
||||
private final TextSplitter characterTextSplitter;
|
||||
@Override
|
||||
public String getContent(InputStream inputStream) {
|
||||
PDDocument document = null;
|
||||
try {
|
||||
document = PDDocument.load(inputStream);
|
||||
PDFTextStripper textStripper = new PDFTextStripper();
|
||||
String content = textStripper.getText(document);
|
||||
return content;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getChunkList(String content, String kid) {
|
||||
return characterTextSplitter.split(content, kid);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,14 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Resource loading: turns an uploaded resource into text and then into chunks.
 */
public interface ResourceLoader {

    /**
     * Extracts the textual content of a resource.
     *
     * @param inputStream stream holding the raw resource
     * @return the extracted text
     */
    String getContent(InputStream inputStream);

    /**
     * Splits extracted content into chunks.
     *
     * @param content text previously obtained from the resource
     * @param kid     knowledge base id
     * @return the list of chunks
     */
    List<String> getChunkList(String content, String kid);
}
|
||||
@@ -0,0 +1,35 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import org.ruoyi.chain.loader.ResourceLoader;
|
||||
import org.ruoyi.chain.loader.TextFileLoader;
|
||||
import org.ruoyi.chain.split.CharacterTextSplitter;
|
||||
import org.ruoyi.chain.split.CodeTextSplitter;
|
||||
import org.ruoyi.chain.split.MarkdownTextSplitter;
|
||||
import org.ruoyi.chain.split.TokenTextSplitter;
|
||||
import org.ruoyi.knowledge.constant.FileType;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Component
|
||||
public class ResourceLoaderFactory {
|
||||
private final CharacterTextSplitter characterTextSplitter;
|
||||
private final CodeTextSplitter codeTextSplitter;
|
||||
private final MarkdownTextSplitter markdownTextSplitter;
|
||||
private final TokenTextSplitter tokenTextSplitter;
|
||||
public ResourceLoader getLoaderByFileType(String fileType){
|
||||
if (FileType.isTextFile(fileType)){
|
||||
return new TextFileLoader(characterTextSplitter);
|
||||
} else if (FileType.isWord(fileType)) {
|
||||
return new WordLoader(characterTextSplitter);
|
||||
} else if (FileType.isPdf(fileType)) {
|
||||
return new PdfFileLoader(characterTextSplitter);
|
||||
} else if (FileType.isMdFile(fileType)) {
|
||||
return new MarkDownFileLoader(markdownTextSplitter);
|
||||
}else if (FileType.isCodeFile(fileType)) {
|
||||
return new CodeFileLoader(codeTextSplitter);
|
||||
}else {
|
||||
return new TextFileLoader(characterTextSplitter);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.ruoyi.chain.split.TextSplitter;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@AllArgsConstructor
|
||||
@Slf4j
|
||||
public class TextFileLoader implements ResourceLoader{
|
||||
private final TextSplitter textSplitter;
|
||||
@Override
|
||||
public String getContent(InputStream inputStream) {
|
||||
StringBuffer stringBuffer = new StringBuffer();
|
||||
try (InputStreamReader reader = new InputStreamReader(inputStream, "UTF-8");
|
||||
BufferedReader bufferedReader = new BufferedReader(reader)){
|
||||
String line;
|
||||
while ((line = bufferedReader.readLine()) != null) {
|
||||
stringBuffer.append(line).append("\n");
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return stringBuffer.toString();
|
||||
}
|
||||
@Override
|
||||
public List<String> getChunkList(String content, String kid){
|
||||
return textSplitter.split(content, kid);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
package org.ruoyi.chain.loader;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.ruoyi.chain.loader.ResourceLoader;
|
||||
import org.ruoyi.chain.split.TextSplitter;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@AllArgsConstructor
|
||||
@Slf4j
|
||||
public class WordLoader implements ResourceLoader {
|
||||
private final TextSplitter textSplitter;
|
||||
@Override
|
||||
public String getContent(InputStream inputStream) {
|
||||
XWPFDocument document = null;
|
||||
try {
|
||||
document = new XWPFDocument(inputStream);
|
||||
XWPFWordExtractor extractor = new XWPFWordExtractor(document);
|
||||
String content = extractor.getText();
|
||||
return content;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getChunkList(String content, String kid) {
|
||||
return textSplitter.split(content, kid);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
package org.ruoyi.chain.split;
|
||||
|
||||
import jakarta.annotation.Resource;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.ruoyi.common.core.utils.StringUtils;
|
||||
import org.ruoyi.chain.split.TextSplitter;
|
||||
import org.ruoyi.domain.vo.KnowledgeInfoVo;
|
||||
import org.ruoyi.service.IKnowledgeInfoService;
|
||||
import org.springframework.context.annotation.Lazy;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@Slf4j
|
||||
@Primary
|
||||
public class CharacterTextSplitter implements TextSplitter {
|
||||
|
||||
@Lazy
|
||||
@Resource
|
||||
private IKnowledgeInfoService knowledgeInfoService;
|
||||
|
||||
@Override
|
||||
public List<String> split(String content, String kid) {
|
||||
// 从知识库表中获取配置
|
||||
KnowledgeInfoVo knowledgeInfoVo = knowledgeInfoService.queryById(Long.valueOf(kid));
|
||||
String knowledgeSeparator = knowledgeInfoVo.getKnowledgeSeparator();
|
||||
int textBlockSize = knowledgeInfoVo.getTextBlockSize();
|
||||
int overlapChar = knowledgeInfoVo.getOverlapChar();
|
||||
List<String> chunkList = new ArrayList<>();
|
||||
if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) {
|
||||
// 按自定义分隔符切分
|
||||
String[] chunks = content.split(knowledgeSeparator);
|
||||
chunkList.addAll(Arrays.asList(chunks));
|
||||
} else {
|
||||
int indexMin = 0;
|
||||
int len = content.length();
|
||||
int i = 0;
|
||||
int right = 0;
|
||||
while (true) {
|
||||
if (len > right) {
|
||||
int begin = i * textBlockSize - overlapChar;
|
||||
if (begin < indexMin) {
|
||||
begin = indexMin;
|
||||
}
|
||||
int end = textBlockSize * (i + 1) + overlapChar;
|
||||
if (end > len) {
|
||||
end = len;
|
||||
}
|
||||
String chunk = content.substring(begin, end);
|
||||
chunkList.add(chunk);
|
||||
i++;
|
||||
right = right + textBlockSize;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return chunkList;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package org.ruoyi.chain.split;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.ruoyi.chain.split.TextSplitter;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@AllArgsConstructor
|
||||
@Slf4j
|
||||
public class CodeTextSplitter implements TextSplitter {
|
||||
@Override
|
||||
public List<String> split(String content, String kid) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package org.ruoyi.chain.split;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@AllArgsConstructor
|
||||
@Slf4j
|
||||
public class MarkdownTextSplitter implements TextSplitter{
|
||||
@Override
|
||||
public List<String> split(String content, String kid) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package org.ruoyi.chain.split;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Text splitting.
 */
public interface TextSplitter {

    /**
     * Splits text into chunks.
     *
     * @param content the text content
     * @param kid     the knowledge base id
     * @return the list of text chunks after splitting
     */
    List<String> split(String content, String kid);
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package org.ruoyi.chain.split;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.ruoyi.chain.split.TextSplitter;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@AllArgsConstructor
|
||||
@Slf4j
|
||||
public class TokenTextSplitter implements TextSplitter {
|
||||
@Override
|
||||
public List<String> split(String content, String kid) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user