mirror of
https://gitcode.com/ageerle/ruoyi-ai.git
synced 2026-04-01 22:13:43 +08:00
本地向量化
This commit is contained in:
@@ -0,0 +1,92 @@
|
||||
package org.ruoyi.knowledge.chain.vectorizer;
|
||||
|
||||
import jakarta.annotation.Resource;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.ruoyi.common.chat.config.ChatConfig;
|
||||
import org.ruoyi.common.chat.localModels.LocalModelsofitClient;
|
||||
import org.ruoyi.common.chat.openai.OpenAiStreamClient;
|
||||
import org.ruoyi.knowledge.domain.vo.KnowledgeInfoVo;
|
||||
import org.ruoyi.knowledge.service.IKnowledgeInfoService;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Component
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
public class LocalModelsVectorization {
|
||||
@Resource
|
||||
private IKnowledgeInfoService knowledgeInfoService;
|
||||
|
||||
@Resource
|
||||
private LocalModelsofitClient localModelsofitClient;
|
||||
|
||||
@Getter
|
||||
private OpenAiStreamClient openAiStreamClient;
|
||||
|
||||
private final ChatConfig chatConfig;
|
||||
|
||||
/**
|
||||
* 批量向量化
|
||||
*
|
||||
* @param chunkList 文本块列表
|
||||
* @param kid 知识 ID
|
||||
* @return 向量化结果
|
||||
*/
|
||||
|
||||
public List<List<Double>> batchVectorization(List<String> chunkList, String kid) {
|
||||
logVectorizationRequest(kid, chunkList); // 在向量化开始前记录日志
|
||||
openAiStreamClient = chatConfig.getOpenAiStreamClient(); // 获取 OpenAi 客户端
|
||||
KnowledgeInfoVo knowledgeInfoVo = knowledgeInfoService.queryById(Long.valueOf(kid)); // 查询知识信息
|
||||
// 调用 localModelsofitClient 获取 Top K 嵌入向量
|
||||
try {
|
||||
return localModelsofitClient.getTopKEmbeddings(
|
||||
chunkList,
|
||||
knowledgeInfoVo.getVector(),
|
||||
knowledgeInfoVo.getKnowledgeSeparator(),
|
||||
knowledgeInfoVo.getRetrieveLimit(),
|
||||
knowledgeInfoVo.getTextBlockSize(),
|
||||
knowledgeInfoVo.getOverlapChar()
|
||||
);
|
||||
} catch (Exception e) {
|
||||
log.error("Failed to perform batch vectorization for knowledgeId: {}", kid, e);
|
||||
throw new RuntimeException("Batch vectorization failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 单一文本块向量化
|
||||
*
|
||||
* @param chunk 单一文本块
|
||||
* @param kid 知识 ID
|
||||
* @return 向量化结果
|
||||
*/
|
||||
|
||||
public List<Double> singleVectorization(String chunk, String kid) {
|
||||
List<String> chunkList = new ArrayList<>();
|
||||
chunkList.add(chunk);
|
||||
|
||||
// 调用批量向量化方法
|
||||
List<List<Double>> vectorList = batchVectorization(chunkList, kid);
|
||||
|
||||
if (vectorList.isEmpty()) {
|
||||
log.warn("Vectorization returned empty list for chunk: {}", chunk);
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return vectorList.get(0); // 返回第一个向量
|
||||
}
|
||||
|
||||
/**
|
||||
* 提供更简洁的日志记录方法
|
||||
*
|
||||
* @param kid 知识 ID
|
||||
* @param chunkList 文本块列表
|
||||
*/
|
||||
private void logVectorizationRequest(String kid, List<String> chunkList) {
|
||||
log.info("Starting vectorization for Knowledge ID: {} with {} chunks.", kid, chunkList.size());
|
||||
}
|
||||
}
|
||||
@@ -18,6 +18,7 @@ import org.springframework.stereotype.Component;
|
||||
import java.math.BigDecimal;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Component
|
||||
@Slf4j
|
||||
@@ -27,6 +28,9 @@ public class OpenAiVectorization implements Vectorization {
|
||||
@Lazy
|
||||
@Resource
|
||||
private IKnowledgeInfoService knowledgeInfoService;
|
||||
@Lazy
|
||||
@Resource
|
||||
private LocalModelsVectorization localModelsVectorization;
|
||||
|
||||
@Getter
|
||||
private OpenAiStreamClient openAiStreamClient;
|
||||
@@ -35,25 +39,63 @@ public class OpenAiVectorization implements Vectorization {
|
||||
|
||||
@Override
|
||||
public List<List<Double>> batchVectorization(List<String> chunkList, String kid) {
|
||||
openAiStreamClient = chatConfig.getOpenAiStreamClient();
|
||||
KnowledgeInfoVo knowledgeInfoVo = knowledgeInfoService.queryById(Long.valueOf(kid));
|
||||
Embedding embedding = Embedding.builder()
|
||||
.input(chunkList)
|
||||
.model(knowledgeInfoVo.getVectorModel())
|
||||
.build();
|
||||
EmbeddingResponse embeddings = openAiStreamClient.embeddings(embedding);
|
||||
List<List<Double>> vectorList = new ArrayList<>();
|
||||
embeddings.getData().forEach(data -> {
|
||||
List<BigDecimal> vector = data.getEmbedding();
|
||||
List<Double> doubleVector = new ArrayList<>();
|
||||
for (BigDecimal bd : vector) {
|
||||
doubleVector.add(bd.doubleValue());
|
||||
}
|
||||
vectorList.add(doubleVector);
|
||||
});
|
||||
|
||||
// 获取知识库信息
|
||||
KnowledgeInfoVo knowledgeInfoVo = knowledgeInfoService.queryById(Long.valueOf(kid));
|
||||
|
||||
// 如果使用本地模型
|
||||
try {
|
||||
return localModelsVectorization.batchVectorization(chunkList, kid);
|
||||
} catch (Exception e) {
|
||||
log.error("Local models vectorization failed, falling back to OpenAI embeddings", e);
|
||||
}
|
||||
|
||||
// 如果本地模型失败,则调用 OpenAI 服务进行向量化
|
||||
Embedding embedding = buildEmbedding(chunkList, knowledgeInfoVo);
|
||||
EmbeddingResponse embeddings = openAiStreamClient.embeddings(embedding);
|
||||
|
||||
// 处理 OpenAI 返回的嵌入数据
|
||||
vectorList = processOpenAiEmbeddings(embeddings);
|
||||
|
||||
return vectorList;
|
||||
}
|
||||
|
||||
/**
|
||||
* 构建 Embedding 对象
|
||||
*/
|
||||
private Embedding buildEmbedding(List<String> chunkList, KnowledgeInfoVo knowledgeInfoVo) {
|
||||
return Embedding.builder()
|
||||
.input(chunkList)
|
||||
.model(knowledgeInfoVo.getVectorModel())
|
||||
.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* 处理 OpenAI 返回的嵌入数据
|
||||
*/
|
||||
private List<List<Double>> processOpenAiEmbeddings(EmbeddingResponse embeddings) {
|
||||
List<List<Double>> vectorList = new ArrayList<>();
|
||||
|
||||
embeddings.getData().forEach(data -> {
|
||||
List<BigDecimal> vector = data.getEmbedding();
|
||||
List<Double> doubleVector = convertToDoubleList(vector);
|
||||
vectorList.add(doubleVector);
|
||||
});
|
||||
|
||||
return vectorList;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将 BigDecimal 转换为 Double 列表
|
||||
*/
|
||||
private List<Double> convertToDoubleList(List<BigDecimal> vector) {
|
||||
return vector.stream()
|
||||
.map(BigDecimal::doubleValue)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Double> singleVectorization(String chunk, String kid) {
|
||||
List<String> chunkList = new ArrayList<>();
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
package org.ruoyi.knowledge.chain.vectorizer;
|
||||
|
||||
public enum VectorizationType {
    /** Vectorization through OpenAI. */
    OPENAI,
    /** Vectorization through a local model. */
    LOCAL;

    /**
     * Resolves a constant from its name, ignoring case.
     *
     * @param type the name to look up; {@code null} matches nothing
     * @return the constant whose name equals {@code type} ignoring case
     * @throws IllegalArgumentException if no constant matches
     */
    public static VectorizationType fromString(String type) {
        VectorizationType match = null;
        for (VectorizationType candidate : values()) {
            if (candidate.name().equalsIgnoreCase(type)) {
                match = candidate;
                break;
            }
        }
        if (match == null) {
            throw new IllegalArgumentException("Unknown VectorizationType: " + type);
        }
        return match;
    }
}
|
||||
Reference in New Issue
Block a user