diff --git a/ruoyi-admin/src/main/resources/application-dev.yml b/ruoyi-admin/src/main/resources/application-dev.yml index 8abf1a68..e51f0019 100644 --- a/ruoyi-admin/src/main/resources/application-dev.yml +++ b/ruoyi-admin/src/main/resources/application-dev.yml @@ -85,4 +85,15 @@ sms: # 腾讯专用 sdkAppId: +pdf: + extract: + service: + url: http://localhost:8080 + ai-api: + url: https://api.pandarobot.chat/v1/chat/completions + key: sk-xxxx +#百炼模型配置 +dashscope: + key: sk-xxxx + model: qvq-max diff --git a/ruoyi-admin/src/main/resources/application-local.yml b/ruoyi-admin/src/main/resources/application-local.yml new file mode 100644 index 00000000..95bb3a93 --- /dev/null +++ b/ruoyi-admin/src/main/resources/application-local.yml @@ -0,0 +1,111 @@ +--- # 监控中心配置 +spring.boot.admin.client: + # 增加客户端开关 + enabled: false + url: http://localhost:9090/admin + instance: + service-host-type: IP + username: ruoyi + password: 123456 + +--- # 数据源配置 +spring: + datasource: + type: com.zaxxer.hikari.HikariDataSource + # 动态数据源文档 https://www.kancloud.cn/tracy5546/dynamic-datasource/content + dynamic: + # 性能分析插件(有性能损耗 不建议生产环境使用) + p6spy: true + # 设置默认的数据源或者数据源组,默认值即为 master + primary: master + # 严格模式 匹配不到数据源则报错 + strict: true + datasource: + # 主库数据源 + master: + type: ${spring.datasource.type} + driverClassName: com.mysql.cj.jdbc.Driver + url: jdbc:mysql://localhost:3306/ruoyi-ai?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true + username: root + password: root + + + hikari: + # 最大连接池数量 + maxPoolSize: 20 + # 最小空闲线程数量 + minIdle: 10 + # 配置获取连接等待超时的时间 + connectionTimeout: 30000 + # 校验超时时间 + validationTimeout: 5000 + # 空闲连接存活最大时间,默认10分钟 + idleTimeout: 600000 + # 此属性控制池中连接的最长生命周期,值0表示无限生命周期,默认30分钟 + maxLifetime: 1800000 + # 连接测试query(配置检测连接是否有效) + connectionTestQuery: SELECT 1 + # 多久检查一次连接的活性 + keepaliveTime: 30000 + +--- # redis 单机配置(单机与集群只能开启一个另一个需要注释掉) +spring.data: + redis: + # 地址 + 
host: 127.0.0.1 + # 端口,默认为6379 + port: 6379 + # 数据库索引 + database: 0 + # 密码(如没有密码请注释掉) + password: root + # 连接超时时间 + timeout: 10S + # 是否开启ssl + ssl: false + +redisson: + # redis key前缀 + keyPrefix: + # 线程池数量 + threads: 4 + # Netty线程池数量 + nettyThreads: 8 + # 单节点配置 + singleServerConfig: + # 客户端名称 + clientName: ${ruoyi.name} + # 最小空闲连接数 + connectionMinimumIdleSize: 8 + # 连接池大小 + connectionPoolSize: 32 + # 连接空闲超时,单位:毫秒 + idleConnectionTimeout: 10000 + # 命令等待超时,单位:毫秒 + timeout: 3000 + # 发布和订阅连接池大小 + subscriptionConnectionPoolSize: 50 + +--- # sms 短信 +sms: + enabled: false + # 阿里云 dysmsapi.aliyuncs.com + # 腾讯云 sms.tencentcloudapi.com + endpoint: "dysmsapi.aliyuncs.com" + accessKeyId: xxxxxxx + accessKeySecret: xxxxxx + signName: 测试 + # 腾讯专用 + sdkAppId: + +pdf: + extract: + service: + url: http://localhost:8080 + ai-api: + url: https://api.pandarobot.chat/v1/chat/completions + key: sk-xxxx +#百炼模型配置 +dashscope: + key: sk-xxxx + model: qvq-max \ No newline at end of file diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/pom.xml b/ruoyi-modules-api/ruoyi-knowledge-api/pom.xml index f7e91879..2a3e634c 100644 --- a/ruoyi-modules-api/ruoyi-knowledge-api/pom.xml +++ b/ruoyi-modules-api/ruoyi-knowledge-api/pom.xml @@ -89,6 +89,12 @@ langchain4j-document-parser-apache-tika + + com.alibaba + dashscope-sdk-java + 2.19.0 + + diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/DashscopeService.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/DashscopeService.java new file mode 100644 index 00000000..3c8f498e --- /dev/null +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/DashscopeService.java @@ -0,0 +1,23 @@ +package org.ruoyi.service; + +import java.io.IOException; + +/** + * @Description: 阿里百炼api + * @Date: 2025/6/4 下午2:24 + */ +public interface DashscopeService { + + /** + * 视觉推理(QVQ) + * @param imageUrl 图片可访问的地址 + * @return + */ + String qvq(String imageUrl) throws IOException; + /** + * 视觉推理(QVQ) 
使用本地文件(输入Base64编码或本地路径) + * @param localPath 图片文件的绝对路径 + * @return + */ + String qvq4LocalPath(String localPath) throws IOException; +} diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/PdfImageExtractService.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/PdfImageExtractService.java new file mode 100644 index 00000000..c9929786 --- /dev/null +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/PdfImageExtractService.java @@ -0,0 +1,34 @@ + + /** + * 处理文件内容 + * + * @param unzip Base64编码的图片数组 + * @return 文件内容结果列表 + * @throws IOException 如果API调用过程中发生错误 + */ + List dealFileContent(String[] unzip) throws IOException; + /** + *利用百炼接口处理文件内容 + * + * @param imageUrl 传入图片地址 + * @return 文件内容结果列表 + * @throws IOException 如果API调用过程中发生错误 + */ + List dealFileContent4Dashscope(String imageUrl) throws IOException; + + /** + * 利用百炼接口处理文件内容 + * + * 视觉推理(QVQ) 使用本地文件(输入Base64编码或本地路径) + * @param localPath 图片文件的绝对路径 + * @return + */ + List dealFileContent4DashscopeBase64(String localPath)throws IOException; + /** + * 提取PDF中的图片并调用gpt-4o-mini,识别图片内容并返回 + * @param file + * @return + * @throws IOException + */ + List extractImages(MultipartFile file) throws IOException; +} diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/DashscopeServiceImpl.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/DashscopeServiceImpl.java new file mode 100644 index 00000000..0f6b0014 --- /dev/null +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/DashscopeServiceImpl.java @@ -0,0 +1,150 @@ +package org.ruoyi.service.impl; + +import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation; +import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam; +import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult; +import com.alibaba.dashscope.common.MultiModalMessage; 
+import com.alibaba.dashscope.common.Role; +import io.reactivex.Flowable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import lombok.extern.slf4j.Slf4j; +import org.ruoyi.domain.PdfFileContentResult; +import org.ruoyi.service.DashscopeService; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; + +/** + * @Description: 阿里百炼API + * @Date: 2025/6/4 下午2:28 + */ +@Service +@Slf4j +public class DashscopeServiceImpl implements DashscopeService { + + private boolean isFirstPrint; + + @Value("${dashscope.model}") + private String serviceModel; + @Value("${dashscope.key}") + private String serviceKey; + + /** + * 视觉推理(QVQ) + * @param imageUrl 图片可访问地址 + * @return + */ + @Override + public String qvq(String imageUrl) throws IOException { + try { + // 构建多模态消息 + MultiModalMessage userMessage = MultiModalMessage.builder() + .role(Role.USER.getValue()) + .content(Arrays.asList( + Collections.singletonMap("text", "这张图片有什么"), + Collections.singletonMap("image", imageUrl) + )) + .build(); + + // 构建请求参数 + MultiModalConversationParam param = MultiModalConversationParam.builder() + .apiKey(serviceKey) // 使用配置文件中的API Key + .model(serviceModel) + .message(userMessage) + .build(); + + MultiModalConversation conv = new MultiModalConversation(); + + // 调用API + Flowable result = conv.streamCall( + param); + + StringBuilder reasoningContent = new StringBuilder(); + StringBuilder finalContent = new StringBuilder(); + isFirstPrint = true; + + result.blockingForEach(message -> handleGenerationResult(message, reasoningContent, finalContent)); + + return finalContent.toString().replaceAll("[\n\r\s]", ""); + } catch (Exception e) { + log.error("调用百炼API失败: {}", e.getMessage(), e); + throw new IOException("百炼API调用失败: " + e.getMessage(), e); + } + } + /** + * 视觉推理(QVQ) 
使用本地文件(输入Base64编码或本地路径) + * @param localPath 图片文件的绝对路径 + * @return + */ + @Override + public String qvq4LocalPath(String localPath) throws IOException { + try { + // 构建多模态消息 + String filePath = "file://"+ localPath; + log.info("filePath: {}", filePath); + MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue()) + .content(Arrays.asList(new HashMap(){{put("image", filePath);}}, + new HashMap(){{put("text", "这张图片有什么");}})).build(); + + // 构建请求参数 + MultiModalConversationParam param = MultiModalConversationParam.builder() + .apiKey(serviceKey) // 使用配置文件中的API Key + .model(serviceModel) + .message(userMessage) + .build(); + MultiModalConversation conv = new MultiModalConversation(); + + // 调用API + Flowable result = conv.streamCall( + param); + + StringBuilder reasoningContent = new StringBuilder(); + StringBuilder finalContent = new StringBuilder(); + isFirstPrint = true; + + result.blockingForEach(message -> handleGenerationResult(message, reasoningContent, finalContent)); + + return finalContent.toString().replaceAll("[\n\r\s]", ""); + } catch (Exception e) { + log.error("调用百炼API失败: {}", e.getMessage(), e); + throw new IOException("百炼API调用失败: " + e.getMessage(), e); + } + } + + + private void handleGenerationResult(MultiModalConversationResult message, StringBuilder reasoningContent, StringBuilder finalContent) { + String re = message.getOutput().getChoices().get(0).getMessage().getReasoningContent(); + String reasoning = Objects.isNull(re) ? 
"" : re; + + List> content = message.getOutput().getChoices().get(0).getMessage() + .getContent(); + if (!reasoning.isEmpty()) { + reasoningContent.append(reasoning); + } + + if (Objects.nonNull(content) && !content.isEmpty()) { + Object text = content.get(0).get("text"); + finalContent.append(text); + } + + // 检查是否是最后一个响应 + if (message.getOutput().getChoices().get(0).getFinishReason() != null) { + // 输出思考过程 + if (reasoningContent.length() > 0) { + System.out.println("====================思考过程===================="); + System.out.println(reasoningContent.toString()); + } + // 输出完整回复 + if (finalContent.length() > 0) { + System.out.println("====================完整回复===================="); + System.out.println(finalContent.toString()); + } + } + } +} diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/PdfImageExtractServiceImpl.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/PdfImageExtractServiceImpl.java new file mode 100644 index 00000000..32708439 --- /dev/null +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/PdfImageExtractServiceImpl.java @@ -0,0 +1,147 @@ +package org.ruoyi.service.impl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; +import lombok.Data; +import lombok.extern.slf4j.Slf4j; +import okhttp3.MediaType; +import okhttp3.MultipartBody; +import okhttp3.OkHttpClient; +import okhttp3.OkHttpClient.Builder; +import okhttp3.Request; +import okhttp3.RequestBody; +import okhttp3.Response; +import org.ruoyi.domain.PdfFileContentResult; +import org.ruoyi.service.DashscopeService; +import org.ruoyi.service.PdfImageExtractService; +import org.ruoyi.utils.ZipUtils; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; + 
+/** + * PDF图片提取服务实现类 + */ +@Service +@Slf4j +@Data +public class PdfImageExtractServiceImpl implements PdfImageExtractService { + + @Value("${pdf.extract.service.url}") + private String serviceUrl; + @Value("${pdf.extract.ai-api.url}") + private String aiApiUrl; + @Value("${pdf.extract.ai-api.key}") + private String aiApiKey; + + private final OkHttpClient client = new Builder() + .connectTimeout(100, TimeUnit.SECONDS) + .readTimeout(150, TimeUnit.SECONDS) + .writeTimeout(150, TimeUnit.SECONDS) + .callTimeout(300, TimeUnit.SECONDS) + .build(); + + private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8"); + +// @Override + public byte[] extractImages(MultipartFile pdfFile, String imageFormat, boolean allowDuplicates) + throws IOException { + // 构建multipart请求 + RequestBody requestBody = new MultipartBody.Builder() + .setType(MultipartBody.FORM) + .addFormDataPart("fileInput", pdfFile.getOriginalFilename(), + RequestBody.create(MediaType.parse("application/pdf"), pdfFile.getBytes())) + .addFormDataPart("format", imageFormat) + .addFormDataPart("allowDuplicates", String.valueOf(allowDuplicates)) + .build(); + + // 创建请求 + Request request = new Request.Builder() + .url(serviceUrl + "/api/v1/misc/extract-images") + .post(requestBody) + .build(); + + // 执行请求 + try (Response response = client.newCall(request).execute()) { + if (!response.isSuccessful()) { + throw new IOException("请求失败: " + response.code()); + } + return response.body().bytes(); + } + } + + /** + * 处理文件内容 + * + * @param unzip Base64编码的图片数组 + * @return 文件内容结果列表 + * @throws IOException 如果API调用过程中发生错误 + */ +// @Override + public List dealFileContent(String[] unzip) throws IOException { + List results = new ArrayList<>(); + int i = 0; + for (String base64Image : unzip) { + // 构建请求JSON + String requestJson = String.format("{" + + "\"model\": \"gpt-4o\"," + + "\"stream\": false," + + "\"messages\": [{" + + "\"role\": \"user\"," + + "\"content\": [{" + + "\"type\": \"text\"," + + 
"\"text\": \"这张图片有什么\"" + + "}, {" + + "\"type\": \"image_url\"," + + "\"image_url\": {" + + "\"url\": \"%s\"" + + "}}" + + "]}]," + + "\"max_tokens\": 400" + + "}", base64Image); + + // 创建请求 + Request request = new Request.Builder() + .url(aiApiUrl) + .addHeader("Authorization", "Bearer " + aiApiKey) + .post(RequestBody.create(JSON, requestJson)) + .build(); + + // 执行请求 + try { + log.info("=============call=" + ++i); + + Response response = client.newCall(request).execute(); + log.info("=============response=" + response); + if (!response.isSuccessful()) { + throw new IOException("API请求失败: " + response.code() + response.toString()); + } + + String responseBody = response.body().string(); + log.info("=============responseBody=" + responseBody); + // 使用文件名(这里使用base64的前10个字符作为标识)和API返回内容创建结果对象 + String filename = base64Image.substring(0, Math.min(base64Image.length(), 10)); + results.add(new PdfFileContentResult(filename, responseBody)); + } catch (Exception e) { + log.error(e.getMessage()); + throw new RuntimeException(e); + } + } + return results; + } + +// @Override + public List extractImages(MultipartFile file) throws IOException { + String format = "png"; + boolean allowDuplicates = true; + // 获取ZIP数据 + byte[] zipData = this.extractImages(file, format, allowDuplicates); + // 解压文件并识别图片内容并返回 + String[] unzip = ZipUtils.unzipForBase64(zipData); + //解析图片内容 + return this.dealFileContent(unzip); + } +} diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/VectorStoreServiceImpl.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/VectorStoreServiceImpl.java index f3e83b11..799ce729 100644 --- a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/VectorStoreServiceImpl.java +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/service/impl/VectorStoreServiceImpl.java @@ -7,6 +7,8 @@ import dev.langchain4j.data.segment.TextSegment; import 
dev.langchain4j.model.embedding.EmbeddingModel; import dev.langchain4j.model.ollama.OllamaEmbeddingModel; import dev.langchain4j.model.openai.OpenAiEmbeddingModel; +import dev.langchain4j.store.embedding.EmbeddingMatch; +import dev.langchain4j.store.embedding.EmbeddingSearchRequest; import dev.langchain4j.store.embedding.EmbeddingStore; import dev.langchain4j.store.embedding.weaviate.WeaviateEmbeddingStore; import io.weaviate.client.Config; @@ -29,6 +31,7 @@ import org.ruoyi.domain.bo.StoreEmbeddingBo; import org.ruoyi.service.VectorStoreService; import org.springframework.stereotype.Service; import java.util.*; +import java.util.stream.Collectors; /** * 向量库管理 diff --git a/ruoyi-modules-api/ruoyi-system-api/src/main/java/org/ruoyi/system/service/ISysOssService.java b/ruoyi-modules-api/ruoyi-system-api/src/main/java/org/ruoyi/system/service/ISysOssService.java index a9ff4952..f5640872 100644 --- a/ruoyi-modules-api/ruoyi-system-api/src/main/java/org/ruoyi/system/service/ISysOssService.java +++ b/ruoyi-modules-api/ruoyi-system-api/src/main/java/org/ruoyi/system/service/ISysOssService.java @@ -32,6 +32,15 @@ public interface ISysOssService { String downloadByByte(Long ossId) throws IOException; + String downloadToTempPath(Long ossId) throws IOException; + Boolean deleteWithValidByIds(Collection ids, Boolean isValid); + /** + * 根据文件路径删除文件 + * + * @param filePath 文件路径 + * @return 是否删除成功 + */ + boolean deleteFile(String filePath); } diff --git a/ruoyi-modules-api/ruoyi-system-api/src/main/java/org/ruoyi/system/service/impl/SysOssServiceImpl.java b/ruoyi-modules-api/ruoyi-system-api/src/main/java/org/ruoyi/system/service/impl/SysOssServiceImpl.java index 0a43c87d..2b9212e7 100644 --- a/ruoyi-modules-api/ruoyi-system-api/src/main/java/org/ruoyi/system/service/impl/SysOssServiceImpl.java +++ b/ruoyi-modules-api/ruoyi-system-api/src/main/java/org/ruoyi/system/service/impl/SysOssServiceImpl.java @@ -209,4 +209,48 @@ public class SysOssServiceImpl implements ISysOssService, 
OssService { } return oss; } + @Override + public String downloadToTempPath(Long ossId) throws IOException { + SysOssVo sysOss = SpringUtils.getAopProxy(this).getById(ossId); + if (ObjectUtil.isNull(sysOss)) { + throw new ServiceException("文件数据不存在!"); + } + + OssClient storage = OssFactory.instance(); + try (InputStream inputStream = storage.getObjectContent(sysOss.getUrl())) { + // 创建临时文件 + String suffix = StringUtils.isNotEmpty(sysOss.getFileSuffix()) ? sysOss.getFileSuffix() : ""; + java.io.File tempFile = java.io.File.createTempFile("download_", suffix); + // 确保临时文件在JVM退出时删除 + tempFile.deleteOnExit(); + // 将输入流内容写入临时文件 + cn.hutool.core.io.FileUtil.writeFromStream(inputStream, tempFile); + // 返回临时文件的绝对路径 + return tempFile.getAbsolutePath(); + } catch (Exception e) { + throw new ServiceException(e.getMessage()); + } + } + /** + * 根据文件路径删除文件 + * + * @param filePath 文件路径 + * @return 是否删除成功 + */ + @Override + public boolean deleteFile(String filePath) { + if (StringUtils.isEmpty(filePath)) { + return false; + } + + try { + java.io.File file = new java.io.File(filePath); + if (file.exists() && file.isFile()) { + return file.delete(); + } + return false; + } catch (Exception e) { + throw new ServiceException("删除文件失败: " + e.getMessage()); + } + } } diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/controller/knowledge/KnowledgeController.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/controller/knowledge/KnowledgeController.java index a1a72e1a..38dbd50c 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/controller/knowledge/KnowledgeController.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/controller/knowledge/KnowledgeController.java @@ -185,4 +185,17 @@ public class KnowledgeController extends BaseController { return attachService.translationByFile(file, targetLanguage); } + /** + * 提取PDF中的图片并调用gpt-4o-mini,识别图片内容并返回 + * + * @param file PDF文件 + * @return 文件名称和图片内容 + */ + @PostMapping("/extract-images") + 
@Operation(summary = "提取PDF中的图片并调用大模型,识别图片内容并返回", description = "提取PDF中的图片并调用gpt-4o-mini,识别图片内容并返回") + public R> extractImages( + ) throws IOException { + return R.ok(pdfImageExtractService + .dealFileContent4Dashscope("https://hnzuoran02-1327573163.cos.ap-nanjing.myqcloud.com/crmebimage/public/content/2025/06/04/e115264eb22f423ea0b211709361c29f071avy39ez.jpg")); + } } diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/DealFileService.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/DealFileService.java new file mode 100644 index 00000000..730cfc5d --- /dev/null +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/DealFileService.java @@ -0,0 +1,377 @@ +package org.ruoyi.chat.service.knowledge; + +import cn.hutool.core.util.ObjectUtil; +import cn.hutool.core.util.RandomUtil; +import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; +import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; +import com.baomidou.mybatisplus.core.toolkit.Wrappers; +import java.util.Collection; +import java.util.Date; +import java.util.List; +import java.util.stream.Collectors; +import lombok.RequiredArgsConstructor; +import org.ruoyi.chain.loader.ResourceLoaderFactory; +import org.ruoyi.constant.DealStatus; +import org.ruoyi.domain.KnowledgeAttach; +import org.ruoyi.domain.KnowledgeAttachPic; +import org.ruoyi.domain.KnowledgeFragment; +import org.ruoyi.domain.KnowledgeInfo; +import org.ruoyi.domain.PdfFileContentResult; +import org.ruoyi.domain.bo.StoreEmbeddingBo; +import org.ruoyi.domain.vo.ChatModelVo; +import org.ruoyi.domain.vo.KnowledgeAttachVo; +import org.ruoyi.domain.vo.KnowledgeInfoVo; +import org.ruoyi.mapper.KnowledgeAttachMapper; +import org.ruoyi.mapper.KnowledgeAttachPicMapper; +import org.ruoyi.mapper.KnowledgeFragmentMapper; +import org.ruoyi.mapper.KnowledgeInfoMapper; +import org.ruoyi.service.IChatModelService; +import 
org.ruoyi.service.PdfImageExtractService; +import org.ruoyi.service.VectorStoreService; +import org.ruoyi.service.impl.PdfImageExtractServiceImpl; +import org.ruoyi.system.domain.vo.SysOssVo; +import org.ruoyi.system.service.ISysOssService; +import org.ruoyi.utils.ZipUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +/** + * @Description: + * @Date: 2025/5/15 下午4:29 + */ +@Service +@RequiredArgsConstructor +public class DealFileService { + private static final Logger log = LoggerFactory.getLogger(DealFileService.class); + + private final KnowledgeInfoMapper baseMapper; + + private final VectorStoreService vectorStoreService; + + private final ResourceLoaderFactory resourceLoaderFactory; + + private final KnowledgeFragmentMapper fragmentMapper; + + private final KnowledgeAttachMapper attachMapper; + + private final IChatModelService chatModelService; + + private final ISysOssService ossService; + + private final PdfImageExtractService pdfImageExtractService; + + private final KnowledgeAttachPicMapper picMapper; + + @Async + public void dealVectorStatus(KnowledgeAttach attachItem) throws Exception { + try { + //锁定数据 更改VectorStatus 到进行中 + if (attachMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_20) + .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getId, attachItem.getId()) + ) == 0) { + return; + } + List knowledgeFragments = fragmentMapper.selectList( + new LambdaQueryWrapper() + .eq(KnowledgeFragment::getKid, 
attachItem.getKid()) + .eq(KnowledgeFragment::getDocId, attachItem.getDocId()) + ); + if (ObjectUtil.isEmpty(knowledgeFragments)) { + throw new Exception("文件段落为空"); + } + List fids = knowledgeFragments.stream() + .map(KnowledgeFragment::getFid) + .collect(Collectors.toList()); + if (ObjectUtil.isEmpty(fids)) { + throw new Exception("fids 为空"); + } + List chunkList = knowledgeFragments.stream() + .map(KnowledgeFragment::getContent) + .collect(Collectors.toList()); + + if (ObjectUtil.isEmpty(chunkList)) { + throw new Exception("chunkList 为空"); + } + // 通过kid查询知识库信息 + KnowledgeInfoVo knowledgeInfoVo = baseMapper.selectVoOne(Wrappers.lambdaQuery() + .eq(KnowledgeInfo::getId, attachItem.getKid())); + // 通过向量模型查询模型信息 + ChatModelVo chatModelVo = chatModelService.selectModelByName( + knowledgeInfoVo.getEmbeddingModelName()); + + StoreEmbeddingBo storeEmbeddingBo = new StoreEmbeddingBo(); + storeEmbeddingBo.setKid(attachItem.getKid()); + storeEmbeddingBo.setDocId(attachItem.getDocId()); + storeEmbeddingBo.setFids(fids); + storeEmbeddingBo.setChunkList(chunkList); + storeEmbeddingBo.setVectorModelName(knowledgeInfoVo.getVectorModelName()); + storeEmbeddingBo.setEmbeddingModelName(knowledgeInfoVo.getEmbeddingModelName()); + storeEmbeddingBo.setApiKey(chatModelVo.getApiKey()); + storeEmbeddingBo.setBaseUrl(chatModelVo.getApiHost()); + vectorStoreService.storeEmbeddings(storeEmbeddingBo); + + //设置处理完成 + attachMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_20) + .eq(KnowledgeAttach::getId, attachItem.getId())); + } catch (Exception e) { + e.printStackTrace(); + //设置处理失败 + attachMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_40) + .set(KnowledgeAttach::getRemark, attachItem.getRemark() + 
e.getMessage()) + .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_20) + .eq(KnowledgeAttach::getId, attachItem.getId())); + throw new RuntimeException(e); + } + } + + @Async + public void dealPicStatus(KnowledgeAttach attachItem) throws Exception { + try { + //锁定数据 更改picStatus 到进行中 + if (attachMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttach::getPicStatus, DealStatus.STATUS_20) + .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getId, attachItem.getId()) + ) == 0) { + return; + } + //获取附件 + if (ObjectUtil.isEmpty(attachItem.getOssId())) { + log.error("==========OssId 为空,attachItem={}", attachItem); + throw new Exception("OssId 为空"); + } + //获取oss文件 + MultipartFile multipartFile = ossService.downloadByFile(attachItem.getOssId()); + //拆解出图片ZIP + byte[] pngs = pdfImageExtractService.extractImages(multipartFile, "png", true); + //解压zip,得到图片文件 + MultipartFile[] multipartFiles = ZipUtils.unzipToMultipartFiles(pngs); + //上传文件到OSS,写入表 + for (MultipartFile file : multipartFiles) { + //先查找是否有相同图片名称,先做删除 + List knowledgeAttachPics = picMapper.selectList( + new LambdaQueryWrapper() + .eq(KnowledgeAttachPic::getKid, attachItem.getKid()) + .eq(KnowledgeAttachPic::getAid, attachItem.getId()) + .eq(KnowledgeAttachPic::getDocName, file.getOriginalFilename()) + ); + if (ObjectUtil.isNotEmpty(knowledgeAttachPics)) { + Collection ossIds = knowledgeAttachPics.stream() + .map(KnowledgeAttachPic::getOssId) + .collect(Collectors.toList()); + ossService.deleteWithValidByIds(ossIds, false); + List collect = knowledgeAttachPics.stream().map(KnowledgeAttachPic::getId) + .collect(Collectors.toList()); + picMapper.deleteByIds(collect); + } + + SysOssVo upload = ossService.upload(file); + 
KnowledgeAttachPic entity = new KnowledgeAttachPic(); + entity.setKid(attachItem.getKid()); + entity.setAid(String.valueOf(attachItem.getId())); + entity.setDocName(file.getOriginalFilename()); + entity.setDocType( + file.getOriginalFilename().substring(file.getOriginalFilename().lastIndexOf(".") + 1)); + entity.setOssId(upload.getOssId()); + int[] ints = extractPageNumbers(file.getOriginalFilename()); + if (ObjectUtil.isNotEmpty(ints)) { + assert ints != null; + if (ints.length == 2) { + entity.setPageNum(ints[0]); + entity.setIndexNum(ints[1]); + } + } + picMapper.insert(entity); + } + + //设置处理完成 + attachMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_20) + .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getId, attachItem.getId())); + } catch (Exception e) { + //设置处理失败 + attachMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_40) + .set(KnowledgeAttach::getRemark, attachItem.getRemark() + e.getMessage()) + .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_20) + .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getId, attachItem.getId())); + throw new RuntimeException(e); + } + + } + + + @Async + public void dealPicAnysStatus(KnowledgeAttachPic picItem) throws Exception { + String filePath = null; + try { + //锁定数据 更改 getPicAnysStatus 到进行中 + if (picMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_20) + .eq(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttachPic::getId, picItem.getId()) + ) == 0) { + return; + } + SysOssVo ossVo = ossService.getById(picItem.getOssId()); + if (ObjectUtil.isNotEmpty(ossVo)) { + filePath = 
ossService.downloadToTempPath(picItem.getOssId()); + //调用第三方 分析图片内容 + List pdfFileContentResults = pdfImageExtractService.dealFileContent4DashscopeBase64( + filePath); + if (ObjectUtil.isNotEmpty(pdfFileContentResults)) { + for (PdfFileContentResult resultItem : pdfFileContentResults) { + //图片解析内容回写到pic表 + picMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttachPic::getContent, parseContent(resultItem.getContent())) + .set(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttachPic::getId, picItem.getId())); + //将图片解析内容 写入段落表 fragment + KnowledgeAttachVo knowledgeAttachVo = attachMapper.selectVoById(picItem.getAid()); + if (ObjectUtil.isNotEmpty(knowledgeAttachVo)) { + String fid = RandomUtil.randomString(10); + KnowledgeFragment knowledgeFragment = new KnowledgeFragment(); + knowledgeFragment.setKid(knowledgeAttachVo.getKid()); + knowledgeFragment.setDocId(knowledgeAttachVo.getDocId()); + knowledgeFragment.setFid(fid); + knowledgeFragment.setIdx(0); + knowledgeFragment.setContent(parseContent(resultItem.getContent())); + knowledgeFragment.setCreateTime(new Date()); + fragmentMapper.insert(knowledgeFragment); + + //更新attach表,需要所有图片都处理完毕 + // 查询非30状态(完成状态)的记录数量 + long nonStatus30Count = picMapper.selectCount( + new LambdaQueryWrapper() + .ne(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttachPic::getAid, picItem.getAid()) + ); + if (nonStatus30Count == 0) { + // 执行表更新操作 + attachMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) + .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) + .eq(KnowledgeAttach::getId, picItem.getAid())); + } + } + } + } + } + } catch (Exception e) { + //失败 + picMapper.update(new LambdaUpdateWrapper() + .set(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_40) + 
.set(KnowledgeAttachPic::getRemark, picItem.getRemark() + e.getMessage()) + .eq(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_20) + .eq(KnowledgeAttachPic::getId, picItem.getId())); + throw new RuntimeException(e); + } finally { + //无论成功还是失败,都要删除临时文件 + if (ObjectUtil.isNotEmpty(filePath)) { + ossService.deleteFile(filePath); + } + } + } + + + /** + * 从文件名中提取page后面的两个数字 + * + * @param fileName 文件名 + * @return 包含两个数字的数组,如果未找到则返回null + */ + public static int[] extractPageNumbers(String fileName) { + // 查找"page_"的位置 + int pageIndex = fileName.indexOf("page_"); + + if (pageIndex == -1) { + return null; + } + + // 从"page_"后开始截取 + String afterPage = fileName.substring(pageIndex + 5); + + // 按下划线分割 + String[] parts = afterPage.split("_"); + + if (parts.length >= 2) { + try { + // 提取两个数字 + int firstNumber = Integer.parseInt(parts[0]); + + // 对于第二个数字,需要去掉可能的文件扩展名 + String secondPart = parts[1]; + int dotIndex = secondPart.indexOf("."); + if (dotIndex != -1) { + secondPart = secondPart.substring(0, dotIndex); + } + + int secondNumber = Integer.parseInt(secondPart); + + return new int[]{firstNumber, secondNumber}; + } catch (NumberFormatException e) { + return null; + } + } + + return null; + } + + public static String parseContent(String content) { + try { + // 首先尝试作为JSON解析 + ObjectMapper objectMapper = new ObjectMapper(); + JsonNode rootNode = objectMapper.readTree(content); + + // 如果是JSON格式,按原有逻辑处理 + JsonNode choicesNode = rootNode.get("choices"); + if (choicesNode != null && choicesNode.isArray() && choicesNode.size() > 0) { + JsonNode firstChoice = choicesNode.get(0); + JsonNode messageNode = firstChoice.get("message"); + if (messageNode != null) { + JsonNode contentNode = messageNode.get("content"); + if (contentNode != null) { + return contentNode.asText(); + } + } + return "无法找到content内容"; + } + + // 如果不是预期的JSON格式,直接返回原始内容 + return content; + + } catch (Exception e) { + // 如果解析JSON失败,说明是普通文本,直接返回 + return content; + } + } + + +} diff --git 
a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/KnowledgeInfoServiceImpl.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/KnowledgeInfoServiceImpl.java
index 12ccab1f..30e95731 100644
--- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/KnowledgeInfoServiceImpl.java
+++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/KnowledgeInfoServiceImpl.java
@@ -36,7 +36,6 @@ import java.io.IOException;
 import java.util.*;
 import java.util.stream.Collectors;
 
-
 /**
  * 知识库Service业务层处理
  *
@@ -191,7 +190,7 @@ public class KnowledgeInfoServiceImpl implements IKnowledgeInfoService {
      * 保存前的数据校验
      */
     private void validEntityBeforeSave(KnowledgeInfo entity) {
-        //TODO 做一些数据校验,如唯一约束
+        // TODO: add data validation here, e.g. unique-constraint checks
     }
 
     /**
@@ -200,7 +199,7 @@ public class KnowledgeInfoServiceImpl implements IKnowledgeInfoService {
     @Override
     public Boolean deleteWithValidByIds(Collection ids, Boolean isValid) {
         if (isValid) {
-            //TODO 做一些业务上的校验,判断是否需要校验
+            // TODO: add business-level validation here and decide whether validation is required
         }
         return baseMapper.deleteBatchIds(ids) > 0;
     }
@@ -316,4 +315,78 @@ public class KnowledgeInfoServiceImpl implements IKnowledgeInfoService {
         }
     }
 
+    /**
+     * Step 1 (scheduled): hands attachments awaiting PDF picture extraction to dealFileService.dealPicStatus.
+     */
+    @Scheduled(fixedDelay = 15000) // starts 15 seconds after the previous run finishes
+    public void dealKnowledgeAttachPic() throws Exception {
+        // Select attachments whose pic, pic-analysis and vector statuses are all still STATUS_10
+        List knowledgeAttaches = attachMapper.selectList(
+            new LambdaQueryWrapper()
+                .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_10)
+                .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10)
+                .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10));
+        log.info("===============拆解PDF文件中的图片 size = {}", knowledgeAttaches.size());
+        if (ObjectUtil.isNotEmpty(knowledgeAttaches)) {
+            for (KnowledgeAttach attachItem : knowledgeAttaches) {
+                dealFileService.dealPicStatus(attachItem);
+            }
+        }
+    }
+
+    /**
+     * Step 2 (scheduled): hands pictures awaiting content analysis to dealFileService.dealPicAnysStatus.
+     */
+    @Scheduled(fixedDelay = 15000)
+    public void dealKnowledgeAttachPicAnys() throws Exception {
+        // Fetch at most 20 picture records whose analysis status is still STATUS_10
+        List knowledgeAttachPics = picMapper.selectList(
+            new LambdaQueryWrapper()
+                .eq(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_10)
+                .last("LIMIT 20"));
+        if (ObjectUtil.isNotEmpty(knowledgeAttachPics)) {
+            for (KnowledgeAttachPic picItem : knowledgeAttachPics) {
+                dealFileService.dealPicAnysStatus(picItem);
+            }
+        }
+    }
+
+    /**
+     * Step 3 (scheduled): hands attachments awaiting upload to the vector database to dealFileService.dealVectorStatus.
+     */
+    @Scheduled(fixedDelay = 30000) // starts 30 seconds after the previous run finishes
+    public void dealKnowledgeAttachVector() throws Exception {
+        // Select attachments whose pic and pic-analysis statuses are STATUS_30 while the vector status is still STATUS_10
+        List knowledgeAttaches = attachMapper.selectList(
+            new LambdaQueryWrapper()
+                .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30)
+                .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30)
+                .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10));
+        log.info("===============上传向量数据库 size = {}", knowledgeAttaches.size());
+        if (ObjectUtil.isNotEmpty(knowledgeAttaches)) {
+            for (KnowledgeAttach attachItem : knowledgeAttaches) {
+                dealFileService.dealVectorStatus(attachItem);
+            }
+        }
+    }
+
+    /**
+     * Step 4 (scheduled, 30 minutes after the previous run): resets every STATUS_40 (failed) status back to STATUS_10.
+     */
+    @Scheduled(fixedDelay = 30 * 60 * 1000)
+    public void dealKnowledge40Status() throws Exception {
+        // PDF picture extraction failed: reset the pic status (NOTE(review): no retry cap is applied anywhere in this method)
+        attachMapper.update(new LambdaUpdateWrapper()
+            .set(KnowledgeAttach::getPicStatus, DealStatus.STATUS_10)
+            .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_40));
+        // Picture analysis failed: reset the pic-analysis status
+        picMapper.update(new LambdaUpdateWrapper()
+            .set(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_10)
+            .eq(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_40));
+        // Upload to the vector database failed: reset the vector status
+        attachMapper.update(new LambdaUpdateWrapper()
+            .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10)
+            .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_40));
+    }
+
 }
diff --git a/script/sql/update/202505141010.sql b/script/sql/update/202505141010.sql
new file mode 100644
index 00000000..775a20b1
--- /dev/null
+++ b/script/sql/update/202505141010.sql
@@ -0,0 +1,55 @@
+ALTER TABLE `knowledge_attach`
+ADD COLUMN `oss_id` 
bigint(20) NOT NULL COMMENT '对象存储主键' AFTER `remark`,
+ADD COLUMN `pic_status` tinyint(1) NOT NULL DEFAULT 10 COMMENT '拆解图片状态10未开始,20进行中,30已完成' AFTER `oss_id`,
+ADD COLUMN `pic_anys_status` tinyint(1) NOT NULL DEFAULT 10 COMMENT '分析图片状态10未开始,20进行中,30已完成' AFTER `pic_status`,
+ADD COLUMN `vector_status` tinyint(1) NOT NULL DEFAULT 10 COMMENT '写入向量数据库状态10未开始,20进行中,30已完成' AFTER `pic_anys_status`,
+DROP PRIMARY KEY,
+ADD PRIMARY KEY (`id`) USING BTREE;
+
+ALTER TABLE `knowledge_attach`
+MODIFY COLUMN `remark` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL COMMENT '备注' AFTER `update_time`;
+
+/*
+ Navicat Premium Data Transfer
+
+ Source Server : localhost-57
+ Source Server Type : MySQL
+ Source Server Version : 50731 (5.7.31)
+ Source Host : localhost:3306
+ Source Schema : ruoyi-ai
+
+ Target Server Type : MySQL
+ Target Server Version : 50731 (5.7.31)
+ File Encoding : 65001
+
+ Date: 19/05/2025 15:22:09
+*/
+
+SET NAMES utf8mb4;
+SET FOREIGN_KEY_CHECKS = 0;
+
+-- Table structure for knowledge_attach_pic.
+-- WARNING: the table is dropped and recreated below, so re-running this script discards its existing rows.
+-- NOTE(review): the status column comments list 10/20/30 only; the Java code also writes DealStatus.STATUS_40 (presumably 40, failed) - confirm.
+DROP TABLE IF EXISTS `knowledge_attach_pic`;
+CREATE TABLE `knowledge_attach_pic` (
+  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键',
+  `kid` varchar(50) NOT NULL COMMENT '知识库id',
+  `aid` varchar(50) NOT NULL COMMENT '附件id',
+  `doc_name` varchar(500) DEFAULT NULL COMMENT '文档名称',
+  `doc_type` varchar(50) NOT NULL COMMENT '文档类型',
+  `content` longtext COMMENT '文档内容',
+  `page_num` int(5) DEFAULT '0' COMMENT '所在页数',
+  `index_num` int(5) DEFAULT '0' COMMENT '所在页index',
+  `pic_anys_status` int(5) NOT NULL DEFAULT '10' COMMENT '分析图片状态10未开始,20进行中,30已完成',
+  `oss_id` bigint(20) NOT NULL COMMENT '对象存储主键',
+  `create_dept` varchar(255) DEFAULT NULL COMMENT '部门',
+  `create_by` varchar(50) DEFAULT NULL COMMENT '创建人',
+  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
+  `update_by` bigint(20) DEFAULT NULL COMMENT '更新者',
+  `update_time` datetime DEFAULT NULL COMMENT '更新时间',
+  `remark` text COMMENT '备注',
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB AUTO_INCREMENT=1922929659800637443 DEFAULT CHARSET=utf8mb4 COMMENT='知识库附件图片列表';
+
+SET FOREIGN_KEY_CHECKS = 1;
diff --git a/script/sql/update/202506041541.sql b/script/sql/update/202506041541.sql
new file mode 100644
index 00000000..6e25d24e
--- /dev/null
+++ b/script/sql/update/202506041541.sql
@@ -0,0 +1,2 @@
+ALTER TABLE `knowledge_attach`
+MODIFY COLUMN `oss_id` bigint(20) NULL DEFAULT NULL COMMENT '对象存储ID' AFTER `doc_type`;
\ No newline at end of file