From 86825eeb2e0c38b796681c7502c9ca88735a23af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=B9=8F=E7=BF=94?= <643541620@qq.com>
Date: Wed, 21 May 2025 14:25:44 +0800
Subject: [PATCH 1/3] =?UTF-8?q?=E7=BB=93=E5=90=88mineru=E6=96=B0=E5=A2=9Ep?=
 =?UTF-8?q?df=E8=BD=AC=E6=8D=A2=E7=BB=93=E6=9E=84=E5=8C=96=E6=95=B0?=
 =?UTF-8?q?=E6=8D=AE=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/main/resources/application-dev.yml    |  13 +-
 .../src/main/resources/application-prod.yml   |   9 +-
 .../chain/loader/PdfMinerUFileLoader.java     | 362 ++++++++++++++++++
 .../chain/loader/ResourceLoaderFactory.java   |   8 +-
 .../config/properties/PdfProperties.java      |  83 ++++
 5 files changed, 470 insertions(+), 5 deletions(-)
 create mode 100644 ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java
 create mode 100644 ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/config/properties/PdfProperties.java

diff --git a/ruoyi-admin/src/main/resources/application-dev.yml b/ruoyi-admin/src/main/resources/application-dev.yml
index 954deb66..7f591d51 100644
--- a/ruoyi-admin/src/main/resources/application-dev.yml
+++ b/ruoyi-admin/src/main/resources/application-dev.yml
@@ -25,9 +25,9 @@ spring:
         master:
           type: ${spring.datasource.type}
           driverClassName: com.mysql.cj.jdbc.Driver
-          url: jdbc:mysql://127.0.0.1:3306/ry-vue?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true
-          username: ry-vue
-          password: xx
+          url: jdbc:mysql://127.0.0.1:3306/ruoyi-ai?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true
+          username: root
+          password: root
 
       hikari:
         # 最大连接池数量
@@ -101,4 +101,11 @@ pdf:
     ai-api:
       url: https://api.pandarobot.chat/v1/chat/completions
       key: sk-xxxx
+  transition:
+    # 是否开启mineru
+    enable-minerU: true
+    # mineru conda环境路径
+    conda-env-path: "F:\\ProgramData\\Computer\\Anaconda\\envs\\mineru"
+    # 是否开启图片OCR
+    enable-ocr: true
 
diff --git a/ruoyi-admin/src/main/resources/application-prod.yml b/ruoyi-admin/src/main/resources/application-prod.yml
index cb495f9f..fe3e476b 100644
--- a/ruoyi-admin/src/main/resources/application-prod.yml
+++ b/ruoyi-admin/src/main/resources/application-prod.yml
@@ -179,4 +179,11 @@ pdf:
       url: http://localhost:8080
     ai-api:
       url: https://api.pandarobot.chat/v1/chat/completions
-      key: sk-XXXXXX
\ No newline at end of file
+      key: sk-XXXXXX
+  transition:
+    # 是否开启mineru
+    enable-minerU: true
+    # mineru conda环境路径
+    conda-env-path: "F:\\ProgramData\\Computer\\Anaconda\\envs\\mineru"
+    # 是否开启图片OCR
+    enable-ocr: true
\ No newline at end of file
diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java
new file mode 100644
index 00000000..e6d17d43
--- /dev/null
+++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java
@@ -0,0 +1,362 @@
+package org.ruoyi.chain.loader;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentParser;
+import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
+import dev.langchain4j.data.document.parser.apache.tika.ApacheTikaDocumentParser;
+import dev.langchain4j.data.message.AiMessage;
+import dev.langchain4j.data.message.ImageContent;
+import dev.langchain4j.data.message.TextContent;
+import dev.langchain4j.data.message.UserMessage;
+import dev.langchain4j.model.chat.response.ChatResponse;
+import dev.langchain4j.model.openai.OpenAiChatModel;
+import lombok.AllArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.ruoyi.chain.split.TextSplitter;
+import org.ruoyi.common.core.exception.ServiceException;
+import org.ruoyi.common.core.utils.SpringUtils;
+import org.ruoyi.common.core.utils.StringUtils;
+import org.ruoyi.common.core.utils.file.FileUtils;
+import org.ruoyi.common.oss.core.OssClient;
+import org.ruoyi.common.oss.entity.UploadResult;
+import org.ruoyi.common.oss.factory.OssFactory;
+import org.ruoyi.config.properties.PdfProperties;
+import org.ruoyi.system.domain.SysOss;
+import org.ruoyi.system.mapper.SysOssMapper;
+import org.springframework.http.MediaType;
+import org.springframework.stereotype.Component;
+import org.springframework.web.multipart.MultipartFile;
+
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.util.Base64;
+import java.util.Comparator;
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Pdf mineru文件加载器
+ *
+ * @author zpx
+ */
+@Slf4j
+@Component
+@AllArgsConstructor
+public class PdfMinerUFileLoader implements ResourceLoader {
+    private final TextSplitter characterTextSplitter;
+    private final PdfProperties properties;
+    private final SysOssMapper sysOssMapper;
+    // 预编译正则表达式
+    private static final Pattern MD_IMAGE_PATTERN = Pattern.compile("!\\[(.*?)]\\((.*?)(\\s*=\\d+)?\\)");
+
+
+    @Override
+    public String getContent(InputStream inputStream) {
+        String content = "";
+        File tempPdf = null;
+        Path outputPath = null;
+        try {
+            // 创建临时文件
+            tempPdf = createTempFile(inputStream);
+            //构建输出路径
+            outputPath = buildOutputPath();
+            // 执行转换命令
+            Process process = buildProcess(properties.getTransition().getCondaEnvPath(), tempPdf, outputPath);
+            //打印执行日志
+            logProcessOutput(process);
+            int exitCode = process.waitFor();
+            //验证转换结果
+            String verifyResult = verifyResult(tempPdf, outputPath, exitCode);
+
+            // 获取生成的.md文件路径
+            Path mdFilePath = Paths.get(verifyResult);
+            if (Files.exists(mdFilePath)) {
+                log.info("找到Markdown文件: " + mdFilePath);
+                DocumentParser documentParser = new ApacheTikaDocumentParser();
+                Document document = FileSystemDocumentLoader.loadDocument(mdFilePath.toString(), documentParser);
+                if (null != document) {
+                    content = document.text();
+                    // 判断是否md文档
+                    String fileType = FilenameUtils.getExtension(mdFilePath.getFileName().toString());
+                    if ("md".contains(fileType)) {
+                        // 如果是md文件，查找所有图片语法，如果是本地图片，替换成网络图片
+                        StringBuffer sb = replaceImageUrl(content, mdFilePath);
+                        content = sb.toString();
+                    }
+                } else {
+                    log.warn("无法解析文档内容");
+                }
+            } else {
+                log.warn("未找到预期的 .md 文件");
+            }
+            return content;
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        } catch (InterruptedException e) {
+            throw new RuntimeException(e);
+        } finally {
+            if (tempPdf != null) {
+                try {
+                    // 清理临时文件
+                    Files.deleteIfExists(tempPdf.toPath());
+                } catch (IOException e) {
+                    log.warn("删除临时文件失败: {}", tempPdf.getAbsolutePath(), e);
+                }
+            }
+            //清理输出目录
+            if (outputPath != null) {
+                cleanOutputDirectory(outputPath);
+            }
+        }
+    }
+
+    @Override
+    public List<String> getChunkList(String content, String kid) {
+        return characterTextSplitter.split(content, kid);
+    }
+
+    /**
+     * Copies the given stream into a new temporary file named {@code upload_*.pdf}.
+     *
+     * @param is stream to copy; this method reads it but does not close it
+     * @return the temporary file holding the copied bytes
+     * @throws IOException if the temporary file cannot be created or the copy fails
+     */
+    private static File createTempFile(InputStream is) throws IOException {
+        File tempFile = File.createTempFile("upload_", ".pdf");
+        Files.copy(is, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
+        return tempFile;
+    }
+
+
+    /**
+     * Creates a private, per-call output directory beneath the platform-specific base directory.
+     *
+     * @return a freshly created unique directory, so concurrent conversions never share or delete each other's output
+     * @throws IOException if the base directory or the per-call directory cannot be created
+     */
+    private static Path buildOutputPath() throws IOException {
+        Path basePath = isWindows() ?
+                Paths.get(new File("").getCanonicalPath().substring(0, 3)).resolve("minerUOutPut") :
+                Paths.get("/var/minerUOutPut");
+
+        if (!Files.exists(basePath)) {
+            Files.createDirectories(basePath);
+        }
+        return Files.createTempDirectory(basePath, "job_");
+    }
+
+    /**
+     * Reports whether the JVM is running on Windows.
+     *
+     * @return {@code true} if the lower-cased {@code os.name} system property contains "win"
+     */
+    private static boolean isWindows() {
+        return System.getProperty("os.name").toLowerCase().contains("win");
+    }
+
+    /**
+     * Starts magic-pdf after activating the conda environment: via {@code cmd /c} on Windows, {@code bash -c} elsewhere.
+     *
+     * @param condaEnv   conda environment path; double quotes are stripped on Windows, interpolated as-is for bash
+     * @param inputFile  PDF file passed to {@code magic-pdf -p}
+     * @param outputPath directory passed to {@code magic-pdf -o}
+     * @return the started process, with stderr merged into stdout
+     * @throws IOException if the process cannot be started
+     */
+    private static Process buildProcess(String condaEnv, File inputFile, Path outputPath) throws IOException {
+        ProcessBuilder pb = new ProcessBuilder();
+        String[] command;
+
+        if (isWindows()) {
+            command = new String[]{
+                    "cmd", "/c",
+                    "call", "conda", "activate",
+                    condaEnv.replace("\"", ""),
+                    "&&", "magic-pdf",
+                    "-p", inputFile.getAbsolutePath(),
+                    "-o", outputPath.toString()
+            };
+        } else {
+            command = new String[]{
+                    "bash", "-c",
+                    String.format("source '%s/bin/activate' && magic-pdf -p '%s' -o '%s'",
+                            condaEnv,
+                            inputFile.getAbsolutePath(),
+                            outputPath.toString())
+            };
+        }
+
+        return pb.command(command)
+                .redirectErrorStream(true)
+                .start();
+    }
+
+
+    /**
+     * Streams the process output to the log on a dedicated daemon thread that ends when the output closes.
+     *
+     * @param process process whose standard output is logged line by line
+     */
+    private static void logProcessOutput(Process process) {
+        Thread logThread = new Thread(() -> {
+            try (BufferedReader reader = new BufferedReader(
+                    new InputStreamReader(process.getInputStream()))) {
+                reader.lines().forEach(line -> log.info("[PROCESS LOG] {}", line));
+            } catch (IOException | UncheckedIOException e) {
+                log.warn("Failed to read process output", e);
+            }
+        }, "mineru-process-log");
+        // Daemon thread: it must not keep the JVM alive and needs no explicit shutdown.
+        logThread.setDaemon(true);
+        logThread.start();
+    }
+
+    /**
+     * 验证转换结果
+     *
+     * @param inputFile  输入文件
+     * @param outputPath 输出路径
+     * @param exitCode   退出码
+     * @return
+     */
+    private static String verifyResult(File inputFile, Path outputPath, int exitCode) {
+        String baseName = FilenameUtils.removeExtension(inputFile.getName());
+        Path expectedMd = outputPath
+                .resolve(baseName)
+                .resolve("auto")
+                .resolve(baseName + ".md");
+
+        if (exitCode == 0 && Files.exists(expectedMd)) {
+            log.info("转换成功：{}", expectedMd.toString());
+            return expectedMd.toString();
+        }
+        return String.format("转换失败（退出码%d）| 预期文件：%s", exitCode, expectedMd);
+    }
+
+    /**
+     * 正则匹配图片语法
+     * @param content 文本内容
+     * @param basePath 图片路径
+     * @return
+     */
+    private StringBuffer replaceImageUrl(String content, Path basePath)  {
+        // 正则表达式匹配md文件中的图片语法 ![alt text](image url)
+        Matcher matcher = MD_IMAGE_PATTERN.matcher(content);
+
+        StringBuffer sb = new StringBuffer();
+        while (matcher.find()) {
+            String imageUrl = matcher.group(2);
+            // 检查是否是本地图片路径
+            if (!imageUrl.startsWith("http")) {
+                // 获取图片完整路径，上传到Oss中
+                Path imagePath = basePath.getParent().resolve(imageUrl);
+                if (!Files.exists(imagePath)) {
+                    log.error("图片路径不存在: {}", imagePath);
+                }
+                // 获取原始文件名和后缀
+                String originalfileName = imagePath.getFileName().toString();
+                // 获取文件后缀
+                String suffix = StringUtils.substring(originalfileName, originalfileName.lastIndexOf("."),
+                        originalfileName.length());
+                // 读取文件字节流
+                try (InputStream inputStream = Files.newInputStream(imagePath)) {
+                    // 使用 OssClient 直接上传字节流
+                    OssClient storage = OssFactory.instance();
+                    UploadResult uploadResult = storage.uploadSuffix(inputStream, suffix, FileUtils.getMimeType(suffix));
+
+                    // 构建 SysOss 对象并保存数据库记录
+                    SysOss sysOss = new SysOss();
+                    sysOss.setUrl(uploadResult.getUrl());
+                    sysOss.setFileSuffix(suffix);
+                    sysOss.setFileName(uploadResult.getFilename());
+                    sysOss.setOriginalName(originalfileName);
+                    sysOss.setService(storage.getConfigKey());
+
+                    // 插入数据库
+                    sysOssMapper.insert(sysOss);
+
+                    // OCR 处理 & 替换图片链接
+                    String networkImageUrl = uploadResult.getUrl();
+                    //⚠️ 注意：确保 URL 是公网可访问的，否则模型无法加载图片。
+                    //另一种解决方案：使用base64 但是需要申请apikey , 使用demo会出现token超出长度问题。
+                    String imageUrlOCR = imageUrlOCR(networkImageUrl);
+                    matcher.appendReplacement(sb, "![" + matcher.group(1) + imageUrlOCR + "](" + networkImageUrl + ")");
+            } catch (IOException e) {
+                log.error("读取或上传图片失败", e);
+                matcher.appendReplacement(sb, matcher.group(0)); // 保留原图语法
+            }
+            } else {
+                //多模态OCR识别图片内容
+                String imageUrlOCR = imageUrlOCR(imageUrl);
+                matcher.appendReplacement(sb, "![" + matcher.group(1) + imageUrlOCR + "](" + imageUrl + ")");
+            }
+        }
+        matcher.appendTail(sb);
+        return sb;
+    }
+
+    /**
+     * 多模态OCR识别图片内容
+     * @param imageUrl 图片URL
+     * @return
+     */
+    private static String imageUrlOCR(String imageUrl) {
+        OpenAiChatModel model = OpenAiChatModel.builder()
+                .apiKey("demo")
+                .modelName("gpt-4o-mini")
+                .baseUrl("http://langchain4j.dev/demo/openai/v1")
+                .build();
+
+        UserMessage userMessage = UserMessage.from(
+                TextContent.from(
+                        "请按以下逻辑处理图片：\n" +
+                                "1. 文字检测：识别图中所有可见文字（包括水印/标签），若无文字则跳至步骤3\n" +
+                                "2. 文字处理：\n" +
+                                "   a. 按出现顺序完整提取文字（非中文立即翻译）\n" +
+                                "   b. 用20字内总结核心信息，禁止补充解释\n" +
+                                "   c. 描述文字位置(如'顶部居中')、字体特征(颜色/大小)\n" +
+                                "3. 视觉描述：\n" +
+                                "   a. 客观说明主体对象、场景、色彩搭配与画面氛围\n" +
+                                "4. 输出规则：\n" +
+                                "   - 最终输出为纯文本，格式：'[文字总结] 视觉描述 关键词：xx,xx'\n" +
+                                "   - 关键词从内容中提取3个最具代表性的名词"
+                ),
+                ImageContent.from(imageUrl)
+        );
+        ChatResponse chat = model.chat(userMessage);
+        AiMessage answer = chat.aiMessage();
+        return answer.text();
+    }
+
+    /**
+     * 清理输出目录
+     * @param outputPath 输出目录
+     */
+    private static void cleanOutputDirectory(Path outputPath) {
+        if (Files.exists(outputPath)) {
+            try {
+                Files.walk(outputPath)
+                       // 按逆序删除（子目录先删）
+                        .sorted((p1, p2) -> -p1.compareTo(p2))
+                        .forEach(path -> {
+                            try {
+                                Files.delete(path);
+                            } catch (IOException e) {
+                                log.warn("清理输出目录失败: {}", path, e);
+                            }
+                        });
+            } catch (IOException e) {
+                log.error("遍历输出目录失败", e);
+            }
+        }
+    }
+}
diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/ResourceLoaderFactory.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/ResourceLoaderFactory.java
index ec33c668..4fb53dfb 100644
--- a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/ResourceLoaderFactory.java
+++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/ResourceLoaderFactory.java
@@ -3,7 +3,9 @@ package org.ruoyi.chain.loader;
 import lombok.AllArgsConstructor;
 import org.ruoyi.chain.split.*;
 
+import org.ruoyi.config.properties.PdfProperties;
 import org.ruoyi.constant.FileType;
+import org.ruoyi.system.mapper.SysOssMapper;
 import org.springframework.stereotype.Component;
 
 @AllArgsConstructor
@@ -14,15 +16,19 @@ public class ResourceLoaderFactory {
     private final MarkdownTextSplitter markdownTextSplitter;
     private final TokenTextSplitter tokenTextSplitter;
     private final ExcelTextSplitter excelTextSplitter;
+    private final PdfProperties pdfProperties;
+    private final SysOssMapper sysOssMapper;
 
     public ResourceLoader getLoaderByFileType(String fileType){
         if (FileType.isTextFile(fileType)){
             return new TextFileLoader(characterTextSplitter);
         } else if (FileType.isWord(fileType)) {
             return new WordLoader(characterTextSplitter);
+        } else if (FileType.isPdf(fileType) && pdfProperties.getTransition() != null && pdfProperties.getTransition().isEnableMinerU()) {
+            return new PdfMinerUFileLoader(characterTextSplitter,pdfProperties,sysOssMapper);
         } else if (FileType.isPdf(fileType)) {
             return new PdfFileLoader(characterTextSplitter);
-        } else if (FileType.isMdFile(fileType)) {
+        }else if (FileType.isMdFile(fileType)) {
             return new MarkDownFileLoader(markdownTextSplitter);
         }else if (FileType.isExcel(fileType)) {
             return new ExcelFileLoader(excelTextSplitter);
diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/config/properties/PdfProperties.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/config/properties/PdfProperties.java
new file mode 100644
index 00000000..fe89ee39
--- /dev/null
+++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/config/properties/PdfProperties.java
@@ -0,0 +1,83 @@
+package org.ruoyi.config.properties;
+
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import org.springframework.boot.context.properties.ConfigurationProperties;
+import org.springframework.stereotype.Component;
+
+/**
+ * Configuration properties bound from the {@code pdf} prefix.
+ *
+ * @author zpx
+ */
+@Data
+@Component
+@ConfigurationProperties(prefix = "pdf")
+public class PdfProperties {
+
+    /**
+     * Settings bound from {@code pdf.extract}.
+     */
+    private ExtractConfig extract;
+
+    /**
+     * Settings bound from {@code pdf.transition}; defaults to an all-disabled instance so it is never null.
+     */
+    private TransitionConfig transition = new TransitionConfig();
+
+    @Data
+    @NoArgsConstructor
+    public static class ExtractConfig {
+        /**
+         * Settings bound from {@code pdf.extract.service}.
+         */
+        private ServiceConfig service;
+
+        /**
+         * Settings bound from {@code pdf.extract.ai-api}.
+         */
+        private AiApiConfig aiApi;
+
+        @Data
+        @NoArgsConstructor
+        public static class ServiceConfig {
+            /**
+             * Service URL.
+             */
+            private String url;
+        }
+
+        @Data
+        @NoArgsConstructor
+        public static class AiApiConfig {
+            /**
+             * AI API URL.
+             */
+            private String url;
+
+            /**
+             * API key.
+             */
+            private String key;
+        }
+    }
+
+    @Data
+    @NoArgsConstructor
+    public static class TransitionConfig {
+        /**
+         * Whether PDFs are converted through MinerU.
+         */
+        private boolean enableMinerU;
+
+        /**
+         * Path of the conda environment used to run MinerU.
+         */
+        private String condaEnvPath;
+
+        /**
+         * Whether images in the converted Markdown are sent through OCR.
+         */
+        private boolean enableOcr;
+    }
+}
\ No newline at end of file

From 22d9d9ba85eafbfa7f0bc5f834e42ecf5300abe3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=B9=8F=E7=BF=94?= <643541620@qq.com>
Date: Wed, 21 May 2025 14:37:51 +0800
Subject: [PATCH 2/3] =?UTF-8?q?=E6=96=B0=E5=A2=9Epdf=E8=BD=ACmd=E5=90=8E?=
 =?UTF-8?q?=E6=98=AF=E5=90=A6=E8=BF=9B=E8=A1=8C=E5=9B=BE=E7=89=87OCR?=
 =?UTF-8?q?=E5=88=A4=E6=96=AD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java
index e6d17d43..fa7b1299 100644
--- a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java
+++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java
@@ -86,7 +86,8 @@ public class PdfMinerUFileLoader implements ResourceLoader {
                     content = document.text();
                     // 判断是否md文档
                     String fileType = FilenameUtils.getExtension(mdFilePath.getFileName().toString());
-                    if ("md".contains(fileType)) {
+                    // Rewrite image links (and run OCR) only for an actual .md result and only when OCR is enabled
+                    if ("md".equals(fileType) && properties.getTransition().isEnableOcr()) {
                         // 如果是md文件，查找所有图片语法，如果是本地图片，替换成网络图片
                         StringBuffer sb = replaceImageUrl(content, mdFilePath);
                         content = sb.toString();

From 0f827111995a46301c9bcb1fb17fad20e5fae9fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=B9=8F=E7=BF=94?= <643541620@qq.com>
Date: Thu, 22 May 2025 14:05:21 +0800
Subject: [PATCH 3/3] =?UTF-8?q?perf:=201.=E4=BC=98=E5=8C=96=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6=E8=BE=93=E5=87=BA=E8=B7=AF=E5=BE=84,=E9=81=BF?=
 =?UTF-8?q?=E5=85=8D=E5=85=B6=E4=BB=96=E7=9B=98=E7=AC=A6=E6=9D=83=E9=99=90?=
 =?UTF-8?q?=E9=97=AE=E9=A2=98=E3=80=82=20=20=20=20=20=20=202.=E9=87=87?=
 =?UTF-8?q?=E7=94=A8=E7=BA=BF=E7=A8=8B=E6=B1=A0=E5=BC=82=E6=AD=A5=E8=B0=83?=
 =?UTF-8?q?=E7=94=A8=E5=A4=9A=E6=A8=A1=E6=9D=BFOCR=E5=9B=BE=E7=89=87?=
 =?UTF-8?q?=E8=AF=86=E5=88=AB=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../chain/loader/PdfMinerUFileLoader.java     | 192 +++++++++++++-----
 1 file changed, 139 insertions(+), 53 deletions(-)

diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java
index fa7b1299..27ef311e 100644
--- a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java
+++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java
@@ -13,11 +13,7 @@ import dev.langchain4j.model.openai.OpenAiChatModel;
 import lombok.AllArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.io.FilenameUtils;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.text.PDFTextStripper;
 import org.ruoyi.chain.split.TextSplitter;
-import org.ruoyi.common.core.exception.ServiceException;
-import org.ruoyi.common.core.utils.SpringUtils;
 import org.ruoyi.common.core.utils.StringUtils;
 import org.ruoyi.common.core.utils.file.FileUtils;
 import org.ruoyi.common.oss.core.OssClient;
@@ -26,19 +22,16 @@ import org.ruoyi.common.oss.factory.OssFactory;
 import org.ruoyi.config.properties.PdfProperties;
 import org.ruoyi.system.domain.SysOss;
 import org.ruoyi.system.mapper.SysOssMapper;
-import org.springframework.http.MediaType;
 import org.springframework.stereotype.Component;
-import org.springframework.web.multipart.MultipartFile;
 
 import java.io.*;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
-import java.util.Base64;
-import java.util.Comparator;
+import java.util.ArrayList;
 import java.util.List;
-import java.util.concurrent.Executors;
+import java.util.concurrent.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -56,7 +49,19 @@ public class PdfMinerUFileLoader implements ResourceLoader {
     private final SysOssMapper sysOssMapper;
     // 预编译正则表达式
     private static final Pattern MD_IMAGE_PATTERN = Pattern.compile("!\\[(.*?)]\\((.*?)(\\s*=\\d+)?\\)");
-
+    // Shared OCR pool: static because ResourceLoaderFactory creates a loader per call and nothing shuts a pool down
+    private static final ThreadPoolExecutor ocrExecutor = new ThreadPoolExecutor(
+            // core pool size
+            5,
+            // maximum pool size
+            10,
+            // keep-alive for idle threads above the core size
+            60L, TimeUnit.SECONDS,
+            // bounded task queue
+            new LinkedBlockingQueue<>(100),
+            // when saturated, run the task on the submitting thread
+            new ThreadPoolExecutor.CallerRunsPolicy()
+    );
 
     @Override
     public String getContent(InputStream inputStream) {
@@ -99,9 +104,7 @@ public class PdfMinerUFileLoader implements ResourceLoader {
                 log.warn("未找到预期的 .md 文件");
             }
             return content;
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        } catch (InterruptedException e) {
+        } catch (Exception e) {
             throw new RuntimeException(e);
         } finally {
             if (tempPdf != null) {
@@ -146,7 +149,8 @@ public class PdfMinerUFileLoader implements ResourceLoader {
      */
     private static Path buildOutputPath() throws IOException {
         Path basePath = isWindows() ?
-                Paths.get(new File("").getCanonicalPath().substring(0, 3)).resolve("minerUOutPut") :
+                //  Windows C盘用户路径下 minerUOutPut，避免其他盘符权限问题
+                Paths.get(System.getProperty("user.home")).resolve("minerUOutPut") :
                 Paths.get("/var/minerUOutPut");
 
         if (!Files.exists(basePath)) {
@@ -243,70 +247,138 @@ public class PdfMinerUFileLoader implements ResourceLoader {
         return String.format("转换失败（退出码%d）| 预期文件：%s", exitCode, expectedMd);
     }
 
+
     /**
-     * 正则匹配图片语法
-     * @param content 文本内容
+     * 正则匹配图片语法,多线程进行处理
+     *
+     * @param content  文本内容
      * @param basePath 图片路径
      * @return
      */
-    private StringBuffer replaceImageUrl(String content, Path basePath)  {
-        // 正则表达式匹配md文件中的图片语法 ![alt text](image url)
+    private StringBuffer replaceImageUrl(String content, Path basePath) throws Exception {
+        List<ImageMatch> matches = new ArrayList<>();
         Matcher matcher = MD_IMAGE_PATTERN.matcher(content);
 
-        StringBuffer sb = new StringBuffer();
+        // 收集所有匹配的图片项
         while (matcher.find()) {
-            String imageUrl = matcher.group(2);
-            // 检查是否是本地图片路径
-            if (!imageUrl.startsWith("http")) {
-                // 获取图片完整路径，上传到Oss中
-                Path imagePath = basePath.getParent().resolve(imageUrl);
+            ImageMatch imgMatch = new ImageMatch();
+            imgMatch.altText = matcher.group(1);
+            imgMatch.imageUrl = matcher.group(2);
+            imgMatch.start = matcher.start();
+            imgMatch.end = matcher.end();
+            matches.add(imgMatch);
+        }
+
+        if (matches.isEmpty()) {
+            return new StringBuffer(content);
+        }
+
+        // 提交任务到线程池
+        List<Future<String>> futures = new ArrayList<>();
+        for (ImageMatch imgMatch : matches) {
+            // 为每个图片项创建独立任务
+            Future<String> future = ocrExecutor.submit(() -> processImage(imgMatch, basePath));
+            futures.add(future);
+        }
+
+        // 按原始顺序拼接结果
+        StringBuffer sb = new StringBuffer();
+        int previousEnd = 0;
+
+        for (int i = 0; i < matches.size(); i++) {
+            ImageMatch imgMatch = matches.get(i);
+            // 阻塞等待结果
+            String replacement = futures.get(i).get();
+
+            // 插入未匹配的原始文本和处理后的结果
+            sb.append(content.substring(previousEnd, imgMatch.start));
+            sb.append(replacement);
+            previousEnd = imgMatch.end;
+        }
+        // 添加剩余文本
+        sb.append(content.substring(previousEnd));
+        return sb;
+    }
+
+
+    /**
+     * 图片处理任务
+     *
+     * @param imgMatch 图片匹配结果
+     * @param basePath 本地图片路径
+     * @return
+     */
+    private String processImage(ImageMatch imgMatch, Path basePath) {
+        try {
+            if (!imgMatch.imageUrl.startsWith("http")) {
+                // 处理本地图片
+                Path imagePath = basePath.getParent().resolve(imgMatch.imageUrl).normalize();
+
                 if (!Files.exists(imagePath)) {
                     log.error("图片路径不存在: {}", imagePath);
+                    return String.format("![%s](%s)", imgMatch.altText, imgMatch.imageUrl);
                 }
-                // 获取原始文件名和后缀
-                String originalfileName = imagePath.getFileName().toString();
-                // 获取文件后缀
-                String suffix = StringUtils.substring(originalfileName, originalfileName.lastIndexOf("."),
-                        originalfileName.length());
-                // 读取文件字节流
+
+                // 文件后缀安全提取
+                String originalFileName = imagePath.getFileName().toString();
+                String suffix = "";
+                int lastDotIndex = originalFileName.lastIndexOf(".");
+                if (lastDotIndex != -1) {
+                    suffix = originalFileName.substring(lastDotIndex);
+                }
+
+                // 上传OSS
                 try (InputStream inputStream = Files.newInputStream(imagePath)) {
-                    // 使用 OssClient 直接上传字节流
                     OssClient storage = OssFactory.instance();
                     UploadResult uploadResult = storage.uploadSuffix(inputStream, suffix, FileUtils.getMimeType(suffix));
 
-                    // 构建 SysOss 对象并保存数据库记录
+                    // 保存数据库记录
                     SysOss sysOss = new SysOss();
                     sysOss.setUrl(uploadResult.getUrl());
                     sysOss.setFileSuffix(suffix);
                     sysOss.setFileName(uploadResult.getFilename());
-                    sysOss.setOriginalName(originalfileName);
+                    sysOss.setOriginalName(originalFileName);
                     sysOss.setService(storage.getConfigKey());
-
-                    // 插入数据库
                     sysOssMapper.insert(sysOss);
 
-                    // OCR 处理 & 替换图片链接
-                    String networkImageUrl = uploadResult.getUrl();
+                    // OCR处理
+                    String networkUrl = uploadResult.getUrl();
                     //⚠️ 注意：确保 URL 是公网可访问的，否则模型无法加载图片。
                     //另一种解决方案：使用base64 但是需要申请apikey , 使用demo会出现token超出长度问题。
-                    String imageUrlOCR = imageUrlOCR(networkImageUrl);
-                    matcher.appendReplacement(sb, "![" + matcher.group(1) + imageUrlOCR + "](" + networkImageUrl + ")");
-            } catch (IOException e) {
-                log.error("读取或上传图片失败", e);
-                matcher.appendReplacement(sb, matcher.group(0)); // 保留原图语法
-            }
+                    String ocrResult = safeImageUrlOCR(networkUrl);
+                    return String.format("![%s%s](%s)", imgMatch.altText, ocrResult, networkUrl);
+                }
             } else {
-                //多模态OCR识别图片内容
-                String imageUrlOCR = imageUrlOCR(imageUrl);
-                matcher.appendReplacement(sb, "![" + matcher.group(1) + imageUrlOCR + "](" + imageUrl + ")");
+                // 处理远程图片
+                String ocrResult = safeImageUrlOCR(imgMatch.imageUrl);
+                return String.format("![%s%s](%s)", imgMatch.altText, ocrResult, imgMatch.imageUrl);
             }
+        } catch (Exception e) {
+            log.error("图片处理失败: {}", imgMatch.imageUrl, e);
+            return String.format("![%s](%s)", imgMatch.altText, imgMatch.imageUrl);
         }
-        matcher.appendTail(sb);
-        return sb;
     }
 
+    /**
+     * Runs {@code imageUrlOCR} and converts any failure into an empty result instead of propagating it.
+     *
+     * @param imageUrl URL of the image to describe
+     * @return the OCR text, or an empty string if the OCR call throws
+     */
+    private String safeImageUrlOCR(String imageUrl) {
+        try {
+            return imageUrlOCR(imageUrl);
+        } catch (Exception e) {
+            log.warn("OCR处理失败: {}", imageUrl, e);
+            // Fall back to an empty string when OCR fails
+            return "";
+        }
+    }
+
+
     /**
      * 多模态OCR识别图片内容
+     *
      * @param imageUrl 图片URL
      * @return
      */
@@ -322,14 +394,16 @@ public class PdfMinerUFileLoader implements ResourceLoader {
                         "请按以下逻辑处理图片：\n" +
                                 "1. 文字检测：识别图中所有可见文字（包括水印/标签），若无文字则跳至步骤3\n" +
                                 "2. 文字处理：\n" +
-                                "   a. 按出现顺序完整提取文字（非中文立即翻译）\n" +
-                                "   b. 用20字内总结核心信息，禁止补充解释\n" +
+                                "   a. 对识别到的文字进行❗核心信息提炼\n" +
+                                "   b. ❗禁止直接输出原文内容\n" +
                                 "   c. 描述文字位置(如'顶部居中')、字体特征(颜色/大小)\n" +
                                 "3. 视觉描述：\n" +
-                                "   a. 客观说明主体对象、场景、色彩搭配与画面氛围\n" +
+                                "   a. 若无文字则用❗50字内简洁描述主体对象、场景、色彩搭配与画面氛围\n" +
+                                "   b. 若有文字则补充说明文字与画面的关系\n" +
                                 "4. 输出规则：\n" +
                                 "   - 最终输出为纯文本，格式：'[文字总结] 视觉描述 关键词：xx,xx'\n" +
-                                "   - 关键词从内容中提取3个最具代表性的名词"
+                                "   - 关键词从内容中提取3个最具代表性的名词\n" +
+                                "   - 无文字时格式：'[空] 简洁描述 关键词：xx,xx'"
                 ),
                 ImageContent.from(imageUrl)
         );
@@ -338,15 +412,27 @@ public class PdfMinerUFileLoader implements ResourceLoader {
         return answer.text();
     }
 
+    /**
+     * Holds one Markdown image match found by {@code MD_IMAGE_PATTERN}.
+     */
+    private static class ImageMatch {
+        String altText; // alt text (regex group 1)
+        String imageUrl; // image URL or local path (regex group 2)
+        int start; // start offset of the match in the content
+        int end; // end offset of the match in the content
+    }
+
+
     /**
      * 清理输出目录
+     *
      * @param outputPath 输出目录
      */
     private static void cleanOutputDirectory(Path outputPath) {
         if (Files.exists(outputPath)) {
             try {
                 Files.walk(outputPath)
-                       // 按逆序删除（子目录先删）
+                        // 按逆序删除（子目录先删）
                         .sorted((p1, p2) -> -p1.compareTo(p2))
                         .forEach(path -> {
                             try {