From 86825eeb2e0c38b796681c7502c9ca88735a23af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=B9=8F=E7=BF=94?= <643541620@qq.com> Date: Wed, 21 May 2025 14:25:44 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E7=BB=93=E5=90=88mineru=E6=96=B0=E5=A2=9Ep?= =?UTF-8?q?df=E8=BD=AC=E6=8D=A2=E7=BB=93=E6=9E=84=E5=8C=96=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/resources/application-dev.yml | 13 +- .../src/main/resources/application-prod.yml | 9 +- .../chain/loader/PdfMinerUFileLoader.java | 362 ++++++++++++++++++ .../chain/loader/ResourceLoaderFactory.java | 8 +- .../config/properties/PdfProperties.java | 83 ++++ 5 files changed, 470 insertions(+), 5 deletions(-) create mode 100644 ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java create mode 100644 ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/config/properties/PdfProperties.java diff --git a/ruoyi-admin/src/main/resources/application-dev.yml b/ruoyi-admin/src/main/resources/application-dev.yml index 954deb66..7f591d51 100644 --- a/ruoyi-admin/src/main/resources/application-dev.yml +++ b/ruoyi-admin/src/main/resources/application-dev.yml @@ -25,9 +25,9 @@ spring: master: type: ${spring.datasource.type} driverClassName: com.mysql.cj.jdbc.Driver - url: jdbc:mysql://127.0.0.1:3306/ry-vue?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true - username: ry-vue - password: xx + url: jdbc:mysql://127.0.0.1:3306/ruoyi-ai?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true + username: root + password: root hikari: # 最大连接池数量 @@ -101,4 +101,11 @@ pdf: ai-api: url: https://api.pandarobot.chat/v1/chat/completions key: sk-xxxx + transition: + # 是否开启mineru + enable-minerU: true + # mineru conda环境路径 + conda-env-path: "F:\\ProgramData\\Computer\\Anaconda\\envs\\mineru" + # 是否开启图片OCR + enable-ocr: true diff --git a/ruoyi-admin/src/main/resources/application-prod.yml b/ruoyi-admin/src/main/resources/application-prod.yml index cb495f9f..fe3e476b 100644 --- a/ruoyi-admin/src/main/resources/application-prod.yml +++ b/ruoyi-admin/src/main/resources/application-prod.yml @@ -179,4 +179,11 @@ pdf: url: http://localhost:8080 ai-api: url: https://api.pandarobot.chat/v1/chat/completions - key: sk-XXXXXX \ No newline at end of file + key: sk-XXXXXX + transition: + # 是否开启mineru + enable-minerU: true + # mineru conda环境路径 + conda-env-path: "F:\\ProgramData\\Computer\\Anaconda\\envs\\mineru" + # 是否开启图片OCR + enable-ocr: true \ No newline at end of file diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java new file mode 100644 index 00000000..e6d17d43 --- /dev/null +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java @@ -0,0 +1,362 @@ +package org.ruoyi.chain.loader; + +import dev.langchain4j.data.document.Document; +import dev.langchain4j.data.document.DocumentParser; +import dev.langchain4j.data.document.loader.FileSystemDocumentLoader; +import dev.langchain4j.data.document.parser.apache.tika.ApacheTikaDocumentParser; +import dev.langchain4j.data.message.AiMessage; +import dev.langchain4j.data.message.ImageContent; +import dev.langchain4j.data.message.TextContent; +import dev.langchain4j.data.message.UserMessage; +import dev.langchain4j.model.chat.response.ChatResponse; +import dev.langchain4j.model.openai.OpenAiChatModel; +import lombok.AllArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FilenameUtils; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; +import org.ruoyi.chain.split.TextSplitter; +import org.ruoyi.common.core.exception.ServiceException; +import org.ruoyi.common.core.utils.SpringUtils; +import org.ruoyi.common.core.utils.StringUtils; +import org.ruoyi.common.core.utils.file.FileUtils; +import org.ruoyi.common.oss.core.OssClient; +import org.ruoyi.common.oss.entity.UploadResult; +import org.ruoyi.common.oss.factory.OssFactory; +import org.ruoyi.config.properties.PdfProperties; +import org.ruoyi.system.domain.SysOss; +import org.ruoyi.system.mapper.SysOssMapper; +import org.springframework.http.MediaType; +import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.util.Base64; +import java.util.Comparator; +import java.util.List; +import java.util.concurrent.Executors; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Pdf mineru文件加载器 + * + * @author zpx + */ +@Slf4j +@Component +@AllArgsConstructor +public class PdfMinerUFileLoader implements ResourceLoader { + private final TextSplitter characterTextSplitter; + private final PdfProperties properties; + private final SysOssMapper sysOssMapper; + // 预编译正则表达式 + private static final Pattern MD_IMAGE_PATTERN = Pattern.compile("!\\[(.*?)]\\((.*?)(\\s*=\\d+)?\\)"); + + + @Override + public String getContent(InputStream inputStream) { + String content = ""; + File tempPdf = null; + Path outputPath = null; + try { + // 创建临时文件 + tempPdf = createTempFile(inputStream); + //构建输出路径 + outputPath = buildOutputPath(); + // 执行转换命令 + Process process = buildProcess(properties.getTransition().getCondaEnvPath(), tempPdf, outputPath); + //打印执行日志 + logProcessOutput(process); + int exitCode = process.waitFor(); + //验证转换结果 + String verifyResult = verifyResult(tempPdf, outputPath, exitCode); + + // 获取生成的.md文件路径 + Path mdFilePath = Paths.get(verifyResult); + if (Files.exists(mdFilePath)) { + log.info("找到Markdown文件: " + mdFilePath); + DocumentParser documentParser = new ApacheTikaDocumentParser(); + Document document = FileSystemDocumentLoader.loadDocument(mdFilePath.toString(), documentParser); + if (null != document) { + content = document.text(); + // 判断是否md文档 + String fileType = FilenameUtils.getExtension(mdFilePath.getFileName().toString()); + if ("md".contains(fileType)) { + // 如果是md文件,查找所有图片语法,如果是本地图片,替换成网络图片 + StringBuffer sb = replaceImageUrl(content, mdFilePath); + content = sb.toString(); + } + } else { + log.warn("无法解析文档内容"); + } + } else { + log.warn("未找到预期的 .md 文件"); + } + return content; + } catch (IOException e) { + throw new RuntimeException(e); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } finally { + if (tempPdf != null) { + try { + // 清理临时文件 + Files.deleteIfExists(tempPdf.toPath()); + } catch (IOException e) { + log.warn("删除临时文件失败: {}", tempPdf.getAbsolutePath(), e); + } + } + //清理输出目录 + if (outputPath != null) { + cleanOutputDirectory(outputPath); + } + } + } + + @Override + public List getChunkList(String content, String kid) { + return characterTextSplitter.split(content, kid); + } + + /** + * 创建临时PDF文件 + * + * @param is 输入流 + * @return + * @throws IOException + */ + private static File createTempFile(InputStream is) throws IOException { + File tempFile = File.createTempFile("upload_", ".pdf"); + Files.copy(is, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + return tempFile; + } + + + /** + * 构建跨平台文件输出路径 + * + * @return + * @throws IOException + */ + private static Path buildOutputPath() throws IOException { + Path basePath = isWindows() ? + Paths.get(new File("").getCanonicalPath().substring(0, 3)).resolve("minerUOutPut") : + Paths.get("/var/minerUOutPut"); + + if (!Files.exists(basePath)) { + Files.createDirectories(basePath); + } + return basePath; + } + + /** + * 判断当前操作系统是否为Windows + * + * @return + */ + private static boolean isWindows() { + return System.getProperty("os.name").toLowerCase().contains("win"); + } + + /** + * 执行命令 + * + * @param condaEnv conda环境路径 + * @param inputFile 输入文件 + * @param outputPath 输出路径 + * @return + * @throws IOException + */ + private static Process buildProcess(String condaEnv, File inputFile, Path outputPath) throws IOException { + ProcessBuilder pb = new ProcessBuilder(); + String[] command; + + if (isWindows()) { + command = new String[]{ + "cmd", "/c", + "call", "conda", "activate", + condaEnv.replace("\"", ""), + "&&", "magic-pdf", + "-p", inputFile.getAbsolutePath(), + "-o", outputPath.toString() + }; + } else { + command = new String[]{ + "bash", "-c", + String.format("source '%s/bin/activate' && magic-pdf -p '%s' -o '%s'", + condaEnv, + inputFile.getAbsolutePath(), + outputPath.toString()) + }; + } + + return pb.command(command) + .redirectErrorStream(true) + .start(); + } + + + /** + * 实时日志输出 + * + * @param process 进程 + */ + private static void logProcessOutput(Process process) { + Executors.newSingleThreadExecutor().submit(() -> { + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(process.getInputStream()))) { + String line; + while ((line = reader.readLine()) != null) { + log.info("[PROCESS LOG] " + line); + } + } catch (IOException e) { + e.printStackTrace(); + } + }); + } + + /** + * 验证转换结果 + * + * @param inputFile 输入文件 + * @param outputPath 输出路径 + * @param exitCode 退出码 + * @return + */ + private static String verifyResult(File inputFile, Path outputPath, int exitCode) { + String baseName = FilenameUtils.removeExtension(inputFile.getName()); + Path expectedMd = outputPath + .resolve(baseName) + .resolve("auto") + .resolve(baseName + ".md"); + + if (exitCode == 0 && Files.exists(expectedMd)) { + log.info("转换成功:{}", expectedMd.toString()); + return expectedMd.toString(); + } + return String.format("转换失败(退出码%d)| 预期文件:%s", exitCode, expectedMd); + } + + /** + * 正则匹配图片语法 + * @param content 文本内容 + * @param basePath 图片路径 + * @return + */ + private StringBuffer replaceImageUrl(String content, Path basePath) { + // 正则表达式匹配md文件中的图片语法 ![alt text](image url) + Matcher matcher = MD_IMAGE_PATTERN.matcher(content); + + StringBuffer sb = new StringBuffer(); + while (matcher.find()) { + String imageUrl = matcher.group(2); + // 检查是否是本地图片路径 + if (!imageUrl.startsWith("http")) { + // 获取图片完整路径,上传到Oss中 + Path imagePath = basePath.getParent().resolve(imageUrl); + if (!Files.exists(imagePath)) { + log.error("图片路径不存在: {}", imagePath); + } + // 获取原始文件名和后缀 + String originalfileName = imagePath.getFileName().toString(); + // 获取文件后缀 + String suffix = StringUtils.substring(originalfileName, originalfileName.lastIndexOf("."), + originalfileName.length()); + // 读取文件字节流 + try (InputStream inputStream = Files.newInputStream(imagePath)) { + // 使用 OssClient 直接上传字节流 + OssClient storage = OssFactory.instance(); + UploadResult uploadResult = storage.uploadSuffix(inputStream, suffix, FileUtils.getMimeType(suffix)); + + // 构建 SysOss 对象并保存数据库记录 + SysOss sysOss = new SysOss(); + sysOss.setUrl(uploadResult.getUrl()); + sysOss.setFileSuffix(suffix); + sysOss.setFileName(uploadResult.getFilename()); + sysOss.setOriginalName(originalfileName); + sysOss.setService(storage.getConfigKey()); + + // 插入数据库 + sysOssMapper.insert(sysOss); + + // OCR 处理 & 替换图片链接 + String networkImageUrl = uploadResult.getUrl(); + //⚠️ 注意:确保 URL 是公网可访问的,否则模型无法加载图片。 + //另一种解决方案:使用base64 但是需要申请apikey , 使用demo会出现token超出长度问题。 + String imageUrlOCR = imageUrlOCR(networkImageUrl); + matcher.appendReplacement(sb, "![" + matcher.group(1) + imageUrlOCR + "](" + networkImageUrl + ")"); + } catch (IOException e) { + log.error("读取或上传图片失败", e); + matcher.appendReplacement(sb, matcher.group(0)); // 保留原图语法 + } + } else { + //多模态OCR识别图片内容 + String imageUrlOCR = imageUrlOCR(imageUrl); + matcher.appendReplacement(sb, "![" + matcher.group(1) + imageUrlOCR + "](" + imageUrl + ")"); + } + } + matcher.appendTail(sb); + return sb; + } + + /** + * 多模态OCR识别图片内容 + * @param imageUrl 图片URL + * @return + */ + private static String imageUrlOCR(String imageUrl) { + OpenAiChatModel model = OpenAiChatModel.builder() + .apiKey("demo") + .modelName("gpt-4o-mini") + .baseUrl("http://langchain4j.dev/demo/openai/v1") + .build(); + + UserMessage userMessage = UserMessage.from( + TextContent.from( + "请按以下逻辑处理图片:\n" + + "1. 文字检测:识别图中所有可见文字(包括水印/标签),若无文字则跳至步骤3\n" + + "2. 文字处理:\n" + + " a. 按出现顺序完整提取文字(非中文立即翻译)\n" + + " b. 用20字内总结核心信息,禁止补充解释\n" + + " c. 描述文字位置(如'顶部居中')、字体特征(颜色/大小)\n" + + "3. 视觉描述:\n" + + " a. 客观说明主体对象、场景、色彩搭配与画面氛围\n" + + "4. 输出规则:\n" + + " - 最终输出为纯文本,格式:'[文字总结] 视觉描述 关键词:xx,xx'\n" + + " - 关键词从内容中提取3个最具代表性的名词" + ), + ImageContent.from(imageUrl) + ); + ChatResponse chat = model.chat(userMessage); + AiMessage answer = chat.aiMessage(); + return answer.text(); + } + + /** + * 清理输出目录 + * @param outputPath 输出目录 + */ + private static void cleanOutputDirectory(Path outputPath) { + if (Files.exists(outputPath)) { + try { + Files.walk(outputPath) + // 按逆序删除(子目录先删) + .sorted((p1, p2) -> -p1.compareTo(p2)) + .forEach(path -> { + try { + Files.delete(path); + } catch (IOException e) { + log.warn("清理输出目录失败: {}", path, e); + } + }); + } catch (IOException e) { + log.error("遍历输出目录失败", e); + } + } + } +} diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/ResourceLoaderFactory.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/ResourceLoaderFactory.java index ec33c668..4fb53dfb 100644 --- a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/ResourceLoaderFactory.java +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/ResourceLoaderFactory.java @@ -3,7 +3,9 @@ package org.ruoyi.chain.loader; import lombok.AllArgsConstructor; import org.ruoyi.chain.split.*; +import org.ruoyi.config.properties.PdfProperties; import org.ruoyi.constant.FileType; +import org.ruoyi.system.mapper.SysOssMapper; import org.springframework.stereotype.Component; @AllArgsConstructor @@ -14,15 +16,19 @@ public class ResourceLoaderFactory { private final MarkdownTextSplitter markdownTextSplitter; private final TokenTextSplitter tokenTextSplitter; private final ExcelTextSplitter excelTextSplitter; + private final PdfProperties pdfProperties; + private final SysOssMapper sysOssMapper; public ResourceLoader getLoaderByFileType(String fileType){ if (FileType.isTextFile(fileType)){ return new TextFileLoader(characterTextSplitter); } else if (FileType.isWord(fileType)) { return new WordLoader(characterTextSplitter); + } else if (FileType.isPdf(fileType) && pdfProperties.getTransition().isEnableMinerU()) { + return new PdfMinerUFileLoader(characterTextSplitter,pdfProperties,sysOssMapper); } else if (FileType.isPdf(fileType)) { return new PdfFileLoader(characterTextSplitter); - } else if (FileType.isMdFile(fileType)) { + }else if (FileType.isMdFile(fileType)) { return new MarkDownFileLoader(markdownTextSplitter); }else if (FileType.isExcel(fileType)) { return new ExcelFileLoader(excelTextSplitter); diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/config/properties/PdfProperties.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/config/properties/PdfProperties.java new file mode 100644 index 00000000..fe89ee39 --- /dev/null +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/config/properties/PdfProperties.java @@ -0,0 +1,83 @@ +package org.ruoyi.config.properties; + +import lombok.Data; +import lombok.NoArgsConstructor; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.stereotype.Component; + +/** + * PDF 配置属性 + * + * @author zpx + */ +@Data +@Component +@ConfigurationProperties(prefix = "pdf") +public class PdfProperties { + + /** + * Extract 配置 + */ + private ExtractConfig extract; + + /** + * Transition 配置 + */ + private TransitionConfig transition; + + @Data + @NoArgsConstructor + public static class ExtractConfig { + /** + * Service 配置 + */ + private ServiceConfig service; + + /** + * AI API 配置 + */ + private AiApiConfig aiApi; + + @Data + @NoArgsConstructor + public static class ServiceConfig { + /** + * 服务地址 URL + */ + private String url; + } + + @Data + @NoArgsConstructor + public static class AiApiConfig { + /** + * AI API 地址 URL + */ + private String url; + + /** + * API 密钥 + */ + private String key; + } + } + + @Data + @NoArgsConstructor + public static class TransitionConfig { + /** + * 是否启用 MinerU + */ + private boolean enableMinerU; + + /** + * MinerU Conda 环境路径 + */ + private String condaEnvPath; + + /** + * 是否启用图片 OCR + */ + private boolean enableOcr; + } +} \ No newline at end of file From 22d9d9ba85eafbfa7f0bc5f834e42ecf5300abe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=B9=8F=E7=BF=94?= <643541620@qq.com> Date: Wed, 21 May 2025 14:37:51 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E6=96=B0=E5=A2=9Epdf=E8=BD=ACmd=E5=90=8E?= =?UTF-8?q?=E6=98=AF=E5=90=A6=E8=BF=9B=E8=A1=8C=E5=9B=BE=E7=89=87OCR?= =?UTF-8?q?=E5=88=A4=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java index e6d17d43..fa7b1299 100644 --- a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java @@ -86,7 +86,8 @@ public class PdfMinerUFileLoader implements ResourceLoader { content = document.text(); // 判断是否md文档 String fileType = FilenameUtils.getExtension(mdFilePath.getFileName().toString()); - if ("md".contains(fileType)) { + //判断是否需要进行图片OCR识别 + if ("md".contains(fileType) && properties.getTransition().isEnableOcr()) { // 如果是md文件,查找所有图片语法,如果是本地图片,替换成网络图片 StringBuffer sb = replaceImageUrl(content, mdFilePath); content = sb.toString(); From 0f827111995a46301c9bcb1fb17fad20e5fae9fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=B9=8F=E7=BF=94?= <643541620@qq.com> Date: Thu, 22 May 2025 14:05:21 +0800 Subject: [PATCH 3/3] =?UTF-8?q?perf:=201.=E4=BC=98=E5=8C=96=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E8=BE=93=E5=87=BA=E8=B7=AF=E5=BE=84,=E9=81=BF?= =?UTF-8?q?=E5=85=8D=E5=85=B6=E4=BB=96=E7=9B=98=E7=AC=A6=E6=9D=83=E9=99=90?= =?UTF-8?q?=E9=97=AE=E9=A2=98=E3=80=82=20=20=20=20=20=20=202.=E9=87=87?= =?UTF-8?q?=E7=94=A8=E7=BA=BF=E7=A8=8B=E6=B1=A0=E5=BC=82=E6=AD=A5=E8=B0=83?= =?UTF-8?q?=E7=94=A8=E5=A4=9A=E6=A8=A1=E6=9D=BFOCR=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E8=AF=86=E5=88=AB=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../chain/loader/PdfMinerUFileLoader.java | 192 +++++++++++++----- 1 file changed, 139 insertions(+), 53 deletions(-) diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java index fa7b1299..27ef311e 100644 --- a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/loader/PdfMinerUFileLoader.java @@ -13,11 +13,7 @@ import dev.langchain4j.model.openai.OpenAiChatModel; import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.FilenameUtils; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.text.PDFTextStripper; import org.ruoyi.chain.split.TextSplitter; -import org.ruoyi.common.core.exception.ServiceException; -import org.ruoyi.common.core.utils.SpringUtils; import org.ruoyi.common.core.utils.StringUtils; import org.ruoyi.common.core.utils.file.FileUtils; import org.ruoyi.common.oss.core.OssClient; @@ -26,19 +22,16 @@ import org.ruoyi.common.oss.factory.OssFactory; import org.ruoyi.config.properties.PdfProperties; import org.ruoyi.system.domain.SysOss; import org.ruoyi.system.mapper.SysOssMapper; -import org.springframework.http.MediaType; import org.springframework.stereotype.Component; -import org.springframework.web.multipart.MultipartFile; import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; -import java.util.Base64; -import java.util.Comparator; +import java.util.ArrayList; import java.util.List; -import java.util.concurrent.Executors; +import java.util.concurrent.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -56,7 +49,19 @@ public class PdfMinerUFileLoader implements ResourceLoader { private final SysOssMapper sysOssMapper; // 预编译正则表达式 private static final Pattern MD_IMAGE_PATTERN = Pattern.compile("!\\[(.*?)]\\((.*?)(\\s*=\\d+)?\\)"); - + // OCR图片识别线程池 + private final ThreadPoolExecutor ocrExecutor = new ThreadPoolExecutor( + // 核心线程数 + 5, + // 最大线程数 + 10, + // 空闲线程存活时间 + 60L, TimeUnit.SECONDS, + // 任务队列容量 + new LinkedBlockingQueue<>(100), + // 拒绝策略 + new ThreadPoolExecutor.CallerRunsPolicy() + ); @Override public String getContent(InputStream inputStream) { @@ -99,9 +104,7 @@ public class PdfMinerUFileLoader implements ResourceLoader { log.warn("未找到预期的 .md 文件"); } return content; - } catch (IOException e) { - throw new RuntimeException(e); - } catch (InterruptedException e) { + } catch (Exception e) { throw new RuntimeException(e); } finally { if (tempPdf != null) { @@ -146,7 +149,8 @@ public class PdfMinerUFileLoader implements ResourceLoader { */ private static Path buildOutputPath() throws IOException { Path basePath = isWindows() ? - Paths.get(new File("").getCanonicalPath().substring(0, 3)).resolve("minerUOutPut") : + // Windows C盘用户路径下 minerUOutPut,避免其他盘符权限问题 + Paths.get(System.getProperty("user.home")).resolve("minerUOutPut") : Paths.get("/var/minerUOutPut"); if (!Files.exists(basePath)) { @@ -243,70 +247,138 @@ public class PdfMinerUFileLoader implements ResourceLoader { return String.format("转换失败(退出码%d)| 预期文件:%s", exitCode, expectedMd); } + /** - * 正则匹配图片语法 - * @param content 文本内容 + * 正则匹配图片语法,多线程进行处理 + * + * @param content 文本内容 * @param basePath 图片路径 * @return */ - private StringBuffer replaceImageUrl(String content, Path basePath) { - // 正则表达式匹配md文件中的图片语法 ![alt text](image url) + private StringBuffer replaceImageUrl(String content, Path basePath) throws Exception { + List matches = new ArrayList<>(); Matcher matcher = MD_IMAGE_PATTERN.matcher(content); - StringBuffer sb = new StringBuffer(); + // 收集所有匹配的图片项 while (matcher.find()) { - String imageUrl = matcher.group(2); - // 检查是否是本地图片路径 - if (!imageUrl.startsWith("http")) { - // 获取图片完整路径,上传到Oss中 - Path imagePath = basePath.getParent().resolve(imageUrl); + ImageMatch imgMatch = new ImageMatch(); + imgMatch.altText = matcher.group(1); + imgMatch.imageUrl = matcher.group(2); + imgMatch.start = matcher.start(); + imgMatch.end = matcher.end(); + matches.add(imgMatch); + } + + if (matches.isEmpty()) { + return new StringBuffer(content); + } + + // 提交任务到线程池 + List> futures = new ArrayList<>(); + for (ImageMatch imgMatch : matches) { + // 为每个图片项创建独立任务 + Future future = ocrExecutor.submit(() -> processImage(imgMatch, basePath)); + futures.add(future); + } + + // 按原始顺序拼接结果 + StringBuffer sb = new StringBuffer(); + int previousEnd = 0; + + for (int i = 0; i < matches.size(); i++) { + ImageMatch imgMatch = matches.get(i); + // 阻塞等待结果 + String replacement = futures.get(i).get(); + + // 插入未匹配的原始文本和处理后的结果 + sb.append(content.substring(previousEnd, imgMatch.start)); + sb.append(replacement); + previousEnd = imgMatch.end; + } + // 添加剩余文本 + sb.append(content.substring(previousEnd)); + return sb; + } + + + /** + * 图片处理任务 + * + * @param imgMatch 图片匹配结果 + * @param basePath 本地图片路径 + * @return + */ + private String processImage(ImageMatch imgMatch, Path basePath) { + try { + if (!imgMatch.imageUrl.startsWith("http")) { + // 处理本地图片 + Path imagePath = basePath.getParent().resolve(imgMatch.imageUrl).normalize(); + if (!Files.exists(imagePath)) { log.error("图片路径不存在: {}", imagePath); + return String.format("![%s](%s)", imgMatch.altText, imgMatch.imageUrl); } - // 获取原始文件名和后缀 - String originalfileName = imagePath.getFileName().toString(); - // 获取文件后缀 - String suffix = StringUtils.substring(originalfileName, originalfileName.lastIndexOf("."), - originalfileName.length()); - // 读取文件字节流 + + // 文件后缀安全提取 + String originalFileName = imagePath.getFileName().toString(); + String suffix = ""; + int lastDotIndex = originalFileName.lastIndexOf("."); + if (lastDotIndex != -1) { + suffix = originalFileName.substring(lastDotIndex); + } + + // 上传OSS try (InputStream inputStream = Files.newInputStream(imagePath)) { - // 使用 OssClient 直接上传字节流 OssClient storage = OssFactory.instance(); UploadResult uploadResult = storage.uploadSuffix(inputStream, suffix, FileUtils.getMimeType(suffix)); - // 构建 SysOss 对象并保存数据库记录 + // 保存数据库记录 SysOss sysOss = new SysOss(); sysOss.setUrl(uploadResult.getUrl()); sysOss.setFileSuffix(suffix); sysOss.setFileName(uploadResult.getFilename()); - sysOss.setOriginalName(originalfileName); + sysOss.setOriginalName(originalFileName); sysOss.setService(storage.getConfigKey()); - - // 插入数据库 sysOssMapper.insert(sysOss); - // OCR 处理 & 替换图片链接 - String networkImageUrl = uploadResult.getUrl(); + // OCR处理 + String networkUrl = uploadResult.getUrl(); //⚠️ 注意:确保 URL 是公网可访问的,否则模型无法加载图片。 //另一种解决方案:使用base64 但是需要申请apikey , 使用demo会出现token超出长度问题。 - String imageUrlOCR = imageUrlOCR(networkImageUrl); - matcher.appendReplacement(sb, "![" + matcher.group(1) + imageUrlOCR + "](" + networkImageUrl + ")"); - } catch (IOException e) { - log.error("读取或上传图片失败", e); - matcher.appendReplacement(sb, matcher.group(0)); // 保留原图语法 - } + String ocrResult = safeImageUrlOCR(networkUrl); + return String.format("![%s%s](%s)", imgMatch.altText, ocrResult, networkUrl); + } } else { - //多模态OCR识别图片内容 - String imageUrlOCR = imageUrlOCR(imageUrl); - matcher.appendReplacement(sb, "![" + matcher.group(1) + imageUrlOCR + "](" + imageUrl + ")"); + // 处理远程图片 + String ocrResult = safeImageUrlOCR(imgMatch.imageUrl); + return String.format("![%s%s](%s)", imgMatch.altText, ocrResult, imgMatch.imageUrl); } + } catch (Exception e) { + log.error("图片处理失败: {}", imgMatch.imageUrl, e); + return String.format("![%s](%s)", imgMatch.altText, imgMatch.imageUrl); } - matcher.appendTail(sb); - return sb; } + /** + * OCR调用 + * + * @param imageUrl 图片URL + * @return + */ + private String safeImageUrlOCR(String imageUrl) { + try { + return imageUrlOCR(imageUrl); + } catch (Exception e) { + log.warn("OCR处理失败: {}", imageUrl, e); + // OCR失败时返回空字符串 + return ""; + } + } + + /** * 多模态OCR识别图片内容 + * * @param imageUrl 图片URL * @return */ @@ -322,14 +394,16 @@ public class PdfMinerUFileLoader implements ResourceLoader { "请按以下逻辑处理图片:\n" + "1. 文字检测:识别图中所有可见文字(包括水印/标签),若无文字则跳至步骤3\n" + "2. 文字处理:\n" + - " a. 按出现顺序完整提取文字(非中文立即翻译)\n" + - " b. 用20字内总结核心信息,禁止补充解释\n" + + " a. 对识别到的文字进行❗核心信息提炼\n" + + " b. ❗禁止直接输出原文内容\n" + " c. 描述文字位置(如'顶部居中')、字体特征(颜色/大小)\n" + "3. 视觉描述:\n" + - " a. 客观说明主体对象、场景、色彩搭配与画面氛围\n" + + " a. 若无文字则用❗50字内简洁描述主体对象、场景、色彩搭配与画面氛围\n" + + " b. 若有文字则补充说明文字与画面的关系\n" + "4. 输出规则:\n" + " - 最终输出为纯文本,格式:'[文字总结] 视觉描述 关键词:xx,xx'\n" + - " - 关键词从内容中提取3个最具代表性的名词" + " - 关键词从内容中提取3个最具代表性的名词\n" + + " - 无文字时格式:'[空] 简洁描述 关键词:xx,xx'" ), ImageContent.from(imageUrl) ); @@ -338,15 +412,27 @@ public class PdfMinerUFileLoader implements ResourceLoader { return answer.text(); } + /** + * 静态内部类保存图片匹配信息 + */ + private static class ImageMatch { + String altText; // 替换文本 + String imageUrl; // 图片地址 + int start; // 匹配起始位置 + int end; // 匹配结束位置 + } + + /** * 清理输出目录 + * * @param outputPath 输出目录 */ private static void cleanOutputDirectory(Path outputPath) { if (Files.exists(outputPath)) { try { Files.walk(outputPath) - // 按逆序删除(子目录先删) + // 按逆序删除(子目录先删) .sorted((p1, p2) -> -p1.compareTo(p2)) .forEach(path -> { try {