From c1fc02894b1dc6f4689028128df0cc36b0b804d1 Mon Sep 17 00:00:00 2001 From: wangle Date: Mon, 13 Apr 2026 18:02:27 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=8F=91=E5=B8=833.0=E7=89=88=E6=9C=AC?= =?UTF-8?q?=EF=BC=8C=E6=96=B0=E5=A2=9E=E6=96=87=E6=A1=A3=E5=A4=84=E7=90=86?= =?UTF-8?q?=E8=83=BD=E5=8A=9B=E5=92=8C=E6=BC=94=E7=A4=BA=E6=A8=A1=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 升级langchain4j版本至1.13.0 - 新增docx/pdf/xlsx文档处理技能模块 - 添加演示模式配置和切面拦截 - 优化聊天服务和可观测性监听器 Co-Authored-By: Claude Opus 4.6 --- pom.xml | 4 +- .../java/org/ruoyi/RuoYiAIApplication.java | 4 +- .../src/main/resources/application-prod.yml | 3 +- .../src/main/resources/application.yml | 17 + .../main/resources/skills/docx/LICENSE.txt | 30 + .../src/main/resources/skills/docx/SKILL.md | 590 +++ .../resources/skills/docx/scripts/__init__.py | 1 + .../skills/docx/scripts/accept_changes.py | 135 + .../resources/skills/docx/scripts/comment.py | 318 ++ .../docx/scripts/office/helpers/__init__.py | 0 .../docx/scripts/office/helpers/merge_runs.py | 199 + .../office/helpers/simplify_redlines.py | 197 + .../skills/docx/scripts/office/pack.py | 159 + .../schemas/ISO-IEC29500-4_2016/dml-chart.xsd | 1499 ++++++ .../ISO-IEC29500-4_2016/dml-chartDrawing.xsd | 146 + .../ISO-IEC29500-4_2016/dml-diagram.xsd | 1085 ++++ .../ISO-IEC29500-4_2016/dml-lockedCanvas.xsd | 11 + .../schemas/ISO-IEC29500-4_2016/dml-main.xsd | 3081 ++++++++++++ .../ISO-IEC29500-4_2016/dml-picture.xsd | 23 + .../dml-spreadsheetDrawing.xsd | 185 + .../dml-wordprocessingDrawing.xsd | 287 ++ .../schemas/ISO-IEC29500-4_2016/pml.xsd | 1676 +++++++ .../shared-additionalCharacteristics.xsd | 28 + .../shared-bibliography.xsd | 144 + .../shared-commonSimpleTypes.xsd | 174 + .../shared-customXmlDataProperties.xsd | 25 + .../shared-customXmlSchemaProperties.xsd | 18 + .../shared-documentPropertiesCustom.xsd | 59 + .../shared-documentPropertiesExtended.xsd | 56 + 
.../shared-documentPropertiesVariantTypes.xsd | 195 + .../ISO-IEC29500-4_2016/shared-math.xsd | 582 +++ .../shared-relationshipReference.xsd | 25 + .../schemas/ISO-IEC29500-4_2016/sml.xsd | 4439 +++++++++++++++++ .../schemas/ISO-IEC29500-4_2016/vml-main.xsd | 570 +++ .../ISO-IEC29500-4_2016/vml-officeDrawing.xsd | 509 ++ .../vml-presentationDrawing.xsd | 12 + .../vml-spreadsheetDrawing.xsd | 108 + .../vml-wordprocessingDrawing.xsd | 96 + .../schemas/ISO-IEC29500-4_2016/wml.xsd | 3646 ++++++++++++++ .../schemas/ISO-IEC29500-4_2016/xml.xsd | 116 + .../ecma/fouth-edition/opc-contentTypes.xsd | 42 + .../ecma/fouth-edition/opc-coreProperties.xsd | 50 + .../schemas/ecma/fouth-edition/opc-digSig.xsd | 49 + .../ecma/fouth-edition/opc-relationships.xsd | 33 + .../docx/scripts/office/schemas/mce/mc.xsd | 75 + .../office/schemas/microsoft/wml-2010.xsd | 560 +++ .../office/schemas/microsoft/wml-2012.xsd | 67 + .../office/schemas/microsoft/wml-2018.xsd | 14 + .../office/schemas/microsoft/wml-cex-2018.xsd | 20 + .../office/schemas/microsoft/wml-cid-2016.xsd | 13 + .../microsoft/wml-sdtdatahash-2020.xsd | 4 + .../schemas/microsoft/wml-symex-2015.xsd | 8 + .../skills/docx/scripts/office/soffice.py | 183 + .../skills/docx/scripts/office/unpack.py | 132 + .../skills/docx/scripts/office/validate.py | 111 + .../scripts/office/validators/__init__.py | 15 + .../docx/scripts/office/validators/base.py | 847 ++++ .../docx/scripts/office/validators/docx.py | 446 ++ .../docx/scripts/office/validators/pptx.py | 275 + .../scripts/office/validators/redlining.py | 247 + .../docx/scripts/templates/comments.xml | 3 + .../scripts/templates/commentsExtended.xml | 3 + .../scripts/templates/commentsExtensible.xml | 3 + .../docx/scripts/templates/commentsIds.xml | 3 + .../skills/docx/scripts/templates/people.xml | 3 + .../src/main/resources/skills/pdf/LICENSE.txt | 30 + .../src/main/resources/skills/pdf/SKILL.md | 294 ++ .../src/main/resources/skills/pdf/forms.md | 205 + 
.../main/resources/skills/pdf/reference.md | 612 +++ .../pdf/scripts/check_bounding_boxes.py | 70 + .../pdf/scripts/check_bounding_boxes_test.py | 226 + .../pdf/scripts/check_fillable_fields.py | 12 + .../pdf/scripts/convert_pdf_to_images.py | 35 + .../pdf/scripts/create_validation_image.py | 41 + .../pdf/scripts/extract_form_field_info.py | 152 + .../pdf/scripts/fill_fillable_fields.py | 114 + .../scripts/fill_pdf_form_with_annotations.py | 108 + .../main/resources/skills/xlsx/LICENSE.txt | 30 + .../src/main/resources/skills/xlsx/SKILL.md | 362 ++ .../src/main/resources/skills/xlsx/recalc.py | 318 ++ .../domain/dto/request/AgentChatRequest.java | 26 + .../chat/domain/dto/request/ChatRequest.java | 30 +- .../security/config/SecurityConfig.java | 6 + .../common/sse/core/SseEmitterManager.java | 3 + .../org/ruoyi/common/sse/dto/SseEventDto.java | 17 + .../ruoyi/common/web/aspectj/DemoAspect.java | 86 + .../web/config/DemoAutoConfiguration.java | 25 + .../web/config/properties/DemoProperties.java | 32 + ...ot.autoconfigure.AutoConfiguration.imports | 1 + ruoyi-modules/ruoyi-chat/pom.xml | 22 + .../java/org/ruoyi/agent/SkillsAgent.java | 35 + .../java/org/ruoyi/agent/WebSearchAgent.java | 37 +- .../ruoyi/controller/chat/ChatController.java | 1 + .../ruoyi/observability/MyAgentListener.java | 34 +- .../observability/MyMcpClientListener.java | 181 +- .../ruoyi/observability/OutputChannel.java | 128 + .../observability/StreamingOutputWrapper.java | 213 + .../SupervisorStreamListener.java | 120 + .../service/chat/impl/ChatServiceFacade.java | 188 +- .../agent/StreamingAgentIntegrationTest.java | 313 ++ 100 files changed, 27576 insertions(+), 189 deletions(-) create mode 100644 ruoyi-admin/src/main/resources/skills/docx/LICENSE.txt create mode 100644 ruoyi-admin/src/main/resources/skills/docx/SKILL.md create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/__init__.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/accept_changes.py 
create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/comment.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/__init__.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/merge_runs.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/simplify_redlines.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/pack.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd create mode 
100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd create mode 100644 
ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/mce/mc.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/soffice.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/unpack.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/validate.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/__init__.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/base.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/docx.py create mode 100644 
ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/pptx.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/redlining.py create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/templates/comments.xml create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsExtended.xml create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsExtensible.xml create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsIds.xml create mode 100644 ruoyi-admin/src/main/resources/skills/docx/scripts/templates/people.xml create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/LICENSE.txt create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/SKILL.md create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/forms.md create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/reference.md create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/scripts/check_bounding_boxes.py create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/scripts/check_bounding_boxes_test.py create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/scripts/check_fillable_fields.py create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/scripts/convert_pdf_to_images.py create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/scripts/create_validation_image.py create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/scripts/extract_form_field_info.py create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/scripts/fill_fillable_fields.py create mode 100644 ruoyi-admin/src/main/resources/skills/pdf/scripts/fill_pdf_form_with_annotations.py create mode 100644 ruoyi-admin/src/main/resources/skills/xlsx/LICENSE.txt create mode 100644 ruoyi-admin/src/main/resources/skills/xlsx/SKILL.md create mode 100644 ruoyi-admin/src/main/resources/skills/xlsx/recalc.py create mode 100644 
ruoyi-common/ruoyi-common-chat/src/main/java/org/ruoyi/common/chat/domain/dto/request/AgentChatRequest.java create mode 100644 ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/aspectj/DemoAspect.java create mode 100644 ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/config/DemoAutoConfiguration.java create mode 100644 ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/config/properties/DemoProperties.java create mode 100644 ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/agent/SkillsAgent.java create mode 100644 ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/OutputChannel.java create mode 100644 ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/StreamingOutputWrapper.java create mode 100644 ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/SupervisorStreamListener.java create mode 100644 ruoyi-modules/ruoyi-chat/src/test/java/org/ruoyi/agent/StreamingAgentIntegrationTest.java diff --git a/pom.xml b/pom.xml index 434e5f6a..4441852f 100644 --- a/pom.xml +++ b/pom.xml @@ -54,8 +54,8 @@ 2.18.2 - 1.11.0 - 1.11.0-beta19 + 1.13.0 + 1.13.0-beta23 1.5.3 1.19.6 1.0.7 diff --git a/ruoyi-admin/src/main/java/org/ruoyi/RuoYiAIApplication.java b/ruoyi-admin/src/main/java/org/ruoyi/RuoYiAIApplication.java index ffca4ef5..441804bd 100644 --- a/ruoyi-admin/src/main/java/org/ruoyi/RuoYiAIApplication.java +++ b/ruoyi-admin/src/main/java/org/ruoyi/RuoYiAIApplication.java @@ -16,11 +16,11 @@ import java.net.ServerSocket; public class RuoYiAIApplication { public static void main(String[] args) { - // killPortProcess(6039); + killPortProcess(6039); SpringApplication application = new SpringApplication(RuoYiAIApplication.class); application.setApplicationStartup(new BufferingApplicationStartup(2048)); application.run(args); - System.out.println("(♥◠‿◠)ノ゙ RuoYi-AI启动成功 ლ(´ڡ`ლ)冢"); + System.out.println("(♥◠‿◠)ノ゙ RuoYi-AI启动成功 ლ(´ڡ`ლ)"); } /** diff --git 
a/ruoyi-admin/src/main/resources/application-prod.yml b/ruoyi-admin/src/main/resources/application-prod.yml index 7001224f..2708e5af 100644 --- a/ruoyi-admin/src/main/resources/application-prod.yml +++ b/ruoyi-admin/src/main/resources/application-prod.yml @@ -1,3 +1,4 @@ + --- # 监控中心配置 spring.boot.admin.client: # 增加客户端开关 @@ -58,7 +59,7 @@ spring: driverClassName: com.mysql.cj.jdbc.Driver # jdbc 所有参数配置参考 https://lionli.blog.csdn.net/article/details/122018562 # rewriteBatchedStatements=true 批处理优化 大幅提升批量插入更新删除性能(对数据库有性能损耗 使用批量操作应考虑性能问题) - url: jdbc:mysql://127.0.0.1:3306/ruoyi-ai-agent?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true + url: jdbc:mysql://127.0.0.1:3306/ruoyi-ai?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true username: root password: root # agent: diff --git a/ruoyi-admin/src/main/resources/application.yml b/ruoyi-admin/src/main/resources/application.yml index 269da7ae..009cedfc 100644 --- a/ruoyi-admin/src/main/resources/application.yml +++ b/ruoyi-admin/src/main/resources/application.yml @@ -260,6 +260,23 @@ websocket: # 设置访问源地址 allowedOrigins: '*' +--- # 演示模式配置 +demo: + # 是否开启演示模式(开启后所有写操作将被拦截) + enabled: false + # 提示消息 + message: "演示模式,不允许进行写操作" + # 排除的路径(这些路径不受演示模式限制) + excludes: + - /login + - /logout + - /register + - /captcha/** + - /auth/** + - /chat/send + - /system/session/** + - /system/message/** + --- # warm-flow工作流配置 warm-flow: # 是否开启工作流,默认true diff --git a/ruoyi-admin/src/main/resources/skills/docx/LICENSE.txt b/ruoyi-admin/src/main/resources/skills/docx/LICENSE.txt new file mode 100644 index 00000000..c55ab422 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 
Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/ruoyi-admin/src/main/resources/skills/docx/SKILL.md b/ruoyi-admin/src/main/resources/skills/docx/SKILL.md new file mode 100644 index 00000000..2951e559 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/SKILL.md @@ -0,0 +1,590 @@ +--- +name: docx +description: "Use this skill whenever the user wants to create, read, edit, or manipulate Word documents (.docx files). 
Triggers include: any mention of 'Word doc', 'word document', '.docx', or requests to produce professional documents with formatting like tables of contents, headings, page numbers, or letterheads. Also use when extracting or reorganizing content from .docx files, inserting or replacing images in documents, performing find-and-replace in Word files, working with tracked changes or comments, or converting content into a polished Word document. If the user asks for a 'report', 'memo', 'letter', 'template', or similar deliverable as a Word or .docx file, use this skill. Do NOT use for PDFs, spreadsheets, Google Docs, or general coding tasks unrelated to document generation." +license: Proprietary. LICENSE.txt has complete terms +--- + +# DOCX creation, editing, and analysis + +## Overview + +A .docx file is a ZIP archive containing XML files. + +## Quick Reference + +| Task | Approach | +|------|----------| +| Read/analyze content | `pandoc` or unpack for raw XML | +| Create new document | Use `docx-js` - see Creating New Documents below | +| Edit existing document | Unpack → edit XML → repack - see Editing Existing Documents below | + +### Converting .doc to .docx + +Legacy `.doc` files must be converted before editing: + +```bash +python scripts/office/soffice.py --headless --convert-to docx document.doc +``` + +### Reading Content + +```bash +# Text extraction with tracked changes +pandoc --track-changes=all document.docx -o output.md + +# Raw XML access +python scripts/office/unpack.py document.docx unpacked/ +``` + +### Converting to Images + +```bash +python scripts/office/soffice.py --headless --convert-to pdf document.docx +pdftoppm -jpeg -r 150 document.pdf page +``` + +### Accepting Tracked Changes + +To produce a clean document with all tracked changes accepted (requires LibreOffice): + +```bash +python scripts/accept_changes.py input.docx output.docx +``` + +--- + +## Creating New Documents + +Generate .docx files with JavaScript, then validate. 
Install: `npm install -g docx` + +### Setup +```javascript +const { Document, Packer, Paragraph, TextRun, Table, TableRow, TableCell, ImageRun, + Header, Footer, AlignmentType, PageOrientation, LevelFormat, ExternalHyperlink, + InternalHyperlink, Bookmark, FootnoteReferenceRun, PositionalTab, + PositionalTabAlignment, PositionalTabRelativeTo, PositionalTabLeader, + TabStopType, TabStopPosition, Column, SectionType, + TableOfContents, HeadingLevel, BorderStyle, WidthType, ShadingType, + VerticalAlign, PageNumber, PageBreak } = require('docx'); + +const doc = new Document({ sections: [{ children: [/* content */] }] }); +Packer.toBuffer(doc).then(buffer => fs.writeFileSync("doc.docx", buffer)); +``` + +### Validation +After creating the file, validate it. If validation fails, unpack, fix the XML, and repack. +```bash +python scripts/office/validate.py doc.docx +``` + +### Page Size + +```javascript +// CRITICAL: docx-js defaults to A4, not US Letter +// Always set page size explicitly for consistent results +sections: [{ + properties: { + page: { + size: { + width: 12240, // 8.5 inches in DXA + height: 15840 // 11 inches in DXA + }, + margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 } // 1 inch margins + } + }, + children: [/* content */] +}] +``` + +**Common page sizes (DXA units, 1440 DXA = 1 inch):** + +| Paper | Width | Height | Content Width (1" margins) | +|-------|-------|--------|---------------------------| +| US Letter | 12,240 | 15,840 | 9,360 | +| A4 (default) | 11,906 | 16,838 | 9,026 | + +**Landscape orientation:** docx-js swaps width/height internally, so pass portrait dimensions and let it handle the swap: +```javascript +size: { + width: 12240, // Pass SHORT edge as width + height: 15840, // Pass LONG edge as height + orientation: PageOrientation.LANDSCAPE // docx-js swaps them in the XML +}, +// Content width = 15840 - left margin - right margin (uses the long edge) +``` + +### Styles (Override Built-in Headings) + +Use Arial as the default 
font (universally supported). Keep titles black for readability. + +```javascript +const doc = new Document({ + styles: { + default: { document: { run: { font: "Arial", size: 24 } } }, // 12pt default + paragraphStyles: [ + // IMPORTANT: Use exact IDs to override built-in styles + { id: "Heading1", name: "Heading 1", basedOn: "Normal", next: "Normal", quickFormat: true, + run: { size: 32, bold: true, font: "Arial" }, + paragraph: { spacing: { before: 240, after: 240 }, outlineLevel: 0 } }, // outlineLevel required for TOC + { id: "Heading2", name: "Heading 2", basedOn: "Normal", next: "Normal", quickFormat: true, + run: { size: 28, bold: true, font: "Arial" }, + paragraph: { spacing: { before: 180, after: 180 }, outlineLevel: 1 } }, + ] + }, + sections: [{ + children: [ + new Paragraph({ heading: HeadingLevel.HEADING_1, children: [new TextRun("Title")] }), + ] + }] +}); +``` + +### Lists (NEVER use unicode bullets) + +```javascript +// ❌ WRONG - never manually insert bullet characters +new Paragraph({ children: [new TextRun("• Item")] }) // BAD +new Paragraph({ children: [new TextRun("\u2022 Item")] }) // BAD + +// ✅ CORRECT - use numbering config with LevelFormat.BULLET +const doc = new Document({ + numbering: { + config: [ + { reference: "bullets", + levels: [{ level: 0, format: LevelFormat.BULLET, text: "•", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] }, + { reference: "numbers", + levels: [{ level: 0, format: LevelFormat.DECIMAL, text: "%1.", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] }, + ] + }, + sections: [{ + children: [ + new Paragraph({ numbering: { reference: "bullets", level: 0 }, + children: [new TextRun("Bullet item")] }), + new Paragraph({ numbering: { reference: "numbers", level: 0 }, + children: [new TextRun("Numbered item")] }), + ] + }] +}); + +// ⚠️ Each reference creates INDEPENDENT numbering +// Same reference = continues (1,2,3 then 
4,5,6) +// Different reference = restarts (1,2,3 then 1,2,3) +``` + +### Tables + +**CRITICAL: Tables need dual widths** - set both `columnWidths` on the table AND `width` on each cell. Without both, tables render incorrectly on some platforms. + +```javascript +// CRITICAL: Always set table width for consistent rendering +// CRITICAL: Use ShadingType.CLEAR (not SOLID) to prevent black backgrounds +const border = { style: BorderStyle.SINGLE, size: 1, color: "CCCCCC" }; +const borders = { top: border, bottom: border, left: border, right: border }; + +new Table({ + width: { size: 9360, type: WidthType.DXA }, // Always use DXA (percentages break in Google Docs) + columnWidths: [4680, 4680], // Must sum to table width (DXA: 1440 = 1 inch) + rows: [ + new TableRow({ + children: [ + new TableCell({ + borders, + width: { size: 4680, type: WidthType.DXA }, // Also set on each cell + shading: { fill: "D5E8F0", type: ShadingType.CLEAR }, // CLEAR not SOLID + margins: { top: 80, bottom: 80, left: 120, right: 120 }, // Cell padding (internal, not added to width) + children: [new Paragraph({ children: [new TextRun("Cell")] })] + }) + ] + }) + ] +}) +``` + +**Table width calculation:** + +Always use `WidthType.DXA` — `WidthType.PERCENTAGE` breaks in Google Docs. 
+ +```javascript +// Table width = sum of columnWidths = content width +// US Letter with 1" margins: 12240 - 2880 = 9360 DXA +width: { size: 9360, type: WidthType.DXA }, +columnWidths: [7000, 2360] // Must sum to table width +``` + +**Width rules:** +- **Always use `WidthType.DXA`** — never `WidthType.PERCENTAGE` (incompatible with Google Docs) +- Table width must equal the sum of `columnWidths` +- Cell `width` must match corresponding `columnWidth` +- Cell `margins` are internal padding - they reduce content area, not add to cell width +- For full-width tables: use content width (page width minus left and right margins) + +### Images + +```javascript +// CRITICAL: type parameter is REQUIRED +new Paragraph({ + children: [new ImageRun({ + type: "png", // Required: png, jpg, jpeg, gif, bmp, svg + data: fs.readFileSync("image.png"), + transformation: { width: 200, height: 150 }, + altText: { title: "Title", description: "Desc", name: "Name" } // All three required + })] +}) +``` + +### Page Breaks + +```javascript +// CRITICAL: PageBreak must be inside a Paragraph +new Paragraph({ children: [new PageBreak()] }) + +// Or use pageBreakBefore +new Paragraph({ pageBreakBefore: true, children: [new TextRun("New page")] }) +``` + +### Hyperlinks + +```javascript +// External link +new Paragraph({ + children: [new ExternalHyperlink({ + children: [new TextRun({ text: "Click here", style: "Hyperlink" })], + link: "https://example.com", + })] +}) + +// Internal link (bookmark + reference) +// 1. Create bookmark at destination +new Paragraph({ heading: HeadingLevel.HEADING_1, children: [ + new Bookmark({ id: "chapter1", children: [new TextRun("Chapter 1")] }), +]}) +// 2. 
Link to it +new Paragraph({ children: [new InternalHyperlink({ + children: [new TextRun({ text: "See Chapter 1", style: "Hyperlink" })], + anchor: "chapter1", +})]}) +``` + +### Footnotes + +```javascript +const doc = new Document({ + footnotes: { + 1: { children: [new Paragraph("Source: Annual Report 2024")] }, + 2: { children: [new Paragraph("See appendix for methodology")] }, + }, + sections: [{ + children: [new Paragraph({ + children: [ + new TextRun("Revenue grew 15%"), + new FootnoteReferenceRun(1), + new TextRun(" using adjusted metrics"), + new FootnoteReferenceRun(2), + ], + })] + }] +}); +``` + +### Tab Stops + +```javascript +// Right-align text on same line (e.g., date opposite a title) +new Paragraph({ + children: [ + new TextRun("Company Name"), + new TextRun("\tJanuary 2025"), + ], + tabStops: [{ type: TabStopType.RIGHT, position: TabStopPosition.MAX }], +}) + +// Dot leader (e.g., TOC-style) +new Paragraph({ + children: [ + new TextRun("Introduction"), + new TextRun({ children: [ + new PositionalTab({ + alignment: PositionalTabAlignment.RIGHT, + relativeTo: PositionalTabRelativeTo.MARGIN, + leader: PositionalTabLeader.DOT, + }), + "3", + ]}), + ], +}) +``` + +### Multi-Column Layouts + +```javascript +// Equal-width columns +sections: [{ + properties: { + column: { + count: 2, // number of columns + space: 720, // gap between columns in DXA (720 = 0.5 inch) + equalWidth: true, + separate: true, // vertical line between columns + }, + }, + children: [/* content flows naturally across columns */] +}] + +// Custom-width columns (equalWidth must be false) +sections: [{ + properties: { + column: { + equalWidth: false, + children: [ + new Column({ width: 5400, space: 720 }), + new Column({ width: 3240 }), + ], + }, + }, + children: [/* content */] +}] +``` + +Force a column break with a new section using `type: SectionType.NEXT_COLUMN`. 
+ +### Table of Contents + +```javascript +// CRITICAL: Headings must use HeadingLevel ONLY - no custom styles +new TableOfContents("Table of Contents", { hyperlink: true, headingStyleRange: "1-3" }) +``` + +### Headers/Footers + +```javascript +sections: [{ + properties: { + page: { margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 } } // 1440 = 1 inch + }, + headers: { + default: new Header({ children: [new Paragraph({ children: [new TextRun("Header")] })] }) + }, + footers: { + default: new Footer({ children: [new Paragraph({ + children: [new TextRun("Page "), new TextRun({ children: [PageNumber.CURRENT] })] + })] }) + }, + children: [/* content */] +}] +``` + +### Critical Rules for docx-js + +- **Set page size explicitly** - docx-js defaults to A4; use US Letter (12240 x 15840 DXA) for US documents +- **Landscape: pass portrait dimensions** - docx-js swaps width/height internally; pass short edge as `width`, long edge as `height`, and set `orientation: PageOrientation.LANDSCAPE` +- **Never use `\n`** - use separate Paragraph elements +- **Never use unicode bullets** - use `LevelFormat.BULLET` with numbering config +- **PageBreak must be in Paragraph** - standalone creates invalid XML +- **ImageRun requires `type`** - always specify png/jpg/etc +- **Always set table `width` with DXA** - never use `WidthType.PERCENTAGE` (breaks in Google Docs) +- **Tables need dual widths** - `columnWidths` array AND cell `width`, both must match +- **Table width = sum of columnWidths** - for DXA, ensure they add up exactly +- **Always add cell margins** - use `margins: { top: 80, bottom: 80, left: 120, right: 120 }` for readable padding +- **Use `ShadingType.CLEAR`** - never SOLID for table shading +- **Never use tables as dividers/rules** - cells have minimum height and render as empty boxes (including in headers/footers); use `border: { bottom: { style: BorderStyle.SINGLE, size: 6, color: "2E75B6", space: 1 } }` on a Paragraph instead. 
For two-column footers, use tab stops (see Tab Stops section), not tables +- **TOC requires HeadingLevel only** - no custom styles on heading paragraphs +- **Override built-in styles** - use exact IDs: "Heading1", "Heading2", etc. +- **Include `outlineLevel`** - required for TOC (0 for H1, 1 for H2, etc.) + +--- + +## Editing Existing Documents + +**Follow all 3 steps in order.** + +### Step 1: Unpack +```bash +python scripts/office/unpack.py document.docx unpacked/ +``` +Extracts XML, pretty-prints, merges adjacent runs, and converts smart quotes to XML entities (`“` etc.) so they survive editing. Use `--merge-runs false` to skip run merging. + +### Step 2: Edit XML + +Edit files in `unpacked/word/`. See XML Reference below for patterns. + +**Use "Claude" as the author** for tracked changes and comments, unless the user explicitly requests use of a different name. + +**Use the Edit tool directly for string replacement. Do not write Python scripts.** Scripts introduce unnecessary complexity. The Edit tool shows exactly what is being replaced. + +**CRITICAL: Use smart quotes for new content.** When adding text with apostrophes or quotes, use XML entities to produce smart quotes: +```xml + +Here’s a quote: “Hello” +``` +| Entity | Character | +|--------|-----------| +| `‘` | ‘ (left single) | +| `’` | ’ (right single / apostrophe) | +| `“` | “ (left double) | +| `”` | ” (right double) | + +**Adding comments:** Use `comment.py` to handle boilerplate across multiple XML files (text must be pre-escaped XML): +```bash +python scripts/comment.py unpacked/ 0 "Comment text with & and ’" +python scripts/comment.py unpacked/ 1 "Reply text" --parent 0 # reply to comment 0 +python scripts/comment.py unpacked/ 0 "Text" --author "Custom Author" # custom author name +``` +Then add markers to document.xml (see Comments in XML Reference). 
+ +### Step 3: Pack +```bash +python scripts/office/pack.py unpacked/ output.docx --original document.docx +``` +Validates with auto-repair, condenses XML, and creates DOCX. Use `--validate false` to skip. + +**Auto-repair will fix:** +- `durableId` >= 0x7FFFFFFF (regenerates valid ID) +- Missing `xml:space="preserve"` on `` with whitespace + +**Auto-repair won't fix:** +- Malformed XML, invalid element nesting, missing relationships, schema violations + +### Common Pitfalls + +- **Replace entire `` elements**: When adding tracked changes, replace the whole `...` block with `......` as siblings. Don't inject tracked change tags inside a run. +- **Preserve `` formatting**: Copy the original run's `` block into your tracked change runs to maintain bold, font size, etc. + +--- + +## XML Reference + +### Schema Compliance + +- **Element order in ``**: ``, ``, ``, ``, ``, `` last +- **Whitespace**: Add `xml:space="preserve"` to `` with leading/trailing spaces +- **RSIDs**: Must be 8-digit hex (e.g., `00AB1234`) + +### Tracked Changes + +**Insertion:** +```xml + + inserted text + +``` + +**Deletion:** +```xml + + deleted text + +``` + +**Inside ``**: Use `` instead of ``, and `` instead of ``. + +**Minimal edits** - only mark what changes: +```xml + +The term is + + 30 + + + 60 + + days. +``` + +**Deleting entire paragraphs/list items** - when removing ALL content from a paragraph, also mark the paragraph mark as deleted so it merges with the next paragraph. Add `` inside ``: +```xml + + + ... + + + + + + Entire paragraph content being deleted... + + +``` +Without the `` in ``, accepting changes leaves an empty paragraph/list item. 
+ +**Rejecting another author's insertion** - nest deletion inside their insertion: +```xml + + + their inserted text + + +``` + +**Restoring another author's deletion** - add insertion after (don't modify their deletion): +```xml + + deleted text + + + deleted text + +``` + +### Comments + +After running `comment.py` (see Step 2), add markers to document.xml. For replies, use `--parent` flag and nest markers inside the parent's. + +**CRITICAL: `` and `` are siblings of ``, never inside ``.** + +```xml + + + + deleted + + more text + + + + + + + text + + + + +``` + +### Images + +1. Add image file to `word/media/` +2. Add relationship to `word/_rels/document.xml.rels`: +```xml + +``` +3. Add content type to `[Content_Types].xml`: +```xml + +``` +4. Reference in document.xml: +```xml + + + + + + + + + + + + +``` + +--- + +## Dependencies + +- **pandoc**: Text extraction +- **docx**: `npm install -g docx` (new documents) +- **LibreOffice**: PDF conversion (auto-configured for sandboxed environments via `scripts/office/soffice.py`) +- **Poppler**: `pdftoppm` for images diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/__init__.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/__init__.py @@ -0,0 +1 @@ + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/accept_changes.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/accept_changes.py new file mode 100644 index 00000000..8e363161 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/accept_changes.py @@ -0,0 +1,135 @@ +"""Accept all tracked changes in a DOCX file using LibreOffice. + +Requires LibreOffice (soffice) to be installed. 
+""" + +import argparse +import logging +import shutil +import subprocess +from pathlib import Path + +from office.soffice import get_soffice_env + +logger = logging.getLogger(__name__) + +LIBREOFFICE_PROFILE = "/tmp/libreoffice_docx_profile" +MACRO_DIR = f"{LIBREOFFICE_PROFILE}/user/basic/Standard" + +ACCEPT_CHANGES_MACRO = """ + + + Sub AcceptAllTrackedChanges() + Dim document As Object + Dim dispatcher As Object + + document = ThisComponent.CurrentController.Frame + dispatcher = createUnoService("com.sun.star.frame.DispatchHelper") + + dispatcher.executeDispatch(document, ".uno:AcceptAllTrackedChanges", "", 0, Array()) + ThisComponent.store() + ThisComponent.close(True) + End Sub +""" + + +def accept_changes( + input_file: str, + output_file: str, +) -> tuple[None, str]: + input_path = Path(input_file) + output_path = Path(output_file) + + if not input_path.exists(): + return None, f"Error: Input file not found: {input_file}" + + if not input_path.suffix.lower() == ".docx": + return None, f"Error: Input file is not a DOCX file: {input_file}" + + try: + output_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(input_path, output_path) + except Exception as e: + return None, f"Error: Failed to copy input file to output location: {e}" + + if not _setup_libreoffice_macro(): + return None, "Error: Failed to setup LibreOffice macro" + + cmd = [ + "soffice", + "--headless", + f"-env:UserInstallation=file://{LIBREOFFICE_PROFILE}", + "--norestore", + "vnd.sun.star.script:Standard.Module1.AcceptAllTrackedChanges?language=Basic&location=application", + str(output_path.absolute()), + ] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=30, + check=False, + env=get_soffice_env(), + ) + except subprocess.TimeoutExpired: + return ( + None, + f"Successfully accepted all tracked changes: {input_file} -> {output_file}", + ) + + if result.returncode != 0: + return None, f"Error: LibreOffice failed: {result.stderr}" + + return ( + 
None, + f"Successfully accepted all tracked changes: {input_file} -> {output_file}", + ) + + +def _setup_libreoffice_macro() -> bool: + macro_dir = Path(MACRO_DIR) + macro_file = macro_dir / "Module1.xba" + + if macro_file.exists() and "AcceptAllTrackedChanges" in macro_file.read_text(): + return True + + if not macro_dir.exists(): + subprocess.run( + [ + "soffice", + "--headless", + f"-env:UserInstallation=file://{LIBREOFFICE_PROFILE}", + "--terminate_after_init", + ], + capture_output=True, + timeout=10, + check=False, + env=get_soffice_env(), + ) + macro_dir.mkdir(parents=True, exist_ok=True) + + try: + macro_file.write_text(ACCEPT_CHANGES_MACRO) + return True + except Exception as e: + logger.warning(f"Failed to setup LibreOffice macro: {e}") + return False + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Accept all tracked changes in a DOCX file" + ) + parser.add_argument("input_file", help="Input DOCX file with tracked changes") + parser.add_argument( + "output_file", help="Output DOCX file (clean, no tracked changes)" + ) + args = parser.parse_args() + + _, message = accept_changes(args.input_file, args.output_file) + print(message) + + if "Error" in message: + raise SystemExit(1) diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/comment.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/comment.py new file mode 100644 index 00000000..36e1c935 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/comment.py @@ -0,0 +1,318 @@ +"""Add comments to DOCX documents. + +Usage: + python comment.py unpacked/ 0 "Comment text" + python comment.py unpacked/ 1 "Reply text" --parent 0 + +Text should be pre-escaped XML (e.g., & for &, ’ for smart quotes). + +After running, add markers to document.xml: + + ... commented content ... 
+ + +""" + +import argparse +import random +import shutil +import sys +from datetime import datetime, timezone +from pathlib import Path + +import defusedxml.minidom + +TEMPLATE_DIR = Path(__file__).parent / "templates" +NS = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "w14": "http://schemas.microsoft.com/office/word/2010/wordml", + "w15": "http://schemas.microsoft.com/office/word/2012/wordml", + "w16cid": "http://schemas.microsoft.com/office/word/2016/wordml/cid", + "w16cex": "http://schemas.microsoft.com/office/word/2018/wordml/cex", +} + +COMMENT_XML = """\ + + + + + + + + + + + + + {text} + + +""" + +COMMENT_MARKER_TEMPLATE = """ +Add to document.xml (markers must be direct children of w:p, never inside w:r): + + ... + + """ + +REPLY_MARKER_TEMPLATE = """ +Nest markers inside parent {pid}'s markers (markers must be direct children of w:p, never inside w:r): + + ... + + + """ + + +def _generate_hex_id() -> str: + return f"{random.randint(0, 0x7FFFFFFE):08X}" + + +SMART_QUOTE_ENTITIES = { + "\u201c": "“", + "\u201d": "”", + "\u2018": "‘", + "\u2019": "’", +} + + +def _encode_smart_quotes(text: str) -> str: + for char, entity in SMART_QUOTE_ENTITIES.items(): + text = text.replace(char, entity) + return text + + +def _append_xml(xml_path: Path, root_tag: str, content: str) -> None: + dom = defusedxml.minidom.parseString(xml_path.read_text(encoding="utf-8")) + root = dom.getElementsByTagName(root_tag)[0] + ns_attrs = " ".join(f'xmlns:{k}="{v}"' for k, v in NS.items()) + wrapper_dom = defusedxml.minidom.parseString(f"{content}") + for child in wrapper_dom.documentElement.childNodes: + if child.nodeType == child.ELEMENT_NODE: + root.appendChild(dom.importNode(child, True)) + output = _encode_smart_quotes(dom.toxml(encoding="UTF-8").decode("utf-8")) + xml_path.write_text(output, encoding="utf-8") + + +def _find_para_id(comments_path: Path, comment_id: int) -> str | None: + dom = 
defusedxml.minidom.parseString(comments_path.read_text(encoding="utf-8")) + for c in dom.getElementsByTagName("w:comment"): + if c.getAttribute("w:id") == str(comment_id): + for p in c.getElementsByTagName("w:p"): + if pid := p.getAttribute("w14:paraId"): + return pid + return None + + +def _get_next_rid(rels_path: Path) -> int: + dom = defusedxml.minidom.parseString(rels_path.read_text(encoding="utf-8")) + max_rid = 0 + for rel in dom.getElementsByTagName("Relationship"): + rid = rel.getAttribute("Id") + if rid and rid.startswith("rId"): + try: + max_rid = max(max_rid, int(rid[3:])) + except ValueError: + pass + return max_rid + 1 + + +def _has_relationship(rels_path: Path, target: str) -> bool: + dom = defusedxml.minidom.parseString(rels_path.read_text(encoding="utf-8")) + for rel in dom.getElementsByTagName("Relationship"): + if rel.getAttribute("Target") == target: + return True + return False + + +def _has_content_type(ct_path: Path, part_name: str) -> bool: + dom = defusedxml.minidom.parseString(ct_path.read_text(encoding="utf-8")) + for override in dom.getElementsByTagName("Override"): + if override.getAttribute("PartName") == part_name: + return True + return False + + +def _ensure_comment_relationships(unpacked_dir: Path) -> None: + rels_path = unpacked_dir / "word" / "_rels" / "document.xml.rels" + if not rels_path.exists(): + return + + if _has_relationship(rels_path, "comments.xml"): + return + + dom = defusedxml.minidom.parseString(rels_path.read_text(encoding="utf-8")) + root = dom.documentElement + next_rid = _get_next_rid(rels_path) + + rels = [ + ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments", + "comments.xml", + ), + ( + "http://schemas.microsoft.com/office/2011/relationships/commentsExtended", + "commentsExtended.xml", + ), + ( + "http://schemas.microsoft.com/office/2016/09/relationships/commentsIds", + "commentsIds.xml", + ), + ( + "http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible", 
+ "commentsExtensible.xml", + ), + ] + + for rel_type, target in rels: + rel = dom.createElement("Relationship") + rel.setAttribute("Id", f"rId{next_rid}") + rel.setAttribute("Type", rel_type) + rel.setAttribute("Target", target) + root.appendChild(rel) + next_rid += 1 + + rels_path.write_bytes(dom.toxml(encoding="UTF-8")) + + +def _ensure_comment_content_types(unpacked_dir: Path) -> None: + ct_path = unpacked_dir / "[Content_Types].xml" + if not ct_path.exists(): + return + + if _has_content_type(ct_path, "/word/comments.xml"): + return + + dom = defusedxml.minidom.parseString(ct_path.read_text(encoding="utf-8")) + root = dom.documentElement + + overrides = [ + ( + "/word/comments.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml", + ), + ( + "/word/commentsExtended.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtended+xml", + ), + ( + "/word/commentsIds.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsIds+xml", + ), + ( + "/word/commentsExtensible.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtensible+xml", + ), + ] + + for part_name, content_type in overrides: + override = dom.createElement("Override") + override.setAttribute("PartName", part_name) + override.setAttribute("ContentType", content_type) + root.appendChild(override) + + ct_path.write_bytes(dom.toxml(encoding="UTF-8")) + + +def add_comment( + unpacked_dir: str, + comment_id: int, + text: str, + author: str = "Claude", + initials: str = "C", + parent_id: int | None = None, +) -> tuple[str, str]: + word = Path(unpacked_dir) / "word" + if not word.exists(): + return "", f"Error: {word} not found" + + para_id, durable_id = _generate_hex_id(), _generate_hex_id() + ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + comments = word / "comments.xml" + first_comment = not comments.exists() + if first_comment: + shutil.copy(TEMPLATE_DIR / "comments.xml", 
comments) + _ensure_comment_relationships(Path(unpacked_dir)) + _ensure_comment_content_types(Path(unpacked_dir)) + _append_xml( + comments, + "w:comments", + COMMENT_XML.format( + id=comment_id, + author=author, + date=ts, + initials=initials, + para_id=para_id, + text=text, + ), + ) + + ext = word / "commentsExtended.xml" + if not ext.exists(): + shutil.copy(TEMPLATE_DIR / "commentsExtended.xml", ext) + if parent_id is not None: + parent_para = _find_para_id(comments, parent_id) + if not parent_para: + return "", f"Error: Parent comment {parent_id} not found" + _append_xml( + ext, + "w15:commentsEx", + f'', + ) + else: + _append_xml( + ext, + "w15:commentsEx", + f'', + ) + + ids = word / "commentsIds.xml" + if not ids.exists(): + shutil.copy(TEMPLATE_DIR / "commentsIds.xml", ids) + _append_xml( + ids, + "w16cid:commentsIds", + f'', + ) + + extensible = word / "commentsExtensible.xml" + if not extensible.exists(): + shutil.copy(TEMPLATE_DIR / "commentsExtensible.xml", extensible) + _append_xml( + extensible, + "w16cex:commentsExtensible", + f'', + ) + + action = "reply" if parent_id is not None else "comment" + return para_id, f"Added {action} {comment_id} (para_id={para_id})" + + +if __name__ == "__main__": + p = argparse.ArgumentParser(description="Add comments to DOCX documents") + p.add_argument("unpacked_dir", help="Unpacked DOCX directory") + p.add_argument("comment_id", type=int, help="Comment ID (must be unique)") + p.add_argument("text", help="Comment text") + p.add_argument("--author", default="Claude", help="Author name") + p.add_argument("--initials", default="C", help="Author initials") + p.add_argument("--parent", type=int, help="Parent comment ID (for replies)") + args = p.parse_args() + + para_id, msg = add_comment( + args.unpacked_dir, + args.comment_id, + args.text, + args.author, + args.initials, + args.parent, + ) + print(msg) + if "Error" in msg: + sys.exit(1) + cid = args.comment_id + if args.parent is not None: + 
print(REPLY_MARKER_TEMPLATE.format(pid=args.parent, cid=cid)) + else: + print(COMMENT_MARKER_TEMPLATE.format(cid=cid)) diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/__init__.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/merge_runs.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/merge_runs.py new file mode 100644 index 00000000..ad7c25ee --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/merge_runs.py @@ -0,0 +1,199 @@ +"""Merge adjacent runs with identical formatting in DOCX. + +Merges adjacent elements that have identical properties. +Works on runs in paragraphs and inside tracked changes (, ). + +Also: +- Removes rsid attributes from runs (revision metadata that doesn't affect rendering) +- Removes proofErr elements (spell/grammar markers that block merging) +""" + +from pathlib import Path + +import defusedxml.minidom + + +def merge_runs(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + _remove_elements(root, "proofErr") + _strip_run_rsid_attrs(root) + + containers = {run.parentNode for run in _find_elements(root, "r")} + + merge_count = 0 + for container in containers: + merge_count += _merge_runs_in(container) + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Merged {merge_count} runs" + + except Exception as e: + return 0, f"Error: {e}" + + + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + 
results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def _get_child(parent, tag: str): + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + return child + return None + + +def _get_children(parent, tag: str) -> list: + results = [] + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(child) + return results + + +def _is_adjacent(elem1, elem2) -> bool: + node = elem1.nextSibling + while node: + if node == elem2: + return True + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + return False + + + + +def _remove_elements(root, tag: str): + for elem in _find_elements(root, tag): + if elem.parentNode: + elem.parentNode.removeChild(elem) + + +def _strip_run_rsid_attrs(root): + for run in _find_elements(root, "r"): + for attr in list(run.attributes.values()): + if "rsid" in attr.name.lower(): + run.removeAttribute(attr.name) + + + + +def _merge_runs_in(container) -> int: + merge_count = 0 + run = _first_child_run(container) + + while run: + while True: + next_elem = _next_element_sibling(run) + if next_elem and _is_run(next_elem) and _can_merge(run, next_elem): + _merge_run_content(run, next_elem) + container.removeChild(next_elem) + merge_count += 1 + else: + break + + _consolidate_text(run) + run = _next_sibling_run(run) + + return merge_count + + +def _first_child_run(container): + for child in container.childNodes: + if child.nodeType == child.ELEMENT_NODE and _is_run(child): + return child + return None + + +def _next_element_sibling(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + return sibling + sibling = 
sibling.nextSibling + return None + + +def _next_sibling_run(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + if _is_run(sibling): + return sibling + sibling = sibling.nextSibling + return None + + +def _is_run(node) -> bool: + name = node.localName or node.tagName + return name == "r" or name.endswith(":r") + + +def _can_merge(run1, run2) -> bool: + rpr1 = _get_child(run1, "rPr") + rpr2 = _get_child(run2, "rPr") + + if (rpr1 is None) != (rpr2 is None): + return False + if rpr1 is None: + return True + return rpr1.toxml() == rpr2.toxml() + + +def _merge_run_content(target, source): + for child in list(source.childNodes): + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name != "rPr" and not name.endswith(":rPr"): + target.appendChild(child) + + +def _consolidate_text(run): + t_elements = _get_children(run, "t") + + for i in range(len(t_elements) - 1, 0, -1): + curr, prev = t_elements[i], t_elements[i - 1] + + if _is_adjacent(prev, curr): + prev_text = prev.firstChild.data if prev.firstChild else "" + curr_text = curr.firstChild.data if curr.firstChild else "" + merged = prev_text + curr_text + + if prev.firstChild: + prev.firstChild.data = merged + else: + prev.appendChild(run.ownerDocument.createTextNode(merged)) + + if merged.startswith(" ") or merged.endswith(" "): + prev.setAttribute("xml:space", "preserve") + elif prev.hasAttribute("xml:space"): + prev.removeAttribute("xml:space") + + run.removeChild(curr) diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/simplify_redlines.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/simplify_redlines.py new file mode 100644 index 00000000..db963bb9 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/helpers/simplify_redlines.py @@ -0,0 +1,197 @@ +"""Simplify tracked changes by merging adjacent w:ins or w:del elements. 
+ +Merges adjacent elements from the same author into a single element. +Same for elements. This makes heavily-redlined documents easier to +work with by reducing the number of tracked change wrappers. + +Rules: +- Only merges w:ins with w:ins, w:del with w:del (same element type) +- Only merges if same author (ignores timestamp differences) +- Only merges if truly adjacent (only whitespace between them) +""" + +import xml.etree.ElementTree as ET +import zipfile +from pathlib import Path + +import defusedxml.minidom + +WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + +def simplify_redlines(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + merge_count = 0 + + containers = _find_elements(root, "p") + _find_elements(root, "tc") + + for container in containers: + merge_count += _merge_tracked_changes_in(container, "ins") + merge_count += _merge_tracked_changes_in(container, "del") + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Simplified {merge_count} tracked changes" + + except Exception as e: + return 0, f"Error: {e}" + + +def _merge_tracked_changes_in(container, tag: str) -> int: + merge_count = 0 + + tracked = [ + child + for child in container.childNodes + if child.nodeType == child.ELEMENT_NODE and _is_element(child, tag) + ] + + if len(tracked) < 2: + return 0 + + i = 0 + while i < len(tracked) - 1: + curr = tracked[i] + next_elem = tracked[i + 1] + + if _can_merge_tracked(curr, next_elem): + _merge_tracked_content(curr, next_elem) + container.removeChild(next_elem) + tracked.pop(i + 1) + merge_count += 1 + else: + i += 1 + + return merge_count + + +def _is_element(node, tag: str) -> bool: + name = node.localName or node.tagName + return name == tag or name.endswith(f":{tag}") + + 
+def _get_author(elem) -> str: + author = elem.getAttribute("w:author") + if not author: + for attr in elem.attributes.values(): + if attr.localName == "author" or attr.name.endswith(":author"): + return attr.value + return author + + +def _can_merge_tracked(elem1, elem2) -> bool: + if _get_author(elem1) != _get_author(elem2): + return False + + node = elem1.nextSibling + while node and node != elem2: + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + + return True + + +def _merge_tracked_content(target, source): + while source.firstChild: + child = source.firstChild + source.removeChild(child) + target.appendChild(child) + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]: + if not doc_xml_path.exists(): + return {} + + try: + tree = ET.parse(doc_xml_path) + root = tree.getroot() + except ET.ParseError: + return {} + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + + return authors + + +def _get_authors_from_docx(docx_path: Path) -> dict[str, int]: + try: + with zipfile.ZipFile(docx_path, "r") as zf: + if "word/document.xml" not in zf.namelist(): + return {} + with zf.open("word/document.xml") as f: + tree = ET.parse(f) + root = tree.getroot() + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in 
root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + return authors + except (zipfile.BadZipFile, ET.ParseError): + return {} + + +def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str: + modified_xml = modified_dir / "word" / "document.xml" + modified_authors = get_tracked_change_authors(modified_xml) + + if not modified_authors: + return default + + original_authors = _get_authors_from_docx(original_docx) + + new_changes: dict[str, int] = {} + for author, count in modified_authors.items(): + original_count = original_authors.get(author, 0) + diff = count - original_count + if diff > 0: + new_changes[author] = diff + + if not new_changes: + return default + + if len(new_changes) == 1: + return next(iter(new_changes)) + + raise ValueError( + f"Multiple authors added new changes: {new_changes}. " + "Cannot infer which author to validate." + ) diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/pack.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/pack.py new file mode 100644 index 00000000..db29ed8b --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/pack.py @@ -0,0 +1,159 @@ +"""Pack a directory into a DOCX, PPTX, or XLSX file. + +Validates with auto-repair, condenses XML formatting, and creates the Office file. 
+ +Usage: + python pack.py [--original ] [--validate true|false] + +Examples: + python pack.py unpacked/ output.docx --original input.docx + python pack.py unpacked/ output.pptx --validate false +""" + +import argparse +import sys +import shutil +import tempfile +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + +def pack( + input_directory: str, + output_file: str, + original_file: str | None = None, + validate: bool = True, + infer_author_func=None, +) -> tuple[None, str]: + input_dir = Path(input_directory) + output_path = Path(output_file) + suffix = output_path.suffix.lower() + + if not input_dir.is_dir(): + return None, f"Error: {input_dir} is not a directory" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {output_file} must be a .docx, .pptx, or .xlsx file" + + if validate and original_file: + original_path = Path(original_file) + if original_path.exists(): + success, output = _run_validation( + input_dir, original_path, suffix, infer_author_func + ) + if output: + print(output) + if not success: + return None, f"Error: Validation failed for {input_dir}" + + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + _condense_xml(xml_file) + + output_path.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + return None, f"Successfully packed {input_dir} to {output_file}" + + +def _run_validation( + unpacked_dir: Path, + original_file: Path, + suffix: str, + infer_author_func=None, +) -> tuple[bool, str | None]: + output_lines = [] + validators = [] + + if suffix == ".docx": + author = 
"Claude" + if infer_author_func: + try: + author = infer_author_func(unpacked_dir, original_file) + except ValueError as e: + print(f"Warning: {e} Using default author 'Claude'.", file=sys.stderr) + + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file), + RedliningValidator(unpacked_dir, original_file, author=author), + ] + elif suffix == ".pptx": + validators = [PPTXSchemaValidator(unpacked_dir, original_file)] + + if not validators: + return True, None + + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + output_lines.append(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + output_lines.append("All validations PASSED!") + + return success, "\n".join(output_lines) if output_lines else None + + +def _condense_xml(xml_file: Path) -> None: + try: + with open(xml_file, encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + for element in dom.getElementsByTagName("*"): + if element.tagName.endswith(":t"): + continue + + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + except Exception as e: + print(f"ERROR: Failed to parse {xml_file.name}: {e}", file=sys.stderr) + raise + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Pack a directory into a DOCX, PPTX, or XLSX file" + ) + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument( + "--original", + help="Original file for validation comparison", + ) + parser.add_argument( + "--validate", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Run validation with auto-repair (default: true)", + ) + args = 
parser.parse_args() + + _, message = pack( + args.input_directory, + args.output_file, + original_file=args.original, + validate=args.validate, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 00000000..6454ef9a --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 00000000..afa4f463 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 00000000..64e66b8a --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 00000000..687eea82 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 00000000..6ac81b06 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 
00000000..1dbf0514 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 00000000..f1af17db --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 00000000..0a185ab6 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 00000000..14ef4888 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 00000000..c20f3bf1 --- /dev/null +++ 
b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 00000000..ac602522 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 00000000..424b8ba8 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 index 00000000..2bddce29 --- /dev/null 
+++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 00000000..8a8c18ba --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 00000000..5c42706a --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 00000000..853c341c --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd 
b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 00000000..da835ee8 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 00000000..87ad2658 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 00000000..9e86f1b2 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 00000000..d0be42e7 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 00000000..8821dd18 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 00000000..ca2575c7 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 00000000..dd079e60 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 00000000..3dd6cf62 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 00000000..f1041e34 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 00000000..9c5b7a63 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 00000000..0f13678d --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. + + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. 
This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . + <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. + + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . 
+ + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 00000000..a6de9d27 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 00000000..10e978b6 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd new file mode 100644 index 00000000..4248bf7a --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 00000000..56497467 --- /dev/null +++ 
b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/mce/mc.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/mce/mc.xsd new file mode 100644 index 00000000..ef725457 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd new file mode 100644 index 00000000..f65f7777 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd new file mode 100644 index 00000000..6b00755a --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd new file mode 100644 index 00000000..f321d333 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 00000000..364c6a9b --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 00000000..fed9d15b --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd 
b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 index 00000000..680cf154 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 00000000..89ada908 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/soffice.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/soffice.py new file mode 100644 index 00000000..c7f7e328 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/soffice.py @@ -0,0 +1,183 @@ +""" +Helper for running LibreOffice (soffice) in environments where AF_UNIX +sockets may be blocked (e.g., sandboxed VMs). Detects the restriction +at runtime and applies an LD_PRELOAD shim if needed. 
+ +Usage: + from office.soffice import run_soffice, get_soffice_env + + # Option 1 – run soffice directly + result = run_soffice(["--headless", "--convert-to", "pdf", "input.docx"]) + + # Option 2 – get env dict for your own subprocess calls + env = get_soffice_env() + subprocess.run(["soffice", ...], env=env) +""" + +import os +import socket +import subprocess +import tempfile +from pathlib import Path + + +def get_soffice_env() -> dict: + env = os.environ.copy() + env["SAL_USE_VCLPLUGIN"] = "svp" + + if _needs_shim(): + shim = _ensure_shim() + env["LD_PRELOAD"] = str(shim) + + return env + + +def run_soffice(args: list[str], **kwargs) -> subprocess.CompletedProcess: + env = get_soffice_env() + return subprocess.run(["soffice"] + args, env=env, **kwargs) + + + +_SHIM_SO = Path(tempfile.gettempdir()) / "lo_socket_shim.so" + + +def _needs_shim() -> bool: + try: + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.close() + return False + except OSError: + return True + + +def _ensure_shim() -> Path: + if _SHIM_SO.exists(): + return _SHIM_SO + + src = Path(tempfile.gettempdir()) / "lo_socket_shim.c" + src.write_text(_SHIM_SOURCE) + subprocess.run( + ["gcc", "-shared", "-fPIC", "-o", str(_SHIM_SO), str(src), "-ldl"], + check=True, + capture_output=True, + ) + src.unlink() + return _SHIM_SO + + + +_SHIM_SOURCE = r""" +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +static int (*real_socket)(int, int, int); +static int (*real_socketpair)(int, int, int, int[2]); +static int (*real_listen)(int, int); +static int (*real_accept)(int, struct sockaddr *, socklen_t *); +static int (*real_close)(int); +static int (*real_read)(int, void *, size_t); + +/* Per-FD bookkeeping (FDs >= 1024 are passed through unshimmed). 
*/ +static int is_shimmed[1024]; +static int peer_of[1024]; +static int wake_r[1024]; /* accept() blocks reading this */ +static int wake_w[1024]; /* close() writes to this */ +static int listener_fd = -1; /* FD that received listen() */ + +__attribute__((constructor)) +static void init(void) { + real_socket = dlsym(RTLD_NEXT, "socket"); + real_socketpair = dlsym(RTLD_NEXT, "socketpair"); + real_listen = dlsym(RTLD_NEXT, "listen"); + real_accept = dlsym(RTLD_NEXT, "accept"); + real_close = dlsym(RTLD_NEXT, "close"); + real_read = dlsym(RTLD_NEXT, "read"); + for (int i = 0; i < 1024; i++) { + peer_of[i] = -1; + wake_r[i] = -1; + wake_w[i] = -1; + } +} + +/* ---- socket ---------------------------------------------------------- */ +int socket(int domain, int type, int protocol) { + if (domain == AF_UNIX) { + int fd = real_socket(domain, type, protocol); + if (fd >= 0) return fd; + /* socket(AF_UNIX) blocked – fall back to socketpair(). */ + int sv[2]; + if (real_socketpair(domain, type, protocol, sv) == 0) { + if (sv[0] >= 0 && sv[0] < 1024) { + is_shimmed[sv[0]] = 1; + peer_of[sv[0]] = sv[1]; + int wp[2]; + if (pipe(wp) == 0) { + wake_r[sv[0]] = wp[0]; + wake_w[sv[0]] = wp[1]; + } + } + return sv[0]; + } + errno = EPERM; + return -1; + } + return real_socket(domain, type, protocol); +} + +/* ---- listen ---------------------------------------------------------- */ +int listen(int sockfd, int backlog) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + listener_fd = sockfd; + return 0; + } + return real_listen(sockfd, backlog); +} + +/* ---- accept ---------------------------------------------------------- */ +int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + /* Block until close() writes to the wake pipe. 
*/ + if (wake_r[sockfd] >= 0) { + char buf; + real_read(wake_r[sockfd], &buf, 1); + } + errno = ECONNABORTED; + return -1; + } + return real_accept(sockfd, addr, addrlen); +} + +/* ---- close ----------------------------------------------------------- */ +int close(int fd) { + if (fd >= 0 && fd < 1024 && is_shimmed[fd]) { + int was_listener = (fd == listener_fd); + is_shimmed[fd] = 0; + + if (wake_w[fd] >= 0) { /* unblock accept() */ + char c = 0; + write(wake_w[fd], &c, 1); + real_close(wake_w[fd]); + wake_w[fd] = -1; + } + if (wake_r[fd] >= 0) { real_close(wake_r[fd]); wake_r[fd] = -1; } + if (peer_of[fd] >= 0) { real_close(peer_of[fd]); peer_of[fd] = -1; } + + if (was_listener) + _exit(0); /* conversion done – exit */ + } + return real_close(fd); +} +""" + + + +if __name__ == "__main__": + import sys + result = run_soffice(sys.argv[1:]) + sys.exit(result.returncode) diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/unpack.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/unpack.py new file mode 100644 index 00000000..00152533 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/unpack.py @@ -0,0 +1,132 @@ +"""Unpack Office files (DOCX, PPTX, XLSX) for editing. 
+ +Extracts the ZIP archive, pretty-prints XML files, and optionally: +- Merges adjacent runs with identical formatting (DOCX only) +- Simplifies adjacent tracked changes from same author (DOCX only) + +Usage: + python unpack.py [options] + +Examples: + python unpack.py document.docx unpacked/ + python unpack.py presentation.pptx unpacked/ + python unpack.py document.docx unpacked/ --merge-runs false +""" + +import argparse +import sys +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from helpers.merge_runs import merge_runs as do_merge_runs +from helpers.simplify_redlines import simplify_redlines as do_simplify_redlines + +SMART_QUOTE_REPLACEMENTS = { + "\u201c": "“", + "\u201d": "”", + "\u2018": "‘", + "\u2019": "’", +} + + +def unpack( + input_file: str, + output_directory: str, + merge_runs: bool = True, + simplify_redlines: bool = True, +) -> tuple[None, str]: + input_path = Path(input_file) + output_path = Path(output_directory) + suffix = input_path.suffix.lower() + + if not input_path.exists(): + return None, f"Error: {input_file} does not exist" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {input_file} must be a .docx, .pptx, or .xlsx file" + + try: + output_path.mkdir(parents=True, exist_ok=True) + + with zipfile.ZipFile(input_path, "r") as zf: + zf.extractall(output_path) + + xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) + for xml_file in xml_files: + _pretty_print_xml(xml_file) + + message = f"Unpacked {input_file} ({len(xml_files)} XML files)" + + if suffix == ".docx": + if simplify_redlines: + simplify_count, _ = do_simplify_redlines(str(output_path)) + message += f", simplified {simplify_count} tracked changes" + + if merge_runs: + merge_count, _ = do_merge_runs(str(output_path)) + message += f", merged {merge_count} runs" + + for xml_file in xml_files: + _escape_smart_quotes(xml_file) + + return None, message + + except zipfile.BadZipFile: + return None, 
f"Error: {input_file} is not a valid Office file" + except Exception as e: + return None, f"Error unpacking: {e}" + + +def _pretty_print_xml(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="utf-8")) + except Exception: + pass + + +def _escape_smart_quotes(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + for char, entity in SMART_QUOTE_REPLACEMENTS.items(): + content = content.replace(char, entity) + xml_file.write_text(content, encoding="utf-8") + except Exception: + pass + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Unpack an Office file (DOCX, PPTX, XLSX) for editing" + ) + parser.add_argument("input_file", help="Office file to unpack") + parser.add_argument("output_directory", help="Output directory") + parser.add_argument( + "--merge-runs", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent runs with identical formatting (DOCX only, default: true)", + ) + parser.add_argument( + "--simplify-redlines", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent tracked changes from same author (DOCX only, default: true)", + ) + args = parser.parse_args() + + _, message = unpack( + args.input_file, + args.output_directory, + merge_runs=args.merge_runs, + simplify_redlines=args.simplify_redlines, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validate.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validate.py new file mode 100644 index 00000000..03b01f6e --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validate.py @@ -0,0 +1,111 @@ +""" +Command line tool to validate Office document XML files against XSD schemas and tracked changes. 
+ +Usage: + python validate.py [--original ] [--auto-repair] [--author NAME] + +The first argument can be either: +- An unpacked directory containing the Office document XML files +- A packed Office file (.docx/.pptx/.xlsx) which will be unpacked to a temp directory + +Auto-repair fixes: +- paraId/durableId values that exceed OOXML limits +- Missing xml:space="preserve" on w:t elements with whitespace +""" + +import argparse +import sys +import tempfile +import zipfile +from pathlib import Path + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Validate Office document XML files") + parser.add_argument( + "path", + help="Path to unpacked directory or packed Office file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "--original", + required=False, + default=None, + help="Path to original file (.docx/.pptx/.xlsx). If omitted, all XSD errors are reported and redlining validation is skipped.", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--auto-repair", + action="store_true", + help="Automatically repair common issues (hex IDs, whitespace preservation)", + ) + parser.add_argument( + "--author", + default="Claude", + help="Author name for redlining validation (default: Claude)", + ) + args = parser.parse_args() + + path = Path(args.path) + assert path.exists(), f"Error: {path} does not exist" + + original_file = None + if args.original: + original_file = Path(args.original) + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert original_file.suffix.lower() in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + file_extension = (original_file or path).suffix.lower() + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: Cannot determine file type from {path}. 
Use --original or provide a .docx/.pptx/.xlsx file." + ) + + if path.is_file() and path.suffix.lower() in [".docx", ".pptx", ".xlsx"]: + temp_dir = tempfile.mkdtemp() + with zipfile.ZipFile(path, "r") as zf: + zf.extractall(temp_dir) + unpacked_dir = Path(temp_dir) + else: + assert path.is_dir(), f"Error: {path} is not a directory or Office file" + unpacked_dir = path + + match file_extension: + case ".docx": + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + if original_file: + validators.append( + RedliningValidator(unpacked_dir, original_file, verbose=args.verbose, author=args.author) + ) + case ".pptx": + validators = [ + PPTXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + if args.auto_repair: + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + print(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/__init__.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/__init__.py new file mode 100644 index 00000000..db092ece --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. 
+""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/base.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/base.py new file mode 100644 index 00000000..db4a06a2 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/base.py @@ -0,0 +1,847 @@ +""" +Base validator with common validation logic for document files. +""" + +import re +from pathlib import Path + +import defusedxml.minidom +import lxml.etree + + +class BaseSchemaValidator: + + IGNORED_VALIDATION_ERRORS = [ + "hyphenationZone", + "purl.org/dc/terms", + ] + + UNIQUE_ID_REQUIREMENTS = { + "comment": ("id", "file"), + "commentrangestart": ("id", "file"), + "commentrangeend": ("id", "file"), + "bookmarkstart": ("id", "file"), + "bookmarkend": ("id", "file"), + "sldid": ("id", "file"), + "sldmasterid": ("id", "global"), + "sldlayoutid": ("id", "global"), + "cm": ("authorid", "file"), + "sheet": ("sheetid", "file"), + "definedname": ("id", "file"), + "cxnsp": ("id", "file"), + "sp": ("id", "file"), + "pic": ("id", "file"), + "grpsp": ("id", "file"), + } + + EXCLUDED_ID_CONTAINERS = { + "sectionlst", + } + + ELEMENT_RELATIONSHIP_TYPES = {} + + SCHEMA_MAPPINGS = { + "word": "ISO-IEC29500-4_2016/wml.xsd", + "ppt": "ISO-IEC29500-4_2016/pml.xsd", + "xl": "ISO-IEC29500-4_2016/sml.xsd", + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + "people.xml": "microsoft/wml-2012.xsd", + 
"commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file=None, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) if original_file else 
None + self.verbose = verbose + + self.schemas_dir = Path(__file__).parent.parent / "schemas" + + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + raise NotImplementedError("Subclasses must implement the validate method") + + def repair(self) -> int: + return self.repair_whitespace_preservation() + + def repair_whitespace_preservation(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if elem.tagName.endswith(":t") and elem.firstChild: + text = elem.firstChild.nodeValue + if text and (text.startswith((' ', '\t')) or text.endswith((' ', '\t'))): + if elem.getAttribute("xml:space") != "preserve": + elem.setAttribute("xml:space", "preserve") + text_preview = repr(text[:30]) + "..." 
if len(text) > 30 else repr(text) + print(f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}") + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + def validate_xml(self): + errors = [] + + for xml_file in self.xml_files: + try: + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + errors = [] + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + errors = [] + global_ids = {} + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} + + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + for elem in root.iter(): + tag = ( + 
elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + if tag in self.UNIQUE_ID_REQUIREMENTS: + in_excluded_container = any( + ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS + for ancestor in elem.iterancestors() + ) + if in_excluded_container: + continue + + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + if id_value in global_ids: + prev_file, prev_line, prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + errors = [] + + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return 
True + + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): + all_files.append(file_path.resolve()) + + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + for rels_file in rels_files: + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + rels_dir = rels_file.parent + + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): + if target.startswith("/"): + target_path = self.unpacked_dir / target.lstrip("/") + elif rels_file.name == ".rels": + target_path = self.unpacked_dir / target + else: + base_dir = rels_dir.parent + target_path = base_dir / target + + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + 
for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + import lxml.etree + + errors = [] + + for xml_file in self.xml_files: + if xml_file.suffix == ".rels": + continue + + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + if not rels_file.exists(): + continue + + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE + rid_attrs_to_check = ["id", "embed", "link"] + for elem in xml_root.iter(): + for attr_name in rid_attrs_to_check: + rid_attr = elem.get(f"{{{r_ns}}}{attr_name}") + if not rid_attr: + continue + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' 
if len(rid_to_type) > 5 else ''})" + ) + elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + elem_lower = element_name.lower() + + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + if elem_lower.endswith("id") and len(elem_lower) > 2: + prefix = elem_lower[:-2] + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + if prefix == "sld": + return "slide" + return prefix.lower() + + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] + return prefix.lower() + + return None + + def validate_content_types(self): + errors = [] + + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + 
part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", + "document", + "workbook", + "worksheet", + "theme", + } + + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue + + for file_path in all_files: + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + if extension in media_extensions: + relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") 
+ + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() + elif is_valid: + return True, set() + + original_errors = self._get_original_file_errors(xml_file) + + assert current_errors is not None + new_errors = current_errors - original_errors + + new_errors = { + e for e in new_errors + if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS) + } + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + original_error_count += 1 + valid_count += 1 + continue + + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: + new_errors.append( + f" - {error[:250]}..." 
if len(error) > 250 else f" - {error}" + ) + + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + for attr in attrs_to_remove: + del elem.attrib[attr] + + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + elements_to_remove = [] + + for elem in list(root): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = 
str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + self._remove_ignorable_elements(elem) + + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + root = xml_doc.getroot() + + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None + + try: + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + if self.original_file is None: + return set() + + import tempfile + import zipfile + + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + return set() + + 
is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + for elem in xml_copy.iter(): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/docx.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/docx.py new file mode 100644 index 00000000..fec405e6 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/docx.py @@ -0,0 +1,446 @@ +""" +Validator for Word document XML files against XSD schemas. 
+""" + +import random +import re +import tempfile +import zipfile + +import defusedxml.minidom +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml" + W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid" + + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_whitespace_preservation(): + all_valid = False + + if not self.validate_deletions(): + all_valid = False + + if not self.validate_insertions(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_id_constraints(): + all_valid = False + + if not self.validate_comment_markers(): + all_valid = False + + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + if re.search(r"^[ \t\n\r]", text) or re.search( + r"[ \t\n\r]$", text + ): + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + text_preview = ( + repr(text)[:50] + "..." 
+ if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces): + if t_elem.text: + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + for instr_elem in root.xpath( + ".//w:del//w:instrText", namespaces=namespaces + ): + text_preview = ( + repr(instr_elem.text or "")[:50] + "..." 
+ if len(repr(instr_elem.text or "")) > 50 + else repr(instr_elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {instr_elem.sourceline}: found within (use ): {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + count = 0 + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + original = self.original_file + if original is None: + return 0 + + count = 0 + + try: + with tempfile.TemporaryDirectory() as temp_dir: + with zipfile.ZipFile(original, "r") as zip_ref: + zip_ref.extractall(temp_dir) + + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or 
"")[:50] + "..." + if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + def _parse_id_value(self, val: str, base: int = 16) -> int: + return int(val, base) + + def validate_id_constraints(self): + errors = [] + para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId" + durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId" + + for xml_file in self.xml_files: + try: + for elem in lxml.etree.parse(str(xml_file)).iter(): + if val := elem.get(para_id_attr): + if self._parse_id_value(val, base=16) >= 0x80000000: + errors.append( + f" {xml_file.name}:{elem.sourceline}: paraId={val} >= 0x80000000" + ) + + if val := elem.get(durable_id_attr): + if xml_file.name == "numbering.xml": + try: + if self._parse_id_value(val, base=10) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except ValueError: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} must be decimal in numbering.xml" + ) + else: + if self._parse_id_value(val, base=16) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except Exception: + pass + + 
if errors: + print(f"FAILED - {len(errors)} ID constraint violations:") + for e in errors: + print(e) + elif self.verbose: + print("PASSED - All paraId/durableId values within constraints") + return not errors + + def validate_comment_markers(self): + errors = [] + + document_xml = None + comments_xml = None + for xml_file in self.xml_files: + if xml_file.name == "document.xml" and "word" in str(xml_file): + document_xml = xml_file + elif xml_file.name == "comments.xml": + comments_xml = xml_file + + if not document_xml: + if self.verbose: + print("PASSED - No document.xml found (skipping comment validation)") + return True + + try: + doc_root = lxml.etree.parse(str(document_xml)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + range_starts = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeStart", namespaces=namespaces + ) + } + range_ends = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeEnd", namespaces=namespaces + ) + } + references = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentReference", namespaces=namespaces + ) + } + + orphaned_ends = range_ends - range_starts + for comment_id in sorted( + orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart' + ) + + orphaned_starts = range_starts - range_ends + for comment_id in sorted( + orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd' + ) + + comment_ids = set() + if comments_xml and comments_xml.exists(): + comments_root = lxml.etree.parse(str(comments_xml)).getroot() + comment_ids = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in comments_root.xpath( + ".//w:comment", namespaces=namespaces + ) + 
} + + marker_ids = range_starts | range_ends | references + invalid_refs = marker_ids - comment_ids + for comment_id in sorted( + invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + if comment_id: + errors.append( + f' document.xml: marker id="{comment_id}" references non-existent comment' + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append(f" Error parsing XML: {e}") + + if errors: + print(f"FAILED - {len(errors)} comment marker violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All comment markers properly paired") + return True + + def repair(self) -> int: + repairs = super().repair() + repairs += self.repair_durableId() + return repairs + + def repair_durableId(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if not elem.hasAttribute("w16cid:durableId"): + continue + + durable_id = elem.getAttribute("w16cid:durableId") + needs_repair = False + + if xml_file.name == "numbering.xml": + try: + needs_repair = ( + self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + else: + try: + needs_repair = ( + self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + + if needs_repair: + value = random.randint(1, 0x7FFFFFFE) + if xml_file.name == "numbering.xml": + new_id = str(value) + else: + new_id = f"{value:08X}" + + elem.setAttribute("w16cid:durableId", new_id) + print( + f" Repaired: {xml_file.name}: durableId {durable_id} → {new_id}" + ) + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff 
--git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/pptx.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/pptx.py new file mode 100644 index 00000000..09842aa9 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/pptx.py @@ -0,0 +1,275 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. +""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_uuid_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_slide_layout_ids(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_notes_slide_references(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + import lxml.etree + + errors = [] + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(): + for attr, value in elem.attrib.items(): + attr_name = attr.split("}")[-1].lower() + if attr_name 
== "id" or attr_name.endswith("id"): + if self._looks_like_uuid(value): + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + clean_value = value.strip("{}()").replace("-", "") + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + import lxml.etree + + errors = [] + + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No slide masters found") + return True + + for slide_master in slide_masters: + try: + root = lxml.etree.parse(str(slide_master)).getroot() + + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in 
valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." + ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + import lxml.etree + + errors = [] + notes_slide_references = {} + + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for 
rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = rel.get("Target", "") + if target: + normalized_target = target.replace("../", "") + + slide_name = rels_file.stem.replace( + ".xml", "" + ) + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/redlining.py b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/redlining.py new file mode 100644 index 00000000..71c81b6b --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/office/validators/redlining.py @@ -0,0 +1,247 @@ +""" +Validator for tracked changes in Word documents. 
+""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + + def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.author = author + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def repair(self) -> int: + return 0 + + def validate(self): + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + author_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + author_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + + if not author_del_elements and not author_ins_elements: + if self.verbose: + print(f"PASSED - No tracked changes by {self.author} found.") + return True + + except Exception: + pass + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + 
original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + self._remove_author_tracked_changes(original_root) + self._remove_author_tracked_changes(modified_root) + + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print(f"PASSED - All changes by {self.author} are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + error_parts = [ + f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + 
capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + pass + + return None + + def _remove_author_tracked_changes(self, root): + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == self.author: + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == self.author: + to_process.append((child, list(parent).index(child))) + + for del_elem, del_index in reversed(to_process): + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + 
for p_elem in root.findall(f".//{p_tag}"): + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/comments.xml b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/comments.xml new file mode 100644 index 00000000..cd01a7d7 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/comments.xml @@ -0,0 +1,3 @@ + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsExtended.xml b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsExtended.xml new file mode 100644 index 00000000..411003cc --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsExtended.xml @@ -0,0 +1,3 @@ + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsExtensible.xml b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsExtensible.xml new file mode 100644 index 00000000..f5572d71 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsExtensible.xml @@ -0,0 +1,3 @@ + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsIds.xml b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsIds.xml new file mode 100644 index 00000000..32f1629f --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/commentsIds.xml @@ -0,0 +1,3 @@ + + + diff --git a/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/people.xml b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/people.xml new file mode 100644 index 00000000..3803d2de --- /dev/null +++ 
b/ruoyi-admin/src/main/resources/skills/docx/scripts/templates/people.xml @@ -0,0 +1,3 @@ + + + diff --git a/ruoyi-admin/src/main/resources/skills/pdf/LICENSE.txt b/ruoyi-admin/src/main/resources/skills/pdf/LICENSE.txt new file mode 100644 index 00000000..c55ab422 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. 
diff --git a/ruoyi-admin/src/main/resources/skills/pdf/SKILL.md b/ruoyi-admin/src/main/resources/skills/pdf/SKILL.md new file mode 100644 index 00000000..f6a22ddf --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/SKILL.md @@ -0,0 +1,294 @@ +--- +name: pdf +description: Comprehensive PDF manipulation toolkit for extracting text and tables, creating new PDFs, merging/splitting documents, and handling forms. When Claude needs to fill in a PDF form or programmatically process, generate, or analyze PDF documents at scale. +license: Proprietary. LICENSE.txt has complete terms +--- + +# PDF Processing Guide + +## Overview + +This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see reference.md. If you need to fill out a PDF form, read forms.md and follow its instructions. + +## Quick Start + +```python +from pypdf import PdfReader, PdfWriter + +# Read a PDF +reader = PdfReader("document.pdf") +print(f"Pages: {len(reader.pages)}") + +# Extract text +text = "" +for page in reader.pages: + text += page.extract_text() +``` + +## Python Libraries + +### pypdf - Basic Operations + +#### Merge PDFs +```python +from pypdf import PdfWriter, PdfReader + +writer = PdfWriter() +for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]: + reader = PdfReader(pdf_file) + for page in reader.pages: + writer.add_page(page) + +with open("merged.pdf", "wb") as output: + writer.write(output) +``` + +#### Split PDF +```python +reader = PdfReader("input.pdf") +for i, page in enumerate(reader.pages): + writer = PdfWriter() + writer.add_page(page) + with open(f"page_{i+1}.pdf", "wb") as output: + writer.write(output) +``` + +#### Extract Metadata +```python +reader = PdfReader("document.pdf") +meta = reader.metadata +print(f"Title: {meta.title}") +print(f"Author: {meta.author}") +print(f"Subject: {meta.subject}") +print(f"Creator: {meta.creator}") +``` + +#### Rotate Pages 
+```python +reader = PdfReader("input.pdf") +writer = PdfWriter() + +page = reader.pages[0] +page.rotate(90) # Rotate 90 degrees clockwise +writer.add_page(page) + +with open("rotated.pdf", "wb") as output: + writer.write(output) +``` + +### pdfplumber - Text and Table Extraction + +#### Extract Text with Layout +```python +import pdfplumber + +with pdfplumber.open("document.pdf") as pdf: + for page in pdf.pages: + text = page.extract_text() + print(text) +``` + +#### Extract Tables +```python +with pdfplumber.open("document.pdf") as pdf: + for i, page in enumerate(pdf.pages): + tables = page.extract_tables() + for j, table in enumerate(tables): + print(f"Table {j+1} on page {i+1}:") + for row in table: + print(row) +``` + +#### Advanced Table Extraction +```python +import pandas as pd + +with pdfplumber.open("document.pdf") as pdf: + all_tables = [] + for page in pdf.pages: + tables = page.extract_tables() + for table in tables: + if table: # Check if table is not empty + df = pd.DataFrame(table[1:], columns=table[0]) + all_tables.append(df) + +# Combine all tables +if all_tables: + combined_df = pd.concat(all_tables, ignore_index=True) + combined_df.to_excel("extracted_tables.xlsx", index=False) +``` + +### reportlab - Create PDFs + +#### Basic PDF Creation +```python +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas + +c = canvas.Canvas("hello.pdf", pagesize=letter) +width, height = letter + +# Add text +c.drawString(100, height - 100, "Hello World!") +c.drawString(100, height - 120, "This is a PDF created with reportlab") + +# Add a line +c.line(100, height - 140, 400, height - 140) + +# Save +c.save() +``` + +#### Create PDF with Multiple Pages +```python +from reportlab.lib.pagesizes import letter +from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak +from reportlab.lib.styles import getSampleStyleSheet + +doc = SimpleDocTemplate("report.pdf", pagesize=letter) +styles = getSampleStyleSheet() +story = [] 
+ +# Add content +title = Paragraph("Report Title", styles['Title']) +story.append(title) +story.append(Spacer(1, 12)) + +body = Paragraph("This is the body of the report. " * 20, styles['Normal']) +story.append(body) +story.append(PageBreak()) + +# Page 2 +story.append(Paragraph("Page 2", styles['Heading1'])) +story.append(Paragraph("Content for page 2", styles['Normal'])) + +# Build PDF +doc.build(story) +``` + +## Command-Line Tools + +### pdftotext (poppler-utils) +```bash +# Extract text +pdftotext input.pdf output.txt + +# Extract text preserving layout +pdftotext -layout input.pdf output.txt + +# Extract specific pages +pdftotext -f 1 -l 5 input.pdf output.txt # Pages 1-5 +``` + +### qpdf +```bash +# Merge PDFs +qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf + +# Split pages +qpdf input.pdf --pages . 1-5 -- pages1-5.pdf +qpdf input.pdf --pages . 6-10 -- pages6-10.pdf + +# Rotate pages +qpdf input.pdf output.pdf --rotate=+90:1 # Rotate page 1 by 90 degrees + +# Remove password +qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf +``` + +### pdftk (if available) +```bash +# Merge +pdftk file1.pdf file2.pdf cat output merged.pdf + +# Split +pdftk input.pdf burst + +# Rotate +pdftk input.pdf rotate 1east output rotated.pdf +``` + +## Common Tasks + +### Extract Text from Scanned PDFs +```python +# Requires: pip install pytesseract pdf2image +import pytesseract +from pdf2image import convert_from_path + +# Convert PDF to images +images = convert_from_path('scanned.pdf') + +# OCR each page +text = "" +for i, image in enumerate(images): + text += f"Page {i+1}:\n" + text += pytesseract.image_to_string(image) + text += "\n\n" + +print(text) +``` + +### Add Watermark +```python +from pypdf import PdfReader, PdfWriter + +# Create watermark (or load existing) +watermark = PdfReader("watermark.pdf").pages[0] + +# Apply to all pages +reader = PdfReader("document.pdf") +writer = PdfWriter() + +for page in reader.pages: + page.merge_page(watermark) + 
writer.add_page(page) + +with open("watermarked.pdf", "wb") as output: + writer.write(output) +``` + +### Extract Images +```bash +# Using pdfimages (poppler-utils) +pdfimages -j input.pdf output_prefix + +# This extracts all images as output_prefix-000.jpg, output_prefix-001.jpg, etc. +``` + +### Password Protection +```python +from pypdf import PdfReader, PdfWriter + +reader = PdfReader("input.pdf") +writer = PdfWriter() + +for page in reader.pages: + writer.add_page(page) + +# Add password +writer.encrypt("userpassword", "ownerpassword") + +with open("encrypted.pdf", "wb") as output: + writer.write(output) +``` + +## Quick Reference + +| Task | Best Tool | Command/Code | +|------|-----------|--------------| +| Merge PDFs | pypdf | `writer.add_page(page)` | +| Split PDFs | pypdf | One page per file | +| Extract text | pdfplumber | `page.extract_text()` | +| Extract tables | pdfplumber | `page.extract_tables()` | +| Create PDFs | reportlab | Canvas or Platypus | +| Command line merge | qpdf | `qpdf --empty --pages ...` | +| OCR scanned PDFs | pytesseract | Convert to image first | +| Fill PDF forms | pdf-lib or pypdf (see forms.md) | See forms.md | + +## Next Steps + +- For advanced pypdfium2 usage, see reference.md +- For JavaScript libraries (pdf-lib), see reference.md +- If you need to fill out a PDF form, follow the instructions in forms.md +- For troubleshooting guides, see reference.md diff --git a/ruoyi-admin/src/main/resources/skills/pdf/forms.md b/ruoyi-admin/src/main/resources/skills/pdf/forms.md new file mode 100644 index 00000000..4e234506 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/forms.md @@ -0,0 +1,205 @@ +**CRITICAL: You MUST complete these steps in order. Do not skip ahead to writing code.** + +If you need to fill out a PDF form, first check to see if the PDF has fillable form fields. 
Run this script from this file's directory: + `python scripts/check_fillable_fields `, and depending on the result go to either the "Fillable fields" or "Non-fillable fields" and follow those instructions. + +# Fillable fields +If the PDF has fillable form fields: +- Run this script from this file's directory: `python scripts/extract_form_field_info.py `. It will create a JSON file with a list of fields in this format: +``` +[ + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "rect": ([left, bottom, right, top] bounding box in PDF coordinates, y=0 is the bottom of the page), + "type": ("text", "checkbox", "radio_group", or "choice"), + }, + // Checkboxes have "checked_value" and "unchecked_value" properties: + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "checkbox", + "checked_value": (Set the field to this value to check the checkbox), + "unchecked_value": (Set the field to this value to uncheck the checkbox), + }, + // Radio groups have a "radio_options" list with the possible choices. 
+ { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "radio_group", + "radio_options": [ + { + "value": (set the field to this value to select this radio option), + "rect": (bounding box for the radio button for this option) + }, + // Other radio options + ] + }, + // Multiple choice fields have a "choice_options" list with the possible choices: + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "choice", + "choice_options": [ + { + "value": (set the field to this value to select this option), + "text": (display text of the option) + }, + // Other choice options + ], + } +] +``` +- Convert the PDF to PNGs (one image for each page) with this script (run from this file's directory): +`python scripts/convert_pdf_to_images.py ` +Then analyze the images to determine the purpose of each form field (make sure to convert the bounding box PDF coordinates to image coordinates). +- Create a `field_values.json` file in this format with the values to be entered for each field: +``` +[ + { + "field_id": "last_name", // Must match the field_id from `extract_form_field_info.py` + "description": "The user's last name", + "page": 1, // Must match the "page" value in field_info.json + "value": "Simpson" + }, + { + "field_id": "Checkbox12", + "description": "Checkbox to be checked if the user is 18 or over", + "page": 1, + "value": "/On" // If this is a checkbox, use its "checked_value" value to check it. If it's a radio button group, use one of the "value" values in "radio_options". + }, + // more fields +] +``` +- Run the `fill_fillable_fields.py` script from this file's directory to create a filled-in PDF: +`python scripts/fill_fillable_fields.py ` +This script will verify that the field IDs and values you provide are valid; if it prints error messages, correct the appropriate fields and try again. 
 + +# Non-fillable fields +If the PDF doesn't have fillable form fields, you'll need to visually determine where the data should be added and create text annotations. Follow the below steps *exactly*. You MUST perform all of these steps to ensure that the form is accurately completed. Details for each step are below. +- Convert the PDF to PNG images and determine field bounding boxes. +- Create a JSON file with field information and validation images showing the bounding boxes. +- Validate the bounding boxes. +- Use the bounding boxes to fill in the form. + +## Step 1: Visual Analysis (REQUIRED) +- Convert the PDF to PNG images. Run this script from this file's directory: +`python scripts/convert_pdf_to_images.py ` +The script will create a PNG image for each page in the PDF. +- Carefully examine each PNG image and identify all form fields and areas where the user should enter data. For each form field where the user should enter text, determine bounding boxes for both the form field label, and the area where the user should enter text. The label and entry bounding boxes MUST NOT INTERSECT; the text entry box should only include the area where data should be entered. Usually this area will be immediately to the side, above, or below its label. Entry bounding boxes must be tall and wide enough to contain their text. + +These are some examples of form structures that you might see: + +*Label inside box* +``` +┌────────────────────────┐ +│ Name: │ +└────────────────────────┘ +``` +The input area should be to the right of the "Name" label and extend to the edge of the box. + +*Label before line* +``` +Email: _______________________ +``` +The input area should be above the line and include its entire width. + +*Label under line* +``` +_________________________ +Name +``` +The input area should be above the line and include the entire width of the line. This is common for signature and date fields. 
+ +*Label above line* +``` +Please enter any special requests: +________________________________________________ +``` +The input area should extend from the bottom of the label to the line, and should include the entire width of the line. + +*Checkboxes* +``` +Are you a US citizen? Yes □ No □ +``` +For checkboxes: +- Look for small square boxes (□) - these are the actual checkboxes to target. They may be to the left or right of their labels. +- Distinguish between label text ("Yes", "No") and the clickable checkbox squares. +- The entry bounding box should cover ONLY the small square, not the text label. + +### Step 2: Create fields.json and validation images (REQUIRED) +- Create a file named `fields.json` with information for the form fields and bounding boxes in this format: +``` +{ + "pages": [ + { + "page_number": 1, + "image_width": (first page image width in pixels), + "image_height": (first page image height in pixels), + }, + { + "page_number": 2, + "image_width": (second page image width in pixels), + "image_height": (second page image height in pixels), + } + // additional pages + ], + "form_fields": [ + // Example for a text field. + { + "page_number": 1, + "description": "The user's last name should be entered here", + // Bounding boxes are [left, top, right, bottom]. The bounding boxes for the label and text entry should not overlap. + "field_label": "Last name", + "label_bounding_box": [30, 125, 95, 142], + "entry_bounding_box": [100, 125, 280, 142], + "entry_text": { + "text": "Johnson", // This text will be added as an annotation at the entry_bounding_box location + "font_size": 14, // optional, defaults to 14 + "font_color": "000000", // optional, RRGGBB format, defaults to 000000 (black) + } + }, + // Example for a checkbox. 
TARGET THE SQUARE for the entry bounding box, NOT THE TEXT + { + "page_number": 2, + "description": "Checkbox that should be checked if the user is over 18", + "entry_bounding_box": [140, 525, 155, 540], // Small box over checkbox square + "field_label": "Yes", + "label_bounding_box": [100, 525, 132, 540], // Box containing "Yes" text + // Use "X" to check a checkbox. + "entry_text": { + "text": "X", + } + } + // additional form field entries + ] +} +``` + +Create validation images by running this script from this file's directory for each page: +`python scripts/create_validation_image.py` + +The validation images will have red rectangles where text should be entered, and blue rectangles covering label text. + +### Step 3: Validate Bounding Boxes (REQUIRED) +#### Automated intersection check +- Verify that none of the bounding boxes intersect and that the entry bounding boxes are tall enough by checking the fields.json file with the `check_bounding_boxes.py` script (run from this file's directory): +`python scripts/check_bounding_boxes.py ` + +If there are errors, reanalyze the relevant fields, adjust the bounding boxes, and iterate until there are no remaining errors. Remember: label (blue) bounding boxes should contain text labels, entry (red) boxes should not. + +#### Manual image inspection +**CRITICAL: Do not proceed without visually inspecting validation images** +- Red rectangles must ONLY cover input areas +- Red rectangles MUST NOT contain any text +- Blue rectangles should contain label text +- For checkboxes: + - Red rectangle MUST be centered on the checkbox square + - Blue rectangle should cover the text label for the checkbox + +- If any rectangles look wrong, fix fields.json, regenerate the validation images, and verify again. Repeat this process until the bounding boxes are fully accurate. 
+ + +### Step 4: Add annotations to the PDF +Run this script from this file's directory to create a filled-out PDF using the information in fields.json: +`python scripts/fill_pdf_form_with_annotations.py diff --git a/ruoyi-admin/src/main/resources/skills/pdf/reference.md b/ruoyi-admin/src/main/resources/skills/pdf/reference.md new file mode 100644 index 00000000..41400bf4 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/reference.md @@ -0,0 +1,612 @@ +# PDF Processing Advanced Reference + +This document contains advanced PDF processing features, detailed examples, and additional libraries not covered in the main skill instructions. + +## pypdfium2 Library (Apache/BSD License) + +### Overview +pypdfium2 is a Python binding for PDFium (Chromium's PDF library). It's excellent for fast PDF rendering, image generation, and serves as a PyMuPDF replacement. + +### Render PDF to Images +```python +import pypdfium2 as pdfium +from PIL import Image + +# Load PDF +pdf = pdfium.PdfDocument("document.pdf") + +# Render page to image +page = pdf[0] # First page +bitmap = page.render( + scale=2.0, # Higher resolution + rotation=0 # No rotation +) + +# Convert to PIL Image +img = bitmap.to_pil() +img.save("page_1.png", "PNG") + +# Process multiple pages +for i, page in enumerate(pdf): + bitmap = page.render(scale=1.5) + img = bitmap.to_pil() + img.save(f"page_{i+1}.jpg", "JPEG", quality=90) +``` + +### Extract Text with pypdfium2 +```python +import pypdfium2 as pdfium + +pdf = pdfium.PdfDocument("document.pdf") +for i, page in enumerate(pdf): + text = page.get_text() + print(f"Page {i+1} text length: {len(text)} chars") +``` + +## JavaScript Libraries + +### pdf-lib (MIT License) + +pdf-lib is a powerful JavaScript library for creating and modifying PDF documents in any JavaScript environment. 
+ +#### Load and Manipulate Existing PDF +```javascript +import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function manipulatePDF() { + // Load existing PDF + const existingPdfBytes = fs.readFileSync('input.pdf'); + const pdfDoc = await PDFDocument.load(existingPdfBytes); + + // Get page count + const pageCount = pdfDoc.getPageCount(); + console.log(`Document has ${pageCount} pages`); + + // Add new page + const newPage = pdfDoc.addPage([600, 400]); + newPage.drawText('Added by pdf-lib', { + x: 100, + y: 300, + size: 16 + }); + + // Save modified PDF + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('modified.pdf', pdfBytes); +} +``` + +#### Create Complex PDFs from Scratch +```javascript +import { PDFDocument, rgb, StandardFonts } from 'pdf-lib'; +import fs from 'fs'; + +async function createPDF() { + const pdfDoc = await PDFDocument.create(); + + // Add fonts + const helveticaFont = await pdfDoc.embedFont(StandardFonts.Helvetica); + const helveticaBold = await pdfDoc.embedFont(StandardFonts.HelveticaBold); + + // Add page + const page = pdfDoc.addPage([595, 842]); // A4 size + const { width, height } = page.getSize(); + + // Add text with styling + page.drawText('Invoice #12345', { + x: 50, + y: height - 50, + size: 18, + font: helveticaBold, + color: rgb(0.2, 0.2, 0.8) + }); + + // Add rectangle (header background) + page.drawRectangle({ + x: 40, + y: height - 100, + width: width - 80, + height: 30, + color: rgb(0.9, 0.9, 0.9) + }); + + // Add table-like content + const items = [ + ['Item', 'Qty', 'Price', 'Total'], + ['Widget', '2', '$50', '$100'], + ['Gadget', '1', '$75', '$75'] + ]; + + let yPos = height - 150; + items.forEach(row => { + let xPos = 50; + row.forEach(cell => { + page.drawText(cell, { + x: xPos, + y: yPos, + size: 12, + font: helveticaFont + }); + xPos += 120; + }); + yPos -= 25; + }); + + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('created.pdf', pdfBytes); +} +``` + +#### Advanced Merge and Split 
Operations +```javascript +import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function mergePDFs() { + // Create new document + const mergedPdf = await PDFDocument.create(); + + // Load source PDFs + const pdf1Bytes = fs.readFileSync('doc1.pdf'); + const pdf2Bytes = fs.readFileSync('doc2.pdf'); + + const pdf1 = await PDFDocument.load(pdf1Bytes); + const pdf2 = await PDFDocument.load(pdf2Bytes); + + // Copy pages from first PDF + const pdf1Pages = await mergedPdf.copyPages(pdf1, pdf1.getPageIndices()); + pdf1Pages.forEach(page => mergedPdf.addPage(page)); + + // Copy specific pages from second PDF (pages 0, 2, 4) + const pdf2Pages = await mergedPdf.copyPages(pdf2, [0, 2, 4]); + pdf2Pages.forEach(page => mergedPdf.addPage(page)); + + const mergedPdfBytes = await mergedPdf.save(); + fs.writeFileSync('merged.pdf', mergedPdfBytes); +} +``` + +### pdfjs-dist (Apache License) + +PDF.js is Mozilla's JavaScript library for rendering PDFs in the browser. + +#### Basic PDF Loading and Rendering +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +// Configure worker (important for performance) +pdfjsLib.GlobalWorkerOptions.workerSrc = './pdf.worker.js'; + +async function renderPDF() { + // Load PDF + const loadingTask = pdfjsLib.getDocument('document.pdf'); + const pdf = await loadingTask.promise; + + console.log(`Loaded PDF with ${pdf.numPages} pages`); + + // Get first page + const page = await pdf.getPage(1); + const viewport = page.getViewport({ scale: 1.5 }); + + // Render to canvas + const canvas = document.createElement('canvas'); + const context = canvas.getContext('2d'); + canvas.height = viewport.height; + canvas.width = viewport.width; + + const renderContext = { + canvasContext: context, + viewport: viewport + }; + + await page.render(renderContext).promise; + document.body.appendChild(canvas); +} +``` + +#### Extract Text with Coordinates +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractText() { + const 
loadingTask = pdfjsLib.getDocument('document.pdf'); + const pdf = await loadingTask.promise; + + let fullText = ''; + + // Extract text from all pages + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const textContent = await page.getTextContent(); + + const pageText = textContent.items + .map(item => item.str) + .join(' '); + + fullText += `\n--- Page ${i} ---\n${pageText}`; + + // Get text with coordinates for advanced processing + const textWithCoords = textContent.items.map(item => ({ + text: item.str, + x: item.transform[4], + y: item.transform[5], + width: item.width, + height: item.height + })); + } + + console.log(fullText); + return fullText; +} +``` + +#### Extract Annotations and Forms +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractAnnotations() { + const loadingTask = pdfjsLib.getDocument('annotated.pdf'); + const pdf = await loadingTask.promise; + + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const annotations = await page.getAnnotations(); + + annotations.forEach(annotation => { + console.log(`Annotation type: ${annotation.subtype}`); + console.log(`Content: ${annotation.contents}`); + console.log(`Coordinates: ${JSON.stringify(annotation.rect)}`); + }); + } +} +``` + +## Advanced Command-Line Operations + +### poppler-utils Advanced Features + +#### Extract Text with Bounding Box Coordinates +```bash +# Extract text with bounding box coordinates (essential for structured data) +pdftotext -bbox-layout document.pdf output.xml + +# The XML output contains precise coordinates for each text element +``` + +#### Advanced Image Conversion +```bash +# Convert to PNG images with specific resolution +pdftoppm -png -r 300 document.pdf output_prefix + +# Convert specific page range with high resolution +pdftoppm -png -r 600 -f 1 -l 3 document.pdf high_res_pages + +# Convert to JPEG with quality setting +pdftoppm -jpeg -jpegopt quality=85 -r 200 document.pdf 
jpeg_output +``` + +#### Extract Embedded Images +```bash +# Extract all embedded images with metadata +pdfimages -j -p document.pdf page_images + +# List image info without extracting +pdfimages -list document.pdf + +# Extract images in their original format +pdfimages -all document.pdf images/img +``` + +### qpdf Advanced Features + +#### Complex Page Manipulation +```bash +# Split PDF into groups of pages +qpdf --split-pages=3 input.pdf output_group_%02d.pdf + +# Extract specific pages with complex ranges +qpdf input.pdf --pages input.pdf 1,3-5,8,10-end -- extracted.pdf + +# Merge specific pages from multiple PDFs +qpdf --empty --pages doc1.pdf 1-3 doc2.pdf 5-7 doc3.pdf 2,4 -- combined.pdf +``` + +#### PDF Optimization and Repair +```bash +# Optimize PDF for web (linearize for streaming) +qpdf --linearize input.pdf optimized.pdf + +# Remove unused objects and compress +qpdf --optimize-level=all input.pdf compressed.pdf + +# Attempt to repair corrupted PDF structure +qpdf --check input.pdf +qpdf --fix-qdf damaged.pdf repaired.pdf + +# Show detailed PDF structure for debugging +qpdf --show-all-pages input.pdf > structure.txt +``` + +#### Advanced Encryption +```bash +# Add password protection with specific permissions +qpdf --encrypt user_pass owner_pass 256 --print=none --modify=none -- input.pdf encrypted.pdf + +# Check encryption status +qpdf --show-encryption encrypted.pdf + +# Remove password protection (requires password) +qpdf --password=secret123 --decrypt encrypted.pdf decrypted.pdf +``` + +## Advanced Python Techniques + +### pdfplumber Advanced Features + +#### Extract Text with Precise Coordinates +```python +import pdfplumber + +with pdfplumber.open("document.pdf") as pdf: + page = pdf.pages[0] + + # Extract all text with coordinates + chars = page.chars + for char in chars[:10]: # First 10 characters + print(f"Char: '{char['text']}' at x:{char['x0']:.1f} y:{char['y0']:.1f}") + + # Extract text by bounding box (left, top, right, bottom) + bbox_text = 
page.within_bbox((100, 100, 400, 200)).extract_text() +``` + +#### Advanced Table Extraction with Custom Settings +```python +import pdfplumber +import pandas as pd + +with pdfplumber.open("complex_table.pdf") as pdf: + page = pdf.pages[0] + + # Extract tables with custom settings for complex layouts + table_settings = { + "vertical_strategy": "lines", + "horizontal_strategy": "lines", + "snap_tolerance": 3, + "intersection_tolerance": 15 + } + tables = page.extract_tables(table_settings) + + # Visual debugging for table extraction + img = page.to_image(resolution=150) + img.save("debug_layout.png") +``` + +### reportlab Advanced Features + +#### Create Professional Reports with Tables +```python +from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.lib import colors + +# Sample data +data = [ + ['Product', 'Q1', 'Q2', 'Q3', 'Q4'], + ['Widgets', '120', '135', '142', '158'], + ['Gadgets', '85', '92', '98', '105'] +] + +# Create PDF with table +doc = SimpleDocTemplate("report.pdf") +elements = [] + +# Add title +styles = getSampleStyleSheet() +title = Paragraph("Quarterly Sales Report", styles['Title']) +elements.append(title) + +# Add table with advanced styling +table = Table(data) +table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('FONTSIZE', (0, 0), (-1, 0), 14), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('GRID', (0, 0), (-1, -1), 1, colors.black) +])) +elements.append(table) + +doc.build(elements) +``` + +## Complex Workflows + +### Extract Figures/Images from PDF + +#### Method 1: Using pdfimages (fastest) +```bash +# Extract all images with original quality +pdfimages -all document.pdf images/img +``` + +#### Method 2: Using pypdfium2 + Image 
Processing +```python +import pypdfium2 as pdfium +from PIL import Image +import numpy as np + +def extract_figures(pdf_path, output_dir): + pdf = pdfium.PdfDocument(pdf_path) + + for page_num, page in enumerate(pdf): + # Render high-resolution page + bitmap = page.render(scale=3.0) + img = bitmap.to_pil() + + # Convert to numpy for processing + img_array = np.array(img) + + # Simple figure detection (non-white regions) + mask = np.any(img_array != [255, 255, 255], axis=2) + + # Find contours and extract bounding boxes + # (This is simplified - real implementation would need more sophisticated detection) + + # Save detected figures + # ... implementation depends on specific needs +``` + +### Batch PDF Processing with Error Handling +```python +import os +import glob +from pypdf import PdfReader, PdfWriter +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def batch_process_pdfs(input_dir, operation='merge'): + pdf_files = glob.glob(os.path.join(input_dir, "*.pdf")) + + if operation == 'merge': + writer = PdfWriter() + for pdf_file in pdf_files: + try: + reader = PdfReader(pdf_file) + for page in reader.pages: + writer.add_page(page) + logger.info(f"Processed: {pdf_file}") + except Exception as e: + logger.error(f"Failed to process {pdf_file}: {e}") + continue + + with open("batch_merged.pdf", "wb") as output: + writer.write(output) + + elif operation == 'extract_text': + for pdf_file in pdf_files: + try: + reader = PdfReader(pdf_file) + text = "" + for page in reader.pages: + text += page.extract_text() + + output_file = pdf_file.replace('.pdf', '.txt') + with open(output_file, 'w', encoding='utf-8') as f: + f.write(text) + logger.info(f"Extracted text from: {pdf_file}") + + except Exception as e: + logger.error(f"Failed to extract text from {pdf_file}: {e}") + continue +``` + +### Advanced PDF Cropping +```python +from pypdf import PdfWriter, PdfReader + +reader = PdfReader("input.pdf") +writer = PdfWriter() + +# 
Crop page (left, bottom, right, top in points) +page = reader.pages[0] +page.mediabox.left = 50 +page.mediabox.bottom = 50 +page.mediabox.right = 550 +page.mediabox.top = 750 + +writer.add_page(page) +with open("cropped.pdf", "wb") as output: + writer.write(output) +``` + +## Performance Optimization Tips + +### 1. For Large PDFs +- Use streaming approaches instead of loading entire PDF in memory +- Use `qpdf --split-pages` for splitting large files +- Process pages individually with pypdfium2 + +### 2. For Text Extraction +- `pdftotext -bbox-layout` is fastest for plain text extraction +- Use pdfplumber for structured data and tables +- Avoid `pypdf.extract_text()` for very large documents + +### 3. For Image Extraction +- `pdfimages` is much faster than rendering pages +- Use low resolution for previews, high resolution for final output + +### 4. For Form Filling +- pdf-lib maintains form structure better than most alternatives +- Pre-validate form fields before processing + +### 5. Memory Management +```python +# Process PDFs in chunks +def process_large_pdf(pdf_path, chunk_size=10): + reader = PdfReader(pdf_path) + total_pages = len(reader.pages) + + for start_idx in range(0, total_pages, chunk_size): + end_idx = min(start_idx + chunk_size, total_pages) + writer = PdfWriter() + + for i in range(start_idx, end_idx): + writer.add_page(reader.pages[i]) + + # Process chunk + with open(f"chunk_{start_idx//chunk_size}.pdf", "wb") as output: + writer.write(output) +``` + +## Troubleshooting Common Issues + +### Encrypted PDFs +```python +# Handle password-protected PDFs +from pypdf import PdfReader + +try: + reader = PdfReader("encrypted.pdf") + if reader.is_encrypted: + reader.decrypt("password") +except Exception as e: + print(f"Failed to decrypt: {e}") +``` + +### Corrupted PDFs +```bash +# Use qpdf to repair +qpdf --check corrupted.pdf +qpdf --replace-input corrupted.pdf +``` + +### Text Extraction Issues +```python +# Fallback to OCR for scanned PDFs +import 
pytesseract +from pdf2image import convert_from_path + +def extract_text_with_ocr(pdf_path): + images = convert_from_path(pdf_path) + text = "" + for i, image in enumerate(images): + text += pytesseract.image_to_string(image) + return text +``` + +## License Information + +- **pypdf**: BSD License +- **pdfplumber**: MIT License +- **pypdfium2**: Apache/BSD License +- **reportlab**: BSD License +- **poppler-utils**: GPL-2 License +- **qpdf**: Apache License +- **pdf-lib**: MIT License +- **pdfjs-dist**: Apache License \ No newline at end of file diff --git a/ruoyi-admin/src/main/resources/skills/pdf/scripts/check_bounding_boxes.py b/ruoyi-admin/src/main/resources/skills/pdf/scripts/check_bounding_boxes.py new file mode 100644 index 00000000..7443660c --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/scripts/check_bounding_boxes.py @@ -0,0 +1,70 @@ +from dataclasses import dataclass +import json +import sys + + +# Script to check that the `fields.json` file that Claude creates when analyzing PDFs +# does not have overlapping bounding boxes. See forms.md. + + +@dataclass +class RectAndField: + rect: list[float] + rect_type: str + field: dict + + +# Returns a list of messages that are printed to stdout for Claude to read. +def get_bounding_box_messages(fields_json_stream) -> list[str]: + messages = [] + fields = json.load(fields_json_stream) + messages.append(f"Read {len(fields['form_fields'])} fields") + + def rects_intersect(r1, r2): + disjoint_horizontal = r1[0] >= r2[2] or r1[2] <= r2[0] + disjoint_vertical = r1[1] >= r2[3] or r1[3] <= r2[1] + return not (disjoint_horizontal or disjoint_vertical) + + rects_and_fields = [] + for f in fields["form_fields"]: + rects_and_fields.append(RectAndField(f["label_bounding_box"], "label", f)) + rects_and_fields.append(RectAndField(f["entry_bounding_box"], "entry", f)) + + has_error = False + for i, ri in enumerate(rects_and_fields): + # This is O(N^2); we can optimize if it becomes a problem. 
+ for j in range(i + 1, len(rects_and_fields)): + rj = rects_and_fields[j] + if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect): + has_error = True + if ri.field is rj.field: + messages.append(f"FAILURE: intersection between label and entry bounding boxes for `{ri.field['description']}` ({ri.rect}, {rj.rect})") + else: + messages.append(f"FAILURE: intersection between {ri.rect_type} bounding box for `{ri.field['description']}` ({ri.rect}) and {rj.rect_type} bounding box for `{rj.field['description']}` ({rj.rect})") + if len(messages) >= 20: + messages.append("Aborting further checks; fix bounding boxes and try again") + return messages + if ri.rect_type == "entry": + if "entry_text" in ri.field: + font_size = ri.field["entry_text"].get("font_size", 14) + entry_height = ri.rect[3] - ri.rect[1] + if entry_height < font_size: + has_error = True + messages.append(f"FAILURE: entry bounding box height ({entry_height}) for `{ri.field['description']}` is too short for the text content (font size: {font_size}). Increase the box height or decrease the font size.") + if len(messages) >= 20: + messages.append("Aborting further checks; fix bounding boxes and try again") + return messages + + if not has_error: + messages.append("SUCCESS: All bounding boxes are valid") + return messages + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: check_bounding_boxes.py [fields.json]") + sys.exit(1) + # Input file should be in the `fields.json` format described in forms.md. 
+ with open(sys.argv[1]) as f: + messages = get_bounding_box_messages(f) + for msg in messages: + print(msg) diff --git a/ruoyi-admin/src/main/resources/skills/pdf/scripts/check_bounding_boxes_test.py b/ruoyi-admin/src/main/resources/skills/pdf/scripts/check_bounding_boxes_test.py new file mode 100644 index 00000000..1dbb463c --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/scripts/check_bounding_boxes_test.py @@ -0,0 +1,226 @@ +import unittest +import json +import io +from check_bounding_boxes import get_bounding_box_messages + + +# Currently this is not run automatically in CI; it's just for documentation and manual checking. +class TestGetBoundingBoxMessages(unittest.TestCase): + + def create_json_stream(self, data): + """Helper to create a JSON stream from data""" + return io.StringIO(json.dumps(data)) + + def test_no_intersections(self): + """Test case with no bounding box intersections""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 30] + }, + { + "description": "Email", + "page_number": 1, + "label_bounding_box": [10, 40, 50, 60], + "entry_bounding_box": [60, 40, 150, 60] + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + def test_label_entry_intersection_same_field(self): + """Test intersection between label and entry of the same field""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 60, 30], + "entry_bounding_box": [50, 10, 150, 30] # Overlaps with label + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("FAILURE" in msg and "intersection" in msg for msg in messages)) + self.assertFalse(any("SUCCESS" in msg for msg in 
messages)) + + def test_intersection_between_different_fields(self): + """Test intersection between bounding boxes of different fields""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 30] + }, + { + "description": "Email", + "page_number": 1, + "label_bounding_box": [40, 20, 80, 40], # Overlaps with Name's boxes + "entry_bounding_box": [160, 10, 250, 30] + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("FAILURE" in msg and "intersection" in msg for msg in messages)) + self.assertFalse(any("SUCCESS" in msg for msg in messages)) + + def test_different_pages_no_intersection(self): + """Test that boxes on different pages don't count as intersecting""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 30] + }, + { + "description": "Email", + "page_number": 2, + "label_bounding_box": [10, 10, 50, 30], # Same coordinates but different page + "entry_bounding_box": [60, 10, 150, 30] + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + def test_entry_height_too_small(self): + """Test that entry box height is checked against font size""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 20], # Height is 10 + "entry_text": { + "font_size": 14 # Font size larger than height + } + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("FAILURE" in msg and "height" in msg for msg in messages)) + self.assertFalse(any("SUCCESS" in msg for msg in 
messages)) + + def test_entry_height_adequate(self): + """Test that adequate entry box height passes""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 30], # Height is 20 + "entry_text": { + "font_size": 14 # Font size smaller than height + } + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + def test_default_font_size(self): + """Test that default font size is used when not specified""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 20], # Height is 10 + "entry_text": {} # No font_size specified, should use default 14 + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("FAILURE" in msg and "height" in msg for msg in messages)) + self.assertFalse(any("SUCCESS" in msg for msg in messages)) + + def test_no_entry_text(self): + """Test that missing entry_text doesn't cause height check""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [60, 10, 150, 20] # Small height but no entry_text + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + def test_multiple_errors_limit(self): + """Test that error messages are limited to prevent excessive output""" + fields = [] + # Create many overlapping fields + for i in range(25): + fields.append({ + "description": f"Field{i}", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], # All 
overlap + "entry_bounding_box": [20, 15, 60, 35] # All overlap + }) + + data = {"form_fields": fields} + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + # Should abort after ~20 messages + self.assertTrue(any("Aborting" in msg for msg in messages)) + # Should have some FAILURE messages but not hundreds + failure_count = sum(1 for msg in messages if "FAILURE" in msg) + self.assertGreater(failure_count, 0) + self.assertLess(len(messages), 30) # Should be limited + + def test_edge_touching_boxes(self): + """Test that boxes touching at edges don't count as intersecting""" + data = { + "form_fields": [ + { + "description": "Name", + "page_number": 1, + "label_bounding_box": [10, 10, 50, 30], + "entry_bounding_box": [50, 10, 150, 30] # Touches at x=50 + } + ] + } + + stream = self.create_json_stream(data) + messages = get_bounding_box_messages(stream) + self.assertTrue(any("SUCCESS" in msg for msg in messages)) + self.assertFalse(any("FAILURE" in msg for msg in messages)) + + +if __name__ == '__main__': + unittest.main() diff --git a/ruoyi-admin/src/main/resources/skills/pdf/scripts/check_fillable_fields.py b/ruoyi-admin/src/main/resources/skills/pdf/scripts/check_fillable_fields.py new file mode 100644 index 00000000..dc43d182 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/scripts/check_fillable_fields.py @@ -0,0 +1,12 @@ +import sys +from pypdf import PdfReader + + +# Script for Claude to run to determine whether a PDF has fillable form fields. See forms.md. 
+ + +reader = PdfReader(sys.argv[1]) +if (reader.get_fields()): + print("This PDF has fillable form fields") +else: + print("This PDF does not have fillable form fields; you will need to visually determine where to enter data") diff --git a/ruoyi-admin/src/main/resources/skills/pdf/scripts/convert_pdf_to_images.py b/ruoyi-admin/src/main/resources/skills/pdf/scripts/convert_pdf_to_images.py new file mode 100644 index 00000000..f8a4ec52 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/scripts/convert_pdf_to_images.py @@ -0,0 +1,35 @@ +import os +import sys + +from pdf2image import convert_from_path + + +# Converts each page of a PDF to a PNG image. + + +def convert(pdf_path, output_dir, max_dim=1000): + images = convert_from_path(pdf_path, dpi=200) + + for i, image in enumerate(images): + # Scale image if needed to keep width/height under `max_dim` + width, height = image.size + if width > max_dim or height > max_dim: + scale_factor = min(max_dim / width, max_dim / height) + new_width = int(width * scale_factor) + new_height = int(height * scale_factor) + image = image.resize((new_width, new_height)) + + image_path = os.path.join(output_dir, f"page_{i+1}.png") + image.save(image_path) + print(f"Saved page {i+1} as {image_path} (size: {image.size})") + + print(f"Converted {len(images)} pages to PNG images") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: convert_pdf_to_images.py [input pdf] [output directory]") + sys.exit(1) + pdf_path = sys.argv[1] + output_directory = sys.argv[2] + convert(pdf_path, output_directory) diff --git a/ruoyi-admin/src/main/resources/skills/pdf/scripts/create_validation_image.py b/ruoyi-admin/src/main/resources/skills/pdf/scripts/create_validation_image.py new file mode 100644 index 00000000..4913f8f8 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/scripts/create_validation_image.py @@ -0,0 +1,41 @@ +import json +import sys + +from PIL import Image, ImageDraw + + +# Creates "validation" 
images with rectangles for the bounding box information that +# Claude creates when determining where to add text annotations in PDFs. See forms.md. + + +def create_validation_image(page_number, fields_json_path, input_path, output_path): + # Input file should be in the `fields.json` format described in forms.md. + with open(fields_json_path, 'r') as f: + data = json.load(f) + + img = Image.open(input_path) + draw = ImageDraw.Draw(img) + num_boxes = 0 + + for field in data["form_fields"]: + if field["page_number"] == page_number: + entry_box = field['entry_bounding_box'] + label_box = field['label_bounding_box'] + # Draw red rectangle over entry bounding box and blue rectangle over the label. + draw.rectangle(entry_box, outline='red', width=2) + draw.rectangle(label_box, outline='blue', width=2) + num_boxes += 2 + + img.save(output_path) + print(f"Created validation image at {output_path} with {num_boxes} bounding boxes") + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print("Usage: create_validation_image.py [page number] [fields.json file] [input image path] [output image path]") + sys.exit(1) + page_number = int(sys.argv[1]) + fields_json_path = sys.argv[2] + input_image_path = sys.argv[3] + output_image_path = sys.argv[4] + create_validation_image(page_number, fields_json_path, input_image_path, output_image_path) diff --git a/ruoyi-admin/src/main/resources/skills/pdf/scripts/extract_form_field_info.py b/ruoyi-admin/src/main/resources/skills/pdf/scripts/extract_form_field_info.py new file mode 100644 index 00000000..f42a2df8 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/scripts/extract_form_field_info.py @@ -0,0 +1,152 @@ +import json +import sys + +from pypdf import PdfReader + + +# Extracts data for the fillable form fields in a PDF and outputs JSON that +# Claude uses to fill the fields. See forms.md. + + +# This matches the format used by PdfReader `get_fields` and `update_page_form_field_values` methods. 
+def get_full_annotation_field_id(annotation): + components = [] + while annotation: + field_name = annotation.get('/T') + if field_name: + components.append(field_name) + annotation = annotation.get('/Parent') + return ".".join(reversed(components)) if components else None + + +def make_field_dict(field, field_id): + field_dict = {"field_id": field_id} + ft = field.get('/FT') + if ft == "/Tx": + field_dict["type"] = "text" + elif ft == "/Btn": + field_dict["type"] = "checkbox" # radio groups handled separately + states = field.get("/_States_", []) + if len(states) == 2: + # "/Off" seems to always be the unchecked value, as suggested by + # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448 + # It can be either first or second in the "/_States_" list. + if "/Off" in states: + field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1] + field_dict["unchecked_value"] = "/Off" + else: + print(f"Unexpected state values for checkbox `${field_id}`. 
Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.") + field_dict["checked_value"] = states[0] + field_dict["unchecked_value"] = states[1] + elif ft == "/Ch": + field_dict["type"] = "choice" + states = field.get("/_States_", []) + field_dict["choice_options"] = [{ + "value": state[0], + "text": state[1], + } for state in states] + else: + field_dict["type"] = f"unknown ({ft})" + return field_dict + + +# Returns a list of fillable PDF fields: +# [ +# { +# "field_id": "name", +# "page": 1, +# "type": ("text", "checkbox", "radio_group", or "choice") +# // Per-type additional fields described in forms.md +# }, +# ] +def get_field_info(reader: PdfReader): + fields = reader.get_fields() + + field_info_by_id = {} + possible_radio_names = set() + + for field_id, field in fields.items(): + # Skip if this is a container field with children, except that it might be + # a parent group for radio button options. + if field.get("/Kids"): + if field.get("/FT") == "/Btn": + possible_radio_names.add(field_id) + continue + field_info_by_id[field_id] = make_field_dict(field, field_id) + + # Bounding rects are stored in annotations in page objects. + + # Radio button options have a separate annotation for each choice; + # all choices have the same field name. + # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html + radio_fields_by_id = {} + + for page_index, page in enumerate(reader.pages): + annotations = page.get('/Annots', []) + for ann in annotations: + field_id = get_full_annotation_field_id(ann) + if field_id in field_info_by_id: + field_info_by_id[field_id]["page"] = page_index + 1 + field_info_by_id[field_id]["rect"] = ann.get('/Rect') + elif field_id in possible_radio_names: + try: + # ann['/AP']['/N'] should have two items. One of them is '/Off', + # the other is the active value. 
+ on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"] + except KeyError: + continue + if len(on_values) == 1: + rect = ann.get("/Rect") + if field_id not in radio_fields_by_id: + radio_fields_by_id[field_id] = { + "field_id": field_id, + "type": "radio_group", + "page": page_index + 1, + "radio_options": [], + } + # Note: at least on macOS 15.7, Preview.app doesn't show selected + # radio buttons correctly. (It does if you remove the leading slash + # from the value, but that causes them not to appear correctly in + # Chrome/Firefox/Acrobat/etc). + radio_fields_by_id[field_id]["radio_options"].append({ + "value": on_values[0], + "rect": rect, + }) + + # Some PDFs have form field definitions without corresponding annotations, + # so we can't tell where they are. Ignore these fields for now. + fields_with_location = [] + for field_info in field_info_by_id.values(): + if "page" in field_info: + fields_with_location.append(field_info) + else: + print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring") + + # Sort by page number, then Y position (flipped in PDF coordinate system), then X. 
+ def sort_key(f): + if "radio_options" in f: + rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0] + else: + rect = f.get("rect") or [0, 0, 0, 0] + adjusted_position = [-rect[1], rect[0]] + return [f.get("page"), adjusted_position] + + sorted_fields = fields_with_location + list(radio_fields_by_id.values()) + sorted_fields.sort(key=sort_key) + + return sorted_fields + + +def write_field_info(pdf_path: str, json_output_path: str): + reader = PdfReader(pdf_path) + field_info = get_field_info(reader) + with open(json_output_path, "w") as f: + json.dump(field_info, f, indent=2) + print(f"Wrote {len(field_info)} fields to {json_output_path}") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: extract_form_field_info.py [input pdf] [output json]") + sys.exit(1) + write_field_info(sys.argv[1], sys.argv[2]) diff --git a/ruoyi-admin/src/main/resources/skills/pdf/scripts/fill_fillable_fields.py b/ruoyi-admin/src/main/resources/skills/pdf/scripts/fill_fillable_fields.py new file mode 100644 index 00000000..ac35753c --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/scripts/fill_fillable_fields.py @@ -0,0 +1,114 @@ +import json +import sys + +from pypdf import PdfReader, PdfWriter + +from extract_form_field_info import get_field_info + + +# Fills fillable form fields in a PDF. See forms.md. + + +def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str): + with open(fields_json_path) as f: + fields = json.load(f) + # Group by page number. 
+ fields_by_page = {} + for field in fields: + if "value" in field: + field_id = field["field_id"] + page = field["page"] + if page not in fields_by_page: + fields_by_page[page] = {} + fields_by_page[page][field_id] = field["value"] + + reader = PdfReader(input_pdf_path) + + has_error = False + field_info = get_field_info(reader) + fields_by_ids = {f["field_id"]: f for f in field_info} + for field in fields: + existing_field = fields_by_ids.get(field["field_id"]) + if not existing_field: + has_error = True + print(f"ERROR: `{field['field_id']}` is not a valid field ID") + elif field["page"] != existing_field["page"]: + has_error = True + print(f"ERROR: Incorrect page number for `{field['field_id']}` (got {field['page']}, expected {existing_field['page']})") + else: + if "value" in field: + err = validation_error_for_field_value(existing_field, field["value"]) + if err: + print(err) + has_error = True + if has_error: + sys.exit(1) + + writer = PdfWriter(clone_from=reader) + for page, field_values in fields_by_page.items(): + writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False) + + # This seems to be necessary for many PDF viewers to format the form values correctly. + # It may cause the viewer to show a "save changes" dialog even if the user doesn't make any changes. + writer.set_need_appearances_writer(True) + + with open(output_pdf_path, "wb") as f: + writer.write(f) + + +def validation_error_for_field_value(field_info, field_value): + field_type = field_info["type"] + field_id = field_info["field_id"] + if field_type == "checkbox": + checked_val = field_info["checked_value"] + unchecked_val = field_info["unchecked_value"] + if field_value != checked_val and field_value != unchecked_val: + return f'ERROR: Invalid value "{field_value}" for checkbox field "{field_id}". 
The checked value is "{checked_val}" and the unchecked value is "{unchecked_val}"' + elif field_type == "radio_group": + option_values = [opt["value"] for opt in field_info["radio_options"]] + if field_value not in option_values: + return f'ERROR: Invalid value "{field_value}" for radio group field "{field_id}". Valid values are: {option_values}' + elif field_type == "choice": + choice_values = [opt["value"] for opt in field_info["choice_options"]] + if field_value not in choice_values: + return f'ERROR: Invalid value "{field_value}" for choice field "{field_id}". Valid values are: {choice_values}' + return None + + +# pypdf (at least version 5.7.0) has a bug when setting the value for a selection list field. +# In _writer.py around line 966: +# +# if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: +# txt = "\n".join(annotation.get_inherited(FA.Opt, [])) +# +# The problem is that for selection lists, `get_inherited` returns a list of two-element lists like +# [["value1", "Text 1"], ["value2", "Text 2"], ...] +# This causes `join` to throw a TypeError because it expects an iterable of strings. +# The horrible workaround is to patch `get_inherited` to return a list of the value strings. +# We call the original method and adjust the return value only if the argument to `get_inherited` +# is `FA.Opt` and if the return value is a list of two-element lists. 
+def monkeypatch_pydpf_method(): + from pypdf.generic import DictionaryObject + from pypdf.constants import FieldDictionaryAttributes + + original_get_inherited = DictionaryObject.get_inherited + + def patched_get_inherited(self, key: str, default = None): + result = original_get_inherited(self, key, default) + if key == FieldDictionaryAttributes.Opt: + if isinstance(result, list) and all(isinstance(v, list) and len(v) == 2 for v in result): + result = [r[0] for r in result] + return result + + DictionaryObject.get_inherited = patched_get_inherited + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: fill_fillable_fields.py [input pdf] [field_values.json] [output pdf]") + sys.exit(1) + monkeypatch_pydpf_method() + input_pdf = sys.argv[1] + fields_json = sys.argv[2] + output_pdf = sys.argv[3] + fill_pdf_fields(input_pdf, fields_json, output_pdf) diff --git a/ruoyi-admin/src/main/resources/skills/pdf/scripts/fill_pdf_form_with_annotations.py b/ruoyi-admin/src/main/resources/skills/pdf/scripts/fill_pdf_form_with_annotations.py new file mode 100644 index 00000000..f9805313 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/pdf/scripts/fill_pdf_form_with_annotations.py @@ -0,0 +1,108 @@ +import json +import sys + +from pypdf import PdfReader, PdfWriter +from pypdf.annotations import FreeText + + +# Fills a PDF by adding text annotations defined in `fields.json`. See forms.md. 
+ + +def transform_coordinates(bbox, image_width, image_height, pdf_width, pdf_height): + """Transform bounding box from image coordinates to PDF coordinates""" + # Image coordinates: origin at top-left, y increases downward + # PDF coordinates: origin at bottom-left, y increases upward + x_scale = pdf_width / image_width + y_scale = pdf_height / image_height + + left = bbox[0] * x_scale + right = bbox[2] * x_scale + + # Flip Y coordinates for PDF + top = pdf_height - (bbox[1] * y_scale) + bottom = pdf_height - (bbox[3] * y_scale) + + return left, bottom, right, top + + +def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path): + """Fill the PDF form with data from fields.json""" + + # `fields.json` format described in forms.md. + with open(fields_json_path, "r") as f: + fields_data = json.load(f) + + # Open the PDF + reader = PdfReader(input_pdf_path) + writer = PdfWriter() + + # Copy all pages to writer + writer.append(reader) + + # Get PDF dimensions for each page + pdf_dimensions = {} + for i, page in enumerate(reader.pages): + mediabox = page.mediabox + pdf_dimensions[i + 1] = [mediabox.width, mediabox.height] + + # Process each form field + annotations = [] + for field in fields_data["form_fields"]: + page_num = field["page_number"] + + # Get page dimensions and transform coordinates. 
+ page_info = next(p for p in fields_data["pages"] if p["page_number"] == page_num) + image_width = page_info["image_width"] + image_height = page_info["image_height"] + pdf_width, pdf_height = pdf_dimensions[page_num] + + transformed_entry_box = transform_coordinates( + field["entry_bounding_box"], + image_width, image_height, + pdf_width, pdf_height + ) + + # Skip empty fields + if "entry_text" not in field or "text" not in field["entry_text"]: + continue + entry_text = field["entry_text"] + text = entry_text["text"] + if not text: + continue + + font_name = entry_text.get("font", "Arial") + font_size = str(entry_text.get("font_size", 14)) + "pt" + font_color = entry_text.get("font_color", "000000") + + # Font size/color seems to not work reliably across viewers: + # https://github.com/py-pdf/pypdf/issues/2084 + annotation = FreeText( + text=text, + rect=transformed_entry_box, + font=font_name, + font_size=font_size, + font_color=font_color, + border_color=None, + background_color=None, + ) + annotations.append(annotation) + # page_number is 0-based for pypdf + writer.add_annotation(page_number=page_num - 1, annotation=annotation) + + # Save the filled PDF + with open(output_pdf_path, "wb") as output: + writer.write(output) + + print(f"Successfully filled PDF form and saved to {output_pdf_path}") + print(f"Added {len(annotations)} text annotations") + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]") + sys.exit(1) + input_pdf = sys.argv[1] + fields_json = sys.argv[2] + output_pdf = sys.argv[3] + + fill_pdf_form(input_pdf, fields_json, output_pdf) \ No newline at end of file diff --git a/ruoyi-admin/src/main/resources/skills/xlsx/LICENSE.txt b/ruoyi-admin/src/main/resources/skills/xlsx/LICENSE.txt new file mode 100644 index 00000000..c55ab422 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/xlsx/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. 
All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/ruoyi-admin/src/main/resources/skills/xlsx/SKILL.md b/ruoyi-admin/src/main/resources/skills/xlsx/SKILL.md new file mode 100644 index 00000000..3fa1d994 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/xlsx/SKILL.md @@ -0,0 +1,362 @@ +--- +name: xlsx +description: "Comprehensive spreadsheet creation, editing, and analysis with support for formulas, formatting, data analysis, and visualization. 
When Claude needs to work with spreadsheets (.xlsx, .xlsm, .csv, .tsv, etc) for: (1) Creating new spreadsheets with formulas and formatting, (2) Reading or analyzing data, (3) Modify existing spreadsheets while preserving formulas, (4) Data analysis and visualization in spreadsheets, or (5) Recalculating formulas" +license: Proprietary. LICENSE.txt has complete terms +--- + +# Requirements for Outputs + +## All Excel files + +### Zero Formula Errors +- Every Excel model MUST be delivered with ZERO formula errors (#REF!, #DIV/0!, #VALUE!, #N/A, #NAME?) + +### Preserve Existing Templates (when updating templates) +- Study and EXACTLY match existing format, style, and conventions when modifying files +- Never impose standardized formatting on files with established patterns +- Existing template conventions ALWAYS override these guidelines + +## Financial models + +### Color Coding Standards +Unless otherwise stated by the user or existing template + +#### Industry-Standard Color Conventions +- **Blue text (RGB: 0,0,255)**: Hardcoded inputs, and numbers users will change for scenarios +- **Black text (RGB: 0,0,0)**: ALL formulas and calculations +- **Green text (RGB: 0,128,0)**: Links pulling from other worksheets within same workbook +- **Red text (RGB: 255,0,0)**: External links to other files +- **Yellow background (RGB: 255,255,0)**: Key assumptions needing attention or cells that need to be updated + +### Number Formatting Standards + +#### Required Format Rules +- **Years**: Format as text strings (e.g., "2024" not "2,024") +- **Currency**: Use $#,##0 format; ALWAYS specify units in headers ("Revenue ($mm)") +- **Zeros**: Use number formatting to make all zeros "-", including percentages (e.g., "$#,##0;($#,##0);-") +- **Percentages**: Default to 0.0% format (one decimal) +- **Multiples**: Format as 0.0x for valuation multiples (EV/EBITDA, P/E) +- **Negative numbers**: Use parentheses (123) not minus -123 + +### Formula Construction Rules + +#### Assumptions 
Placement +- Place ALL assumptions (growth rates, margins, multiples, etc.) in separate assumption cells +- Use cell references instead of hardcoded values in formulas +- Example: Use =B5*(1+$B$6) instead of =B5*1.05 + +#### Formula Error Prevention +- Verify all cell references are correct +- Check for off-by-one errors in ranges +- Ensure consistent formulas across all projection periods +- Test with edge cases (zero values, negative numbers) +- Verify no unintended circular references + +#### Documentation Requirements for Hardcodes +- Comment or in cells beside (if end of table). Format: "Source: [System/Document], [Date], [Specific Reference], [URL if applicable]" +- Examples: + - "Source: Company 10-K, FY2024, Page 45, Revenue Note, [SEC EDGAR URL]" + - "Source: Company 10-Q, Q2 2025, Exhibit 99.1, [SEC EDGAR URL]" + - "Source: Bloomberg Terminal, 8/15/2025, AAPL US Equity" + - "Source: FactSet, 8/20/2025, Consensus Estimates Screen" + +# XLSX creation, editing, and analysis + +## Overview + +A user may ask you to create, edit, or analyze the contents of an .xlsx file. You have different tools and workflows available for different tasks. + +## Important Requirements + +**LibreOffice Required for Formula Recalculation**: You can assume LibreOffice is installed for recalculating formula values using the `recalc.py` script. 
The script automatically configures LibreOffice on first run + +## Reading and analyzing data + +### Data analysis with pandas +For data analysis, visualization, and basic operations, use **pandas** which provides powerful data manipulation capabilities: + +```python +import pandas as pd + +# Read Excel +df = pd.read_excel('file.xlsx') # Default: first sheet +all_sheets = pd.read_excel('file.xlsx', sheet_name=None) # All sheets as dict + +# Analyze +df.head() # Preview data +df.info() # Column info +df.describe() # Statistics + +# Write Excel +df.to_excel('output.xlsx', index=False) +``` + +## Excel File Workflows + +## CRITICAL: Use Formulas, Not Hardcoded Values + +**Always use Excel formulas instead of calculating values in Python and hardcoding them.** This ensures the spreadsheet remains dynamic and updateable. + +### ❌ WRONG - Hardcoding Calculated Values +```python +# Bad: Calculating in Python and hardcoding result +total = df['Sales'].sum() +sheet['B10'] = total # Hardcodes 5000 + +# Bad: Computing growth rate in Python +growth = (df.iloc[-1]['Revenue'] - df.iloc[0]['Revenue']) / df.iloc[0]['Revenue'] +sheet['C5'] = growth # Hardcodes 0.15 + +# Bad: Python calculation for average +avg = sum(values) / len(values) +sheet['D20'] = avg # Hardcodes 42.5 +``` + +### ✅ CORRECT - Using Excel Formulas +```python +# Good: Let Excel calculate the sum +sheet['B10'] = '=SUM(B2:B9)' + +# Good: Growth rate as Excel formula +sheet['C5'] = '=(C4-C2)/C2' + +# Good: Average using Excel function +sheet['D20'] = '=AVERAGE(D2:D19)' +``` + +This applies to ALL calculations - totals, percentages, ratios, differences, etc. The spreadsheet should be able to recalculate when source data changes. + +## Common Workflow +1. **Choose tool**: pandas for data, openpyxl for formulas/formatting +2. **Create/Load**: Create new workbook or load existing file +3. **Modify**: Add/edit data, formulas, and formatting +4. **Save**: Write to file +5. 
**Recalculate formulas (MANDATORY IF USING FORMULAS)**: Use the recalc.py script + ```bash + python recalc.py output.xlsx + ``` +6. **Verify and fix any errors**: + - The script returns JSON with error details + - If `status` is `errors_found`, check `error_summary` for specific error types and locations + - Fix the identified errors and recalculate again + - Common errors to fix: + - `#REF!`: Invalid cell references + - `#DIV/0!`: Division by zero + - `#VALUE!`: Wrong data type in formula + - `#NAME?`: Unrecognized formula name + +### Creating new Excel files + +```python +# Using openpyxl for formulas and formatting +from openpyxl import Workbook +from openpyxl.styles import Font, PatternFill, Alignment + +wb = Workbook() +sheet = wb.active + +# Add data +sheet['A1'] = 'Hello' +sheet['B1'] = 'World' +sheet.append(['Row', 'of', 'data']) + +# Add formula +sheet['B2'] = '=SUM(A1:A10)' + +# Formatting +sheet['A1'].font = Font(bold=True, color='FF0000') +sheet['A1'].fill = PatternFill('solid', start_color='FFFF00') +sheet['A1'].alignment = Alignment(horizontal='center') + +# Column width +sheet.column_dimensions['A'].width = 20 + +wb.save('output.xlsx') +``` + +### Editing existing Excel files + +```python +# Using openpyxl to preserve formulas and formatting +from openpyxl import load_workbook + +# Load existing file +wb = load_workbook('existing.xlsx') +sheet = wb.active # or wb['SheetName'] for specific sheet + +# Working with multiple sheets +for sheet_name in wb.sheetnames: + sheet = wb[sheet_name] + print(f"Sheet: {sheet_name}") + +# Modify cells +sheet['A1'] = 'New Value' +sheet.insert_rows(2) # Insert row at position 2 +sheet.delete_cols(3) # Delete column 3 + +# Add new sheet +new_sheet = wb.create_sheet('NewSheet') +new_sheet['A1'] = 'Data' + +wb.save('modified.xlsx') +``` + +## Recalculating formulas + +Excel files created or modified by openpyxl contain formulas as strings but not calculated values. 
Use the provided `recalc.py` script to recalculate formulas: + +```bash +# Recalculate existing Excel file +python recalc.py <file.xlsx> [timeout_seconds] + +# Create new Excel workbook +python recalc.py --create <output.xlsx> '<json_config>' +``` + +Examples: +```bash +python recalc.py output.xlsx 30 + +python recalc.py --create output.xlsx '{"Sheet1": {"data": [["Name", "Value"], ["Test", 100]]}}' +``` + +The script: +- Automatically sets up LibreOffice macro on first run +- Recalculates all formulas in all sheets +- Scans ALL cells for Excel errors (#REF!, #DIV/0!, etc.) +- Returns JSON with detailed error locations and counts +- Works on Windows, Linux, and macOS +- Can create new workbooks from JSON configuration + +## Creating workbooks with recalc.py + +The `--create` mode allows you to create new Excel workbooks directly from JSON configuration: + +```bash +python recalc.py --create <output.xlsx> '<json_config>' +``` + +**Default Behavior**: If you provide a relative path (e.g., `test.xlsx`), the file will be created in the **project root** directory (`ruoyi-ai-v3/`). To save to a specific directory, use an absolute path. 
+ +### JSON Format + +```json +{ + "Sheet1": { + "data": [ + ["Header1", "Header2", "Header3"], + ["Value1", "Value2", "=A2*B2"], + ["Value3", "Value4", "=SUM(C2:C10)"] + ] + }, + "Sheet2": { + "data": [ + ["Name", "Score"], + ["Alice", 95], + ["Bob", 87] + ] + } +} +``` + +### Examples + +**Simple data table (saves to project root)**: +```bash +python recalc.py --create data.xlsx '{"Sheet1": {"data": [["Name", "Age"], ["John", 30], ["Jane", 28]]}}' +# Creates: ruoyi-ai-v3/data.xlsx +``` + +**With formulas (saves to project root)**: +```bash +python recalc.py --create model.xlsx '{"Sheet1": {"data": [["A", "B", "Sum"], [10, 20, "=A2+B2"], [5, 15, "=A3+B3"]]}}' +# Creates: ruoyi-ai-v3/model.xlsx +``` + +**Save to custom directory (use absolute path)**: +```bash +python recalc.py --create "D:\Downloads\custom.xlsx" '{"Sheet1": {"data": [["A", "B"], [1, 2]]}}' +# Creates: D:\Downloads\custom.xlsx +``` + +### File Path Options + +| Input | Output Location | +|-------|-----------------| +| `test.xlsx` | `ruoyi-ai-v3/test.xlsx` (project root) | +| `output/test.xlsx` | `ruoyi-ai-v3/output/test.xlsx` | +| `D:\Downloads\test.xlsx` | `D:\Downloads\test.xlsx` (absolute path) | + +### Features + +- First row is automatically formatted as bold header +- Supports both values and Excel formulas (strings starting with `=`) +- Multiple sheets can be created in one command +- Output file extension determines format (.xlsx, .xlsm, etc.) 
+ +## Formula Verification Checklist + +Quick checks to ensure formulas work correctly: + +### Essential Verification +- [ ] **Test 2-3 sample references**: Verify they pull correct values before building full model +- [ ] **Column mapping**: Confirm Excel columns match (e.g., column 64 = BL, not BK) +- [ ] **Row offset**: Remember Excel rows are 1-indexed (DataFrame row 5 = Excel row 6) + +### Common Pitfalls +- [ ] **NaN handling**: Check for null values with `pd.notna()` +- [ ] **Far-right columns**: FY data often in columns 50+ +- [ ] **Multiple matches**: Search all occurrences, not just first +- [ ] **Division by zero**: Check denominators before using `/` in formulas (#DIV/0!) +- [ ] **Wrong references**: Verify all cell references point to intended cells (#REF!) +- [ ] **Cross-sheet references**: Use correct format (Sheet1!A1) for linking sheets + +### Formula Testing Strategy +- [ ] **Start small**: Test formulas on 2-3 cells before applying broadly +- [ ] **Verify dependencies**: Check all cells referenced in formulas exist +- [ ] **Test edge cases**: Include zero, negative, and very large values + +### Interpreting recalc.py Output +The script returns JSON with error details: +```json +{ + "status": "success", // or "errors_found" + "total_errors": 0, // Total error count + "total_formulas": 42, // Number of formulas in file + "error_summary": { // Only present if errors found + "#REF!": { + "count": 2, + "locations": ["Sheet1!B5", "Sheet1!C10"] + } + } +} +``` + +## Best Practices + +### Library Selection +- **pandas**: Best for data analysis, bulk operations, and simple data export +- **openpyxl**: Best for complex formatting, formulas, and Excel-specific features + +### Working with openpyxl +- Cell indices are 1-based (row=1, column=1 refers to cell A1) +- Use `data_only=True` to read calculated values: `load_workbook('file.xlsx', data_only=True)` +- **Warning**: If opened with `data_only=True` and saved, formulas are replaced with values and 
permanently lost +- For large files: Use `read_only=True` for reading or `write_only=True` for writing +- Formulas are preserved but not evaluated - use recalc.py to update values + +### Working with pandas +- Specify data types to avoid inference issues: `pd.read_excel('file.xlsx', dtype={'id': str})` +- For large files, read specific columns: `pd.read_excel('file.xlsx', usecols=['A', 'C', 'E'])` +- Handle dates properly: `pd.read_excel('file.xlsx', parse_dates=['date_column'])` + +## Code Style Guidelines +**IMPORTANT**: When generating Python code for Excel operations: +- Write minimal, concise Python code without unnecessary comments +- Avoid verbose variable names and redundant operations +- Avoid unnecessary print statements + +**For Excel files themselves**: +- Add comments to cells with complex formulas or important assumptions +- Document data sources for hardcoded values +- Include notes for key calculations and model sections \ No newline at end of file diff --git a/ruoyi-admin/src/main/resources/skills/xlsx/recalc.py b/ruoyi-admin/src/main/resources/skills/xlsx/recalc.py new file mode 100644 index 00000000..e25865d3 --- /dev/null +++ b/ruoyi-admin/src/main/resources/skills/xlsx/recalc.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +""" +Excel Formula Recalculation Script +Recalculates all formulas in an Excel file using LibreOffice +""" + +import json +import sys +import subprocess +import os +import platform +from pathlib import Path +from openpyxl import load_workbook + + +def get_soffice_path(): + """Find LibreOffice soffice executable path for the current OS""" + system = platform.system() + + if system == 'Windows': + # Common LibreOffice installation paths on Windows + possible_paths = [ + Path(os.environ.get('PROGRAMFILES', 'C:\\Program Files')) / 'LibreOffice' / 'program' / 'soffice.exe', + Path(os.environ.get('PROGRAMFILES(X86)', 'C:\\Program Files (x86)')) / 'LibreOffice' / 'program' / 'soffice.exe', + Path(os.path.expanduser('~')) / 'AppData' / 
'Local' / 'LibreOffice' / 'program' / 'soffice.exe', + ] + + for path in possible_paths: + if path.exists(): + return str(path) + + # Try to find it using where command + try: + result = subprocess.run(['where', 'soffice.exe'], capture_output=True, text=True, timeout=5) + if result.returncode == 0: + return result.stdout.strip().split('\n')[0] + except Exception: + pass + + return None + else: + # For Linux and macOS, soffice is usually in PATH + return 'soffice' + + +def setup_libreoffice_macro(): + """Setup LibreOffice macro for recalculation if not already configured""" + system = platform.system() + + if system == 'Darwin': + macro_dir = os.path.expanduser('~/Library/Application Support/LibreOffice/4/user/basic/Standard') + elif system == 'Windows': + # Windows path for LibreOffice config + appdata = os.path.expanduser('~\\AppData\\Roaming\\LibreOffice\\4\\user\\basic\\Standard') + macro_dir = appdata + else: + # Linux + macro_dir = os.path.expanduser('~/.config/libreoffice/4/user/basic/Standard') + + macro_file = os.path.join(macro_dir, 'Module1.xba') + + if os.path.exists(macro_file): + try: + with open(macro_file, 'r') as f: + if 'RecalculateAndSave' in f.read(): + return True + except Exception: + pass + + if not os.path.exists(macro_dir): + soffice_path = get_soffice_path() + if not soffice_path: + return False + + try: + subprocess.run([soffice_path, '--headless', '--terminate_after_init'], + capture_output=True, timeout=10) + except Exception: + pass + + os.makedirs(macro_dir, exist_ok=True) + + macro_content = ''' + + + Sub RecalculateAndSave() + ThisComponent.calculateAll() + ThisComponent.store() + ThisComponent.close(True) + End Sub +''' + + try: + with open(macro_file, 'w') as f: + f.write(macro_content) + return True + except Exception: + return False + + +def recalc(filename, timeout=30): + """ + Recalculate formulas in Excel file and report any errors + + Args: + filename: Path to Excel file + timeout: Maximum time to wait for recalculation 
 (seconds) + + Returns: + dict with error locations and counts + """ + if not Path(filename).exists(): + return {'error': f'File {filename} does not exist'} + + abs_path = str(Path(filename).absolute()) + + if not setup_libreoffice_macro(): + return {'error': 'Failed to setup LibreOffice macro'} + + soffice_path = get_soffice_path() + if not soffice_path: + return {'error': 'LibreOffice not found. Please install LibreOffice.'} + + cmd = [ + soffice_path, '--headless', '--norestore', + 'vnd.sun.star.script:Standard.Module1.RecalculateAndSave?language=Basic&location=application', + abs_path + ] + + system = platform.system() + + # Handle timeout for different operating systems + if system == 'Windows': + # Windows: use taskkill as fallback, but rely on subprocess timeout + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) + except subprocess.TimeoutExpired: + return {'error': f'LibreOffice recalculation timed out after {timeout} seconds'} + elif system == 'Linux': + # Linux: use timeout command + timeout_cmd = 'timeout' + cmd = [timeout_cmd, str(timeout)] + cmd + result = subprocess.run(cmd, capture_output=True, text=True) + elif system == 'Darwin': + # macOS: try gtimeout first, fallback to timeout handling + timeout_cmd = None + try: + subprocess.run(['gtimeout', '--version'], capture_output=True, timeout=1, check=False) + timeout_cmd = 'gtimeout' + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + + if timeout_cmd: + cmd = [timeout_cmd, str(timeout)] + cmd + result = subprocess.run(cmd, capture_output=True, text=True) + else: + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) + except subprocess.TimeoutExpired: + return {'error': f'LibreOffice recalculation timed out after {timeout} seconds'} + else: + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0 and result.returncode != 124: # 124 is timeout exit code + error_msg = result.stderr or 
'Unknown error during recalculation' + if 'Module1' in error_msg or 'RecalculateAndSave' not in error_msg: + return {'error': 'LibreOffice macro not configured properly. Error: ' + error_msg} + else: + return {'error': error_msg} + + # Check for Excel errors in the recalculated file - scan ALL cells + try: + wb = load_workbook(filename, data_only=True) + + excel_errors = ['#VALUE!', '#DIV/0!', '#REF!', '#NAME?', '#NULL!', '#NUM!', '#N/A'] + error_details = {err: [] for err in excel_errors} + total_errors = 0 + + for sheet_name in wb.sheetnames: + ws = wb[sheet_name] + # Check ALL rows and columns - no limits + for row in ws.iter_rows(): + for cell in row: + if cell.value is not None and isinstance(cell.value, str): + for err in excel_errors: + if err in cell.value: + location = f"{sheet_name}!{cell.coordinate}" + error_details[err].append(location) + total_errors += 1 + break + + wb.close() + + # Build result summary + result = { + 'status': 'success' if total_errors == 0 else 'errors_found', + 'total_errors': total_errors, + 'error_summary': {} + } + + # Add non-empty error categories + for err_type, locations in error_details.items(): + if locations: + result['error_summary'][err_type] = { + 'count': len(locations), + 'locations': locations[:20] # Show up to 20 locations + } + + # Add formula count for context - also check ALL cells + wb_formulas = load_workbook(filename, data_only=False) + formula_count = 0 + for sheet_name in wb_formulas.sheetnames: + ws = wb_formulas[sheet_name] + for row in ws.iter_rows(): + for cell in row: + if cell.value and isinstance(cell.value, str) and cell.value.startswith('='): + formula_count += 1 + wb_formulas.close() + + result['total_formulas'] = formula_count + + return result + + except Exception as e: + return {'error': str(e)} + + +def get_project_root(): + """Get project root directory (ruoyi-ai-v3)""" + current = os.path.dirname(os.path.abspath(__file__)) + # Traverse up 8 levels from xlsx/recalc.py to project root + for _ 
in range(8): + current = os.path.dirname(current) + return current + + +def create_workbook(output_path, sheets_config): + """ + Create a new Excel workbook with specified sheets and data + + Args: + output_path: Path where the workbook will be saved + sheets_config: Dict with sheet configurations + Example: { + 'Sheet1': { + 'data': [ + ['Header1', 'Header2'], + ['Value1', '=B2*2'] + ], + 'formulas': False # Whether sheet contains formulas + } + } + + Returns: + dict with success/error status + """ + try: + from openpyxl import Workbook + from openpyxl.styles import Font, Alignment + + wb = Workbook() + wb.remove(wb.active) # Remove default sheet + + for sheet_name, config in sheets_config.items(): + ws = wb.create_sheet(sheet_name) + data = config.get('data', []) + + for row_idx, row_data in enumerate(data, 1): + for col_idx, cell_value in enumerate(row_data, 1): + cell = ws.cell(row=row_idx, column=col_idx, value=cell_value) + # Make header row bold + if row_idx == 1: + cell.font = Font(bold=True) + + wb.save(output_path) + return {'status': 'success', 'message': f'Workbook created at {output_path}'} + + except Exception as e: + return {'status': 'error', 'message': str(e)} + + +def main(): + if len(sys.argv) < 2: + print("Usage:") + print(" python recalc.py [timeout_seconds] # Recalculate formulas") + print(" python recalc.py --create # Create new workbook") + print("\nRecalculate formulas:") + print(" Recalculates all formulas in an Excel file using LibreOffice") + print(" Returns JSON with error details") + print("\nCreate workbook:") + print(" data_json format: '{\"Sheet1\": {\"data\": [[\"A\", \"B\"], [1, 2]]}}'") + sys.exit(1) + + if sys.argv[1] == '--create': + if len(sys.argv) < 4: + print("Error: --create requires output_file and data_json") + sys.exit(1) + output_file = sys.argv[2] + # If relative path, place in project root + if not os.path.isabs(output_file): + project_root = get_project_root() + output_file = os.path.join(project_root, output_file) 
+ data_json = sys.argv[3] + try: + sheets_config = json.loads(data_json) + result = create_workbook(output_file, sheets_config) + print(json.dumps(result, indent=2)) + except json.JSONDecodeError as e: + print(json.dumps({'status': 'error', 'message': f'Invalid JSON: {str(e)}'}, indent=2)) + else: + filename = sys.argv[1] + timeout = int(sys.argv[2]) if len(sys.argv) > 2 else 30 + result = recalc(filename, timeout) + print(json.dumps(result, indent=2)) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/ruoyi-common/ruoyi-common-chat/src/main/java/org/ruoyi/common/chat/domain/dto/request/AgentChatRequest.java b/ruoyi-common/ruoyi-common-chat/src/main/java/org/ruoyi/common/chat/domain/dto/request/AgentChatRequest.java new file mode 100644 index 00000000..1db39338 --- /dev/null +++ b/ruoyi-common/ruoyi-common-chat/src/main/java/org/ruoyi/common/chat/domain/dto/request/AgentChatRequest.java @@ -0,0 +1,26 @@ +package org.ruoyi.common.chat.domain.dto.request; + +import jakarta.validation.constraints.NotEmpty; +import lombok.Data; + +/** + * Agent 对话请求对象(简化版) + * + * @author ageerle@163.com + * @date 2025/04/10 + */ +@Data +public class AgentChatRequest { + + /** + * 对话消息 + */ + @NotEmpty(message = "对话消息不能为空") + private String content; + + /** + * 会话id(可选,不传则不保存历史) + */ + private Long sessionId; + +} diff --git a/ruoyi-common/ruoyi-common-chat/src/main/java/org/ruoyi/common/chat/domain/dto/request/ChatRequest.java b/ruoyi-common/ruoyi-common-chat/src/main/java/org/ruoyi/common/chat/domain/dto/request/ChatRequest.java index f14fb086..23e42375 100644 --- a/ruoyi-common/ruoyi-common-chat/src/main/java/org/ruoyi/common/chat/domain/dto/request/ChatRequest.java +++ b/ruoyi-common/ruoyi-common-chat/src/main/java/org/ruoyi/common/chat/domain/dto/request/ChatRequest.java @@ -3,8 +3,13 @@ package org.ruoyi.common.chat.domain.dto.request; import com.alibaba.fastjson.annotation.JSONField; import com.fasterxml.jackson.databind.annotation.JsonSerialize; 
import com.fasterxml.jackson.databind.ser.std.ToStringSerializer; +import dev.langchain4j.data.message.ChatMessage; import jakarta.validation.constraints.NotEmpty; import lombok.Data; +import org.ruoyi.common.chat.domain.vo.chat.ChatModelVo; +import org.springframework.web.servlet.mvc.method.annotation.SseEmitter; + +import java.util.List; /** @@ -62,7 +67,6 @@ public class ChatRequest { */ private String appId; - /** * 对话id(每个聊天窗口都不一样) */ @@ -76,8 +80,28 @@ public class ChatRequest { private Boolean enableThinking = false; /** - * 是否支持联网 + * 对话模型详情 */ - private Boolean enableInternet; + private ChatModelVo chatModelVo; + + /** + * 对话事件 + */ + private SseEmitter emitter; + + /** + * 当前登录用户id + */ + private Long userId; + + /** + * 当前登录用户TOKEN + */ + private String tokenValue; + + /** + * 完整的上下文 + */ + private List contextMessages; } diff --git a/ruoyi-common/ruoyi-common-security/src/main/java/org/ruoyi/common/security/config/SecurityConfig.java b/ruoyi-common/ruoyi-common-security/src/main/java/org/ruoyi/common/security/config/SecurityConfig.java index c068cd11..e358ced6 100644 --- a/ruoyi-common/ruoyi-common-security/src/main/java/org/ruoyi/common/security/config/SecurityConfig.java +++ b/ruoyi-common/ruoyi-common-security/src/main/java/org/ruoyi/common/security/config/SecurityConfig.java @@ -1,5 +1,6 @@ package org.ruoyi.common.security.config; +import cn.dev33.satoken.context.SaTokenContextForThreadLocalStaff; import cn.dev33.satoken.exception.NotLoginException; import cn.dev33.satoken.filter.SaServletFilter; import cn.dev33.satoken.httpauth.basic.SaHttpBasicUtil; @@ -49,6 +50,11 @@ public class SecurityConfig implements WebMvcConfigurer { public void addInterceptors(InterceptorRegistry registry) { // 注册路由拦截器,自定义验证规则 registry.addInterceptor(new SaInterceptor(handler -> { + // 异步线程中 SaToken 上下文不存在,跳过检查 + // 这避免了 SSE 流式响应完成后 emitter.complete() 触发的问题 + if (SaTokenContextForThreadLocalStaff.getModelBoxOrNull() == null) { + return; + } AllUrlHandler allUrlHandler = 
SpringUtils.getBean(AllUrlHandler.class); // 登录验证 -- 排除多个路径 SaRouter diff --git a/ruoyi-common/ruoyi-common-sse/src/main/java/org/ruoyi/common/sse/core/SseEmitterManager.java b/ruoyi-common/ruoyi-common-sse/src/main/java/org/ruoyi/common/sse/core/SseEmitterManager.java index 53086b20..fe4835db 100644 --- a/ruoyi-common/ruoyi-common-sse/src/main/java/org/ruoyi/common/sse/core/SseEmitterManager.java +++ b/ruoyi-common/ruoyi-common-sse/src/main/java/org/ruoyi/common/sse/core/SseEmitterManager.java @@ -202,12 +202,14 @@ public class SseEmitterManager { public void sendEvent(Long userId, SseEventDto eventDto) { Map emitters = USER_TOKEN_EMITTERS.get(userId); if (MapUtil.isNotEmpty(emitters)) { + log.debug("【SSE发送】userId: {}, emitter数量: {}, event: {}", userId, emitters.size(), eventDto.getEvent()); for (Map.Entry entry : emitters.entrySet()) { try { entry.getValue().send(SseEmitter.event() .name(eventDto.getEvent()) .data(JSONUtil.toJsonStr(eventDto))); } catch (Exception e) { + log.error("【SSE发送失败】userId: {}, token: {}, error: {}", userId, entry.getKey(), e.getMessage()); SseEmitter remove = emitters.remove(entry.getKey()); if (remove != null) { remove.complete(); @@ -215,6 +217,7 @@ public class SseEmitterManager { } } } else { + log.warn("【SSE发送失败】userId: {} 没有活跃的SSE连接, 当前连接用户: {}", userId, USER_TOKEN_EMITTERS.keySet()); USER_TOKEN_EMITTERS.remove(userId); } } diff --git a/ruoyi-common/ruoyi-common-sse/src/main/java/org/ruoyi/common/sse/dto/SseEventDto.java b/ruoyi-common/ruoyi-common-sse/src/main/java/org/ruoyi/common/sse/dto/SseEventDto.java index 68081926..f42f7232 100644 --- a/ruoyi-common/ruoyi-common-sse/src/main/java/org/ruoyi/common/sse/dto/SseEventDto.java +++ b/ruoyi-common/ruoyi-common-sse/src/main/java/org/ruoyi/common/sse/dto/SseEventDto.java @@ -89,4 +89,21 @@ public class SseEventDto implements Serializable { .error(error) .build(); } + + /** + * 创建 MCP 工具事件 + */ + public static SseEventDto mcpTool(String toolName, String status, String result) { + 
return SseEventDto.builder() + .event("mcp_tool") + .content(buildMcpJson(toolName, status, result)) + .build(); + } + + private static String buildMcpJson(String toolName, String status, String result) { + return String.format("{\"toolName\":\"%s\",\"status\":\"%s\",\"result\":\"%s\"}", + toolName != null ? toolName.replace("\"", "\\\"") : "", + status != null ? status : "", + result != null ? result.replace("\"", "\\\"").replace("\n", "\\n") : ""); + } } \ No newline at end of file diff --git a/ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/aspectj/DemoAspect.java b/ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/aspectj/DemoAspect.java new file mode 100644 index 00000000..a1c5afa9 --- /dev/null +++ b/ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/aspectj/DemoAspect.java @@ -0,0 +1,86 @@ +package org.ruoyi.common.web.aspectj; + +import jakarta.servlet.http.HttpServletRequest; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.aspectj.lang.ProceedingJoinPoint; +import org.aspectj.lang.annotation.Around; +import org.aspectj.lang.annotation.Aspect; +import org.ruoyi.common.core.exception.ServiceException; +import org.ruoyi.common.core.utils.ServletUtils; +import org.ruoyi.common.web.config.properties.DemoProperties; + +import java.util.Set; + +/** + * 演示模式切面 - 拦截写操作 + * + * @author ruoyi + */ +@Slf4j +@Aspect +@RequiredArgsConstructor +public class DemoAspect { + + private final DemoProperties demoProperties; + + /** + * 需要拦截的 HTTP 方法 + */ + private static final Set WRITE_METHODS = Set.of( + "POST", "PUT", "DELETE", "PATCH" + ); + + /** + * 拦截所有 Controller 的写操作 + */ + @Around("execution(* org.ruoyi..controller..*.*(..))") + public Object around(ProceedingJoinPoint point) throws Throwable { + // 未开启演示模式,直接放行 + if (!Boolean.TRUE.equals(demoProperties.getEnabled())) { + return point.proceed(); + } + + HttpServletRequest request = ServletUtils.getRequest(); + if (request == null) { 
+ return point.proceed(); + } + + String method = request.getMethod(); + String requestUri = request.getRequestURI(); + + // 非写操作,放行 + if (!WRITE_METHODS.contains(method.toUpperCase())) { + return point.proceed(); + } + + // 检查排除路径 + for (String exclude : demoProperties.getExcludes()) { + if (match(exclude, requestUri)) { + return point.proceed(); + } + } + + log.info("演示模式拦截写操作: {} {}", method, requestUri); + throw new ServiceException(demoProperties.getMessage()); + } + + /** + * 路径匹配(支持通配符 * 和 **) + */ + private boolean match(String pattern, String path) { + if (pattern.endsWith("/**")) { + String prefix = pattern.substring(0, pattern.length() - 3); + return path.startsWith(prefix); + } + if (pattern.endsWith("/*")) { + String prefix = pattern.substring(0, pattern.length() - 2); + if (!path.startsWith(prefix)) { + return false; + } + String suffix = path.substring(prefix.length()); + return suffix.indexOf('/') == -1; + } + return path.equals(pattern); + } +} \ No newline at end of file diff --git a/ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/config/DemoAutoConfiguration.java b/ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/config/DemoAutoConfiguration.java new file mode 100644 index 00000000..a67a7a81 --- /dev/null +++ b/ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/config/DemoAutoConfiguration.java @@ -0,0 +1,25 @@ +package org.ruoyi.common.web.config; + +import org.ruoyi.common.web.aspectj.DemoAspect; +import org.ruoyi.common.web.config.properties.DemoProperties; +import org.springframework.boot.context.properties.EnableConfigurationProperties; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +/** + * 演示模式自动配置 + * + * @author ruoyi + */ +@Configuration +@EnableConfigurationProperties(DemoProperties.class) +public class DemoAutoConfiguration { + + /** + * 注册演示模式切面 + */ + @Bean + public DemoAspect demoAspect(DemoProperties demoProperties) 
{ + return new DemoAspect(demoProperties); + } +} \ No newline at end of file diff --git a/ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/config/properties/DemoProperties.java b/ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/config/properties/DemoProperties.java new file mode 100644 index 00000000..f04fc665 --- /dev/null +++ b/ruoyi-common/ruoyi-common-web/src/main/java/org/ruoyi/common/web/config/properties/DemoProperties.java @@ -0,0 +1,32 @@ +package org.ruoyi.common.web.config.properties; + +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; + +import java.util.ArrayList; +import java.util.List; + +/** + * 演示模式 配置属性 + * + * @author ruoyi + */ +@Data +@ConfigurationProperties(prefix = "demo") +public class DemoProperties { + + /** + * 是否开启演示模式 + */ + private Boolean enabled = false; + + /** + * 提示消息 + */ + private String message = "演示模式,不允许进行写操作"; + + /** + * 排除的路径(这些路径不受演示模式限制) + */ + private List excludes = new ArrayList<>(); +} \ No newline at end of file diff --git a/ruoyi-common/ruoyi-common-web/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports b/ruoyi-common/ruoyi-common-web/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports index e1eb5138..9a3a59f9 100644 --- a/ruoyi-common/ruoyi-common-web/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports +++ b/ruoyi-common/ruoyi-common-web/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports @@ -1,4 +1,5 @@ org.ruoyi.common.web.config.CaptchaConfig +org.ruoyi.common.web.config.DemoAutoConfiguration org.ruoyi.common.web.config.FilterConfig org.ruoyi.common.web.config.I18nConfig org.ruoyi.common.web.config.ResourcesConfig diff --git a/ruoyi-modules/ruoyi-chat/pom.xml b/ruoyi-modules/ruoyi-chat/pom.xml index 6cb40321..43ea2b1e 100644 --- 
a/ruoyi-modules/ruoyi-chat/pom.xml +++ b/ruoyi-modules/ruoyi-chat/pom.xml @@ -103,6 +103,21 @@ ${langchain4j.community.version} + + + dev.langchain4j + langchain4j-skills + ${langchain4j.community.version} + + + + + dev.langchain4j + langchain4j-experimental-skills-shell + ${langchain4j.community.version} + + + org.testcontainers weaviate @@ -152,6 +167,13 @@ mysql-connector-j + + + org.springframework.boot + spring-boot-starter-test + test + + diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/agent/SkillsAgent.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/agent/SkillsAgent.java new file mode 100644 index 00000000..d0831441 --- /dev/null +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/agent/SkillsAgent.java @@ -0,0 +1,35 @@ +package org.ruoyi.agent; + +import dev.langchain4j.agentic.Agent; +import dev.langchain4j.service.SystemMessage; +import dev.langchain4j.service.UserMessage; +import dev.langchain4j.service.V; + +/** + * 技能管理 Agent + * 管理 docx、pdf、xlsx 等文档处理技能 + * + *

可用技能: + *

    + *
  • docx - Word 文档创建、编辑和分析
  • + *
  • pdf - PDF 文档处理、提取文本和表格
  • + *
  • xlsx - Excel 电子表格创建、编辑和分析
  • + *
+ * + * @author ageerle@163.com + * @date 2026/04/10 + */ +public interface SkillsAgent { + + @SystemMessage(""" + 你是一个文档处理技能助手,能够使用 activate_skill 工具激活特定技能来处理各种文档任务。 + 使用指南: + 1. 根据用户请求判断需要哪个技能 + 2. 使用 activate_skill("skill-name") 激活对应技能 + 3. 按照技能指令执行任务 + 4. 如果需要参考文件,使用 read_skill_resource 读取 + """) + @UserMessage("{{query}}") + @Agent("文档处理技能助手,支持 Word、PDF、Excel 文档的创建、编辑和分析") + String process(@V("query") String query); +} diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/agent/WebSearchAgent.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/agent/WebSearchAgent.java index 33f8737e..15a6ec52 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/agent/WebSearchAgent.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/agent/WebSearchAgent.java @@ -6,33 +6,26 @@ import dev.langchain4j.service.UserMessage; import dev.langchain4j.service.V; /** - * Web Search Agent - * A web search assistant that answers natural language questions by searching the internet - * and returning relevant information from web pages. + * 浏览器工具 Agent + * 能够操作浏览器相关工具:网络搜索、网页抓取、浏览器自动化等 + * + * @author ageerle@163.com + * @date 2025/04/10 */ public interface WebSearchAgent { @SystemMessage(""" - You are a web search assistant. Answer questions by searching and retrieving web content. + 你是一个系统工具助手,能够使用工具来帮助用户获取信息和操作浏览器。 - Available tools: - 1. bing_search: Search the internet with keywords - - query (required): search keywords - - count (optional): number of results, default 10, max 50 - - offset (optional): pagination offset, default 0 - Returns: title, link, and summary for each result - - 2. 
crawl_webpage: Extract text content from a web page - - url (required): web page URL - Returns: cleaned page title and main content - - Instructions: - - Always cite sources in your answers - - Only use the two tools listed above + 【最重要原则】 + 除非用户明确要求使用浏览器查询信息,否则不要主动调用任何搜索或浏览器工具。 + 使用指南: + - 搜索信息时使用 bing_search + - 需要详细网页内容时使用 crawl_webpage + - 需要交互操作(登录、点击、填写表单)时使用 Playwright 工具 + - 在回答中注明信息来源 """) - @UserMessage(""" - Answer the following question by searching the web: {{query}} - """) - @Agent("Web search assistant using Bing search and web scraping to find and retrieve information") + @UserMessage("{{query}}") + @Agent("浏览器工具助手,支持网络搜索、网页抓取和浏览器自动化操作") String search(@V("query") String query); } diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/controller/chat/ChatController.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/controller/chat/ChatController.java index d265206f..5a57c0e0 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/controller/chat/ChatController.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/controller/chat/ChatController.java @@ -4,6 +4,7 @@ import jakarta.servlet.http.HttpServletRequest; import jakarta.validation.Valid; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.ruoyi.common.chat.domain.dto.request.AgentChatRequest; import org.ruoyi.common.chat.domain.dto.request.ChatRequest; import org.ruoyi.service.chat.impl.ChatServiceFacade; import org.springframework.stereotype.Controller; diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/MyAgentListener.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/MyAgentListener.java index 299ea6e9..42c1720b 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/MyAgentListener.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/MyAgentListener.java @@ -125,21 +125,21 @@ public class MyAgentListener implements 
dev.langchain4j.agentic.observability.Ag } // ==================== 工具执行生命周期 ==================== - - @Override - public void beforeToolExecution(BeforeToolExecution beforeToolExecution) { - var toolRequest = beforeToolExecution.request(); - log.info("【工具执行前】工具请求ID: {}", toolRequest.id()); - log.info("【工具执行前】工具名称: {}", toolRequest.name()); - log.info("【工具执行前】工具参数: {}", toolRequest.arguments()); - } - - @Override - public void afterToolExecution(ToolExecution toolExecution) { - var toolRequest = toolExecution.request(); - log.info("【工具执行后】工具请求ID: {}", toolRequest.id()); - log.info("【工具执行后】工具名称: {}", toolRequest.name()); - log.info("【工具执行后】工具执行结果: {}", toolExecution.result()); - log.info("【工具执行后】工具执行是否失败: {}", toolExecution.hasFailed()); - } +// +// @Override +// public void beforeToolExecution(BeforeToolExecution beforeToolExecution) { +// var toolRequest = beforeToolExecution.request(); +// log.info("【工具执行前】工具请求ID: {}", toolRequest.id()); +// log.info("【工具执行前】工具名称: {}", toolRequest.name()); +// log.info("【工具执行前】工具参数: {}", toolRequest.arguments()); +// } +// +// @Override +// public void afterToolExecution(ToolExecution toolExecution) { +// var toolRequest = toolExecution.request(); +// log.info("【工具执行后】工具请求ID: {}", toolRequest.id()); +// log.info("【工具执行后】工具名称: {}", toolRequest.name()); +// log.info("【工具执行后】工具执行结果: {}", toolExecution.result()); +// log.info("【工具执行后】工具执行是否失败: {}", toolExecution.hasFailed()); +// } } diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/MyMcpClientListener.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/MyMcpClientListener.java index fe19d728..c1457100 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/MyMcpClientListener.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/MyMcpClientListener.java @@ -1,23 +1,37 @@ package org.ruoyi.observability; +import com.fasterxml.jackson.core.JsonProcessingException; +import 
com.fasterxml.jackson.databind.ObjectMapper; import dev.langchain4j.invocation.InvocationContext; import dev.langchain4j.mcp.client.McpCallContext; import dev.langchain4j.mcp.client.McpClientListener; import dev.langchain4j.mcp.client.McpGetPromptResult; import dev.langchain4j.mcp.client.McpReadResourceResult; -import dev.langchain4j.mcp.protocol.McpClientMessage; +import dev.langchain4j.mcp.protocol.*; import dev.langchain4j.service.tool.ToolExecutionResult; import lombok.extern.slf4j.Slf4j; +import org.ruoyi.common.sse.dto.SseEventDto; +import org.ruoyi.common.sse.utils.SseMessageUtils; +import java.util.HashMap; import java.util.Map; /** - * 自定义的 McpClientListener 的监听器。 - * 监听 MCP 客户端相关的所有可观测性事件,包括: + * MCP 客户端监听器 + *

+ * 监听 MCP 工具执行事件,并通过 SSE 推送到前端 + *

+ * SSE 推送格式: + *

+ * {
+ *   "event": "mcp",
+ *   "content": "{\"name\":\"工具名称\",\"status\":\"pending|success|error\",\"result\":\"执行结果\"}"
+ * }
+ * 
+ * 前端区分方式: *
    - *
  • MCP 工具执行的开始/成功/错误事件
  • - *
  • MCP 资源读取的开始/成功/错误事件
  • - *
  • MCP 提示词获取的开始/成功/错误事件
  • + *
  • 对话内容:event="content"
  • + *
  • MCP 事件:event="mcp"
  • *
* * @author evo @@ -25,112 +39,135 @@ import java.util.Map; @Slf4j public class MyMcpClientListener implements McpClientListener { - // ==================== 工具执行 ==================== + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private final Long userId; + + public MyMcpClientListener(Long userId) { + this.userId = userId; + } + + public MyMcpClientListener() { + this.userId = null; + } + + // ==================== 工具执行 ==================== @Override public void beforeExecuteTool(McpCallContext context) { - InvocationContext invocationContext = context.invocationContext(); - McpClientMessage message = context.message(); + McpClientRequest message = (McpClientRequest) context.message(); + McpClientParams params = message.getParams(); + if (params instanceof McpCallToolParams callToolParams) { + String name = callToolParams.getName(); + log.info("工具调用之前:{}",name); + pushMcpEvent(name, "pending", null); + } - log.info("【MCP工具执行前】调用唯一标识符: {}", invocationContext.invocationId()); - log.info("【MCP工具执行前】MCP消息ID: {}", message.getId()); - log.info("【MCP工具执行前】MCP方法: {}", message.method); } @Override public void afterExecuteTool(McpCallContext context, ToolExecutionResult result, Map rawResult) { - InvocationContext invocationContext = context.invocationContext(); - McpClientMessage message = context.message(); - - log.info("【MCP工具执行后】调用唯一标识符: {}", invocationContext.invocationId()); - log.info("【MCP工具执行后】MCP消息ID: {}", message.getId()); - log.info("【MCP工具执行后】MCP方法: {}", message.method); - log.info("【MCP工具执行后】工具执行结果: {}", result); - log.info("【MCP工具执行后】原始结果: {}", rawResult); + McpClientRequest message = (McpClientRequest) context.message(); + McpClientParams params = message.getParams(); + if (params instanceof McpCallToolParams callToolParams) { + String name = callToolParams.getName(); + String resultText = result != null ? 
result.toString() : ""; + log.info("工具调用之后:{},返回结果{}",name,result); + pushMcpEvent(name, "success", truncate(resultText, 500)); + } } @Override public void onExecuteToolError(McpCallContext context, Throwable error) { - InvocationContext invocationContext = context.invocationContext(); - McpClientMessage message = context.message(); - - log.error("【MCP工具执行错误】调用唯一标识符: {}", invocationContext.invocationId()); - log.error("【MCP工具执行错误】MCP消息ID: {}", message.getId()); - log.error("【MCP工具执行错误】MCP方法: {}", message.method); - log.error("【MCP工具执行错误】错误类型: {}", error.getClass().getName()); - log.error("【MCP工具执行错误】错误信息: {}", error.getMessage(), error); + String toolName = getMethodName(context); + log.error("【MCP工具执行错误】工具: {}, 错误: {}", toolName, error.getMessage()); + pushMcpEvent(toolName, "error", error.getMessage()); } // ==================== 资源读取 ==================== @Override public void beforeResourceGet(McpCallContext context) { - InvocationContext invocationContext = context.invocationContext(); - McpClientMessage message = context.message(); - - log.info("【MCP资源读取前】调用唯一标识符: {}", invocationContext.invocationId()); - log.info("【MCP资源读取前】MCP消息ID: {}", message.getId()); - log.info("【MCP资源读取前】MCP方法: {}", message.method); + String name = getMethodName(context); + log.info("【MCP资源读取前】资源: {}", name); + pushMcpEvent(name, "pending", null); } @Override public void afterResourceGet(McpCallContext context, McpReadResourceResult result, Map rawResult) { - InvocationContext invocationContext = context.invocationContext(); - McpClientMessage message = context.message(); - - log.info("【MCP资源读取后】调用唯一标识符: {}", invocationContext.invocationId()); - log.info("【MCP资源读取后】MCP消息ID: {}", message.getId()); - log.info("【MCP资源读取后】MCP方法: {}", message.method); - log.info("【MCP资源读取后】资源内容数量: {}", result.contents() != null ? result.contents().size() : 0); - log.info("【MCP资源读取后】原始结果: {}", rawResult); + String name = getMethodName(context); + int count = result.contents() != null ? 
result.contents().size() : 0; + log.info("【MCP资源读取后】资源: {}, 数量: {}", name, count); + pushMcpEvent(name, "success", "读取 " + count + " 条资源"); } @Override public void onResourceGetError(McpCallContext context, Throwable error) { - InvocationContext invocationContext = context.invocationContext(); - McpClientMessage message = context.message(); - - log.error("【MCP资源读取错误】调用唯一标识符: {}", invocationContext.invocationId()); - log.error("【MCP资源读取错误】MCP消息ID: {}", message.getId()); - log.error("【MCP资源读取错误】MCP方法: {}", message.method); - log.error("【MCP资源读取错误】错误类型: {}", error.getClass().getName()); - log.error("【MCP资源读取错误】错误信息: {}", error.getMessage(), error); + String name = getMethodName(context); + log.error("【MCP资源读取错误】资源: {}, 错误: {}", name, error.getMessage()); + pushMcpEvent(name, "error", error.getMessage()); } // ==================== 提示词获取 ==================== @Override public void beforePromptGet(McpCallContext context) { - InvocationContext invocationContext = context.invocationContext(); - McpClientMessage message = context.message(); - - log.info("【MCP提示词获取前】调用唯一标识符: {}", invocationContext.invocationId()); - log.info("【MCP提示词获取前】MCP消息ID: {}", message.getId()); - log.info("【MCP提示词获取前】MCP方法: {}", message.method); + String name = getMethodName(context); + log.info("【MCP提示词获取前】提示词: {}", name); + pushMcpEvent(name, "pending", null); } @Override public void afterPromptGet(McpCallContext context, McpGetPromptResult result, Map rawResult) { - InvocationContext invocationContext = context.invocationContext(); - McpClientMessage message = context.message(); - - log.info("【MCP提示词获取后】调用唯一标识符: {}", invocationContext.invocationId()); - log.info("【MCP提示词获取后】MCP消息ID: {}", message.getId()); - log.info("【MCP提示词获取后】MCP方法: {}", message.method); - log.info("【MCP提示词获取后】提示词描述: {}", result.description()); - log.info("【MCP提示词获取后】提示词消息数量: {}", result.messages() != null ? 
result.messages().size() : 0); - log.info("【MCP提示词获取后】原始结果: {}", rawResult); + String name = getMethodName(context); + int count = result.messages() != null ? result.messages().size() : 0; + log.info("【MCP提示词获取后】提示词: {}, 消息数: {}", name, count); + pushMcpEvent(name, "success", "获取 " + count + " 条消息"); } @Override public void onPromptGetError(McpCallContext context, Throwable error) { - InvocationContext invocationContext = context.invocationContext(); - McpClientMessage message = context.message(); + String name = getMethodName(context); + log.error("【MCP提示词获取错误】提示词: {}, 错误: {}", name, error.getMessage()); + pushMcpEvent(name, "error", error.getMessage()); + } - log.error("【MCP提示词获取错误】调用唯一标识符: {}", invocationContext.invocationId()); - log.error("【MCP提示词获取错误】MCP消息ID: {}", message.getId()); - log.error("【MCP提示词获取错误】MCP方法: {}", message.method); - log.error("【MCP提示词获取错误】错误类型: {}", error.getClass().getName()); - log.error("【MCP提示词获取错误】错误信息: {}", error.getMessage(), error); + // ==================== 辅助方法 ==================== + + private String getMethodName(McpCallContext context) { + try { + McpClientMessage message = context.message(); + return message.method != null ? message.method.toString() : "unknown"; + } catch (Exception e) { + return "unknown"; + } + } + + /** + * 推送 MCP 事件到前端 + */ + private void pushMcpEvent(String name, String status, String result) { + if (userId == null) { + log.warn("userId 为空,无法推送 MCP 事件"); + return; + } + try { + Map content = new HashMap<>(); + content.put("name", name); + content.put("status", status); + content.put("result", result); + + String json = OBJECT_MAPPER.writeValueAsString(content); + SseMessageUtils.sendEvent(userId, SseEventDto.builder() + .event("mcp") + .content(json) + .build()); + } catch (JsonProcessingException e) { + log.error("序列化 MCP 事件失败: {}", e.getMessage()); + } + } + + private String truncate(String str, int maxLen) { + if (str == null) return null; + return str.length() > maxLen ? 
str.substring(0, maxLen) + "..." : str; } } diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/OutputChannel.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/OutputChannel.java new file mode 100644 index 00000000..bddd8257 --- /dev/null +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/OutputChannel.java @@ -0,0 +1,128 @@ +package org.ruoyi.observability; + +import java.util.Map; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Consumer; + +/** + * 跨线程事件总线 + * + * 写入端(异步线程):StreamingOutputWrapper / SupervisorStreamListener + * 读取端(SSE 线程):ChatServiceFacade.drain + * + * 调用链路: + * SSE请求 -> 创建 OutputChannel + * -> Supervisor.invoke() [同步阻塞调用子Agent] + * ├── SupervisorStreamListener -> channel.send() + * └── searchAgent.search() + * └── StreamingOutputWrapper -> channel.send() [每个token] + * -> channel.complete() + * drain线程 -> channel.drain() -> SSE实时推送 + * + * @author ageerle@163.com + * @date 2025/04/10 + */ +public class OutputChannel { + + private static final String DONE = "__DONE__"; + private static final Map REGISTRY = new ConcurrentHashMap<>(); + + private final BlockingQueue queue = new LinkedBlockingQueue<>(4096); + private final AtomicReference error = new AtomicReference<>(); + private final CountDownLatch completed = new CountDownLatch(1); + + /** + * 创建并注册到全局注册表 + */ + public static OutputChannel create(String requestId) { + OutputChannel ch = new OutputChannel(); + REGISTRY.put(requestId, ch); + return ch; + } + + /** + * 从全局注册表移除 + */ + public static void remove(String requestId) { + REGISTRY.remove(requestId); + } + + /** + * 从全局注册表获取 + */ + public static OutputChannel get(String requestId) { + return REGISTRY.get(requestId); + } + + /** + * 写入:线程安全,非阻塞,队列满时丢弃 + */ +public void send(String text) { + if (text == null || text.isEmpty()) { + return; + } + try { + if (!queue.offer(text, 100, TimeUnit.MILLISECONDS)) { + 
System.err.println("[OutputChannel] 队列满,丢弃消息: " + truncate(text, 100)); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + /** + * 标记完成 + */ + public void complete() { + queue.offer(DONE); + completed.countDown(); + } + + /** + * 标记错误完成 + */ + public void completeWithError(Throwable t) { + error.set(t); + queue.offer("\n[错误] 致命错误: " + t.getMessage()); + queue.offer(DONE); + completed.countDown(); + } + + /** + * 读取:阻塞迭代,配合 SSE 使用 + */ + public void drain(Consumer emitter) throws InterruptedException { + while (true) { + String msg = queue.poll(200, TimeUnit.MILLISECONDS); + if (msg != null) { + if (DONE.equals(msg)) { + break; + } + emitter.accept(msg); + } else { + if (completed.getCount() == 0 && queue.isEmpty()) { + break; + } + } + } + Throwable t = error.get(); + if (t != null && !(t instanceof InterruptedException)) { + throw new RuntimeException("Agent 执行出错", t); + } + } + + /** + * 检查是否已完成 + */ + public boolean isCompleted() { + return completed.getCount() == 0; + } + + private String truncate(String s, int maxLen) { + if (s == null) { + return "null"; + } + return s.length() > maxLen ? s.substring(0, maxLen) + "..." 
: s; + } +} diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/StreamingOutputWrapper.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/StreamingOutputWrapper.java new file mode 100644 index 00000000..22965c23 --- /dev/null +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/StreamingOutputWrapper.java @@ -0,0 +1,213 @@ +package org.ruoyi.observability; + +import dev.langchain4j.model.chat.ChatModel; +import dev.langchain4j.model.chat.StreamingChatModel; +import dev.langchain4j.model.chat.listener.ChatModelListener; +import dev.langchain4j.model.chat.request.ChatRequest; +import dev.langchain4j.model.chat.request.ChatRequestParameters; +import dev.langchain4j.model.chat.response.ChatResponse; +import dev.langchain4j.model.chat.response.PartialThinking; +import dev.langchain4j.model.chat.response.PartialThinkingContext; +import dev.langchain4j.model.chat.response.PartialToolCall; +import dev.langchain4j.model.chat.response.PartialToolCallContext; +import dev.langchain4j.model.chat.response.PartialResponse; +import dev.langchain4j.model.chat.response.PartialResponseContext; +import dev.langchain4j.model.chat.response.StreamingChatResponseHandler; +import dev.langchain4j.model.chat.Capability; +import dev.langchain4j.model.ModelProvider; +import lombok.extern.slf4j.Slf4j; + +import java.util.List; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicReference; + +/** + * 包装 StreamingChatModel,同时实现 ChatModel 接口。 + * + * 当 AI Service 方法返回 String 时,LangChain4j 使用 ChatModel.chat() + * 当返回 TokenStream 时,使用 StreamingChatModel.chat() + * + * 此包装器同时实现两个接口,将同步调用转换为流式调用并收集结果, + * 同时拦截每个 token 推送到 OutputChannel。 + * + * @author ageerle@163.com + * @date 2025/04/10 + */ +@Slf4j +public class StreamingOutputWrapper implements StreamingChatModel, ChatModel { + + private final StreamingChatModel streamingDelegate; + private final OutputChannel channel; + + /** + 
* 包装 StreamingChatModel + */ + public StreamingOutputWrapper(StreamingChatModel delegate, OutputChannel channel) { + this.streamingDelegate = delegate; + this.channel = channel; + } + + // ==================== 解决接口默认方法冲突 ==================== + + @Override + public Set supportedCapabilities() { + return streamingDelegate.supportedCapabilities(); + } + + // ==================== ChatModel 接口实现(同步调用) ==================== + + @Override + public ChatResponse chat(ChatRequest request) { + log.info("【StreamingOutputWrapper】chat() 被调用,开始流式处理"); + // 用于收集完整响应 + AtomicReference responseRef = new AtomicReference<>(); + CompletableFuture future = new CompletableFuture<>(); + + // 调用流式模型,拦截每个 token + streamingDelegate.chat(request, new StreamingChatResponseHandler() { + @Override + public void onPartialResponse(String token) { + // 推送到 channel + channel.send(token); + log.debug("【流式Token】{}", token); + } + + @Override + public void onPartialResponse(PartialResponse pr, PartialResponseContext ctx) { + channel.send(pr.text()); + log.debug("【流式PartialResponse】{}", pr.text()); + } + + @Override + public void onPartialThinking(PartialThinking thinking) { + channel.send("[思考] " + thinking.text()); + log.debug("【流式思考】{}", thinking.text()); + } + + @Override + public void onPartialThinking(PartialThinking thinking, PartialThinkingContext ctx) { + channel.send("[思考] " + thinking.text()); + } + + @Override + public void onPartialToolCall(PartialToolCall toolCall) { + // channel.send("[工具参数生成中] " + toolCall); + } + + @Override + public void onPartialToolCall(PartialToolCall toolCall, PartialToolCallContext ctx) { + // channel.send("[工具参数生成中] " + toolCall); + } + + @Override + public void onCompleteResponse(ChatResponse response) { + responseRef.set(response); + if (response.metadata() != null && response.metadata().tokenUsage() != null) { + var usage = response.metadata().tokenUsage(); +// channel.send("\n[Token统计] input=" + usage.inputTokenCount() +// + " output=" + 
usage.outputTokenCount()); + } + log.info("【StreamingOutputWrapper】流式处理完成"); + future.complete(null); + } + + @Override + public void onError(Throwable error) { + channel.send("\n[错误] " + error.getMessage()); + channel.completeWithError(error); + future.completeExceptionally(error); + log.error("【StreamingOutputWrapper】流式处理出错", error); + } + }); + + // 等待流式完成 + future.join(); + + // 返回收集的响应 + return responseRef.get(); + } + + // ==================== StreamingChatModel 接口实现(流式调用) ==================== + + @Override + public void chat(ChatRequest request, StreamingChatResponseHandler handler) { + StreamingChatResponseHandler wrapped = wrapHandler(handler); + streamingDelegate.chat(request, wrapped); + } + + private StreamingChatResponseHandler wrapHandler(StreamingChatResponseHandler original) { + return new StreamingChatResponseHandler() { + + @Override + public void onPartialResponse(String token) { + channel.send(token); + original.onPartialResponse(token); + } + + @Override + public void onPartialResponse(PartialResponse pr, PartialResponseContext ctx) { + channel.send(pr.text()); + original.onPartialResponse(pr, ctx); + } + + @Override + public void onPartialThinking(PartialThinking thinking) { + channel.send("[思考] " + thinking.text()); + original.onPartialThinking(thinking); + } + + @Override + public void onPartialThinking(PartialThinking thinking, PartialThinkingContext ctx) { + channel.send("[思考] " + thinking.text()); + original.onPartialThinking(thinking, ctx); + } + + @Override + public void onPartialToolCall(PartialToolCall toolCall) { + //channel.send("[工具参数生成中] " + toolCall); + original.onPartialToolCall(toolCall); + } + + @Override + public void onPartialToolCall(PartialToolCall toolCall, PartialToolCallContext ctx) { + //channel.send("[工具参数生成中] " + toolCall); + original.onPartialToolCall(toolCall, ctx); + } + + @Override + public void onCompleteResponse(ChatResponse response) { + if (response.metadata() != null && response.metadata().tokenUsage() != 
null) { + var usage = response.metadata().tokenUsage(); +// channel.send("\n[Token统计] input=" + usage.inputTokenCount() +// + " output=" + usage.outputTokenCount()); + } + original.onCompleteResponse(response); + } + + @Override + public void onError(Throwable error) { + channel.send("\n[错误] " + error.getMessage()); + channel.completeWithError(error); + original.onError(error); + } + }; + } + + // ==================== 共用接口方法 ==================== + + @Override + public ChatRequestParameters defaultRequestParameters() { + return streamingDelegate.defaultRequestParameters(); + } + + @Override + public List listeners() { + return streamingDelegate.listeners(); + } + + @Override + public ModelProvider provider() { + return streamingDelegate.provider(); + } +} diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/SupervisorStreamListener.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/SupervisorStreamListener.java new file mode 100644 index 00000000..4f98e5cb --- /dev/null +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/observability/SupervisorStreamListener.java @@ -0,0 +1,120 @@ +package org.ruoyi.observability; + +import dev.langchain4j.agentic.observability.AgentInvocationError; +import dev.langchain4j.agentic.observability.AgentRequest; +import dev.langchain4j.agentic.observability.AgentResponse; +import dev.langchain4j.agentic.planner.AgentInstance; +import dev.langchain4j.agentic.scope.AgenticScope; +import dev.langchain4j.service.tool.BeforeToolExecution; +import dev.langchain4j.service.tool.ToolExecution; +import lombok.extern.slf4j.Slf4j; + +import java.util.Map; + +/** + * Supervisor 流式监听器 + * + * 捕获 Agent 生命周期事件、工具执行前后事件,推送到 OutputChannel + * inheritedBySubagents() = true -> 注册在 Supervisor 上,自动继承到所有子 Agent + * + * @author ageerle@163.com + * @date 2025/04/10 + */ +@Slf4j +public class SupervisorStreamListener implements dev.langchain4j.agentic.observability.AgentListener { + + private final OutputChannel 
channel; + + /** + * 用于在 AgenticScope 中存储 userId 的 key + */ + public static final String USER_ID_KEY = "userId"; + + public SupervisorStreamListener(OutputChannel channel) { + this.channel = channel; + } + + // ==================== Agent 调用生命周期 ==================== + + @Override + public void beforeAgentInvocation(AgentRequest agentRequest) { + AgentInstance agent = agentRequest.agent(); + AgenticScope scope = agentRequest.agenticScope(); + Map inputs = agentRequest.inputs(); + // 只记录日志,不推送输入信息(避免干扰流式输出) + log.info("[Agent开始] {} 输入: {}", agent.name(), inputs); + } + + @Override + public void afterAgentInvocation(AgentResponse agentResponse) { + AgentInstance agent = agentResponse.agent(); + Map inputs = agentResponse.inputs(); + Object output = agentResponse.output(); + String outputStr = output != null ? output.toString() : ""; + + // 只记录日志,不推送输出信息 + // 流式输出由 StreamingOutputWrapper 处理 + // 当无子Agent被调用时,由 ChatServiceFacade 用 plannerModel 生成回复 + log.info("[Agent完成] {} 输出长度: {}", agent.name(), outputStr.length()); + } + + @Override + public void onAgentInvocationError(AgentInvocationError error) { + AgentInstance agent = error.agent(); + Map inputs = error.inputs(); + Throwable throwable = error.error(); + + channel.send("\n[Agent错误] " + agent.name() + + " 异常: " + throwable.getMessage()); + log.error("[Agent错误] {} 异常: {}", agent.name(), throwable.getMessage(), throwable); + } + + // ==================== AgenticScope 生命周期 ==================== + + @Override + public void afterAgenticScopeCreated(AgenticScope agenticScope) { + log.info("[AgenticScope创建] memoryId: {}", agenticScope.memoryId()); + } + + @Override + public void beforeAgenticScopeDestroyed(AgenticScope agenticScope) { + log.info("[AgenticScope销毁] memoryId: {}", agenticScope.memoryId()); + } + + // ==================== 工具执行生命周期 ==================== + +// @Override +// public void beforeToolExecution(BeforeToolExecution beforeToolExecution) { +// var toolRequest = beforeToolExecution.request(); +//// 
channel.send("\n[工具即将执行] " + toolRequest.name() +//// + " 参数: " + truncate(toolRequest.arguments(), 150)); +// log.info("[工具即将执行] {} 参数: {}", toolRequest.name(), toolRequest.arguments()); +// } + +// @Override +// public void afterToolExecution(ToolExecution toolExecution) { +// var toolRequest = toolExecution.request(); +//// channel.send("\n[工具执行完成] " + toolRequest.name() +//// + " 结果: " + truncate(String.valueOf(toolExecution.result()), 300)); +// log.info("[工具执行完成] {} 结果: {}", toolRequest.name(), toolExecution.result()); +// } + + // ==================== 继承机制 ==================== + + /** + * 返回 true,让此监听器自动继承给所有子 Agent + */ + @Override + public boolean inheritedBySubagents() { + return true; + } + + // ==================== 辅助方法 ==================== + + private String truncate(String s, int maxLen) { + if (s == null) { + return "null"; + } + return s.length() > maxLen ? s.substring(0, maxLen) + "..." : s; + } +} diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/chat/impl/ChatServiceFacade.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/chat/impl/ChatServiceFacade.java index 8e0f463a..108c3cba 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/chat/impl/ChatServiceFacade.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/service/chat/impl/ChatServiceFacade.java @@ -1,7 +1,6 @@ package org.ruoyi.service.chat.impl; import cn.dev33.satoken.stp.StpUtil; -import cn.hutool.core.util.StrUtil; import dev.langchain4j.agentic.AgenticServices; import dev.langchain4j.agentic.supervisor.SupervisorAgent; import dev.langchain4j.agentic.supervisor.SupervisorResponseStrategy; @@ -14,15 +13,19 @@ import dev.langchain4j.mcp.client.McpClient; import dev.langchain4j.mcp.client.transport.McpTransport; import dev.langchain4j.mcp.client.transport.stdio.StdioMcpTransport; import dev.langchain4j.memory.chat.MessageWindowChatMemory; +import dev.langchain4j.model.chat.ChatModel; import 
dev.langchain4j.model.chat.StreamingChatModel; import dev.langchain4j.model.chat.response.ChatResponse; import dev.langchain4j.model.chat.response.StreamingChatResponseHandler; import dev.langchain4j.model.openai.OpenAiChatModel; import dev.langchain4j.service.tool.ToolProvider; +import dev.langchain4j.skills.shell.ShellSkills; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.ruoyi.agent.ChartGenerationAgent; +import org.ruoyi.agent.EchartsAgent; +import org.ruoyi.agent.SkillsAgent; import org.ruoyi.agent.SqlAgent; import org.ruoyi.agent.WebSearchAgent; import org.ruoyi.agent.tool.ExecuteSqlQueryTool; @@ -38,6 +41,7 @@ import org.ruoyi.common.chat.service.chat.IChatModelService; import org.ruoyi.common.chat.service.chat.IChatService; import org.ruoyi.common.chat.service.workFlow.IWorkFlowStarterService; import org.ruoyi.common.core.utils.ObjectUtils; +import org.ruoyi.common.core.utils.StringUtils; import org.ruoyi.common.satoken.utils.LoginHelper; import org.ruoyi.common.sse.core.SseEmitterManager; import org.ruoyi.common.sse.utils.SseMessageUtils; @@ -45,9 +49,7 @@ import org.ruoyi.domain.bo.vector.QueryVectorBo; import org.ruoyi.domain.vo.knowledge.KnowledgeInfoVo; import org.ruoyi.factory.ChatServiceFactory; import org.ruoyi.mcp.service.core.ToolProviderFactory; -import org.ruoyi.observability.MyAgentListener; -import org.ruoyi.observability.MyChatModelListener; -import org.ruoyi.observability.MyMcpClientListener; +import org.ruoyi.observability.*; import org.ruoyi.service.chat.AbstractChatService; import org.ruoyi.service.chat.IChatMessageService; import org.ruoyi.service.chat.impl.memory.PersistentChatMemoryStore; @@ -59,6 +61,7 @@ import org.springframework.web.servlet.mvc.method.annotation.SseEmitter; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; /** @@ -124,10 +127,19 @@ 
public class ChatServiceFacade implements IChatService { // 2. 构建上下文消息列表 List contextMessages = buildContextMessages(chatRequest); + chatRequest.setEmitter(emitter); + chatRequest.setUserId(userId); + chatRequest.setTokenValue(tokenValue); + chatRequest.setChatModelVo(chatModelVo); + chatRequest.setContextMessages(contextMessages); + + // 保存用户消息 + chatMessageService.saveChatMessage(userId, chatRequest.getSessionId(), chatRequest.getContent(), RoleType.USER.getName(), chatRequest.getModel()); + // 3. 处理特殊聊天模式(工作流、人机交互恢复、思考模式) - SseEmitter specialResult = handleSpecialChatModes(chatRequest, contextMessages, chatModelVo, emitter, userId, tokenValue); - if (specialResult != null) { - return specialResult; + SseEmitter sseEmitter = handleSpecialChatModes(chatRequest); + if (sseEmitter != null) { + return sseEmitter; } // 4. 路由服务提供商 @@ -135,11 +147,8 @@ public class ChatServiceFacade implements IChatService { log.info("路由到服务提供商: {}, 模型: {}", providerCode, chatRequest.getModel()); AbstractChatService chatService = chatServiceFactory.getOriginalService(providerCode); - StreamingChatResponseHandler handler = createResponseHandler(userId, tokenValue,chatRequest); - // 保存用户消息 - chatMessageService.saveChatMessage(userId, chatRequest.getSessionId(), chatRequest.getContent(), RoleType.USER.getName(), chatRequest.getModel()); // 5. 
发起对话 StreamingChatModel streamingChatModel = chatService.buildStreamingChatModel(chatModelVo, chatRequest); @@ -151,16 +160,9 @@ public class ChatServiceFacade implements IChatService { * 处理特殊聊天模式(工作流、人机交互恢复、思考模式) * * @param chatRequest 聊天请求 - * @param contextMessages 上下文消息列表(可能被修改) - * @param chatModelVo 聊天模型配置 - * @param emitter SSE发射器 - * @param userId 用户ID - * @param tokenValue 会话令牌 * @return 如果需要提前返回则返回SseEmitter,否则返回null */ - private SseEmitter handleSpecialChatModes(ChatRequest chatRequest, List contextMessages, - ChatModelVo chatModelVo, SseEmitter emitter, - Long userId, String tokenValue) { + private SseEmitter handleSpecialChatModes(ChatRequest chatRequest) { // 处理工作流对话 if (chatRequest.getEnableWorkFlow()) { log.info("处理工作流对话,会话: {}", chatRequest.getSessionId()); @@ -169,7 +171,6 @@ public class ChatServiceFacade implements IChatService { if (ObjectUtils.isEmpty(runner)) { log.warn("工作流参数为空"); } - return workFlowStarterService.streaming( ThreadContext.getCurrentUser(), runner.getUuid(), @@ -181,25 +182,22 @@ public class ChatServiceFacade implements IChatService { // 处理人机交互恢复 if (chatRequest.getIsResume()) { log.info("处理人机交互恢复"); - ReSumeRunner reSumeRunner = chatRequest.getReSumeRunner(); if (ObjectUtils.isEmpty(reSumeRunner)) { log.warn("人机交互恢复参数为空"); - return emitter; } - workFlowStarterService.resumeFlow( reSumeRunner.getRuntimeUuid(), reSumeRunner.getFeedbackContent(), - emitter + chatRequest.getEmitter() ); - return emitter; - } + return chatRequest.getEmitter(); + } // 处理思考模式 if (chatRequest.getEnableThinking()) { - handleThinkingMode(chatRequest, contextMessages, chatModelVo, userId); + return handleThinkingMode(chatRequest); } return null; @@ -209,80 +207,132 @@ public class ChatServiceFacade implements IChatService { * 处理思考模式 * * @param chatRequest 聊天请求 - * @param contextMessages 上下文消息列表 - * @param chatModelVo 聊天模型配置 - * @param userId 用户ID + */ - private void handleThinkingMode(ChatRequest chatRequest, List contextMessages, - ChatModelVo 
chatModelVo, Long userId) { - // 步骤1: 配置MCP传输层 - 连接到bing-cn-mcp服务器 - McpTransport transport = new StdioMcpTransport.Builder() + private SseEmitter handleThinkingMode(ChatRequest chatRequest) { + // 配置监督者模型 + OpenAiChatModel plannerModel = OpenAiChatModel.builder() + .baseUrl(chatRequest.getChatModelVo().getApiHost()) + .apiKey(chatRequest.getChatModelVo().getApiKey()) + .modelName(chatRequest.getChatModelVo().getModelName()) + .build(); + + // Bing 搜索 MCP 客户端 + McpTransport bingTransport = new StdioMcpTransport.Builder() .command(List.of("C:\\Program Files\\nodejs\\npx.cmd", "-y", "bing-cn-mcp")) .logEvents(true) .build(); - McpClient mcpClient = new DefaultMcpClient.Builder() - .transport(transport) - .listener(new MyMcpClientListener()) + Long userId = chatRequest.getUserId(); + McpClient bingMcpClient = new DefaultMcpClient.Builder() + .transport(bingTransport) + .listener(new MyMcpClientListener(userId)) .build(); - ToolProvider toolProvider = McpToolProvider.builder() - .mcpClients(List.of(mcpClient)) - .build(); - - // 配置echarts MCP - McpTransport transport1 = new StdioMcpTransport.Builder() - .command(List.of("C:\\Program Files\\nodejs\\npx.cmd", "-y", "mcp-echarts")) + // Playwright MCP 客户端 - 浏览器自动化工具 + McpTransport playwrightTransport = new StdioMcpTransport.Builder() + .command(List.of("C:\\Program Files\\nodejs\\npx.cmd", "-y", "@playwright/mcp@latest")) .logEvents(true) .build(); - McpClient mcpClient1 = new DefaultMcpClient.Builder() - .transport(transport1) - .listener(new MyMcpClientListener()) + McpClient playwrightMcpClient = new DefaultMcpClient.Builder() + .transport(playwrightTransport) + .listener(new MyMcpClientListener(userId)) .build(); - ToolProvider toolProvider1 = McpToolProvider.builder() - .mcpClients(List.of(mcpClient1)) + // Filesystem MCP 客户端 - 文件管理工具 + // 允许 AI 读取、写入、搜索文件(基于当前项目根目录) + String userDir = System.getProperty("user.dir"); + McpTransport filesystemTransport = new StdioMcpTransport.Builder() + .command(List.of("C:\\Program 
Files\\nodejs\\npx.cmd", "-y", + "@modelcontextprotocol/server-filesystem", userDir)) + .logEvents(true) + .build(); - // 配置模型 - OpenAiChatModel plannerModel = OpenAiChatModel.builder() - .baseUrl(chatModelVo.getApiHost()) - .apiKey(chatModelVo.getApiKey()) - .listeners(List.of(new MyChatModelListener())) - .modelName(chatModelVo.getModelName()) + McpClient filesystemMcpClient = new DefaultMcpClient.Builder() + .transport(filesystemTransport) + .listener(new MyMcpClientListener(userId)) .build(); - // 构建各Agent + // 合并三个 MCP 客户端的工具 + ToolProvider toolProvider = McpToolProvider.builder() + // bingMcpClient, + .mcpClients(List.of(playwrightMcpClient, filesystemMcpClient)) + .build(); + + // ========== LangChain4j Skills 基本用法 ========== + // 通过 SKILL.md 文件定义,LLM 按需通过 activate_skill 工具加载 + // 加载 Skills - 使用相对路径,基于项目根目录 + java.nio.file.Path skillsPath = java.nio.file.Path.of(userDir, "ruoyi-admin/src/main/resources/skills"); + List skillsList = dev.langchain4j.skills.FileSystemSkillLoader + .loadSkills(skillsPath) + ; + + ShellSkills skills = ShellSkills.from(skillsList); + + // 构建子 Agent + WebSearchAgent searchAgent = AgenticServices.agentBuilder(WebSearchAgent.class) + .chatModel(plannerModel) + .toolProvider(toolProvider) + .listener(new MyAgentListener()) + .build(); + + // 构建子 Agent 2: SkillsAgent - 负责文档处理技能(docx、pdf、xlsx) + // 独立管理 Skills 工具 + SkillsAgent skillsAgent = AgenticServices.agentBuilder(SkillsAgent.class) + .chatModel(plannerModel) + .systemMessage("You have access to the following skills:\n" + skills.formatAvailableSkills() + + "\nWhen the user's request relates to one of these skills, activate it first using the `activate_skill` tool before proceeding.") + .toolProvider(skills.toolProvider()) + .build(); + + // 构建子 Agent 3: SqlAgent - 负责数据库查询 SqlAgent sqlAgent = AgenticServices.agentBuilder(SqlAgent.class) .chatModel(plannerModel) - .listener(new MyAgentListener()) .tools(new QueryAllTablesTool(), new QueryTableSchemaTool(), new ExecuteSqlQueryTool()) 
- .build(); - - WebSearchAgent searchAgent = AgenticServices.agentBuilder(WebSearchAgent.class) - .chatModel(plannerModel) .listener(new MyAgentListener()) - .toolProvider(toolProvider) .build(); + // 构建子 Agent 4: ChartGenerationAgent - 负责图表生成 ChartGenerationAgent chartGenerationAgent = AgenticServices.agentBuilder(ChartGenerationAgent.class) .chatModel(plannerModel) .listener(new MyAgentListener()) - .toolProvider(toolProvider1) .build(); - // 构建监督者Agent + // 构建子 Agent 5: EchartsAgent - 负责数据可视化(结合 SQL 查询生成 Echarts 图表) + EchartsAgent echartsAgent = AgenticServices.agentBuilder(EchartsAgent.class) + .chatModel(plannerModel) + .tools(new QueryAllTablesTool(), new QueryTableSchemaTool(), new ExecuteSqlQueryTool()) + .listener(new MyAgentListener()) + .build(); + + // 构建监督者 Agent - 管理多个子 Agent SupervisorAgent supervisor = AgenticServices.supervisorBuilder() .chatModel(plannerModel) - .listener(new MyAgentListener()) - .subAgents(sqlAgent, searchAgent, chartGenerationAgent) + //.listener(new SupervisorStreamListener(null)) + .subAgents(skillsAgent,searchAgent, sqlAgent, chartGenerationAgent, echartsAgent) + // 加入历史上下文 - 使用 ChatMemoryProvider 提供持久化的聊天内存 + //.chatMemoryProvider(memoryId -> createChatMemory(chatRequest.getSessionId())) .responseStrategy(SupervisorResponseStrategy.LAST) .build(); - // 调用 supervisor - String invoke = supervisor.invoke(chatRequest.getContent()); - log.info("supervisor.invoke() 返回: {}", invoke); + String tokenValue = chatRequest.getTokenValue(); + + // 异步执行 supervisor,避免阻塞 HTTP 请求线程导致 SSE 事件被缓冲 + CompletableFuture.runAsync(() -> { + try { + String result = supervisor.invoke(chatRequest.getContent()); + SseMessageUtils.sendContent(userId, result); + SseMessageUtils.sendDone(userId); + } catch (Exception e) { + log.error("Supervisor 执行失败", e); + SseMessageUtils.sendError(userId, e.getMessage()); + } finally { + SseMessageUtils.completeConnection(userId, tokenValue); + } + }); + return chatRequest.getEmitter(); } /** @@ -549,7 +599,5 @@ public 
package org.ruoyi.agent;

import dev.langchain4j.agentic.AgenticServices;
import dev.langchain4j.agentic.supervisor.SupervisorAgent;
import dev.langchain4j.agentic.supervisor.SupervisorResponseStrategy;
import dev.langchain4j.model.chat.ChatModel;
import dev.langchain4j.model.chat.StreamingChatModel;
import dev.langchain4j.model.chat.listener.ChatModelErrorContext;
import dev.langchain4j.model.chat.listener.ChatModelListener;
import dev.langchain4j.model.chat.listener.ChatModelRequestContext;
import dev.langchain4j.model.chat.listener.ChatModelResponseContext;
import dev.langchain4j.model.chat.response.ChatResponse;
import dev.langchain4j.model.openai.OpenAiChatModel;
import dev.langchain4j.model.openai.OpenAiStreamingChatModel;
import dev.langchain4j.service.SystemMessage;
import dev.langchain4j.service.UserMessage;
import dev.langchain4j.service.V;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import org.ruoyi.observability.OutputChannel;
import org.ruoyi.observability.StreamingOutputWrapper;
import org.ruoyi.observability.SupervisorStreamListener;

import java.util.List;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

/**
 * Integration tests for streaming output from sub-agents.
 *
 * <p>Covers:
 * <ol>
 *   <li>Streaming output of a single agent.</li>
 *   <li>Streaming output when a Supervisor delegates to one sub-agent.</li>
 *   <li>Streaming output when a Supervisor delegates to multiple sub-agents.</li>
 *   <li>Direct observation of a {@link StreamingChatModel} response.</li>
 * </ol>
 *
 * <p>NOTE: a valid API key must be configured (via the {@code PPIO_API_KEY}
 * environment variable or by editing the configuration section below) before
 * enabling these tests — they perform live network calls.
 *
 * @author ageerle@163.com
 * @date 2025/04/10
 */
@Disabled("需要配置 API Key 后手动启用")
public class StreamingAgentIntegrationTest {

    // ==================== Configuration ====================
    private static final String BASE_URL = "https://api.ppio.com/openai";
    private static final String API_KEY = System.getenv("PPIO_API_KEY") != null
            ? System.getenv("PPIO_API_KEY")
            : "xx"; // fallback placeholder key
    private static final String MODEL_NAME = "deepseek/deepseek-v3.2";

    private StreamingChatModel streamingModel;
    private OpenAiChatModel syncModel;

    // ==================== Agent interface definitions ====================

    /** Math assistant sub-agent. */
    public interface MathAgent {
        @SystemMessage("你是一个数学计算助手,帮助用户解决数学问题。直接给出计算结果和简要解释。")
        @UserMessage("计算:{{query}}")
        @dev.langchain4j.agentic.Agent("数学计算助手")
        String calculate(@V("query") String query);
    }

    /** Text-analysis assistant sub-agent. */
    public interface TextAgent {
        @SystemMessage("你是一个文本分析助手,帮助用户分析文本内容。给出简洁的分析结果。")
        @UserMessage("分析以下文本:{{text}}")
        @dev.langchain4j.agentic.Agent("文本分析助手")
        String analyze(@V("text") String text);
    }

    /** Weather assistant sub-agent. */
    public interface WeatherAgent {
        @SystemMessage("你是一个天气助手。根据用户提供的信息给出天气相关的回答。")
        @UserMessage("回答问题:{{query}}")
        @dev.langchain4j.agentic.Agent("天气助手")
        String answer(@V("query") String query);
    }

    // ==================== Setup ====================

    @BeforeEach
    void setUp() {
        streamingModel = OpenAiStreamingChatModel.builder()
                .baseUrl(BASE_URL)
                .apiKey(API_KEY)
                // FIX: the model name was missing from this builder, so requests
                // would not target the intended model.
                .modelName(MODEL_NAME)
                // No-op listener kept to demonstrate listener wiring on the
                // streaming model; hooks fire before the request, after the
                // response, and on error respectively.
                .listeners(List.of(new ChatModelListener() {
                    @Override
                    public void onRequest(ChatModelRequestContext ctx) {
                        // before the request is sent
                    }
                    @Override
                    public void onResponse(ChatModelResponseContext ctx) {
                        // after the response completes
                    }
                    @Override
                    public void onError(ChatModelErrorContext ctx) {
                        // on error
                    }
                }))
                .build();

        syncModel = OpenAiChatModel.builder()
                .baseUrl(BASE_URL)
                .apiKey(API_KEY)
                .modelName(MODEL_NAME)
                .build();
    }

    // ==================== Helpers ====================

    /**
     * Awaits the latch and fails the test loudly on timeout.
     *
     * <p>FIX: the boolean result of {@link CountDownLatch#await(long, TimeUnit)}
     * was previously ignored, so a timed-out test would fall through silently.
     *
     * @param latch          latch counted down by the async worker
     * @param timeoutSeconds maximum time to wait, in seconds
     * @throws InterruptedException if the waiting thread is interrupted
     */
    private static void awaitCompletion(CountDownLatch latch, long timeoutSeconds) throws InterruptedException {
        if (!latch.await(timeoutSeconds, TimeUnit.SECONDS)) {
            throw new AssertionError("Timed out after " + timeoutSeconds + "s waiting for async completion");
        }
    }

    // ==================== Tests ====================

    @Test
    @DisplayName("测试1: 基础流式输出 - 单个 Agent")
    void testBasicStreamingAgent() throws Exception {
        System.out.println("\n=== 测试1: 基础流式输出 - 单个 Agent ===\n");

        // Create the event channel keyed by a per-request id.
        String requestId = UUID.randomUUID().toString();
        OutputChannel channel = OutputChannel.create(requestId);
        CountDownLatch completed = new CountDownLatch(1);

        // Wrap the streaming model so emitted tokens are captured on the channel.
        ChatModel wrappedModel = new StreamingOutputWrapper(streamingModel, channel);

        // Build the agent against the wrapped model.
        MathAgent mathAgent = AgenticServices.agentBuilder(MathAgent.class)
                .chatModel(wrappedModel)
                .build();

        // Run asynchronously so the main thread can drain the channel.
        CompletableFuture.runAsync(() -> {
            try {
                System.out.println(">>> 调用 MathAgent.calculate()...");
                String result = mathAgent.calculate("计算 123 * 456 + 789 的值");
                System.out.println("\n>>> 最终结果: " + result);
            } catch (Exception e) {
                System.err.println(">>> 异常: " + e.getMessage());
                channel.completeWithError(e);
            } finally {
                channel.complete();
                completed.countDown();
            }
        });

        // Drain streamed tokens as they arrive.
        channel.drain(text -> {
            System.out.print(text);
            System.out.flush();
        });

        awaitCompletion(completed, 30);
        OutputChannel.remove(requestId);
    }

    @Test
    @DisplayName("测试2: Supervisor 模式 - 单个子 Agent 流式输出")
    void testSupervisorWithSingleSubAgent() throws Exception {
        System.out.println("\n=== 测试2: Supervisor 模式 - 单个子 Agent ===\n");

        String requestId = UUID.randomUUID().toString();
        OutputChannel channel = OutputChannel.create(requestId);
        CountDownLatch completed = new CountDownLatch(1);

        // Sub-agent driven directly by the streaming model (no wrapper here).
        MathAgent mathAgent = AgenticServices.agentBuilder(MathAgent.class)
                .streamingChatModel(streamingModel)
                .build();

        // Supervisor; the stream listener is intentionally left disabled in
        // this variant of the experiment.
        // NOTE(review): with the listener disabled and channel.complete()
        // commented out below, nothing feeds or closes the channel — confirm
        // OutputChannel.drain() does not block indefinitely in that case.
        SupervisorAgent supervisor = AgenticServices.supervisorBuilder()
                .chatModel(syncModel)
                //.listener(new SupervisorStreamListener(channel))
                .subAgents(mathAgent)
                .responseStrategy(SupervisorResponseStrategy.LAST)
                .build();

        // Run asynchronously.
        CompletableFuture.runAsync(() -> {
            try {
                System.out.println(">>> Supervisor.invoke() 开始...");
                String result = supervisor.invoke("帮我计算 999 除以 3 等于多少");
                System.out.println("\n>>> Supervisor 结果: " + result);
            } catch (Exception e) {
                System.err.println(">>> 异常: " + e.getMessage());
                e.printStackTrace();
                // channel.completeWithError(e);
            } finally {
                // channel.complete();
                completed.countDown();
            }
        });

        // Drain streamed tokens as they arrive.
        channel.drain(text -> {
            System.out.print(text);
            System.out.flush();
        });

        awaitCompletion(completed, 60);
        OutputChannel.remove(requestId);
    }

    @Test
    @DisplayName("测试3: Supervisor 模式 - 多个子 Agent 流式输出")
    void testSupervisorWithMultipleSubAgents() throws Exception {
        System.out.println("\n=== 测试3: Supervisor 模式 - 多个子 Agent ===\n");

        String requestId = UUID.randomUUID().toString();
        OutputChannel channel = OutputChannel.create(requestId);
        CountDownLatch completed = new CountDownLatch(1);

        // Wrap the streaming model so all sub-agents stream into one channel.
        ChatModel wrappedModel = new StreamingOutputWrapper(streamingModel, channel);

        // Sub-agents.
        MathAgent mathAgent = AgenticServices.agentBuilder(MathAgent.class)
                .chatModel(wrappedModel)
                .build();

        TextAgent textAgent = AgenticServices.agentBuilder(TextAgent.class)
                .chatModel(wrappedModel)
                .build();

        WeatherAgent weatherAgent = AgenticServices.agentBuilder(WeatherAgent.class)
                .chatModel(wrappedModel)
                .build();

        // Supervisor with a stream listener attached.
        SupervisorAgent supervisor = AgenticServices.supervisorBuilder()
                .chatModel(syncModel)
                .listener(new SupervisorStreamListener(channel))
                .subAgents(mathAgent, textAgent, weatherAgent)
                .responseStrategy(SupervisorResponseStrategy.LAST)
                .build();

        // Run asynchronously with a prompt that should trigger multiple agents.
        CompletableFuture.runAsync(() -> {
            try {
                System.out.println(">>> Supervisor.invoke() 开始...");
                String result = supervisor.invoke(
                        "请帮我做两件事:1. 计算 50 * 20 的结果;2. 分析 '人工智能正在改变世界' 这句话的含义"
                );
                System.out.println("\n>>> Supervisor 结果: " + result);
            } catch (Exception e) {
                System.err.println(">>> 异常: " + e.getMessage());
                e.printStackTrace();
                channel.completeWithError(e);
            } finally {
                channel.complete();
                completed.countDown();
            }
        });

        // Drain and observe the live stream.
        channel.drain(text -> {
            System.out.print("观察流式输出:" + text);
            System.out.flush();
        });

        awaitCompletion(completed, 90);
        OutputChannel.remove(requestId);
    }

    @Test
    @DisplayName("测试4: 直接观察 StreamingChatModel 的流式响应")
    void testDirectStreamingChatModel() throws Exception {
        System.out.println("\n=== 测试4: 直接观察 StreamingChatModel ===\n");

        StringBuilder buffer = new StringBuilder();
        CountDownLatch completed = new CountDownLatch(1);

        streamingModel.chat("你好,请自我介绍", new dev.langchain4j.model.chat.response.StreamingChatResponseHandler() {
            @Override
            public void onPartialResponse(String partialResponse) {
                buffer.append(partialResponse);
                System.out.print(partialResponse);
                System.out.flush();
            }

            @Override
            public void onCompleteResponse(ChatResponse completeResponse) {
                System.out.println("\n\n[完成] 总Token数: "
                        + (completeResponse.metadata() != null && completeResponse.metadata().tokenUsage() != null
                        ? completeResponse.metadata().tokenUsage().totalTokenCount()
                        : "无"));
                completed.countDown();
            }

            @Override
            public void onError(Throwable error) {
                System.err.println("[错误] " + error.getMessage());
                completed.countDown();
            }
        });

        awaitCompletion(completed, 30);
        System.out.println("完整响应内容: " + buffer.toString());
    }
}