diff --git a/pom.xml b/pom.xml index 848c2261..4d6ac7f0 100644 --- a/pom.xml +++ b/pom.xml @@ -40,6 +40,8 @@ 0.2.0 1.18.26 1.72 + + 1.26.2 2.7.0 @@ -283,6 +285,13 @@ ${bouncycastle.version} + + + org.apache.commons + commons-compress + ${commons-compress.version} + + io.github.linpeilie mapstruct-plus-spring-boot-starter diff --git a/ruoyi-extend/pom.xml b/ruoyi-extend/pom.xml index 3fef2b70..3538eb94 100644 --- a/ruoyi-extend/pom.xml +++ b/ruoyi-extend/pom.xml @@ -15,6 +15,7 @@ ruoyi-mcp-server + ruoyi-ai-copilot diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/pom.xml b/ruoyi-modules-api/ruoyi-knowledge-api/pom.xml index 2a3e634c..f8082a67 100644 --- a/ruoyi-modules-api/ruoyi-knowledge-api/pom.xml +++ b/ruoyi-modules-api/ruoyi-knowledge-api/pom.xml @@ -87,6 +87,18 @@ dev.langchain4j langchain4j-document-parser-apache-tika + + + org.apache.commons + commons-compress + + + + + + + org.apache.commons + commons-compress diff --git a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/split/ExcelTextSplitter.java b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/split/ExcelTextSplitter.java index cc2b5f04..b33ecd3e 100644 --- a/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/split/ExcelTextSplitter.java +++ b/ruoyi-modules-api/ruoyi-knowledge-api/src/main/java/org/ruoyi/chain/split/ExcelTextSplitter.java @@ -1,17 +1,59 @@ package org.ruoyi.chain.split; +import jakarta.annotation.Resource; import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.ruoyi.common.core.utils.StringUtils; +import org.ruoyi.domain.vo.KnowledgeInfoVo; +import org.ruoyi.service.IKnowledgeInfoService; +import org.springframework.context.annotation.Lazy; import org.springframework.stereotype.Component; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; @Component @AllArgsConstructor @Slf4j public class ExcelTextSplitter implements TextSplitter{ + + @Override public List split(String content, String kid) { - return null; + // 使用默认配置 + String knowledgeSeparator = "#"; + int textBlockSize = 10000; + int overlapChar = 500; + List chunkList = new ArrayList<>(); + if (content.contains(knowledgeSeparator) && StringUtils.isNotBlank(knowledgeSeparator)) { + // 按自定义分隔符切分 + String[] chunks = content.split(knowledgeSeparator); + chunkList.addAll(Arrays.asList(chunks)); + } else { + int indexMin = 0; + int len = content.length(); + int i = 0; + int right = 0; + while (true) { + if (len > right) { + int begin = i * textBlockSize - overlapChar; + if (begin < indexMin) { + begin = indexMin; + } + int end = textBlockSize * (i + 1) + overlapChar; + if (end > len) { + end = len; + } + String chunk = content.substring(begin, end); + chunkList.add(chunk); + i++; + right = right + textBlockSize; + } else { + break; + } + } + } + return chunkList; } } diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/DealFileService.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/DealFileService.java deleted file mode 100644 index 730cfc5d..00000000 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/DealFileService.java +++ /dev/null @@ -1,377 +0,0 @@ -package org.ruoyi.chat.service.knowledge; - -import cn.hutool.core.util.ObjectUtil; -import cn.hutool.core.util.RandomUtil; -import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; -import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper; -import com.baomidou.mybatisplus.core.toolkit.Wrappers; -import java.util.Collection; -import java.util.Date; -import java.util.List; -import java.util.stream.Collectors; -import lombok.RequiredArgsConstructor; -import org.ruoyi.chain.loader.ResourceLoaderFactory; -import org.ruoyi.constant.DealStatus; -import org.ruoyi.domain.KnowledgeAttach; -import org.ruoyi.domain.KnowledgeAttachPic; -import org.ruoyi.domain.KnowledgeFragment; -import org.ruoyi.domain.KnowledgeInfo; -import org.ruoyi.domain.PdfFileContentResult; -import org.ruoyi.domain.bo.StoreEmbeddingBo; -import org.ruoyi.domain.vo.ChatModelVo; -import org.ruoyi.domain.vo.KnowledgeAttachVo; -import org.ruoyi.domain.vo.KnowledgeInfoVo; -import org.ruoyi.mapper.KnowledgeAttachMapper; -import org.ruoyi.mapper.KnowledgeAttachPicMapper; -import org.ruoyi.mapper.KnowledgeFragmentMapper; -import org.ruoyi.mapper.KnowledgeInfoMapper; -import org.ruoyi.service.IChatModelService; -import org.ruoyi.service.PdfImageExtractService; -import org.ruoyi.service.VectorStoreService; -import org.ruoyi.service.impl.PdfImageExtractServiceImpl; -import org.ruoyi.system.domain.vo.SysOssVo; -import org.ruoyi.system.service.ISysOssService; -import org.ruoyi.utils.ZipUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.scheduling.annotation.Async; -import org.springframework.stereotype.Service; -import org.springframework.web.multipart.MultipartFile; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; - -/** - * @Description: - * @Date: 2025/5/15 下午4:29 - */ -@Service -@RequiredArgsConstructor -public class DealFileService { - private static final Logger log = LoggerFactory.getLogger(DealFileService.class); - - private final KnowledgeInfoMapper baseMapper; - - private final VectorStoreService vectorStoreService; - - private final ResourceLoaderFactory resourceLoaderFactory; - - private final KnowledgeFragmentMapper fragmentMapper; - - private final KnowledgeAttachMapper attachMapper; - - private final IChatModelService chatModelService; - - private final ISysOssService ossService; - - private final PdfImageExtractService pdfImageExtractService; - - private final KnowledgeAttachPicMapper picMapper; - - @Async - public void dealVectorStatus(KnowledgeAttach attachItem) throws Exception { - try { - //锁定数据 更改VectorStatus 到进行中 - if (attachMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_20) - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getId, attachItem.getId()) - ) == 0) { - return; - } - List knowledgeFragments = fragmentMapper.selectList( - new LambdaQueryWrapper() - .eq(KnowledgeFragment::getKid, attachItem.getKid()) - .eq(KnowledgeFragment::getDocId, attachItem.getDocId()) - ); - if (ObjectUtil.isEmpty(knowledgeFragments)) { - throw new Exception("文件段落为空"); - } - List fids = knowledgeFragments.stream() - .map(KnowledgeFragment::getFid) - .collect(Collectors.toList()); - if (ObjectUtil.isEmpty(fids)) { - throw new Exception("fids 为空"); - } - List chunkList = knowledgeFragments.stream() - .map(KnowledgeFragment::getContent) - .collect(Collectors.toList()); - - if (ObjectUtil.isEmpty(chunkList)) { - throw new Exception("chunkList 为空"); - } - // 通过kid查询知识库信息 - KnowledgeInfoVo knowledgeInfoVo = baseMapper.selectVoOne(Wrappers.lambdaQuery() - .eq(KnowledgeInfo::getId, attachItem.getKid())); - // 通过向量模型查询模型信息 - ChatModelVo chatModelVo = chatModelService.selectModelByName( - knowledgeInfoVo.getEmbeddingModelName()); - - StoreEmbeddingBo storeEmbeddingBo = new StoreEmbeddingBo(); - storeEmbeddingBo.setKid(attachItem.getKid()); - storeEmbeddingBo.setDocId(attachItem.getDocId()); - storeEmbeddingBo.setFids(fids); - storeEmbeddingBo.setChunkList(chunkList); - storeEmbeddingBo.setVectorModelName(knowledgeInfoVo.getVectorModelName()); - storeEmbeddingBo.setEmbeddingModelName(knowledgeInfoVo.getEmbeddingModelName()); - storeEmbeddingBo.setApiKey(chatModelVo.getApiKey()); - storeEmbeddingBo.setBaseUrl(chatModelVo.getApiHost()); - vectorStoreService.storeEmbeddings(storeEmbeddingBo); - - //设置处理完成 - attachMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_20) - .eq(KnowledgeAttach::getId, attachItem.getId())); - } catch (Exception e) { - e.printStackTrace(); - //设置处理失败 - attachMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_40) - .set(KnowledgeAttach::getRemark, attachItem.getRemark() + e.getMessage()) - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_20) - .eq(KnowledgeAttach::getId, attachItem.getId())); - throw new RuntimeException(e); - } - } - - @Async - public void dealPicStatus(KnowledgeAttach attachItem) throws Exception { - try { - //锁定数据 更改picStatus 到进行中 - if (attachMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttach::getPicStatus, DealStatus.STATUS_20) - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getId, attachItem.getId()) - ) == 0) { - return; - } - //获取附件 - if (ObjectUtil.isEmpty(attachItem.getOssId())) { - log.error("==========OssId 为空,attachItem={}", attachItem); - throw new Exception("OssId 为空"); - } - //获取oss文件 - MultipartFile multipartFile = ossService.downloadByFile(attachItem.getOssId()); - //拆解出图片ZIP - byte[] pngs = pdfImageExtractService.extractImages(multipartFile, "png", true); - //解压zip,得到图片文件 - MultipartFile[] multipartFiles = ZipUtils.unzipToMultipartFiles(pngs); - //上传文件到OSS,写入表 - for (MultipartFile file : multipartFiles) { - //先查找是否有相同图片名称,先做删除 - List knowledgeAttachPics = picMapper.selectList( - new LambdaQueryWrapper() - .eq(KnowledgeAttachPic::getKid, attachItem.getKid()) - .eq(KnowledgeAttachPic::getAid, attachItem.getId()) - .eq(KnowledgeAttachPic::getDocName, file.getOriginalFilename()) - ); - if (ObjectUtil.isNotEmpty(knowledgeAttachPics)) { - Collection ossIds = knowledgeAttachPics.stream() - .map(KnowledgeAttachPic::getOssId) - .collect(Collectors.toList()); - ossService.deleteWithValidByIds(ossIds, false); - List collect = knowledgeAttachPics.stream().map(KnowledgeAttachPic::getId) - .collect(Collectors.toList()); - picMapper.deleteByIds(collect); - } - - SysOssVo upload = ossService.upload(file); - KnowledgeAttachPic entity = new KnowledgeAttachPic(); - entity.setKid(attachItem.getKid()); - entity.setAid(String.valueOf(attachItem.getId())); - entity.setDocName(file.getOriginalFilename()); - entity.setDocType( - file.getOriginalFilename().substring(file.getOriginalFilename().lastIndexOf(".") + 1)); - entity.setOssId(upload.getOssId()); - int[] ints = extractPageNumbers(file.getOriginalFilename()); - if (ObjectUtil.isNotEmpty(ints)) { - assert ints != null; - if (ints.length == 2) { - entity.setPageNum(ints[0]); - entity.setIndexNum(ints[1]); - } - } - picMapper.insert(entity); - } - - //设置处理完成 - attachMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_20) - .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getId, attachItem.getId())); - } catch (Exception e) { - //设置处理失败 - attachMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_40) - .set(KnowledgeAttach::getRemark, attachItem.getRemark() + e.getMessage()) - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_20) - .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getId, attachItem.getId())); - throw new RuntimeException(e); - } - - } - - - @Async - public void dealPicAnysStatus(KnowledgeAttachPic picItem) throws Exception { - String filePath = null; - try { - //锁定数据 更改 getPicAnysStatus 到进行中 - if (picMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_20) - .eq(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttachPic::getId, picItem.getId()) - ) == 0) { - return; - } - SysOssVo ossVo = ossService.getById(picItem.getOssId()); - if (ObjectUtil.isNotEmpty(ossVo)) { - filePath = ossService.downloadToTempPath(picItem.getOssId()); - //调用第三方 分析图片内容 - List pdfFileContentResults = pdfImageExtractService.dealFileContent4DashscopeBase64( - filePath); - if (ObjectUtil.isNotEmpty(pdfFileContentResults)) { - for (PdfFileContentResult resultItem : pdfFileContentResults) { - //图片解析内容回写到pic表 - picMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttachPic::getContent, parseContent(resultItem.getContent())) - .set(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttachPic::getId, picItem.getId())); - //将图片解析内容 写入段落表 fragment - KnowledgeAttachVo knowledgeAttachVo = attachMapper.selectVoById(picItem.getAid()); - if (ObjectUtil.isNotEmpty(knowledgeAttachVo)) { - String fid = RandomUtil.randomString(10); - KnowledgeFragment knowledgeFragment = new KnowledgeFragment(); - knowledgeFragment.setKid(knowledgeAttachVo.getKid()); - knowledgeFragment.setDocId(knowledgeAttachVo.getDocId()); - knowledgeFragment.setFid(fid); - knowledgeFragment.setIdx(0); - knowledgeFragment.setContent(parseContent(resultItem.getContent())); - knowledgeFragment.setCreateTime(new Date()); - fragmentMapper.insert(knowledgeFragment); - - //更新attach表,需要所有图片都处理完毕 - // 查询非30状态(完成状态)的记录数量 - long nonStatus30Count = picMapper.selectCount( - new LambdaQueryWrapper() - .ne(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttachPic::getAid, picItem.getAid()) - ); - if (nonStatus30Count == 0) { - // 执行表更新操作 - attachMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getId, picItem.getAid())); - } - } - } - } - } - } catch (Exception e) { - //失败 - picMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_40) - .set(KnowledgeAttachPic::getRemark, picItem.getRemark() + e.getMessage()) - .eq(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_20) - .eq(KnowledgeAttachPic::getId, picItem.getId())); - throw new RuntimeException(e); - } finally { - //无论成功还是失败,都要删除临时文件 - if (ObjectUtil.isNotEmpty(filePath)) { - ossService.deleteFile(filePath); - } - } - } - - - /** - * 从文件名中提取page后面的两个数字 - * - * @param fileName 文件名 - * @return 包含两个数字的数组,如果未找到则返回null - */ - public static int[] extractPageNumbers(String fileName) { - // 查找"page_"的位置 - int pageIndex = fileName.indexOf("page_"); - - if (pageIndex == -1) { - return null; - } - - // 从"page_"后开始截取 - String afterPage = fileName.substring(pageIndex + 5); - - // 按下划线分割 - String[] parts = afterPage.split("_"); - - if (parts.length >= 2) { - try { - // 提取两个数字 - int firstNumber = Integer.parseInt(parts[0]); - - // 对于第二个数字,需要去掉可能的文件扩展名 - String secondPart = parts[1]; - int dotIndex = secondPart.indexOf("."); - if (dotIndex != -1) { - secondPart = secondPart.substring(0, dotIndex); - } - - int secondNumber = Integer.parseInt(secondPart); - - return new int[]{firstNumber, secondNumber}; - } catch (NumberFormatException e) { - return null; - } - } - - return null; - } - - public static String parseContent(String content) { - try { - // 首先尝试作为JSON解析 - ObjectMapper objectMapper = new ObjectMapper(); - JsonNode rootNode = objectMapper.readTree(content); - - // 如果是JSON格式,按原有逻辑处理 - JsonNode choicesNode = rootNode.get("choices"); - if (choicesNode != null && choicesNode.isArray() && choicesNode.size() > 0) { - JsonNode firstChoice = choicesNode.get(0); - JsonNode messageNode = firstChoice.get("message"); - if (messageNode != null) { - JsonNode contentNode = messageNode.get("content"); - if (contentNode != null) { - return contentNode.asText(); - } - } - return "无法找到content内容"; - } - - // 如果不是预期的JSON格式,直接返回原始内容 - return content; - - } catch (Exception e) { - // 如果解析JSON失败,说明是普通文本,直接返回 - return content; - } - } - - -} diff --git a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/KnowledgeInfoServiceImpl.java b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/KnowledgeInfoServiceImpl.java index 30e95731..a5be768b 100644 --- a/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/KnowledgeInfoServiceImpl.java +++ b/ruoyi-modules/ruoyi-chat/src/main/java/org/ruoyi/chat/service/knowledge/KnowledgeInfoServiceImpl.java @@ -315,78 +315,4 @@ public class KnowledgeInfoServiceImpl implements IKnowledgeInfoService { } } - /** - * 第一步 定时 拆解PDF文件中的图片 - */ - @Scheduled(fixedDelay = 15000) // 每3秒执行一次 - public void dealKnowledgeAttachPic() throws Exception { - // 处理 拆解PDF文件中的图片的记录 - List knowledgeAttaches = attachMapper.selectList( - new LambdaQueryWrapper() - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10)); - log.info("===============拆解PDF文件中的图片 size = {}", knowledgeAttaches.size()); - if (ObjectUtil.isNotEmpty(knowledgeAttaches)) { - for (KnowledgeAttach attachItem : knowledgeAttaches) { - dealFileService.dealPicStatus(attachItem); - } - } - } - - /** - * 第二步 定时 解析图片内容 - */ - @Scheduled(fixedDelay = 15000) - public void dealKnowledgeAttachPicAnys() throws Exception { - // 获取未处理的图片记录 - List knowledgeAttachPics = picMapper.selectList( - new LambdaQueryWrapper() - .eq(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_10) - .last("LIMIT 20")); - if (ObjectUtil.isNotEmpty(knowledgeAttachPics)) { - for (KnowledgeAttachPic picItem : knowledgeAttachPics) { - dealFileService.dealPicAnysStatus(picItem); - } - } - } - - /** - * 第三步 定时 处理 附件上传后上传向量数据库 - */ - @Scheduled(fixedDelay = 30000) // 每3秒执行一次 - public void dealKnowledgeAttachVector() throws Exception { - // 处理 需要上传向量数据库的记录 - List knowledgeAttaches = attachMapper.selectList( - new LambdaQueryWrapper() - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getPicAnysStatus, DealStatus.STATUS_30) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10)); - log.info("===============上传向量数据库 size = {}", knowledgeAttaches.size()); - if (ObjectUtil.isNotEmpty(knowledgeAttaches)) { - for (KnowledgeAttach attachItem : knowledgeAttaches) { - dealFileService.dealVectorStatus(attachItem); - } - } - } - - /** - * 第四步 定时 处理 失败数据 - */ - @Scheduled(fixedDelay = 30 * 60 * 1000) - public void dealKnowledge40Status() throws Exception { - // 拆解PDF失败 重新设置状态 - attachMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttach::getPicStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getPicStatus, DealStatus.STATUS_40)); - // 将图片分析失败的数据 重新设置状态 - picMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttachPic::getPicAnysStatus, DealStatus.STATUS_40)); - // 上传向量库失败 重新设置状态 - attachMapper.update(new LambdaUpdateWrapper() - .set(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_10) - .eq(KnowledgeAttach::getVectorStatus, DealStatus.STATUS_40)); - } - }