| | |
| | | package com.ruoyi.ai.service.impl; |
| | | |
| | | import com.ruoyi.ai.service.AiFileTextExtractor; |
| | | import com.ruoyi.ai.service.KnowledgeRagService; |
| | | import com.ruoyi.approve.pojo.KnowledgeBaseVector; |
| | | import com.ruoyi.approve.service.KnowledgeBaseVectorService; |
| | |
| | | import dev.langchain4j.data.embedding.Embedding; |
| | | import dev.langchain4j.data.segment.TextSegment; |
| | | import dev.langchain4j.model.embedding.EmbeddingModel; |
| | | import dev.langchain4j.store.embedding.EmbeddingMatch; |
| | | import dev.langchain4j.store.embedding.EmbeddingSearchRequest; |
| | | import dev.langchain4j.store.embedding.EmbeddingSearchResult; |
| | | import dev.langchain4j.store.embedding.EmbeddingStore; |
| | |
| | | import org.springframework.stereotype.Service; |
| | | |
| | | import java.io.File; |
| | | import java.nio.charset.Charset; |
| | | import java.nio.charset.StandardCharsets; |
| | | import java.nio.file.Files; |
| | | import java.util.ArrayList; |
| | | import java.util.HashMap; |
| | |
| | | |
| | | private final KnowledgeBaseVectorService knowledgeBaseVectorService; |
| | | private final StorageBlobService storageBlobService; |
| | | private final AiFileTextExtractor aiFileTextExtractor; |
| | | private final EmbeddingModel embeddingModel; |
| | | private final EmbeddingStore<TextSegment> embeddingStore; |
| | | private final FileProperties fileProperties; |
| | | |
| | | private static final int CHUNK_SIZE = 500; |
| | | private static final int CHUNK_OVERLAP = 100; |
| | | /** |
| | | * 文件大小阈值,超过此值才进行切片 |
| | | * 80MB = 80 * 1024 * 1024 字节 |
| | | */ |
| | | private static final long CHUNK_THRESHOLD_BYTES = 80L * 1024 * 1024; |
| | | /** |
| | | * Embedding 模型最大输入长度限制 |
| | | * 阿里云 DashScope 限制为 8192 字符 |
| | | */ |
| | | private static final int EMBEDDING_MAX_LENGTH = 8000; |
| | | |
| | | @Override |
| | | @Async("threadPoolTaskExecutor") |
| | |
| | | |
| | | File file = getFile(blob); |
| | | log.info("文件路径: {}, 是否存在: {}", file.getAbsolutePath(), file.exists()); |
| | | long fileSize = file.length(); |
| | | log.info("文件大小: {} 字节", fileSize); |
| | | |
| | | // 直接读取文件内容,不使用 MultipartFile 包装 |
| | | log.info("提取文件内容: fileName={}", vector.getFileName()); |
| | | String content = extractFileContent(file, vector.getFileName(), blob.getContentType()); |
| | | String content = extractFileContent(file, vector.getFileName()); |
| | | log.info("文件内容长度: {}", content != null ? content.length() : 0); |
| | | |
| | | if (content == null || content.trim().isEmpty()) { |
| | |
| | | } |
| | | |
| | | // 文本切片 |
| | | log.info("开始文本切片"); |
| | | List<TextSegment> chunks = splitText(content, vector); |
| | | log.info("切片完成,共 {} 个块", chunks.size()); |
| | | List<TextSegment> chunks; |
| | | boolean needChunk = fileSize > CHUNK_THRESHOLD_BYTES || content.length() > EMBEDDING_MAX_LENGTH; |
| | | if (needChunk) { |
| | | log.info("开始切片: fileSize={}, contentLength={}", fileSize, content.length()); |
| | | chunks = splitText(content, vector); |
| | | log.info("切片完成,共 {} 个块", chunks.size()); |
| | | } else { |
| | | log.info("文件较小且内容长度{}不超过{},不进行切片", content.length(), EMBEDDING_MAX_LENGTH); |
| | | Map<String, Object> metadata = buildMetadata(vector); |
| | | chunks = List.of(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata))); |
| | | } |
| | | |
| | | // 批量生成嵌入向量并存储 |
| | | int chunkCount = 0; |
| | |
| | | /** |
| | | * 提取文件内容 |
| | | */ |
| | | private String extractFileContent(File file, String fileName, String contentType) throws Exception { |
| | | private String extractFileContent(File file, String fileName) throws Exception { |
| | | String ext = getFileExtension(fileName); |
| | | |
| | | // 根据文件类型提取内容 |
| | | if (isPlainText(ext)) { |
| | | return Files.readString(file.toPath()); |
| | | return readFileWithEncoding(file); |
| | | } |
| | | |
| | | if ("docx".equals(ext)) { |
| | |
| | | } |
| | | |
| | | // 默认尝试读取文本 |
| | | return Files.readString(file.toPath()); |
| | | return readFileWithEncoding(file); |
| | | } |
| | | |
| | | /** |
| | | * 自动检测文件编码并读取内容 |
| | | * 优先尝试 UTF-8,失败则尝试 GBK |
| | | */ |
| | | private String readFileWithEncoding(File file) throws Exception { |
| | | byte[] bytes = Files.readAllBytes(file.toPath()); |
| | | |
| | | // 先尝试 UTF-8 |
| | | String utf8Content = new String(bytes, StandardCharsets.UTF_8); |
| | | if (isValidUtf8(utf8Content)) { |
| | | log.debug("文件编码: UTF-8"); |
| | | return utf8Content; |
| | | } |
| | | |
| | | // 尝试 GBK |
| | | try { |
| | | Charset gbk = Charset.forName("GBK"); |
| | | String gbkContent = new String(bytes, gbk); |
| | | log.debug("文件编码: GBK"); |
| | | return gbkContent; |
| | | } catch (Exception e) { |
| | | log.warn("编码检测失败,使用 UTF-8"); |
| | | return utf8Content; |
| | | } |
| | | } |
| | | |
| | | /** |
| | | * 检查 UTF-8 解码是否有效 |
| | | */ |
| | | private boolean isValidUtf8(String decoded) { |
| | | // 检查是否包含替换字符(说明 UTF-8 解码失败) |
| | | if (decoded.contains("�")) { |
| | | return false; |
| | | } |
| | | // 检查是否有过多的非打印字符(乱码特征) |
| | | int invalidCount = 0; |
| | | for (int i = 0; i < Math.min(decoded.length(), 1000); i++) { |
| | | char c = decoded.charAt(i); |
| | | // 检查私有使用区域或异常的控制字符 |
| | | if ((c >= '' && c <= '') || (c < ' ' && c != '\n' && c != '\r' && c != '\t')) { |
| | | invalidCount++; |
| | | } |
| | | } |
| | | // 如果无效字符超过 5%,认为是编码错误 |
| | | return invalidCount < Math.min(decoded.length(), 1000) * 0.05; |
| | | } |
| | | |
| | | private String getFileExtension(String fileName) { |