| | |
| | | package com.ruoyi.ai.service.impl; |
| | | |
| | | import com.ruoyi.ai.service.AiFileTextExtractor; |
| | | import com.ruoyi.ai.service.KnowledgeRagService; |
| | | import com.ruoyi.approve.pojo.KnowledgeBaseVector; |
| | | import com.ruoyi.approve.service.KnowledgeBaseVectorService; |
| | |
| | | import dev.langchain4j.data.embedding.Embedding; |
| | | import dev.langchain4j.data.segment.TextSegment; |
| | | import dev.langchain4j.model.embedding.EmbeddingModel; |
| | | import dev.langchain4j.store.embedding.EmbeddingMatch; |
| | | import dev.langchain4j.store.embedding.EmbeddingSearchRequest; |
| | | import dev.langchain4j.store.embedding.EmbeddingSearchResult; |
| | | import dev.langchain4j.store.embedding.EmbeddingStore; |
| | | import lombok.RequiredArgsConstructor; |
| | | import io.pinecone.clients.Index; |
| | | import lombok.extern.slf4j.Slf4j; |
| | | import org.springframework.beans.factory.annotation.Value; |
| | | import org.springframework.scheduling.annotation.Async; |
| | | import org.springframework.stereotype.Service; |
| | | |
| | | import java.io.File; |
| | | import java.nio.charset.Charset; |
| | | import java.nio.charset.StandardCharsets; |
| | | import java.nio.file.Files; |
| | | import java.util.ArrayList; |
| | | import java.util.HashMap; |
| | |
| | | */ |
| | | @Slf4j |
| | | @Service |
| | | @RequiredArgsConstructor |
| | | public class KnowledgeRagServiceImpl implements KnowledgeRagService { |
| | | |
| | | private final KnowledgeBaseVectorService knowledgeBaseVectorService; |
| | | private final StorageBlobService storageBlobService; |
| | | private final AiFileTextExtractor aiFileTextExtractor; |
| | | private final EmbeddingModel embeddingModel; |
| | | private final EmbeddingStore<TextSegment> embeddingStore; |
| | | private final FileProperties fileProperties; |
| | | private final Index pineconeIndex; |
| | | |
| | | @Value("${pinecone.namespace:knowledge-base}") |
| | | private String namespace; |
| | | |
/**
 * Creates the service and stores the collaborators it is given.
 *
 * <p>NOTE(review): the class also declares a final {@code aiFileTextExtractor}
 * field and carries {@code @RequiredArgsConstructor} in this view, yet this
 * constructor does not assign that field - confirm how it is initialised.
 *
 * @param knowledgeBaseVectorService service for knowledge-base vector records
 * @param storageBlobService         service for stored file blobs
 * @param embeddingModel             model used to embed text
 * @param embeddingStore             store holding the embedded text segments
 * @param fileProperties             file-related configuration
 * @param pineconeIndex              Pinecone index client
 */
public KnowledgeRagServiceImpl(
        KnowledgeBaseVectorService knowledgeBaseVectorService,
        StorageBlobService storageBlobService,
        EmbeddingModel embeddingModel,
        EmbeddingStore<TextSegment> embeddingStore,
        FileProperties fileProperties,
        Index pineconeIndex) {
    // Persistence-side collaborators.
    this.storageBlobService = storageBlobService;
    this.knowledgeBaseVectorService = knowledgeBaseVectorService;
    this.fileProperties = fileProperties;

    // Embedding / vector-store collaborators.
    this.embeddingStore = embeddingStore;
    this.embeddingModel = embeddingModel;
    this.pineconeIndex = pineconeIndex;
}
| | | |
| | | private static final int CHUNK_SIZE = 500; |
| | | private static final int CHUNK_OVERLAP = 100; |
| | | /** |
| | | * 文件大小阈值,超过此值才进行切片 |
| | | * 80MB = 80 * 1024 * 1024 字节 |
| | | */ |
| | | private static final long CHUNK_THRESHOLD_BYTES = 80L * 1024 * 1024; |
| | | /** |
| | | * Embedding 模型最大输入长度限制 |
| | | * 阿里云 DashScope 限制为 8192 字符 |
| | | */ |
| | | private static final int EMBEDDING_MAX_LENGTH = 8000; |
| | | |
| | | @Override |
| | | @Async("threadPoolTaskExecutor") |
| | |
| | | |
| | | File file = getFile(blob); |
| | | log.info("文件路径: {}, 是否存在: {}", file.getAbsolutePath(), file.exists()); |
| | | long fileSize = file.length(); |
| | | log.info("文件大小: {} 字节", fileSize); |
| | | |
| | | // 直接读取文件内容,不使用 MultipartFile 包装 |
| | | log.info("提取文件内容: fileName={}", vector.getFileName()); |
| | | String content = extractFileContent(file, vector.getFileName(), blob.getContentType()); |
| | | String content = extractFileContent(file, vector.getFileName()); |
| | | log.info("文件内容长度: {}", content != null ? content.length() : 0); |
| | | |
| | | if (content == null || content.trim().isEmpty()) { |
| | |
| | | } |
| | | |
| | | // 文本切片 |
| | | log.info("开始文本切片"); |
| | | List<TextSegment> chunks = splitText(content, vector); |
| | | log.info("切片完成,共 {} 个块", chunks.size()); |
| | | List<TextSegment> chunks; |
| | | boolean needChunk = fileSize > CHUNK_THRESHOLD_BYTES || content.length() > EMBEDDING_MAX_LENGTH; |
| | | if (needChunk) { |
| | | log.info("开始切片: fileSize={}, contentLength={}", fileSize, content.length()); |
| | | chunks = splitText(content, vector); |
| | | log.info("切片完成,共 {} 个块", chunks.size()); |
| | | } else { |
| | | log.info("文件较小且内容长度{}不超过{},不进行切片", content.length(), EMBEDDING_MAX_LENGTH); |
| | | Map<String, Object> metadata = buildMetadata(vector); |
| | | chunks = List.of(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata))); |
| | | } |
| | | |
| | | // 批量生成嵌入向量并存储 |
| | | int chunkCount = 0; |
| | |
| | | |
| | | @Override |
| | | public void deleteEmbeddings(String namespace, Long storageBlobId) { |
| | | // Pinecone 按命名空间删除需要特定实现 |
| | | // 当前实现:通过 metadata 过滤删除 |
| | | log.info("删除向量数据: namespace={}, storageBlobId={}", namespace, storageBlobId); |
| | | // 注意:Pinecone 的删除操作需要在 EmbeddingStore 层实现 |
| | | // 当前使用 PineconeEmbeddingStore,可能需要调用 Pinecone 客户端直接删除 |
| | | try { |
| | | // Pinecone metadata filter 需要使用 $eq 操作符 |
| | | // 且值的类型需要与存储时一致(Long -> Number) |
| | | com.google.protobuf.Struct filter = com.google.protobuf.Struct.newBuilder() |
| | | .putFields("storageBlobId", com.google.protobuf.Value.newBuilder() |
| | | .setStructValue(com.google.protobuf.Struct.newBuilder() |
| | | .putFields("$eq", com.google.protobuf.Value.newBuilder() |
| | | .setNumberValue(storageBlobId.doubleValue()) |
| | | .build())) |
| | | .build()) |
| | | .build(); |
| | | |
| | | List<String> emptyIds = new ArrayList<>(); |
| | | pineconeIndex.delete(emptyIds, false, this.namespace, filter); |
| | | log.info("向量删除完成: storageBlobId={}", storageBlobId); |
| | | } catch (Exception e) { |
| | | log.error("删除向量数据失败: namespace={}, storageBlobId={}", namespace, storageBlobId, e); |
| | | } |
| | | } |
| | | |
| | | private File getFile(StorageBlob blob) { |
| | |
| | | /** |
| | | * 提取文件内容 |
| | | */ |
| | | private String extractFileContent(File file, String fileName, String contentType) throws Exception { |
| | | private String extractFileContent(File file, String fileName) throws Exception { |
| | | String ext = getFileExtension(fileName); |
| | | |
| | | // 根据文件类型提取内容 |
| | | if (isPlainText(ext)) { |
| | | return Files.readString(file.toPath()); |
| | | return readFileWithEncoding(file); |
| | | } |
| | | |
| | | if ("docx".equals(ext)) { |
| | |
| | | } |
| | | |
| | | // 默认尝试读取文本 |
| | | return Files.readString(file.toPath()); |
| | | return readFileWithEncoding(file); |
| | | } |
| | | |
| | | /** |
| | | * 自动检测文件编码并读取内容 |
| | | * 优先尝试 UTF-8,失败则尝试 GBK |
| | | */ |
| | | private String readFileWithEncoding(File file) throws Exception { |
| | | byte[] bytes = Files.readAllBytes(file.toPath()); |
| | | |
| | | // 先尝试 UTF-8 |
| | | String utf8Content = new String(bytes, StandardCharsets.UTF_8); |
| | | if (isValidUtf8(utf8Content)) { |
| | | log.debug("文件编码: UTF-8"); |
| | | return utf8Content; |
| | | } |
| | | |
| | | // 尝试 GBK |
| | | try { |
| | | Charset gbk = Charset.forName("GBK"); |
| | | String gbkContent = new String(bytes, gbk); |
| | | log.debug("文件编码: GBK"); |
| | | return gbkContent; |
| | | } catch (Exception e) { |
| | | log.warn("编码检测失败,使用 UTF-8"); |
| | | return utf8Content; |
| | | } |
| | | } |
| | | |
| | | /** |
| | | * 检查 UTF-8 解码是否有效 |
| | | */ |
| | | private boolean isValidUtf8(String decoded) { |
| | | // 检查是否包含替换字符(说明 UTF-8 解码失败) |
| | | if (decoded.contains("�")) { |
| | | return false; |
| | | } |
| | | // 检查是否有过多的非打印字符(乱码特征) |
| | | int invalidCount = 0; |
| | | for (int i = 0; i < Math.min(decoded.length(), 1000); i++) { |
| | | char c = decoded.charAt(i); |
| | | // 检查私有使用区域或异常的控制字符 |
| | | if ((c >= '' && c <= '') || (c < ' ' && c != '\n' && c != '\r' && c != '\t')) { |
| | | invalidCount++; |
| | | } |
| | | } |
| | | // 如果无效字符超过 5%,认为是编码错误 |
| | | return invalidCount < Math.min(decoded.length(), 1000) * 0.05; |
| | | } |
| | | |
| | | private String getFileExtension(String fileName) { |