From eb832a81fb17402b1dded01459a01d7d99f16262 Mon Sep 17 00:00:00 2001
From: 云 <2163098428@qq.com>
Date: 星期二, 09 六月 2026 14:43:25 +0800
Subject: [PATCH] refactor(ai): 重构 Pinecone 向量存储配置和服务实现

---
 src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java |  139 +++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 124 insertions(+), 15 deletions(-)

diff --git a/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java b/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
index 751c427..3b0211a 100644
--- a/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
+++ b/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
@@ -1,6 +1,5 @@
 package com.ruoyi.ai.service.impl;
 
-import com.ruoyi.ai.service.AiFileTextExtractor;
 import com.ruoyi.ai.service.KnowledgeRagService;
 import com.ruoyi.approve.pojo.KnowledgeBaseVector;
 import com.ruoyi.approve.service.KnowledgeBaseVectorService;
@@ -10,16 +9,18 @@
 import dev.langchain4j.data.embedding.Embedding;
 import dev.langchain4j.data.segment.TextSegment;
 import dev.langchain4j.model.embedding.EmbeddingModel;
-import dev.langchain4j.store.embedding.EmbeddingMatch;
 import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
 import dev.langchain4j.store.embedding.EmbeddingSearchResult;
 import dev.langchain4j.store.embedding.EmbeddingStore;
-import lombok.RequiredArgsConstructor;
+import io.pinecone.clients.Index;
 import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Service;
 
 import java.io.File;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -32,27 +33,56 @@
  */
 @Slf4j
 @Service
-@RequiredArgsConstructor
 public class KnowledgeRagServiceImpl implements KnowledgeRagService {
 
     private final KnowledgeBaseVectorService knowledgeBaseVectorService;
     private final StorageBlobService storageBlobService;
-    private final AiFileTextExtractor aiFileTextExtractor;
     private final EmbeddingModel embeddingModel;
     private final EmbeddingStore<TextSegment> embeddingStore;
     private final FileProperties fileProperties;
+    private final Index pineconeIndex; // Pinecone index client; deleteEmbeddings calls delete(...) on it directly
+
+    @Value("${pinecone.namespace:knowledge-base}") // falls back to "knowledge-base" when the property is unset
+    private String namespace; // namespace handed to pineconeIndex.delete
+
+    /** Injects all collaborators through the constructor; the namespace field is populated separately via @Value. */
+    public KnowledgeRagServiceImpl(KnowledgeBaseVectorService knowledgeBaseVectorService,
+                                   StorageBlobService storageBlobService,
+                                   EmbeddingModel embeddingModel,
+                                   EmbeddingStore<TextSegment> embeddingStore,
+                                   FileProperties fileProperties,
+                                   Index pineconeIndex) {
+        this.pineconeIndex = pineconeIndex;
+        this.fileProperties = fileProperties;
+        this.embeddingStore = embeddingStore;
+        this.embeddingModel = embeddingModel;
+        this.storageBlobService = storageBlobService;
+        this.knowledgeBaseVectorService = knowledgeBaseVectorService;
+    }
 
     private static final int CHUNK_SIZE = 500;
     private static final int CHUNK_OVERLAP = 100;
+    /**
+     * File-size threshold: content from files larger than this is always split into chunks.
+     * 80MB = 80 * 1024 * 1024 bytes
+     */
+    private static final long CHUNK_THRESHOLD_BYTES = 80L * 1024 * 1024;
+    /**
+     * Maximum text length sent to the embedding model in one piece; longer content is chunked.
+     * Alibaba Cloud DashScope limits input to 8192 characters (per the original note); 8000 is used here.
+     */
+    private static final int EMBEDDING_MAX_LENGTH = 8000;
 
     @Override
-    @Async
+    @Async("threadPoolTaskExecutor") // explicit qualifier: run on the executor bean with this name
     public void processVectorAsync(Long vectorId) {
+        log.info("寮�濮嬪紓姝ュ悜閲忓寲澶勭悊: vectorId={}, thread={}", vectorId, Thread.currentThread().getName()); // thread name shows which executor ran the task
         processVector(vectorId);
     }
 
     @Override
     public void processVector(Long vectorId) {
+        log.info("寮�濮嬪鐞嗗悜閲忓寲: vectorId={}", vectorId);
         KnowledgeBaseVector vector = knowledgeBaseVectorService.getById(vectorId);
         if (vector == null) {
             log.error("鍚戦噺璁板綍涓嶅瓨鍦�: {}", vectorId);
@@ -61,30 +91,48 @@
 
         try {
             // 鏇存柊鐘舵�佷负澶勭悊涓�
+            log.info("鏇存柊鐘舵�佷负澶勭悊涓�: vectorId={}", vectorId);
             knowledgeBaseVectorService.updateVectorStatus(vectorId,
                     KnowledgeBaseVector.STATUS_PROCESSING, null, null);
 
             // 鑾峰彇鏂囦欢鍐呭
+            log.info("鑾峰彇鏂囦欢淇℃伅: storageBlobId={}", vector.getStorageBlobId());
             StorageBlob blob = storageBlobService.getById(vector.getStorageBlobId());
             if (blob == null) {
                 throw new RuntimeException("鏂囦欢涓嶅瓨鍦�: " + vector.getStorageBlobId());
             }
 
             File file = getFile(blob);
+            log.info("鏂囦欢璺緞: {}, 鏄惁瀛樺湪: {}", file.getAbsolutePath(), file.exists());
+            long fileSize = file.length();
+            log.info("鏂囦欢澶у皬: {} 瀛楄妭", fileSize);
 
             // 鐩存帴璇诲彇鏂囦欢鍐呭锛屼笉浣跨敤 MultipartFile 鍖呰
-            String content = extractFileContent(file, vector.getFileName(), blob.getContentType());
+            log.info("鎻愬彇鏂囦欢鍐呭: fileName={}", vector.getFileName());
+            String content = extractFileContent(file, vector.getFileName());
+            log.info("鏂囦欢鍐呭闀垮害: {}", content != null ? content.length() : 0);
 
             if (content == null || content.trim().isEmpty()) {
                 throw new RuntimeException("鏂囦欢鍐呭涓虹┖");
             }
 
             // 鏂囨湰鍒囩墖
-            List<TextSegment> chunks = splitText(content, vector);
+            List<TextSegment> chunks;
+            boolean needChunk = fileSize > CHUNK_THRESHOLD_BYTES || content.length() > EMBEDDING_MAX_LENGTH;
+            if (needChunk) {
+                log.info("寮�濮嬪垏鐗�: fileSize={}, contentLength={}", fileSize, content.length());
+                chunks = splitText(content, vector);
+                log.info("鍒囩墖瀹屾垚锛屽叡 {} 涓潡", chunks.size());
+            } else {
+                log.info("鏂囦欢杈冨皬涓斿唴瀹归暱搴}涓嶈秴杩噞}锛屼笉杩涜鍒囩墖", content.length(), EMBEDDING_MAX_LENGTH);
+                Map<String, Object> metadata = buildMetadata(vector);
+                chunks = List.of(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata)));
+            }
 
             // 鎵归噺鐢熸垚宓屽叆鍚戦噺骞跺瓨鍌�
             int chunkCount = 0;
             for (TextSegment chunk : chunks) {
+                log.debug("澶勭悊绗� {} 涓潡", chunkCount + 1);
                 Embedding embedding = embeddingModel.embed(chunk).content();
                 embeddingStore.add(embedding, chunk);
                 chunkCount++;
@@ -130,11 +178,25 @@
 
     @Override
     public void deleteEmbeddings(String namespace, Long storageBlobId) {
-        // Pinecone 鎸夊懡鍚嶇┖闂村垹闄ら渶瑕佺壒瀹氬疄鐜�
-        // 褰撳墠瀹炵幇锛氶�氳繃 metadata 杩囨护鍒犻櫎
         log.info("鍒犻櫎鍚戦噺鏁版嵁: namespace={}, storageBlobId={}", namespace, storageBlobId);
-        // 娉ㄦ剰锛歅inecone 鐨勫垹闄ゆ搷浣滈渶瑕佸湪 EmbeddingStore 灞傚疄鐜�
-        // 褰撳墠浣跨敤 PineconeEmbeddingStore锛屽彲鑳介渶瑕佽皟鐢� Pinecone 瀹㈡埛绔洿鎺ュ垹闄�
+        try {
+            // Filter {"storageBlobId": {"$eq": id}}: Pinecone metadata filters need the $eq operator,
+            // and the value type must match what was stored (Long -> Number).
+            com.google.protobuf.Value idValue = com.google.protobuf.Value.newBuilder()
+                    .setNumberValue(storageBlobId.doubleValue())
+                    .build();
+            com.google.protobuf.Struct eqClause = com.google.protobuf.Struct.newBuilder()
+                    .putFields("$eq", idValue).build();
+            com.google.protobuf.Struct filter = com.google.protobuf.Struct.newBuilder()
+                    .putFields("storageBlobId",
+                            com.google.protobuf.Value.newBuilder().setStructValue(eqClause).build())
+                    .build();
+            // Filter-only delete in the configured this.namespace; the namespace argument is only logged.
+            pineconeIndex.delete(new ArrayList<>(), false, this.namespace, filter);
+            log.info("鍚戦噺鍒犻櫎瀹屾垚: storageBlobId={}", storageBlobId);
+        } catch (Exception e) {
+            log.error("鍒犻櫎鍚戦噺鏁版嵁澶辫触: namespace={}, storageBlobId={}", namespace, storageBlobId, e);
+        }
     }
 
     private File getFile(StorageBlob blob) {
@@ -148,12 +210,12 @@
     /**
      * 鎻愬彇鏂囦欢鍐呭
      */
-    private String extractFileContent(File file, String fileName, String contentType) throws Exception {
+    private String extractFileContent(File file, String fileName) throws Exception {
         String ext = getFileExtension(fileName);
 
         // 鏍规嵁鏂囦欢绫诲瀷鎻愬彇鍐呭
         if (isPlainText(ext)) {
-            return Files.readString(file.toPath());
+            return readFileWithEncoding(file);
         }
 
         if ("docx".equals(ext)) {
@@ -169,7 +231,54 @@
         }
 
         // 榛樿灏濊瘯璇诲彇鏂囨湰
-        return Files.readString(file.toPath());
+        return readFileWithEncoding(file);
+    }
+
+    /**
+     * Reads a text file, decoding it as UTF-8 when its bytes are well-formed UTF-8 and
+     * falling back to GBK otherwise.
+     * <p>Validity is decided on the raw bytes with a strict decoder rather than by scanning the
+     * decoded text, so UTF-8 files that legitimately contain U+FFFD, private-use or control
+     * characters, as well as empty files, are still read as UTF-8.
+     *
+     * @param file file to read; its whole content is loaded into memory
+     * @return the decoded file content
+     * @throws Exception if the file cannot be read
+     */
+    private String readFileWithEncoding(File file) throws Exception {
+        byte[] bytes = Files.readAllBytes(file.toPath());
+
+        String utf8Content = decodeStrictUtf8(bytes);
+        if (utf8Content != null) {
+            log.debug("鏂囦欢缂栫爜: UTF-8");
+            return utf8Content;
+        }
+
+        try {
+            String gbkContent = new String(bytes, Charset.forName("GBK"));
+            log.debug("鏂囦欢缂栫爜: GBK");
+            return gbkContent;
+        } catch (java.nio.charset.UnsupportedCharsetException e) {
+            // GBK is unavailable in this runtime: fall back to lenient UTF-8 decoding.
+            log.warn("缂栫爜妫�娴嬪け璐ワ紝浣跨敤 UTF-8");
+            return new String(bytes, StandardCharsets.UTF_8);
+        }
+    }
+
+    /**
+     * Decodes {@code bytes} as UTF-8, rejecting malformed or unmappable sequences.
+     *
+     * @return the decoded text, or {@code null} when the bytes are not valid UTF-8
+     */
+    private String decodeStrictUtf8(byte[] bytes) {
+        try {
+            return StandardCharsets.UTF_8.newDecoder()
+                    .onMalformedInput(java.nio.charset.CodingErrorAction.REPORT)
+                    .onUnmappableCharacter(java.nio.charset.CodingErrorAction.REPORT)
+                    .decode(java.nio.ByteBuffer.wrap(bytes)).toString();
+        } catch (java.nio.charset.CharacterCodingException e) {
+            return null;
+        }
     }
 
     private String getFileExtension(String fileName) {

--
Gitblit v1.9.3