From a61d5a200f064ac52778713ce461161402b5b10f Mon Sep 17 00:00:00 2001
From: 云 <2163098428@qq.com>
Date: Tue, 09 Jun 2026 14:03:39 +0800
Subject: [PATCH] refactor(knowledge-base): 重构RAG向量检索功能的文件关联和异步处理

---
 src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java |   86 ++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java b/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
index e0c264e..637011c 100644
--- a/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
+++ b/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
@@ -1,6 +1,5 @@
 package com.ruoyi.ai.service.impl;
 
-import com.ruoyi.ai.service.AiFileTextExtractor;
 import com.ruoyi.ai.service.KnowledgeRagService;
 import com.ruoyi.approve.pojo.KnowledgeBaseVector;
 import com.ruoyi.approve.service.KnowledgeBaseVectorService;
@@ -10,7 +9,6 @@
 import dev.langchain4j.data.embedding.Embedding;
 import dev.langchain4j.data.segment.TextSegment;
 import dev.langchain4j.model.embedding.EmbeddingModel;
-import dev.langchain4j.store.embedding.EmbeddingMatch;
 import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
 import dev.langchain4j.store.embedding.EmbeddingSearchResult;
 import dev.langchain4j.store.embedding.EmbeddingStore;
@@ -20,6 +18,8 @@
 import org.springframework.stereotype.Service;
 
 import java.io.File;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -37,13 +37,22 @@
 
     private final KnowledgeBaseVectorService knowledgeBaseVectorService;
     private final StorageBlobService storageBlobService;
-    private final AiFileTextExtractor aiFileTextExtractor;
     private final EmbeddingModel embeddingModel;
     private final EmbeddingStore<TextSegment> embeddingStore;
     private final FileProperties fileProperties;
 
     private static final int CHUNK_SIZE = 500;
     private static final int CHUNK_OVERLAP = 100;
+    /**
+     * File-size threshold in bytes; content of files larger than this is always chunked.
+     * 80 MB = 80 * 1024 * 1024 bytes.
+     */
+    private static final long CHUNK_THRESHOLD_BYTES = 80L * 1024 * 1024;
+    /**
+     * Content length (in chars) above which text is chunked before embedding.
+     * Original note: Alibaba Cloud DashScope caps input at 8192 characters (not verified here).
+     */
+    private static final int EMBEDDING_MAX_LENGTH = 8000;
 
     @Override
     @Async("threadPoolTaskExecutor")
@@ -76,10 +85,12 @@
 
             File file = getFile(blob);
             log.info("鏂囦欢璺緞: {}, 鏄惁瀛樺湪: {}", file.getAbsolutePath(), file.exists());
+            long fileSize = file.length();
+            log.info("鏂囦欢澶у皬: {} 瀛楄妭", fileSize);
 
             // 直接读取文件内容，不使用 MultipartFile 包装
             log.info("鎻愬彇鏂囦欢鍐呭: fileName={}", vector.getFileName());
-            String content = extractFileContent(file, vector.getFileName(), blob.getContentType());
+            String content = extractFileContent(file, vector.getFileName());
             log.info("鏂囦欢鍐呭闀垮害: {}", content != null ? content.length() : 0);
 
             if (content == null || content.trim().isEmpty()) {
@@ -87,9 +98,17 @@
             }
 
             // 文本切片
-            log.info("寮�濮嬫枃鏈垏鐗�");
-            List<TextSegment> chunks = splitText(content, vector);
-            log.info("鍒囩墖瀹屾垚锛屽叡 {} 涓潡", chunks.size());
+            List<TextSegment> chunks;
+            boolean needChunk = fileSize > CHUNK_THRESHOLD_BYTES || content.length() > EMBEDDING_MAX_LENGTH;
+            if (needChunk) {
+                log.info("寮�濮嬪垏鐗�: fileSize={}, contentLength={}", fileSize, content.length());
+                chunks = splitText(content, vector);
+                log.info("鍒囩墖瀹屾垚锛屽叡 {} 涓潡", chunks.size());
+            } else {
+                log.info("鏂囦欢杈冨皬涓斿唴瀹归暱搴}涓嶈秴杩噞}锛屼笉杩涜鍒囩墖", content.length(), EMBEDDING_MAX_LENGTH);
+                Map<String, Object> metadata = buildMetadata(vector);
+                chunks = List.of(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata)));
+            }
 
             // 批量生成嵌入向量并存储
             int chunkCount = 0;
@@ -158,12 +177,12 @@
     /**
      * 提取文件内容
      */
-    private String extractFileContent(File file, String fileName, String contentType) throws Exception {
+    private String extractFileContent(File file, String fileName) throws Exception {
         String ext = getFileExtension(fileName);
 
         // 根据文件类型提取内容
         if (isPlainText(ext)) {
-            return Files.readString(file.toPath());
+            return readFileWithEncoding(file);
         }
 
         if ("docx".equals(ext)) {
@@ -179,7 +198,54 @@
         }
 
         // 默认尝试读取文本
-        return Files.readString(file.toPath());
+        return readFileWithEncoding(file);
+    }
+
+    /**
+     * Reads the whole file and decodes it as text: UTF-8 first, GBK if that looks garbled.
+     * @param file file to read
+     * @return decoded content; lenient UTF-8 when GBK is unavailable on this JVM
+     * @throws Exception if the file cannot be read
+     */
+    private String readFileWithEncoding(File file) throws Exception {
+        byte[] bytes = Files.readAllBytes(file.toPath());
+        // Lenient decode: malformed sequences become U+FFFD, which isValidUtf8 looks for.
+        String utf8Content = new String(bytes, StandardCharsets.UTF_8);
+        if (isValidUtf8(utf8Content)) {
+            log.debug("文件编码: UTF-8");
+            return utf8Content;
+        }
+        try {
+            String gbkContent = new String(bytes, Charset.forName("GBK"));
+            log.debug("文件编码: GBK");
+            return gbkContent;
+        } catch (IllegalArgumentException e) {
+            // Only Charset.forName can fail here (GBK not installed); keep best-effort UTF-8.
+            log.warn("编码检测失败，使用 UTF-8", e);
+            return utf8Content;
+        }
+    }
+
+    /**
+     * Heuristically checks that {@code decoded} came from a clean UTF-8 decode.
+     * Empty text is accepted; only the first 1000 chars feed the 5% ratio check.
+     */
+    private boolean isValidUtf8(String decoded) {
+        // U+FFFD is what a lenient decoder substitutes for malformed bytes.
+        if (decoded.indexOf('\uFFFD') >= 0) {
+            return false;
+        }
+        int sample = Math.min(decoded.length(), 1000);
+        int invalidCount = 0;
+        for (int i = 0; i < sample; i++) {
+            char c = decoded.charAt(i);
+            // Private Use Area or control chars other than LF/CR/TAB hint at garbling.
+            if ((c >= '\uE000' && c <= '\uF8FF') || (c < ' ' && c != '\n' && c != '\r' && c != '\t')) {
+                invalidCount++;
+            }
+        }
+        // sample == 0 means empty text, which must not be reported as a decode failure.
+        return sample == 0 || invalidCount < sample * 0.05;
+    }
 
     private String getFileExtension(String fileName) {

--
Gitblit v1.9.3