From a61d5a200f064ac52778713ce461161402b5b10f Mon Sep 17 00:00:00 2001
From: 云 <2163098428@qq.com>
Date: 星期二, 09 六月 2026 14:03:39 +0800
Subject: [PATCH] refactor(knowledge-base): 重构RAG向量检索功能的文件关联和异步处理
---
src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java | 86 ++++++++++++++++++++++++++++++++++++++-----
1 files changed, 76 insertions(+), 10 deletions(-)
diff --git a/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java b/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
index e0c264e..637011c 100644
--- a/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
+++ b/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
@@ -1,6 +1,5 @@
package com.ruoyi.ai.service.impl;
-import com.ruoyi.ai.service.AiFileTextExtractor;
import com.ruoyi.ai.service.KnowledgeRagService;
import com.ruoyi.approve.pojo.KnowledgeBaseVector;
import com.ruoyi.approve.service.KnowledgeBaseVectorService;
@@ -10,7 +9,6 @@
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
-import dev.langchain4j.store.embedding.EmbeddingMatch;
import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
import dev.langchain4j.store.embedding.EmbeddingSearchResult;
import dev.langchain4j.store.embedding.EmbeddingStore;
@@ -20,6 +18,8 @@
import org.springframework.stereotype.Service;
import java.io.File;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
@@ -37,13 +37,22 @@
private final KnowledgeBaseVectorService knowledgeBaseVectorService;
private final StorageBlobService storageBlobService;
- private final AiFileTextExtractor aiFileTextExtractor;
private final EmbeddingModel embeddingModel;
private final EmbeddingStore<TextSegment> embeddingStore;
private final FileProperties fileProperties;
private static final int CHUNK_SIZE = 500;
private static final int CHUNK_OVERLAP = 100;
+ /**
+ * File-size threshold in bytes; a file larger than this is always chunked.
+ * 80MB = 80 * 1024 * 1024 bytes
+ */
+ private static final long CHUNK_THRESHOLD_BYTES = 80L * 1024 * 1024;
+ /**
+ * Maximum text length, in characters, sent to the embedding model without chunking.
+ * Alibaba Cloud DashScope is noted as limiting input to 8192 characters (TODO confirm); this constant is set lower, at 8000.
+ */
+ private static final int EMBEDDING_MAX_LENGTH = 8000;
@Override
@Async("threadPoolTaskExecutor")
@@ -76,10 +85,12 @@
File file = getFile(blob);
log.info("鏂囦欢璺緞: {}, 鏄惁瀛樺湪: {}", file.getAbsolutePath(), file.exists());
+ long fileSize = file.length();
+ log.info("鏂囦欢澶у皬: {} 瀛楄妭", fileSize);
// Read the file content directly, without wrapping it in a MultipartFile
log.info("鎻愬彇鏂囦欢鍐呭: fileName={}", vector.getFileName());
- String content = extractFileContent(file, vector.getFileName(), blob.getContentType());
+ String content = extractFileContent(file, vector.getFileName());
log.info("鏂囦欢鍐呭闀垮害: {}", content != null ? content.length() : 0);
if (content == null || content.trim().isEmpty()) {
@@ -87,9 +98,17 @@
}
// Text chunking
- log.info("寮�濮嬫枃鏈垏鐗�");
- List<TextSegment> chunks = splitText(content, vector);
- log.info("鍒囩墖瀹屾垚锛屽叡 {} 涓潡", chunks.size());
+ List<TextSegment> chunks;
+ boolean needChunk = fileSize > CHUNK_THRESHOLD_BYTES || content.length() > EMBEDDING_MAX_LENGTH;
+ if (needChunk) {
+ log.info("寮�濮嬪垏鐗�: fileSize={}, contentLength={}", fileSize, content.length());
+ chunks = splitText(content, vector);
+ log.info("鍒囩墖瀹屾垚锛屽叡 {} 涓潡", chunks.size());
+ } else {
+ log.info("鏂囦欢杈冨皬涓斿唴瀹归暱搴}涓嶈秴杩噞}锛屼笉杩涜鍒囩墖", content.length(), EMBEDDING_MAX_LENGTH);
+ Map<String, Object> metadata = buildMetadata(vector);
+ chunks = List.of(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata)));
+ }
// Generate the embedding vectors in batches and store them
int chunkCount = 0;
@@ -158,12 +177,12 @@
/**
* Extracts the text content of a file
*/
- private String extractFileContent(File file, String fileName, String contentType) throws Exception {
+ private String extractFileContent(File file, String fileName) throws Exception {
String ext = getFileExtension(fileName);
// Extract the content according to the file type
if (isPlainText(ext)) {
- return Files.readString(file.toPath());
+ return readFileWithEncoding(file);
}
if ("docx".equals(ext)) {
@@ -179,7 +198,54 @@
}
// By default, try to read the file as plain text
- return Files.readString(file.toPath());
+ return readFileWithEncoding(file);
+ }
+
+    /**
+     * Reads a text file, choosing between UTF-8 and GBK.
+     *
+     * <p>The bytes are first decoded as UTF-8 with a strict decoder, so a malformed
+     * sequence is detected exactly instead of being guessed from the decoded text.
+     * A file that is not valid UTF-8 is decoded as GBK. If this JVM does not provide
+     * GBK, the bytes are decoded leniently as UTF-8, where each malformed sequence
+     * becomes U+FFFD.
+     *
+     * @param file the file to read; its whole content is loaded into memory
+     * @return the decoded text; a leading UTF-8 byte-order mark is removed
+     * @throws Exception if the file cannot be read
+     */
+    private String readFileWithEncoding(File file) throws Exception {
+        byte[] bytes = Files.readAllBytes(file.toPath());
+
+        // Try UTF-8 first. REPORT makes the decoder throw on the first malformed
+        // sequence, whereas new String(bytes, UTF_8) would silently substitute U+FFFD.
+        try {
+            String utf8Content = StandardCharsets.UTF_8.newDecoder()
+                    .onMalformedInput(java.nio.charset.CodingErrorAction.REPORT)
+                    .onUnmappableCharacter(java.nio.charset.CodingErrorAction.REPORT)
+                    .decode(java.nio.ByteBuffer.wrap(bytes))
+                    .toString();
+            log.debug("鏂囦欢缂栫爜: UTF-8");
+            // Drop the byte-order mark that some editors prepend to UTF-8 files.
+            return utf8Content.startsWith("\uFEFF") ? utf8Content.substring(1) : utf8Content;
+        } catch (java.nio.charset.CharacterCodingException ignored) {
+            // Not valid UTF-8: fall through and treat the bytes as GBK.
+        }
+
+        // Fall back to GBK. Decoding itself never throws; only the charset lookup can
+        // fail, on a runtime that ships without the extended charsets.
+        try {
+            String gbkContent = new String(bytes, Charset.forName("GBK"));
+            log.debug("鏂囦欢缂栫爜: GBK");
+            return gbkContent;
+        } catch (java.nio.charset.UnsupportedCharsetException e) {
+            log.warn("缂栫爜妫�娴嬪け璐ワ紝浣跨敤 UTF-8", e);
+            return new String(bytes, StandardCharsets.UTF_8);
+        }
+    }
+
+    /**
+     * Heuristically checks whether text decoded as UTF-8 looks correctly decoded.
+     *
+     * <p>The text is rejected when it contains the replacement character U+FFFD
+     * anywhere, or when at least 5% of its first 1000 characters are private-use
+     * characters (U+E000..U+F8FF) or control characters other than LF, CR and TAB.
+     *
+     * @param decoded the text obtained by decoding the file bytes as UTF-8
+     * @return {@code true} if the text shows no sign of a decoding failure; an
+     *         empty string is considered valid
+     */
+    private boolean isValidUtf8(String decoded) {
+        // Nothing to inspect. Without this guard the ratio test below would evaluate
+        // 0 < 0.0 and report an empty file as mis-decoded.
+        if (decoded.isEmpty()) {
+            return true;
+        }
+        // U+FFFD is what a lenient decoder emits for a malformed byte sequence.
+        // Unicode escapes keep these literals intact whatever encoding the source
+        // file is saved or viewed in.
+        if (decoded.indexOf('\uFFFD') >= 0) {
+            return false;
+        }
+        // Only a prefix is sampled so that large files stay cheap to check.
+        int sampleLength = Math.min(decoded.length(), 1000);
+        int suspiciousCount = 0;
+        for (int i = 0; i < sampleLength; i++) {
+            char c = decoded.charAt(i);
+            boolean privateUse = c >= '\uE000' && c <= '\uF8FF';
+            boolean strayControl = c < ' ' && c != '\n' && c != '\r' && c != '\t';
+            if (privateUse || strayControl) {
+                suspiciousCount++;
+            }
+        }
+        // Treat the text as mis-decoded once suspicious characters reach 5% of the sample.
+        return suspiciousCount < sampleLength * 0.05;
     }
private String getFileExtension(String fileName) {
--
Gitblit v1.9.3