From eb832a81fb17402b1dded01459a01d7d99f16262 Mon Sep 17 00:00:00 2001
From: 云 <2163098428@qq.com>
Date: 星期二, 09 六月 2026 14:43:25 +0800
Subject: [PATCH] refactor(ai): 重构 Pinecone 向量存储配置和服务实现
---
src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java | 139 +++++++++++++++++++++++++++++++++++++++++-----
1 files changed, 124 insertions(+), 15 deletions(-)
diff --git a/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java b/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
index 751c427..3b0211a 100644
--- a/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
+++ b/src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
@@ -1,6 +1,5 @@
package com.ruoyi.ai.service.impl;
-import com.ruoyi.ai.service.AiFileTextExtractor;
import com.ruoyi.ai.service.KnowledgeRagService;
import com.ruoyi.approve.pojo.KnowledgeBaseVector;
import com.ruoyi.approve.service.KnowledgeBaseVectorService;
@@ -10,16 +9,18 @@
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
-import dev.langchain4j.store.embedding.EmbeddingMatch;
import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
import dev.langchain4j.store.embedding.EmbeddingSearchResult;
import dev.langchain4j.store.embedding.EmbeddingStore;
-import lombok.RequiredArgsConstructor;
+import io.pinecone.clients.Index;
import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import java.io.File;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
@@ -32,27 +33,56 @@
*/
@Slf4j
@Service
-@RequiredArgsConstructor
public class KnowledgeRagServiceImpl implements KnowledgeRagService {
private final KnowledgeBaseVectorService knowledgeBaseVectorService;
private final StorageBlobService storageBlobService;
- private final AiFileTextExtractor aiFileTextExtractor;
private final EmbeddingModel embeddingModel;
private final EmbeddingStore<TextSegment> embeddingStore;
private final FileProperties fileProperties;
+ private final Index pineconeIndex;
+
+ @Value("${pinecone.namespace:knowledge-base}")
+ private String namespace;
+
+    /**
+     * Explicit constructor that replaces the Lombok-generated one; every collaborator is
+     * handed in by the caller and stored unchanged in the final field of the same name.
+     *
+     * @param knowledgeBaseVectorService service used to load vector records and update their status
+     * @param storageBlobService         service used to look up the stored file of a vector record
+     * @param embeddingModel             model that turns text segments into embeddings
+     * @param embeddingStore             store that receives each embedding with its segment
+     * @param fileProperties             file configuration (its use is outside this excerpt)
+     * @param pineconeIndex              Pinecone index client used for filter-based deletes
+     */
+    public KnowledgeRagServiceImpl(
+            KnowledgeBaseVectorService knowledgeBaseVectorService,
+            StorageBlobService storageBlobService,
+            EmbeddingModel embeddingModel,
+            EmbeddingStore<TextSegment> embeddingStore,
+            FileProperties fileProperties,
+            Index pineconeIndex) {
+        // Persistence-side collaborators.
+        this.storageBlobService = storageBlobService;
+        this.knowledgeBaseVectorService = knowledgeBaseVectorService;
+        this.fileProperties = fileProperties;
+
+        // Embedding / vector-store collaborators.
+        this.pineconeIndex = pineconeIndex;
+        this.embeddingStore = embeddingStore;
+        this.embeddingModel = embeddingModel;
+    }
private static final int CHUNK_SIZE = 500;
private static final int CHUNK_OVERLAP = 100;
+ /**
+ * File-size threshold: a file larger than this is always split into chunks.
+ * 80MB = 80 * 1024 * 1024 bytes.
+ */
+ private static final long CHUNK_THRESHOLD_BYTES = 80L * 1024 * 1024;
+ /**
+ * Maximum input length allowed for the embedding model.
+ * Alibaba Cloud DashScope limits input to 8192 characters; content longer than
+ * this constant is split into chunks instead of being embedded as one segment.
+ */
+ private static final int EMBEDDING_MAX_LENGTH = 8000;
+ /**
+ * Asynchronous entry point for vectorization: logs the vector id together with the
+ * name of the current thread, then delegates to {@link #processVector(Long)}.
+ * NOTE(review): the {@code @Async} value presumably names an executor bean called
+ * "threadPoolTaskExecutor" - confirm that such a bean is defined.
+ *
+ * @param vectorId id of the vector record to process
+ */
@Override
- @Async
+ @Async("threadPoolTaskExecutor")
public void processVectorAsync(Long vectorId) {
+ log.info("寮�濮嬪紓姝ュ悜閲忓寲澶勭悊: vectorId={}, thread={}", vectorId, Thread.currentThread().getName());
processVector(vectorId);
}
@Override
public void processVector(Long vectorId) {
+ log.info("寮�濮嬪鐞嗗悜閲忓寲: vectorId={}", vectorId);
KnowledgeBaseVector vector = knowledgeBaseVectorService.getById(vectorId);
if (vector == null) {
log.error("鍚戦噺璁板綍涓嶅瓨鍦�: {}", vectorId);
@@ -61,30 +91,48 @@
try {
// 鏇存柊鐘舵�佷负澶勭悊涓�
+ log.info("鏇存柊鐘舵�佷负澶勭悊涓�: vectorId={}", vectorId);
knowledgeBaseVectorService.updateVectorStatus(vectorId,
KnowledgeBaseVector.STATUS_PROCESSING, null, null);
// 鑾峰彇鏂囦欢鍐呭
+ log.info("鑾峰彇鏂囦欢淇℃伅: storageBlobId={}", vector.getStorageBlobId());
StorageBlob blob = storageBlobService.getById(vector.getStorageBlobId());
if (blob == null) {
throw new RuntimeException("鏂囦欢涓嶅瓨鍦�: " + vector.getStorageBlobId());
}
File file = getFile(blob);
+ log.info("鏂囦欢璺緞: {}, 鏄惁瀛樺湪: {}", file.getAbsolutePath(), file.exists());
+ long fileSize = file.length();
+ log.info("鏂囦欢澶у皬: {} 瀛楄妭", fileSize);
// 鐩存帴璇诲彇鏂囦欢鍐呭锛屼笉浣跨敤 MultipartFile 鍖呰
- String content = extractFileContent(file, vector.getFileName(), blob.getContentType());
+ log.info("鎻愬彇鏂囦欢鍐呭: fileName={}", vector.getFileName());
+ String content = extractFileContent(file, vector.getFileName());
+ log.info("鏂囦欢鍐呭闀垮害: {}", content != null ? content.length() : 0);
if (content == null || content.trim().isEmpty()) {
throw new RuntimeException("鏂囦欢鍐呭涓虹┖");
}
// 鏂囨湰鍒囩墖
- List<TextSegment> chunks = splitText(content, vector);
+ List<TextSegment> chunks;
+ boolean needChunk = fileSize > CHUNK_THRESHOLD_BYTES || content.length() > EMBEDDING_MAX_LENGTH;
+ if (needChunk) {
+ log.info("寮�濮嬪垏鐗�: fileSize={}, contentLength={}", fileSize, content.length());
+ chunks = splitText(content, vector);
+ log.info("鍒囩墖瀹屾垚锛屽叡 {} 涓潡", chunks.size());
+ } else {
+ log.info("鏂囦欢杈冨皬涓斿唴瀹归暱搴}涓嶈秴杩噞}锛屼笉杩涜鍒囩墖", content.length(), EMBEDDING_MAX_LENGTH);
+ Map<String, Object> metadata = buildMetadata(vector);
+ chunks = List.of(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata)));
+ }
// 鎵归噺鐢熸垚宓屽叆鍚戦噺骞跺瓨鍌�
int chunkCount = 0;
for (TextSegment chunk : chunks) {
+ log.debug("澶勭悊绗� {} 涓潡", chunkCount + 1);
Embedding embedding = embeddingModel.embed(chunk).content();
embeddingStore.add(embedding, chunk);
chunkCount++;
@@ -130,11 +178,25 @@
+    /**
+     * Deletes the vectors whose {@code storageBlobId} metadata equals the given id by calling
+     * the Pinecone index delete with a metadata filter.
+     * <p>
+     * The delete runs against the configured {@code this.namespace} field; the
+     * {@code namespace} argument is only written to the log. Any failure is logged and
+     * not rethrown.
+     *
+     * @param namespace     namespace reported in the log messages
+     * @param storageBlobId id of the stored file whose vectors should be removed
+     */
@Override
public void deleteEmbeddings(String namespace, Long storageBlobId) {
- // Pinecone 鎸夊懡鍚嶇┖闂村垹闄ら渶瑕佺壒瀹氬疄鐜�
- // 褰撳墠瀹炵幇锛氶�氳繃 metadata 杩囨护鍒犻櫎
log.info("鍒犻櫎鍚戦噺鏁版嵁: namespace={}, storageBlobId={}", namespace, storageBlobId);
- // 娉ㄦ剰锛歅inecone 鐨勫垹闄ゆ搷浣滈渶瑕佸湪 EmbeddingStore 灞傚疄鐜�
- // 褰撳墠浣跨敤 PineconeEmbeddingStore锛屽彲鑳介渶瑕佽皟鐢� Pinecone 瀹㈡埛绔洿鎺ュ垹闄�
+        try {
+            // A Pinecone metadata filter has to use the $eq operator, and the value type
+            // has to match what was stored (Long -> Number), so the id is sent as a number.
+            // The filter {"storageBlobId": {"$eq": <id>}} is assembled from the inside out.
+            com.google.protobuf.Value blobIdNumber = com.google.protobuf.Value.newBuilder()
+                    .setNumberValue(storageBlobId.doubleValue())
+                    .build();
+            com.google.protobuf.Struct.Builder equalsClause = com.google.protobuf.Struct.newBuilder()
+                    .putFields("$eq", blobIdNumber);
+            com.google.protobuf.Value condition = com.google.protobuf.Value.newBuilder()
+                    .setStructValue(equalsClause)
+                    .build();
+            com.google.protobuf.Struct filter = com.google.protobuf.Struct.newBuilder()
+                    .putFields("storageBlobId", condition)
+                    .build();
+
+            // No explicit ids and deleteAll=false: the filter alone selects what is removed.
+            List<String> noIds = new ArrayList<>();
+            pineconeIndex.delete(noIds, false, this.namespace, filter);
+            log.info("鍚戦噺鍒犻櫎瀹屾垚: storageBlobId={}", storageBlobId);
+        } catch (Exception e) {
+            log.error("鍒犻櫎鍚戦噺鏁版嵁澶辫触: namespace={}, storageBlobId={}", namespace, storageBlobId, e);
+        }
}
private File getFile(StorageBlob blob) {
@@ -148,12 +210,12 @@
/**
* 鎻愬彇鏂囦欢鍐呭
*/
- private String extractFileContent(File file, String fileName, String contentType) throws Exception {
+ private String extractFileContent(File file, String fileName) throws Exception {
String ext = getFileExtension(fileName);
// 鏍规嵁鏂囦欢绫诲瀷鎻愬彇鍐呭
if (isPlainText(ext)) {
- return Files.readString(file.toPath());
+ return readFileWithEncoding(file);
}
if ("docx".equals(ext)) {
@@ -169,7 +231,54 @@
}
// 榛樿灏濊瘯璇诲彇鏂囨湰
- return Files.readString(file.toPath());
+ return readFileWithEncoding(file);
+ }
+
+    /**
+     * Reads the whole file and decodes it, trying UTF-8 first and falling back to GBK.
+     * <p>
+     * The bytes are decoded as UTF-8 and that result is used when
+     * {@link #isValidUtf8(String)} accepts it; a leading byte-order mark (U+FEFF) is
+     * dropped so it does not become part of the returned text. Otherwise the bytes are
+     * decoded as GBK. If the GBK charset cannot be obtained from this runtime, the UTF-8
+     * decoding is returned as a last resort.
+     *
+     * @param file file to read
+     * @return the decoded text content
+     * @throws Exception if the file cannot be read
+     */
+    private String readFileWithEncoding(File file) throws Exception {
+        byte[] bytes = Files.readAllBytes(file.toPath());
+
+        // Try UTF-8 first.
+        String utf8Content = new String(bytes, StandardCharsets.UTF_8);
+        if (isValidUtf8(utf8Content)) {
+            log.debug("鏂囦欢缂栫爜: UTF-8");
+            // A UTF-8 byte-order mark decodes to a single U+FEFF character; strip it so the
+            // marker is not carried into the extracted text.
+            return utf8Content.startsWith("\uFEFF") ? utf8Content.substring(1) : utf8Content;
+        }
+
+        // Fall back to GBK.
+        try {
+            Charset gbk = Charset.forName("GBK");
+            String gbkContent = new String(bytes, gbk);
+            log.debug("鏂囦欢缂栫爜: GBK");
+            return gbkContent;
+        } catch (IllegalArgumentException e) {
+            // Charset.forName reports an unknown or unsupported charset with an
+            // IllegalArgumentException subtype; decoding itself does not throw.
+            log.warn("缂栫爜妫�娴嬪け璐ワ紝浣跨敤 UTF-8", e);
+            return utf8Content;
+        }
+    }
+
+    /**
+     * Heuristically checks whether text produced by UTF-8 decoding looks correctly decoded.
+     * <p>
+     * The text is rejected when it contains the Unicode replacement character (U+FFFD),
+     * which indicates that UTF-8 decoding failed, or when 5% or more of the first 1000
+     * characters are private-use characters (U+E000 to U+F8FF) or control characters other
+     * than line feed, carriage return and tab. Empty text is accepted.
+     *
+     * @param decoded text obtained by decoding bytes as UTF-8
+     * @return {@code true} if the text looks like valid UTF-8 content
+     */
+    private boolean isValidUtf8(String decoded) {
+        // Nothing to object to in empty text; without this guard the ratio test below
+        // would compare 0 < 0.0 and report empty content as invalid.
+        if (decoded.isEmpty()) {
+            return true;
+        }
+        // A replacement character means UTF-8 decoding failed. The literal is written as
+        // an ASCII escape so it survives any re-encoding of this source file.
+        if (decoded.indexOf('\uFFFD') >= 0) {
+            return false;
+        }
+        // Look for too many non-printable characters (a mojibake signature) in a bounded
+        // sample of the text.
+        int sampleLength = Math.min(decoded.length(), 1000);
+        int invalidCount = 0;
+        for (int i = 0; i < sampleLength; i++) {
+            char c = decoded.charAt(i);
+            boolean privateUse = c >= '\uE000' && c <= '\uF8FF';
+            boolean strayControl = c < ' ' && c != '\n' && c != '\r' && c != '\t';
+            if (privateUse || strayControl) {
+                invalidCount++;
+            }
+        }
+        // 5% or more suspicious characters is treated as an encoding error.
+        return invalidCount < sampleLength * 0.05;
}
private String getFileExtension(String fileName) {
--
Gitblit v1.9.3