10 天以前 eb832a81fb17402b1dded01459a01d7d99f16262
refactor(ai): 重构 Pinecone 向量存储配置和服务实现

- 将硬编码的 Pinecone 配置替换为通过 @Value 注解读取配置文件
- 新增 Pinecone 和 Index 的 Bean 配置,支持依赖注入
- 修改 KnowledgeRagServiceImpl 构造函数注入方式,移除 @RequiredArgsConstructor 注解
- 实现基于 metadata 的向量数据删除功能,使用 protobuf 结构化过滤器
- 统一向量存储的命名空间管理,支持配置化设置
已修改5个文件
178 ■■■■ 文件已修改
src/main/java/com/ruoyi/ai/config/EmbeddingStoreConfig.java 39 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java 131 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
src/main/java/com/ruoyi/approve/mapper/KnowledgeBaseVectorMapper.java 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
src/main/java/com/ruoyi/collaborativeApproval/service/impl/MeetingServiceImpl.java 4 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
src/main/java/com/ruoyi/collaborativeApproval/vo/SearchMeetingUseVo.java 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
src/main/java/com/ruoyi/ai/config/EmbeddingStoreConfig.java
@@ -5,7 +5,9 @@
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.pinecone.PineconeEmbeddingStore;
import dev.langchain4j.store.embedding.pinecone.PineconeServerlessIndexConfig;
import org.springframework.beans.factory.annotation.Autowired;
import io.pinecone.clients.Index;
import io.pinecone.clients.Pinecone;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
@@ -16,20 +18,35 @@
@Configuration
public class EmbeddingStoreConfig {
    @Autowired
    private EmbeddingModel embeddingModel;
    @Value("${pinecone.api-key}")
    private String pineconeApiKey;
    @Value("${pinecone.index}")
    private String indexName;
    @Value("${pinecone.namespace}")
    private String namespace;
    @Bean
    public EmbeddingStore<TextSegment> embeddingStore() {
        //创建向量存储
    public Pinecone pinecone() {
        return new Pinecone.Builder(pineconeApiKey).build();
    }
    @Bean
    public Index pineconeIndex(Pinecone pinecone) {
        return pinecone.getIndexConnection(indexName);
    }
    @Bean
    public EmbeddingStore<TextSegment> embeddingStore(EmbeddingModel embeddingModel) {
        return PineconeEmbeddingStore.builder()
                .apiKey("pcsk_4SJLnh_tNB3wSLJU8tc4E5P28PcXX8eCLdURqZpVhg1FMV8CRYxjneWdzqRdB5Ftqooi9")
                .index("xiaozhi-index")//如果指定的索引不存在,将创建一个新的索引
                .nameSpace("xiaozhi-namespace") //如果指定的名称空间不存在,将创建一个新的名称 空间
                .apiKey(pineconeApiKey)
                .index(indexName)
                .nameSpace(namespace)
                .createIndex(PineconeServerlessIndexConfig.builder()
                        .cloud("AWS") //指定索引部署在 AWS 云服务上。
                        .region("us-east-1") //指定索引所在的 AWS 区域为 us-east-1。
                        .dimension(embeddingModel.dimension()) //指定索引的向量维度,该维度与 embeddedModel 生成的向量维度相同。
                        .cloud("AWS")
                        .region("us-east-1")
                        .dimension(embeddingModel.dimension())
                        .build())
                .build();
    }
src/main/java/com/ruoyi/ai/service/impl/KnowledgeRagServiceImpl.java
@@ -1,6 +1,5 @@
package com.ruoyi.ai.service.impl;
import com.ruoyi.ai.service.AiFileTextExtractor;
import com.ruoyi.ai.service.KnowledgeRagService;
import com.ruoyi.approve.pojo.KnowledgeBaseVector;
import com.ruoyi.approve.service.KnowledgeBaseVectorService;
@@ -10,16 +9,18 @@
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.store.embedding.EmbeddingMatch;
import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
import dev.langchain4j.store.embedding.EmbeddingSearchResult;
import dev.langchain4j.store.embedding.EmbeddingStore;
import lombok.RequiredArgsConstructor;
import io.pinecone.clients.Index;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import java.io.File;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
@@ -32,18 +33,45 @@
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class KnowledgeRagServiceImpl implements KnowledgeRagService {
    private final KnowledgeBaseVectorService knowledgeBaseVectorService;
    private final StorageBlobService storageBlobService;
    private final AiFileTextExtractor aiFileTextExtractor;
    private final EmbeddingModel embeddingModel;
    private final EmbeddingStore<TextSegment> embeddingStore;
    private final FileProperties fileProperties;
    private final Index pineconeIndex;
    @Value("${pinecone.namespace:knowledge-base}")
    private String namespace;
    public KnowledgeRagServiceImpl(
            KnowledgeBaseVectorService knowledgeBaseVectorService,
            StorageBlobService storageBlobService,
            EmbeddingModel embeddingModel,
            EmbeddingStore<TextSegment> embeddingStore,
            FileProperties fileProperties,
            Index pineconeIndex) {
        this.knowledgeBaseVectorService = knowledgeBaseVectorService;
        this.storageBlobService = storageBlobService;
        this.embeddingModel = embeddingModel;
        this.embeddingStore = embeddingStore;
        this.fileProperties = fileProperties;
        this.pineconeIndex = pineconeIndex;
    }
    // NOTE(review): presumably the chunk length and the overlap between neighbouring
    // chunks used by the text splitter; the splitter itself is not visible here, so
    // the unit (characters vs. tokens) should be confirmed.
    private static final int CHUNK_SIZE = 500;
    private static final int CHUNK_OVERLAP = 100;
    /**
     * File-size threshold in bytes (80 MB = 80 * 1024 * 1024).
     * A file larger than this is split into chunks; a smaller file is still split
     * when its extracted text is longer than {@code EMBEDDING_MAX_LENGTH}.
     */
    private static final long CHUNK_THRESHOLD_BYTES = 80L * 1024 * 1024;
    /**
     * Maximum text length sent to the embedding model in a single segment.
     * The original note gives the Alibaba Cloud DashScope limit as 8192 characters;
     * the value here is 8000 (presumably a safety margin; confirm).
     */
    private static final int EMBEDDING_MAX_LENGTH = 8000;
    @Override
    @Async("threadPoolTaskExecutor")
@@ -76,10 +104,12 @@
            File file = getFile(blob);
            log.info("文件路径: {}, 是否存在: {}", file.getAbsolutePath(), file.exists());
            long fileSize = file.length();
            log.info("文件大小: {} 字节", fileSize);
            // 直接读取文件内容,不使用 MultipartFile 包装
            log.info("提取文件内容: fileName={}", vector.getFileName());
            String content = extractFileContent(file, vector.getFileName(), blob.getContentType());
            String content = extractFileContent(file, vector.getFileName());
            log.info("文件内容长度: {}", content != null ? content.length() : 0);
            if (content == null || content.trim().isEmpty()) {
@@ -87,9 +117,17 @@
            }
            // 文本切片
            log.info("开始文本切片");
            List<TextSegment> chunks = splitText(content, vector);
            log.info("切片完成,共 {} 个块", chunks.size());
            List<TextSegment> chunks;
            boolean needChunk = fileSize > CHUNK_THRESHOLD_BYTES || content.length() > EMBEDDING_MAX_LENGTH;
            if (needChunk) {
                log.info("开始切片: fileSize={}, contentLength={}", fileSize, content.length());
                chunks = splitText(content, vector);
                log.info("切片完成,共 {} 个块", chunks.size());
            } else {
                log.info("文件较小且内容长度{}不超过{},不进行切片", content.length(), EMBEDDING_MAX_LENGTH);
                Map<String, Object> metadata = buildMetadata(vector);
                chunks = List.of(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata)));
            }
            // 批量生成嵌入向量并存储
            int chunkCount = 0;
@@ -140,11 +178,25 @@
    @Override
    public void deleteEmbeddings(String namespace, Long storageBlobId) {
        // Pinecone 按命名空间删除需要特定实现
        // 当前实现:通过 metadata 过滤删除
        log.info("删除向量数据: namespace={}, storageBlobId={}", namespace, storageBlobId);
        // 注意:Pinecone 的删除操作需要在 EmbeddingStore 层实现
        // 当前使用 PineconeEmbeddingStore,可能需要调用 Pinecone 客户端直接删除
        try {
            // Pinecone metadata filter 需要使用 $eq 操作符
            // 且值的类型需要与存储时一致(Long -> Number)
            com.google.protobuf.Struct filter = com.google.protobuf.Struct.newBuilder()
                    .putFields("storageBlobId", com.google.protobuf.Value.newBuilder()
                            .setStructValue(com.google.protobuf.Struct.newBuilder()
                                    .putFields("$eq", com.google.protobuf.Value.newBuilder()
                                            .setNumberValue(storageBlobId.doubleValue())
                                            .build()))
                            .build())
                    .build();
            List<String> emptyIds = new ArrayList<>();
            pineconeIndex.delete(emptyIds, false, this.namespace, filter);
            log.info("向量删除完成: storageBlobId={}", storageBlobId);
        } catch (Exception e) {
            log.error("删除向量数据失败: namespace={}, storageBlobId={}", namespace, storageBlobId, e);
        }
    }
    private File getFile(StorageBlob blob) {
@@ -158,12 +210,12 @@
    /**
     * 提取文件内容
     */
    private String extractFileContent(File file, String fileName, String contentType) throws Exception {
    private String extractFileContent(File file, String fileName) throws Exception {
        String ext = getFileExtension(fileName);
        // 根据文件类型提取内容
        if (isPlainText(ext)) {
            return Files.readString(file.toPath());
            return readFileWithEncoding(file);
        }
        if ("docx".equals(ext)) {
@@ -179,7 +231,54 @@
        }
        // 默认尝试读取文本
        return Files.readString(file.toPath());
        return readFileWithEncoding(file);
    }
    /**
     * 自动检测文件编码并读取内容
     * 优先尝试 UTF-8,失败则尝试 GBK
     */
    private String readFileWithEncoding(File file) throws Exception {
        byte[] bytes = Files.readAllBytes(file.toPath());
        // 先尝试 UTF-8
        String utf8Content = new String(bytes, StandardCharsets.UTF_8);
        if (isValidUtf8(utf8Content)) {
            log.debug("文件编码: UTF-8");
            return utf8Content;
        }
        // 尝试 GBK
        try {
            Charset gbk = Charset.forName("GBK");
            String gbkContent = new String(bytes, gbk);
            log.debug("文件编码: GBK");
            return gbkContent;
        } catch (Exception e) {
            log.warn("编码检测失败,使用 UTF-8");
            return utf8Content;
        }
    }
    /**
     * 检查 UTF-8 解码是否有效
     */
    private boolean isValidUtf8(String decoded) {
        // 检查是否包含替换字符(说明 UTF-8 解码失败)
        if (decoded.contains("�")) {
            return false;
        }
        // 检查是否有过多的非打印字符(乱码特征)
        int invalidCount = 0;
        for (int i = 0; i < Math.min(decoded.length(), 1000); i++) {
            char c = decoded.charAt(i);
            // 检查私有使用区域或异常的控制字符
            if ((c >= '' && c <= '') || (c < ' ' && c != '\n' && c != '\r' && c != '\t')) {
                invalidCount++;
            }
        }
        // 如果无效字符超过 5%,认为是编码错误
        return invalidCount < Math.min(decoded.length(), 1000) * 0.05;
    }
    private String getFileExtension(String fileName) {
src/main/java/com/ruoyi/approve/mapper/KnowledgeBaseVectorMapper.java
@@ -34,6 +34,6 @@
    /**
     * 统计知识库的总切片数量
     */
    @Select("SELECT SUM(chunk_count) FROM knowledge_base_vector WHERE knowledge_base_id = #{knowledgeBaseId} AND vector_status = 2")
    @Select("SELECT COALESCE(SUM(chunk_count), 0) FROM knowledge_base_vector WHERE knowledge_base_id = #{knowledgeBaseId} AND vector_status = 2")
    int sumChunkCountByKnowledgeBaseId(@Param("knowledgeBaseId") Long knowledgeBaseId);
}
src/main/java/com/ruoyi/collaborativeApproval/service/impl/MeetingServiceImpl.java
@@ -238,9 +238,9 @@
                    .or()
                    .eq(MeetApplication::getApplicationType, "notification");
        });
        if (Objects.nonNull(vo.getMeetingDate())) {
        if (Objects.nonNull(vo.getDate())) {
            alWrapper.and(wrapper -> {
                wrapper.eq(MeetApplication::getMeetingDate, vo.getMeetingDate());
                wrapper.like(MeetApplication::getMeetingDate, vo.getDate());
            });
        }
        alWrapper.orderByAsc(MeetApplication::getStartTime);
src/main/java/com/ruoyi/collaborativeApproval/vo/SearchMeetingUseVo.java
@@ -16,4 +16,6 @@
    // Meeting date; both annotations below declare the yyyy-MM-dd pattern.
    @JsonFormat(pattern = "yyyy-MM-dd")
    @DateTimeFormat(pattern = "yyyy-MM-dd")
    private Date meetingDate;
    // Date filter kept as plain text.
    // NOTE(review): presumably the value matched with LIKE against the meeting date
    // in the meeting query; confirm the expected format with the caller.
    private String date;
}