10 天以前 eb832a81fb17402b1dded01459a01d7d99f16262
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
package com.ruoyi.ai.service.impl;
 
import com.ruoyi.ai.service.KnowledgeRagService;
import com.ruoyi.approve.pojo.KnowledgeBaseVector;
import com.ruoyi.approve.service.KnowledgeBaseVectorService;
import com.ruoyi.basic.pojo.StorageBlob;
import com.ruoyi.basic.service.StorageBlobService;
import com.ruoyi.common.config.FileProperties;
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
import dev.langchain4j.store.embedding.EmbeddingSearchResult;
import dev.langchain4j.store.embedding.EmbeddingStore;
import io.pinecone.clients.Index;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
 
import java.io.File;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
 
/**
 * 知识库RAG服务实现
 */
@Slf4j
@Service
public class KnowledgeRagServiceImpl implements KnowledgeRagService {
 
    private final KnowledgeBaseVectorService knowledgeBaseVectorService;
    private final StorageBlobService storageBlobService;
    private final EmbeddingModel embeddingModel;
    private final EmbeddingStore<TextSegment> embeddingStore;
    private final FileProperties fileProperties;
    private final Index pineconeIndex;
 
    @Value("${pinecone.namespace:knowledge-base}")
    private String namespace;
 
    public KnowledgeRagServiceImpl(
            KnowledgeBaseVectorService knowledgeBaseVectorService,
            StorageBlobService storageBlobService,
            EmbeddingModel embeddingModel,
            EmbeddingStore<TextSegment> embeddingStore,
            FileProperties fileProperties,
            Index pineconeIndex) {
        this.knowledgeBaseVectorService = knowledgeBaseVectorService;
        this.storageBlobService = storageBlobService;
        this.embeddingModel = embeddingModel;
        this.embeddingStore = embeddingStore;
        this.fileProperties = fileProperties;
        this.pineconeIndex = pineconeIndex;
    }
 
    private static final int CHUNK_SIZE = 500;
    private static final int CHUNK_OVERLAP = 100;
    /**
     * 文件大小阈值,超过此值才进行切片
     * 80MB = 80 * 1024 * 1024 字节
     */
    private static final long CHUNK_THRESHOLD_BYTES = 80L * 1024 * 1024;
    /**
     * Embedding 模型最大输入长度限制
     * 阿里云 DashScope 限制为 8192 字符
     */
    private static final int EMBEDDING_MAX_LENGTH = 8000;
 
    @Override
    @Async("threadPoolTaskExecutor")
    public void processVectorAsync(Long vectorId) {
        log.info("开始异步向量化处理: vectorId={}, thread={}", vectorId, Thread.currentThread().getName());
        processVector(vectorId);
    }
 
    @Override
    public void processVector(Long vectorId) {
        log.info("开始处理向量化: vectorId={}", vectorId);
        KnowledgeBaseVector vector = knowledgeBaseVectorService.getById(vectorId);
        if (vector == null) {
            log.error("向量记录不存在: {}", vectorId);
            return;
        }
 
        try {
            // 更新状态为处理中
            log.info("更新状态为处理中: vectorId={}", vectorId);
            knowledgeBaseVectorService.updateVectorStatus(vectorId,
                    KnowledgeBaseVector.STATUS_PROCESSING, null, null);
 
            // 获取文件内容
            log.info("获取文件信息: storageBlobId={}", vector.getStorageBlobId());
            StorageBlob blob = storageBlobService.getById(vector.getStorageBlobId());
            if (blob == null) {
                throw new RuntimeException("文件不存在: " + vector.getStorageBlobId());
            }
 
            File file = getFile(blob);
            log.info("文件路径: {}, 是否存在: {}", file.getAbsolutePath(), file.exists());
            long fileSize = file.length();
            log.info("文件大小: {} 字节", fileSize);
 
            // 直接读取文件内容,不使用 MultipartFile 包装
            log.info("提取文件内容: fileName={}", vector.getFileName());
            String content = extractFileContent(file, vector.getFileName());
            log.info("文件内容长度: {}", content != null ? content.length() : 0);
 
            if (content == null || content.trim().isEmpty()) {
                throw new RuntimeException("文件内容为空");
            }
 
            // 文本切片
            List<TextSegment> chunks;
            boolean needChunk = fileSize > CHUNK_THRESHOLD_BYTES || content.length() > EMBEDDING_MAX_LENGTH;
            if (needChunk) {
                log.info("开始切片: fileSize={}, contentLength={}", fileSize, content.length());
                chunks = splitText(content, vector);
                log.info("切片完成,共 {} 个块", chunks.size());
            } else {
                log.info("文件较小且内容长度{}不超过{},不进行切片", content.length(), EMBEDDING_MAX_LENGTH);
                Map<String, Object> metadata = buildMetadata(vector);
                chunks = List.of(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata)));
            }
 
            // 批量生成嵌入向量并存储
            int chunkCount = 0;
            for (TextSegment chunk : chunks) {
                log.debug("处理第 {} 个块", chunkCount + 1);
                Embedding embedding = embeddingModel.embed(chunk).content();
                embeddingStore.add(embedding, chunk);
                chunkCount++;
            }
 
            // 更新状态为完成
            knowledgeBaseVectorService.updateVectorStatus(vectorId,
                    KnowledgeBaseVector.STATUS_COMPLETED, chunkCount, null);
 
            log.info("向量化处理完成: vectorId={}, chunkCount={}", vectorId, chunkCount);
 
        } catch (Exception e) {
            log.error("向量化处理失败: vectorId={}", vectorId, e);
            knowledgeBaseVectorService.updateVectorStatus(vectorId,
                    KnowledgeBaseVector.STATUS_FAILED, null, e.getMessage());
        }
    }
 
    @Override
    public List<String> searchRelevantContent(String namespace, String query, int maxResults) {
        try {
            // 生成查询向量
            Embedding queryEmbedding = embeddingModel.embed(query).content();
 
            // 构建搜索请求,使用元数据过滤
            EmbeddingSearchRequest searchRequest = EmbeddingSearchRequest.builder()
                    .queryEmbedding(queryEmbedding)
                    .maxResults(maxResults)
                    .minScore(0.7)
                    .build();
 
            EmbeddingSearchResult<TextSegment> searchResult = embeddingStore.search(searchRequest);
 
            return searchResult.matches().stream()
                    .map(match -> match.embedded().text())
                    .collect(Collectors.toList());
 
        } catch (Exception e) {
            log.error("向量检索失败: namespace={}", namespace, e);
            return new ArrayList<>();
        }
    }
 
    @Override
    public void deleteEmbeddings(String namespace, Long storageBlobId) {
        log.info("删除向量数据: namespace={}, storageBlobId={}", namespace, storageBlobId);
        try {
            // Pinecone metadata filter 需要使用 $eq 操作符
            // 且值的类型需要与存储时一致(Long -> Number)
            com.google.protobuf.Struct filter = com.google.protobuf.Struct.newBuilder()
                    .putFields("storageBlobId", com.google.protobuf.Value.newBuilder()
                            .setStructValue(com.google.protobuf.Struct.newBuilder()
                                    .putFields("$eq", com.google.protobuf.Value.newBuilder()
                                            .setNumberValue(storageBlobId.doubleValue())
                                            .build()))
                            .build())
                    .build();
 
            List<String> emptyIds = new ArrayList<>();
            pineconeIndex.delete(emptyIds, false, this.namespace, filter);
            log.info("向量删除完成: storageBlobId={}", storageBlobId);
        } catch (Exception e) {
            log.error("删除向量数据失败: namespace={}, storageBlobId={}", namespace, storageBlobId, e);
        }
    }
 
    private File getFile(StorageBlob blob) {
        String path = blob.getPath();
        if (path != null && !path.isEmpty()) {
            return new File(new File(fileProperties.getPath(), path), blob.getUidFilename());
        }
        return new File(fileProperties.getPath(), blob.getUidFilename());
    }
 
    /**
     * 提取文件内容
     */
    private String extractFileContent(File file, String fileName) throws Exception {
        String ext = getFileExtension(fileName);
 
        // 根据文件类型提取内容
        if (isPlainText(ext)) {
            return readFileWithEncoding(file);
        }
 
        if ("docx".equals(ext)) {
            return extractDocx(file);
        }
 
        if ("xlsx".equals(ext)) {
            return extractXlsx(file);
        }
 
        if ("xls".equals(ext)) {
            return extractXls(file);
        }
 
        // 默认尝试读取文本
        return readFileWithEncoding(file);
    }
 
    /**
     * 自动检测文件编码并读取内容
     * 优先尝试 UTF-8,失败则尝试 GBK
     */
    private String readFileWithEncoding(File file) throws Exception {
        byte[] bytes = Files.readAllBytes(file.toPath());
 
        // 先尝试 UTF-8
        String utf8Content = new String(bytes, StandardCharsets.UTF_8);
        if (isValidUtf8(utf8Content)) {
            log.debug("文件编码: UTF-8");
            return utf8Content;
        }
 
        // 尝试 GBK
        try {
            Charset gbk = Charset.forName("GBK");
            String gbkContent = new String(bytes, gbk);
            log.debug("文件编码: GBK");
            return gbkContent;
        } catch (Exception e) {
            log.warn("编码检测失败,使用 UTF-8");
            return utf8Content;
        }
    }
 
    /**
     * 检查 UTF-8 解码是否有效
     */
    private boolean isValidUtf8(String decoded) {
        // 检查是否包含替换字符(说明 UTF-8 解码失败)
        if (decoded.contains("�")) {
            return false;
        }
        // 检查是否有过多的非打印字符(乱码特征)
        int invalidCount = 0;
        for (int i = 0; i < Math.min(decoded.length(), 1000); i++) {
            char c = decoded.charAt(i);
            // 检查私有使用区域或异常的控制字符
            if ((c >= '' && c <= '') || (c < ' ' && c != '\n' && c != '\r' && c != '\t')) {
                invalidCount++;
            }
        }
        // 如果无效字符超过 5%,认为是编码错误
        return invalidCount < Math.min(decoded.length(), 1000) * 0.05;
    }
 
    private String getFileExtension(String fileName) {
        if (fileName == null || !fileName.contains(".")) {
            return "";
        }
        return fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase();
    }
 
    private boolean isPlainText(String ext) {
        return "txt".equals(ext) || "md".equals(ext) || "json".equals(ext)
                || "csv".equals(ext) || "xml".equals(ext) || "yaml".equals(ext)
                || "yml".equals(ext);
    }
 
    private String extractDocx(File file) throws Exception {
        try (var doc = new org.apache.poi.xwpf.usermodel.XWPFDocument(new java.io.FileInputStream(file));
             var extractor = new org.apache.poi.xwpf.extractor.XWPFWordExtractor(doc)) {
            return extractor.getText();
        }
    }
 
    private String extractXlsx(File file) throws Exception {
        try (var workbook = new org.apache.poi.xssf.usermodel.XSSFWorkbook(file)) {
            return extractWorkbook(workbook);
        }
    }
 
    private String extractXls(File file) throws Exception {
        try (var workbook = new org.apache.poi.hssf.usermodel.HSSFWorkbook(new java.io.FileInputStream(file))) {
            return extractWorkbook(workbook);
        }
    }
 
    private String extractWorkbook(org.apache.poi.ss.usermodel.Workbook workbook) {
        StringBuilder text = new StringBuilder();
        var formatter = new org.apache.poi.ss.usermodel.DataFormatter();
        for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
            var sheet = workbook.getSheetAt(i);
            text.append("Sheet: ").append(sheet.getSheetName()).append("\n");
            for (var row : sheet) {
                for (var cell : row) {
                    text.append(formatter.formatCellValue(cell)).append("\t");
                }
                text.append("\n");
            }
        }
        return text.toString();
    }
 
    /**
     * 文本切片
     */
    private List<TextSegment> splitText(String content, KnowledgeBaseVector vector) {
        List<TextSegment> chunks = new ArrayList<>();
 
        if (content.length() <= CHUNK_SIZE) {
            Map<String, Object> metadata = buildMetadata(vector);
            chunks.add(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata)));
            return chunks;
        }
 
        int start = 0;
        int chunkIndex = 0;
        while (start < content.length()) {
            int end = Math.min(start + CHUNK_SIZE, content.length());
 
            // 尝试在句子边界切分
            if (end < content.length()) {
                int lastPeriod = content.lastIndexOf('。', end);
                int lastNewline = content.lastIndexOf('\n', end);
                int boundary = Math.max(lastPeriod, lastNewline);
                if (boundary > start + CHUNK_SIZE / 2) {
                    end = boundary + 1;
                }
            }
 
            String chunkText = content.substring(start, end).trim();
            if (!chunkText.isEmpty()) {
                Map<String, Object> metadata = buildMetadata(vector);
                metadata.put("chunkIndex", chunkIndex);
                chunks.add(TextSegment.from(chunkText, new dev.langchain4j.data.document.Metadata(metadata)));
                chunkIndex++;
            }
 
            start = end - CHUNK_OVERLAP;
            if (start < 0) start = 0;
            if (start >= content.length() - CHUNK_OVERLAP) break;
        }
 
        return chunks;
    }
 
    private Map<String, Object> buildMetadata(KnowledgeBaseVector vector) {
        Map<String, Object> metadata = new HashMap<>();
        metadata.put("knowledgeBaseId", vector.getKnowledgeBaseId());
        metadata.put("storageBlobId", vector.getStorageBlobId());
        metadata.put("fileName", vector.getFileName());
        metadata.put("namespace", vector.getNamespace());
        return metadata;
    }
}