昨天 8097bee3cbb59dba8414bbe01addfa9cca18ad74
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
package com.ruoyi.ai.service.impl;
 
import com.ruoyi.ai.service.KnowledgeRagService;
import com.ruoyi.approve.pojo.KnowledgeBaseVector;
import com.ruoyi.approve.service.KnowledgeBaseVectorService;
import com.ruoyi.basic.pojo.StorageBlob;
import com.ruoyi.basic.service.StorageBlobService;
import com.ruoyi.common.config.FileProperties;
import com.google.protobuf.Struct;
import com.google.protobuf.Value;
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
import dev.langchain4j.store.embedding.EmbeddingSearchResult;
import dev.langchain4j.store.embedding.EmbeddingStore;
import io.pinecone.clients.Index;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
 
import java.io.File;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
 
/**
 * 知识库RAG服务实现
 */
@Slf4j
@Service
public class KnowledgeRagServiceImpl implements KnowledgeRagService {
 
    private final KnowledgeBaseVectorService knowledgeBaseVectorService;
    private final StorageBlobService storageBlobService;
    private final EmbeddingModel embeddingModel;
    private final EmbeddingStore<TextSegment> embeddingStore;
    private final FileProperties fileProperties;
    private final Index pineconeIndex;
 
    @org.springframework.beans.factory.annotation.Value("${pinecone.namespace:knowledge-base}")
    private String namespace;
 
    public KnowledgeRagServiceImpl(
            KnowledgeBaseVectorService knowledgeBaseVectorService,
            StorageBlobService storageBlobService,
            EmbeddingModel embeddingModel,
            EmbeddingStore<TextSegment> embeddingStore,
            FileProperties fileProperties,
            Index pineconeIndex) {
        this.knowledgeBaseVectorService = knowledgeBaseVectorService;
        this.storageBlobService = storageBlobService;
        this.embeddingModel = embeddingModel;
        this.embeddingStore = embeddingStore;
        this.fileProperties = fileProperties;
        this.pineconeIndex = pineconeIndex;
    }
 
    private static final int CHUNK_SIZE = 500;
    private static final int CHUNK_OVERLAP = 100;
    private static final long CHUNK_THRESHOLD_BYTES = 80L * 1024 * 1024;
    private static final int EMBEDDING_MAX_LENGTH = 8000;
 
    @Override
    @Async("threadPoolTaskExecutor")
    public void processVectorAsync(Long vectorId) {
        log.info("开始异步向量化处理: vectorId={}, thread={}", vectorId, Thread.currentThread().getName());
        processVector(vectorId);
    }
 
    @Override
    public void processVector(Long vectorId) {
        log.info("开始处理向量化: vectorId={}", vectorId);
        KnowledgeBaseVector vector = knowledgeBaseVectorService.getById(vectorId);
        if (vector == null) {
            log.error("向量记录不存在: {}", vectorId);
            return;
        }
 
        try {
            knowledgeBaseVectorService.updateVectorStatus(vectorId,
                    KnowledgeBaseVector.STATUS_PROCESSING, null, null);
 
            StorageBlob blob = storageBlobService.getById(vector.getStorageBlobId());
            if (blob == null) {
                throw new RuntimeException("文件不存在: " + vector.getStorageBlobId());
            }
 
            File file = getFile(blob);
            log.info("文件路径: {}, 是否存在: {}", file.getAbsolutePath(), file.exists());
            long fileSize = file.length();
 
            String content = extractFileContent(file, vector.getFileName());
            log.info("文件内容长度: {}", content != null ? content.length() : 0);
 
            if (content == null || content.trim().isEmpty()) {
                throw new RuntimeException("文件内容为空");
            }
 
            List<TextSegment> chunks;
            boolean needChunk = fileSize > CHUNK_THRESHOLD_BYTES || content.length() > EMBEDDING_MAX_LENGTH;
            if (needChunk) {
                log.info("开始切片: fileSize={}, contentLength={}", fileSize, content.length());
                chunks = splitText(content, vector);
                log.info("切片完成,共 {} 个块", chunks.size());
            } else {
                log.info("文件较小,不进行切片");
                Map<String, Object> metadata = buildMetadata(vector);
                chunks = List.of(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata)));
            }
 
            int chunkCount = 0;
            for (TextSegment chunk : chunks) {
                Embedding embedding = embeddingModel.embed(chunk).content();
                embeddingStore.add(embedding, chunk);
                chunkCount++;
            }
 
            knowledgeBaseVectorService.updateVectorStatus(vectorId,
                    KnowledgeBaseVector.STATUS_COMPLETED, chunkCount, null);
 
            log.info("向量化处理完成: vectorId={}, chunkCount={}", vectorId, chunkCount);
 
        } catch (Exception e) {
            log.error("向量化处理失败: vectorId={}", vectorId, e);
            knowledgeBaseVectorService.updateVectorStatus(vectorId,
                    KnowledgeBaseVector.STATUS_FAILED, null, e.getMessage());
        }
    }
 
    @Override
    public List<String> searchRelevantContent(String namespace, String query, int maxResults) {
        try {
            Embedding queryEmbedding = embeddingModel.embed(query).content();
 
            EmbeddingSearchRequest searchRequest = EmbeddingSearchRequest.builder()
                    .queryEmbedding(queryEmbedding)
                    .maxResults(maxResults)
                    .minScore(0.7)
                    .build();
 
            EmbeddingSearchResult<TextSegment> searchResult = embeddingStore.search(searchRequest);
 
            return searchResult.matches().stream()
                    .map(match -> match.embedded().text())
                    .collect(Collectors.toList());
 
        } catch (Exception e) {
            log.error("向量检索失败: namespace={}", namespace, e);
            return new ArrayList<>();
        }
    }
 
    @Override
    public void deleteEmbeddings(String namespace, Long storageBlobId) {
        log.info("删除向量数据: namespace={}, storageBlobId={}", namespace, storageBlobId);
        try {
            Struct filter = Struct.newBuilder()
                    .putFields("storageBlobId", Value.newBuilder()
                            .setStructValue(Struct.newBuilder()
                                    .putFields("$eq", Value.newBuilder()
                                            .setNumberValue(storageBlobId.doubleValue())
                                            .build()))
                            .build())
                    .build();
 
            List<String> emptyIds = new ArrayList<>();
            pineconeIndex.delete(emptyIds, false, this.namespace, filter);
            log.info("向量删除完成: storageBlobId={}", storageBlobId);
        } catch (Exception e) {
            log.error("删除向量数据失败: namespace={}, storageBlobId={}", namespace, storageBlobId, e);
        }
    }
 
    private File getFile(StorageBlob blob) {
        String path = blob.getPath();
        if (path != null && !path.isEmpty()) {
            return new File(new File(fileProperties.getPath(), path), blob.getUidFilename());
        }
        return new File(fileProperties.getPath(), blob.getUidFilename());
    }
 
    private String extractFileContent(File file, String fileName) throws Exception {
        String ext = getFileExtension(fileName);
 
        if (isPlainText(ext)) {
            return readFileWithEncoding(file);
        }
 
        if ("docx".equals(ext)) {
            return extractDocx(file);
        }
 
        if ("xlsx".equals(ext)) {
            return extractXlsx(file);
        }
 
        if ("xls".equals(ext)) {
            return extractXls(file);
        }
 
        return readFileWithEncoding(file);
    }
 
    private String readFileWithEncoding(File file) throws Exception {
        byte[] bytes = Files.readAllBytes(file.toPath());
 
        String utf8Content = new String(bytes, StandardCharsets.UTF_8);
        if (isValidUtf8(utf8Content)) {
            log.debug("文件编码: UTF-8");
            return utf8Content;
        }
 
        try {
            Charset gbk = Charset.forName("GBK");
            String gbkContent = new String(bytes, gbk);
            log.debug("文件编码: GBK");
            return gbkContent;
        } catch (Exception e) {
            log.warn("编码检测失败,使用 UTF-8");
            return utf8Content;
        }
    }
 
    private boolean isValidUtf8(String decoded) {
        // 检查替换字符 U+FFFD (UTF-8 解码失败时出现)
        if (decoded.contains("�")) {
            return false;
        }
        int invalidCount = 0;
        int checkLen = Math.min(decoded.length(), 1000);
        for (int i = 0; i < checkLen; i++) {
            char c = decoded.charAt(i);
            // 检查私有使用区域 (U+E000-U+F8FF) 或异常控制字符
            if ((c >= '' && c <= '') || (c < ' ' && c != '\n' && c != '\r' && c != '\t')) {
                invalidCount++;
            }
        }
        return invalidCount < checkLen * 0.05;
    }
 
    private String getFileExtension(String fileName) {
        if (fileName == null || !fileName.contains(".")) {
            return "";
        }
        return fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase();
    }
 
    private boolean isPlainText(String ext) {
        return "txt".equals(ext) || "md".equals(ext) || "json".equals(ext)
                || "csv".equals(ext) || "xml".equals(ext) || "yaml".equals(ext)
                || "yml".equals(ext);
    }
 
    private String extractDocx(File file) throws Exception {
        try (var doc = new org.apache.poi.xwpf.usermodel.XWPFDocument(new java.io.FileInputStream(file));
             var extractor = new org.apache.poi.xwpf.extractor.XWPFWordExtractor(doc)) {
            return extractor.getText();
        }
    }
 
    private String extractXlsx(File file) throws Exception {
        try (var workbook = new org.apache.poi.xssf.usermodel.XSSFWorkbook(file)) {
            return extractWorkbook(workbook);
        }
    }
 
    private String extractXls(File file) throws Exception {
        try (var workbook = new org.apache.poi.hssf.usermodel.HSSFWorkbook(new java.io.FileInputStream(file))) {
            return extractWorkbook(workbook);
        }
    }
 
    private String extractWorkbook(org.apache.poi.ss.usermodel.Workbook workbook) {
        StringBuilder text = new StringBuilder();
        var formatter = new org.apache.poi.ss.usermodel.DataFormatter();
        for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
            var sheet = workbook.getSheetAt(i);
            text.append("Sheet: ").append(sheet.getSheetName()).append("\n");
            for (var row : sheet) {
                for (var cell : row) {
                    text.append(formatter.formatCellValue(cell)).append("\t");
                }
                text.append("\n");
            }
        }
        return text.toString();
    }
 
    private List<TextSegment> splitText(String content, KnowledgeBaseVector vector) {
        List<TextSegment> chunks = new ArrayList<>();
 
        if (content.length() <= CHUNK_SIZE) {
            Map<String, Object> metadata = buildMetadata(vector);
            chunks.add(TextSegment.from(content, new dev.langchain4j.data.document.Metadata(metadata)));
            return chunks;
        }
 
        int start = 0;
        int chunkIndex = 0;
        while (start < content.length()) {
            int end = Math.min(start + CHUNK_SIZE, content.length());
 
            if (end < content.length()) {
                int lastPeriod = content.lastIndexOf('。', end);
                int lastNewline = content.lastIndexOf('\n', end);
                int boundary = Math.max(lastPeriod, lastNewline);
                if (boundary > start + CHUNK_SIZE / 2) {
                    end = boundary + 1;
                }
            }
 
            String chunkText = content.substring(start, end).trim();
            if (!chunkText.isEmpty()) {
                Map<String, Object> metadata = buildMetadata(vector);
                metadata.put("chunkIndex", chunkIndex);
                chunks.add(TextSegment.from(chunkText, new dev.langchain4j.data.document.Metadata(metadata)));
                chunkIndex++;
            }
 
            start = end - CHUNK_OVERLAP;
            if (start < 0) start = 0;
            if (start >= content.length() - CHUNK_OVERLAP) break;
        }
 
        return chunks;
    }
 
    private Map<String, Object> buildMetadata(KnowledgeBaseVector vector) {
        Map<String, Object> metadata = new HashMap<>();
        metadata.put("knowledgeBaseId", vector.getKnowledgeBaseId());
        metadata.put("storageBlobId", vector.getStorageBlobId());
        metadata.put("fileName", vector.getFileName());
        metadata.put("namespace", vector.getNamespace());
        return metadata;
    }
}