Ver código fonte

支持doc、docx文件上传

lamphua 1 mês atrás
pai
commit
fa2e59a9b1

+ 7
- 1
llm-back/ruoyi-agent/pom.xml Ver arquivo

@@ -50,14 +50,20 @@
50 50
             </exclusions>
51 51
         </dependency>
52 52
 
53
+        <dependency>
54
+            <groupId>org.apache.poi</groupId>
55
+            <artifactId>poi</artifactId>
56
+            <version>5.2.5</version>
57
+        </dependency>
53 58
         <dependency>
54 59
             <groupId>org.apache.poi</groupId>
55 60
             <artifactId>poi-ooxml</artifactId>
61
+            <version>5.2.5</version>
56 62
         </dependency>
57 63
         <dependency>
58 64
             <groupId>org.apache.poi</groupId>
59 65
             <artifactId>poi-scratchpad</artifactId>
60
-            <version>5.2.3</version>
66
+            <version>5.2.5</version>
61 67
         </dependency>
62 68
         <dependency>
63 69
             <groupId>com.alibaba.fastjson2</groupId>

+ 93
- 72
llm-back/ruoyi-agent/src/main/java/com/ruoyi/agent/service/impl/McpServiceImpl.java Ver arquivo

@@ -24,11 +24,10 @@ import io.milvus.param.collection.LoadCollectionParam;
24 24
 import io.milvus.param.collection.ReleaseCollectionParam;
25 25
 import io.milvus.param.dml.SearchParam;
26 26
 import io.milvus.response.SearchResultsWrapper;
27
-import org.apache.poi.hwpf.HWPFDocument;
27
+import org.apache.poi.extractor.POITextExtractor;
28
+import org.apache.poi.extractor.ExtractorFactory;
28 29
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
29
-import org.apache.poi.xwpf.usermodel.XWPFDocument;
30
-import org.apache.poi.xwpf.usermodel.XWPFParagraph;
31
-import org.apache.poi.xwpf.usermodel.XWPFRun;
30
+import org.apache.poi.xwpf.usermodel.*;
32 31
 import org.noear.solon.Solon;
33 32
 import org.noear.solon.ai.annotation.ToolMapping;
34 33
 import org.noear.solon.ai.chat.ChatModel;
@@ -65,9 +64,13 @@ public class McpServiceImpl implements IMcpService {
65 64
                                            @Param(description = "章节名称") String title,
66 65
                                            @Param(description = "技术文件地址") String templatePath) throws IOException
67 66
     {
68
-        title = String.join(",", extractSubTitles( "/upload/agent/template/technical.docx", title));
69
-        List<JSONObject> contexts = retrieveFromMilvus(milvusClient, embeddingModel, collectionName, title, 10);
70
-        return generateAnswerWithDocumentAndCollection(embeddingModel, agentName, templatePath, title, contexts, "http://192.168.28.188:8000/v1/chat/completions");
67
+            try {
68
+                title = String.join(",", extractSubTitles( "/upload/agent/template/technical.docx", title));
69
+                List<JSONObject> contexts = retrieveFromMilvus(milvusClient, embeddingModel, collectionName, title, 10);
70
+                return generateAnswerWithDocumentAndCollection(embeddingModel, agentName, templatePath, title, contexts, "http://192.168.28.188:8000/v1/chat/completions");
71
+            } catch (IOException e) {
72
+                throw new RuntimeException(e);
73
+            }
71 74
     }
72 75
 
73 76
     /**
@@ -91,7 +94,12 @@ public class McpServiceImpl implements IMcpService {
91 94
      */
92 95
     public AssistantMessage generateAnswerWithDocumentAndCollection(EmbeddingModel embeddingModel, String agentName, String templatePath, String question, List<JSONObject> contexts, String llmServiceUrl) throws IOException {
93 96
         StringBuilder sb = new StringBuilder("招标文件内容:\n\n");
94
-        File profilePath = new File(templatePath.replace("/dev-api/profile", Solon.cfg().getProperty("cmc.profile")).replace("_" + agentName, ""));
97
+        String filename = templatePath.replace("/dev-api/profile", Solon.cfg().getProperty("cmc.profile")).replace("_" + agentName, "");
98
+        File profilePath = new File(filename);
99
+        if (!profilePath.exists()) {
100
+            filename = filename.replace(".docx", ".doc");
101
+            profilePath = new File(filename);
102
+        }
95 103
         List<TextSegment> segments = splitDocument(profilePath);
96 104
         List<Embedding> embeddings = embeddingModel.embedAll(segments).content();
97 105
         InMemoryEmbeddingStore<TextSegment> embeddingStore = new InMemoryEmbeddingStore<>();
@@ -131,7 +139,7 @@ public class McpServiceImpl implements IMcpService {
131 139
 
132 140
         ChatResponse response = chatModel.prompt(chatSession).call();
133 141
         String content = response.lastChoice().getMessage().getContent() + "\n\n" +
134
-                "招标文件分析完成,章节内容已写入【<a href='" + templatePath.replace("/dev-api", "") + "'> 技术文件" + "</a>】,请查阅";
142
+                "招标文件分析完成,章节内容已写入【<a href='" + templatePath + "'> 技术文件" + "</a>】,请查阅";
135 143
         String absolutePath = templatePath.replace("/dev-api/profile", Solon.cfg().getProperty("cmc.profile"));
136 144
         writeContent(response.lastChoice().getMessage().getContent(), question, absolutePath);
137 145
         return ChatMessage.ofAssistant(content);
@@ -144,47 +152,60 @@ public class McpServiceImpl implements IMcpService {
144 152
     public void writeContent(String content, String question, String absolutePath) throws IOException {
145 153
         File file = new File(absolutePath);
146 154
         FileInputStream fileInputStream = new FileInputStream(file);
147
-        XWPFDocument document = new XWPFDocument(fileInputStream);
148
-        String[] contentLines = content.split("\n");
149
-        Map<String, String> map = new HashMap<>();
150
-        String[] titles = question.split(",");
151
-        for (int i = 0; i < titles.length; i ++) {
152
-            int startIndex = Arrays.asList(contentLines).indexOf(titles[i]);
153
-            StringBuilder text = new StringBuilder("");
154
-            if (i < titles.length - 1) {
155
-                int endIndex = Arrays.asList(contentLines).indexOf(titles[i + 1]);
156
-                for (int c = startIndex + 1; c < endIndex; c++)
157
-                    text.append(contentLines[c]);
158
-            }
159
-            else {
160
-                if (startIndex + 1 < contentLines.length)
161
-                    for (int c = startIndex + 1; c < contentLines.length; c++)
155
+        try (XWPFDocument document = new XWPFDocument(fileInputStream)) {
156
+            String[] contentLines = content.split("\n");
157
+            Map<String, String> map = new HashMap<>();
158
+            String[] titles = question.split(",");
159
+            for (int i = 0; i < titles.length; i++) {
160
+                int startIndex = Arrays.asList(contentLines).indexOf(titles[i]);
161
+                StringBuilder text = new StringBuilder("");
162
+                if (i < titles.length - 1) {
163
+                    int endIndex = Arrays.asList(contentLines).indexOf(titles[i + 1]);
164
+                    for (int c = startIndex + 1; c < endIndex; c++)
162 165
                         text.append(contentLines[c]);
166
+                } else {
167
+                    if (startIndex + 1 < contentLines.length)
168
+                        for (int c = startIndex + 1; c < contentLines.length; c++)
169
+                            text.append(contentLines[c]);
170
+                }
171
+                map.put(titles[i], text.toString());
163 172
             }
164
-            map.put(titles[i], text.toString());
165
-        }
166
-        int count = 0;
167
-        for (int i = 0; i < document.getParagraphs().size(); i++) {
168
-            XWPFParagraph paragraph = document.getParagraphs().get(i);
169
-            for (String title : titles) {
170
-                if (paragraph.getText().equals(title)) {
171
-                    int pos = document.getBodyElements().indexOf(paragraph) + 1;
172
-                    XWPFParagraph contentParagraph = document.createParagraph();
173
-                    contentParagraph.setStyle("1");
174
-                    XWPFRun run = contentParagraph.createRun();
175
-                    run.setText(map.get(title));
176
-                    document.setParagraph(contentParagraph, pos);
177
-                    count++;
173
+            int count = 0;
174
+            int position = 0;
175
+            List<Integer> positions = new ArrayList<>();
176
+            List<String> contents = new ArrayList<>();
177
+            for (int i = 0; i < document.getBodyElements().size(); i++) {
178
+                IBodyElement element = document.getBodyElements().get(i);
179
+                if (element instanceof XWPFParagraph) {
180
+                    XWPFParagraph paragraph = (XWPFParagraph) element;
181
+                    for (String title : titles) {
182
+                        if (paragraph.getText().equals(title)) {
183
+                            int pos = position + 1;
184
+                            positions.add(pos);
185
+                            contents.add(map.get(title));
186
+                            count++;
187
+                        }
188
+                    }
189
+                    position++;
190
+                    if (count == titles.length)
191
+                        break;
178 192
                 }
193
+                else if (element instanceof XWPFTable) {
194
+                    XWPFTable table = (XWPFTable) element;
195
+                    position += table.getNumberOfRows();
196
+                }
197
+            }
198
+            for (int i = positions.size() - 1; i >= 0; i--) {
199
+                XWPFParagraph contentParagraph = document.createParagraph();
200
+                contentParagraph.setStyle("1");
201
+                document.setParagraph(contentParagraph, positions.get(i));
202
+                XWPFRun run = contentParagraph.createRun();
203
+                run.setText(contents.get(i));
204
+            }
205
+            try (FileOutputStream out = new FileOutputStream(absolutePath)) {
206
+                document.write(out);
179 207
             }
180
-            if (count == titles.length)
181
-                break;
182 208
         }
183
-        FileOutputStream out = new FileOutputStream(absolutePath);
184
-        document.write(out);
185
-        // 关闭文档
186
-        out.close();
187
-        document.close();
188 209
     }
189 210
 
190 211
     /**
@@ -195,25 +216,26 @@ public class McpServiceImpl implements IMcpService {
195 216
         boolean inTargetSection = false;
196 217
         filename = Solon.cfg().getProperty("cmc.profile") + filename;
197 218
         InputStream fileInputStream = new FileInputStream(filename);
198
-        XWPFDocument document = new XWPFDocument(fileInputStream);
199
-        for (XWPFParagraph paragraph : document.getParagraphs()) {
200
-            String text = paragraph.getText().trim();
201
-            if (paragraph.getStyle() != null) {
202
-                // 判断主标题
203
-                if (paragraph.getStyle().equals("3") &&
204
-                        text.contains(question)) {
205
-                    inTargetSection = true;
206
-                    continue;
207
-                }
208
-
209
-                // 如果已经在目标节中,收集标题3级别的子标题
210
-                if (inTargetSection) {
211
-                    if (paragraph.getStyle().equals("4")) {
212
-                        subTitles.add(text);
219
+        try (XWPFDocument document = new XWPFDocument(fileInputStream)) {
220
+            for (XWPFParagraph paragraph : document.getParagraphs()) {
221
+                String text = paragraph.getText().trim();
222
+                if (paragraph.getStyle() != null) {
223
+                    // 判断主标题
224
+                    if (paragraph.getStyle().equals("3") &&
225
+                            text.contains(question)) {
226
+                        inTargetSection = true;
227
+                        continue;
213 228
                     }
214
-                    // 遇到下一个Heading1则退出
215
-                    else if (paragraph.getStyle().equals("3")) {
216
-                        break;
229
+
230
+                    // 如果已经在目标节中,收集标题3级别的子标题
231
+                    if (inTargetSection) {
232
+                        if (paragraph.getStyle().equals("4")) {
233
+                            subTitles.add(text);
234
+                        }
235
+                        // 遇到下一个Heading1则退出
236
+                        else if (paragraph.getStyle().equals("3")) {
237
+                            break;
238
+                        }
217 239
                     }
218 240
                 }
219 241
             }
@@ -271,15 +293,14 @@ public class McpServiceImpl implements IMcpService {
271 293
         Document document;
272 294
         InputStream fileInputStream = new FileInputStream(transferFile);
273 295
         String filename = transferFile.getName().toLowerCase();
274
-        if (filename.endsWith(".doc")) {
275
-            HWPFDocument doc = new HWPFDocument(fileInputStream);
276
-            document = Document.from(doc.getDocumentText());
277
-        }
278
-        else if (filename.endsWith(".docx")) {
279
-            XWPFDocument docx = new XWPFDocument(fileInputStream);
280
-            XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
281
-            String text = extractor.getText();
282
-            document = Document.from(text);
296
+        if (filename.endsWith(".doc") || filename.endsWith(".docx")) {
297
+            try (POITextExtractor extractor = ExtractorFactory.createExtractor(fileInputStream)) {
298
+                String text = extractor.getText();
299
+                document = Document.from(text);
300
+            }
301
+            catch (IOException e) {
302
+                throw new RuntimeException(e);
303
+            }
283 304
         }
284 305
         else if (filename.endsWith(".pdf")) {
285 306
             document = new ApachePdfBoxDocumentParser().parse(fileInputStream);

+ 11
- 0
llm-back/ruoyi-common/pom.xml Ver arquivo

@@ -72,9 +72,20 @@
72 72
         </dependency>
73 73
 
74 74
         <!-- excel工具 -->
75
+        <dependency>
76
+            <groupId>org.apache.poi</groupId>
77
+            <artifactId>poi</artifactId>
78
+            <version>5.2.5</version>
79
+        </dependency>
75 80
         <dependency>
76 81
             <groupId>org.apache.poi</groupId>
77 82
             <artifactId>poi-ooxml</artifactId>
83
+            <version>5.2.5</version>
84
+        </dependency>
85
+        <dependency>
86
+            <groupId>org.apache.poi</groupId>
87
+            <artifactId>poi-scratchpad</artifactId>
88
+            <version>5.2.5</version>
78 89
         </dependency>
79 90
 
80 91
         <!-- yml解析器 -->

+ 34
- 34
llm-back/ruoyi-llm/src/main/java/com/ruoyi/web/llm/service/impl/LangChainMilvusServiceImpl.java Ver arquivo

@@ -28,8 +28,8 @@ import io.milvus.param.collection.ReleaseCollectionParam;
28 28
 import io.milvus.param.dml.InsertParam;
29 29
 import io.milvus.param.dml.SearchParam;
30 30
 import io.milvus.response.SearchResultsWrapper;
31
-import org.apache.poi.hwpf.HWPFDocument;
32
-import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
31
+import org.apache.poi.extractor.POITextExtractor;
32
+import org.apache.poi.extractor.ExtractorFactory;
33 33
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
34 34
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
35 35
 import org.noear.solon.ai.chat.ChatModel;
@@ -69,7 +69,7 @@ public class LangChainMilvusServiceImpl implements ILangChainMilvusService
69 69
         File transferFile = new File( profilePath + "/" + file.getOriginalFilename());
70 70
         if (!transferFile.exists())
71 71
             file.transferTo(transferFile);
72
-        List<TextSegment> segments = splitDocument(file.getOriginalFilename(), transferFile);
72
+        List<TextSegment> segments = splitDocument(transferFile);
73 73
 
74 74
         // 提取文本和生成嵌入
75 75
         List<String> fileNames = new ArrayList<>();
@@ -200,7 +200,7 @@ public class LangChainMilvusServiceImpl implements ILangChainMilvusService
200 200
         StringBuilder sb = new StringBuilder("问题: " + question + "\n\n").append("根据以下上下文回答问题:\n\n");
201 201
         for (CmcDocument document : documentList) {
202 202
             File profilePath = new File(RuoYiConfig.getProfile() + document.getPath());
203
-            List<TextSegment> segments = splitDocument(document.getPath(), profilePath);
203
+            List<TextSegment> segments = splitDocument(profilePath);
204 204
             List<Embedding> embeddings = embeddingModel.embedAll(segments).content();
205 205
             InMemoryEmbeddingStore<TextSegment> embeddingStore = new InMemoryEmbeddingStore<>();
206 206
             embeddingStore.addAll(embeddings, segments);
@@ -236,7 +236,7 @@ public class LangChainMilvusServiceImpl implements ILangChainMilvusService
236 236
             if (documentList.size() == 1) {
237 237
                 for (CmcDocument document : documentList) {
238 238
                     File profilePath = new File(RuoYiConfig.getProfile() + document.getPath());
239
-                    List<TextSegment> segments = splitDocument(document.getPath(), profilePath);
239
+                    List<TextSegment> segments = splitDocument(profilePath);
240 240
                     List<Embedding> embeddings = embeddingModel.embedAll(segments).content();
241 241
                     InMemoryEmbeddingStore<TextSegment> embeddingStore = new InMemoryEmbeddingStore<>();
242 242
                     embeddingStore.addAll(embeddings, segments);
@@ -271,25 +271,26 @@ public class LangChainMilvusServiceImpl implements ILangChainMilvusService
271 271
         boolean inTargetSection = false;
272 272
 
273 273
         InputStream fileInputStream = new FileInputStream(filename);
274
-        XWPFDocument document = new XWPFDocument(fileInputStream);
275
-        for (XWPFParagraph paragraph : document.getParagraphs()) {
276
-            String text = paragraph.getText().trim();
277
-            if (paragraph.getStyle() != null) {
278
-                // 判断主标题
279
-                if (paragraph.getStyle().equals("3") &&
280
-                        text.contains(question)) {
281
-                    inTargetSection = true;
282
-                    continue;
283
-                }
284
-
285
-                // 如果已经在目标节中,收集标题3级别的子标题
286
-                if (inTargetSection) {
287
-                    if (paragraph.getStyle().equals("4")) {
288
-                        subTitles.add(text);
274
+        try (XWPFDocument document = new XWPFDocument(fileInputStream)) {
275
+            for (XWPFParagraph paragraph : document.getParagraphs()) {
276
+                String text = paragraph.getText().trim();
277
+                if (paragraph.getStyle() != null) {
278
+                    // 判断主标题
279
+                    if (paragraph.getStyle().equals("3") &&
280
+                            text.contains(question)) {
281
+                        inTargetSection = true;
282
+                        continue;
289 283
                     }
290
-                    // 遇到下一个Heading1则退出
291
-                    else if (paragraph.getStyle().equals("3")) {
292
-                        break;
284
+
285
+                    // 如果已经在目标节中,收集标题3级别的子标题
286
+                    if (inTargetSection) {
287
+                        if (paragraph.getStyle().equals("4")) {
288
+                            subTitles.add(text);
289
+                        }
290
+                        // 遇到下一个Heading1则退出
291
+                        else if (paragraph.getStyle().equals("3")) {
292
+                            break;
293
+                        }
293 294
                     }
294 295
                 }
295 296
             }
@@ -342,20 +343,19 @@ public class LangChainMilvusServiceImpl implements ILangChainMilvusService
342 343
     /**
343 344
      * 检索知识库
344 345
      */
345
-    private List<TextSegment> splitDocument(String filename, File transferFile) throws IOException {
346
+    private List<TextSegment> splitDocument(File transferFile) throws IOException {
346 347
         // 加载文档
347 348
         Document document;
348 349
         InputStream fileInputStream = new FileInputStream(transferFile);
349
-        filename = filename.toLowerCase();
350
-        if (filename.endsWith(".doc")) {
351
-            HWPFDocument doc = new HWPFDocument(fileInputStream);
352
-            document = Document.from(doc.getDocumentText());
353
-        }
354
-        else if (filename.endsWith(".docx")) {
355
-            XWPFDocument docx = new XWPFDocument(fileInputStream);
356
-            XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
357
-            String text = extractor.getText();
358
-            document = Document.from(text);
350
+        String filename = transferFile.getName().toLowerCase();
351
+        if (filename.endsWith(".doc") || filename.endsWith(".docx")) {
352
+            try (POITextExtractor extractor = ExtractorFactory.createExtractor(fileInputStream)) {
353
+                String text = extractor.getText();
354
+                document = Document.from(text);
355
+            }
356
+            catch (IOException e) {
357
+                throw new RuntimeException(e);
358
+            }
359 359
         }
360 360
         else if (filename.endsWith(".pdf")) {
361 361
             document = new ApachePdfBoxDocumentParser().parse(fileInputStream);

+ 9
- 13
llm-back/ruoyi-system/src/main/java/com/ruoyi/llm/service/impl/CmcAgentServiceImpl.java Ver arquivo

@@ -1,9 +1,6 @@
1 1
 package com.ruoyi.llm.service.impl;
2 2
 
3
-import java.io.File;
4
-import java.io.FileInputStream;
5
-import java.io.FileOutputStream;
6
-import java.io.IOException;
3
+import java.io.*;
7 4
 import java.util.Date;
8 5
 import java.util.List;
9 6
 
@@ -101,7 +98,7 @@ public class CmcAgentServiceImpl implements ICmcAgentService
101 98
         String outputFilename = "/upload/agent/" + agentName + "/" + file.getOriginalFilename()
102 99
                 .replace(filenameSplit[filenameSplit.length - 2], filenameSplit[filenameSplit.length - 2] + "_" + agentName);
103 100
         if (file.getOriginalFilename().endsWith(".doc"))
104
-            outputFilename = outputFilename.replace(".doc", "docx");
101
+            outputFilename = outputFilename.replace(".doc", ".docx");
105 102
         CmcDocument cmcDocument = new CmcDocument();
106 103
         cmcDocument.setDocumentId(new SnowFlake().generateId());
107 104
         cmcDocument.setChatId(chatId);
@@ -115,14 +112,13 @@ public class CmcAgentServiceImpl implements ICmcAgentService
115 112
             cmcChat.setInput("招标文件地址:" + "/upload/agent/" + agentName + "/" + file.getOriginalFilename());
116 113
             cmcChat.setUserId(SecurityUtils.getUserId());
117 114
             cmcChatMapper.insertCmcChat(cmcChat);
118
-            XWPFDocument doc = new XWPFDocument(new FileInputStream(RuoYiConfig.getProfile() + "/upload/agent/template/technical.docx"));
119
-            // 保存文档到本地文件系统
120
-            FileOutputStream out = new FileOutputStream(RuoYiConfig.getProfile() + outputFilename);
121
-            doc.write(out);
122
-            // 关闭文档
123
-            out.close();
124
-            doc.close();
125
-            
115
+            InputStream fileInputStream = new FileInputStream(RuoYiConfig.getProfile() + "/upload/agent/template/technical.docx");
116
+            try (XWPFDocument doc = new XWPFDocument(fileInputStream)) {
117
+                // 保存文档到本地文件系统
118
+                try (FileOutputStream out = new FileOutputStream(RuoYiConfig.getProfile() + outputFilename)) {
119
+                    doc.write(out);
120
+                }
121
+            }
126 122
             message = "好的,我已经收到您上传的招标文件,我将给您提供技术文件模板,您可点击进行预览:" +
127 123
                     "【<a href='/profile" + outputFilename + "'> 模版 " + "</a>】\n\n" +
128 124
                     "技术文件涉及多个章节:\n" +

Carregando…
Cancelar
Salvar