From a1ffabae4101b27fe91efc3eb50c40811bf419d5 Mon Sep 17 00:00:00 2001
From: Chen Tao <70054568+eeee0717@users.noreply.github.com>
Date: Tue, 22 Apr 2025 20:17:11 +0800
Subject: [PATCH] fix(knowledge): fix citation bug and optimize extract logic
 (#5195)

* fix(knowledge): change search ui and fix search bug

* fix: knowledge citation

* feat: optimize extract logic
---
 src/renderer/src/config/prompts.ts            |  2 +-
 .../pages/home/Messages/MessageContent.tsx    | 97 ++++++++++++++-----
 src/renderer/src/services/ApiService.ts       |  8 +-
 3 files changed, 82 insertions(+), 25 deletions(-)
diff --git a/src/renderer/src/config/prompts.ts b/src/renderer/src/config/prompts.ts
index 1d5ef1de..a5139792 100644
--- a/src/renderer/src/config/prompts.ts
+++ b/src/renderer/src/config/prompts.ts
@@ -60,7 +60,7 @@ export const SEARCH_SUMMARY_PROMPT = `
   4. Websearch: Always return the rephrased question inside the 'question' XML block. If there are no links in the follow-up question, do not insert a 'links' XML block in your response.
   5. Knowledge: Always return the rephrased question inside the 'question' XML block.
   6. Always wrap the rephrased question in the appropriate XML blocks to specify the tool(s) for retrieving information: use <websearch></websearch> for queries requiring real-time or external information, <knowledge></knowledge> for queries that can be answered from a pre-existing knowledge base, or both if the question could be applicable to either tool. Ensure that the rephrased question is always contained within a <question></question> block inside these wrappers.
-  7. If you are not sure to use knowledge or websearch, you need use both of them.
+  7. *use {tools} to rephrase the question*
 
   There are several examples attached for your reference inside the below 'examples' XML block.
 
diff --git a/src/renderer/src/pages/home/Messages/MessageContent.tsx b/src/renderer/src/pages/home/Messages/MessageContent.tsx
index e6a28522..ddf1b864 100644
--- a/src/renderer/src/pages/home/Messages/MessageContent.tsx
+++ b/src/renderer/src/pages/home/Messages/MessageContent.tsx
@@ -30,7 +30,7 @@ const toolUseRegex = /<tool_use>([\s\S]*?)<\/tool_use>/g
 
 const MessageContent: React.FC<Props> = ({ message: _message, model }) => {
   const { t } = useTranslation()
-  const message = withMessageThought(clone(_message))
+  let message = withMessageThought(clone(_message))
 
   // Memoize message status checks
   const messageStatus = useMemo(
@@ -119,6 +119,9 @@ const MessageContent: React.FC<Props> = ({ message: _message, model }) => {
     message.metadata?.webSearchInfo
   ])
 
+  /**
+   * Knowledge-base citation indexes: handles knowledge-base citation indexes that the LLM reply does not actually use.
+   */
   // Process content to make citation numbers clickable
   const processedContent = useMemo(() => {
     const metadataFields = ['citations', 'webSearch', 'webSearchInfo', 'annotations', 'knowledge']
@@ -129,42 +132,90 @@ const MessageContent: React.FC<Props> = ({ message: _message, model }) => {
       return content
     }
 
-    // 预先计算citations数组，避免重复计算
+    // Precompute the citations array
     const websearchResults = message?.metadata?.webSearch?.results?.map((result) => result.url) || []
     const knowledgeResults = message?.metadata?.knowledge?.map((result) => result.sourceUrl) || []
     const citations = message?.metadata?.citations || [...websearchResults, ...knowledgeResults]
+    const webSearchLength = websearchResults.length // 计算 web search 结果的数量
 
-    // 优化正则表达式匹配
     if (message.metadata?.webSearch || message.metadata?.knowledge) {
-      // 合并两个正则为一个，减少遍历次数
-      content = content.replace(/\[\[(\d+)\]\]|\[(\d+)\]/g, (match, num1, num2) => {
-        const num = num1 || num2
-        const index = parseInt(num) - 1
+      const usedOriginalIndexes: number[] = []
+      const citationRegex = /\[\[(\d+)\]\]|\[(\d+)\]/g
 
-        if (index < 0 || index >= citations.length) {
-          return match
+      // Step 1: identify the valid original indexes (knowledge-base citations that the content actually references)
+      for (const match of content.matchAll(citationRegex)) {
+        const numStr = match[1] || match[2]
+        const index = parseInt(numStr) - 1
+        if (index >= webSearchLength && index < citations.length && citations[index]) {
+          if (!usedOriginalIndexes.includes(index)) {
+            usedOriginalIndexes.push(index)
+          }
+        }
+      }
+      // Sort the used original indexes so that each one's position gives its new index
+      usedOriginalIndexes.sort((a, b) => a - b)
+
+      // Build a map from original index to new index
+      const originalIndexToNewIndexMap = new Map<number, number>()
+      usedOriginalIndexes.forEach((originalIndex, newIndex) => {
+        originalIndexToNewIndexMap.set(originalIndex, newIndex)
+      })
+
+      // Step 2: replace the citation markers, using the new index numbers
+      content = content.replace(citationRegex, (match, num1, num2) => {
+        const numStr = num1 || num2
+        const originalIndex = parseInt(numStr) - 1
+
+        // Check that the index is valid
+        if (originalIndex < 0 || originalIndex >= citations.length || !citations[originalIndex]) {
+          return match // 无效索引，返回原文
         }
 
-        const link = citations[index]
-
-        if (!link) {
-          return match
-        }
-
-        const isWebLink = link.startsWith('http://') || link.startsWith('https://')
-        if (!isWebLink) {
-          return `<sup>${num}</sup>`
-        }
-
-        const citation = citationsData[link] || { url: link }
+        const link = citations[originalIndex]
+        const citation = { ...(citationsData[link] || { url: link }) }
         if (citation.content) {
           citation.content = citation.content.substring(0, 200)
         }
+        const citationDataHtml = encodeHTML(JSON.stringify(citation))
 
-        return `[<sup data-citation='${encodeHTML(JSON.stringify(citation))}'>${num}</sup>](${link})`
+        // Check whether this is a *used knowledge-base* citation
+        if (originalIndexToNewIndexMap.has(originalIndex)) {
+          const newIndex = originalIndexToNewIndexMap.get(originalIndex)!
+          const newCitationNum = webSearchLength + newIndex + 1 // 重新编号的知识库引用 (从websearch index+1开始)
+
+          const isWebLink = link.startsWith('http://') || link.startsWith('https://')
+          if (!isWebLink) {
+            // Knowledge-base citations are usually not web links; show only the superscript number
+            return `<sup>${newCitationNum}</sup>`
+          } else {
+            // Special case: the knowledge-base source is itself a web link
+            return `[<sup data-citation='${citationDataHtml}'>${newCitationNum}</sup>](${link})`
+          }
+        }
+        // Check whether this is a *web search* citation
+        else if (originalIndex < webSearchLength) {
+          const citationNum = originalIndex + 1 // Web搜索引用保持原编号 (从1开始)
+          return `[<sup data-citation='${citationDataHtml}'>${citationNum}</sup>](${link})`
+        }
+        // 其他情况 (如未使用的知识库引用)，返回原文
+        else {
+          return match
+        }
       })
+
+      // Filter out the knowledge entries that are not cited in the content
+      message = {
+        ...message,
+        metadata: {
+          ...message.metadata,
+          // Keep an entry only if its global index (knowledgeIndex + webSearchLength) is in usedOriginalIndexes
+          knowledge: message.metadata.knowledge?.filter((_, knowledgeIndex) =>
+            usedOriginalIndexes.includes(knowledgeIndex + webSearchLength)
+          )
+        }
+      }
     } else {
-      // 使用预编译的正则表达式
+      // Handle the case without webSearch/knowledge metadata (this logic is unchanged)
       const citationRegex = /\[<sup>(\d+)<\/sup>\]\(([^)]+)\)/g
       content = content.replace(citationRegex, (_, num, url) => {
         const citation = citationsData[url] || { url }
diff --git a/src/renderer/src/services/ApiService.ts b/src/renderer/src/services/ApiService.ts
index ea910342..14c408e5 100644
--- a/src/renderer/src/services/ApiService.ts
+++ b/src/renderer/src/services/ApiService.ts
@@ -70,10 +70,16 @@ export async function fetchChatCompletion({
 
   // 网络搜索/知识库 关键词提取
   const extract = async () => {
+    const tools: string[] = []
+
+    if (assistant.enableWebSearch) tools.push('websearch')
+    if (hasKnowledgeBase) tools.push('knowledge')
+
     const summaryAssistant = {
       ...assistant,
-      prompt: SEARCH_SUMMARY_PROMPT
+      prompt: SEARCH_SUMMARY_PROMPT.replace('{tools}', tools.join(', '))
     }
+
     const keywords = await fetchSearchSummary({
       messages: lastAnswer ? [lastAnswer, lastUserMessage] : [lastUserMessage],
       assistant: summaryAssistant