From a1ffabae4101b27fe91efc3eb50c40811bf419d5 Mon Sep 17 00:00:00 2001 From: Chen Tao <70054568+eeee0717@users.noreply.github.com> Date: Tue, 22 Apr 2025 20:17:11 +0800 Subject: [PATCH] fix(knowledge): fix citation bug and optimize extract logic (#5195) * fix(knowledge): change search ui and fix search bug * fix: knowledge citation * feat: optimize extract logic --- src/renderer/src/config/prompts.ts | 2 +- .../pages/home/Messages/MessageContent.tsx | 97 ++++++++++++++----- src/renderer/src/services/ApiService.ts | 8 +- 3 files changed, 82 insertions(+), 25 deletions(-) diff --git a/src/renderer/src/config/prompts.ts b/src/renderer/src/config/prompts.ts index 1d5ef1de..a5139792 100644 --- a/src/renderer/src/config/prompts.ts +++ b/src/renderer/src/config/prompts.ts @@ -60,7 +60,7 @@ export const SEARCH_SUMMARY_PROMPT = ` 4. Websearch: Always return the rephrased question inside the 'question' XML block. If there are no links in the follow-up question, do not insert a 'links' XML block in your response. 5. Knowledge: Always return the rephrased question inside the 'question' XML block. 6. Always wrap the rephrased question in the appropriate XML blocks to specify the tool(s) for retrieving information: use for queries requiring real-time or external information, for queries that can be answered from a pre-existing knowledge base, or both if the question could be applicable to either tool. Ensure that the rephrased question is always contained within a block inside these wrappers. - 7. If you are not sure to use knowledge or websearch, you need use both of them. + 7. *use {tools} to rephrase the question* There are several examples attached for your reference inside the below 'examples' XML block. diff --git a/src/renderer/src/pages/home/Messages/MessageContent.tsx b/src/renderer/src/pages/home/Messages/MessageContent.tsx index e6a28522..ddf1b864 100644 --- a/src/renderer/src/pages/home/Messages/MessageContent.tsx +++ b/src/renderer/src/pages/home/Messages/MessageContent.tsx @@ -30,7 +30,7 @@ const toolUseRegex = /([\s\S]*?)<\/tool_use>/g const MessageContent: React.FC = ({ message: _message, model }) => { const { t } = useTranslation() - const message = withMessageThought(clone(_message)) + let message = withMessageThought(clone(_message)) // Memoize message status checks const messageStatus = useMemo( @@ -119,6 +119,9 @@ const MessageContent: React.FC = ({ message: _message, model }) => { message.metadata?.webSearchInfo ]) + /** + * 知识库索引部分:解决LLM回复中未使用的知识库引用索引问题 + */ // Process content to make citation numbers clickable const processedContent = useMemo(() => { const metadataFields = ['citations', 'webSearch', 'webSearchInfo', 'annotations', 'knowledge'] @@ -129,42 +132,90 @@ const MessageContent: React.FC = ({ message: _message, model }) => { return content } - // 预先计算citations数组,避免重复计算 + // 预先计算citations数组 const websearchResults = message?.metadata?.webSearch?.results?.map((result) => result.url) || [] const knowledgeResults = message?.metadata?.knowledge?.map((result) => result.sourceUrl) || [] const citations = message?.metadata?.citations || [...websearchResults, ...knowledgeResults] + const webSearchLength = websearchResults.length // 计算 web search 结果的数量 - // 优化正则表达式匹配 if (message.metadata?.webSearch || message.metadata?.knowledge) { - // 合并两个正则为一个,减少遍历次数 - content = content.replace(/\[\[(\d+)\]\]|\[(\d+)\]/g, (match, num1, num2) => { - const num = num1 || num2 - const index = parseInt(num) - 1 + const usedOriginalIndexes: number[] = [] + const citationRegex = /\[\[(\d+)\]\]|\[(\d+)\]/g - if (index < 0 || index >= citations.length) { - return match + // 第一步: 识别有效的原始索引 + for (const match of content.matchAll(citationRegex)) { + const numStr = match[1] || match[2] + const index = parseInt(numStr) - 1 + if (index >= webSearchLength && index < citations.length && citations[index]) { + if (!usedOriginalIndexes.includes(index)) { + usedOriginalIndexes.push(index) + } + } + } + // 对使用的原始索引进行排序,以便后续查找新索引 + usedOriginalIndexes.sort((a, b) => a - b) + + // 创建原始索引到新索引的映射 + const originalIndexToNewIndexMap = new Map() + usedOriginalIndexes.forEach((originalIndex, newIndex) => { + originalIndexToNewIndexMap.set(originalIndex, newIndex) + }) + + // 第二步: 替换并使用新的索引编号 + content = content.replace(citationRegex, (match, num1, num2) => { + const numStr = num1 || num2 + const originalIndex = parseInt(numStr) - 1 + + // 检查索引是否有效 + if (originalIndex < 0 || originalIndex >= citations.length || !citations[originalIndex]) { + return match // 无效索引,返回原文 } - const link = citations[index] - - if (!link) { - return match - } - - const isWebLink = link.startsWith('http://') || link.startsWith('https://') - if (!isWebLink) { - return `${num}` - } - - const citation = citationsData[link] || { url: link } + const link = citations[originalIndex] + const citation = { ...(citationsData[link] || { url: link }) } if (citation.content) { citation.content = citation.content.substring(0, 200) } + const citationDataHtml = encodeHTML(JSON.stringify(citation)) - return `[${num}](${link})` + // 检查是否是 *被使用的知识库* 引用 + if (originalIndexToNewIndexMap.has(originalIndex)) { + const newIndex = originalIndexToNewIndexMap.get(originalIndex)! + const newCitationNum = webSearchLength + newIndex + 1 // 重新编号的知识库引用 (从websearch index+1开始) + + const isWebLink = link.startsWith('http://') || link.startsWith('https://') + if (!isWebLink) { + // 知识库引用通常不是网页链接,只显示上标数字 + return `${newCitationNum}` + } else { + // 如果知识库源是网页链接 (特殊情况) + return `[${newCitationNum}](${link})` + } + } + // 检查是否是 *Web搜索* 引用 + else if (originalIndex < webSearchLength) { + const citationNum = originalIndex + 1 // Web搜索引用保持原编号 (从1开始) + return `[${citationNum}](${link})` + } + // 其他情况 (如未使用的知识库引用),返回原文 + else { + return match + } }) + + // 过滤掉未使用的知识索引 + message = { + ...message, + metadata: { + ...message.metadata, + // 根据其对应的全局索引是否存在于 usedOriginalIndexes 来过滤 + knowledge: message.metadata.knowledge?.filter((_, knowledgeIndex) => + usedOriginalIndexes.includes(knowledgeIndex + webSearchLength) + ) + } + } } else { - // 使用预编译的正则表达式 + // 处理非 webSearch/knowledge 的情况 (这部分逻辑保持不变) const citationRegex = /\[(\d+)<\/sup>\]\(([^)]+)\)/g content = content.replace(citationRegex, (_, num, url) => { const citation = citationsData[url] || { url } diff --git a/src/renderer/src/services/ApiService.ts b/src/renderer/src/services/ApiService.ts index ea910342..14c408e5 100644 --- a/src/renderer/src/services/ApiService.ts +++ b/src/renderer/src/services/ApiService.ts @@ -70,10 +70,16 @@ export async function fetchChatCompletion({ // 网络搜索/知识库 关键词提取 const extract = async () => { + const tools: string[] = [] + + if (assistant.enableWebSearch) tools.push('websearch') + if (hasKnowledgeBase) tools.push('knowledge') + const summaryAssistant = { ...assistant, - prompt: SEARCH_SUMMARY_PROMPT + prompt: SEARCH_SUMMARY_PROMPT.replace('{tools}', tools.join(', ')) } + const keywords = await fetchSearchSummary({ messages: lastAnswer ? [lastAnswer, lastUserMessage] : [lastUserMessage], assistant: summaryAssistant