From 7f7300e6dc54968145f7e579eb0411b42e729718 Mon Sep 17 00:00:00 2001 From: Nanami Date: Sat, 25 Jan 2025 16:08:57 +0800 Subject: [PATCH] feat: Support configurable chunk size and overlap for knowledge base --- ...-tools-embedjs-npm-0.1.25-ec5645cf36.patch | 200 ++++++++++++++++++ src/main/services/KnowledgeService.ts | 71 ++++++- src/renderer/src/i18n/locales/en-us.json | 6 +- src/renderer/src/i18n/locales/ja-jp.json | 6 +- src/renderer/src/i18n/locales/ru-ru.json | 6 +- src/renderer/src/i18n/locales/zh-cn.json | 6 +- src/renderer/src/i18n/locales/zh-tw.json | 6 +- .../src/pages/knowledge/KnowledgeContent.tsx | 7 + .../components/AddKnowledgePopup.tsx | 28 ++- src/renderer/src/services/KnowledgeService.ts | 4 +- src/renderer/src/types/index.ts | 4 + yarn.lock | 4 +- 12 files changed, 328 insertions(+), 20 deletions(-) diff --git a/.yarn/patches/@llm-tools-embedjs-npm-0.1.25-ec5645cf36.patch b/.yarn/patches/@llm-tools-embedjs-npm-0.1.25-ec5645cf36.patch index 3b6d373e..4d853dd6 100644 --- a/.yarn/patches/@llm-tools-embedjs-npm-0.1.25-ec5645cf36.patch +++ b/.yarn/patches/@llm-tools-embedjs-npm-0.1.25-ec5645cf36.patch @@ -15,3 +15,203 @@ index 50c3c4064af17bc4c7c46554d8f2419b3afceb0e..632c9b2e04d2e0e3bb09ef1cd8f29d25 } static getInstance() { return RAGEmbedding.singleton; +diff --git a/src/loaders/local-path-loader.d.ts b/src/loaders/local-path-loader.d.ts +index 48c20e68c469cd309be2dc8f28e44c1bd04a26e9..87002be39e7305a02e2a607b0c0d95cbbc359f9d 100644 +--- a/src/loaders/local-path-loader.d.ts ++++ b/src/loaders/local-path-loader.d.ts +@@ -1,19 +1,29 @@ +-import { BaseLoader } from '@llm-tools/embedjs-interfaces'; ++import { BaseLoader } from "@llm-tools/embedjs-interfaces"; + export declare class LocalPathLoader extends BaseLoader<{ +- type: 'LocalPathLoader'; ++ type: "LocalPathLoader"; + }> { +- private readonly debug; +- private readonly path; +- constructor({ path }: { +- path: string; +- }); +- getUnfilteredChunks(): AsyncGenerator<{ +- metadata: { +- type: "LocalPathLoader"; +- originalPath: string; +- source: string; +- }; +- pageContent: string; +- }, void, unknown>; +- private recursivelyAddPath; ++ private readonly debug; ++ private readonly path; ++ constructor({ ++ path, ++ chunkSize, ++ chunkOverlap, ++ }: { ++ path: string; ++ chunkSize?: number; ++ chunkOverlap?: number; ++ }); ++ getUnfilteredChunks(): AsyncGenerator< ++ { ++ metadata: { ++ type: "LocalPathLoader"; ++ originalPath: string; ++ source: string; ++ }; ++ pageContent: string; ++ }, ++ void, ++ unknown ++ >; ++ private recursivelyAddPath; + } +diff --git a/src/loaders/local-path-loader.js b/src/loaders/local-path-loader.js +index 4cf8a6bd1d890244c8ec49d4a05ee3bd58861c79..fd0fe1951c73da315b0c9bf4a8f33effbadb9f8f 100644 +--- a/src/loaders/local-path-loader.js ++++ b/src/loaders/local-path-loader.js +@@ -8,8 +8,8 @@ import { BaseLoader } from '@llm-tools/embedjs-interfaces'; + export class LocalPathLoader extends BaseLoader { + debug = createDebugMessages('embedjs:loader:LocalPathLoader'); + path; +- constructor({ path }) { +- super(`LocalPathLoader_${md5(path)}`, { path }); ++ constructor({ path, chunkSize, chunkOverlap}) { ++ super(`LocalPathLoader_${md5(path)}`, { path }, chunkSize ?? 1000, chunkOverlap ?? 0); + this.path = path; + } + async *getUnfilteredChunks() { +@@ -36,10 +36,12 @@ export class LocalPathLoader extends BaseLoader { + const extension = currentPath.split('.').pop().toLowerCase(); + if (extension === 'md' || extension === 'mdx') + mime = 'text/markdown'; ++ if (extension === 'txt') ++ mime = 'text/plain'; + this.debug(`File '${this.path}' mime type updated to 'text/markdown'`); + } + try { +- const loader = await createLoaderFromMimeType(currentPath, mime); ++ const loader = await createLoaderFromMimeType(currentPath, mime, this.chunkSize, this.chunkOverlap); + for await (const result of await loader.getUnfilteredChunks()) { + yield { + pageContent: result.pageContent, +diff --git a/src/util/mime.d.ts b/src/util/mime.d.ts +index 57f56a1b8edc98366af9f84d671676c41c2f01ca..f53856fa9c78afbeee9e085c7ed0b3a131f8ee5a 100644 +--- a/src/util/mime.d.ts ++++ b/src/util/mime.d.ts +@@ -1,2 +1,7 @@ +-import { BaseLoader } from '@llm-tools/embedjs-interfaces'; +-export declare function createLoaderFromMimeType(loaderData: string, mimeType: string): Promise; ++import { BaseLoader } from "@llm-tools/embedjs-interfaces"; ++export declare function createLoaderFromMimeType( ++ loaderData: string, ++ mimeType: string, ++ chunkSize?: number, ++ chunkOverlap?: number ++): Promise; +diff --git a/src/util/mime.js b/src/util/mime.js +index 9af30bd5b8cf42985f547073a4c19756292c33a3..54ae20343131a533ab70236d3060b6accc8f6126 100644 +--- a/src/util/mime.js ++++ b/src/util/mime.js +@@ -1,7 +1,9 @@ + import mime from 'mime'; + import createDebugMessages from 'debug'; + import { TextLoader } from '../loaders/text-loader.js'; +-export async function createLoaderFromMimeType(loaderData, mimeType) { ++import fs from 'node:fs'; ++ ++export async function createLoaderFromMimeType(loaderData, mimeType, chunkSize, chunkOverlap) { + createDebugMessages('embedjs:util:createLoaderFromMimeType')(`Incoming mime type '${mimeType}'`); + switch (mimeType) { + case 'application/msword': +@@ -10,7 +12,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { + throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load docx files'); + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported DocxLoader'); +- return new DocxLoader({ filePathOrUrl: loaderData }); ++ return new DocxLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); + } + case 'application/vnd.ms-excel': + case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': { +@@ -18,21 +20,21 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { + throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load excel files'); + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported ExcelLoader'); +- return new ExcelLoader({ filePathOrUrl: loaderData }); ++ return new ExcelLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); + } + case 'application/pdf': { + const { PdfLoader } = await import('@llm-tools/embedjs-loader-pdf').catch(() => { + throw new Error('Package `@llm-tools/embedjs-loader-pdf` needs to be installed to load PDF files'); + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PdfLoader'); +- return new PdfLoader({ filePathOrUrl: loaderData }); ++ return new PdfLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); + } + case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': { + const { PptLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => { + throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load pptx files'); + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PptLoader'); +- return new PptLoader({ filePathOrUrl: loaderData }); ++ return new PptLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); + } + case 'text/plain': { + const fineType = mime.getType(loaderData); +@@ -42,24 +44,26 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { + throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files'); + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader'); +- return new CsvLoader({ filePathOrUrl: loaderData }); ++ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); ++ } ++ else{ ++ const content = fs.readFileSync(loaderData, 'utf-8'); ++ return new TextLoader({ text: content, chunkSize, chunkOverlap }); + } +- else +- return new TextLoader({ text: loaderData }); + } + case 'application/csv': { + const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => { + throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files'); + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader'); +- return new CsvLoader({ filePathOrUrl: loaderData }); ++ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); + } + case 'text/html': { + const { WebLoader } = await import('@llm-tools/embedjs-loader-web').catch(() => { + throw new Error('Package `@llm-tools/embedjs-loader-web` needs to be installed to load web documents'); + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported WebLoader'); +- return new WebLoader({ urlOrContent: loaderData }); ++ return new WebLoader({ urlOrContent: loaderData, chunkSize, chunkOverlap }); + } + case 'text/xml': { + const { SitemapLoader } = await import('@llm-tools/embedjs-loader-sitemap').catch(() => { +@@ -67,14 +71,14 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported SitemapLoader'); + if (await SitemapLoader.test(loaderData)) { +- return new SitemapLoader({ url: loaderData }); ++ return new SitemapLoader({ url: loaderData, chunkSize, chunkOverlap }); + } + //This is not a Sitemap but is still XML + const { XmlLoader } = await import('@llm-tools/embedjs-loader-xml').catch(() => { + throw new Error('Package `@llm-tools/embedjs-loader-xml` needs to be installed to load XML documents'); + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported XmlLoader'); +- return new XmlLoader({ filePathOrUrl: loaderData }); ++ return new XmlLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); + } + case 'text/x-markdown': + case 'text/markdown': { +@@ -82,7 +86,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { + throw new Error('Package `@llm-tools/embedjs-loader-markdown` needs to be installed to load markdown files'); + }); + createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported MarkdownLoader'); +- return new MarkdownLoader({ filePathOrUrl: loaderData }); ++ return new MarkdownLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); + } + case undefined: + throw new Error(`MIME type could not be detected. Please file an issue if you think this is a bug.`); diff --git a/src/main/services/KnowledgeService.ts b/src/main/services/KnowledgeService.ts index 2b1b3e30..9010f4c2 100644 --- a/src/main/services/KnowledgeService.ts +++ b/src/main/services/KnowledgeService.ts @@ -83,54 +83,103 @@ class KnowledgeService { if (item.type === 'directory') { const directory = item.content as string - return await ragApplication.addLoader(new LocalPathLoader({ path: directory }), forceReload) + return await ragApplication.addLoader( + new LocalPathLoader({ path: directory, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any, + forceReload + ) } if (item.type === 'url') { const content = item.content as string if (content.startsWith('http')) { - // @ts-ignore loader type - return await ragApplication.addLoader(new WebLoader({ urlOrContent: content }), forceReload) + return await ragApplication.addLoader( + new WebLoader({ urlOrContent: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any, + forceReload + ) } } if (item.type === 'sitemap') { const content = item.content as string // @ts-ignore loader type - return await ragApplication.addLoader(new SitemapLoader({ url: content }), forceReload) + return await ragApplication.addLoader( + new SitemapLoader({ url: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any, + forceReload + ) } if (item.type === 'note') { const content = item.content as string - return await ragApplication.addLoader(new TextLoader({ text: content }), forceReload) + return await ragApplication.addLoader( + new TextLoader({ text: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }), + forceReload + ) } if (item.type === 'file') { const file = item.content as FileType if (file.ext === '.pdf') { - return await ragApplication.addLoader(new PdfLoader({ filePathOrUrl: file.path }) as any, forceReload) + return await ragApplication.addLoader( + new PdfLoader({ + filePathOrUrl: file.path, + chunkSize: base.chunkSize, + chunkOverlap: base.chunkOverlap + }) as any, + forceReload + ) } if (file.ext === '.docx') { - return await ragApplication.addLoader(new DocxLoader({ filePathOrUrl: file.path }) as any, forceReload) + return await ragApplication.addLoader( + new DocxLoader({ + filePathOrUrl: file.path, + chunkSize: base.chunkSize, + chunkOverlap: base.chunkOverlap + }) as any, + forceReload + ) } if (file.ext === '.pptx') { - return await ragApplication.addLoader(new PptLoader({ filePathOrUrl: file.path }) as any, forceReload) + return await ragApplication.addLoader( + new PptLoader({ + filePathOrUrl: file.path, + chunkSize: base.chunkSize, + chunkOverlap: base.chunkOverlap + }) as any, + forceReload + ) } if (file.ext === '.xlsx') { - return await ragApplication.addLoader(new ExcelLoader({ filePathOrUrl: file.path }) as any, forceReload) + return await ragApplication.addLoader( + new ExcelLoader({ + filePathOrUrl: file.path, + chunkSize: base.chunkSize, + chunkOverlap: base.chunkOverlap + }) as any, + forceReload + ) } if (['.md'].includes(file.ext)) { - return await ragApplication.addLoader(new MarkdownLoader({ filePathOrUrl: file.path }) as any, forceReload) + return await ragApplication.addLoader( + new MarkdownLoader({ + filePathOrUrl: file.path, + chunkSize: base.chunkSize, + chunkOverlap: base.chunkOverlap + }) as any, + forceReload + ) } const fileContent = fs.readFileSync(file.path, 'utf-8') - return await ragApplication.addLoader(new TextLoader({ text: fileContent }), forceReload) + return await ragApplication.addLoader( + new TextLoader({ text: fileContent, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }), + forceReload + ) } return { entriesAdded: 0, uniqueId: '', loaderType: '' } diff --git a/src/renderer/src/i18n/locales/en-us.json b/src/renderer/src/i18n/locales/en-us.json index 5f57f452..dc230ae5 100644 --- a/src/renderer/src/i18n/locales/en-us.json +++ b/src/renderer/src/i18n/locales/en-us.json @@ -245,6 +245,7 @@ "error.enter.api.key": "Please enter your API key first", "error.enter.model": "Please select a model first", "error.enter.name": "Please enter the name of the knowledge base", + "error.chunk_overlap_too_large": "Chunk overlap cannot be greater than chunk size", "error.invalid.proxy.url": "Invalid proxy URL", "error.invalid.webdav": "Invalid WebDAV settings", "message.code_style": "Code style", @@ -625,7 +626,10 @@ "model_info": "Model Info", "not_support": "Knowledge base database engine updated, the knowledge base will no longer be supported, please create a new knowledge base", "no_provider": "Knowledge base model provider is not set, the knowledge base will no longer be supported, please create a new knowledge base", - "source": "Source" + "source": "Source", + "chunk_size": "Chunk Size", + "chunk_overlap": "Chunk Overlap", + "not_set": "Not Set" }, "models": { "pinned": "Pinned", diff --git a/src/renderer/src/i18n/locales/ja-jp.json b/src/renderer/src/i18n/locales/ja-jp.json index 8e11e990..75112095 100644 --- a/src/renderer/src/i18n/locales/ja-jp.json +++ b/src/renderer/src/i18n/locales/ja-jp.json @@ -244,6 +244,7 @@ "error.enter.api.host": "APIホストを入力してください", "error.enter.api.key": "APIキーを入力してください", "error.enter.model": "モデルを選択してください", + "error.chunk_overlap_too_large": "チャンクの重なりは、チャンクサイズを超えることはできません", "error.invalid.proxy.url": "無効なプロキシURL", "error.invalid.webdav": "無効なWebDAV設定", "message.code_style": "コードスタイル", @@ -609,7 +610,10 @@ "model_info": "モデル情報", "not_support": "ナレッジベースデータベースエンジンが更新されました。このナレッジベースはもうサポートされていません。新しいナレッジベースを作成してください", "no_provider": "ナレッジベースモデルプロバイダーが設定されていません。ナレッジベースはもうサポートされていません。新しいナレッジベースを作成してください", - "source": "ソース" + "source": "ソース", + "chunk_size": "チャンクサイズ", + "chunk_overlap": "チャンクの重なり", + "not_set": "未設定" }, "models": { "pinned": "固定済み", diff --git a/src/renderer/src/i18n/locales/ru-ru.json b/src/renderer/src/i18n/locales/ru-ru.json index 97a55348..b24296c7 100644 --- a/src/renderer/src/i18n/locales/ru-ru.json +++ b/src/renderer/src/i18n/locales/ru-ru.json @@ -245,6 +245,7 @@ "error.enter.api.key": "Пожалуйста, введите ваш API ключ", "error.enter.model": "Пожалуйста, выберите модель", "error.enter.name": "Пожалуйста, введите название базы знаний", + "error.chunk_overlap_too_large": "Перекрытие фрагментов не может быть больше размера фрагмента.", "error.invalid.proxy.url": "Неверный URL прокси", "error.invalid.webdav": "Неверные настройки WebDAV", "message.code_style": "Стиль кода", @@ -622,7 +623,10 @@ "model_info": "Модель информации", "not_support": "База знаний базы данных движок обновлен, база знаний больше не поддерживается, пожалуйста, создайте новую базу знаний", "no_provider": "База знаний модель поставщика не настроена, база знаний больше не поддерживается, пожалуйста, создайте новую базу знаний", - "source": "Источник" + "source": "Источник", + "chunk_size": "Размер фрагмента", + "chunk_overlap": "Перекрытие фрагмента", + "not_set": "Не установлено" }, "models": { "pinned": "Закреплено", diff --git a/src/renderer/src/i18n/locales/zh-cn.json b/src/renderer/src/i18n/locales/zh-cn.json index f77c233f..a7761225 100644 --- a/src/renderer/src/i18n/locales/zh-cn.json +++ b/src/renderer/src/i18n/locales/zh-cn.json @@ -246,6 +246,7 @@ "error.enter.api.key": "请输入您的 API 密钥", "error.enter.model": "请选择一个模型", "error.enter.name": "请输入知识库名称", + "error.chunk_overlap_too_large": "分段重叠不能大于分段大小", "error.invalid.proxy.url": "无效的代理地址", "error.invalid.webdav": "无效的 WebDAV 设置", "message.code_style": "代码风格", @@ -611,7 +612,10 @@ "model_info": "模型信息", "not_support": "知识库数据库引擎已更新,该知识库将不再支持,请重新创建知识库", "no_provider": "知识库模型服务商丢失,该知识库将不再支持,请重新创建知识库", - "source": "来源" + "source": "来源", + "chunk_size": "分段大小", + "chunk_overlap": "重叠大小", + "not_set": "未设置" }, "models": { "pinned": "已固定", diff --git a/src/renderer/src/i18n/locales/zh-tw.json b/src/renderer/src/i18n/locales/zh-tw.json index 97cc6b1e..90506ae6 100644 --- a/src/renderer/src/i18n/locales/zh-tw.json +++ b/src/renderer/src/i18n/locales/zh-tw.json @@ -245,6 +245,7 @@ "error.enter.api.key": "請先輸入您的 API 密鑰", "error.enter.model": "請先選擇一個模型", "error.enter.name": "請先輸入知識庫名稱", + "error.chunk_overlap_too_large": "分段重疊不能大於分段大小", "error.invalid.proxy.url": "無效的代理 URL", "error.invalid.webdav": "無效的 WebDAV 設定", "message.code_style": "程式碼風格", @@ -610,7 +611,10 @@ "model_info": "模型信息", "not_support": "知識庫數據庫引擎已更新,該知識庫將不再支持,請重新創建知識庫", "no_provider": "知識庫模型提供商遺失,該知識庫將不再支持,請重新創建知識庫", - "source": "來源" + "source": "來源", + "chunk_size": "分段大小", + "chunk_overlap": "重疊大小", + "not_set": "未設置" }, "models": { "pinned": "已固定", diff --git a/src/renderer/src/pages/knowledge/KnowledgeContent.tsx b/src/renderer/src/pages/knowledge/KnowledgeContent.tsx index 805d8352..5030b544 100644 --- a/src/renderer/src/pages/knowledge/KnowledgeContent.tsx +++ b/src/renderer/src/pages/knowledge/KnowledgeContent.tsx @@ -361,6 +361,13 @@ const KnowledgeContent: FC = ({ selectedBase }) => { {providerName && {providerName}} + + + {base.chunkSize || t('knowledge.not_set')} + + {base.chunkOverlap || t('knowledge.not_set')} + +