feat: Support configurable chunk size and overlap for knowledge base

This commit is contained in:
Nanami 2025-01-25 16:08:57 +08:00 committed by 亢奋猫
parent 4464992873
commit 7f7300e6dc
12 changed files with 328 additions and 20 deletions

View File

@ -15,3 +15,203 @@ index 50c3c4064af17bc4c7c46554d8f2419b3afceb0e..632c9b2e04d2e0e3bb09ef1cd8f29d25
}
static getInstance() {
return RAGEmbedding.singleton;
diff --git a/src/loaders/local-path-loader.d.ts b/src/loaders/local-path-loader.d.ts
index 48c20e68c469cd309be2dc8f28e44c1bd04a26e9..87002be39e7305a02e2a607b0c0d95cbbc359f9d 100644
--- a/src/loaders/local-path-loader.d.ts
+++ b/src/loaders/local-path-loader.d.ts
@@ -1,19 +1,29 @@
-import { BaseLoader } from '@llm-tools/embedjs-interfaces';
+import { BaseLoader } from "@llm-tools/embedjs-interfaces";
export declare class LocalPathLoader extends BaseLoader<{
- type: 'LocalPathLoader';
+ type: "LocalPathLoader";
}> {
- private readonly debug;
- private readonly path;
- constructor({ path }: {
- path: string;
- });
- getUnfilteredChunks(): AsyncGenerator<{
- metadata: {
- type: "LocalPathLoader";
- originalPath: string;
- source: string;
- };
- pageContent: string;
- }, void, unknown>;
- private recursivelyAddPath;
+ private readonly debug;
+ private readonly path;
+ constructor({
+ path,
+ chunkSize,
+ chunkOverlap,
+ }: {
+ path: string;
+ chunkSize?: number;
+ chunkOverlap?: number;
+ });
+ getUnfilteredChunks(): AsyncGenerator<
+ {
+ metadata: {
+ type: "LocalPathLoader";
+ originalPath: string;
+ source: string;
+ };
+ pageContent: string;
+ },
+ void,
+ unknown
+ >;
+ private recursivelyAddPath;
}
diff --git a/src/loaders/local-path-loader.js b/src/loaders/local-path-loader.js
index 4cf8a6bd1d890244c8ec49d4a05ee3bd58861c79..fd0fe1951c73da315b0c9bf4a8f33effbadb9f8f 100644
--- a/src/loaders/local-path-loader.js
+++ b/src/loaders/local-path-loader.js
@@ -8,8 +8,8 @@ import { BaseLoader } from '@llm-tools/embedjs-interfaces';
export class LocalPathLoader extends BaseLoader {
debug = createDebugMessages('embedjs:loader:LocalPathLoader');
path;
- constructor({ path }) {
- super(`LocalPathLoader_${md5(path)}`, { path });
+ constructor({ path, chunkSize, chunkOverlap}) {
+ super(`LocalPathLoader_${md5(path)}`, { path }, chunkSize ?? 1000, chunkOverlap ?? 0);
this.path = path;
}
async *getUnfilteredChunks() {
@@ -36,10 +36,12 @@ export class LocalPathLoader extends BaseLoader {
const extension = currentPath.split('.').pop().toLowerCase();
if (extension === 'md' || extension === 'mdx')
mime = 'text/markdown';
+ if (extension === 'txt')
+ mime = 'text/plain';
this.debug(`File '${this.path}' mime type updated to 'text/markdown'`);
}
try {
- const loader = await createLoaderFromMimeType(currentPath, mime);
+ const loader = await createLoaderFromMimeType(currentPath, mime, this.chunkSize, this.chunkOverlap);
for await (const result of await loader.getUnfilteredChunks()) {
yield {
pageContent: result.pageContent,
diff --git a/src/util/mime.d.ts b/src/util/mime.d.ts
index 57f56a1b8edc98366af9f84d671676c41c2f01ca..f53856fa9c78afbeee9e085c7ed0b3a131f8ee5a 100644
--- a/src/util/mime.d.ts
+++ b/src/util/mime.d.ts
@@ -1,2 +1,7 @@
-import { BaseLoader } from '@llm-tools/embedjs-interfaces';
-export declare function createLoaderFromMimeType(loaderData: string, mimeType: string): Promise<BaseLoader>;
+import { BaseLoader } from "@llm-tools/embedjs-interfaces";
+export declare function createLoaderFromMimeType(
+ loaderData: string,
+ mimeType: string,
+ chunkSize?: number,
+ chunkOverlap?: number
+): Promise<BaseLoader>;
diff --git a/src/util/mime.js b/src/util/mime.js
index 9af30bd5b8cf42985f547073a4c19756292c33a3..54ae20343131a533ab70236d3060b6accc8f6126 100644
--- a/src/util/mime.js
+++ b/src/util/mime.js
@@ -1,7 +1,9 @@
import mime from 'mime';
import createDebugMessages from 'debug';
import { TextLoader } from '../loaders/text-loader.js';
-export async function createLoaderFromMimeType(loaderData, mimeType) {
+import fs from 'node:fs';
+
+export async function createLoaderFromMimeType(loaderData, mimeType, chunkSize, chunkOverlap) {
createDebugMessages('embedjs:util:createLoaderFromMimeType')(`Incoming mime type '${mimeType}'`);
switch (mimeType) {
case 'application/msword':
@@ -10,7 +12,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load docx files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported DocxLoader');
- return new DocxLoader({ filePathOrUrl: loaderData });
+ return new DocxLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'application/vnd.ms-excel':
case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': {
@@ -18,21 +20,21 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load excel files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported ExcelLoader');
- return new ExcelLoader({ filePathOrUrl: loaderData });
+ return new ExcelLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'application/pdf': {
const { PdfLoader } = await import('@llm-tools/embedjs-loader-pdf').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-pdf` needs to be installed to load PDF files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PdfLoader');
- return new PdfLoader({ filePathOrUrl: loaderData });
+ return new PdfLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': {
const { PptLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load pptx files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PptLoader');
- return new PptLoader({ filePathOrUrl: loaderData });
+ return new PptLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'text/plain': {
const fineType = mime.getType(loaderData);
@@ -42,24 +44,26 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
- return new CsvLoader({ filePathOrUrl: loaderData });
+ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
+ }
+ else{
+ const content = fs.readFileSync(loaderData, 'utf-8');
+ return new TextLoader({ text: content, chunkSize, chunkOverlap });
}
- else
- return new TextLoader({ text: loaderData });
}
case 'application/csv': {
const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
- return new CsvLoader({ filePathOrUrl: loaderData });
+ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'text/html': {
const { WebLoader } = await import('@llm-tools/embedjs-loader-web').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-web` needs to be installed to load web documents');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported WebLoader');
- return new WebLoader({ urlOrContent: loaderData });
+ return new WebLoader({ urlOrContent: loaderData, chunkSize, chunkOverlap });
}
case 'text/xml': {
const { SitemapLoader } = await import('@llm-tools/embedjs-loader-sitemap').catch(() => {
@@ -67,14 +71,14 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported SitemapLoader');
if (await SitemapLoader.test(loaderData)) {
- return new SitemapLoader({ url: loaderData });
+ return new SitemapLoader({ url: loaderData, chunkSize, chunkOverlap });
}
//This is not a Sitemap but is still XML
const { XmlLoader } = await import('@llm-tools/embedjs-loader-xml').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-xml` needs to be installed to load XML documents');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported XmlLoader');
- return new XmlLoader({ filePathOrUrl: loaderData });
+ return new XmlLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'text/x-markdown':
case 'text/markdown': {
@@ -82,7 +86,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
throw new Error('Package `@llm-tools/embedjs-loader-markdown` needs to be installed to load markdown files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported MarkdownLoader');
- return new MarkdownLoader({ filePathOrUrl: loaderData });
+ return new MarkdownLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case undefined:
throw new Error(`MIME type could not be detected. Please file an issue if you think this is a bug.`);

View File

@ -83,54 +83,103 @@ class KnowledgeService {
if (item.type === 'directory') {
const directory = item.content as string
return await ragApplication.addLoader(new LocalPathLoader({ path: directory }), forceReload)
return await ragApplication.addLoader(
new LocalPathLoader({ path: directory, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any,
forceReload
)
}
if (item.type === 'url') {
const content = item.content as string
if (content.startsWith('http')) {
// @ts-ignore loader type
return await ragApplication.addLoader(new WebLoader({ urlOrContent: content }), forceReload)
return await ragApplication.addLoader(
new WebLoader({ urlOrContent: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any,
forceReload
)
}
}
if (item.type === 'sitemap') {
const content = item.content as string
// @ts-ignore loader type
return await ragApplication.addLoader(new SitemapLoader({ url: content }), forceReload)
return await ragApplication.addLoader(
new SitemapLoader({ url: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any,
forceReload
)
}
if (item.type === 'note') {
const content = item.content as string
return await ragApplication.addLoader(new TextLoader({ text: content }), forceReload)
return await ragApplication.addLoader(
new TextLoader({ text: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }),
forceReload
)
}
if (item.type === 'file') {
const file = item.content as FileType
if (file.ext === '.pdf') {
return await ragApplication.addLoader(new PdfLoader({ filePathOrUrl: file.path }) as any, forceReload)
return await ragApplication.addLoader(
new PdfLoader({
filePathOrUrl: file.path,
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}) as any,
forceReload
)
}
if (file.ext === '.docx') {
return await ragApplication.addLoader(new DocxLoader({ filePathOrUrl: file.path }) as any, forceReload)
return await ragApplication.addLoader(
new DocxLoader({
filePathOrUrl: file.path,
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}) as any,
forceReload
)
}
if (file.ext === '.pptx') {
return await ragApplication.addLoader(new PptLoader({ filePathOrUrl: file.path }) as any, forceReload)
return await ragApplication.addLoader(
new PptLoader({
filePathOrUrl: file.path,
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}) as any,
forceReload
)
}
if (file.ext === '.xlsx') {
return await ragApplication.addLoader(new ExcelLoader({ filePathOrUrl: file.path }) as any, forceReload)
return await ragApplication.addLoader(
new ExcelLoader({
filePathOrUrl: file.path,
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}) as any,
forceReload
)
}
if (['.md'].includes(file.ext)) {
return await ragApplication.addLoader(new MarkdownLoader({ filePathOrUrl: file.path }) as any, forceReload)
return await ragApplication.addLoader(
new MarkdownLoader({
filePathOrUrl: file.path,
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}) as any,
forceReload
)
}
const fileContent = fs.readFileSync(file.path, 'utf-8')
return await ragApplication.addLoader(new TextLoader({ text: fileContent }), forceReload)
return await ragApplication.addLoader(
new TextLoader({ text: fileContent, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }),
forceReload
)
}
return { entriesAdded: 0, uniqueId: '', loaderType: '' }

View File

@ -245,6 +245,7 @@
"error.enter.api.key": "Please enter your API key first",
"error.enter.model": "Please select a model first",
"error.enter.name": "Please enter the name of the knowledge base",
"error.chunk_overlap_too_large": "Chunk overlap cannot be greater than chunk size",
"error.invalid.proxy.url": "Invalid proxy URL",
"error.invalid.webdav": "Invalid WebDAV settings",
"message.code_style": "Code style",
@ -625,7 +626,10 @@
"model_info": "Model Info",
"not_support": "Knowledge base database engine updated, the knowledge base will no longer be supported, please create a new knowledge base",
"no_provider": "Knowledge base model provider is not set, the knowledge base will no longer be supported, please create a new knowledge base",
"source": "Source"
"source": "Source",
"chunk_size": "Chunk Size",
"chunk_overlap": "Chunk Overlap",
"not_set": "Not Set"
},
"models": {
"pinned": "Pinned",

View File

@ -244,6 +244,7 @@
"error.enter.api.host": "APIホストを入力してください",
"error.enter.api.key": "APIキーを入力してください",
"error.enter.model": "モデルを選択してください",
"error.chunk_overlap_too_large": "チャンクの重なりは、チャンクサイズを超えることはできません",
"error.invalid.proxy.url": "無効なプロキシURL",
"error.invalid.webdav": "無効なWebDAV設定",
"message.code_style": "コードスタイル",
@ -609,7 +610,10 @@
"model_info": "モデル情報",
"not_support": "ナレッジベースデータベースエンジンが更新されました。このナレッジベースはもうサポートされていません。新しいナレッジベースを作成してください",
"no_provider": "ナレッジベースモデルプロバイダーが設定されていません。ナレッジベースはもうサポートされていません。新しいナレッジベースを作成してください",
"source": "ソース"
"source": "ソース",
"chunk_size": "チャンクサイズ",
"chunk_overlap": "チャンクの重なり",
"not_set": "未設定"
},
"models": {
"pinned": "固定済み",

View File

@ -245,6 +245,7 @@
"error.enter.api.key": "Пожалуйста, введите ваш API ключ",
"error.enter.model": "Пожалуйста, выберите модель",
"error.enter.name": "Пожалуйста, введите название базы знаний",
"error.chunk_overlap_too_large": "Перекрытие фрагментов не может быть больше размера фрагмента.",
"error.invalid.proxy.url": "Неверный URL прокси",
"error.invalid.webdav": "Неверные настройки WebDAV",
"message.code_style": "Стиль кода",
@ -622,7 +623,10 @@
"model_info": "Модель информации",
"not_support": "База знаний базы данных движок обновлен, база знаний больше не поддерживается, пожалуйста, создайте новую базу знаний",
"no_provider": "База знаний модель поставщика не настроена, база знаний больше не поддерживается, пожалуйста, создайте новую базу знаний",
"source": "Источник"
"source": "Источник",
"chunk_size": "Размер фрагмента",
"chunk_overlap": "Перекрытие фрагмента",
"not_set": "Не установлено"
},
"models": {
"pinned": "Закреплено",

View File

@ -246,6 +246,7 @@
"error.enter.api.key": "请输入您的 API 密钥",
"error.enter.model": "请选择一个模型",
"error.enter.name": "请输入知识库名称",
"error.chunk_overlap_too_large": "分段重叠不能大于分段大小",
"error.invalid.proxy.url": "无效的代理地址",
"error.invalid.webdav": "无效的 WebDAV 设置",
"message.code_style": "代码风格",
@ -611,7 +612,10 @@
"model_info": "模型信息",
"not_support": "知识库数据库引擎已更新,该知识库将不再支持,请重新创建知识库",
"no_provider": "知识库模型服务商丢失,该知识库将不再支持,请重新创建知识库",
"source": "来源"
"source": "来源",
"chunk_size": "分段大小",
"chunk_overlap": "重叠大小",
"not_set": "未设置"
},
"models": {
"pinned": "已固定",

View File

@ -245,6 +245,7 @@
"error.enter.api.key": "請先輸入您的 API 密鑰",
"error.enter.model": "請先選擇一個模型",
"error.enter.name": "請先輸入知識庫名稱",
"error.chunk_overlap_too_large": "分段重疊不能大於分段大小",
"error.invalid.proxy.url": "無效的代理 URL",
"error.invalid.webdav": "無效的 WebDAV 設定",
"message.code_style": "程式碼風格",
@ -610,7 +611,10 @@
"model_info": "模型信息",
"not_support": "知識庫數據庫引擎已更新,該知識庫將不再支持,請重新創建知識庫",
"no_provider": "知識庫模型提供商遺失,該知識庫將不再支持,請重新創建知識庫",
"source": "來源"
"source": "來源",
"chunk_size": "分段大小",
"chunk_overlap": "重疊大小",
"not_set": "未設置"
},
"models": {
"pinned": "已固定",

View File

@ -361,6 +361,13 @@ const KnowledgeContent: FC<KnowledgeContentProps> = ({ selectedBase }) => {
{providerName && <Tag color="purple">{providerName}</Tag>}
</ModelInfo>
<ModelInfo>
<label htmlFor="model-info">{t('knowledge.chunk_size')}</label>
<Tag color="green">{base.chunkSize || t('knowledge.not_set')}</Tag>
<label htmlFor="model-info">{t('knowledge.chunk_overlap')}</label>
<Tag color="orange">{base.chunkOverlap || t('knowledge.not_set')}</Tag>
</ModelInfo>
<IndexSection>
<Button
type="primary"

View File

@ -6,7 +6,7 @@ import AiProvider from '@renderer/providers/AiProvider'
import { getKnowledgeBaseParams } from '@renderer/services/KnowledgeService'
import { getModelUniqId } from '@renderer/services/ModelService'
import { Model } from '@renderer/types'
import { Form, Input, Modal, Select } from 'antd'
import { Form, Input, InputNumber, Modal, Select } from 'antd'
import { find, sortBy } from 'lodash'
import { nanoid } from 'nanoid'
import { useState } from 'react'
@ -19,6 +19,8 @@ interface ShowParams {
interface FormData {
name: string
model: string
chunkSize?: number
chunkOverlap?: number
}
interface Props extends ShowParams {
@ -81,6 +83,8 @@ const PopupContainer: React.FC<Props> = ({ title, resolve }) => {
name: values.name,
model: selectedModel,
dimensions,
chunkSize: values.chunkSize,
chunkOverlap: values.chunkOverlap,
items: [],
created_at: Date.now(),
updated_at: Date.now(),
@ -131,6 +135,28 @@ const PopupContainer: React.FC<Props> = ({ title, resolve }) => {
rules={[{ required: true, message: t('message.error.enter.model') }]}>
<Select style={{ width: '100%' }} options={selectOptions} placeholder={t('settings.models.empty')} />
</Form.Item>
<Form.Item name="chunkSize" label={t('knowledge.chunk_size')} initialValue={1000}>
<InputNumber style={{ width: '100%' }} min={1} />
</Form.Item>
<Form.Item
name="chunkOverlap"
label={t('knowledge.chunk_overlap')}
initialValue={0}
rules={[
({ getFieldValue }) => ({
validator(_, value) {
if (!value || getFieldValue('chunkSize') > value) {
return Promise.resolve()
}
return Promise.reject(new Error(t('message.error.chunk_overlap_too_large')))
}
})
]}
dependencies={['chunkSize']}>
<InputNumber style={{ width: '100%' }} min={0} />
</Form.Item>
</Form>
</Modal>
)

View File

@ -22,7 +22,9 @@ export const getKnowledgeBaseParams = (base: KnowledgeBase): KnowledgeBaseParams
dimensions: base.dimensions,
apiKey: aiProvider.getApiKey() || 'secret',
apiVersion: provider.apiVersion,
baseURL: host
baseURL: host,
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}
}

View File

@ -232,6 +232,8 @@ export interface KnowledgeBase {
created_at: number
updated_at: number
version: number
chunkSize?: number
chunkOverlap?: number
}
export type KnowledgeBaseParams = {
@ -241,6 +243,8 @@ export type KnowledgeBaseParams = {
apiKey: string
apiVersion?: string
baseURL: string
chunkSize?: number
chunkOverlap?: number
}
export type GenerateImageParams = {

View File

@ -1693,7 +1693,7 @@ __metadata:
"@llm-tools/embedjs@patch:@llm-tools/embedjs@npm%3A0.1.25#~/.yarn/patches/@llm-tools-embedjs-npm-0.1.25-ec5645cf36.patch":
version: 0.1.25
resolution: "@llm-tools/embedjs@patch:@llm-tools/embedjs@npm%3A0.1.25#~/.yarn/patches/@llm-tools-embedjs-npm-0.1.25-ec5645cf36.patch::version=0.1.25&hash=7b05b5"
resolution: "@llm-tools/embedjs@patch:@llm-tools/embedjs@npm%3A0.1.25#~/.yarn/patches/@llm-tools-embedjs-npm-0.1.25-ec5645cf36.patch::version=0.1.25&hash=3b8a9c"
dependencies:
"@langchain/textsplitters": "npm:^0.1.0"
"@llm-tools/embedjs-interfaces": "npm:0.1.25"
@ -1703,7 +1703,7 @@ __metadata:
md5: "npm:^2.3.0"
mime: "npm:^4.0.6"
stream-mime-type: "npm:^2.0.0"
checksum: 10c0/d0a37a5c7232571a71eff7e90ff4ba612bf33022a6eccd933c3a778844320f427a936d0851aae00092e34407c8c2f3555fe4444c6f2139f978ecfdd42fd89375
checksum: 10c0/3ef5fb0068e662d9fc3ff794c0c200fca91fba548d1989a628ad2c3576e3f97838f3abca683adc77b1774d57e09c6d155c1c4b9d69eb20aac26bd274148f72a1
languageName: node
linkType: hard