cherry-studio/.yarn/patches/@llm-tools-embedjs-npm-0.1.28-8e4393fa2d.patch

159 lines
9.3 KiB
Diff

diff --git a/src/loaders/local-path-loader.d.ts b/src/loaders/local-path-loader.d.ts
index 48c20e68c469cd309be2dc8f28e44c1bd04a26e9..1c16d83bcbf9b7140292793d6cbb8c04281949d9 100644
--- a/src/loaders/local-path-loader.d.ts
+++ b/src/loaders/local-path-loader.d.ts
@@ -4,8 +4,10 @@ export declare class LocalPathLoader extends BaseLoader<{
}> {
private readonly debug;
private readonly path;
- constructor({ path }: {
+ constructor({ path, chunkSize, chunkOverlap }: {
path: string;
+ chunkSize?: number;
+ chunkOverlap?: number;
});
getUnfilteredChunks(): AsyncGenerator<{
metadata: {
diff --git a/src/loaders/local-path-loader.js b/src/loaders/local-path-loader.js
index 4cf8a6bd1d890244c8ec49d4a05ee3bd58861c79..ec8215b01195a21ef20f3c5d56ecc99f186bb596 100644
--- a/src/loaders/local-path-loader.js
+++ b/src/loaders/local-path-loader.js
@@ -8,8 +8,8 @@ import { BaseLoader } from '@llm-tools/embedjs-interfaces';
export class LocalPathLoader extends BaseLoader {
debug = createDebugMessages('embedjs:loader:LocalPathLoader');
path;
- constructor({ path }) {
- super(`LocalPathLoader_${md5(path)}`, { path });
+ constructor({ path, chunkSize, chunkOverlap }) {
+ super(`LocalPathLoader_${md5(path)}`, { path }, chunkSize ?? 1000, chunkOverlap ?? 0);
this.path = path;
}
async *getUnfilteredChunks() {
@@ -36,10 +36,12 @@ export class LocalPathLoader extends BaseLoader {
const extension = currentPath.split('.').pop().toLowerCase();
if (extension === 'md' || extension === 'mdx')
mime = 'text/markdown';
+ if (extension === 'txt')
+ mime = 'text/plain';
this.debug(`File '${this.path}' mime type updated to 'text/markdown'`);
}
try {
- const loader = await createLoaderFromMimeType(currentPath, mime);
+ const loader = await createLoaderFromMimeType(currentPath, mime, this.chunkSize, this.chunkOverlap);
for await (const result of await loader.getUnfilteredChunks()) {
yield {
pageContent: result.pageContent,
diff --git a/src/util/mime.d.ts b/src/util/mime.d.ts
index 57f56a1b8edc98366af9f84d671676c41c2f01ca..14be3b5727cff6eb1978838045e9a788f8f53bfb 100644
--- a/src/util/mime.d.ts
+++ b/src/util/mime.d.ts
@@ -1,2 +1,2 @@
import { BaseLoader } from '@llm-tools/embedjs-interfaces';
-export declare function createLoaderFromMimeType(loaderData: string, mimeType: string): Promise<BaseLoader>;
+export declare function createLoaderFromMimeType(loaderData: string, mimeType: string, chunkSize?: number, chunkOverlap?: number): Promise<BaseLoader>;
diff --git a/src/util/mime.js b/src/util/mime.js
index b6426a859968e2bf6206795f70333e90ae27aeb7..16ae2adb863f8d7abfa757f1c5cc39f6bb1c44fa 100644
--- a/src/util/mime.js
+++ b/src/util/mime.js
@@ -1,7 +1,9 @@
import mime from 'mime';
import createDebugMessages from 'debug';
import { TextLoader } from '../loaders/text-loader.js';
-export async function createLoaderFromMimeType(loaderData, mimeType) {
+import fs from 'node:fs'
+
+export async function createLoaderFromMimeType(loaderData, mimeType, chunkSize, chunkOverlap) {
createDebugMessages('embedjs:util:createLoaderFromMimeType')(`Incoming mime type '${mimeType}'`);
switch (mimeType) {
case 'application/msword':
@@ -10,7 +12,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load docx files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported DocxLoader');
- return new DocxLoader({ filePathOrUrl: loaderData });
+ return new DocxLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'application/vnd.ms-excel':
case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': {
@@ -18,21 +20,21 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load excel files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported ExcelLoader');
- return new ExcelLoader({ filePathOrUrl: loaderData });
+ return new ExcelLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'application/pdf': {
const { PdfLoader } = await import('@llm-tools/embedjs-loader-pdf').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-pdf` needs to be installed to load PDF files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PdfLoader');
- return new PdfLoader({ filePathOrUrl: loaderData });
+ return new PdfLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': {
const { PptLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load pptx files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PptLoader');
- return new PptLoader({ filePathOrUrl: loaderData });
+ return new PptLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'text/plain': {
const fineType = mime.getType(loaderData);
@@ -42,24 +44,24 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
- return new CsvLoader({ filePathOrUrl: loaderData });
+ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
- else
- return new TextLoader({ text: loaderData });
+ const content = fs.readFileSync(loaderData, 'utf-8');
+ return new TextLoader({ text: content, chunkSize, chunkOverlap });
}
case 'application/csv': {
const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
- return new CsvLoader({ filePathOrUrl: loaderData });
+ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'text/html': {
const { WebLoader } = await import('@llm-tools/embedjs-loader-web').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-web` needs to be installed to load web documents');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported WebLoader');
- return new WebLoader({ urlOrContent: loaderData });
+ return new WebLoader({ urlOrContent: loaderData, chunkSize, chunkOverlap });
}
case 'text/xml': {
const { SitemapLoader } = await import('@llm-tools/embedjs-loader-sitemap').catch(() => {
@@ -67,14 +69,14 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported SitemapLoader');
if (await SitemapLoader.test(loaderData)) {
- return new SitemapLoader({ url: loaderData });
+ return new SitemapLoader({ url: loaderData, chunkSize, chunkOverlap });
}
//This is not a Sitemap but is still XML
const { XmlLoader } = await import('@llm-tools/embedjs-loader-xml').catch(() => {
throw new Error('Package `@llm-tools/embedjs-loader-xml` needs to be installed to load XML documents');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported XmlLoader');
- return new XmlLoader({ filePathOrUrl: loaderData });
+ return new XmlLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'text/x-markdown':
case 'text/markdown': {
@@ -82,7 +84,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
throw new Error('Package `@llm-tools/embedjs-loader-markdown` needs to be installed to load markdown files');
});
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported MarkdownLoader');
- return new MarkdownLoader({ filePathOrUrl: loaderData });
+ return new MarkdownLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
}
case 'image/png':
case 'image/jpeg': {