159 lines
9.3 KiB
Diff
Vendored
159 lines
9.3 KiB
Diff
Vendored
diff --git a/src/loaders/local-path-loader.d.ts b/src/loaders/local-path-loader.d.ts
|
|
index 48c20e68c469cd309be2dc8f28e44c1bd04a26e9..1c16d83bcbf9b7140292793d6cbb8c04281949d9 100644
|
|
--- a/src/loaders/local-path-loader.d.ts
|
|
+++ b/src/loaders/local-path-loader.d.ts
|
|
@@ -4,8 +4,10 @@ export declare class LocalPathLoader extends BaseLoader<{
|
|
}> {
|
|
private readonly debug;
|
|
private readonly path;
|
|
- constructor({ path }: {
|
|
+ constructor({ path, chunkSize, chunkOverlap }: {
|
|
path: string;
|
|
+ chunkSize?: number;
|
|
+ chunkOverlap?: number;
|
|
});
|
|
getUnfilteredChunks(): AsyncGenerator<{
|
|
metadata: {
|
|
diff --git a/src/loaders/local-path-loader.js b/src/loaders/local-path-loader.js
|
|
index 4cf8a6bd1d890244c8ec49d4a05ee3bd58861c79..ec8215b01195a21ef20f3c5d56ecc99f186bb596 100644
|
|
--- a/src/loaders/local-path-loader.js
|
|
+++ b/src/loaders/local-path-loader.js
|
|
@@ -8,8 +8,8 @@ import { BaseLoader } from '@llm-tools/embedjs-interfaces';
|
|
export class LocalPathLoader extends BaseLoader {
|
|
debug = createDebugMessages('embedjs:loader:LocalPathLoader');
|
|
path;
|
|
- constructor({ path }) {
|
|
- super(`LocalPathLoader_${md5(path)}`, { path });
|
|
+ constructor({ path, chunkSize, chunkOverlap }) {
|
|
+ super(`LocalPathLoader_${md5(path)}`, { path }, chunkSize ?? 1000, chunkOverlap ?? 0);
|
|
this.path = path;
|
|
}
|
|
async *getUnfilteredChunks() {
|
|
@@ -36,10 +36,12 @@ export class LocalPathLoader extends BaseLoader {
|
|
const extension = currentPath.split('.').pop().toLowerCase();
|
|
if (extension === 'md' || extension === 'mdx')
|
|
mime = 'text/markdown';
|
|
+ if (extension === 'txt')
|
|
+ mime = 'text/plain';
|
|
this.debug(`File '${this.path}' mime type updated to 'text/markdown'`);
|
|
}
|
|
try {
|
|
- const loader = await createLoaderFromMimeType(currentPath, mime);
|
|
+ const loader = await createLoaderFromMimeType(currentPath, mime, this.chunkSize, this.chunkOverlap);
|
|
for await (const result of await loader.getUnfilteredChunks()) {
|
|
yield {
|
|
pageContent: result.pageContent,
|
|
diff --git a/src/util/mime.d.ts b/src/util/mime.d.ts
|
|
index 57f56a1b8edc98366af9f84d671676c41c2f01ca..14be3b5727cff6eb1978838045e9a788f8f53bfb 100644
|
|
--- a/src/util/mime.d.ts
|
|
+++ b/src/util/mime.d.ts
|
|
@@ -1,2 +1,2 @@
|
|
import { BaseLoader } from '@llm-tools/embedjs-interfaces';
|
|
-export declare function createLoaderFromMimeType(loaderData: string, mimeType: string): Promise<BaseLoader>;
|
|
+export declare function createLoaderFromMimeType(loaderData: string, mimeType: string, chunkSize?: number, chunkOverlap?: number): Promise<BaseLoader>;
|
|
diff --git a/src/util/mime.js b/src/util/mime.js
|
|
index b6426a859968e2bf6206795f70333e90ae27aeb7..16ae2adb863f8d7abfa757f1c5cc39f6bb1c44fa 100644
|
|
--- a/src/util/mime.js
|
|
+++ b/src/util/mime.js
|
|
@@ -1,7 +1,9 @@
|
|
import mime from 'mime';
|
|
import createDebugMessages from 'debug';
|
|
import { TextLoader } from '../loaders/text-loader.js';
|
|
-export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
+import fs from 'node:fs'
|
|
+
|
|
+export async function createLoaderFromMimeType(loaderData, mimeType, chunkSize, chunkOverlap) {
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')(`Incoming mime type '${mimeType}'`);
|
|
switch (mimeType) {
|
|
case 'application/msword':
|
|
@@ -10,7 +12,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load docx files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported DocxLoader');
|
|
- return new DocxLoader({ filePathOrUrl: loaderData });
|
|
+ return new DocxLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'application/vnd.ms-excel':
|
|
case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': {
|
|
@@ -18,21 +20,21 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load excel files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported ExcelLoader');
|
|
- return new ExcelLoader({ filePathOrUrl: loaderData });
|
|
+ return new ExcelLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'application/pdf': {
|
|
const { PdfLoader } = await import('@llm-tools/embedjs-loader-pdf').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-pdf` needs to be installed to load PDF files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PdfLoader');
|
|
- return new PdfLoader({ filePathOrUrl: loaderData });
|
|
+ return new PdfLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': {
|
|
const { PptLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load pptx files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PptLoader');
|
|
- return new PptLoader({ filePathOrUrl: loaderData });
|
|
+ return new PptLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'text/plain': {
|
|
const fineType = mime.getType(loaderData);
|
|
@@ -42,24 +44,24 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
|
|
- return new CsvLoader({ filePathOrUrl: loaderData });
|
|
+ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
- else
|
|
- return new TextLoader({ text: loaderData });
|
|
+ const content = fs.readFileSync(loaderData, 'utf-8');
|
|
+ return new TextLoader({ text: content, chunkSize, chunkOverlap });
|
|
}
|
|
case 'application/csv': {
|
|
const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
|
|
- return new CsvLoader({ filePathOrUrl: loaderData });
|
|
+ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'text/html': {
|
|
const { WebLoader } = await import('@llm-tools/embedjs-loader-web').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-web` needs to be installed to load web documents');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported WebLoader');
|
|
- return new WebLoader({ urlOrContent: loaderData });
|
|
+ return new WebLoader({ urlOrContent: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'text/xml': {
|
|
const { SitemapLoader } = await import('@llm-tools/embedjs-loader-sitemap').catch(() => {
|
|
@@ -67,14 +69,14 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported SitemapLoader');
|
|
if (await SitemapLoader.test(loaderData)) {
|
|
- return new SitemapLoader({ url: loaderData });
|
|
+ return new SitemapLoader({ url: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
//This is not a Sitemap but is still XML
|
|
const { XmlLoader } = await import('@llm-tools/embedjs-loader-xml').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-xml` needs to be installed to load XML documents');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported XmlLoader');
|
|
- return new XmlLoader({ filePathOrUrl: loaderData });
|
|
+ return new XmlLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'text/x-markdown':
|
|
case 'text/markdown': {
|
|
@@ -82,7 +84,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-markdown` needs to be installed to load markdown files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported MarkdownLoader');
|
|
- return new MarkdownLoader({ filePathOrUrl: loaderData });
|
|
+ return new MarkdownLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'image/png':
|
|
case 'image/jpeg': {
|