diff --git a/src/loaders/local-path-loader.d.ts b/src/loaders/local-path-loader.d.ts index 48c20e68c469cd309be2dc8f28e44c1bd04a26e9..1c16d83bcbf9b7140292793d6cbb8c04281949d9 100644 --- a/src/loaders/local-path-loader.d.ts +++ b/src/loaders/local-path-loader.d.ts @@ -4,8 +4,10 @@ export declare class LocalPathLoader extends BaseLoader<{ }> { private readonly debug; private readonly path; - constructor({ path }: { + constructor({ path, chunkSize, chunkOverlap }: { path: string; + chunkSize?: number; + chunkOverlap?: number; }); getUnfilteredChunks(): AsyncGenerator<{ metadata: { diff --git a/src/loaders/local-path-loader.js b/src/loaders/local-path-loader.js index 4cf8a6bd1d890244c8ec49d4a05ee3bd58861c79..ec8215b01195a21ef20f3c5d56ecc99f186bb596 100644 --- a/src/loaders/local-path-loader.js +++ b/src/loaders/local-path-loader.js @@ -8,8 +8,8 @@ import { BaseLoader } from '@llm-tools/embedjs-interfaces'; export class LocalPathLoader extends BaseLoader { debug = createDebugMessages('embedjs:loader:LocalPathLoader'); path; - constructor({ path }) { - super(`LocalPathLoader_${md5(path)}`, { path }); + constructor({ path, chunkSize, chunkOverlap }) { + super(`LocalPathLoader_${md5(path)}`, { path }, chunkSize ?? 1000, chunkOverlap ?? 0); this.path = path; } async *getUnfilteredChunks() { @@ -36,10 +36,12 @@ export class LocalPathLoader extends BaseLoader { const extension = currentPath.split('.').pop().toLowerCase(); if (extension === 'md' || extension === 'mdx') mime = 'text/markdown'; + if (extension === 'txt') + mime = 'text/plain'; this.debug(`File '${this.path}' mime type updated to 'text/markdown'`); } try { - const loader = await createLoaderFromMimeType(currentPath, mime); + const loader = await createLoaderFromMimeType(currentPath, mime, this.chunkSize, this.chunkOverlap); for await (const result of await loader.getUnfilteredChunks()) { yield { pageContent: result.pageContent, diff --git a/src/util/mime.d.ts b/src/util/mime.d.ts index 57f56a1b8edc98366af9f84d671676c41c2f01ca..14be3b5727cff6eb1978838045e9a788f8f53bfb 100644 --- a/src/util/mime.d.ts +++ b/src/util/mime.d.ts @@ -1,2 +1,2 @@ import { BaseLoader } from '@llm-tools/embedjs-interfaces'; -export declare function createLoaderFromMimeType(loaderData: string, mimeType: string): Promise; +export declare function createLoaderFromMimeType(loaderData: string, mimeType: string, chunkSize?: number, chunkOverlap?: number): Promise; diff --git a/src/util/mime.js b/src/util/mime.js index b6426a859968e2bf6206795f70333e90ae27aeb7..16ae2adb863f8d7abfa757f1c5cc39f6bb1c44fa 100644 --- a/src/util/mime.js +++ b/src/util/mime.js @@ -1,7 +1,9 @@ import mime from 'mime'; import createDebugMessages from 'debug'; import { TextLoader } from '../loaders/text-loader.js'; -export async function createLoaderFromMimeType(loaderData, mimeType) { +import fs from 'node:fs' + +export async function createLoaderFromMimeType(loaderData, mimeType, chunkSize, chunkOverlap) { createDebugMessages('embedjs:util:createLoaderFromMimeType')(`Incoming mime type '${mimeType}'`); switch (mimeType) { case 'application/msword': @@ -10,7 +12,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load docx files'); }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported DocxLoader'); - return new DocxLoader({ filePathOrUrl: loaderData }); + return new DocxLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); } case 'application/vnd.ms-excel': case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': { @@ -18,21 +20,21 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load excel files'); }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported ExcelLoader'); - return new ExcelLoader({ filePathOrUrl: loaderData }); + return new ExcelLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); } case 'application/pdf': { const { PdfLoader } = await import('@llm-tools/embedjs-loader-pdf').catch(() => { throw new Error('Package `@llm-tools/embedjs-loader-pdf` needs to be installed to load PDF files'); }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PdfLoader'); - return new PdfLoader({ filePathOrUrl: loaderData }); + return new PdfLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); } case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': { const { PptLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => { throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load pptx files'); }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PptLoader'); - return new PptLoader({ filePathOrUrl: loaderData }); + return new PptLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); } case 'text/plain': { const fineType = mime.getType(loaderData); @@ -42,24 +44,24 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files'); }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader'); - return new CsvLoader({ filePathOrUrl: loaderData }); + return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); } - else - return new TextLoader({ text: loaderData }); + const content = fs.readFileSync(loaderData, 'utf-8'); + return new TextLoader({ text: content, chunkSize, chunkOverlap }); } case 'application/csv': { const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => { throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files'); }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader'); - return new CsvLoader({ filePathOrUrl: loaderData }); + return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); } case 'text/html': { const { WebLoader } = await import('@llm-tools/embedjs-loader-web').catch(() => { throw new Error('Package `@llm-tools/embedjs-loader-web` needs to be installed to load web documents'); }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported WebLoader'); - return new WebLoader({ urlOrContent: loaderData }); + return new WebLoader({ urlOrContent: loaderData, chunkSize, chunkOverlap }); } case 'text/xml': { const { SitemapLoader } = await import('@llm-tools/embedjs-loader-sitemap').catch(() => { @@ -67,14 +69,14 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported SitemapLoader'); if (await SitemapLoader.test(loaderData)) { - return new SitemapLoader({ url: loaderData }); + return new SitemapLoader({ url: loaderData, chunkSize, chunkOverlap }); } //This is not a Sitemap but is still XML const { XmlLoader } = await import('@llm-tools/embedjs-loader-xml').catch(() => { throw new Error('Package `@llm-tools/embedjs-loader-xml` needs to be installed to load XML documents'); }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported XmlLoader'); - return new XmlLoader({ filePathOrUrl: loaderData }); + return new XmlLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); } case 'text/x-markdown': case 'text/markdown': { @@ -82,7 +84,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) { throw new Error('Package `@llm-tools/embedjs-loader-markdown` needs to be installed to load markdown files'); }); createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported MarkdownLoader'); - return new MarkdownLoader({ filePathOrUrl: loaderData }); + return new MarkdownLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap }); } case 'image/png': case 'image/jpeg': {