Patch summary (tools such as `git apply` skip text that precedes the first "diff --git" header):
  * src/core/rag-embedding.js: RAGEmbedding.init() no longer checks `this.singleton` first, so every
    call runs embeddingModel.init() again and replaces the stored singleton.
  * src/loaders/local-path-loader.{js,d.ts}: the constructor accepts optional chunkSize / chunkOverlap,
    passes `chunkSize ?? 1000` and `chunkOverlap ?? 0` to super(), and forwards this.chunkSize /
    this.chunkOverlap to createLoaderFromMimeType. A `.txt` -> 'text/plain' extension mapping is added
    next to the existing md/mdx mapping, and the debug message now reports the mime actually chosen.
  * src/util/mime.{js,d.ts}: createLoaderFromMimeType takes chunkSize / chunkOverlap and hands them to
    every loader it constructs. For 'text/plain' input that is not CSV it now reads `loaderData` as a
    file path (asynchronously) and gives the file content to TextLoader.
NOTE(review): context-line indentation was reconstructed from whitespace-collapsed text; verify it
  against the target package before applying (or apply with whitespace-insensitive matching).
NOTE(review): the `index` hashes below were not regenerated after the edits to added lines.
NOTE(review): `Promise<BaseLoader>` in mime.d.ts is presumed to be the original return type (the type
  argument was missing from the text this patch was recovered from) - confirm against upstream.
NOTE(review): this.chunkSize / this.chunkOverlap are presumably set by the BaseLoader constructor from
  the extra super() arguments - confirm against @llm-tools/embedjs-interfaces.

diff --git a/src/core/rag-embedding.js b/src/core/rag-embedding.js
index 50c3c4064af17bc4c7c46554d8f2419b3afceb0e..632c9b2e04d2e0e3bb09ef1cd8f29d2560e6afc1 100644
--- a/src/core/rag-embedding.js
+++ b/src/core/rag-embedding.js
@@ -1,10 +1,8 @@
 export class RAGEmbedding {
     static singleton;
     static async init(embeddingModel) {
-        if (!this.singleton) {
-            await embeddingModel.init();
-            this.singleton = new RAGEmbedding(embeddingModel);
-        }
+        await embeddingModel.init();
+        this.singleton = new RAGEmbedding(embeddingModel);
     }
     static getInstance() {
         return RAGEmbedding.singleton;
diff --git a/src/loaders/local-path-loader.d.ts b/src/loaders/local-path-loader.d.ts
index 48c20e68c469cd309be2dc8f28e44c1bd04a26e9..87002be39e7305a02e2a607b0c0d95cbbc359f9d 100644
--- a/src/loaders/local-path-loader.d.ts
+++ b/src/loaders/local-path-loader.d.ts
@@ -1,19 +1,29 @@
-import { BaseLoader } from '@llm-tools/embedjs-interfaces';
+import { BaseLoader } from "@llm-tools/embedjs-interfaces";
 export declare class LocalPathLoader extends BaseLoader<{
-    type: 'LocalPathLoader';
+  type: "LocalPathLoader";
 }> {
-    private readonly debug;
-    private readonly path;
-    constructor({ path }: {
-        path: string;
-    });
-    getUnfilteredChunks(): AsyncGenerator<{
-        metadata: {
-            type: "LocalPathLoader";
-            originalPath: string;
-            source: string;
-        };
-        pageContent: string;
-    }, void, unknown>;
-    private recursivelyAddPath;
+  private readonly debug;
+  private readonly path;
+  constructor({
+    path,
+    chunkSize,
+    chunkOverlap,
+  }: {
+    path: string;
+    chunkSize?: number;
+    chunkOverlap?: number;
+  });
+  getUnfilteredChunks(): AsyncGenerator<
+    {
+      metadata: {
+        type: "LocalPathLoader";
+        originalPath: string;
+        source: string;
+      };
+      pageContent: string;
+    },
+    void,
+    unknown
+  >;
+  private recursivelyAddPath;
 }
diff --git a/src/loaders/local-path-loader.js b/src/loaders/local-path-loader.js
index 4cf8a6bd1d890244c8ec49d4a05ee3bd58861c79..fd0fe1951c73da315b0c9bf4a8f33effbadb9f8f 100644
--- a/src/loaders/local-path-loader.js
+++ b/src/loaders/local-path-loader.js
@@ -8,8 +8,8 @@ import { BaseLoader } from '@llm-tools/embedjs-interfaces';
 export class LocalPathLoader extends BaseLoader {
     debug = createDebugMessages('embedjs:loader:LocalPathLoader');
     path;
-    constructor({ path }) {
-        super(`LocalPathLoader_${md5(path)}`, { path });
+    constructor({ path, chunkSize, chunkOverlap }) {
+        super(`LocalPathLoader_${md5(path)}`, { path }, chunkSize ?? 1000, chunkOverlap ?? 0);
         this.path = path;
     }
     async *getUnfilteredChunks() {
@@ -36,10 +36,12 @@ export class LocalPathLoader extends BaseLoader {
                 const extension = currentPath.split('.').pop().toLowerCase();
                 if (extension === 'md' || extension === 'mdx')
                     mime = 'text/markdown';
+                if (extension === 'txt')
+                    mime = 'text/plain';
-                this.debug(`File '${this.path}' mime type updated to 'text/markdown'`);
+                this.debug(`File '${this.path}' mime type updated to '${mime}'`);
             }
             try {
-                const loader = await createLoaderFromMimeType(currentPath, mime);
+                const loader = await createLoaderFromMimeType(currentPath, mime, this.chunkSize, this.chunkOverlap);
                 for await (const result of await loader.getUnfilteredChunks()) {
                     yield {
                         pageContent: result.pageContent,
diff --git a/src/util/mime.d.ts b/src/util/mime.d.ts
index 57f56a1b8edc98366af9f84d671676c41c2f01ca..f53856fa9c78afbeee9e085c7ed0b3a131f8ee5a 100644
--- a/src/util/mime.d.ts
+++ b/src/util/mime.d.ts
@@ -1,2 +1,7 @@
-import { BaseLoader } from '@llm-tools/embedjs-interfaces';
-export declare function createLoaderFromMimeType(loaderData: string, mimeType: string): Promise<BaseLoader>;
+import { BaseLoader } from "@llm-tools/embedjs-interfaces";
+export declare function createLoaderFromMimeType(
+  loaderData: string,
+  mimeType: string,
+  chunkSize?: number,
+  chunkOverlap?: number
+): Promise<BaseLoader>;
diff --git a/src/util/mime.js b/src/util/mime.js
index 9af30bd5b8cf42985f547073a4c19756292c33a3..54ae20343131a533ab70236d3060b6accc8f6126 100644
--- a/src/util/mime.js
+++ b/src/util/mime.js
@@ -1,7 +1,9 @@
 import mime from 'mime';
 import createDebugMessages from 'debug';
 import { TextLoader } from '../loaders/text-loader.js';
-export async function createLoaderFromMimeType(loaderData, mimeType) {
+import fs from 'node:fs';
+
+export async function createLoaderFromMimeType(loaderData, mimeType, chunkSize, chunkOverlap) {
     createDebugMessages('embedjs:util:createLoaderFromMimeType')(`Incoming mime type '${mimeType}'`);
     switch (mimeType) {
         case 'application/msword':
@@ -10,7 +12,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
                 throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load docx files');
             });
             createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported DocxLoader');
-            return new DocxLoader({ filePathOrUrl: loaderData });
+            return new DocxLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
         }
         case 'application/vnd.ms-excel':
         case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': {
@@ -18,21 +20,21 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
                 throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load excel files');
             });
             createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported ExcelLoader');
-            return new ExcelLoader({ filePathOrUrl: loaderData });
+            return new ExcelLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
         }
         case 'application/pdf': {
             const { PdfLoader } = await import('@llm-tools/embedjs-loader-pdf').catch(() => {
                 throw new Error('Package `@llm-tools/embedjs-loader-pdf` needs to be installed to load PDF files');
             });
             createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PdfLoader');
-            return new PdfLoader({ filePathOrUrl: loaderData });
+            return new PdfLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
         }
         case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': {
             const { PptLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => {
                 throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load pptx files');
             });
             createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PptLoader');
-            return new PptLoader({ filePathOrUrl: loaderData });
+            return new PptLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
         }
         case 'text/plain': {
             const fineType = mime.getType(loaderData);
@@ -42,24 +44,26 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
                     throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
                 });
                 createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
-                return new CsvLoader({ filePathOrUrl: loaderData });
+                return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
+            }
+            else {
+                const content = await fs.promises.readFile(loaderData, 'utf-8');
+                return new TextLoader({ text: content, chunkSize, chunkOverlap });
             }
-            else
-                return new TextLoader({ text: loaderData });
         }
         case 'application/csv': {
             const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => {
                 throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
             });
             createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
-            return new CsvLoader({ filePathOrUrl: loaderData });
+            return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
         }
         case 'text/html': {
             const { WebLoader } = await import('@llm-tools/embedjs-loader-web').catch(() => {
                 throw new Error('Package `@llm-tools/embedjs-loader-web` needs to be installed to load web documents');
             });
             createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported WebLoader');
-            return new WebLoader({ urlOrContent: loaderData });
+            return new WebLoader({ urlOrContent: loaderData, chunkSize, chunkOverlap });
         }
         case 'text/xml': {
             const { SitemapLoader } = await import('@llm-tools/embedjs-loader-sitemap').catch(() => {
@@ -67,14 +71,14 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
             });
             createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported SitemapLoader');
             if (await SitemapLoader.test(loaderData)) {
-                return new SitemapLoader({ url: loaderData });
+                return new SitemapLoader({ url: loaderData, chunkSize, chunkOverlap });
             }
             //This is not a Sitemap but is still XML
             const { XmlLoader } = await import('@llm-tools/embedjs-loader-xml').catch(() => {
                 throw new Error('Package `@llm-tools/embedjs-loader-xml` needs to be installed to load XML documents');
             });
             createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported XmlLoader');
-            return new XmlLoader({ filePathOrUrl: loaderData });
+            return new XmlLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
         }
         case 'text/x-markdown':
         case 'text/markdown': {
@@ -82,7 +86,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
                 throw new Error('Package `@llm-tools/embedjs-loader-markdown` needs to be installed to load markdown files');
             });
             createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported MarkdownLoader');
-            return new MarkdownLoader({ filePathOrUrl: loaderData });
+            return new MarkdownLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
         }
         case undefined:
             throw new Error(`MIME type could not be detected. Please file an issue if you think this is a bug.`);