218 lines
11 KiB
Diff
218 lines
11 KiB
Diff
diff --git a/src/core/rag-embedding.js b/src/core/rag-embedding.js
|
|
index 50c3c4064af17bc4c7c46554d8f2419b3afceb0e..632c9b2e04d2e0e3bb09ef1cd8f29d2560e6afc1 100644
|
|
--- a/src/core/rag-embedding.js
|
|
+++ b/src/core/rag-embedding.js
|
|
@@ -1,10 +1,8 @@
|
|
export class RAGEmbedding {
|
|
static singleton;
|
|
static async init(embeddingModel) {
|
|
- if (!this.singleton) {
|
|
- await embeddingModel.init();
|
|
- this.singleton = new RAGEmbedding(embeddingModel);
|
|
- }
|
|
+ await embeddingModel.init();
|
|
+ this.singleton = new RAGEmbedding(embeddingModel);
|
|
}
|
|
static getInstance() {
|
|
return RAGEmbedding.singleton;
|
|
diff --git a/src/loaders/local-path-loader.d.ts b/src/loaders/local-path-loader.d.ts
|
|
index 48c20e68c469cd309be2dc8f28e44c1bd04a26e9..87002be39e7305a02e2a607b0c0d95cbbc359f9d 100644
|
|
--- a/src/loaders/local-path-loader.d.ts
|
|
+++ b/src/loaders/local-path-loader.d.ts
|
|
@@ -1,19 +1,29 @@
|
|
-import { BaseLoader } from '@llm-tools/embedjs-interfaces';
|
|
+import { BaseLoader } from "@llm-tools/embedjs-interfaces";
|
|
export declare class LocalPathLoader extends BaseLoader<{
|
|
- type: 'LocalPathLoader';
|
|
+ type: "LocalPathLoader";
|
|
}> {
|
|
- private readonly debug;
|
|
- private readonly path;
|
|
- constructor({ path }: {
|
|
- path: string;
|
|
- });
|
|
- getUnfilteredChunks(): AsyncGenerator<{
|
|
- metadata: {
|
|
- type: "LocalPathLoader";
|
|
- originalPath: string;
|
|
- source: string;
|
|
- };
|
|
- pageContent: string;
|
|
- }, void, unknown>;
|
|
- private recursivelyAddPath;
|
|
+ private readonly debug;
|
|
+ private readonly path;
|
|
+ constructor({
|
|
+ path,
|
|
+ chunkSize,
|
|
+ chunkOverlap,
|
|
+ }: {
|
|
+ path: string;
|
|
+ chunkSize?: number;
|
|
+ chunkOverlap?: number;
|
|
+ });
|
|
+ getUnfilteredChunks(): AsyncGenerator<
|
|
+ {
|
|
+ metadata: {
|
|
+ type: "LocalPathLoader";
|
|
+ originalPath: string;
|
|
+ source: string;
|
|
+ };
|
|
+ pageContent: string;
|
|
+ },
|
|
+ void,
|
|
+ unknown
|
|
+ >;
|
|
+ private recursivelyAddPath;
|
|
}
|
|
diff --git a/src/loaders/local-path-loader.js b/src/loaders/local-path-loader.js
|
|
index 4cf8a6bd1d890244c8ec49d4a05ee3bd58861c79..fd0fe1951c73da315b0c9bf4a8f33effbadb9f8f 100644
|
|
--- a/src/loaders/local-path-loader.js
|
|
+++ b/src/loaders/local-path-loader.js
|
|
@@ -8,8 +8,8 @@ import { BaseLoader } from '@llm-tools/embedjs-interfaces';
|
|
export class LocalPathLoader extends BaseLoader {
|
|
debug = createDebugMessages('embedjs:loader:LocalPathLoader');
|
|
path;
|
|
- constructor({ path }) {
|
|
- super(`LocalPathLoader_${md5(path)}`, { path });
|
|
+ constructor({ path, chunkSize, chunkOverlap}) {
|
|
+ super(`LocalPathLoader_${md5(path)}`, { path }, chunkSize ?? 1000, chunkOverlap ?? 0);
|
|
this.path = path;
|
|
}
|
|
async *getUnfilteredChunks() {
|
|
@@ -36,10 +36,12 @@ export class LocalPathLoader extends BaseLoader {
|
|
const extension = currentPath.split('.').pop().toLowerCase();
|
|
if (extension === 'md' || extension === 'mdx')
|
|
mime = 'text/markdown';
|
|
+ if (extension === 'txt')
|
|
+ mime = 'text/plain';
|
|
this.debug(`File '${this.path}' mime type updated to 'text/markdown'`);
|
|
}
|
|
try {
|
|
- const loader = await createLoaderFromMimeType(currentPath, mime);
|
|
+ const loader = await createLoaderFromMimeType(currentPath, mime, this.chunkSize, this.chunkOverlap);
|
|
for await (const result of await loader.getUnfilteredChunks()) {
|
|
yield {
|
|
pageContent: result.pageContent,
|
|
diff --git a/src/util/mime.d.ts b/src/util/mime.d.ts
|
|
index 57f56a1b8edc98366af9f84d671676c41c2f01ca..f53856fa9c78afbeee9e085c7ed0b3a131f8ee5a 100644
|
|
--- a/src/util/mime.d.ts
|
|
+++ b/src/util/mime.d.ts
|
|
@@ -1,2 +1,7 @@
|
|
-import { BaseLoader } from '@llm-tools/embedjs-interfaces';
|
|
-export declare function createLoaderFromMimeType(loaderData: string, mimeType: string): Promise<BaseLoader>;
|
|
+import { BaseLoader } from "@llm-tools/embedjs-interfaces";
|
|
+export declare function createLoaderFromMimeType(
|
|
+ loaderData: string,
|
|
+ mimeType: string,
|
|
+ chunkSize?: number,
|
|
+ chunkOverlap?: number
|
|
+): Promise<BaseLoader>;
|
|
diff --git a/src/util/mime.js b/src/util/mime.js
|
|
index 9af30bd5b8cf42985f547073a4c19756292c33a3..54ae20343131a533ab70236d3060b6accc8f6126 100644
|
|
--- a/src/util/mime.js
|
|
+++ b/src/util/mime.js
|
|
@@ -1,7 +1,9 @@
|
|
import mime from 'mime';
|
|
import createDebugMessages from 'debug';
|
|
import { TextLoader } from '../loaders/text-loader.js';
|
|
-export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
+import fs from 'node:fs';
|
|
+
|
|
+export async function createLoaderFromMimeType(loaderData, mimeType, chunkSize, chunkOverlap) {
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')(`Incoming mime type '${mimeType}'`);
|
|
switch (mimeType) {
|
|
case 'application/msword':
|
|
@@ -10,7 +12,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load docx files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported DocxLoader');
|
|
- return new DocxLoader({ filePathOrUrl: loaderData });
|
|
+ return new DocxLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'application/vnd.ms-excel':
|
|
case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': {
|
|
@@ -18,21 +20,21 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load excel files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported ExcelLoader');
|
|
- return new ExcelLoader({ filePathOrUrl: loaderData });
|
|
+ return new ExcelLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'application/pdf': {
|
|
const { PdfLoader } = await import('@llm-tools/embedjs-loader-pdf').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-pdf` needs to be installed to load PDF files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PdfLoader');
|
|
- return new PdfLoader({ filePathOrUrl: loaderData });
|
|
+ return new PdfLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': {
|
|
const { PptLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load pptx files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported PptLoader');
|
|
- return new PptLoader({ filePathOrUrl: loaderData });
|
|
+ return new PptLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'text/plain': {
|
|
const fineType = mime.getType(loaderData);
|
|
@@ -42,24 +44,26 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
|
|
- return new CsvLoader({ filePathOrUrl: loaderData });
|
|
+ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
+ }
|
|
+ else{
|
|
+ const content = fs.readFileSync(loaderData, 'utf-8');
|
|
+ return new TextLoader({ text: content, chunkSize, chunkOverlap });
|
|
}
|
|
- else
|
|
- return new TextLoader({ text: loaderData });
|
|
}
|
|
case 'application/csv': {
|
|
const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load CSV files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported CsvLoader');
|
|
- return new CsvLoader({ filePathOrUrl: loaderData });
|
|
+ return new CsvLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'text/html': {
|
|
const { WebLoader } = await import('@llm-tools/embedjs-loader-web').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-web` needs to be installed to load web documents');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported WebLoader');
|
|
- return new WebLoader({ urlOrContent: loaderData });
|
|
+ return new WebLoader({ urlOrContent: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'text/xml': {
|
|
const { SitemapLoader } = await import('@llm-tools/embedjs-loader-sitemap').catch(() => {
|
|
@@ -67,14 +71,14 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported SitemapLoader');
|
|
if (await SitemapLoader.test(loaderData)) {
|
|
- return new SitemapLoader({ url: loaderData });
|
|
+ return new SitemapLoader({ url: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
//This is not a Sitemap but is still XML
|
|
const { XmlLoader } = await import('@llm-tools/embedjs-loader-xml').catch(() => {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-xml` needs to be installed to load XML documents');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported XmlLoader');
|
|
- return new XmlLoader({ filePathOrUrl: loaderData });
|
|
+ return new XmlLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case 'text/x-markdown':
|
|
case 'text/markdown': {
|
|
@@ -82,7 +86,7 @@ export async function createLoaderFromMimeType(loaderData, mimeType) {
|
|
throw new Error('Package `@llm-tools/embedjs-loader-markdown` needs to be installed to load markdown files');
|
|
});
|
|
createDebugMessages('embedjs:util:createLoaderFromMimeType')('Dynamically imported MarkdownLoader');
|
|
- return new MarkdownLoader({ filePathOrUrl: loaderData });
|
|
+ return new MarkdownLoader({ filePathOrUrl: loaderData, chunkSize, chunkOverlap });
|
|
}
|
|
case undefined:
|
|
throw new Error(`MIME type could not be detected. Please file an issue if you think this is a bug.`);
|