diff --git a/src/main/loader/index.ts b/src/main/loader/index.ts index 2c6afa74..5c289632 100644 --- a/src/main/loader/index.ts +++ b/src/main/loader/index.ts @@ -11,8 +11,30 @@ import { DraftsExportLoader } from './draftsExportLoader' import { EpubLoader } from './epubLoader' import { OdLoader, OdType } from './odLoader' -// embedjs内置loader类型 -const commonExts = ['.pdf', '.csv', '.docx', '.pptx', '.xlsx', '.md'] +// 文件扩展名到加载器类型的映射 +const FILE_LOADER_MAP: Record = { + // 内置类型 + '.pdf': 'common', + '.csv': 'common', + '.docx': 'common', + '.pptx': 'common', + '.xlsx': 'common', + '.md': 'common', + // OD类型 + '.odt': 'od', + '.ods': 'od', + '.odp': 'od', + // epub类型 + '.epub': 'epub', + // Drafts类型 + '.draftsexport': 'drafts', + // HTML类型 + '.html': 'html', + '.htm': 'html', + // JSON类型 + '.json': 'json' + // 其他类型默认为文本类型 +} export async function addOdLoader( ragApplication: RAGApplication, @@ -46,110 +68,87 @@ export async function addFileLoader( base: KnowledgeBaseParams, forceReload: boolean ): Promise { - // 内置类型 - if (commonExts.includes(file.ext)) { - const loaderReturn = await ragApplication.addLoader( - // @ts-ignore LocalPathLoader - new LocalPathLoader({ path: file.path, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any, - forceReload - ) - return { - entriesAdded: loaderReturn.entriesAdded, - uniqueId: loaderReturn.uniqueId, - uniqueIds: [loaderReturn.uniqueId], - loaderType: loaderReturn.loaderType - } as LoaderReturn - } + // 获取文件类型,如果没有匹配则默认为文本类型 + const loaderType = FILE_LOADER_MAP[file.ext.toLowerCase()] || 'text' + let loaderReturn: AddLoaderReturn - // 自定义类型 - if (['.odt', '.ods', '.odp'].includes(file.ext)) { - const loaderReturn = await addOdLoader(ragApplication, file, base, forceReload) - return { - entriesAdded: loaderReturn.entriesAdded, - uniqueId: loaderReturn.uniqueId, - uniqueIds: [loaderReturn.uniqueId], - loaderType: loaderReturn.loaderType - } as LoaderReturn - } + // JSON类型处理 + let jsonObject = {} + let jsonParsed = true + Logger.info(`[KnowledgeBase] processing file ${file.path} as ${loaderType} type`) + switch (loaderType) { + case 'common': + // 内置类型处理 + loaderReturn = await ragApplication.addLoader( + new LocalPathLoader({ + path: file.path, + chunkSize: base.chunkSize, + chunkOverlap: base.chunkOverlap + }) as any, + forceReload + ) + break - // epub 文件处理 - if (file.ext === '.epub') { - const loaderReturn = await ragApplication.addLoader( - new EpubLoader({ - filePath: file.path, - chunkSize: base.chunkSize ?? 1000, - chunkOverlap: base.chunkOverlap ?? 200 - }) as any, - forceReload - ) - return { - entriesAdded: loaderReturn.entriesAdded, - uniqueId: loaderReturn.uniqueId, - uniqueIds: [loaderReturn.uniqueId], - loaderType: loaderReturn.loaderType - } as LoaderReturn - } + case 'od': + // OD类型处理 + loaderReturn = await addOdLoader(ragApplication, file, base, forceReload) + break + case 'epub': + // epub类型处理 + loaderReturn = await ragApplication.addLoader( + new EpubLoader({ + filePath: file.path, + chunkSize: base.chunkSize ?? 1000, + chunkOverlap: base.chunkOverlap ?? 200 + }) as any, + forceReload + ) + break - // DraftsExport类型 (file.ext会自动转换成小写) - if (['.draftsexport'].includes(file.ext)) { - const loaderReturn = await ragApplication.addLoader(new DraftsExportLoader(file.path) as any, forceReload) - return { - entriesAdded: loaderReturn.entriesAdded, - uniqueId: loaderReturn.uniqueId, - uniqueIds: [loaderReturn.uniqueId], - loaderType: loaderReturn.loaderType - } - } + case 'drafts': + // Drafts类型处理 + loaderReturn = await ragApplication.addLoader(new DraftsExportLoader(file.path) as any, forceReload) + break - const fileContent = fs.readFileSync(file.path, 'utf-8') + case 'html': + // HTML类型处理 + loaderReturn = await ragApplication.addLoader( + new WebLoader({ + urlOrContent: fs.readFileSync(file.path, 'utf-8'), + chunkSize: base.chunkSize, + chunkOverlap: base.chunkOverlap + }) as any, + forceReload + ) + break - // HTML类型 - if (['.html', '.htm'].includes(file.ext)) { - const loaderReturn = await ragApplication.addLoader( - new WebLoader({ - urlOrContent: fileContent, - chunkSize: base.chunkSize, - chunkOverlap: base.chunkOverlap - }) as any, - forceReload - ) - return { - entriesAdded: loaderReturn.entriesAdded, - uniqueId: loaderReturn.uniqueId, - uniqueIds: [loaderReturn.uniqueId], - loaderType: loaderReturn.loaderType - } - } - - // JSON类型 - if (['.json'].includes(file.ext)) { - let jsonObject = {} - let jsonParsed = true - try { - jsonObject = JSON.parse(fileContent) - } catch (error) { - jsonParsed = false - Logger.warn('[KnowledgeBase] failed parsing json file, failling back to text processing:', file.path, error) - } - if (jsonParsed) { - const loaderReturn = await ragApplication.addLoader(new JsonLoader({ object: jsonObject })) - return { - entriesAdded: loaderReturn.entriesAdded, - uniqueId: loaderReturn.uniqueId, - uniqueIds: [loaderReturn.uniqueId], - loaderType: loaderReturn.loaderType + case 'json': + try { + jsonObject = JSON.parse(fs.readFileSync(file.path, 'utf-8')) + } catch (error) { + jsonParsed = false + Logger.warn('[KnowledgeBase] failed parsing json file, falling back to text processing:', file.path, error) } - } + + if (jsonParsed) { + loaderReturn = await ragApplication.addLoader(new JsonLoader({ object: jsonObject }), forceReload) + break + } + // fallthrough - JSON 解析失败时作为文本处理 + default: + // 文本类型处理(默认) + // 如果是其他文本类型且尚未读取文件,则读取文件 + loaderReturn = await ragApplication.addLoader( + new TextLoader({ + text: fs.readFileSync(file.path, 'utf-8'), + chunkSize: base.chunkSize, + chunkOverlap: base.chunkOverlap + }) as any, + forceReload + ) + break } - // 文本类型 - const loaderReturn = await ragApplication.addLoader( - new TextLoader({ text: fileContent, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any, - forceReload - ) - - Logger.info('[KnowledgeBase] processing file', file.path) - return { entriesAdded: loaderReturn.entriesAdded, uniqueId: loaderReturn.uniqueId,