feat: Improve file upload for the knowledge base (#2544)

* feat: Improve file upload for the knowledge base

* feat: Improve file upload for the knowledge base

* feat: Improve file upload for the knowledge base

---------

Co-authored-by: 亢奋猫 <kangfenmao@qq.com>
This commit is contained in:
icinggslits 2025-03-02 20:15:51 +08:00 committed by kangfenmao
parent 808b457503
commit 1b09bb47bf
2 changed files with 367 additions and 91 deletions

View File

@ -8,17 +8,77 @@ import { SitemapLoader } from '@llm-tools/embedjs-loader-sitemap'
import { WebLoader } from '@llm-tools/embedjs-loader-web' import { WebLoader } from '@llm-tools/embedjs-loader-web'
import { AzureOpenAiEmbeddings, OpenAiEmbeddings } from '@llm-tools/embedjs-openai' import { AzureOpenAiEmbeddings, OpenAiEmbeddings } from '@llm-tools/embedjs-openai'
import { addFileLoader } from '@main/loader' import { addFileLoader } from '@main/loader'
import { windowService } from '@main/services/WindowService'
import { getInstanceName } from '@main/utils' import { getInstanceName } from '@main/utils'
import { getAllFiles } from '@main/utils/file' import { getAllFiles } from '@main/utils/file'
import type { LoaderReturn } from '@shared/config/types' import type { LoaderReturn } from '@shared/config/types'
import { FileType, KnowledgeBaseParams, KnowledgeItem } from '@types' import { FileType, KnowledgeBaseParams, KnowledgeItem } from '@types'
import { app } from 'electron' import { app } from 'electron'
import Logger from 'electron-log'
import { v4 as uuidv4 } from 'uuid' import { v4 as uuidv4 } from 'uuid'
import { windowService } from './WindowService' export interface KnowledgeBaseAddItemOptions {
base: KnowledgeBaseParams
item: KnowledgeItem
forceReload?: boolean
}
interface KnowledgeBaseAddItemOptionsNonNullableAttribute {
base: KnowledgeBaseParams
item: KnowledgeItem
forceReload: boolean
}
interface EvaluateTaskWorkload {
workload: number
}
type LoaderDoneReturn = LoaderReturn | null
enum LoaderTaskItemState {
PENDING,
PROCESSING,
DONE
}
interface LoaderTaskItem {
state: LoaderTaskItemState
task: () => Promise<unknown>
evaluateTaskWorkload: EvaluateTaskWorkload
}
interface LoaderTask {
loaderTasks: LoaderTaskItem[]
loaderDoneReturn: LoaderDoneReturn
}
interface LoaderTaskOfSet {
loaderTasks: Set<LoaderTaskItem>
loaderDoneReturn: LoaderDoneReturn
}
interface QueueTaskItem {
taskPromise: () => Promise<unknown>
resolve: () => void
evaluateTaskWorkload: EvaluateTaskWorkload
}
const loaderTaskIntoOfSet = (loaderTask: LoaderTask): LoaderTaskOfSet => {
return {
loaderTasks: new Set(loaderTask.loaderTasks),
loaderDoneReturn: loaderTask.loaderDoneReturn
}
}
class KnowledgeService { class KnowledgeService {
private storageDir = path.join(app.getPath('userData'), 'Data', 'KnowledgeBase') private storageDir = path.join(app.getPath('userData'), 'Data', 'KnowledgeBase')
// Byte based
private workload = 0
private processingItemCount = 0
private knowledgeItemProcessingQueueMappingPromise: Map<LoaderTaskOfSet, () => void> = new Map()
private static MAXIMUM_WORKLOAD = 1024 * 1024 * 80
private static MAXIMUM_PROCESSING_ITEM_COUNT = 30
private static ERROR_LOADER_RETURN: LoaderReturn = { entriesAdded: 0, uniqueId: '', uniqueIds: [''], loaderType: '' }
constructor() { constructor() {
this.initStorageDir() this.initStorageDir()
@ -79,11 +139,52 @@ class KnowledgeService {
} }
} }
public add = async ( private maximumLoad() {
_: Electron.IpcMainInvokeEvent, return (
{ base, item, forceReload = false }: { base: KnowledgeBaseParams; item: KnowledgeItem; forceReload: boolean } this.processingItemCount >= KnowledgeService.MAXIMUM_PROCESSING_ITEM_COUNT ||
): Promise<LoaderReturn> => { this.workload >= KnowledgeService.MAXIMUM_WORKLOAD
const ragApplication = await this.getRagApplication(base) )
}
private fileTask(
ragApplication: RAGApplication,
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
): LoaderTask {
const { base, item, forceReload } = options
const file = item.content as FileType
const loaderTask: LoaderTask = {
loaderTasks: [
{
state: LoaderTaskItemState.PENDING,
task: () =>
addFileLoader(ragApplication, file, base, forceReload)
.then((result) => {
loaderTask.loaderDoneReturn = result
return result
})
.catch((err) => {
Logger.error(err)
return KnowledgeService.ERROR_LOADER_RETURN
}),
evaluateTaskWorkload: { workload: file.size }
}
],
loaderDoneReturn: null
}
return loaderTask
}
private directoryTask(
ragApplication: RAGApplication,
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
): LoaderTask {
const { base, item, forceReload } = options
const directory = item.content as string
const files = getAllFiles(directory)
const totalFiles = files.length
let processedFiles = 0
const sendDirectoryProcessingPercent = (totalFiles: number, processedFiles: number) => { const sendDirectoryProcessingPercent = (totalFiles: number, processedFiles: number) => {
const mainWindow = windowService.getMainWindow() const mainWindow = windowService.getMainWindow()
@ -93,86 +194,257 @@ class KnowledgeService {
}) })
} }
if (item.type === 'directory') { const loaderDoneReturn: LoaderDoneReturn = {
const directory = item.content as string entriesAdded: 0,
const files = getAllFiles(directory) uniqueId: `DirectoryLoader_${uuidv4()}`,
const totalFiles = files.length uniqueIds: [],
let processedFiles = 0 loaderType: 'DirectoryLoader'
}
const loaderPromises = files.map(async (file) => { const loaderTasks: LoaderTaskItem[] = []
const result = await addFileLoader(ragApplication, file, base, forceReload) for (const file of files) {
processedFiles++ loaderTasks.push({
state: LoaderTaskItemState.PENDING,
task: () =>
addFileLoader(ragApplication, file, base, forceReload)
.then((result) => {
loaderDoneReturn.entriesAdded += 1
processedFiles += 1
sendDirectoryProcessingPercent(totalFiles, processedFiles) sendDirectoryProcessingPercent(totalFiles, processedFiles)
loaderDoneReturn.uniqueIds.push(result.uniqueId)
return result return result
}) })
.catch((err) => {
const loaderResults = await Promise.allSettled(loaderPromises) Logger.error(err)
// @ts-ignore uniqueId return KnowledgeService.ERROR_LOADER_RETURN
const uniqueIds = loaderResults }),
.filter((result) => result.status === 'fulfilled') evaluateTaskWorkload: { workload: file.size }
.map((result) => result.value.uniqueId) })
return {
entriesAdded: loaderResults.length,
uniqueId: `DirectoryLoader_${uuidv4()}`,
uniqueIds,
loaderType: 'DirectoryLoader'
} as LoaderReturn
} }
if (item.type === 'url') { return {
loaderTasks,
loaderDoneReturn
}
}
private urlTask(
ragApplication: RAGApplication,
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
): LoaderTask {
const { base, item, forceReload } = options
const content = item.content as string const content = item.content as string
if (content.startsWith('http')) {
const loaderReturn = await ragApplication.addLoader( const loaderTask: LoaderTask = {
new WebLoader({ urlOrContent: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any, loaderTasks: [
{
state: LoaderTaskItemState.PENDING,
task: () => {
const loaderReturn = ragApplication.addLoader(
new WebLoader({
urlOrContent: content,
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}),
forceReload forceReload
) ) as Promise<LoaderReturn>
return {
entriesAdded: loaderReturn.entriesAdded, return loaderReturn
uniqueId: loaderReturn.uniqueId, .then((result) => {
uniqueIds: [loaderReturn.uniqueId], const { entriesAdded, uniqueId, loaderType } = result
loaderType: loaderReturn.loaderType loaderTask.loaderDoneReturn = {
} as LoaderReturn entriesAdded: entriesAdded,
uniqueId: uniqueId,
uniqueIds: [uniqueId],
loaderType: loaderType
} }
return result
})
.catch((err) => {
Logger.error(err)
return KnowledgeService.ERROR_LOADER_RETURN
})
},
evaluateTaskWorkload: { workload: 1024 * 1024 * 2 }
}
],
loaderDoneReturn: null
}
return loaderTask
} }
if (item.type === 'sitemap') { private sitemapTask(
ragApplication: RAGApplication,
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
): LoaderTask {
const { base, item, forceReload } = options
const content = item.content as string const content = item.content as string
// @ts-ignore loader type
const loaderReturn = await ragApplication.addLoader( const loaderTask: LoaderTask = {
loaderTasks: [
{
state: LoaderTaskItemState.PENDING,
task: () =>
ragApplication
.addLoader(
new SitemapLoader({ url: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any, new SitemapLoader({ url: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any,
forceReload forceReload
) )
return { .then((result) => {
entriesAdded: loaderReturn.entriesAdded, const { entriesAdded, uniqueId, loaderType } = result
uniqueId: loaderReturn.uniqueId, loaderTask.loaderDoneReturn = {
uniqueIds: [loaderReturn.uniqueId], entriesAdded: entriesAdded,
loaderType: loaderReturn.loaderType uniqueId: uniqueId,
} as LoaderReturn uniqueIds: [uniqueId],
loaderType: loaderType
}
return result
})
.catch((err) => {
Logger.error(err)
return KnowledgeService.ERROR_LOADER_RETURN
}),
evaluateTaskWorkload: { workload: 1024 * 1024 * 20 }
}
],
loaderDoneReturn: null
}
return loaderTask
} }
if (item.type === 'note') { private noteTask(
ragApplication: RAGApplication,
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
): LoaderTask {
const { base, item, forceReload } = options
const content = item.content as string const content = item.content as string
console.debug('chunkSize', base.chunkSize) console.debug('chunkSize', base.chunkSize)
const loaderReturn = await ragApplication.addLoader(
const encoder = new TextEncoder()
const contentBytes = encoder.encode(content)
const loaderTask: LoaderTask = {
loaderTasks: [
{
state: LoaderTaskItemState.PENDING,
task: () => {
const loaderReturn = ragApplication.addLoader(
new TextLoader({ text: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }), new TextLoader({ text: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }),
forceReload forceReload
) ) as Promise<LoaderReturn>
return {
entriesAdded: loaderReturn.entriesAdded, return loaderReturn
uniqueId: loaderReturn.uniqueId, .then(({ entriesAdded, uniqueId, loaderType }) => {
uniqueIds: [loaderReturn.uniqueId], loaderTask.loaderDoneReturn = {
loaderType: loaderReturn.loaderType entriesAdded: entriesAdded,
} as LoaderReturn uniqueId: uniqueId,
uniqueIds: [uniqueId],
loaderType: loaderType
}
})
.catch((err) => {
Logger.error(err)
return KnowledgeService.ERROR_LOADER_RETURN
})
},
evaluateTaskWorkload: { workload: contentBytes.length }
}
],
loaderDoneReturn: null
}
return loaderTask
} }
if (item.type === 'file') { private processingQueueHandle() {
const file = item.content as FileType const getSubtasksUntilMaximumLoad = (): QueueTaskItem[] => {
const queueTaskList: QueueTaskItem[] = []
return await addFileLoader(ragApplication, file, base, forceReload) that: for (const [task, resolve] of this.knowledgeItemProcessingQueueMappingPromise) {
for (const item of task.loaderTasks) {
if (this.maximumLoad()) {
break that
} }
return { entriesAdded: 0, uniqueId: '', uniqueIds: [''], loaderType: '' } const { state, task: taskPromise, evaluateTaskWorkload } = item
if (state !== LoaderTaskItemState.PENDING) {
continue
}
const { workload } = evaluateTaskWorkload
this.workload += workload
this.processingItemCount += 1
item.state = LoaderTaskItemState.PROCESSING
queueTaskList.push({
taskPromise: () =>
taskPromise().then(() => {
this.workload -= workload
this.processingItemCount -= 1
task.loaderTasks.delete(item)
if (task.loaderTasks.size === 0) {
this.knowledgeItemProcessingQueueMappingPromise.delete(task)
resolve()
}
this.processingQueueHandle()
}),
resolve: () => {},
evaluateTaskWorkload
})
}
}
return queueTaskList
}
const subTasks = getSubtasksUntilMaximumLoad()
if (subTasks.length > 0) {
const subTaskPromises = subTasks.map(({ taskPromise }) => taskPromise())
Promise.all(subTaskPromises).then(() => {
subTasks.forEach(({ resolve }) => resolve())
})
}
}
private appendProcessingQueue(task: LoaderTask): Promise<LoaderReturn> {
return new Promise((resolve) => {
this.knowledgeItemProcessingQueueMappingPromise.set(loaderTaskIntoOfSet(task), () => {
resolve(task.loaderDoneReturn!)
})
})
}
public add = (_: Electron.IpcMainInvokeEvent, options: KnowledgeBaseAddItemOptions): Promise<LoaderReturn> => {
return new Promise((resolve) => {
const { base, item, forceReload = false } = options
const optionsNonNullableAttribute = { base, item, forceReload }
this.getRagApplication(base)
.then((ragApplication) => {
const task = (() => {
switch (item.type) {
case 'file':
return this.fileTask(ragApplication, optionsNonNullableAttribute)
case 'directory':
return this.directoryTask(ragApplication, optionsNonNullableAttribute)
case 'url':
return this.urlTask(ragApplication, optionsNonNullableAttribute)
case 'sitemap':
return this.sitemapTask(ragApplication, optionsNonNullableAttribute)
case 'note':
return this.noteTask(ragApplication, optionsNonNullableAttribute)
default:
return null
}
})()
if (task) {
this.appendProcessingQueue(task).then(() => {
resolve(task.loaderDoneReturn!)
})
this.processingQueueHandle()
} else {
resolve(KnowledgeService.ERROR_LOADER_RETURN)
}
})
.catch((err) => {
Logger.error(err)
resolve(KnowledgeService.ERROR_LOADER_RETURN)
})
})
} }
public remove = async ( public remove = async (

View File

@ -51,20 +51,24 @@ class KnowledgeQueue {
throw new Error('Knowledge base not found') throw new Error('Knowledge base not found')
} }
const processableItems = base.items.filter((item) => { const findProcessableItem = () => {
const state = store.getState()
const base = state.knowledge.bases.find((b) => b.id === baseId)
return (
base?.items.find((item) => {
if (item.processingStatus === 'failed') { if (item.processingStatus === 'failed') {
return !item.retryCount || item.retryCount < this.MAX_RETRIES return !item.retryCount || item.retryCount < this.MAX_RETRIES
} } else {
return item.processingStatus === 'pending' return item.processingStatus === 'pending'
}) }
}) ?? null
for (const item of processableItems) { )
if (!this.processing.get(baseId)) {
console.log(`[KnowledgeQueue] Processing interrupted for base ${baseId}`)
break
} }
this.processItem(baseId, item) let processableItem = findProcessableItem()
while (processableItem) {
this.processItem(baseId, processableItem).then()
processableItem = findProcessableItem()
} }
} finally { } finally {
console.log(`[KnowledgeQueue] Finished processing queue for base ${baseId}`) console.log(`[KnowledgeQueue] Finished processing queue for base ${baseId}`)
@ -153,7 +157,7 @@ class KnowledgeQueue {
} }
console.debug(`[KnowledgeQueue] Updated uniqueId for item ${item.id} in base ${baseId} `) console.debug(`[KnowledgeQueue] Updated uniqueId for item ${item.id} in base ${baseId} `)
setTimeout(() => store.dispatch(clearCompletedProcessing({ baseId })), 1000) store.dispatch(clearCompletedProcessing({ baseId }))
} catch (error) { } catch (error) {
console.error(`[KnowledgeQueue] Error processing item ${item.id}: `, error) console.error(`[KnowledgeQueue] Error processing item ${item.id}: `, error)
store.dispatch( store.dispatch(