feat: Improve file upload for the knowledge base (#2544)
* feat: Improve file upload for the knowledge base * feat: Improve file upload for the knowledge base * feat: Improve file upload for the knowledge base --------- Co-authored-by: 亢奋猫 <kangfenmao@qq.com>
This commit is contained in:
parent
808b457503
commit
1b09bb47bf
@ -8,17 +8,77 @@ import { SitemapLoader } from '@llm-tools/embedjs-loader-sitemap'
|
||||
import { WebLoader } from '@llm-tools/embedjs-loader-web'
|
||||
import { AzureOpenAiEmbeddings, OpenAiEmbeddings } from '@llm-tools/embedjs-openai'
|
||||
import { addFileLoader } from '@main/loader'
|
||||
import { windowService } from '@main/services/WindowService'
|
||||
import { getInstanceName } from '@main/utils'
|
||||
import { getAllFiles } from '@main/utils/file'
|
||||
import type { LoaderReturn } from '@shared/config/types'
|
||||
import { FileType, KnowledgeBaseParams, KnowledgeItem } from '@types'
|
||||
import { app } from 'electron'
|
||||
import Logger from 'electron-log'
|
||||
import { v4 as uuidv4 } from 'uuid'
|
||||
|
||||
import { windowService } from './WindowService'
|
||||
export interface KnowledgeBaseAddItemOptions {
|
||||
base: KnowledgeBaseParams
|
||||
item: KnowledgeItem
|
||||
forceReload?: boolean
|
||||
}
|
||||
|
||||
interface KnowledgeBaseAddItemOptionsNonNullableAttribute {
|
||||
base: KnowledgeBaseParams
|
||||
item: KnowledgeItem
|
||||
forceReload: boolean
|
||||
}
|
||||
|
||||
interface EvaluateTaskWorkload {
|
||||
workload: number
|
||||
}
|
||||
|
||||
type LoaderDoneReturn = LoaderReturn | null
|
||||
|
||||
enum LoaderTaskItemState {
|
||||
PENDING,
|
||||
PROCESSING,
|
||||
DONE
|
||||
}
|
||||
|
||||
interface LoaderTaskItem {
|
||||
state: LoaderTaskItemState
|
||||
task: () => Promise<unknown>
|
||||
evaluateTaskWorkload: EvaluateTaskWorkload
|
||||
}
|
||||
|
||||
interface LoaderTask {
|
||||
loaderTasks: LoaderTaskItem[]
|
||||
loaderDoneReturn: LoaderDoneReturn
|
||||
}
|
||||
|
||||
interface LoaderTaskOfSet {
|
||||
loaderTasks: Set<LoaderTaskItem>
|
||||
loaderDoneReturn: LoaderDoneReturn
|
||||
}
|
||||
|
||||
interface QueueTaskItem {
|
||||
taskPromise: () => Promise<unknown>
|
||||
resolve: () => void
|
||||
evaluateTaskWorkload: EvaluateTaskWorkload
|
||||
}
|
||||
|
||||
const loaderTaskIntoOfSet = (loaderTask: LoaderTask): LoaderTaskOfSet => {
|
||||
return {
|
||||
loaderTasks: new Set(loaderTask.loaderTasks),
|
||||
loaderDoneReturn: loaderTask.loaderDoneReturn
|
||||
}
|
||||
}
|
||||
|
||||
class KnowledgeService {
|
||||
private storageDir = path.join(app.getPath('userData'), 'Data', 'KnowledgeBase')
|
||||
// Byte based
|
||||
private workload = 0
|
||||
private processingItemCount = 0
|
||||
private knowledgeItemProcessingQueueMappingPromise: Map<LoaderTaskOfSet, () => void> = new Map()
|
||||
private static MAXIMUM_WORKLOAD = 1024 * 1024 * 80
|
||||
private static MAXIMUM_PROCESSING_ITEM_COUNT = 30
|
||||
private static ERROR_LOADER_RETURN: LoaderReturn = { entriesAdded: 0, uniqueId: '', uniqueIds: [''], loaderType: '' }
|
||||
|
||||
constructor() {
|
||||
this.initStorageDir()
|
||||
@ -79,11 +139,52 @@ class KnowledgeService {
|
||||
}
|
||||
}
|
||||
|
||||
public add = async (
|
||||
_: Electron.IpcMainInvokeEvent,
|
||||
{ base, item, forceReload = false }: { base: KnowledgeBaseParams; item: KnowledgeItem; forceReload: boolean }
|
||||
): Promise<LoaderReturn> => {
|
||||
const ragApplication = await this.getRagApplication(base)
|
||||
private maximumLoad() {
|
||||
return (
|
||||
this.processingItemCount >= KnowledgeService.MAXIMUM_PROCESSING_ITEM_COUNT ||
|
||||
this.workload >= KnowledgeService.MAXIMUM_WORKLOAD
|
||||
)
|
||||
}
|
||||
|
||||
private fileTask(
|
||||
ragApplication: RAGApplication,
|
||||
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
|
||||
): LoaderTask {
|
||||
const { base, item, forceReload } = options
|
||||
const file = item.content as FileType
|
||||
|
||||
const loaderTask: LoaderTask = {
|
||||
loaderTasks: [
|
||||
{
|
||||
state: LoaderTaskItemState.PENDING,
|
||||
task: () =>
|
||||
addFileLoader(ragApplication, file, base, forceReload)
|
||||
.then((result) => {
|
||||
loaderTask.loaderDoneReturn = result
|
||||
return result
|
||||
})
|
||||
.catch((err) => {
|
||||
Logger.error(err)
|
||||
return KnowledgeService.ERROR_LOADER_RETURN
|
||||
}),
|
||||
evaluateTaskWorkload: { workload: file.size }
|
||||
}
|
||||
],
|
||||
loaderDoneReturn: null
|
||||
}
|
||||
|
||||
return loaderTask
|
||||
}
|
||||
|
||||
private directoryTask(
|
||||
ragApplication: RAGApplication,
|
||||
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
|
||||
): LoaderTask {
|
||||
const { base, item, forceReload } = options
|
||||
const directory = item.content as string
|
||||
const files = getAllFiles(directory)
|
||||
const totalFiles = files.length
|
||||
let processedFiles = 0
|
||||
|
||||
const sendDirectoryProcessingPercent = (totalFiles: number, processedFiles: number) => {
|
||||
const mainWindow = windowService.getMainWindow()
|
||||
@ -93,86 +194,257 @@ class KnowledgeService {
|
||||
})
|
||||
}
|
||||
|
||||
if (item.type === 'directory') {
|
||||
const directory = item.content as string
|
||||
const files = getAllFiles(directory)
|
||||
const totalFiles = files.length
|
||||
let processedFiles = 0
|
||||
|
||||
const loaderPromises = files.map(async (file) => {
|
||||
const result = await addFileLoader(ragApplication, file, base, forceReload)
|
||||
processedFiles++
|
||||
const loaderDoneReturn: LoaderDoneReturn = {
|
||||
entriesAdded: 0,
|
||||
uniqueId: `DirectoryLoader_${uuidv4()}`,
|
||||
uniqueIds: [],
|
||||
loaderType: 'DirectoryLoader'
|
||||
}
|
||||
const loaderTasks: LoaderTaskItem[] = []
|
||||
for (const file of files) {
|
||||
loaderTasks.push({
|
||||
state: LoaderTaskItemState.PENDING,
|
||||
task: () =>
|
||||
addFileLoader(ragApplication, file, base, forceReload)
|
||||
.then((result) => {
|
||||
loaderDoneReturn.entriesAdded += 1
|
||||
processedFiles += 1
|
||||
sendDirectoryProcessingPercent(totalFiles, processedFiles)
|
||||
loaderDoneReturn.uniqueIds.push(result.uniqueId)
|
||||
return result
|
||||
})
|
||||
|
||||
const loaderResults = await Promise.allSettled(loaderPromises)
|
||||
// @ts-ignore uniqueId
|
||||
const uniqueIds = loaderResults
|
||||
.filter((result) => result.status === 'fulfilled')
|
||||
.map((result) => result.value.uniqueId)
|
||||
|
||||
return {
|
||||
entriesAdded: loaderResults.length,
|
||||
uniqueId: `DirectoryLoader_${uuidv4()}`,
|
||||
uniqueIds,
|
||||
loaderType: 'DirectoryLoader'
|
||||
} as LoaderReturn
|
||||
.catch((err) => {
|
||||
Logger.error(err)
|
||||
return KnowledgeService.ERROR_LOADER_RETURN
|
||||
}),
|
||||
evaluateTaskWorkload: { workload: file.size }
|
||||
})
|
||||
}
|
||||
|
||||
if (item.type === 'url') {
|
||||
return {
|
||||
loaderTasks,
|
||||
loaderDoneReturn
|
||||
}
|
||||
}
|
||||
|
||||
private urlTask(
|
||||
ragApplication: RAGApplication,
|
||||
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
|
||||
): LoaderTask {
|
||||
const { base, item, forceReload } = options
|
||||
const content = item.content as string
|
||||
if (content.startsWith('http')) {
|
||||
const loaderReturn = await ragApplication.addLoader(
|
||||
new WebLoader({ urlOrContent: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any,
|
||||
|
||||
const loaderTask: LoaderTask = {
|
||||
loaderTasks: [
|
||||
{
|
||||
state: LoaderTaskItemState.PENDING,
|
||||
task: () => {
|
||||
const loaderReturn = ragApplication.addLoader(
|
||||
new WebLoader({
|
||||
urlOrContent: content,
|
||||
chunkSize: base.chunkSize,
|
||||
chunkOverlap: base.chunkOverlap
|
||||
}),
|
||||
forceReload
|
||||
)
|
||||
return {
|
||||
entriesAdded: loaderReturn.entriesAdded,
|
||||
uniqueId: loaderReturn.uniqueId,
|
||||
uniqueIds: [loaderReturn.uniqueId],
|
||||
loaderType: loaderReturn.loaderType
|
||||
} as LoaderReturn
|
||||
) as Promise<LoaderReturn>
|
||||
|
||||
return loaderReturn
|
||||
.then((result) => {
|
||||
const { entriesAdded, uniqueId, loaderType } = result
|
||||
loaderTask.loaderDoneReturn = {
|
||||
entriesAdded: entriesAdded,
|
||||
uniqueId: uniqueId,
|
||||
uniqueIds: [uniqueId],
|
||||
loaderType: loaderType
|
||||
}
|
||||
return result
|
||||
})
|
||||
.catch((err) => {
|
||||
Logger.error(err)
|
||||
return KnowledgeService.ERROR_LOADER_RETURN
|
||||
})
|
||||
},
|
||||
evaluateTaskWorkload: { workload: 1024 * 1024 * 2 }
|
||||
}
|
||||
],
|
||||
loaderDoneReturn: null
|
||||
}
|
||||
return loaderTask
|
||||
}
|
||||
|
||||
if (item.type === 'sitemap') {
|
||||
private sitemapTask(
|
||||
ragApplication: RAGApplication,
|
||||
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
|
||||
): LoaderTask {
|
||||
const { base, item, forceReload } = options
|
||||
const content = item.content as string
|
||||
// @ts-ignore loader type
|
||||
const loaderReturn = await ragApplication.addLoader(
|
||||
|
||||
const loaderTask: LoaderTask = {
|
||||
loaderTasks: [
|
||||
{
|
||||
state: LoaderTaskItemState.PENDING,
|
||||
task: () =>
|
||||
ragApplication
|
||||
.addLoader(
|
||||
new SitemapLoader({ url: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any,
|
||||
forceReload
|
||||
)
|
||||
return {
|
||||
entriesAdded: loaderReturn.entriesAdded,
|
||||
uniqueId: loaderReturn.uniqueId,
|
||||
uniqueIds: [loaderReturn.uniqueId],
|
||||
loaderType: loaderReturn.loaderType
|
||||
} as LoaderReturn
|
||||
.then((result) => {
|
||||
const { entriesAdded, uniqueId, loaderType } = result
|
||||
loaderTask.loaderDoneReturn = {
|
||||
entriesAdded: entriesAdded,
|
||||
uniqueId: uniqueId,
|
||||
uniqueIds: [uniqueId],
|
||||
loaderType: loaderType
|
||||
}
|
||||
return result
|
||||
})
|
||||
.catch((err) => {
|
||||
Logger.error(err)
|
||||
return KnowledgeService.ERROR_LOADER_RETURN
|
||||
}),
|
||||
evaluateTaskWorkload: { workload: 1024 * 1024 * 20 }
|
||||
}
|
||||
],
|
||||
loaderDoneReturn: null
|
||||
}
|
||||
return loaderTask
|
||||
}
|
||||
|
||||
if (item.type === 'note') {
|
||||
private noteTask(
|
||||
ragApplication: RAGApplication,
|
||||
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
|
||||
): LoaderTask {
|
||||
const { base, item, forceReload } = options
|
||||
const content = item.content as string
|
||||
console.debug('chunkSize', base.chunkSize)
|
||||
const loaderReturn = await ragApplication.addLoader(
|
||||
|
||||
const encoder = new TextEncoder()
|
||||
const contentBytes = encoder.encode(content)
|
||||
const loaderTask: LoaderTask = {
|
||||
loaderTasks: [
|
||||
{
|
||||
state: LoaderTaskItemState.PENDING,
|
||||
task: () => {
|
||||
const loaderReturn = ragApplication.addLoader(
|
||||
new TextLoader({ text: content, chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }),
|
||||
forceReload
|
||||
)
|
||||
return {
|
||||
entriesAdded: loaderReturn.entriesAdded,
|
||||
uniqueId: loaderReturn.uniqueId,
|
||||
uniqueIds: [loaderReturn.uniqueId],
|
||||
loaderType: loaderReturn.loaderType
|
||||
} as LoaderReturn
|
||||
) as Promise<LoaderReturn>
|
||||
|
||||
return loaderReturn
|
||||
.then(({ entriesAdded, uniqueId, loaderType }) => {
|
||||
loaderTask.loaderDoneReturn = {
|
||||
entriesAdded: entriesAdded,
|
||||
uniqueId: uniqueId,
|
||||
uniqueIds: [uniqueId],
|
||||
loaderType: loaderType
|
||||
}
|
||||
})
|
||||
.catch((err) => {
|
||||
Logger.error(err)
|
||||
return KnowledgeService.ERROR_LOADER_RETURN
|
||||
})
|
||||
},
|
||||
evaluateTaskWorkload: { workload: contentBytes.length }
|
||||
}
|
||||
],
|
||||
loaderDoneReturn: null
|
||||
}
|
||||
return loaderTask
|
||||
}
|
||||
|
||||
if (item.type === 'file') {
|
||||
const file = item.content as FileType
|
||||
|
||||
return await addFileLoader(ragApplication, file, base, forceReload)
|
||||
private processingQueueHandle() {
|
||||
const getSubtasksUntilMaximumLoad = (): QueueTaskItem[] => {
|
||||
const queueTaskList: QueueTaskItem[] = []
|
||||
that: for (const [task, resolve] of this.knowledgeItemProcessingQueueMappingPromise) {
|
||||
for (const item of task.loaderTasks) {
|
||||
if (this.maximumLoad()) {
|
||||
break that
|
||||
}
|
||||
|
||||
return { entriesAdded: 0, uniqueId: '', uniqueIds: [''], loaderType: '' }
|
||||
const { state, task: taskPromise, evaluateTaskWorkload } = item
|
||||
|
||||
if (state !== LoaderTaskItemState.PENDING) {
|
||||
continue
|
||||
}
|
||||
|
||||
const { workload } = evaluateTaskWorkload
|
||||
this.workload += workload
|
||||
this.processingItemCount += 1
|
||||
item.state = LoaderTaskItemState.PROCESSING
|
||||
queueTaskList.push({
|
||||
taskPromise: () =>
|
||||
taskPromise().then(() => {
|
||||
this.workload -= workload
|
||||
this.processingItemCount -= 1
|
||||
task.loaderTasks.delete(item)
|
||||
if (task.loaderTasks.size === 0) {
|
||||
this.knowledgeItemProcessingQueueMappingPromise.delete(task)
|
||||
resolve()
|
||||
}
|
||||
this.processingQueueHandle()
|
||||
}),
|
||||
resolve: () => {},
|
||||
evaluateTaskWorkload
|
||||
})
|
||||
}
|
||||
}
|
||||
return queueTaskList
|
||||
}
|
||||
const subTasks = getSubtasksUntilMaximumLoad()
|
||||
if (subTasks.length > 0) {
|
||||
const subTaskPromises = subTasks.map(({ taskPromise }) => taskPromise())
|
||||
Promise.all(subTaskPromises).then(() => {
|
||||
subTasks.forEach(({ resolve }) => resolve())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
private appendProcessingQueue(task: LoaderTask): Promise<LoaderReturn> {
|
||||
return new Promise((resolve) => {
|
||||
this.knowledgeItemProcessingQueueMappingPromise.set(loaderTaskIntoOfSet(task), () => {
|
||||
resolve(task.loaderDoneReturn!)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
public add = (_: Electron.IpcMainInvokeEvent, options: KnowledgeBaseAddItemOptions): Promise<LoaderReturn> => {
|
||||
return new Promise((resolve) => {
|
||||
const { base, item, forceReload = false } = options
|
||||
const optionsNonNullableAttribute = { base, item, forceReload }
|
||||
this.getRagApplication(base)
|
||||
.then((ragApplication) => {
|
||||
const task = (() => {
|
||||
switch (item.type) {
|
||||
case 'file':
|
||||
return this.fileTask(ragApplication, optionsNonNullableAttribute)
|
||||
case 'directory':
|
||||
return this.directoryTask(ragApplication, optionsNonNullableAttribute)
|
||||
case 'url':
|
||||
return this.urlTask(ragApplication, optionsNonNullableAttribute)
|
||||
case 'sitemap':
|
||||
return this.sitemapTask(ragApplication, optionsNonNullableAttribute)
|
||||
case 'note':
|
||||
return this.noteTask(ragApplication, optionsNonNullableAttribute)
|
||||
default:
|
||||
return null
|
||||
}
|
||||
})()
|
||||
|
||||
if (task) {
|
||||
this.appendProcessingQueue(task).then(() => {
|
||||
resolve(task.loaderDoneReturn!)
|
||||
})
|
||||
this.processingQueueHandle()
|
||||
} else {
|
||||
resolve(KnowledgeService.ERROR_LOADER_RETURN)
|
||||
}
|
||||
})
|
||||
.catch((err) => {
|
||||
Logger.error(err)
|
||||
resolve(KnowledgeService.ERROR_LOADER_RETURN)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
public remove = async (
|
||||
|
||||
@ -51,20 +51,24 @@ class KnowledgeQueue {
|
||||
throw new Error('Knowledge base not found')
|
||||
}
|
||||
|
||||
const processableItems = base.items.filter((item) => {
|
||||
const findProcessableItem = () => {
|
||||
const state = store.getState()
|
||||
const base = state.knowledge.bases.find((b) => b.id === baseId)
|
||||
return (
|
||||
base?.items.find((item) => {
|
||||
if (item.processingStatus === 'failed') {
|
||||
return !item.retryCount || item.retryCount < this.MAX_RETRIES
|
||||
}
|
||||
} else {
|
||||
return item.processingStatus === 'pending'
|
||||
})
|
||||
|
||||
for (const item of processableItems) {
|
||||
if (!this.processing.get(baseId)) {
|
||||
console.log(`[KnowledgeQueue] Processing interrupted for base ${baseId}`)
|
||||
break
|
||||
}
|
||||
}) ?? null
|
||||
)
|
||||
}
|
||||
|
||||
this.processItem(baseId, item)
|
||||
let processableItem = findProcessableItem()
|
||||
while (processableItem) {
|
||||
this.processItem(baseId, processableItem).then()
|
||||
processableItem = findProcessableItem()
|
||||
}
|
||||
} finally {
|
||||
console.log(`[KnowledgeQueue] Finished processing queue for base ${baseId}`)
|
||||
@ -153,7 +157,7 @@ class KnowledgeQueue {
|
||||
}
|
||||
console.debug(`[KnowledgeQueue] Updated uniqueId for item ${item.id} in base ${baseId} `)
|
||||
|
||||
setTimeout(() => store.dispatch(clearCompletedProcessing({ baseId })), 1000)
|
||||
store.dispatch(clearCompletedProcessing({ baseId }))
|
||||
} catch (error) {
|
||||
console.error(`[KnowledgeQueue] Error processing item ${item.id}: `, error)
|
||||
store.dispatch(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user