mirror of
https://github.com/lobehub/lobe-chat.git
synced 2026-06-14 19:50:09 +00:00
✨ feat: Add configurable PDF processing method with Unstructured (#5927)
* ✨ feat: Add configurable PDF processing method with Unstructured * 🔧 fix: Update import path for env utility in ContentChunk module * feat: add USE_UNSTRUCTURED_FOR_PDF environment variable to knowledge config * Delete src/server/utils/env.ts * feat: implement ChunkingRuleParser for file type and service mapping * refactor: remove USE_UNSTRUCTURED_FOR_PDF from knowledge environment configuration * test: add unit tests for ChunkingRuleParser functionality * refactor: remove isUsingUnstructured method from ContentChunk class * refactor: update ChunkingService type and clean up ContentChunk rules * refactor: simplify ChunkingRuleParser and update ContentChunk module * refactor: update ContentChunk module import for ChunkingService
This commit is contained in:
+14
-16
@@ -1,19 +1,17 @@
|
||||
import { createEnv } from '@t3-oss/env-nextjs';
|
||||
import { z } from 'zod';
|
||||
|
||||
export const getKnowledgeConfig = () => {
|
||||
return createEnv({
|
||||
runtimeEnv: {
|
||||
DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
|
||||
UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
|
||||
UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
|
||||
},
|
||||
server: {
|
||||
DEFAULT_FILES_CONFIG: z.string().optional(),
|
||||
UNSTRUCTURED_API_KEY: z.string().optional(),
|
||||
UNSTRUCTURED_SERVER_URL: z.string().optional(),
|
||||
},
|
||||
});
|
||||
};
|
||||
|
||||
export const knowledgeEnv = getKnowledgeConfig();
|
||||
// Raw values read from the process environment, keyed by variable name.
const knowledgeRuntimeEnv = {
  DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
  FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES,
  UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
  UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
};

// Every knowledge variable is declared as an optional string.
const optionalString = () => z.string().optional();

const knowledgeServerSchema = {
  DEFAULT_FILES_CONFIG: optionalString(),
  FILE_TYPE_CHUNKING_RULES: optionalString(),
  UNSTRUCTURED_API_KEY: optionalString(),
  UNSTRUCTURED_SERVER_URL: optionalString(),
};

/**
 * Validated environment settings for the knowledge module: the default files
 * configuration, the per-file-type chunking rules, and the Unstructured
 * API key and server URL. All four are optional strings.
 */
export const knowledgeEnv = createEnv({
  runtimeEnv: knowledgeRuntimeEnv,
  server: knowledgeServerSchema,
});
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
import { ChunkingLoader } from 'src/libs/langchain';
|
||||
import { Strategy } from 'unstructured-client/sdk/models/shared';
|
||||
|
||||
import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
|
||||
import { knowledgeEnv } from '@/config/knowledge';
|
||||
import type { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
|
||||
import { ChunkingStrategy, Unstructured } from '@/libs/unstructured';
|
||||
|
||||
import { ChunkingRuleParser } from './rules';
|
||||
import type { ChunkingService } from './rules';
|
||||
|
||||
export interface ChunkContentParams {
|
||||
content: Uint8Array;
|
||||
fileType: string;
|
||||
@@ -19,23 +23,57 @@ interface ChunkResult {
|
||||
export class ContentChunk {
|
||||
private unstructuredClient: Unstructured;
|
||||
private langchainClient: ChunkingLoader;
|
||||
private chunkingRules: Record<string, ChunkingService[]>;
|
||||
|
||||
/**
 * Creates the Unstructured and LangChain chunking clients and parses the
 * per-file-type service rules from `FILE_TYPE_CHUNKING_RULES`. An unset or
 * empty variable is parsed as an empty rule string.
 */
constructor() {
  this.unstructuredClient = new Unstructured();
  this.langchainClient = new ChunkingLoader();

  const rawRules = knowledgeEnv.FILE_TYPE_CHUNKING_RULES || '';
  this.chunkingRules = ChunkingRuleParser.parse(rawRules);
}
|
||||
|
||||
isUsingUnstructured(params: ChunkContentParams) {
|
||||
return params.fileType === 'application/pdf' && params.mode === 'hi-res';
|
||||
/**
 * Looks up the ordered list of chunking services configured for a file type.
 *
 * The lookup key is the lower-cased text after the last `/` in `fileType`
 * (or the empty string when that text is empty).
 *
 * @param fileType - File type string to resolve.
 * @returns The configured services for the key, or `['default']` when no rule
 *   exists for it.
 */
private getChunkingServices(fileType: string): ChunkingService[] {
  const segments = fileType.split('/');
  const lastSegment = segments.pop();
  const ruleKey = lastSegment?.toLowerCase() || '';

  const configured = this.chunkingRules[ruleKey];
  if (configured) return configured;

  return ['default'];
}
|
||||
|
||||
/**
 * Splits a file into chunks using the services configured for its file type,
 * trying them in the configured order.
 *
 * - `unstructured` is used only when `canUseUnstructured()` is true; otherwise
 *   it is skipped.
 * - `doc2x` is a placeholder for a future implementation and is always skipped.
 * - Any other service (i.e. `default`) chunks with LangChain.
 *
 * When a service throws, the error is logged and the next service is tried.
 * If every configured entry is skipped, LangChain is used as the fallback.
 *
 * @param params - The file content together with its name and type.
 * @returns The chunking result of the first service that succeeds.
 * @throws The error of the final configured service when that entry fails.
 */
async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
  const services = this.getChunkingServices(params.fileType);
  const lastIndex = services.length - 1;

  for (const [index, service] of services.entries()) {
    try {
      switch (service) {
        case 'unstructured': {
          if (this.canUseUnstructured()) {
            return await this.chunkByUnstructured(params.filename, params.content);
          }
          break;
        }

        case 'doc2x': {
          // Future implementation
          break;
        }

        default: {
          return await this.chunkByLangChain(params.filename, params.content);
        }
      }
    } catch (error) {
      // Decide "last" by position, not by name: a rule may list the same
      // service more than once, and an earlier occurrence must still fall
      // through to the services that follow it.
      if (index === lastIndex) throw error;

      // Otherwise continue to next service
      console.error(`Chunking failed with service ${service}:`, error);
    }
  }

  // Fallback to langchain if no service succeeded
  return await this.chunkByLangChain(params.filename, params.content);
}
|
||||
|
||||
/**
 * Reports whether the Unstructured service is configured.
 *
 * @returns `true` only when both `UNSTRUCTURED_API_KEY` and
 *   `UNSTRUCTURED_SERVER_URL` hold non-empty values.
 */
private canUseUnstructured(): boolean {
  if (!knowledgeEnv.UNSTRUCTURED_API_KEY) return false;

  return Boolean(knowledgeEnv.UNSTRUCTURED_SERVER_URL);
}
|
||||
|
||||
private chunkByUnstructured = async (
|
||||
filename: string,
|
||||
content: Uint8Array,
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { ChunkingRuleParser } from './rules';
|
||||
|
||||
describe('ChunkingRuleParser', () => {
  describe('parse', () => {
    // One entry per scenario: the test title, the raw rule string, and the
    // exact map the parser is expected to produce for it.
    const scenarios: { expected: Record<string, string[]>; input: string; title: string }[] = [
      {
        expected: { pdf: ['unstructured', 'default'] },
        input: 'pdf=unstructured,default',
        title: 'should parse a single file type rule correctly',
      },
      {
        expected: {
          pdf: ['unstructured', 'default'],
          doc: ['doc2x', 'default'],
          txt: ['default'],
        },
        input: 'pdf=unstructured,default;doc=doc2x,default;txt=default',
        title: 'should parse multiple file type rules correctly',
      },
      {
        expected: { pdf: ['unstructured'], doc: ['doc2x'] },
        input: 'PDF=unstructured;DOC=doc2x',
        title: 'should convert file types to lowercase',
      },
      {
        expected: { pdf: ['unstructured', 'default'] },
        input: 'pdf=unstructured,invalid,default,wrongservice',
        title: 'should filter out invalid service names',
      },
      {
        expected: {},
        input: '',
        title: 'should handle empty string input',
      },
      {
        expected: { pdf: ['unstructured'], doc: ['doc2x'] },
        input: 'pdf=unstructured;invalid;doc=doc2x;=default;txt',
        title: 'should skip invalid rule formats',
      },
      {
        expected: { pdf: ['unstructured', 'default'], doc: ['doc2x'] },
        input: 'pdf= unstructured , default ;doc=doc2x',
        title: 'should handle whitespace in service names',
      },
      {
        expected: { pdf: ['unstructured', 'default', 'unstructured'] },
        input: 'pdf=unstructured,default,unstructured',
        title: 'should handle duplicate services for same file type',
      },
    ];

    for (const { expected, input, title } of scenarios) {
      it(title, () => {
        expect(ChunkingRuleParser.parse(input)).toEqual(expected);
      });
    }
  });
});
|
||||
@@ -0,0 +1,23 @@
|
||||
/** Identifier of a service that can split a file into chunks. */
export type ChunkingService = 'unstructured' | 'doc2x' | 'default';

/** Service names accepted in a chunking rule; any other name is dropped. */
const KNOWN_CHUNKING_SERVICES: readonly string[] = ['unstructured', 'doc2x', 'default'];

/** Narrows an arbitrary string to a known chunking service name. */
const isChunkingService = (value: string): value is ChunkingService =>
  KNOWN_CHUNKING_SERVICES.includes(value);

export const ChunkingRuleParser = {
  /**
   * Parses a rule string such as `pdf=unstructured,default;doc=doc2x` into a
   * map from file type to the ordered list of services to try.
   *
   * Rules are separated by `;`, the file type and its services by `=`, and
   * services by `,`. File types and service names are trimmed and lower-cased.
   * Rules without a file type or without a service list are skipped, unknown
   * service names are dropped, and duplicates are kept in order. A later rule
   * for the same file type replaces an earlier one.
   *
   * @param rulesStr - Raw rule string; an empty string yields an empty map.
   * @returns The parsed rules keyed by file type.
   */
  parse(rulesStr: string): Record<string, ChunkingService[]> {
    const rules: Record<string, ChunkingService[]> = {};

    // Split by semicolon for different file types
    for (const rule of rulesStr.split(';')) {
      const [rawFileType = '', rawServices = ''] = rule.split('=');

      // Trim the key as well as the services, so that whitespace around
      // separators (e.g. `pdf=a; doc=b`) does not produce unmatchable keys.
      const fileType = rawFileType.trim().toLowerCase();
      if (!fileType || !rawServices) continue;

      // Split services by comma and validate each service
      rules[fileType] = rawServices
        .split(',')
        .map((name) => name.trim().toLowerCase())
        .filter(isChunkingService);
    }

    return rules;
  },
} as const;
|
||||
Reference in New Issue
Block a user