mirror of
https://github.com/lobehub/lobe-chat.git
synced 2026-06-14 03:30:19 +00:00
✨ feat(kb): extend BM25 search to file-backed documents (#15247)
`searchKnowledgeBaseDocuments` only matched inline `custom/document` pages, so parsed PDFs and other file-backed documents never surfaced via the BM25 path — vector search was the sole way to retrieve them. Run two scoped ParadeDB queries in parallel (inline via `documents.knowledge_base_id`, file-backed via a `knowledge_base_files` join) and merge by score in JS. A single OR-ed predicate trips ParadeDB's `Unsupported query shape` because `paradedb.score()` requires a conjunctive tantivy scan. Folder rows are excluded; hits now carry an optional `fileId` so the agent can read with either `docs_*` or `file_*` ids. The XML formatter exposes the new attribute downstream.
This commit is contained in:
@@ -24,12 +24,14 @@ export interface SearchKnowledgeBaseArgs {
|
||||
}
|
||||
|
||||
/**
|
||||
* BM25 hit on a custom/document inside a knowledge base.
|
||||
* BM25 hit on a document inside a knowledge base. Covers both inline
|
||||
* `custom/document` pages and file-backed documents (parsed PDFs and the like).
|
||||
* Mirrors database/repositories/search KnowledgeBaseDocumentHit; redeclared
|
||||
* here to keep this package decoupled from server-only types.
|
||||
*/
|
||||
export interface KnowledgeBaseDocumentResult {
|
||||
documentId: string;
|
||||
fileId?: string;
|
||||
knowledgeBaseId: string;
|
||||
relevance: number;
|
||||
snippet: string;
|
||||
|
||||
@@ -6,7 +6,7 @@ import { documents } from '../../schemas';
|
||||
import type { NewAgent } from '../../schemas/agent';
|
||||
import { agents } from '../../schemas/agent';
|
||||
import type { NewFile } from '../../schemas/file';
|
||||
import { files, knowledgeBases } from '../../schemas/file';
|
||||
import { files, knowledgeBaseFiles, knowledgeBases } from '../../schemas/file';
|
||||
import { messages } from '../../schemas/message';
|
||||
import type { NewTopic } from '../../schemas/topic';
|
||||
import { topics } from '../../schemas/topic';
|
||||
@@ -967,6 +967,127 @@ describe.skipIf(!isServerDB)('SearchRepo', () => {
|
||||
expect(r.relevance).toBeLessThanOrEqual(3);
|
||||
});
|
||||
});
|
||||
|
||||
describe('file-backed documents (PDF / parsed files)', () => {
|
||||
// Fixed ids for the fixture rows seeded in this suite's beforeEach; the tests
// locate (or rule out) hits by comparing against these values.
// File row + document row for the current user's parsed PDF.
const pdfFileId = 'file-bm25-pdf-1';
|
||||
const pdfDocId = 'docs-bm25-pdf-1';
|
||||
// Folder-typed document row that must never be returned as a hit.
const folderDocId = 'docs-bm25-folder-1';
|
||||
// File row + document row owned by a different user (isolation check).
const otherUserFileId = 'file-bm25-other-1';
|
||||
const otherUserDocId = 'docs-bm25-other-1';
|
||||
|
||||
// Seeds three tables before every test in this suite:
//   1. `files`              - one PDF per user (current user and other user)
//   2. `knowledgeBaseFiles` - links each PDF to a knowledge base
//   3. `documents`          - a PDF-backed doc, a folder doc, and the other user's PDF doc
// NOTE(review): `userId`, `otherUserId`, `kbA` and the 'kb-other-1' knowledge base
// are not defined in this view - presumably created by the enclosing suite; confirm.
beforeEach(async () => {
|
||||
await serverDB.insert(files).values([
|
||||
{
|
||||
fileType: 'application/pdf',
|
||||
id: pdfFileId,
|
||||
name: 'transformers-paper.pdf',
|
||||
size: 2048,
|
||||
url: 's3://bucket/transformers-paper.pdf',
|
||||
userId,
|
||||
},
|
||||
{
|
||||
fileType: 'application/pdf',
|
||||
id: otherUserFileId,
|
||||
name: 'leak-check.pdf',
|
||||
size: 2048,
|
||||
url: 's3://bucket/leak-check.pdf',
|
||||
userId: otherUserId,
|
||||
},
|
||||
]);
|
||||
|
||||
// Link rows: the current user's PDF lives in kbA, the other user's PDF in 'kb-other-1'.
await serverDB.insert(knowledgeBaseFiles).values([
|
||||
{ fileId: pdfFileId, knowledgeBaseId: kbA, userId },
|
||||
{ fileId: otherUserFileId, knowledgeBaseId: 'kb-other-1', userId: otherUserId },
|
||||
]);
|
||||
|
||||
await serverDB.insert(documents).values([
|
||||
// PDF-backed document: carries `fileId` but no `knowledgeBaseId`, so its only
// connection to kbA is the knowledgeBaseFiles link inserted above.
{
|
||||
content:
|
||||
'Attention is all you need. The Transformer architecture relies on self-attention ' +
|
||||
'and replaces recurrence with parallel multi-head attention layers.',
|
||||
fileId: pdfFileId,
|
||||
fileType: 'application/pdf',
|
||||
filename: 'transformers-paper.pdf',
|
||||
id: pdfDocId,
|
||||
source: 's3://bucket/transformers-paper.pdf',
|
||||
sourceType: 'file',
|
||||
title: 'Attention Is All You Need',
|
||||
totalCharCount: 200,
|
||||
totalLineCount: 5,
|
||||
userId,
|
||||
},
|
||||
// Folder row placed directly in kbA with a title that matches "transformer folder";
// empty content. Presumably 'custom/folder' is the value of DOCUMENT_FOLDER_TYPE - confirm.
{
|
||||
content: '',
|
||||
fileType: 'custom/folder',
|
||||
filename: 'a folder',
|
||||
id: folderDocId,
|
||||
knowledgeBaseId: kbA,
|
||||
source: 'internal://folder/placeholder',
|
||||
sourceType: 'api',
|
||||
title: 'Transformer Folder',
|
||||
totalCharCount: 0,
|
||||
totalLineCount: 0,
|
||||
userId,
|
||||
},
|
||||
// Other user's PDF-backed document; also matches "attention" so a missing
// user filter would show up as a leaked hit.
{
|
||||
content:
|
||||
'Attention paper in another user knowledge base — must never surface for current user.',
|
||||
fileId: otherUserFileId,
|
||||
fileType: 'application/pdf',
|
||||
filename: 'leak-check.pdf',
|
||||
id: otherUserDocId,
|
||||
source: 's3://bucket/leak-check.pdf',
|
||||
sourceType: 'file',
|
||||
title: 'Attention Leak Check',
|
||||
totalCharCount: 100,
|
||||
totalLineCount: 3,
|
||||
userId: otherUserId,
|
||||
},
|
||||
]);
|
||||
|
||||
});
|
||||
|
||||
// The seeded PDF document row has no knowledgeBaseId of its own, so a hit whose
// knowledgeBaseId equals kbA can only have been resolved through the
// knowledgeBaseFiles link (fileId -> kbA). Also checks that fileId and title
// are carried through on the hit.
it('returns a PDF-backed document hit via knowledge_base_files join', async () => {
|
||||
const results = await searchRepo.searchKnowledgeBaseDocuments('attention transformer', [
|
||||
kbA,
|
||||
]);
|
||||
const pdfHit = results.find((r) => r.documentId === pdfDocId);
|
||||
expect(pdfHit).toBeDefined();
|
||||
expect(pdfHit?.knowledgeBaseId).toBe(kbA);
|
||||
expect(pdfHit?.fileId).toBe(pdfFileId);
|
||||
expect(pdfHit?.title).toBe('Attention Is All You Need');
|
||||
});
|
||||
|
||||
// Adds an inline document that references kbA directly via knowledgeBaseId, then
// checks that one search call returns both it and the seeded PDF-backed document.
it('still matches inline custom/document hits in the same call', async () => {
|
||||
await serverDB.insert(documents).values({
|
||||
content: 'Attention transformer notes written inline for KB-A',
|
||||
fileType: 'custom/document',
|
||||
filename: 'inline-notes.md',
|
||||
knowledgeBaseId: kbA,
|
||||
source: 'internal://document/placeholder',
|
||||
sourceType: 'api',
|
||||
title: 'Inline Attention Notes',
|
||||
totalCharCount: 60,
|
||||
totalLineCount: 2,
|
||||
userId,
|
||||
});
|
||||
|
||||
const results = await searchRepo.searchKnowledgeBaseDocuments('attention', [kbA]);
|
||||
// Inline row is identified by title because its id is generated at insert time
// (no explicit `id` is passed above).
expect(results.some((r) => r.title === 'Inline Attention Notes')).toBe(true);
|
||||
expect(results.some((r) => r.documentId === pdfDocId)).toBe(true);
|
||||
});
|
||||
|
||||
// The query text matches the seeded folder row's title ('Transformer Folder'),
// and that row sits directly in kbA - so it would be a hit unless folder-typed
// rows are filtered out.
it('excludes folder documents even when they match the query', async () => {
|
||||
const results = await searchRepo.searchKnowledgeBaseDocuments('transformer folder', [kbA]);
|
||||
// Asserts absence of the folder id only; other hits (e.g. the PDF) may be present.
expect(results.every((r) => r.documentId !== folderDocId)).toBe(true);
|
||||
});
|
||||
|
||||
// Queries the other user's knowledge base id. The seeded link row and document
// for 'kb-other-1' both carry otherUserId, so nothing may come back.
// NOTE(review): assumes `searchRepo` was constructed for `userId` (setup is
// outside this view) - confirm.
it('does not surface another user PDF when querying their KB', async () => {
|
||||
const results = await searchRepo.searchKnowledgeBaseDocuments('attention', [
|
||||
'kb-other-1',
|
||||
]);
|
||||
expect(results).toEqual([]);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('search - context types', () => {
|
||||
|
||||
@@ -134,12 +134,17 @@ export interface KnowledgeBaseSearchResult extends BaseSearchResult {
|
||||
}
|
||||
|
||||
/**
|
||||
* BM25 hit for KB-scoped documents (custom/document) used by chunkRouter.semanticSearchForChat.
|
||||
* Distinct from PageSearchResult — this carries snippet + KB id for agent tool consumption.
|
||||
* BM25 hit for KB-scoped documents used by chunkRouter.semanticSearchForChat.
|
||||
* Covers both inline `custom/document` pages and file-backed documents
|
||||
* (e.g. parsed PDFs) joined through `knowledge_base_files`.
|
||||
*
|
||||
* `fileId` is present when the hit comes from a parsed-file document, letting
|
||||
* the agent fetch the original via readKnowledge with either docs_* or file_*.
|
||||
* `relevance` is normalized to [1, 3] (lower = better, matches BaseSearchResult semantics).
|
||||
*/
|
||||
export interface KnowledgeBaseDocumentHit {
|
||||
documentId: string;
|
||||
fileId?: string;
|
||||
knowledgeBaseId: string;
|
||||
relevance: number;
|
||||
snippet: string;
|
||||
@@ -679,9 +684,19 @@ export class SearchRepo {
|
||||
}
|
||||
|
||||
/**
|
||||
* KB-scoped BM25 search over custom/document documents.
|
||||
* Used by chunkRouter.semanticSearchForChat to surface inline documents
|
||||
* to the KB agent tool's searchKnowledgeBase API.
|
||||
* KB-scoped BM25 search over documents.
|
||||
*
|
||||
* Covers two routes to the KB scope, executed as two separate ParadeDB
|
||||
* scoring queries that we merge in JS:
|
||||
* - inline pages: `documents.knowledge_base_id` directly references the KB
|
||||
* - file-backed docs (e.g. parsed PDFs): joined through `knowledge_base_files`
|
||||
* via `documents.file_id`
|
||||
*
|
||||
* Two queries instead of an `OR`-ed WHERE clause because `paradedb.score()`
|
||||
* requires a tantivy index scan, and ParadeDB rejects disjunctive shapes
|
||||
* spanning bm25 and non-bm25 predicates ("Unsupported query shape").
|
||||
*
|
||||
* Folder rows (DOCUMENT_FOLDER_TYPE) are excluded — they carry no content.
|
||||
*/
|
||||
async searchKnowledgeBaseDocuments(
|
||||
query: string,
|
||||
@@ -693,9 +708,14 @@ export class SearchRepo {
|
||||
|
||||
const bm25Query = sanitizeBm25Query(query);
|
||||
|
||||
const rows = await this.db
|
||||
const matchClause = sql`(${documents.title} @@@ ${bm25Query} OR ${documents.slug} @@@ ${bm25Query} OR ${documents.content} @@@ ${bm25Query})`;
|
||||
const folderClause = ne(documents.fileType, DOCUMENT_FOLDER_TYPE);
|
||||
const userClause = eq(documents.userId, this.userId);
|
||||
|
||||
const inlineRowsPromise = this.db
|
||||
.select({
|
||||
content: documents.content,
|
||||
fileId: documents.fileId,
|
||||
filename: documents.filename,
|
||||
id: documents.id,
|
||||
knowledgeBaseId: documents.knowledgeBaseId,
|
||||
@@ -706,17 +726,56 @@ export class SearchRepo {
|
||||
.from(documents)
|
||||
.where(
|
||||
and(
|
||||
eq(documents.userId, this.userId),
|
||||
eq(documents.fileType, 'custom/document'),
|
||||
userClause,
|
||||
folderClause,
|
||||
inArray(documents.knowledgeBaseId, knowledgeBaseIds),
|
||||
sql`(${documents.title} @@@ ${bm25Query} OR ${documents.slug} @@@ ${bm25Query} OR ${documents.content} @@@ ${bm25Query})`,
|
||||
matchClause,
|
||||
),
|
||||
)
|
||||
.orderBy(sql`paradedb.score(${documents.id}) DESC`)
|
||||
.limit(limit);
|
||||
|
||||
return this.mapScoresToRelevance(rows).map((row) => ({
|
||||
const fileBackedRowsPromise = this.db
|
||||
.select({
|
||||
content: documents.content,
|
||||
fileId: documents.fileId,
|
||||
filename: documents.filename,
|
||||
id: documents.id,
|
||||
knowledgeBaseId: knowledgeBaseFiles.knowledgeBaseId,
|
||||
score: sql<number>`paradedb.score(${documents.id})`,
|
||||
title: documents.title,
|
||||
updatedAt: documents.updatedAt,
|
||||
})
|
||||
.from(documents)
|
||||
.innerJoin(
|
||||
knowledgeBaseFiles,
|
||||
and(
|
||||
eq(knowledgeBaseFiles.fileId, documents.fileId),
|
||||
eq(knowledgeBaseFiles.userId, this.userId),
|
||||
inArray(knowledgeBaseFiles.knowledgeBaseId, knowledgeBaseIds),
|
||||
),
|
||||
)
|
||||
.where(and(userClause, folderClause, matchClause))
|
||||
.orderBy(sql`paradedb.score(${documents.id}) DESC`)
|
||||
.limit(limit);
|
||||
|
||||
const [inlineRows, fileBackedRows] = await Promise.all([
|
||||
inlineRowsPromise,
|
||||
fileBackedRowsPromise,
|
||||
]);
|
||||
|
||||
const byId = new Map<string, (typeof inlineRows)[number]>();
|
||||
for (const row of [...inlineRows, ...fileBackedRows]) {
|
||||
const prev = byId.get(row.id);
|
||||
if (!prev || row.score > prev.score) byId.set(row.id, row);
|
||||
}
|
||||
const merged = Array.from(byId.values())
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, limit);
|
||||
|
||||
return this.mapScoresToRelevance(merged).map((row) => ({
|
||||
documentId: row.id,
|
||||
fileId: row.fileId ?? undefined,
|
||||
knowledgeBaseId: row.knowledgeBaseId ?? '',
|
||||
relevance: row.relevance,
|
||||
snippet: this.truncate(row.content, 300) ?? '',
|
||||
|
||||
@@ -225,6 +225,22 @@ describe('formatSearchResults', () => {
|
||||
expect(result).toMatchSnapshot();
|
||||
});
|
||||
|
||||
// Formats a single document result that carries a fileId (no chunk results) and
// checks the rendered text contains both the fileId attribute and the document id.
it('should expose fileId attribute on file-backed documents', () => {
|
||||
const documents: DocumentSearchResult[] = [
|
||||
{
|
||||
documentId: 'docs_pdf_xyz',
|
||||
fileId: 'file_pdf_xyz',
|
||||
knowledgeBaseId: 'kb_research',
|
||||
relevance: 1.4,
|
||||
snippet: 'Attention is all you need...',
|
||||
title: 'Attention Paper',
|
||||
},
|
||||
];
|
||||
const result = formatSearchResults([], 'attention', documents);
|
||||
// Substring checks only - attribute order and the rest of the markup are not pinned here.
expect(result).toContain('fileId="file_pdf_xyz"');
|
||||
expect(result).toContain('id="docs_pdf_xyz"');
|
||||
});
|
||||
|
||||
it('should annotate when vector search fails but BM25 succeeds', () => {
|
||||
const documents: DocumentSearchResult[] = [
|
||||
{
|
||||
|
||||
@@ -12,6 +12,7 @@ export interface FileSearchResult {
|
||||
|
||||
export interface DocumentSearchResult {
|
||||
documentId: string;
|
||||
fileId?: string;
|
||||
knowledgeBaseId: string;
|
||||
relevance: number;
|
||||
snippet: string;
|
||||
@@ -42,12 +43,13 @@ ${chunks.join('\n')}
|
||||
};
|
||||
|
||||
/**
|
||||
* Formats a single document search result (BM25 hit on custom/document) with XML tags.
|
||||
* Formats a single document search result (BM25 hit on a KB document) with XML tags.
|
||||
* Documents return only a snippet — agent should call readKnowledge with the docs_* id
|
||||
* to fetch the full content.
|
||||
* (or file_* id when present, for parsed-file documents) to fetch the full content.
|
||||
*/
|
||||
// Renders one document hit as a `<document …>` element wrapping a `<snippet>`.
// The `fileId` attribute is emitted only when `doc.fileId` is truthy; otherwise
// the attribute is omitted entirely rather than rendered empty.
// NOTE(review): `title`, `snippet` and the id fields are interpolated as-is, with
// no XML escaping - a title containing `"` or a snippet containing `<` would
// produce malformed markup. Confirm whether callers sanitize upstream.
const formatDocument = (doc: DocumentSearchResult): string => {
|
||||
return `<document id="${doc.documentId}" title="${doc.title}" relevance="${doc.relevance}" knowledgeBaseId="${doc.knowledgeBaseId}">
|
||||
const fileIdAttr = doc.fileId ? ` fileId="${doc.fileId}"` : '';
|
||||
return `<document id="${doc.documentId}"${fileIdAttr} title="${doc.title}" relevance="${doc.relevance}" knowledgeBaseId="${doc.knowledgeBaseId}">
|
||||
<snippet>${doc.snippet}</snippet>
|
||||
</document>`;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user