feat(kb): extend BM25 search to file-backed documents (#15247)

`searchKnowledgeBaseDocuments` only matched inline `custom/document`
pages, so parsed PDFs and other file-backed documents never surfaced
via the BM25 path — vector search was the sole way to retrieve them.

Run two scoped ParadeDB queries in parallel (inline via
`documents.knowledge_base_id`, file-backed via a `knowledge_base_files`
join), then merge them in JS: de-duplicate by document id, sort by score,
and re-apply the limit. A single OR-ed predicate trips ParadeDB's
`Unsupported query shape` error because `paradedb.score()` requires a
conjunctive tantivy scan.

Folder rows are excluded. Hits now carry an optional `fileId`, so the
agent can read a file-backed hit by either its `docs_*` or its `file_*`
id. The XML formatter exposes the new attribute downstream.
This commit is contained in:
Innei
2026-05-29 01:01:47 +08:00
committed by GitHub
parent 2194b23390
commit 65113ca2a7
5 changed files with 215 additions and 15 deletions
@@ -24,12 +24,14 @@ export interface SearchKnowledgeBaseArgs {
}
/**
* BM25 hit on a custom/document inside a knowledge base.
* BM25 hit on a document inside a knowledge base. Covers both inline
* `custom/document` pages and file-backed documents (parsed PDFs and the like).
* Mirrors database/repositories/search KnowledgeBaseDocumentHit; redeclared
* here to keep this package decoupled from server-only types.
*/
export interface KnowledgeBaseDocumentResult {
documentId: string;
fileId?: string;
knowledgeBaseId: string;
relevance: number;
snippet: string;
@@ -6,7 +6,7 @@ import { documents } from '../../schemas';
import type { NewAgent } from '../../schemas/agent';
import { agents } from '../../schemas/agent';
import type { NewFile } from '../../schemas/file';
import { files, knowledgeBases } from '../../schemas/file';
import { files, knowledgeBaseFiles, knowledgeBases } from '../../schemas/file';
import { messages } from '../../schemas/message';
import type { NewTopic } from '../../schemas/topic';
import { topics } from '../../schemas/topic';
@@ -967,6 +967,127 @@ describe.skipIf(!isServerDB)('SearchRepo', () => {
expect(r.relevance).toBeLessThanOrEqual(3);
});
});
// File-backed documents are linked to a knowledge base through
// `knowledge_base_files` (via documents.fileId), not through
// documents.knowledgeBaseId, so they need their own BM25 coverage.
describe('file-backed documents (PDF / parsed files)', () => {
  // Fixture ids: one PDF owned by the current user, one folder row in the
  // same KB, and one PDF owned by a different user in a different KB.
  const ownPdfFile = 'file-bm25-pdf-1';
  const ownPdfDoc = 'docs-bm25-pdf-1';
  const folderDoc = 'docs-bm25-folder-1';
  const foreignPdfFile = 'file-bm25-other-1';
  const foreignPdfDoc = 'docs-bm25-other-1';
  const foreignKb = 'kb-other-1';

  beforeEach(async () => {
    // Insert order matters: files first, then the KB links, then the documents.
    const fileRows = [
      {
        fileType: 'application/pdf',
        id: ownPdfFile,
        name: 'transformers-paper.pdf',
        size: 2048,
        url: 's3://bucket/transformers-paper.pdf',
        userId,
      },
      {
        fileType: 'application/pdf',
        id: foreignPdfFile,
        name: 'leak-check.pdf',
        size: 2048,
        url: 's3://bucket/leak-check.pdf',
        userId: otherUserId,
      },
    ];
    await serverDB.insert(files).values(fileRows);

    const kbLinks = [
      { fileId: ownPdfFile, knowledgeBaseId: kbA, userId },
      { fileId: foreignPdfFile, knowledgeBaseId: foreignKb, userId: otherUserId },
    ];
    await serverDB.insert(knowledgeBaseFiles).values(kbLinks);

    // Parsed PDF owned by the current user; reaches kbA only through the file link.
    const ownPdfRow = {
      content:
        'Attention is all you need. The Transformer architecture relies on self-attention ' +
        'and replaces recurrence with parallel multi-head attention layers.',
      fileId: ownPdfFile,
      fileType: 'application/pdf',
      filename: 'transformers-paper.pdf',
      id: ownPdfDoc,
      source: 's3://bucket/transformers-paper.pdf',
      sourceType: 'file',
      title: 'Attention Is All You Need',
      totalCharCount: 200,
      totalLineCount: 5,
      userId,
    };

    // Folder row placed directly in kbA; its title matches the folder query below.
    const folderRow = {
      content: '',
      fileType: 'custom/folder',
      filename: 'a folder',
      id: folderDoc,
      knowledgeBaseId: kbA,
      source: 'internal://folder/placeholder',
      sourceType: 'api',
      title: 'Transformer Folder',
      totalCharCount: 0,
      totalLineCount: 0,
      userId,
    };

    // Parsed PDF owned by another user, linked to that user's KB.
    const foreignPdfRow = {
      content:
        'Attention paper in another user knowledge base — must never surface for current user.',
      fileId: foreignPdfFile,
      fileType: 'application/pdf',
      filename: 'leak-check.pdf',
      id: foreignPdfDoc,
      source: 's3://bucket/leak-check.pdf',
      sourceType: 'file',
      title: 'Attention Leak Check',
      totalCharCount: 100,
      totalLineCount: 3,
      userId: otherUserId,
    };

    await serverDB.insert(documents).values([ownPdfRow, folderRow, foreignPdfRow]);
  });

  it('returns a PDF-backed document hit via knowledge_base_files join', async () => {
    const hits = await searchRepo.searchKnowledgeBaseDocuments('attention transformer', [kbA]);

    const [pdfHit] = hits.filter((hit) => hit.documentId === ownPdfDoc);
    expect(pdfHit).toBeDefined();
    expect(pdfHit?.knowledgeBaseId).toBe(kbA);
    expect(pdfHit?.fileId).toBe(ownPdfFile);
    expect(pdfHit?.title).toBe('Attention Is All You Need');
  });

  it('still matches inline custom/document hits in the same call', async () => {
    const inlineRow = {
      content: 'Attention transformer notes written inline for KB-A',
      fileType: 'custom/document',
      filename: 'inline-notes.md',
      knowledgeBaseId: kbA,
      source: 'internal://document/placeholder',
      sourceType: 'api',
      title: 'Inline Attention Notes',
      totalCharCount: 60,
      totalLineCount: 2,
      userId,
    };
    await serverDB.insert(documents).values(inlineRow);

    const hits = await searchRepo.searchKnowledgeBaseDocuments('attention', [kbA]);

    // Both routes must contribute to the same result list.
    expect(hits.map((hit) => hit.title)).toContain('Inline Attention Notes');
    expect(hits.map((hit) => hit.documentId)).toContain(ownPdfDoc);
  });

  it('excludes folder documents even when they match the query', async () => {
    const hits = await searchRepo.searchKnowledgeBaseDocuments('transformer folder', [kbA]);

    expect(hits.map((hit) => hit.documentId)).not.toContain(folderDoc);
  });

  it('does not surface another user PDF when querying their KB', async () => {
    const hits = await searchRepo.searchKnowledgeBaseDocuments('attention', [foreignKb]);

    expect(hits).toEqual([]);
  });
});
});
describe('search - context types', () => {
@@ -134,12 +134,17 @@ export interface KnowledgeBaseSearchResult extends BaseSearchResult {
}
/**
* BM25 hit for KB-scoped documents (custom/document) used by chunkRouter.semanticSearchForChat.
* Distinct from PageSearchResult — this carries snippet + KB id for agent tool consumption.
* BM25 hit for KB-scoped documents used by chunkRouter.semanticSearchForChat.
* Covers both inline `custom/document` pages and file-backed documents
* (e.g. parsed PDFs) joined through `knowledge_base_files`.
*
* `fileId` is present when the hit comes from a parsed-file document, letting
* the agent fetch the original via readKnowledge with either the docs_* or the file_* id.
* `relevance` is normalized to [1, 3] (lower = better, matches BaseSearchResult semantics).
*/
export interface KnowledgeBaseDocumentHit {
documentId: string;
fileId?: string;
knowledgeBaseId: string;
relevance: number;
snippet: string;
@@ -679,9 +684,19 @@ export class SearchRepo {
}
/**
* KB-scoped BM25 search over custom/document documents.
* Used by chunkRouter.semanticSearchForChat to surface inline documents
* to the KB agent tool's searchKnowledgeBase API.
* KB-scoped BM25 search over documents.
*
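* Covers two routes to the KB scope, executed as two separate ParadeDB
* scoring queries whose rows are merged in JS (de-duplicated by document id,
* keeping the higher score, then re-sorted and truncated to `limit`):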
* - inline pages: `documents.knowledge_base_id` directly references the KB
* - file-backed docs (e.g. parsed PDFs): joined through `knowledge_base_files`
* via `documents.file_id`
*
* Two queries instead of an `OR`-ed WHERE clause because `paradedb.score()`
* requires a tantivy index scan, and ParadeDB rejects disjunctive shapes
* spanning bm25 and non-bm25 predicates ("Unsupported query shape").
*
* Folder rows (DOCUMENT_FOLDER_TYPE) are excluded — they carry no content.
*/
async searchKnowledgeBaseDocuments(
query: string,
@@ -693,9 +708,14 @@ export class SearchRepo {
const bm25Query = sanitizeBm25Query(query);
const rows = await this.db
const matchClause = sql`(${documents.title} @@@ ${bm25Query} OR ${documents.slug} @@@ ${bm25Query} OR ${documents.content} @@@ ${bm25Query})`;
const folderClause = ne(documents.fileType, DOCUMENT_FOLDER_TYPE);
const userClause = eq(documents.userId, this.userId);
const inlineRowsPromise = this.db
.select({
content: documents.content,
fileId: documents.fileId,
filename: documents.filename,
id: documents.id,
knowledgeBaseId: documents.knowledgeBaseId,
@@ -706,17 +726,56 @@ export class SearchRepo {
.from(documents)
.where(
and(
eq(documents.userId, this.userId),
eq(documents.fileType, 'custom/document'),
userClause,
folderClause,
inArray(documents.knowledgeBaseId, knowledgeBaseIds),
sql`(${documents.title} @@@ ${bm25Query} OR ${documents.slug} @@@ ${bm25Query} OR ${documents.content} @@@ ${bm25Query})`,
matchClause,
),
)
.orderBy(sql`paradedb.score(${documents.id}) DESC`)
.limit(limit);
return this.mapScoresToRelevance(rows).map((row) => ({
const fileBackedRowsPromise = this.db
.select({
content: documents.content,
fileId: documents.fileId,
filename: documents.filename,
id: documents.id,
knowledgeBaseId: knowledgeBaseFiles.knowledgeBaseId,
score: sql<number>`paradedb.score(${documents.id})`,
title: documents.title,
updatedAt: documents.updatedAt,
})
.from(documents)
.innerJoin(
knowledgeBaseFiles,
and(
eq(knowledgeBaseFiles.fileId, documents.fileId),
eq(knowledgeBaseFiles.userId, this.userId),
inArray(knowledgeBaseFiles.knowledgeBaseId, knowledgeBaseIds),
),
)
.where(and(userClause, folderClause, matchClause))
.orderBy(sql`paradedb.score(${documents.id}) DESC`)
.limit(limit);
const [inlineRows, fileBackedRows] = await Promise.all([
inlineRowsPromise,
fileBackedRowsPromise,
]);
const byId = new Map<string, (typeof inlineRows)[number]>();
for (const row of [...inlineRows, ...fileBackedRows]) {
const prev = byId.get(row.id);
if (!prev || row.score > prev.score) byId.set(row.id, row);
}
const merged = Array.from(byId.values())
.sort((a, b) => b.score - a.score)
.slice(0, limit);
return this.mapScoresToRelevance(merged).map((row) => ({
documentId: row.id,
fileId: row.fileId ?? undefined,
knowledgeBaseId: row.knowledgeBaseId ?? '',
relevance: row.relevance,
snippet: this.truncate(row.content, 300) ?? '',
@@ -225,6 +225,22 @@ describe('formatSearchResults', () => {
expect(result).toMatchSnapshot();
});
// A file-backed hit must render both its document id and its file id as
// attributes so the consumer can address it by either one.
it('should expose fileId attribute on file-backed documents', () => {
  const pdfHit: DocumentSearchResult = {
    documentId: 'docs_pdf_xyz',
    fileId: 'file_pdf_xyz',
    knowledgeBaseId: 'kb_research',
    relevance: 1.4,
    snippet: 'Attention is all you need...',
    title: 'Attention Paper',
  };

  // No file chunks are passed; only the single BM25 document hit is formatted.
  const xml = formatSearchResults([], 'attention', [pdfHit]);

  expect(xml).toContain('fileId="file_pdf_xyz"');
  expect(xml).toContain('id="docs_pdf_xyz"');
});
it('should annotate when vector search fails but BM25 succeeds', () => {
const documents: DocumentSearchResult[] = [
{
@@ -12,6 +12,7 @@ export interface FileSearchResult {
export interface DocumentSearchResult {
documentId: string;
fileId?: string;
knowledgeBaseId: string;
relevance: number;
snippet: string;
@@ -42,12 +43,13 @@ ${chunks.join('\n')}
};
/**
 * Formats a single document search result (BM25 hit on a KB document) with XML tags.
 *
 * Documents return only a snippet — the agent should call readKnowledge with the
 * docs_* id (or the file_* id when present, for parsed-file documents) to fetch
 * the full content.
 *
 * @param doc - The document hit to render.
 * @returns A `<document>` element carrying `id`, an optional `fileId`, `title`,
 * `relevance` and `knowledgeBaseId` attributes, wrapping a `<snippet>` child.
 *
 * NOTE(review): values are interpolated verbatim, without XML escaping.
 */
const formatDocument = (doc: DocumentSearchResult): string => {
  // Emit the fileId attribute only when the hit has one; otherwise omit it entirely.
  const fileIdAttr = doc.fileId ? ` fileId="${doc.fileId}"` : '';
  return `<document id="${doc.documentId}"${fileIdAttr} title="${doc.title}" relevance="${doc.relevance}" knowledgeBaseId="${doc.knowledgeBaseId}">
<snippet>${doc.snippet}</snippet>
</document>`;
};