diff --git a/packages/builtin-tool-knowledge-base/src/types.ts b/packages/builtin-tool-knowledge-base/src/types.ts index 4e20f67c2c..933ed77e13 100644 --- a/packages/builtin-tool-knowledge-base/src/types.ts +++ b/packages/builtin-tool-knowledge-base/src/types.ts @@ -24,12 +24,14 @@ export interface SearchKnowledgeBaseArgs { } /** - * BM25 hit on a custom/document inside a knowledge base. + * BM25 hit on a document inside a knowledge base. Covers both inline + * `custom/document` pages and file-backed documents (parsed PDFs and the like). * Mirrors database/repositories/search KnowledgeBaseDocumentHit; redeclared * here to keep this package decoupled from server-only types. */ export interface KnowledgeBaseDocumentResult { documentId: string; + fileId?: string; knowledgeBaseId: string; relevance: number; snippet: string; diff --git a/packages/database/src/repositories/search/index.test.ts b/packages/database/src/repositories/search/index.test.ts index 9fd96b70c1..f8e00672c1 100644 --- a/packages/database/src/repositories/search/index.test.ts +++ b/packages/database/src/repositories/search/index.test.ts @@ -6,7 +6,7 @@ import { documents } from '../../schemas'; import type { NewAgent } from '../../schemas/agent'; import { agents } from '../../schemas/agent'; import type { NewFile } from '../../schemas/file'; -import { files, knowledgeBases } from '../../schemas/file'; +import { files, knowledgeBaseFiles, knowledgeBases } from '../../schemas/file'; import { messages } from '../../schemas/message'; import type { NewTopic } from '../../schemas/topic'; import { topics } from '../../schemas/topic'; @@ -967,6 +967,127 @@ describe.skipIf(!isServerDB)('SearchRepo', () => { expect(r.relevance).toBeLessThanOrEqual(3); }); }); + + describe('file-backed documents (PDF / parsed files)', () => { + const pdfFileId = 'file-bm25-pdf-1'; + const pdfDocId = 'docs-bm25-pdf-1'; + const folderDocId = 'docs-bm25-folder-1'; + const otherUserFileId = 'file-bm25-other-1'; + const otherUserDocId = 'docs-bm25-other-1'; + + beforeEach(async () => { + await serverDB.insert(files).values([ + { + fileType: 'application/pdf', + id: pdfFileId, + name: 'transformers-paper.pdf', + size: 2048, + url: 's3://bucket/transformers-paper.pdf', + userId, + }, + { + fileType: 'application/pdf', + id: otherUserFileId, + name: 'leak-check.pdf', + size: 2048, + url: 's3://bucket/leak-check.pdf', + userId: otherUserId, + }, + ]); + + await serverDB.insert(knowledgeBaseFiles).values([ + { fileId: pdfFileId, knowledgeBaseId: kbA, userId }, + { fileId: otherUserFileId, knowledgeBaseId: 'kb-other-1', userId: otherUserId }, + ]); + + await serverDB.insert(documents).values([ + { + content: + 'Attention is all you need. The Transformer architecture relies on self-attention ' + + 'and replaces recurrence with parallel multi-head attention layers.', + fileId: pdfFileId, + fileType: 'application/pdf', + filename: 'transformers-paper.pdf', + id: pdfDocId, + source: 's3://bucket/transformers-paper.pdf', + sourceType: 'file', + title: 'Attention Is All You Need', + totalCharCount: 200, + totalLineCount: 5, + userId, + }, + { + content: '', + fileType: 'custom/folder', + filename: 'a folder', + id: folderDocId, + knowledgeBaseId: kbA, + source: 'internal://folder/placeholder', + sourceType: 'api', + title: 'Transformer Folder', + totalCharCount: 0, + totalLineCount: 0, + userId, + }, + { + content: + 'Attention paper in another user knowledge base — must never surface for current user.', + fileId: otherUserFileId, + fileType: 'application/pdf', + filename: 'leak-check.pdf', + id: otherUserDocId, + source: 's3://bucket/leak-check.pdf', + sourceType: 'file', + title: 'Attention Leak Check', + totalCharCount: 100, + totalLineCount: 3, + userId: otherUserId, + }, + ]); + }); + + it('returns a PDF-backed document hit via knowledge_base_files join', async () => { + const results = await searchRepo.searchKnowledgeBaseDocuments('attention transformer', [ + kbA, + ]); + const pdfHit = results.find((r) => r.documentId === pdfDocId); + expect(pdfHit).toBeDefined(); + expect(pdfHit?.knowledgeBaseId).toBe(kbA); + expect(pdfHit?.fileId).toBe(pdfFileId); + expect(pdfHit?.title).toBe('Attention Is All You Need'); + }); + + it('still matches inline custom/document hits in the same call', async () => { + await serverDB.insert(documents).values({ + content: 'Attention transformer notes written inline for KB-A', + fileType: 'custom/document', + filename: 'inline-notes.md', + knowledgeBaseId: kbA, + source: 'internal://document/placeholder', + sourceType: 'api', + title: 'Inline Attention Notes', + totalCharCount: 60, + totalLineCount: 2, + userId, + }); + + const results = await searchRepo.searchKnowledgeBaseDocuments('attention', [kbA]); + expect(results.some((r) => r.title === 'Inline Attention Notes')).toBe(true); + expect(results.some((r) => r.documentId === pdfDocId)).toBe(true); + }); + + it('excludes folder documents even when they match the query', async () => { + const results = await searchRepo.searchKnowledgeBaseDocuments('transformer folder', [kbA]); + expect(results.every((r) => r.documentId !== folderDocId)).toBe(true); + }); + + it('does not surface another user PDF when querying their KB', async () => { + const results = await searchRepo.searchKnowledgeBaseDocuments('attention', [ + 'kb-other-1', + ]); + expect(results).toEqual([]); + }); + }); }); describe('search - context types', () => { diff --git a/packages/database/src/repositories/search/index.ts b/packages/database/src/repositories/search/index.ts index a758b8ab57..cc26430fcc 100644 --- a/packages/database/src/repositories/search/index.ts +++ b/packages/database/src/repositories/search/index.ts @@ -134,12 +134,17 @@ export interface KnowledgeBaseSearchResult extends BaseSearchResult { } /** - * BM25 hit for KB-scoped documents (custom/document) used by chunkRouter.semanticSearchForChat. - * Distinct from PageSearchResult — this carries snippet + KB id for agent tool consumption. + * BM25 hit for KB-scoped documents used by chunkRouter.semanticSearchForChat. + * Covers both inline `custom/document` pages and file-backed documents + * (e.g. parsed PDFs) joined through `knowledge_base_files`. + * + * `fileId` is present when the hit comes from a parsed-file document, letting + * the agent fetch the original via readKnowledge with either docs_* or file_*. * `relevance` is normalized to [1, 3] (lower = better, matches BaseSearchResult semantics). */ export interface KnowledgeBaseDocumentHit { documentId: string; + fileId?: string; knowledgeBaseId: string; relevance: number; snippet: string; @@ -679,9 +684,19 @@ export class SearchRepo { } /** - * KB-scoped BM25 search over custom/document documents. - * Used by chunkRouter.semanticSearchForChat to surface inline documents - * to the KB agent tool's searchKnowledgeBase API. + * KB-scoped BM25 search over documents. + * + * Covers two routes to the KB scope, executed as two separate ParadeDB + * scoring queries that we merge in JS: + * - inline pages: `documents.knowledge_base_id` directly references the KB + * - file-backed docs (e.g. parsed PDFs): joined through `knowledge_base_files` + * via `documents.file_id` + * + * Two queries instead of an `OR`-ed WHERE clause because `paradedb.score()` + * requires a tantivy index scan, and ParadeDB rejects disjunctive shapes + * spanning bm25 and non-bm25 predicates ("Unsupported query shape"). + * + * Folder rows (DOCUMENT_FOLDER_TYPE) are excluded — they carry no content. */ async searchKnowledgeBaseDocuments( query: string, @@ -693,9 +708,14 @@ export class SearchRepo { const bm25Query = sanitizeBm25Query(query); - const rows = await this.db + const matchClause = sql`(${documents.title} @@@ ${bm25Query} OR ${documents.slug} @@@ ${bm25Query} OR ${documents.content} @@@ ${bm25Query})`; + const folderClause = ne(documents.fileType, DOCUMENT_FOLDER_TYPE); + const userClause = eq(documents.userId, this.userId); + + const inlineRowsPromise = this.db .select({ content: documents.content, + fileId: documents.fileId, filename: documents.filename, id: documents.id, knowledgeBaseId: documents.knowledgeBaseId, @@ -706,17 +726,56 @@ export class SearchRepo { .from(documents) .where( and( - eq(documents.userId, this.userId), - eq(documents.fileType, 'custom/document'), + userClause, + folderClause, inArray(documents.knowledgeBaseId, knowledgeBaseIds), - sql`(${documents.title} @@@ ${bm25Query} OR ${documents.slug} @@@ ${bm25Query} OR ${documents.content} @@@ ${bm25Query})`, + matchClause, ), ) .orderBy(sql`paradedb.score(${documents.id}) DESC`) .limit(limit); - return this.mapScoresToRelevance(rows).map((row) => ({ + const fileBackedRowsPromise = this.db + .select({ + content: documents.content, + fileId: documents.fileId, + filename: documents.filename, + id: documents.id, + knowledgeBaseId: knowledgeBaseFiles.knowledgeBaseId, + score: sql`paradedb.score(${documents.id})`, + title: documents.title, + updatedAt: documents.updatedAt, + }) + .from(documents) + .innerJoin( + knowledgeBaseFiles, + and( + eq(knowledgeBaseFiles.fileId, documents.fileId), + eq(knowledgeBaseFiles.userId, this.userId), + inArray(knowledgeBaseFiles.knowledgeBaseId, knowledgeBaseIds), + ), + ) + .where(and(userClause, folderClause, matchClause)) + .orderBy(sql`paradedb.score(${documents.id}) DESC`) + .limit(limit); + + const [inlineRows, fileBackedRows] = await Promise.all([ + inlineRowsPromise, + fileBackedRowsPromise, + ]); + + const byId = new Map(); + for (const row of [...inlineRows, ...fileBackedRows]) { + const prev = byId.get(row.id); + if (!prev || row.score > prev.score) byId.set(row.id, row); + } + const merged = Array.from(byId.values()) + .sort((a, b) => b.score - a.score) + .slice(0, limit); + + return this.mapScoresToRelevance(merged).map((row) => ({ documentId: row.id, + fileId: row.fileId ?? undefined, knowledgeBaseId: row.knowledgeBaseId ?? '', relevance: row.relevance, snippet: this.truncate(row.content, 300) ?? '', diff --git a/packages/prompts/src/prompts/knowledgeBaseQA/formatSearchResults.test.ts b/packages/prompts/src/prompts/knowledgeBaseQA/formatSearchResults.test.ts index c699c98ff9..b015087b67 100644 --- a/packages/prompts/src/prompts/knowledgeBaseQA/formatSearchResults.test.ts +++ b/packages/prompts/src/prompts/knowledgeBaseQA/formatSearchResults.test.ts @@ -225,6 +225,22 @@ describe('formatSearchResults', () => { expect(result).toMatchSnapshot(); }); + it('should expose fileId attribute on file-backed documents', () => { + const documents: DocumentSearchResult[] = [ + { + documentId: 'docs_pdf_xyz', + fileId: 'file_pdf_xyz', + knowledgeBaseId: 'kb_research', + relevance: 1.4, + snippet: 'Attention is all you need...', + title: 'Attention Paper', + }, + ]; + const result = formatSearchResults([], 'attention', documents); + expect(result).toContain('fileId="file_pdf_xyz"'); + expect(result).toContain('id="docs_pdf_xyz"'); + }); + it('should annotate when vector search fails but BM25 succeeds', () => { const documents: DocumentSearchResult[] = [ { diff --git a/packages/prompts/src/prompts/knowledgeBaseQA/formatSearchResults.ts b/packages/prompts/src/prompts/knowledgeBaseQA/formatSearchResults.ts index 1a467c3d4f..f42c2bb57f 100644 --- a/packages/prompts/src/prompts/knowledgeBaseQA/formatSearchResults.ts +++ b/packages/prompts/src/prompts/knowledgeBaseQA/formatSearchResults.ts @@ -12,6 +12,7 @@ export interface FileSearchResult { export interface DocumentSearchResult { documentId: string; + fileId?: string; knowledgeBaseId: string; relevance: number; snippet: string; @@ -42,12 +43,13 @@ ${chunks.join('\n')} }; /** - * Formats a single document search result (BM25 hit on custom/document) with XML tags. + * Formats a single document search result (BM25 hit on a KB document) with XML tags. * Documents return only a snippet — agent should call readKnowledge with the docs_* id - * to fetch the full content. + * (or file_* id when present, for parsed-file documents) to fetch the full content. */ const formatDocument = (doc: DocumentSearchResult): string => { - return ` + const fileIdAttr = doc.fileId ? ` fileId="${doc.fileId}"` : ''; + return ` ${doc.snippet} `; };