✨ feat(kb): extend BM25 search to file-backed documents (#15247)

`searchKnowledgeBaseDocuments` only matched inline `custom/document` pages, so parsed PDFs and other file-backed documents never surfaced via the BM25 path — vector search was the sole way to retrieve them. This change runs two KB-scoped ParadeDB queries in parallel (inline via `documents.knowledge_base_id`, file-backed via a `knowledge_base_files` join on `documents.file_id`) and merges the results in JS: rows are de-duplicated by document id (keeping the higher score), sorted by score, and capped at `limit`. The two KB-scope conditions cannot be combined with `OR` in a single query: `paradedb.score()` requires a tantivy index scan, and ParadeDB rejects disjunctions that span BM25 and non-BM25 predicates with `Unsupported query shape`. Folder rows are excluded; hits now carry an optional `fileId` so the agent can read with either `docs_*` or `file_*` ids. The XML formatter exposes the new attribute downstream.
2026-06-14 03:30:19 +00:00 · 2026-05-29 01:01:47 +08:00
parent 2194b23390
commit 65113ca2a7
5 changed files with 215 additions and 15 deletions
@@ -24,12 +24,14 @@ export interface SearchKnowledgeBaseArgs {
 }

 /**
- * BM25 hit on a custom/document inside a knowledge base.
+ * BM25 hit on a document inside a knowledge base. Covers both inline
+ * `custom/document` pages and file-backed documents (parsed PDFs and the like).
 * Mirrors database/repositories/search KnowledgeBaseDocumentHit; redeclared
 * here to keep this package decoupled from server-only types.
 */
 export interface KnowledgeBaseDocumentResult {
  documentId: string;
+  fileId?: string;
  knowledgeBaseId: string;
  relevance: number;
  snippet: string;
@@ -6,7 +6,7 @@ import { documents } from '../../schemas';
 import type { NewAgent } from '../../schemas/agent';
 import { agents } from '../../schemas/agent';
 import type { NewFile } from '../../schemas/file';
-import { files, knowledgeBases } from '../../schemas/file';
+import { files, knowledgeBaseFiles, knowledgeBases } from '../../schemas/file';
 import { messages } from '../../schemas/message';
 import type { NewTopic } from '../../schemas/topic';
 import { topics } from '../../schemas/topic';
@@ -967,6 +967,127 @@ describe.skipIf(!isServerDB)('SearchRepo', () => {
        expect(r.relevance).toBeLessThanOrEqual(3);
      });
    });
+
+    describe('file-backed documents (PDF / parsed files)', () => {
+      const pdfFileId = 'file-bm25-pdf-1';
+      const pdfDocId = 'docs-bm25-pdf-1';
+      const folderDocId = 'docs-bm25-folder-1';
+      const otherUserFileId = 'file-bm25-other-1';
+      const otherUserDocId = 'docs-bm25-other-1';
+
+      beforeEach(async () => {
+        await serverDB.insert(files).values([
+          {
+            fileType: 'application/pdf',
+            id: pdfFileId,
+            name: 'transformers-paper.pdf',
+            size: 2048,
+            url: 's3://bucket/transformers-paper.pdf',
+            userId,
+          },
+          {
+            fileType: 'application/pdf',
+            id: otherUserFileId,
+            name: 'leak-check.pdf',
+            size: 2048,
+            url: 's3://bucket/leak-check.pdf',
+            userId: otherUserId,
+          },
+        ]);
+
+        await serverDB.insert(knowledgeBaseFiles).values([
+          { fileId: pdfFileId, knowledgeBaseId: kbA, userId },
+          { fileId: otherUserFileId, knowledgeBaseId: 'kb-other-1', userId: otherUserId },
+        ]);
+
+        await serverDB.insert(documents).values([
+          {
+            content:
+              'Attention is all you need. The Transformer architecture relies on self-attention ' +
+              'and replaces recurrence with parallel multi-head attention layers.',
+            fileId: pdfFileId,
+            fileType: 'application/pdf',
+            filename: 'transformers-paper.pdf',
+            id: pdfDocId,
+            source: 's3://bucket/transformers-paper.pdf',
+            sourceType: 'file',
+            title: 'Attention Is All You Need',
+            totalCharCount: 200,
+            totalLineCount: 5,
+            userId,
+          },
+          {
+            content: '',
+            fileType: 'custom/folder',
+            filename: 'a folder',
+            id: folderDocId,
+            knowledgeBaseId: kbA,
+            source: 'internal://folder/placeholder',
+            sourceType: 'api',
+            title: 'Transformer Folder',
+            totalCharCount: 0,
+            totalLineCount: 0,
+            userId,
+          },
+          {
+            content:
+              'Attention paper in another user knowledge base — must never surface for current user.',
+            fileId: otherUserFileId,
+            fileType: 'application/pdf',
+            filename: 'leak-check.pdf',
+            id: otherUserDocId,
+            source: 's3://bucket/leak-check.pdf',
+            sourceType: 'file',
+            title: 'Attention Leak Check',
+            totalCharCount: 100,
+            totalLineCount: 3,
+            userId: otherUserId,
+          },
+        ]);
+      });
+
+      it('returns a PDF-backed document hit via knowledge_base_files join', async () => {
+        const results = await searchRepo.searchKnowledgeBaseDocuments('attention transformer', [
+          kbA,
+        ]);
+        const pdfHit = results.find((r) => r.documentId === pdfDocId);
+        expect(pdfHit).toBeDefined();
+        expect(pdfHit?.knowledgeBaseId).toBe(kbA);
+        expect(pdfHit?.fileId).toBe(pdfFileId);
+        expect(pdfHit?.title).toBe('Attention Is All You Need');
+      });
+
+      it('still matches inline custom/document hits in the same call', async () => {
+        await serverDB.insert(documents).values({
+          content: 'Attention transformer notes written inline for KB-A',
+          fileType: 'custom/document',
+          filename: 'inline-notes.md',
+          knowledgeBaseId: kbA,
+          source: 'internal://document/placeholder',
+          sourceType: 'api',
+          title: 'Inline Attention Notes',
+          totalCharCount: 60,
+          totalLineCount: 2,
+          userId,
+        });
+
+        const results = await searchRepo.searchKnowledgeBaseDocuments('attention', [kbA]);
+        expect(results.some((r) => r.title === 'Inline Attention Notes')).toBe(true);
+        expect(results.some((r) => r.documentId === pdfDocId)).toBe(true);
+      });
+
+      it('excludes folder documents even when they match the query', async () => {
+        const results = await searchRepo.searchKnowledgeBaseDocuments('transformer folder', [kbA]);
+        expect(results.every((r) => r.documentId !== folderDocId)).toBe(true);
+      });
+
+      it('does not surface another user PDF when querying their KB', async () => {
+        const results = await searchRepo.searchKnowledgeBaseDocuments('attention', [
+          'kb-other-1',
+        ]);
+        expect(results).toEqual([]);
+      });
+    });
  });

  describe('search - context types', () => {
@@ -134,12 +134,17 @@ export interface KnowledgeBaseSearchResult extends BaseSearchResult {
 }

 /**
- * BM25 hit for KB-scoped documents (custom/document) used by chunkRouter.semanticSearchForChat.
- * Distinct from PageSearchResult — this carries snippet + KB id for agent tool consumption.
+ * BM25 hit for KB-scoped documents used by chunkRouter.semanticSearchForChat.
+ * Covers both inline `custom/document` pages and file-backed documents
+ * (e.g. parsed PDFs) joined through `knowledge_base_files`.
+ *
+ * `fileId` is present when the hit comes from a parsed-file document, letting
+ * the agent fetch the original via readKnowledge with either docs_* or file_*.
 * `relevance` is normalized to [1, 3] (lower = better, matches BaseSearchResult semantics).
 */
 export interface KnowledgeBaseDocumentHit {
  documentId: string;
+  fileId?: string;
  knowledgeBaseId: string;
  relevance: number;
  snippet: string;
@@ -679,9 +684,19 @@ export class SearchRepo {
  }

  /**
-   * KB-scoped BM25 search over custom/document documents.
-   * Used by chunkRouter.semanticSearchForChat to surface inline documents
-   * to the KB agent tool's searchKnowledgeBase API.
+   * KB-scoped BM25 search over documents.
+   *
+   * Covers two routes to the KB scope, executed as two separate ParadeDB
+   * scoring queries that we merge in JS:
+   *   - inline pages: `documents.knowledge_base_id` directly references the KB
+   *   - file-backed docs (e.g. parsed PDFs): joined through `knowledge_base_files`
+   *     via `documents.file_id`
+   *
+   * Two queries instead of an `OR`-ed WHERE clause because `paradedb.score()`
+   * requires a tantivy index scan, and ParadeDB rejects disjunctive shapes
+   * spanning bm25 and non-bm25 predicates ("Unsupported query shape").
+   *
+   * Folder rows (DOCUMENT_FOLDER_TYPE) are excluded — they carry no content.
   */
  async searchKnowledgeBaseDocuments(
    query: string,
@@ -693,9 +708,14 @@ export class SearchRepo {

    const bm25Query = sanitizeBm25Query(query);

-    const rows = await this.db
+    const matchClause = sql`(${documents.title} @@@ ${bm25Query} OR ${documents.slug} @@@ ${bm25Query} OR ${documents.content} @@@ ${bm25Query})`;
+    const folderClause = ne(documents.fileType, DOCUMENT_FOLDER_TYPE);
+    const userClause = eq(documents.userId, this.userId);
+
+    const inlineRowsPromise = this.db
      .select({
        content: documents.content,
+        fileId: documents.fileId,
        filename: documents.filename,
        id: documents.id,
        knowledgeBaseId: documents.knowledgeBaseId,
@@ -706,17 +726,56 @@ export class SearchRepo {
      .from(documents)
      .where(
        and(
-          eq(documents.userId, this.userId),
-          eq(documents.fileType, 'custom/document'),
+          userClause,
+          folderClause,
          inArray(documents.knowledgeBaseId, knowledgeBaseIds),
-          sql`(${documents.title} @@@ ${bm25Query} OR ${documents.slug} @@@ ${bm25Query} OR ${documents.content} @@@ ${bm25Query})`,
+          matchClause,
        ),
      )
      .orderBy(sql`paradedb.score(${documents.id}) DESC`)
      .limit(limit);

-    return this.mapScoresToRelevance(rows).map((row) => ({
+    const fileBackedRowsPromise = this.db
+      .select({
+        content: documents.content,
+        fileId: documents.fileId,
+        filename: documents.filename,
+        id: documents.id,
+        knowledgeBaseId: knowledgeBaseFiles.knowledgeBaseId,
+        score: sql<number>`paradedb.score(${documents.id})`,
+        title: documents.title,
+        updatedAt: documents.updatedAt,
+      })
+      .from(documents)
+      .innerJoin(
+        knowledgeBaseFiles,
+        and(
+          eq(knowledgeBaseFiles.fileId, documents.fileId),
+          eq(knowledgeBaseFiles.userId, this.userId),
+          inArray(knowledgeBaseFiles.knowledgeBaseId, knowledgeBaseIds),
+        ),
+      )
+      .where(and(userClause, folderClause, matchClause))
+      .orderBy(sql`paradedb.score(${documents.id}) DESC`)
+      .limit(limit);
+
+    const [inlineRows, fileBackedRows] = await Promise.all([
+      inlineRowsPromise,
+      fileBackedRowsPromise,
+    ]);
+
+    const byId = new Map<string, (typeof inlineRows)[number]>();
+    for (const row of [...inlineRows, ...fileBackedRows]) {
+      const prev = byId.get(row.id);
+      if (!prev || row.score > prev.score) byId.set(row.id, row);
+    }
+    const merged = Array.from(byId.values())
+      .sort((a, b) => b.score - a.score)
+      .slice(0, limit);
+
+    return this.mapScoresToRelevance(merged).map((row) => ({
      documentId: row.id,
+      fileId: row.fileId ?? undefined,
      knowledgeBaseId: row.knowledgeBaseId ?? '',
      relevance: row.relevance,
      snippet: this.truncate(row.content, 300) ?? '',
@@ -225,6 +225,22 @@ describe('formatSearchResults', () => {
    expect(result).toMatchSnapshot();
  });

+  it('should expose fileId attribute on file-backed documents', () => {
+    const documents: DocumentSearchResult[] = [
+      {
+        documentId: 'docs_pdf_xyz',
+        fileId: 'file_pdf_xyz',
+        knowledgeBaseId: 'kb_research',
+        relevance: 1.4,
+        snippet: 'Attention is all you need...',
+        title: 'Attention Paper',
+      },
+    ];
+    const result = formatSearchResults([], 'attention', documents);
+    expect(result).toContain('fileId="file_pdf_xyz"');
+    expect(result).toContain('id="docs_pdf_xyz"');
+  });
+
  it('should annotate when vector search fails but BM25 succeeds', () => {
    const documents: DocumentSearchResult[] = [
      {
@@ -12,6 +12,7 @@ export interface FileSearchResult {

 export interface DocumentSearchResult {
  documentId: string;
+  fileId?: string;
  knowledgeBaseId: string;
  relevance: number;
  snippet: string;
@@ -42,12 +43,13 @@ ${chunks.join('\n')}
 };

 /**
- * Formats a single document search result (BM25 hit on custom/document) with XML tags.
+ * Formats a single document search result (BM25 hit on a KB document) with XML tags.
 * Documents return only a snippet — agent should call readKnowledge with the docs_* id
- * to fetch the full content.
+ * (or file_* id when present, for parsed-file documents) to fetch the full content.
 */
 const formatDocument = (doc: DocumentSearchResult): string => {
-  return `<document id="${doc.documentId}" title="${doc.title}" relevance="${doc.relevance}" knowledgeBaseId="${doc.knowledgeBaseId}">
+  const fileIdAttr = doc.fileId ? ` fileId="${doc.fileId}"` : '';
+  return `<document id="${doc.documentId}"${fileIdAttr} title="${doc.title}" relevance="${doc.relevance}" knowledgeBaseId="${doc.knowledgeBaseId}">
 <snippet>${doc.snippet}</snippet>
 </document>`;
 };