From 23fade3cf80d00a75c6ea76b7b8eb80a44a06da8 Mon Sep 17 00:00:00 2001 From: Zhijie He Date: Tue, 3 Jun 2025 21:38:36 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20add=20more=20provider=20sup?= =?UTF-8?q?port=20for=20search=20&=20crawl=20(#8033)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ✨ feat: support tavily as search provider * ✨ feat: support tavily as crawl provider * 🐛 fix: fix category mapping * ✨ feat: support firecrawl as crawl provider * ✨ feat: support firecrawl as search provider * 🔨 chore: support firecrawl baseUrl for self-host * 🐛 fix: fix build error * ✨ feat: support jina as search provider * 🐛 fix: fix build error * ✨ feat: support BoChaAI & Exa as search & crawl provider * 🐛 fix: fix build error * 🔨 chore: rename to bocha * 🐛 fix: fix typo in bocha impl * ♻️ refactor: add `category` & `time_range` support for Exa --------- Co-authored-by: Arvin Xu --- packages/web-crawler/src/crawImpl/exa.ts | 93 +++++++++++++ .../web-crawler/src/crawImpl/firecrawl.ts | 97 +++++++++++++ packages/web-crawler/src/crawImpl/index.ts | 6 + packages/web-crawler/src/crawImpl/tavily.ts | 94 +++++++++++++ .../services/search/impls/bocha/index.ts | 124 +++++++++++++++++ .../services/search/impls/bocha/type.ts | 47 +++++++ src/server/services/search/impls/exa/index.ts | 129 ++++++++++++++++++ src/server/services/search/impls/exa/type.ts | 39 ++++++ .../services/search/impls/firecrawl/index.ts | 128 +++++++++++++++++ .../services/search/impls/firecrawl/type.ts | 35 +++++ src/server/services/search/impls/index.ts | 31 +++++ .../services/search/impls/jina/index.ts | 109 +++++++++++++++ src/server/services/search/impls/jina/type.ts | 26 ++++ .../services/search/impls/tavily/index.ts | 124 +++++++++++++++++ .../services/search/impls/tavily/type.ts | 36 +++++ 15 files changed, 1118 insertions(+) create mode 100644 packages/web-crawler/src/crawImpl/exa.ts create mode 100644
packages/web-crawler/src/crawImpl/firecrawl.ts create mode 100644 packages/web-crawler/src/crawImpl/tavily.ts create mode 100644 src/server/services/search/impls/bocha/index.ts create mode 100644 src/server/services/search/impls/bocha/type.ts create mode 100644 src/server/services/search/impls/exa/index.ts create mode 100644 src/server/services/search/impls/exa/type.ts create mode 100644 src/server/services/search/impls/firecrawl/index.ts create mode 100644 src/server/services/search/impls/firecrawl/type.ts create mode 100644 src/server/services/search/impls/jina/index.ts create mode 100644 src/server/services/search/impls/jina/type.ts create mode 100644 src/server/services/search/impls/tavily/index.ts create mode 100644 src/server/services/search/impls/tavily/type.ts diff --git a/packages/web-crawler/src/crawImpl/exa.ts b/packages/web-crawler/src/crawImpl/exa.ts new file mode 100644 index 0000000000..fdb2a6ba9f --- /dev/null +++ b/packages/web-crawler/src/crawImpl/exa.ts @@ -0,0 +1,93 @@ +import { CrawlImpl, CrawlSuccessResult } from '../type'; +import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType'; +import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; + +interface ExaResults { + author?: string; + favicon?: string; + id?: string; + image?: string; + publishedDate?: string; + summary?: string; + text: string; + title: string; + url: string; +} + +interface ExaResponse { + requestId?: string; + results: ExaResults[]; +} + +export const exa: CrawlImpl = async (url) => { + // Get API key from environment variable + const apiKey = process.env.EXA_API_KEY; + + let res: Response; + + try { + res = await withTimeout( + fetch('https://api.exa.ai/contents', { + body: JSON.stringify({ + livecrawl: 'fallback', // always, fallback + text: true, + urls: [url], + }), + headers: { + 'Content-Type': 'application/json', + 'x-api-key': !apiKey ? 
'' : apiKey, + }, + method: 'POST', + }), + DEFAULT_TIMEOUT, + ); + } catch (e) { + const error = e as Error; + if (error.message === 'fetch failed') { + throw new NetworkConnectionError(); + } + + if (error instanceof TimeoutError) { + throw error; + } + + throw e; + } + + if (!res.ok) { + if (res.status === 404) { + throw new PageNotFoundError(res.statusText); + } + + throw new Error(`Exa request failed with status ${res.status}: ${res.statusText}`); + } + + try { + const data = (await res.json()) as ExaResponse; + + if (!data.results || data.results.length === 0) { + console.warn( 'Exa API returned no results for URL:', url ) + return + } + + const firstResult = data.results[0]; + + // Check if content is empty or too short + if (!firstResult.text || firstResult.text.length < 100) { + return; + } + + return { + content: firstResult.text, + contentType: 'text', + length: firstResult.text.length, + siteName: new URL(url).hostname, + title: firstResult.title, + url: firstResult.url || url, + } satisfies CrawlSuccessResult; + } catch (error) { + console.error(error); + } + + return; +}; diff --git a/packages/web-crawler/src/crawImpl/firecrawl.ts b/packages/web-crawler/src/crawImpl/firecrawl.ts new file mode 100644 index 0000000000..642cab87dc --- /dev/null +++ b/packages/web-crawler/src/crawImpl/firecrawl.ts @@ -0,0 +1,97 @@ +import { CrawlImpl, CrawlSuccessResult } from '../type'; +import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType'; +import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; + +interface FirecrawlMetadata { + description: string; + keywords: string; + language: string; + ogDescription?: string; + ogImage?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogTitle?: string; + ogUrl?: string; + robots: string; + statusCode: number; + sourceURL: string; + title: string; +} + +interface FirecrawlResults { + html?: string; + markdown?: string; + metadata: FirecrawlMetadata; +} + 
+interface FirecrawlResponse { + success: boolean; + data: FirecrawlResults; +} + +export const firecrawl: CrawlImpl = async (url) => { + // Get API key from environment variable + const apiKey = process.env.FIRECRAWL_API_KEY; + const baseUrl = process.env.FIRECRAWL_URL || 'https://api.firecrawl.dev/v1'; + + let res: Response; + + try { + res = await withTimeout( + fetch(`${baseUrl}/scrape`, { + body: JSON.stringify({ + formats: ["markdown"], // ["markdown", "html"] + url, + }), + headers: { + 'Authorization': !apiKey ? '' : `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + method: 'POST', + }), + DEFAULT_TIMEOUT, + ); + } catch (e) { + const error = e as Error; + if (error.message === 'fetch failed') { + throw new NetworkConnectionError(); + } + + if (error instanceof TimeoutError) { + throw error; + } + + throw e; + } + + if (!res.ok) { + if (res.status === 404) { + throw new PageNotFoundError(res.statusText); + } + + throw new Error(`Firecrawl request failed with status ${res.status}: ${res.statusText}`); + } + + try { + const data = (await res.json()) as FirecrawlResponse; + + // Check if content is empty or too short + if (!data.data.markdown || data.data.markdown.length < 100) { + return; + } + + return { + content: data.data.markdown, + contentType: 'text', + description: data.data.metadata.description, + length: data.data.markdown.length, + siteName: new URL(url).hostname, + title: data.data.metadata.title, + url: url, + } satisfies CrawlSuccessResult; + } catch (error) { + console.error(error); + } + + return; +}; diff --git a/packages/web-crawler/src/crawImpl/index.ts b/packages/web-crawler/src/crawImpl/index.ts index c58d77b2d8..208b44d525 100644 --- a/packages/web-crawler/src/crawImpl/index.ts +++ b/packages/web-crawler/src/crawImpl/index.ts @@ -1,13 +1,19 @@ import { browserless } from './browserless'; +import { exa } from './exa'; +import { firecrawl } from './firecrawl'; import { jina } from './jina'; import { naive } from './naive'; 
import { search1api } from './search1api'; +import { tavily } from './tavily'; export const crawlImpls = { browserless, + exa, + firecrawl, jina, naive, search1api, + tavily, }; export type CrawlImplType = keyof typeof crawlImpls; diff --git a/packages/web-crawler/src/crawImpl/tavily.ts b/packages/web-crawler/src/crawImpl/tavily.ts new file mode 100644 index 0000000000..3f1f860e17 --- /dev/null +++ b/packages/web-crawler/src/crawImpl/tavily.ts @@ -0,0 +1,94 @@ +import { CrawlImpl, CrawlSuccessResult } from '../type'; +import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType'; +import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout'; + +interface TavilyResults { + images?: string[]; + raw_content: string; + url: string; +} + +interface TavilyFailedResults { + error?: string; + url: string; +} + +interface TavilyResponse { + base_url: string; + failed_results?: TavilyFailedResults[]; + response_time: number; + results: TavilyResults[]; +} + +export const tavily: CrawlImpl = async (url) => { + // Get API key from environment variable + const apiKey = process.env.TAVILY_API_KEY; + + let res: Response; + + try { + res = await withTimeout( + fetch('https://api.tavily.com/extract', { + body: JSON.stringify({ + extract_depth: process.env.TAVILY_EXTRACT_DEPTH || 'basic', // basic or advanced + include_images: false, + urls: url, + }), + headers: { + 'Authorization': !apiKey ? 
'' : `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + method: 'POST', + }), + DEFAULT_TIMEOUT, + ); + } catch (e) { + const error = e as Error; + if (error.message === 'fetch failed') { + throw new NetworkConnectionError(); + } + + if (error instanceof TimeoutError) { + throw error; + } + + throw e; + } + + if (!res.ok) { + if (res.status === 404) { + throw new PageNotFoundError(res.statusText); + } + + throw new Error(`Tavily request failed with status ${res.status}: ${res.statusText}`); + } + + try { + const data = (await res.json()) as TavilyResponse; + + if (!data.results || data.results.length === 0) { + console.warn( 'Tavily API returned no results for URL:', url ) + return + } + + const firstResult = data.results[0]; + + // Check if content is empty or too short + if (!firstResult.raw_content || firstResult.raw_content.length < 100) { + return; + } + + return { + content: firstResult.raw_content, + contentType: 'text', + length: firstResult.raw_content.length, + siteName: new URL(url).hostname, + title: new URL(url).hostname, + url: firstResult.url || url, + } satisfies CrawlSuccessResult; + } catch (error) { + console.error(error); + } + + return; +}; diff --git a/src/server/services/search/impls/bocha/index.ts b/src/server/services/search/impls/bocha/index.ts new file mode 100644 index 0000000000..8e9bf5e993 --- /dev/null +++ b/src/server/services/search/impls/bocha/index.ts @@ -0,0 +1,124 @@ +import { TRPCError } from '@trpc/server'; +import debug from 'debug'; +import urlJoin from 'url-join'; + +import { SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search'; + +import { SearchServiceImpl } from '../type'; +import { BochaSearchParameters, BochaResponse } from './type'; + +const log = debug('lobe-search:Bocha'); + +const timeRangeMapping = { + day: 'oneDay', + month: 'oneMonth', + week: 'oneWeek', + year: 'oneYear', +}; + +/** + * Bocha implementation of the search service + * Primarily used for web crawling 
+ */ +export class BochaImpl implements SearchServiceImpl { + private get apiKey(): string | undefined { + return process.env.BOCHA_API_KEY; + } + + private get baseUrl(): string { + // Assuming the base URL is consistent with the crawl endpoint + return 'https://api.bochaai.com/v1'; + } + + async query(query: string, params: SearchParams = {}): Promise { + log('Starting Bocha query with query: "%s", params: %o', query, params); + const endpoint = urlJoin(this.baseUrl, '/web-search'); + + const defaultQueryParams: BochaSearchParameters = { + count: 15, + query, + summary: true, + }; + + let body: BochaSearchParameters = { + ...defaultQueryParams, + freshness: + params?.searchTimeRange && params.searchTimeRange !== 'anytime' + ? timeRangeMapping[params.searchTimeRange as keyof typeof timeRangeMapping] ?? undefined + : undefined, + }; + + log('Constructed request body: %o', body); + + let response: Response; + const startAt = Date.now(); + let costTime = 0; + try { + log('Sending request to endpoint: %s', endpoint); + response = await fetch(endpoint, { + body: JSON.stringify(body), + headers: { + 'Authorization': this.apiKey ? `Bearer ${this.apiKey}` : '', + 'Content-Type': 'application/json', + }, + method: 'POST', + }); + log('Received response with status: %d', response.status); + costTime = Date.now() - startAt; + } catch (error) { + log.extend('error')('Bocha fetch error: %o', error); + throw new TRPCError({ + cause: error, + code: 'SERVICE_UNAVAILABLE', + message: 'Failed to connect to Bocha.', + }); + } + + if (!response.ok) { + const errorBody = await response.text(); + log.extend('error')( + `Bocha request failed with status ${response.status}: %s`, + errorBody.length > 200 ? 
`${errorBody.slice(0, 200)}...` : errorBody, + ); + throw new TRPCError({ + cause: errorBody, + code: 'SERVICE_UNAVAILABLE', + message: `Bocha request failed: ${response.statusText}`, + }); + } + + try { + const bochaResponse = (await response.json()) as BochaResponse; + + log('Parsed Bocha response: %o', bochaResponse); + + const mappedResults = (bochaResponse.data.webPages.value || []).map( + (result): UniformSearchResult => ({ + category: 'general', // Default category + content: result.summary || result.snippet || '', // Prioritize content, fallback to snippet + engines: ['bocha'], // Use 'bocha' as the engine name + parsedUrl: result.url ? new URL(result.url).hostname : '', // Basic URL parsing + score: 1, // Default score to 1 + title: result.name || '', + url: result.url, + }), + ); + + log('Mapped %d results to SearchResult format', mappedResults.length); + + return { + costTime, + query: query, + resultNumbers: mappedResults.length, + results: mappedResults, + }; + } catch (error) { + log.extend('error')('Error parsing Bocha response: %o', error); + throw new TRPCError({ + cause: error, + code: 'INTERNAL_SERVER_ERROR', + message: 'Failed to parse Bocha response.', + }); + } + } +} diff --git a/src/server/services/search/impls/bocha/type.ts b/src/server/services/search/impls/bocha/type.ts new file mode 100644 index 0000000000..3fe7bfef03 --- /dev/null +++ b/src/server/services/search/impls/bocha/type.ts @@ -0,0 +1,47 @@ +export interface BochaSearchParameters { + count?: number; + exclude?: string; + freshness?: string; + include?: string; + query: string; + summary?: boolean; +} + +interface BochaQueryContext { + originalQuery: string; +} + +interface BochaValue { + cachedPageUrl?: string; + dateLastCrawled?: string; + displayUrl?: string; + id?: string | null; + isFamilyFriendly?: boolean; + isNavigational?: boolean; + language?: string; + name: string; + siteName?: string; + snippet?: string; + summary?: string; + url: string; +} + +interface 
BochaWebPages { + totalEstimatedMatches?: number; + value?: BochaValue[]; + webSearchUrl?: string; +} + +interface BochaData { + images?: any; + queryContext?: BochaQueryContext; + videos?: any; + webPages: BochaWebPages; +} + +export interface BochaResponse { + code?: number; + data: BochaData; + log_id?: string; + msg?: string | null; +} diff --git a/src/server/services/search/impls/exa/index.ts b/src/server/services/search/impls/exa/index.ts new file mode 100644 index 0000000000..c73427859c --- /dev/null +++ b/src/server/services/search/impls/exa/index.ts @@ -0,0 +1,129 @@ +import { TRPCError } from '@trpc/server'; +import debug from 'debug'; +import urlJoin from 'url-join'; + +import { SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search'; + +import { SearchServiceImpl } from '../type'; +import { ExaSearchParameters, ExaResponse } from './type'; + +const log = debug('lobe-search:Exa'); + +/** + * Exa implementation of the search service + * Primarily used for web crawling + */ +export class ExaImpl implements SearchServiceImpl { + private get apiKey(): string | undefined { + return process.env.EXA_API_KEY; + } + + private get baseUrl(): string { + // Assuming the base URL is consistent with the crawl endpoint + return 'https://api.exa.ai'; + } + + async query(query: string, params: SearchParams = {}): Promise { + log('Starting Exa query with query: "%s", params: %o', query, params); + const endpoint = urlJoin(this.baseUrl, '/search'); + + const defaultQueryParams: ExaSearchParameters = { + numResults: 15, + query, + type: 'auto', + }; + + let body: ExaSearchParameters = { + ...defaultQueryParams, + ...(params?.searchTimeRange && params.searchTimeRange !== 'anytime' + ? 
(() => { + const now = Date.now(); + const days = { day: 1, month: 30, week: 7, year: 365 }[params.searchTimeRange!]; + + if (days === undefined) return {}; + + return { + endPublishedDate: new Date(now).toISOString(), + startPublishedDate: new Date(now - days * 86_400 * 1000).toISOString(), + }; + })() + : {}), + category: + // Exa 只支持 news 类型 + params?.searchCategories?.filter(cat => ['news'].includes(cat))?.[0], + }; + + log('Constructed request body: %o', body); + + let response: Response; + const startAt = Date.now(); + let costTime = 0; + try { + log('Sending request to endpoint: %s', endpoint); + response = await fetch(endpoint, { + body: JSON.stringify(body), + headers: { + 'Content-Type': 'application/json', + 'x-api-key': this.apiKey ? this.apiKey : '', + }, + method: 'POST', + }); + log('Received response with status: %d', response.status); + costTime = Date.now() - startAt; + } catch (error) { + log.extend('error')('Exa fetch error: %o', error); + throw new TRPCError({ + cause: error, + code: 'SERVICE_UNAVAILABLE', + message: 'Failed to connect to Exa.', + }); + } + + if (!response.ok) { + const errorBody = await response.text(); + log.extend('error')( + `Exa request failed with status ${response.status}: %s`, + errorBody.length > 200 ? `${errorBody.slice(0, 200)}...` : errorBody, + ); + throw new TRPCError({ + cause: errorBody, + code: 'SERVICE_UNAVAILABLE', + message: `Exa request failed: ${response.statusText}`, + }); + } + + try { + const exaResponse = (await response.json()) as ExaResponse; + + log('Parsed Exa response: %o', exaResponse); + + const mappedResults = (exaResponse.results || []).map( + (result): UniformSearchResult => ({ + category: body.category || 'general', // Default category + content: result.text || '', // Prioritize content, fallback to snippet + engines: ['exa'], // Use 'exa' as the engine name + parsedUrl: result.url ? 
new URL(result.url).hostname : '', // Basic URL parsing + score: result.score || 0, // Default score to 0 if undefined + title: result.title || '', + url: result.url, + }), + ); + + log('Mapped %d results to SearchResult format', mappedResults.length); + + return { + costTime, + query: query, + resultNumbers: mappedResults.length, + results: mappedResults, + }; + } catch (error) { + log.extend('error')('Error parsing Exa response: %o', error); + throw new TRPCError({ + cause: error, + code: 'INTERNAL_SERVER_ERROR', + message: 'Failed to parse Exa response.', + }); + } + } +} diff --git a/src/server/services/search/impls/exa/type.ts b/src/server/services/search/impls/exa/type.ts new file mode 100644 index 0000000000..9937a62fbc --- /dev/null +++ b/src/server/services/search/impls/exa/type.ts @@ -0,0 +1,39 @@ +export interface ExaSearchParameters { + category?: string; + endCrawlDate?: string; + endPublishedDate?: string; + excludeDomains?: string[]; + excludeText?: string[]; + includeDomains?: string[]; + includeText?: string[]; + numResults?: number; + query: string; + startCrawlDate?: string; + startPublishedDate?: string; + type?: string; +} + +interface ExaCostDollars { + total: number; +} + +interface ExaResults { + author?: string | null; + favicon?: string; + id?: string; + image?: string; + publishedDate?: string | null; + score?: number | null; + summery?: string; + text: string; + title: string; + url: string; +} + +export interface ExaResponse { + costDollars?: ExaCostDollars; + requestId?: string; + resolvedSearchType?: string; + results: ExaResults[]; + searchType?: string; +} diff --git a/src/server/services/search/impls/firecrawl/index.ts b/src/server/services/search/impls/firecrawl/index.ts new file mode 100644 index 0000000000..5227a3fc4e --- /dev/null +++ b/src/server/services/search/impls/firecrawl/index.ts @@ -0,0 +1,128 @@ +import { TRPCError } from '@trpc/server'; +import debug from 'debug'; +import urlJoin from 'url-join'; + +import { 
SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search'; + +import { SearchServiceImpl } from '../type'; +import { FirecrawlSearchParameters, FirecrawlResponse } from './type'; + +const log = debug('lobe-search:Firecrawl'); + +const timeRangeMapping = { + day: 'qdr:d', + month: 'qdr:m', + week: 'qdr:w', + year: 'qdr:y', +}; + +/** + * Firecrawl implementation of the search service + * Primarily used for web crawling + */ +export class FirecrawlImpl implements SearchServiceImpl { + private get apiKey(): string | undefined { + return process.env.FIRECRAWL_API_KEY; + } + + private get baseUrl(): string { + // Assuming the base URL is consistent with the crawl endpoint + return process.env.FIRECRAWL_URL || 'https://api.firecrawl.dev/v1'; + } + + async query(query: string, params: SearchParams = {}): Promise { + log('Starting Firecrawl query with query: "%s", params: %o', query, params); + const endpoint = urlJoin(this.baseUrl, '/search'); + + const defaultQueryParams: FirecrawlSearchParameters = { + limit: 15, + query, + /* + scrapeOptions: { + formats: ["markdown"] + }, + */ + }; + + let body: FirecrawlSearchParameters = { + ...defaultQueryParams, + tbs: + params?.searchTimeRange && params.searchTimeRange !== 'anytime' + ? timeRangeMapping[params.searchTimeRange as keyof typeof timeRangeMapping] ?? undefined + : undefined, + }; + + log('Constructed request body: %o', body); + + let response: Response; + const startAt = Date.now(); + let costTime = 0; + try { + log('Sending request to endpoint: %s', endpoint); + response = await fetch(endpoint, { + body: JSON.stringify(body), + headers: { + 'Authorization': this.apiKey ? 
`Bearer ${this.apiKey}` : '', + 'Content-Type': 'application/json', + }, + method: 'POST', + }); + log('Received response with status: %d', response.status); + costTime = Date.now() - startAt; + } catch (error) { + log.extend('error')('Firecrawl fetch error: %o', error); + throw new TRPCError({ + cause: error, + code: 'SERVICE_UNAVAILABLE', + message: 'Failed to connect to Firecrawl.', + }); + } + + if (!response.ok) { + const errorBody = await response.text(); + log.extend('error')( + `Firecrawl request failed with status ${response.status}: %s`, + errorBody.length > 200 ? `${errorBody.slice(0, 200)}...` : errorBody, + ); + throw new TRPCError({ + cause: errorBody, + code: 'SERVICE_UNAVAILABLE', + message: `Firecrawl request failed: ${response.statusText}`, + }); + } + + try { + const firecrawlResponse = (await response.json()) as FirecrawlResponse; + + log('Parsed Firecrawl response: %o', firecrawlResponse); + + const mappedResults = (firecrawlResponse.data || []).map( + (result): UniformSearchResult => ({ + category: 'general', // Default category + content: result.description || '', // Prioritize content, fallback to snippet + engines: ['firecrawl'], // Use 'firecrawl' as the engine name + parsedUrl: result.url ? 
new URL(result.url).hostname : '', // Basic URL parsing + score: 1, // Default score to 1 + title: result.title || '', + url: result.url, + }), + ); + + log('Mapped %d results to SearchResult format', mappedResults.length); + + return { + costTime, + query: query, + resultNumbers: mappedResults.length, + results: mappedResults, + }; + } catch (error) { + log.extend('error')('Error parsing Firecrawl response: %o', error); + throw new TRPCError({ + cause: error, + code: 'INTERNAL_SERVER_ERROR', + message: 'Failed to parse Firecrawl response.', + }); + } + } +} diff --git a/src/server/services/search/impls/firecrawl/type.ts b/src/server/services/search/impls/firecrawl/type.ts new file mode 100644 index 0000000000..f7a53969dc --- /dev/null +++ b/src/server/services/search/impls/firecrawl/type.ts @@ -0,0 +1,35 @@ +interface FirecrawlScrapeOptions { + formats: string[]; +} + +export interface FirecrawlSearchParameters { + country?: string; + lang?: string; + limit?: number; + query: string; + scrapeOptions?: FirecrawlScrapeOptions; + tbs?: string; + timeout?: number; +} + +interface FirecrawlMetadata { + description?: string; + sourceURL?: string; + statusCode?: number; + title: string; +} + +interface FirecrawlData { + description?: string; + html?: string; + links?: string[]; + markdown?: string; + metadata?: FirecrawlMetadata; + title?: string; + url: string; +} + +export interface FirecrawlResponse { + data: FirecrawlData[]; + success?: boolean; +} diff --git a/src/server/services/search/impls/index.ts b/src/server/services/search/impls/index.ts index 77607c9810..a71a38507d 100644 --- a/src/server/services/search/impls/index.ts +++ b/src/server/services/search/impls/index.ts @@ -1,13 +1,24 @@ +import { BochaImpl } from './bocha'; +import { ExaImpl } from './exa'; +import { FirecrawlImpl } from './firecrawl'; +import { JinaImpl } from './jina'; import { Search1APIImpl } from './search1api'; import { SearXNGImpl } from './searxng'; +import { TavilyImpl } from 
'./tavily'; + import { SearchServiceImpl } from './type'; /** * Available search service implementations */ export enum SearchImplType { + Bocha = 'bocha', + Exa = 'exa', + Firecrawl = 'firecrawl', + Jina = 'jina', SearXNG = 'searxng', Search1API = 'search1api', + Tavily = 'tavily', } /** @@ -17,10 +28,30 @@ export const createSearchServiceImpl = ( type: SearchImplType = SearchImplType.SearXNG, ): SearchServiceImpl => { switch (type) { + case SearchImplType.Bocha: { + return new BochaImpl(); + } + + case SearchImplType.Exa: { + return new ExaImpl(); + } + + case SearchImplType.Firecrawl: { + return new FirecrawlImpl(); + } + + case SearchImplType.Jina: { + return new JinaImpl(); + } + case SearchImplType.SearXNG: { return new SearXNGImpl(); } + case SearchImplType.Tavily: { + return new TavilyImpl(); + } + default: { return new Search1APIImpl(); } diff --git a/src/server/services/search/impls/jina/index.ts b/src/server/services/search/impls/jina/index.ts new file mode 100644 index 0000000000..b735aed83d --- /dev/null +++ b/src/server/services/search/impls/jina/index.ts @@ -0,0 +1,109 @@ +import { TRPCError } from '@trpc/server'; +import debug from 'debug'; +import urlJoin from 'url-join'; + +import { SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search'; + +import { SearchServiceImpl } from '../type'; +import { JinaSearchParameters, JinaResponse } from './type'; + +const log = debug('lobe-search:Jina'); + +/** + * Jina implementation of the search service + * Primarily used for web crawling + */ +export class JinaImpl implements SearchServiceImpl { + private get apiKey(): string | undefined { + return process.env.JINA_READER_API_KEY || process.env.JINA_API_KEY; + } + + private get baseUrl(): string { + // Assuming the base URL is consistent with the crawl endpoint + return 'https://s.jina.ai'; + } + + async query(query: string, params: SearchParams = {}): Promise { + log('Starting Jina query with query: "%s", params: %o', query, 
params); + const endpoint = urlJoin(this.baseUrl, '/'); + + let body: JinaSearchParameters = { + q: query, + }; + + log('Constructed request body: %o', body); + + let response: Response; + const startAt = Date.now(); + let costTime = 0; + try { + log('Sending request to endpoint: %s', endpoint); + response = await fetch(endpoint, { + body: JSON.stringify(body), + headers: { + 'Accept': 'application/json', + 'Authorization': this.apiKey ? `Bearer ${this.apiKey}` : '', + 'Content-Type': 'application/json', + 'X-Respond-With': 'no-content', + }, + method: 'POST', + }); + log('Received response with status: %d', response.status); + costTime = Date.now() - startAt; + } catch (error) { + log.extend('error')('Jina fetch error: %o', error); + throw new TRPCError({ + cause: error, + code: 'SERVICE_UNAVAILABLE', + message: 'Failed to connect to Jina.', + }); + } + + if (!response.ok) { + const errorBody = await response.text(); + log.extend('error')( + `Jina request failed with status ${response.status}: %s`, + errorBody.length > 200 ? `${errorBody.slice(0, 200)}...` : errorBody, + ); + throw new TRPCError({ + cause: errorBody, + code: 'SERVICE_UNAVAILABLE', + message: `Jina request failed: ${response.statusText}`, + }); + } + + try { + const jinaResponse = (await response.json()) as JinaResponse; + + log('Parsed Jina response: %o', jinaResponse); + + const mappedResults = (jinaResponse.data || []).map( + (result): UniformSearchResult => ({ + category: 'general', // Default category + content: result.description || '', // Prioritize content, fallback to snippet + engines: ['jina'], // Use 'jina' as the engine name + parsedUrl: result.url ? 
new URL(result.url).hostname : '', // Basic URL parsing + score: 1, // Default score to 1 + title: result.title || '', + url: result.url, + }), + ); + + log('Mapped %d results to SearchResult format', mappedResults.length); + + return { + costTime, + query: query, + resultNumbers: mappedResults.length, + results: mappedResults, + }; + } catch (error) { + log.extend('error')('Error parsing Jina response: %o', error); + throw new TRPCError({ + cause: error, + code: 'INTERNAL_SERVER_ERROR', + message: 'Failed to parse Jina response.', + }); + } + } +} diff --git a/src/server/services/search/impls/jina/type.ts b/src/server/services/search/impls/jina/type.ts new file mode 100644 index 0000000000..dbba283e25 --- /dev/null +++ b/src/server/services/search/impls/jina/type.ts @@ -0,0 +1,26 @@ +export interface JinaSearchParameters { + q: string; +} + +interface JinaUsage { + tokens: number; +} + +interface JinaMeta { + usage: JinaUsage; +} + +interface JinaData { + content?: string; + description?: string; + title: string; + url: string; + usage?: JinaUsage; +} + +export interface JinaResponse { + code?: number; + data: JinaData[]; + meta?: JinaMeta; + status?: number; +} diff --git a/src/server/services/search/impls/tavily/index.ts b/src/server/services/search/impls/tavily/index.ts new file mode 100644 index 0000000000..d900528777 --- /dev/null +++ b/src/server/services/search/impls/tavily/index.ts @@ -0,0 +1,124 @@ +import { TRPCError } from '@trpc/server'; +import debug from 'debug'; +import urlJoin from 'url-join'; + +import { SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search'; + +import { SearchServiceImpl } from '../type'; +import { TavilySearchParameters, TavilyResponse } from './type'; + +const log = debug('lobe-search:Tavily'); + +/** + * Tavily implementation of the search service + * Primarily used for web crawling + */ +export class TavilyImpl implements SearchServiceImpl { + private get apiKey(): string | undefined { + return 
process.env.TAVILY_API_KEY; + } + + private get baseUrl(): string { + // Assuming the base URL is consistent with the crawl endpoint + return 'https://api.tavily.com'; + } + + async query(query: string, params: SearchParams = {}): Promise { + log('Starting Tavily query with query: "%s", params: %o', query, params); + const endpoint = urlJoin(this.baseUrl, '/search'); + + const defaultQueryParams: TavilySearchParameters = { + include_answer: false, + include_image_descriptions: true, + include_images: false, + include_raw_content: false, + max_results: 15, + query, + search_depth: process.env.TAVILY_SEARCH_DEPTH || 'basic' // basic or advanced + }; + + let body: TavilySearchParameters = { + ...defaultQueryParams, + time_range: + params?.searchTimeRange && params.searchTimeRange !== 'anytime' + ? params.searchTimeRange + : undefined, + topic: + // Tavily 只支持 news 和 general 两种类型 + params?.searchCategories?.filter(cat => ['news', 'general'].includes(cat))?.[0], + }; + + log('Constructed request body: %o', body); + + let response: Response; + const startAt = Date.now(); + let costTime = 0; + try { + log('Sending request to endpoint: %s', endpoint); + response = await fetch(endpoint, { + body: JSON.stringify(body), + headers: { + 'Authorization': this.apiKey ? `Bearer ${this.apiKey}` : '', + 'Content-Type': 'application/json', + }, + method: 'POST', + }); + log('Received response with status: %d', response.status); + costTime = Date.now() - startAt; + } catch (error) { + log.extend('error')('Tavily fetch error: %o', error); + throw new TRPCError({ + cause: error, + code: 'SERVICE_UNAVAILABLE', + message: 'Failed to connect to Tavily.', + }); + } + + if (!response.ok) { + const errorBody = await response.text(); + log.extend('error')( + `Tavily request failed with status ${response.status}: %s`, + errorBody.length > 200 ? 
`${errorBody.slice(0, 200)}...` : errorBody, + ); + throw new TRPCError({ + cause: errorBody, + code: 'SERVICE_UNAVAILABLE', + message: `Tavily request failed: ${response.statusText}`, + }); + } + + try { + const tavilyResponse = (await response.json()) as TavilyResponse; + + log('Parsed Tavily response: %o', tavilyResponse); + + const mappedResults = (tavilyResponse.results || []).map( + (result): UniformSearchResult => ({ + category: body.topic || 'general', // Default category + content: result.content || '', // Prioritize content, fallback to snippet + engines: ['tavily'], // Use 'tavily' as the engine name + parsedUrl: result.url ? new URL(result.url).hostname : '', // Basic URL parsing + score: result.score || 0, // Default score to 0 if undefined + title: result.title || '', + url: result.url, + }), + ); + + log('Mapped %d results to SearchResult format', mappedResults.length); + + return { + costTime, + query: query, + resultNumbers: mappedResults.length, + results: mappedResults, + }; + } catch (error) { + log.extend('error')('Error parsing Tavily response: %o', error); + throw new TRPCError({ + cause: error, + code: 'INTERNAL_SERVER_ERROR', + message: 'Failed to parse Tavily response.', + }); + } + } +} diff --git a/src/server/services/search/impls/tavily/type.ts b/src/server/services/search/impls/tavily/type.ts new file mode 100644 index 0000000000..e513c66dbf --- /dev/null +++ b/src/server/services/search/impls/tavily/type.ts @@ -0,0 +1,36 @@ +export interface TavilySearchParameters { + chunks_per_source?: number; + days?: number; + exclude_domains?: string[]; + include_answer?: boolean | string; + include_domains?: string[]; + include_image_descriptions?: boolean; + include_images?: boolean; + include_raw_content?: boolean; + max_results?: number; + query: string; + search_depth?: string; + time_range?: string; + topic?: string; +} + +interface TavilyImages { + description?: string; + url: string; +} + +interface TavilyResults { + content?: string; + 
raw_content?: string | null; + score?: number; + title?: string; + url: string; +} + +export interface TavilyResponse { + answer?: string; + images?: TavilyImages[]; + query: string; + response_time: number; + results: TavilyResults[]; +}