mirror of
https://github.com/lobehub/lobe-chat.git
synced 2026-06-14 03:30:19 +00:00
✨ feat: add more provider support for search & crawl (#8033)
* ✨ feat: support tavily as search provider * ✨ feat: support tavily as crawl provider * 🐛 fix: fix category mapping * ✨ feat: support firecrwal as crawl provider * ✨ feat: support firecrawl as search provider * 🔨 chore: support firecrawl baseUrl for self-host * 🐛 fix: fix build error * ✨ feat: support jina as search provider * 🐛 fix: fix build error * ✨ feat: support BoChaAI & Exa as search & crawl provider * 🐛 fix: fix build error * 🔨 chore: rename to bocha * 🐛 fix: fix typo in bocha impl * ♻️ refactor: add `category` & `time_range` support for Exa --------- Co-authored-by: Arvin Xu <arvinx@foxmail.com>
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
import { CrawlImpl, CrawlSuccessResult } from '../type';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
/** One entry of the `results` array returned by the Exa `/contents` endpoint. */
interface ExaResults {
  /** Extracted page text; the `exa` crawler returns it as `content`. */
  text: string;
  /** Page title; the `exa` crawler returns it as `title`. */
  title: string;
  /** URL reported by Exa; preferred over the requested URL when non-empty. */
  url: string;

  // Optional metadata that the `exa` crawler does not read.
  author?: string;
  favicon?: string;
  id?: string;
  image?: string;
  publishedDate?: string;
  summary?: string;
}

/** The part of the Exa `/contents` response body that the crawler consumes. */
interface ExaResponse {
  requestId?: string;
  results: Array<ExaResults>;
}
|
||||
|
||||
/**
 * Crawl a single page through the Exa `/contents` endpoint.
 *
 * The API key is read from `EXA_API_KEY`. Resolves to a `CrawlSuccessResult`
 * when Exa returns at least 100 characters of text for the page. Resolves to
 * `undefined` when there are no results, when the text is missing or shorter
 * than 100 characters, or when processing the response body throws (the error
 * is logged, not rethrown).
 *
 * @throws NetworkConnectionError when the request rejects with the message `fetch failed`
 * @throws PageNotFoundError when Exa answers with HTTP 404
 * @throws Error for any other non-OK status; any other request error
 *   (including `TimeoutError`) is rethrown unchanged
 */
export const exa: CrawlImpl = async (url) => {
  const apiKey = process.env.EXA_API_KEY;

  let res: Response;

  try {
    const pending = fetch('https://api.exa.ai/contents', {
      body: JSON.stringify({
        livecrawl: 'fallback', // always, fallback
        text: true,
        urls: [url],
      }),
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': apiKey || '',
      },
      method: 'POST',
    });

    res = await withTimeout(pending, DEFAULT_TIMEOUT);
  } catch (e) {
    // A "fetch failed" message is translated into the crawler's own error type.
    if ((e as Error).message === 'fetch failed') throw new NetworkConnectionError();

    // Timeouts, like every other failure, propagate to the caller untouched.
    if (e instanceof TimeoutError) throw e;

    throw e;
  }

  if (!res.ok) {
    throw res.status === 404
      ? new PageNotFoundError(res.statusText)
      : new Error(`Exa request failed with status ${res.status}: ${res.statusText}`);
  }

  try {
    const payload = (await res.json()) as ExaResponse;
    const hits = payload.results;

    if (!hits || hits.length === 0) {
      console.warn('Exa API returned no results for URL:', url);
      return;
    }

    const page = hits[0];
    const text = page.text;

    // Treat empty or very short extractions as "nothing useful crawled".
    if (!text || text.length < 100) return;

    return {
      content: text,
      contentType: 'text',
      length: text.length,
      siteName: new URL(url).hostname,
      title: page.title,
      url: page.url || url,
    } satisfies CrawlSuccessResult;
  } catch (error) {
    // Best effort: a body that cannot be processed yields `undefined`.
    console.error(error);
  }

  return;
};
|
||||
@@ -0,0 +1,97 @@
|
||||
import { CrawlImpl, CrawlSuccessResult } from '../type';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
/** Page metadata attached to a Firecrawl `/scrape` result. */
interface FirecrawlMetadata {
  /** Returned by the `firecrawl` crawler as `title`. */
  title: string;
  /** Returned by the `firecrawl` crawler as `description`. */
  description: string;
  keywords: string;
  language: string;
  robots: string;
  sourceURL: string;
  statusCode: number;

  // Optional Open Graph fields; the `firecrawl` crawler does not read them.
  ogDescription?: string;
  ogImage?: string;
  ogLocaleAlternate?: Array<string>;
  ogSiteName?: string;
  ogTitle?: string;
  ogUrl?: string;
}

/** Scraped document; only `markdown` is requested by the `firecrawl` crawler. */
interface FirecrawlResults {
  html?: string;
  markdown?: string;
  metadata: FirecrawlMetadata;
}

/** Envelope of the Firecrawl `/scrape` response body. */
interface FirecrawlResponse {
  data: FirecrawlResults;
  success: boolean;
}
|
||||
|
||||
/**
 * Crawl a single page through the Firecrawl `/scrape` endpoint.
 *
 * Reads the API key from `FIRECRAWL_API_KEY` and the base URL from
 * `FIRECRAWL_URL` (defaults to the hosted `https://api.firecrawl.dev/v1`).
 * Resolves to a `CrawlSuccessResult` when at least 100 characters of markdown
 * are returned. Resolves to `undefined` when the response carries no data,
 * when the markdown is missing or too short, or when processing the response
 * body throws (the error is logged, not rethrown).
 *
 * @throws NetworkConnectionError when the request rejects with the message `fetch failed`
 * @throws PageNotFoundError when Firecrawl answers with HTTP 404
 * @throws Error for any other non-OK status; any other request error
 *   (including `TimeoutError`) is rethrown unchanged
 */
export const firecrawl: CrawlImpl = async (url) => {
  const apiKey = process.env.FIRECRAWL_API_KEY;

  // Strip trailing slashes so a configured base such as `http://host:3002/v1/`
  // does not produce a `/v1//scrape` request path.
  const baseUrl = (process.env.FIRECRAWL_URL || 'https://api.firecrawl.dev/v1').replace(
    /\/+$/,
    '',
  );

  let res: Response;

  try {
    res = await withTimeout(
      fetch(`${baseUrl}/scrape`, {
        body: JSON.stringify({
          formats: ['markdown'], // ["markdown", "html"]
          url,
        }),
        headers: {
          'Authorization': apiKey ? `Bearer ${apiKey}` : '',
          'Content-Type': 'application/json',
        },
        method: 'POST',
      }),
      DEFAULT_TIMEOUT,
    );
  } catch (e) {
    const error = e as Error;

    // A "fetch failed" message is translated into the crawler's own error type.
    if (error.message === 'fetch failed') {
      throw new NetworkConnectionError();
    }

    // Timeouts, like every other failure, propagate to the caller untouched.
    if (error instanceof TimeoutError) {
      throw error;
    }

    throw e;
  }

  if (!res.ok) {
    if (res.status === 404) {
      throw new PageNotFoundError(res.statusText);
    }

    throw new Error(`Firecrawl request failed with status ${res.status}: ${res.statusText}`);
  }

  try {
    const data = (await res.json()) as FirecrawlResponse;

    // The payload is cast, not validated, so `data` can be absent at runtime.
    // Report that explicitly instead of failing with an opaque TypeError.
    if (!data.data) {
      console.warn('Firecrawl API returned no data for URL:', url);
      return;
    }

    const markdown = data.data.markdown;

    // Treat empty or very short extractions as "nothing useful crawled".
    if (!markdown || markdown.length < 100) {
      return;
    }

    return {
      content: markdown,
      contentType: 'text',
      description: data.data.metadata.description,
      length: markdown.length,
      siteName: new URL(url).hostname,
      title: data.data.metadata.title,
      url: url,
    } satisfies CrawlSuccessResult;
  } catch (error) {
    // Best effort: a body that cannot be processed yields `undefined`.
    console.error(error);
  }

  return;
};
|
||||
@@ -1,13 +1,19 @@
|
||||
import { browserless } from './browserless';
|
||||
import { exa } from './exa';
|
||||
import { firecrawl } from './firecrawl';
|
||||
import { jina } from './jina';
|
||||
import { naive } from './naive';
|
||||
import { search1api } from './search1api';
|
||||
import { tavily } from './tavily';
|
||||
|
||||
/**
 * Registry of the available crawler implementations, keyed by the name used
 * to select them.
 */
export const crawlImpls = {
  browserless,
  exa,
  firecrawl,
  jina,
  naive,
  search1api,
  tavily,
};

/** Union of the keys of {@link crawlImpls}. */
export type CrawlImplType = keyof typeof crawlImpls;
|
||||
|
||||
@@ -0,0 +1,94 @@
|
||||
import { CrawlImpl, CrawlSuccessResult } from '../type';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
/** A successfully extracted page in a Tavily `/extract` response. */
interface TavilyResults {
  /** Extracted page content; the `tavily` crawler returns it as `content`. */
  raw_content: string;
  /** URL reported by Tavily; preferred over the requested URL when non-empty. */
  url: string;
  images?: Array<string>;
}

/** A page Tavily reported under `failed_results`. */
interface TavilyFailedResults {
  url: string;
  error?: string;
}

/** Envelope of the Tavily `/extract` response body. */
interface TavilyResponse {
  base_url: string;
  failed_results?: Array<TavilyFailedResults>;
  response_time: number;
  results: Array<TavilyResults>;
}
|
||||
|
||||
/**
 * Crawl a single page through the Tavily `/extract` endpoint.
 *
 * The API key is read from `TAVILY_API_KEY` and the extraction depth from
 * `TAVILY_EXTRACT_DEPTH` (defaults to `basic`). Resolves to a
 * `CrawlSuccessResult` when at least 100 characters of raw content are
 * returned; the page hostname is used for both `siteName` and `title`.
 * Resolves to `undefined` when there are no results, when the content is
 * missing or too short, or when processing the response body throws (the
 * error is logged, not rethrown).
 *
 * @throws NetworkConnectionError when the request rejects with the message `fetch failed`
 * @throws PageNotFoundError when Tavily answers with HTTP 404
 * @throws Error for any other non-OK status; any other request error
 *   (including `TimeoutError`) is rethrown unchanged
 */
export const tavily: CrawlImpl = async (url) => {
  const apiKey = process.env.TAVILY_API_KEY;

  let res: Response;

  try {
    const pending = fetch('https://api.tavily.com/extract', {
      body: JSON.stringify({
        extract_depth: process.env.TAVILY_EXTRACT_DEPTH || 'basic', // basic or advanced
        include_images: false,
        urls: url,
      }),
      headers: {
        'Authorization': apiKey ? `Bearer ${apiKey}` : '',
        'Content-Type': 'application/json',
      },
      method: 'POST',
    });

    res = await withTimeout(pending, DEFAULT_TIMEOUT);
  } catch (e) {
    // A "fetch failed" message is translated into the crawler's own error type.
    if ((e as Error).message === 'fetch failed') throw new NetworkConnectionError();

    // Timeouts, like every other failure, propagate to the caller untouched.
    if (e instanceof TimeoutError) throw e;

    throw e;
  }

  if (!res.ok) {
    throw res.status === 404
      ? new PageNotFoundError(res.statusText)
      : new Error(`Tavily request failed with status ${res.status}: ${res.statusText}`);
  }

  try {
    const payload = (await res.json()) as TavilyResponse;
    const hits = payload.results;

    if (!hits || hits.length === 0) {
      console.warn('Tavily API returned no results for URL:', url);
      return;
    }

    const page = hits[0];
    const rawContent = page.raw_content;

    // Treat empty or very short extractions as "nothing useful crawled".
    if (!rawContent || rawContent.length < 100) return;

    return {
      content: rawContent,
      contentType: 'text',
      length: rawContent.length,
      siteName: new URL(url).hostname,
      title: new URL(url).hostname,
      url: page.url || url,
    } satisfies CrawlSuccessResult;
  } catch (error) {
    // Best effort: a body that cannot be processed yields `undefined`.
    console.error(error);
  }

  return;
};
|
||||
@@ -0,0 +1,124 @@
|
||||
import { TRPCError } from '@trpc/server';
|
||||
import debug from 'debug';
|
||||
import urlJoin from 'url-join';
|
||||
|
||||
import { SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search';
|
||||
|
||||
import { SearchServiceImpl } from '../type';
|
||||
import { BochaSearchParameters, BochaResponse } from './type';
|
||||
|
||||
const log = debug('lobe-search:Bocha');
|
||||
|
||||
/** Translates the search time-range keys into the values sent as Bocha's `freshness`. */
const timeRangeMapping = {
  day: 'oneDay',
  week: 'oneWeek',
  month: 'oneMonth',
  year: 'oneYear',
};
|
||||
|
||||
/**
 * Bocha implementation of the search service.
 *
 * Posts the query to Bocha's `/web-search` endpoint and converts the returned
 * web pages into the uniform search-result shape.
 */
export class BochaImpl implements SearchServiceImpl {
  /** API key read from `BOCHA_API_KEY`; `undefined` when not configured. */
  private get apiKey(): string | undefined {
    return process.env.BOCHA_API_KEY;
  }

  /** Root of the Bocha HTTP API. */
  private get baseUrl(): string {
    return 'https://api.bochaai.com/v1';
  }

  /**
   * Extract the hostname of a result URL without throwing.
   *
   * `new URL()` throws a `TypeError` on malformed or relative input; a single
   * bad URL must not discard every other result of the query.
   *
   * @returns the hostname, or `''` when the URL is empty or cannot be parsed
   */
  private parseHostname(rawUrl?: string): string {
    if (!rawUrl) return '';

    try {
      return new URL(rawUrl).hostname;
    } catch {
      return '';
    }
  }

  /**
   * Run a web search against Bocha.
   *
   * @param query - the search terms
   * @param params - optional filters; only `searchTimeRange` is honoured here
   * @returns the mapped results together with the request round-trip time
   * @throws TRPCError `SERVICE_UNAVAILABLE` when the request fails or the
   *   response status is not OK; `INTERNAL_SERVER_ERROR` when the response
   *   body cannot be processed
   */
  async query(query: string, params: SearchParams = {}): Promise<UniformSearchResponse> {
    log('Starting Bocha query with query: "%s", params: %o', query, params);
    const endpoint = urlJoin(this.baseUrl, '/web-search');

    const defaultQueryParams: BochaSearchParameters = {
      count: 15,
      query,
      summary: true,
    };

    const timeRange = params?.searchTimeRange;

    const body: BochaSearchParameters = {
      ...defaultQueryParams,
      // 'anytime' and unrecognised ranges leave `freshness` unset.
      freshness:
        timeRange && timeRange !== 'anytime'
          ? timeRangeMapping[timeRange as keyof typeof timeRangeMapping]
          : undefined,
    };

    log('Constructed request body: %o', body);

    let response: Response;
    const startAt = Date.now();
    let costTime = 0;
    try {
      log('Sending request to endpoint: %s', endpoint);
      response = await fetch(endpoint, {
        body: JSON.stringify(body),
        headers: {
          'Authorization': this.apiKey ? `Bearer ${this.apiKey}` : '',
          'Content-Type': 'application/json',
        },
        method: 'POST',
      });
      // Measure before logging so the figure covers only the round trip.
      costTime = Date.now() - startAt;
      log('Received response with status: %d', response.status);
    } catch (error) {
      log.extend('error')('Bocha fetch error: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'SERVICE_UNAVAILABLE',
        message: 'Failed to connect to Bocha.',
      });
    }

    if (!response.ok) {
      const errorBody = await response.text();
      log.extend('error')(
        `Bocha request failed with status ${response.status}: %s`,
        errorBody.length > 200 ? `${errorBody.slice(0, 200)}...` : errorBody,
      );
      throw new TRPCError({
        cause: errorBody,
        code: 'SERVICE_UNAVAILABLE',
        message: `Bocha request failed: ${response.statusText}`,
      });
    }

    try {
      const bochaResponse = (await response.json()) as BochaResponse;

      log('Parsed Bocha response: %o', bochaResponse);

      const mappedResults = (bochaResponse.data.webPages.value || []).map(
        (result): UniformSearchResult => ({
          category: 'general', // Bocha results are always reported as 'general'
          content: result.summary || result.snippet || '', // prefer the summary, fall back to the snippet
          engines: ['bocha'],
          parsedUrl: this.parseHostname(result.url),
          score: 1, // fixed score of 1 for every result
          title: result.name || '',
          url: result.url,
        }),
      );

      log('Mapped %d results to SearchResult format', mappedResults.length);

      return {
        costTime,
        query: query,
        resultNumbers: mappedResults.length,
        results: mappedResults,
      };
    } catch (error) {
      log.extend('error')('Error parsing Bocha response: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'INTERNAL_SERVER_ERROR',
        message: 'Failed to parse Bocha response.',
      });
    }
  }
}
|
||||
@@ -0,0 +1,47 @@
|
||||
/** Request body accepted by Bocha's `/web-search` endpoint. */
export interface BochaSearchParameters {
  /** The search terms. */
  query: string;
  count?: number;
  exclude?: string;
  freshness?: string;
  include?: string;
  summary?: boolean;
}

interface BochaQueryContext {
  originalQuery: string;
}

/** A single web page in a Bocha search response. */
interface BochaValue {
  name: string;
  url: string;

  cachedPageUrl?: string;
  dateLastCrawled?: string;
  displayUrl?: string;
  id?: string | null;
  isFamilyFriendly?: boolean;
  isNavigational?: boolean;
  language?: string;
  siteName?: string;
  snippet?: string;
  summary?: string;
}

interface BochaWebPages {
  totalEstimatedMatches?: number;
  value?: Array<BochaValue>;
  webSearchUrl?: string;
}

interface BochaData {
  webPages: BochaWebPages;

  // Left untyped in this module.
  images?: any;
  queryContext?: BochaQueryContext;
  videos?: any;
}

/** Envelope of the Bocha `/web-search` response body. */
export interface BochaResponse {
  data: BochaData;

  code?: number;
  log_id?: string;
  msg?: string | null;
}
|
||||
@@ -0,0 +1,129 @@
|
||||
import { TRPCError } from '@trpc/server';
|
||||
import debug from 'debug';
|
||||
import urlJoin from 'url-join';
|
||||
|
||||
import { SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search';
|
||||
|
||||
import { SearchServiceImpl } from '../type';
|
||||
import { ExaSearchParameters, ExaResponse } from './type';
|
||||
|
||||
const log = debug('lobe-search:Exa');
|
||||
|
||||
/**
 * Exa implementation of the search service.
 *
 * Posts the query to Exa's `/search` endpoint and converts the returned
 * results into the uniform search-result shape.
 */
export class ExaImpl implements SearchServiceImpl {
  /** API key read from `EXA_API_KEY`; `undefined` when not configured. */
  private get apiKey(): string | undefined {
    return process.env.EXA_API_KEY;
  }

  /** Root of the Exa HTTP API. */
  private get baseUrl(): string {
    return 'https://api.exa.ai';
  }

  /**
   * Extract the hostname of a result URL without throwing.
   *
   * `new URL()` throws a `TypeError` on malformed or relative input; a single
   * bad URL must not discard every other result of the query.
   *
   * @returns the hostname, or `''` when the URL is empty or cannot be parsed
   */
  private parseHostname(rawUrl?: string): string {
    if (!rawUrl) return '';

    try {
      return new URL(rawUrl).hostname;
    } catch {
      return '';
    }
  }

  /**
   * Run a search against Exa.
   *
   * @param query - the search terms
   * @param params - optional filters; `searchTimeRange` becomes a
   *   published-date window ending now, and `searchCategories` is honoured
   *   only for `news`
   * @returns the mapped results together with the request round-trip time
   * @throws TRPCError `SERVICE_UNAVAILABLE` when the request fails or the
   *   response status is not OK; `INTERNAL_SERVER_ERROR` when the response
   *   body cannot be processed
   */
  async query(query: string, params: SearchParams = {}): Promise<UniformSearchResponse> {
    log('Starting Exa query with query: "%s", params: %o', query, params);
    const endpoint = urlJoin(this.baseUrl, '/search');

    const defaultQueryParams: ExaSearchParameters = {
      numResults: 15,
      query,
      type: 'auto',
    };

    // Length of each supported time range, in days.
    const daysByRange: Record<string, number | undefined> = {
      day: 1,
      month: 30,
      week: 7,
      year: 365,
    };
    const timeRange = params?.searchTimeRange;
    const days = timeRange && timeRange !== 'anytime' ? daysByRange[timeRange] : undefined;

    const now = Date.now();
    // 'anytime' and unrecognised ranges add no date filter at all.
    const publishedWindow =
      days === undefined
        ? {}
        : {
            endPublishedDate: new Date(now).toISOString(),
            startPublishedDate: new Date(now - days * 86_400 * 1000).toISOString(),
          };

    // NOTE(review): no `contents` option is sent, so Exa may omit `text` from
    // the results and `content` below would then always be empty — confirm
    // against the Exa search API before relying on it.
    const body: ExaSearchParameters = {
      ...defaultQueryParams,
      ...publishedWindow,
      // Only the 'news' category is forwarded to Exa.
      category: params?.searchCategories?.find((cat) => cat === 'news'),
    };

    log('Constructed request body: %o', body);

    let response: Response;
    const startAt = Date.now();
    let costTime = 0;
    try {
      log('Sending request to endpoint: %s', endpoint);
      response = await fetch(endpoint, {
        body: JSON.stringify(body),
        headers: {
          'Content-Type': 'application/json',
          'x-api-key': this.apiKey ? this.apiKey : '',
        },
        method: 'POST',
      });
      // Measure before logging so the figure covers only the round trip.
      costTime = Date.now() - startAt;
      log('Received response with status: %d', response.status);
    } catch (error) {
      log.extend('error')('Exa fetch error: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'SERVICE_UNAVAILABLE',
        message: 'Failed to connect to Exa.',
      });
    }

    if (!response.ok) {
      const errorBody = await response.text();
      log.extend('error')(
        `Exa request failed with status ${response.status}: %s`,
        errorBody.length > 200 ? `${errorBody.slice(0, 200)}...` : errorBody,
      );
      throw new TRPCError({
        cause: errorBody,
        code: 'SERVICE_UNAVAILABLE',
        message: `Exa request failed: ${response.statusText}`,
      });
    }

    try {
      const exaResponse = (await response.json()) as ExaResponse;

      log('Parsed Exa response: %o', exaResponse);

      const mappedResults = (exaResponse.results || []).map(
        (result): UniformSearchResult => ({
          category: body.category || 'general', // echo the requested category, else 'general'
          content: result.text || '',
          engines: ['exa'],
          parsedUrl: this.parseHostname(result.url),
          score: result.score || 0, // missing or null scores become 0
          title: result.title || '',
          url: result.url,
        }),
      );

      log('Mapped %d results to SearchResult format', mappedResults.length);

      return {
        costTime,
        query: query,
        resultNumbers: mappedResults.length,
        results: mappedResults,
      };
    } catch (error) {
      log.extend('error')('Error parsing Exa response: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'INTERNAL_SERVER_ERROR',
        message: 'Failed to parse Exa response.',
      });
    }
  }
}
|
||||
@@ -0,0 +1,39 @@
|
||||
/** Request body accepted by Exa's `/search` endpoint. */
export interface ExaSearchParameters {
  category?: string;
  endCrawlDate?: string;
  endPublishedDate?: string;
  excludeDomains?: string[];
  excludeText?: string[];
  includeDomains?: string[];
  includeText?: string[];
  numResults?: number;
  /** The search terms. */
  query: string;
  startCrawlDate?: string;
  startPublishedDate?: string;
  type?: string;
}

interface ExaCostDollars {
  total: number;
}

/** A single hit in an Exa search response. */
interface ExaResults {
  author?: string | null;
  favicon?: string;
  id?: string;
  image?: string;
  publishedDate?: string | null;
  score?: number | null;
  /** Result summary, spelled the way the Exa crawl-side response type spells it. */
  summary?: string;
  /**
   * @deprecated Misspelling of {@link ExaResults.summary}; kept only so that
   * existing references keep compiling.
   */
  summery?: string;
  text: string;
  title: string;
  url: string;
}

/** Envelope of the Exa `/search` response body. */
export interface ExaResponse {
  costDollars?: ExaCostDollars;
  requestId?: string;
  resolvedSearchType?: string;
  results: ExaResults[];
  searchType?: string;
}
|
||||
@@ -0,0 +1,128 @@
|
||||
import { TRPCError } from '@trpc/server';
|
||||
import debug from 'debug';
|
||||
import urlJoin from 'url-join';
|
||||
|
||||
import { SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search';
|
||||
|
||||
import { SearchServiceImpl } from '../type';
|
||||
import { FirecrawlSearchParameters, FirecrawlResponse } from './type';
|
||||
|
||||
const log = debug('lobe-search:Firecrawl');
|
||||
|
||||
/** Translates the search time-range keys into the values sent as Firecrawl's `tbs`. */
const timeRangeMapping = {
  day: 'qdr:d',
  week: 'qdr:w',
  month: 'qdr:m',
  year: 'qdr:y',
};
|
||||
|
||||
/**
 * Firecrawl implementation of the search service.
 *
 * Posts the query to Firecrawl's `/search` endpoint and converts the returned
 * entries into the uniform search-result shape.
 */
export class FirecrawlImpl implements SearchServiceImpl {
  /** API key read from `FIRECRAWL_API_KEY`; `undefined` when not configured. */
  private get apiKey(): string | undefined {
    return process.env.FIRECRAWL_API_KEY;
  }

  /** Base URL; `FIRECRAWL_URL` overrides the hosted default. */
  private get baseUrl(): string {
    return process.env.FIRECRAWL_URL || 'https://api.firecrawl.dev/v1';
  }

  /**
   * Extract the hostname of a result URL without throwing.
   *
   * `new URL()` throws a `TypeError` on malformed or relative input; a single
   * bad URL must not discard every other result of the query.
   *
   * @returns the hostname, or `''` when the URL is empty or cannot be parsed
   */
  private parseHostname(rawUrl?: string): string {
    if (!rawUrl) return '';

    try {
      return new URL(rawUrl).hostname;
    } catch {
      return '';
    }
  }

  /**
   * Run a search against Firecrawl.
   *
   * @param query - the search terms
   * @param params - optional filters; only `searchTimeRange` is honoured here
   * @returns the mapped results together with the request round-trip time
   * @throws TRPCError `SERVICE_UNAVAILABLE` when the request fails or the
   *   response status is not OK; `INTERNAL_SERVER_ERROR` when the response
   *   body cannot be processed
   */
  async query(query: string, params: SearchParams = {}): Promise<UniformSearchResponse> {
    log('Starting Firecrawl query with query: "%s", params: %o', query, params);
    const endpoint = urlJoin(this.baseUrl, '/search');

    const defaultQueryParams: FirecrawlSearchParameters = {
      limit: 15,
      query,
      /*
      scrapeOptions: {
        formats: ["markdown"]
      },
      */
    };

    const timeRange = params?.searchTimeRange;

    const body: FirecrawlSearchParameters = {
      ...defaultQueryParams,
      // 'anytime' and unrecognised ranges leave `tbs` unset.
      tbs:
        timeRange && timeRange !== 'anytime'
          ? timeRangeMapping[timeRange as keyof typeof timeRangeMapping]
          : undefined,
    };

    log('Constructed request body: %o', body);

    let response: Response;
    const startAt = Date.now();
    let costTime = 0;
    try {
      log('Sending request to endpoint: %s', endpoint);
      response = await fetch(endpoint, {
        body: JSON.stringify(body),
        headers: {
          'Authorization': this.apiKey ? `Bearer ${this.apiKey}` : '',
          'Content-Type': 'application/json',
        },
        method: 'POST',
      });
      // Measure before logging so the figure covers only the round trip.
      costTime = Date.now() - startAt;
      log('Received response with status: %d', response.status);
    } catch (error) {
      log.extend('error')('Firecrawl fetch error: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'SERVICE_UNAVAILABLE',
        message: 'Failed to connect to Firecrawl.',
      });
    }

    if (!response.ok) {
      const errorBody = await response.text();
      log.extend('error')(
        `Firecrawl request failed with status ${response.status}: %s`,
        errorBody.length > 200 ? `${errorBody.slice(0, 200)}...` : errorBody,
      );
      throw new TRPCError({
        cause: errorBody,
        code: 'SERVICE_UNAVAILABLE',
        message: `Firecrawl request failed: ${response.statusText}`,
      });
    }

    try {
      const firecrawlResponse = (await response.json()) as FirecrawlResponse;

      log('Parsed Firecrawl response: %o', firecrawlResponse);

      const mappedResults = (firecrawlResponse.data || []).map(
        (result): UniformSearchResult => ({
          category: 'general', // Firecrawl results are always reported as 'general'
          content: result.description || '', // the description is the only text used
          engines: ['firecrawl'],
          parsedUrl: this.parseHostname(result.url),
          score: 1, // fixed score of 1 for every result
          title: result.title || '',
          url: result.url,
        }),
      );

      log('Mapped %d results to SearchResult format', mappedResults.length);

      return {
        costTime,
        query: query,
        resultNumbers: mappedResults.length,
        results: mappedResults,
      };
    } catch (error) {
      log.extend('error')('Error parsing Firecrawl response: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'INTERNAL_SERVER_ERROR',
        message: 'Failed to parse Firecrawl response.',
      });
    }
  }
}
|
||||
@@ -0,0 +1,35 @@
|
||||
interface FirecrawlScrapeOptions {
  formats: Array<string>;
}

/** Request body accepted by Firecrawl's `/search` endpoint. */
export interface FirecrawlSearchParameters {
  /** The search terms. */
  query: string;
  country?: string;
  lang?: string;
  limit?: number;
  scrapeOptions?: FirecrawlScrapeOptions;
  tbs?: string;
  timeout?: number;
}

interface FirecrawlMetadata {
  title: string;
  description?: string;
  sourceURL?: string;
  statusCode?: number;
}

/** A single entry in a Firecrawl search response. */
interface FirecrawlData {
  url: string;
  description?: string;
  html?: string;
  links?: Array<string>;
  markdown?: string;
  metadata?: FirecrawlMetadata;
  title?: string;
}

/** Envelope of the Firecrawl `/search` response body. */
export interface FirecrawlResponse {
  data: Array<FirecrawlData>;
  success?: boolean;
}
|
||||
@@ -1,13 +1,24 @@
|
||||
import { BochaImpl } from './bocha';
|
||||
import { ExaImpl } from './exa';
|
||||
import { FirecrawlImpl } from './firecrawl';
|
||||
import { JinaImpl } from './jina';
|
||||
import { Search1APIImpl } from './search1api';
|
||||
import { SearXNGImpl } from './searxng';
|
||||
import { TavilyImpl } from './tavily';
|
||||
|
||||
import { SearchServiceImpl } from './type';
|
||||
|
||||
/**
 * Identifiers of the search service implementations that can be selected.
 * Each member's value is the lowercase provider name.
 */
export enum SearchImplType {
  Bocha = 'bocha',
  Exa = 'exa',
  Firecrawl = 'firecrawl',
  Jina = 'jina',
  SearXNG = 'searxng',
  Search1API = 'search1api',
  Tavily = 'tavily',
}
|
||||
|
||||
/**
|
||||
@@ -17,10 +28,30 @@ export const createSearchServiceImpl = (
|
||||
type: SearchImplType = SearchImplType.SearXNG,
|
||||
): SearchServiceImpl => {
|
||||
switch (type) {
|
||||
case SearchImplType.Bocha: {
|
||||
return new BochaImpl();
|
||||
}
|
||||
|
||||
case SearchImplType.Exa: {
|
||||
return new ExaImpl();
|
||||
}
|
||||
|
||||
case SearchImplType.Firecrawl: {
|
||||
return new FirecrawlImpl();
|
||||
}
|
||||
|
||||
case SearchImplType.Jina: {
|
||||
return new JinaImpl();
|
||||
}
|
||||
|
||||
case SearchImplType.SearXNG: {
|
||||
return new SearXNGImpl();
|
||||
}
|
||||
|
||||
case SearchImplType.Tavily: {
|
||||
return new TavilyImpl();
|
||||
}
|
||||
|
||||
default: {
|
||||
return new Search1APIImpl();
|
||||
}
|
||||
|
||||
@@ -0,0 +1,109 @@
|
||||
import { TRPCError } from '@trpc/server';
|
||||
import debug from 'debug';
|
||||
import urlJoin from 'url-join';
|
||||
|
||||
import { SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search';
|
||||
|
||||
import { SearchServiceImpl } from '../type';
|
||||
import { JinaSearchParameters, JinaResponse } from './type';
|
||||
|
||||
const log = debug('lobe-search:Jina');
|
||||
|
||||
/**
 * Jina implementation of the search service.
 *
 * Posts the query to `https://s.jina.ai/` and converts the returned entries
 * into the uniform search-result shape.
 */
export class JinaImpl implements SearchServiceImpl {
  /** API key read from `JINA_READER_API_KEY`, falling back to `JINA_API_KEY`. */
  private get apiKey(): string | undefined {
    return process.env.JINA_READER_API_KEY || process.env.JINA_API_KEY;
  }

  /** Root of the Jina search API. */
  private get baseUrl(): string {
    return 'https://s.jina.ai';
  }

  /**
   * Extract the hostname of a result URL without throwing.
   *
   * `new URL()` throws a `TypeError` on malformed or relative input; a single
   * bad URL must not discard every other result of the query.
   *
   * @returns the hostname, or `''` when the URL is empty or cannot be parsed
   */
  private parseHostname(rawUrl?: string): string {
    if (!rawUrl) return '';

    try {
      return new URL(rawUrl).hostname;
    } catch {
      return '';
    }
  }

  /**
   * Run a search against Jina.
   *
   * @param query - the search terms
   * @param params - accepted for interface compatibility; only logged here
   * @returns the mapped results together with the request round-trip time
   * @throws TRPCError `SERVICE_UNAVAILABLE` when the request fails or the
   *   response status is not OK; `INTERNAL_SERVER_ERROR` when the response
   *   body cannot be processed
   */
  async query(query: string, params: SearchParams = {}): Promise<UniformSearchResponse> {
    log('Starting Jina query with query: "%s", params: %o', query, params);
    const endpoint = urlJoin(this.baseUrl, '/');

    const body: JinaSearchParameters = {
      q: query,
    };

    log('Constructed request body: %o', body);

    let response: Response;
    const startAt = Date.now();
    let costTime = 0;
    try {
      log('Sending request to endpoint: %s', endpoint);
      response = await fetch(endpoint, {
        body: JSON.stringify(body),
        headers: {
          'Accept': 'application/json',
          'Authorization': this.apiKey ? `Bearer ${this.apiKey}` : '',
          'Content-Type': 'application/json',
          // NOTE(review): presumably asks Jina to leave page bodies out of the
          // results — confirm against the Jina search API documentation.
          'X-Respond-With': 'no-content',
        },
        method: 'POST',
      });
      // Measure before logging so the figure covers only the round trip.
      costTime = Date.now() - startAt;
      log('Received response with status: %d', response.status);
    } catch (error) {
      log.extend('error')('Jina fetch error: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'SERVICE_UNAVAILABLE',
        message: 'Failed to connect to Jina.',
      });
    }

    if (!response.ok) {
      const errorBody = await response.text();
      log.extend('error')(
        `Jina request failed with status ${response.status}: %s`,
        errorBody.length > 200 ? `${errorBody.slice(0, 200)}...` : errorBody,
      );
      throw new TRPCError({
        cause: errorBody,
        code: 'SERVICE_UNAVAILABLE',
        message: `Jina request failed: ${response.statusText}`,
      });
    }

    try {
      const jinaResponse = (await response.json()) as JinaResponse;

      log('Parsed Jina response: %o', jinaResponse);

      const mappedResults = (jinaResponse.data || []).map(
        (result): UniformSearchResult => ({
          category: 'general', // Jina results are always reported as 'general'
          content: result.description || '', // the description is the only text used
          engines: ['jina'],
          parsedUrl: this.parseHostname(result.url),
          score: 1, // fixed score of 1 for every result
          title: result.title || '',
          url: result.url,
        }),
      );

      log('Mapped %d results to SearchResult format', mappedResults.length);

      return {
        costTime,
        query: query,
        resultNumbers: mappedResults.length,
        results: mappedResults,
      };
    } catch (error) {
      log.extend('error')('Error parsing Jina response: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'INTERNAL_SERVER_ERROR',
        message: 'Failed to parse Jina response.',
      });
    }
  }
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/** Request body sent to the Jina search endpoint. */
export interface JinaSearchParameters {
  /** The search terms. */
  q: string;
}

interface JinaUsage {
  tokens: number;
}

interface JinaMeta {
  usage: JinaUsage;
}

/** A single entry in a Jina search response. */
interface JinaData {
  title: string;
  url: string;
  content?: string;
  description?: string;
  usage?: JinaUsage;
}

/** Envelope of the Jina search response body. */
export interface JinaResponse {
  data: Array<JinaData>;
  code?: number;
  meta?: JinaMeta;
  status?: number;
}
|
||||
@@ -0,0 +1,124 @@
|
||||
import { TRPCError } from '@trpc/server';
|
||||
import debug from 'debug';
|
||||
import urlJoin from 'url-join';
|
||||
|
||||
import { SearchParams, UniformSearchResponse, UniformSearchResult } from '@/types/tool/search';
|
||||
|
||||
import { SearchServiceImpl } from '../type';
|
||||
import { TavilySearchParameters, TavilyResponse } from './type';
|
||||
|
||||
const log = debug('lobe-search:Tavily');
|
||||
|
||||
/**
 * Tavily implementation of the search service.
 *
 * Sends the query to Tavily's `POST /search` endpoint and converts the
 * returned results into the uniform search response shape.
 */
export class TavilyImpl implements SearchServiceImpl {
  /** API key read from `TAVILY_API_KEY`; `undefined` when the variable is unset. */
  private get apiKey(): string | undefined {
    return process.env.TAVILY_API_KEY;
  }

  /** Root URL of the Tavily API; endpoint paths are joined onto it. */
  private get baseUrl(): string {
    return 'https://api.tavily.com';
  }

  /**
   * Extract the hostname from a result URL without throwing.
   *
   * `new URL()` throws a `TypeError` for malformed or relative input. A single
   * bad URL in the provider's payload must not discard every other result, so
   * such values (and missing ones) yield an empty string instead.
   *
   * @param url - URL reported for a search result; may be missing at runtime.
   * @returns The hostname, or `''` when `url` is empty or cannot be parsed.
   */
  private static hostnameOf(url: string | undefined): string {
    if (!url) return '';

    try {
      return new URL(url).hostname;
    } catch {
      return '';
    }
  }

  /**
   * Run a web search through Tavily.
   *
   * @param query - Search text forwarded as the `query` field of the request body.
   * @param params - Optional search options. `searchTimeRange` is forwarded as
   *   `time_range` unless it is `'anytime'`; the first entry of
   *   `searchCategories` that is `'news'` or `'general'` is forwarded as `topic`.
   * @returns The mapped results together with the request duration in milliseconds.
   * @throws TRPCError `SERVICE_UNAVAILABLE` when the request cannot be sent or
   *   Tavily answers with a non-2xx status.
   * @throws TRPCError `INTERNAL_SERVER_ERROR` when the response body cannot be
   *   parsed or mapped.
   */
  async query(query: string, params: SearchParams = {}): Promise<UniformSearchResponse> {
    log('Starting Tavily query with query: "%s", params: %o', query, params);
    const endpoint = urlJoin(this.baseUrl, '/search');

    // Tavily only supports two topic types: `news` and `general`.
    const supportedTopics = ['news', 'general'];

    const body: TavilySearchParameters = {
      include_answer: false,
      include_image_descriptions: true,
      include_images: false,
      include_raw_content: false,
      max_results: 15,
      query,
      // `||` on purpose: an empty environment variable also falls back to 'basic'.
      search_depth: process.env.TAVILY_SEARCH_DEPTH || 'basic', // basic or advanced
      // `undefined` values are dropped by JSON.stringify, so they never reach Tavily.
      time_range:
        params?.searchTimeRange && params.searchTimeRange !== 'anytime'
          ? params.searchTimeRange
          : undefined,
      topic: params?.searchCategories?.find((cat) => supportedTopics.includes(cat)),
    };

    log('Constructed request body: %o', body);

    let response: Response;
    const startAt = Date.now();
    let costTime = 0;
    try {
      log('Sending request to endpoint: %s', endpoint);
      response = await fetch(endpoint, {
        body: JSON.stringify(body),
        headers: {
          'Authorization': this.apiKey ? `Bearer ${this.apiKey}` : '',
          'Content-Type': 'application/json',
        },
        method: 'POST',
      });
      // Measure as soon as the request settles so logging does not inflate the figure.
      costTime = Date.now() - startAt;
      log('Received response with status: %d', response.status);
    } catch (error) {
      log.extend('error')('Tavily fetch error: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'SERVICE_UNAVAILABLE',
        message: 'Failed to connect to Tavily.',
      });
    }

    if (!response.ok) {
      const errorBody = await response.text();
      // Truncate the logged body; the full text is still attached as the error cause.
      log.extend('error')(
        `Tavily request failed with status ${response.status}: %s`,
        errorBody.length > 200 ? `${errorBody.slice(0, 200)}...` : errorBody,
      );
      throw new TRPCError({
        cause: errorBody,
        code: 'SERVICE_UNAVAILABLE',
        message: `Tavily request failed: ${response.statusText}`,
      });
    }

    try {
      const tavilyResponse = (await response.json()) as TavilyResponse;

      log('Parsed Tavily response: %o', tavilyResponse);

      const mappedResults = (tavilyResponse.results || []).map(
        (result): UniformSearchResult => ({
          // Tavily does not report a category per result; reuse the requested topic.
          category: body.topic || 'general',
          content: result.content ?? '',
          engines: ['tavily'],
          parsedUrl: TavilyImpl.hostnameOf(result.url),
          score: result.score ?? 0,
          title: result.title ?? '',
          url: result.url,
        }),
      );

      log('Mapped %d results to SearchResult format', mappedResults.length);

      return {
        costTime,
        query: query,
        resultNumbers: mappedResults.length,
        results: mappedResults,
      };
    } catch (error) {
      log.extend('error')('Error parsing Tavily response: %o', error);
      throw new TRPCError({
        cause: error,
        code: 'INTERNAL_SERVER_ERROR',
        message: 'Failed to parse Tavily response.',
      });
    }
  }
}
|
||||
@@ -0,0 +1,36 @@
|
||||
/**
 * JSON body sent to Tavily's `/search` endpoint.
 *
 * Only `query` is required; every other field is optional.
 */
export interface TavilySearchParameters {
  chunks_per_source?: number;
  days?: number;
  exclude_domains?: string[];
  include_answer?: boolean | string;
  include_domains?: string[];
  include_image_descriptions?: boolean;
  include_images?: boolean;
  include_raw_content?: boolean;
  /** Upper bound on the number of results requested. */
  max_results?: number;
  /** Search text. */
  query: string;
  /** Search depth; `TavilyImpl` sends `basic` unless overridden (`basic` or `advanced`). */
  search_depth?: string;
  /** Time window for results; `TavilyImpl` omits it when the caller asks for `anytime`. */
  time_range?: string;
  /** Topic of the search; `TavilyImpl` only ever sends `news` or `general`. */
  topic?: string;
}
|
||||
|
||||
/**
 * One entry of the `images` array in a Tavily response.
 */
interface TavilyImages {
  /** Optional description text for the image. */
  description?: string;
  /** URL of the image. */
  url: string;
}
|
||||
|
||||
/**
 * One entry of the `results` array in a Tavily response.
 *
 * Only `url` is required; consumers must supply their own defaults for the
 * optional fields.
 */
interface TavilyResults {
  /** Optional content text for the result. */
  content?: string;
  /** Optional raw content; may be explicitly `null`. */
  raw_content?: string | null;
  /** Optional numeric score for the result. */
  score?: number;
  /** Optional title of the result. */
  title?: string;
  /** URL of the result. */
  url: string;
}
|
||||
|
||||
/**
 * Shape of the JSON body returned by Tavily's `/search` endpoint.
 */
export interface TavilyResponse {
  /** Optional answer text. */
  answer?: string;
  /** Optional image entries. */
  images?: TavilyImages[];
  /** Query string carried in the response. */
  query: string;
  /**
   * Response time reported by Tavily.
   * NOTE(review): unit is not visible here — confirm against the Tavily API reference.
   */
  response_time: number;
  /** Search result entries. */
  results: TavilyResults[];
}
|
||||
Reference in New Issue
Block a user