mirror of
https://github.com/open-webui/open-webui.git
synced 2026-06-13 19:20:05 +00:00
e0d6074cd2
Co-authored-by: Tim Baek <tim@openwebui.com>
230 lines
6.7 KiB
Python
230 lines
6.7 KiB
Python
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
import requests
|
|
from langchain_core.documents import Document
|
|
|
|
if TYPE_CHECKING:
|
|
from open_webui.retrieval.web.main import SearchResult
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Base URL that build_firecrawl_url falls back to when the caller passes no base URL.
DEFAULT_FIRECRAWL_API_BASE_URL = 'https://api.firecrawl.dev'
# HTTP status codes that request_firecrawl_json treats as retryable.
FIRECRAWL_RETRY_STATUS_CODES = {429, 500, 502, 503, 504}
# Retries allowed after the first attempt, i.e. at most FIRECRAWL_MAX_RETRIES + 1 requests per call.
FIRECRAWL_MAX_RETRIES = 2
|
|
|
|
|
|
def build_firecrawl_url(base_url: str | None, path: str) -> str:
    """Join a Firecrawl base URL and an endpoint path under the ``/v2`` prefix.

    Args:
        base_url: API root; a falsy value selects DEFAULT_FIRECRAWL_API_BASE_URL.
            Trailing slashes are ignored.
        path: Endpoint path; leading slashes are ignored.

    Returns:
        ``<root>/v2/<path>``, without duplicating ``/v2`` when the root
        already ends with it.
    """
    root = (base_url or DEFAULT_FIRECRAWL_API_BASE_URL).rstrip('/')
    endpoint = path.lstrip('/')

    # Only append the version segment when the caller has not already done so.
    versioned_root = root if root.endswith('/v2') else f'{root}/v2'
    return f'{versioned_root}/{endpoint}'
|
|
|
|
|
|
def build_firecrawl_headers(api_key: str | None) -> dict[str, str]:
    """Return the JSON content-type and bearer-auth headers for a Firecrawl call.

    A falsy ``api_key`` yields an ``Authorization`` value of ``'Bearer '``
    with an empty token.
    """
    token = api_key or ""
    headers = {'Content-Type': 'application/json'}
    headers['Authorization'] = f'Bearer {token}'
    return headers
|
|
|
|
|
|
def get_firecrawl_timeout_seconds(timeout: Any) -> float | None:
    """Normalize a loosely-typed timeout setting into positive, finite seconds.

    Args:
        timeout: A number, a numeric string, ``None`` or ``''``.

    Returns:
        The timeout as a float when it is a finite value greater than zero,
        otherwise ``None`` (unset, unparsable, zero, negative, NaN or
        infinite input).
    """
    if timeout in (None, ''):
        return None

    try:
        seconds = float(timeout)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large to be represented as a float.
        return None

    # The chained comparison is False for NaN, +/-inf, zero and negatives.
    # Rejecting infinity matters because converting it to integer
    # milliseconds would raise OverflowError.
    return seconds if 0 < seconds < float('inf') else None
|
|
|
|
|
|
def get_firecrawl_scrape_timeout_ms(timeout: Any) -> int | None:
    """Convert a timeout setting into the millisecond value sent to Firecrawl.

    Returns ``None`` when get_firecrawl_timeout_seconds yields no usable
    timeout; otherwise the value in whole milliseconds, clamped to the
    range 1000..300000.
    """
    seconds = get_firecrawl_timeout_seconds(timeout)
    if seconds is not None:
        # Firecrawl v2 expects scrape timeouts in milliseconds.
        milliseconds = int(seconds * 1000)
        return min(300000, max(1000, milliseconds))

    return None
|
|
|
|
|
|
def get_firecrawl_client_timeout_seconds(timeout: Any, fallback: float = 60) -> float:
    """Return the local HTTP timeout, in seconds, for a Firecrawl request.

    Uses the normalized ``timeout`` when one is available and ``fallback``
    otherwise, then adds 10 seconds.
    """
    configured = get_firecrawl_timeout_seconds(timeout)
    base_seconds = configured if configured else fallback

    # Keep the local HTTP timeout slightly above Firecrawl's scrape timeout.
    return base_seconds + 10
|
|
|
|
|
|
def get_firecrawl_retry_delay(headers: Any, attempt: int) -> float:
    """Compute how many seconds to wait before the next retry.

    Args:
        headers: Response headers (any mapping with ``.get``) or a falsy value.
        attempt: Zero-based index of the attempt that just failed.

    Returns:
        The numeric ``Retry-After`` header clamped to 0..10 seconds when it
        is present and parses as a float; otherwise ``2 ** attempt`` seconds,
        capped at 8.
    """
    server_hint = None
    if headers:
        server_hint = headers.get('Retry-After')

    if server_hint:
        try:
            hinted_seconds = float(server_hint)
        except (TypeError, ValueError):
            # Non-numeric hint (e.g. an HTTP date): fall through to backoff.
            pass
        else:
            return min(10.0, max(0.0, hinted_seconds))

    return min(8.0, float(2**attempt))
|
|
|
|
|
|
def request_firecrawl_json(
    method: str,
    url: str,
    *,
    headers: dict[str, str],
    json: dict[str, Any] | None = None,
    timeout: float | None = None,
    verify: bool = True,
) -> dict[str, Any]:
    """Send one Firecrawl HTTP request with bounded retries and decode the JSON body.

    A request is retried, up to FIRECRAWL_MAX_RETRIES times, when the
    response status is in FIRECRAWL_RETRY_STATUS_CODES or when
    ``requests.ConnectionError`` / ``requests.Timeout`` is raised. The wait
    between attempts comes from get_firecrawl_retry_delay.

    Args:
        method: HTTP method name.
        url: Full endpoint URL.
        headers: Request headers.
        json: Optional JSON request body.
        timeout: Client-side timeout passed to ``requests``.
        verify: Whether ``requests`` verifies TLS certificates.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: From ``raise_for_status()`` on an error status
            that is not (or no longer) retried.
        requests.ConnectionError, requests.Timeout: The last such error once
            retries are exhausted.
    """
    failure = None

    for attempt in range(FIRECRAWL_MAX_RETRIES + 1):
        may_retry = attempt < FIRECRAWL_MAX_RETRIES
        try:
            response = requests.request(
                method,
                url,
                headers=headers,
                json=json,
                timeout=timeout,
                verify=verify,
            )

            if may_retry and response.status_code in FIRECRAWL_RETRY_STATUS_CODES:
                pause = get_firecrawl_retry_delay(response.headers, attempt)
                log.warning(
                    'Firecrawl %s %s returned HTTP %s; retrying in %.1fs',
                    method,
                    url,
                    response.status_code,
                    pause,
                )
                time.sleep(pause)
                continue

            # Final attempt or non-retryable status: surface HTTP errors as-is.
            response.raise_for_status()
            return response.json()
        except (requests.ConnectionError, requests.Timeout) as exc:
            failure = exc
            if not may_retry:
                break

            pause = get_firecrawl_retry_delay(None, attempt)
            log.warning('Firecrawl %s %s failed; retrying in %.1fs: %s', method, url, pause, exc)
            time.sleep(pause)

    if failure:
        raise failure

    # Defensive fallback for a loop that ended without returning or recording an error.
    raise RuntimeError(f'Firecrawl {method} {url} failed without a response')
|
|
|
|
|
|
def get_firecrawl_result_url(result: dict[str, Any]) -> str:
    """Pick the first non-empty URL field from a Firecrawl result.

    Lookup order: ``result['url']``, ``result['link']``, then
    ``url``, ``sourceURL`` and ``source_url`` inside ``result['metadata']``.

    Returns:
        The first truthy value found, or ``''`` when none is present.
    """
    metadata = result.get('metadata') or {}

    # Top-level fields take precedence over the nested metadata fields.
    for key in ('url', 'link'):
        candidate = result.get(key)
        if candidate:
            return candidate

    for key in ('url', 'sourceURL', 'source_url'):
        candidate = metadata.get(key)
        if candidate:
            return candidate

    return ''
|
|
|
|
|
|
def scrape_firecrawl_url(
    firecrawl_url: str,
    firecrawl_api_key: str,
    url: str,
    *,
    verify_ssl: bool = True,
    timeout: Any = None,
    params: dict[str, Any] | None = None,
) -> Document | None:
    """Scrape one page through Firecrawl's ``scrape`` endpoint as markdown.

    Args:
        firecrawl_url: Firecrawl API base URL.
        firecrawl_api_key: API key sent as a bearer token.
        url: Page to scrape.
        verify_ssl: Passed to ``requests`` as ``verify`` and sent, inverted,
            as ``skipTlsVerification``.
        timeout: Loosely-typed timeout setting; when usable it is sent as the
            ``timeout`` field and also sizes the client timeout.
        params: Extra payload fields; they override the defaults built here,
            except ``timeout`` when a usable timeout is configured.

    Returns:
        A ``Document`` holding the markdown, with ``source`` and, when
        present, ``title`` / ``description`` metadata. ``None`` when the
        response carries no non-blank markdown string.
    """
    body = {
        'url': url,
        'formats': ['markdown'],
        'skipTlsVerification': not verify_ssl,
        'removeBase64Images': True,
    }
    body.update(params or {})

    server_timeout_ms = get_firecrawl_scrape_timeout_ms(timeout)
    if server_timeout_ms is not None:
        body['timeout'] = server_timeout_ms

    endpoint = build_firecrawl_url(firecrawl_url, 'scrape')
    auth_headers = build_firecrawl_headers(firecrawl_api_key)
    client_timeout = get_firecrawl_client_timeout_seconds(timeout)
    reply = request_firecrawl_json(
        'POST',
        endpoint,
        headers=auth_headers,
        json=body,
        timeout=client_timeout,
        verify=verify_ssl,
    )

    page = reply.get('data') or {}
    markdown = page.get('markdown') or ''
    if not (isinstance(markdown, str) and markdown.strip()):
        return None

    page_metadata = page.get('metadata') or {}
    # Prefer the URL Firecrawl reports; fall back to the requested one.
    document_metadata = {'source': get_firecrawl_result_url(page) or url}
    for field in ('title', 'description'):
        if page_metadata.get(field):
            document_metadata[field] = page_metadata[field]

    return Document(page_content=markdown, metadata=document_metadata)
|
|
|
|
|
|
def search_firecrawl(
    firecrawl_url: str,
    firecrawl_api_key: str,
    query: str,
    count: int,
    filter_list: list[str] | None = None,
) -> list[SearchResult]:
    """Run a web search through Firecrawl's ``search`` endpoint.

    Args:
        firecrawl_url: Firecrawl API base URL.
        firecrawl_api_key: API key sent as a bearer token.
        query: Search query text.
        count: Maximum number of results; also scales the request timeouts.
        filter_list: Optional filter forwarded to ``get_filtered_results``.

    Returns:
        Up to ``count`` ``SearchResult`` items, skipping results without a
        URL. Any exception is logged and an empty list is returned instead.
    """
    try:
        endpoint = build_firecrawl_url(firecrawl_url, 'search')
        auth_headers = build_firecrawl_headers(firecrawl_api_key)
        request_body = {
            'query': query,
            'limit': count,
            'timeout': count * 3000,
            'ignoreInvalidURLs': True,
        }
        reply = request_firecrawl_json(
            'POST',
            endpoint,
            headers=auth_headers,
            json=request_body,
            timeout=count * 3 + 10,
        )
        hits = (reply.get('data') or {}).get('web') or []

        if filter_list:
            from open_webui.retrieval.web.main import get_filtered_results

            hits = get_filtered_results(hits, filter_list)

        # Imported here; at module level it is only imported under TYPE_CHECKING.
        from open_webui.retrieval.web.main import SearchResult

        search_results = []
        for hit in hits[:count]:
            link = get_firecrawl_result_url(hit)
            if link:
                hit_metadata = hit.get('metadata') or {}
                title = hit.get('title') or hit_metadata.get('title')
                snippet = hit.get('description') or hit.get('snippet') or hit_metadata.get('description')
                search_results.append(SearchResult(link=link, title=title, snippet=snippet))

        log.info(f'FireCrawl search results: {search_results}')
        return search_results
    except Exception as e:
        log.error(f'Error in FireCrawl search: {e}')
        return []
|