mirror of
https://github.com/open-webui/open-webui.git
synced 2026-06-14 03:30:25 +00:00
refactor(firecrawl): use v2 API directly (#23934)
Co-authored-by: Tim Baek <tim@openwebui.com>
This commit is contained in:
@@ -1,52 +1,229 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from typing import Optional, List
|
import time
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from open_webui.retrieval.web.main import SearchResult, get_filtered_results
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from open_webui.retrieval.web.main import SearchResult
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Hosted Firecrawl endpoint used when no base URL is configured.
DEFAULT_FIRECRAWL_API_BASE_URL = 'https://api.firecrawl.dev'
# HTTP statuses treated as transient (rate limiting / server-side errors) and
# therefore retried. A frozenset keeps this shared constant immutable.
FIRECRAWL_RETRY_STATUS_CODES = frozenset({429, 500, 502, 503, 504})
# Number of retries performed after the initial attempt.
FIRECRAWL_MAX_RETRIES = 2
|
||||||
|
|
||||||
|
|
||||||
|
def build_firecrawl_url(base_url: str | None, path: str) -> str:
|
||||||
|
base_url = (base_url or DEFAULT_FIRECRAWL_API_BASE_URL).rstrip('/')
|
||||||
|
path = path.lstrip('/')
|
||||||
|
|
||||||
|
if base_url.endswith('/v2'):
|
||||||
|
return f'{base_url}/{path}'
|
||||||
|
|
||||||
|
return f'{base_url}/v2/{path}'
|
||||||
|
|
||||||
|
|
||||||
|
def build_firecrawl_headers(api_key: str | None) -> dict[str, str]:
|
||||||
|
return {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'Authorization': f'Bearer {api_key or ""}',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_firecrawl_timeout_seconds(timeout: Any) -> float | None:
|
||||||
|
if timeout in (None, ''):
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
timeout = float(timeout)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
return timeout if timeout > 0 else None
|
||||||
|
|
||||||
|
|
||||||
|
def get_firecrawl_scrape_timeout_ms(timeout: Any) -> int | None:
    """Convert a timeout setting into the millisecond value sent in scrape requests.

    Args:
        timeout: Raw timeout setting, interpreted by
            ``get_firecrawl_timeout_seconds``.

    Returns:
        The timeout in milliseconds, clamped to the range 1000..300000, or
        ``None`` when no usable timeout is configured.
    """
    timeout_seconds = get_firecrawl_timeout_seconds(timeout)
    if timeout_seconds is None:
        return None

    # Firecrawl v2 expects scrape timeouts in milliseconds.
    # Clamp while still in seconds so an infinite value is bounded before
    # int() is applied; int(float('inf')) would raise OverflowError.
    clamped_seconds = min(300.0, max(1.0, timeout_seconds))
    return int(clamped_seconds * 1000)
|
||||||
|
|
||||||
|
|
||||||
|
def get_firecrawl_client_timeout_seconds(timeout: Any, fallback: float = 60) -> float:
    """Return the local HTTP timeout, in seconds, for a Firecrawl request.

    Args:
        timeout: Raw timeout setting, interpreted by
            ``get_firecrawl_timeout_seconds``.
        fallback: Seconds to assume when no usable timeout is configured.

    Returns:
        The configured (or fallback) timeout plus a 10 second margin.
    """
    scrape_seconds = get_firecrawl_timeout_seconds(timeout)
    if not scrape_seconds:
        scrape_seconds = fallback

    # Keep the local HTTP timeout slightly above Firecrawl's scrape timeout.
    return scrape_seconds + 10
|
||||||
|
|
||||||
|
|
||||||
|
def get_firecrawl_retry_delay(headers: Any, attempt: int) -> float:
    """Return how many seconds to wait before retrying a Firecrawl request.

    A ``Retry-After`` response header is honoured when present, in either of
    its two standard forms (delay in seconds, or an HTTP-date), capped at 10
    seconds. Otherwise the delay is exponential in ``attempt`` (1, 2, 4, ...)
    and capped at 8 seconds.

    Args:
        headers: Response headers (any mapping with ``.get``), or ``None``
            when no response was received.
        attempt: Zero-based index of the attempt that just failed.

    Returns:
        The delay in seconds.
    """
    # Function-scope imports keep this helper self-contained.
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    retry_after = headers.get('Retry-After') if headers else None
    if retry_after:
        seconds = None
        try:
            seconds = float(retry_after)
        except (TypeError, ValueError):
            # Not a number: try the HTTP-date form of Retry-After.
            try:
                retry_at = parsedate_to_datetime(str(retry_after))
            except (TypeError, ValueError):
                retry_at = None
            if retry_at is not None:
                if retry_at.tzinfo is None:
                    # A naive result is treated as UTC.
                    retry_at = retry_at.replace(tzinfo=timezone.utc)
                seconds = (retry_at - datetime.now(timezone.utc)).total_seconds()

        if seconds is not None:
            return min(10.0, max(0.0, seconds))

    return min(8.0, float(2**attempt))
|
||||||
|
|
||||||
|
|
||||||
|
def request_firecrawl_json(
    method: str,
    url: str,
    *,
    headers: dict[str, str],
    json: dict[str, Any] | None = None,
    timeout: float | None = None,
    verify: bool = True,
) -> dict[str, Any]:
    """Send a request to Firecrawl and return the decoded JSON object.

    Responses whose status is in ``FIRECRAWL_RETRY_STATUS_CODES``, and
    connection errors or timeouts, are retried up to
    ``FIRECRAWL_MAX_RETRIES`` times, sleeping for
    ``get_firecrawl_retry_delay`` seconds between attempts.

    Args:
        method: HTTP method, e.g. ``'POST'``.
        url: Full endpoint URL.
        headers: Request headers.
        json: Optional JSON body.
        timeout: Client-side timeout in seconds passed to ``requests``.
        verify: Whether to verify TLS certificates.

    Returns:
        The response body decoded as a JSON object.

    Raises:
        requests.HTTPError: The final response has an error status.
        requests.ConnectionError, requests.Timeout: The last attempt failed
            at the transport level.
        ValueError: The body is not a JSON object (the decode error raised
            by ``response.json()`` is presumably a ``ValueError`` subclass
            too -- confirm for the installed ``requests`` version).
    """
    last_error = None

    for attempt in range(FIRECRAWL_MAX_RETRIES + 1):
        try:
            response = requests.request(
                method,
                url,
                headers=headers,
                json=json,
                timeout=timeout,
                verify=verify,
            )

            if response.status_code in FIRECRAWL_RETRY_STATUS_CODES and attempt < FIRECRAWL_MAX_RETRIES:
                delay = get_firecrawl_retry_delay(response.headers, attempt)
                log.warning(
                    'Firecrawl %s %s returned HTTP %s; retrying in %.1fs',
                    method,
                    url,
                    response.status_code,
                    delay,
                )
                time.sleep(delay)
                continue

            # On the last attempt a retryable status falls through to here
            # and is surfaced as an HTTPError.
            response.raise_for_status()

            payload = response.json()
            if not isinstance(payload, dict):
                # Callers use dict methods on the result; fail here with a
                # clear message instead of an AttributeError further away.
                raise ValueError(
                    f'Firecrawl {method} {url} returned a non-object JSON body ({type(payload).__name__})'
                )
            return payload
        except (requests.ConnectionError, requests.Timeout) as e:
            last_error = e
            if attempt >= FIRECRAWL_MAX_RETRIES:
                break

            delay = get_firecrawl_retry_delay(None, attempt)
            log.warning('Firecrawl %s %s failed; retrying in %.1fs: %s', method, url, delay, e)
            time.sleep(delay)

    if last_error:
        raise last_error

    # Defensive: every loop path above returns, raises or records an error.
    raise RuntimeError(f'Firecrawl {method} {url} failed without a response')
|
||||||
|
|
||||||
|
|
||||||
|
def get_firecrawl_result_url(result: dict[str, Any]) -> str:
    """Return the best available URL for a Firecrawl result.

    The lookup order is ``result['url']``, ``result['link']``, then the
    ``url``, ``sourceURL`` and ``source_url`` keys of ``result['metadata']``.

    Args:
        result: One search result or scrape ``data`` object.

    Returns:
        The first non-empty string found, or ``''`` when there is none
        (including when ``result`` or its metadata is not a dict).
    """
    if not isinstance(result, dict):
        return ''

    metadata = result.get('metadata')
    if not isinstance(metadata, dict):
        # Tolerate a missing or malformed metadata field.
        metadata = {}

    candidates = (
        result.get('url'),
        result.get('link'),
        metadata.get('url'),
        metadata.get('sourceURL'),
        metadata.get('source_url'),
    )
    for candidate in candidates:
        # Only strings honour the declared return type.
        if isinstance(candidate, str) and candidate:
            return candidate

    return ''
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_firecrawl_url(
    firecrawl_url: str,
    firecrawl_api_key: str,
    url: str,
    *,
    verify_ssl: bool = True,
    timeout: Any = None,
    params: dict[str, Any] | None = None,
) -> Document | None:
    """Scrape a single URL through the Firecrawl ``scrape`` endpoint.

    Args:
        firecrawl_url: Firecrawl API base URL.
        firecrawl_api_key: Firecrawl API key.
        url: Page to scrape.
        verify_ssl: Whether TLS certificates are verified, both for the call
            to Firecrawl and (via ``skipTlsVerification``) for the scrape.
        timeout: Scrape timeout in seconds; sent to Firecrawl in
            milliseconds when usable.
        params: Extra request fields. They are merged after the defaults and
            therefore override them; a usable ``timeout`` argument in turn
            overrides any ``timeout`` key given here.

    Returns:
        A ``Document`` holding the page markdown, with ``source`` and, when
        available, ``title`` and ``description`` metadata. ``None`` when the
        response carries no usable markdown.

    Raises:
        Exception: Whatever ``request_firecrawl_json`` raises.
    """
    payload = {
        'url': url,
        'formats': ['markdown'],
        'skipTlsVerification': not verify_ssl,
        'removeBase64Images': True,
        **(params or {}),
    }
    scrape_timeout_ms = get_firecrawl_scrape_timeout_ms(timeout)
    if scrape_timeout_ms is not None:
        payload['timeout'] = scrape_timeout_ms

    response = request_firecrawl_json(
        'POST',
        build_firecrawl_url(firecrawl_url, 'scrape'),
        headers=build_firecrawl_headers(firecrawl_api_key),
        json=payload,
        timeout=get_firecrawl_client_timeout_seconds(timeout),
        verify=verify_ssl,
    )

    # A missing or malformed payload means "no document", not a crash.
    data = response.get('data')
    if not isinstance(data, dict):
        return None

    content = data.get('markdown')
    if not isinstance(content, str) or not content.strip():
        return None

    metadata = data.get('metadata')
    if not isinstance(metadata, dict):
        metadata = {}

    # Hand the sanitised metadata to the URL lookup and fall back to the
    # requested URL when the response does not name one.
    source = get_firecrawl_result_url({**data, 'metadata': metadata}) or url
    document_metadata = {'source': source}
    for key in ('title', 'description'):
        if metadata.get(key):
            document_metadata[key] = metadata[key]

    return Document(page_content=content, metadata=document_metadata)
|
||||||
|
|
||||||
|
|
||||||
def search_firecrawl(
    firecrawl_url: str,
    firecrawl_api_key: str,
    query: str,
    count: int,
    filter_list: list[str] | None = None,
) -> list[SearchResult]:
    """Run a web search through the Firecrawl ``search`` endpoint.

    Args:
        firecrawl_url: Firecrawl API base URL.
        firecrawl_api_key: Firecrawl API key.
        query: Search query.
        count: Maximum number of results to request and return.
        filter_list: Optional filter passed to ``get_filtered_results``.

    Returns:
        Up to ``count`` results that carry a URL. Any error is logged and an
        empty list is returned instead of raising.
    """
    try:
        # Imported at call time; at module level SearchResult is only
        # imported under TYPE_CHECKING (presumably to avoid a circular
        # import -- confirm).
        from open_webui.retrieval.web.main import SearchResult, get_filtered_results

        response = request_firecrawl_json(
            'POST',
            build_firecrawl_url(firecrawl_url, 'search'),
            headers=build_firecrawl_headers(firecrawl_api_key),
            json={
                'query': query,
                'limit': count,
                'timeout': count * 3000,
                'ignoreInvalidURLs': True,
            },
            timeout=count * 3 + 10,
        )

        data = response.get('data') or {}
        if isinstance(data, dict):
            results = data.get('web') or []
        elif isinstance(data, list):
            # Flat result list, the shape the previous v1 integration read.
            results = data
        else:
            results = []
        # Drop malformed entries so one bad item cannot void the whole search.
        results = [result for result in results if isinstance(result, dict)]

        if filter_list:
            results = get_filtered_results(results, filter_list)

        search_results = []
        for result in results[:count]:
            metadata = result.get('metadata')
            if not isinstance(metadata, dict):
                metadata = {}

            url = get_firecrawl_result_url({**result, 'metadata': metadata})
            if not url:
                continue

            search_results.append(
                SearchResult(
                    link=url,
                    title=result.get('title') or metadata.get('title'),
                    snippet=result.get('description') or result.get('snippet') or metadata.get('description'),
                )
            )

        log.info(f'FireCrawl search results: {search_results}')
        return search_results
    except Exception as e:
        log.error(f'Error in FireCrawl search: {e}')
        return []
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from langchain_core.documents import Document
|
|||||||
|
|
||||||
from open_webui.retrieval.loaders.tavily import TavilyLoader
|
from open_webui.retrieval.loaders.tavily import TavilyLoader
|
||||||
from open_webui.retrieval.loaders.external_web import ExternalWebLoader
|
from open_webui.retrieval.loaders.external_web import ExternalWebLoader
|
||||||
|
from open_webui.retrieval.web.firecrawl import scrape_firecrawl_url
|
||||||
from open_webui.constants import ERROR_MESSAGES
|
from open_webui.constants import ERROR_MESSAGES
|
||||||
from open_webui.config import (
|
from open_webui.config import (
|
||||||
ENABLE_RAG_LOCAL_WEB_FETCH,
|
ENABLE_RAG_LOCAL_WEB_FETCH,
|
||||||
@@ -218,39 +219,20 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
|
|||||||
|
|
||||||
def lazy_load(self) -> Iterator[Document]:
    """Yield a ``Document`` for each URL in ``self.web_paths`` scraped via Firecrawl.

    URLs for which ``scrape_firecrawl_url`` returns ``None`` are skipped.

    When ``self.continue_on_failure`` is true, a failure on one URL is logged
    as a warning and the remaining URLs are still processed; otherwise the
    exception propagates to the caller.
    """
    for url in self.web_paths:
        # Guard each URL separately: a single try around the whole loop
        # would end the iteration at the first failure even when
        # continue_on_failure is set.
        try:
            doc = scrape_firecrawl_url(
                self.api_url,
                self.api_key,
                url,
                verify_ssl=self.verify_ssl,
                timeout=self.timeout,
                params=self.params,
            )
        except Exception as e:
            if not self.continue_on_failure:
                raise
            log.warning(f'Error extracting content from URLs with Firecrawl: {e}')
            continue

        if doc is not None:
            yield doc
|
||||||
|
|
||||||
@@ -261,7 +243,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
|
|||||||
yield doc
|
yield doc
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.continue_on_failure:
|
if self.continue_on_failure:
|
||||||
log.exception(f'Error extracting content from URLs: {e}')
|
log.warning(f'Error extracting content from URLs with Firecrawl: {e}')
|
||||||
else:
|
else:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
|||||||
@@ -145,9 +145,6 @@ pytest-docker~=3.2.5
|
|||||||
## LDAP
|
## LDAP
|
||||||
ldap3==2.9.1
|
ldap3==2.9.1
|
||||||
|
|
||||||
## Firecrawl
|
|
||||||
firecrawl-py==4.18.0
|
|
||||||
|
|
||||||
## Trace
|
## Trace
|
||||||
opentelemetry-api==1.40.0
|
opentelemetry-api==1.40.0
|
||||||
opentelemetry-sdk==1.40.0
|
opentelemetry-sdk==1.40.0
|
||||||
|
|||||||
@@ -167,7 +167,6 @@ all = [
|
|||||||
"oracledb==3.4.2",
|
"oracledb==3.4.2",
|
||||||
"colbert-ai==0.2.22",
|
"colbert-ai==0.2.22",
|
||||||
|
|
||||||
"firecrawl-py==4.18.0",
|
|
||||||
"azure-search-documents==11.6.0",
|
"azure-search-documents==11.6.0",
|
||||||
"unstructured==0.18.31",
|
"unstructured==0.18.31",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1133,22 +1133,6 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
|
{ url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "firecrawl-py"
|
|
||||||
version = "1.12.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "nest-asyncio" },
|
|
||||||
{ name = "pydantic" },
|
|
||||||
{ name = "python-dotenv" },
|
|
||||||
{ name = "requests" },
|
|
||||||
{ name = "websockets" },
|
|
||||||
]
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/18/db/e4f8ef9f0475b91b7c16a15e02fe19069d443cc5516cdefa2f9a0924a9a3/firecrawl_py-1.12.0.tar.gz", hash = "sha256:bbf883f6c774f05a5426121b85978a5f7b5ab11e614aff609f0673b097c3e553", size = 19655, upload-time = "2025-02-13T15:40:15.745Z" }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/cc/d8/301d829099082c606ed16ed2a9acd263c47a365d471b9636435bf5d858b3/firecrawl_py-1.12.0-py3-none-any.whl", hash = "sha256:2b9c549315027da32421aca2a7ca597cb05cdbb968cfe0a89f389c7bb20afa4a", size = 31854, upload-time = "2025-02-13T15:40:14.492Z" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "flask"
|
name = "flask"
|
||||||
version = "3.1.0"
|
version = "3.1.0"
|
||||||
@@ -2692,7 +2676,6 @@ dependencies = [
|
|||||||
{ name = "fake-useragent" },
|
{ name = "fake-useragent" },
|
||||||
{ name = "fastapi" },
|
{ name = "fastapi" },
|
||||||
{ name = "faster-whisper" },
|
{ name = "faster-whisper" },
|
||||||
{ name = "firecrawl-py" },
|
|
||||||
{ name = "fpdf2" },
|
{ name = "fpdf2" },
|
||||||
{ name = "ftfy" },
|
{ name = "ftfy" },
|
||||||
{ name = "gcp-storage-emulator" },
|
{ name = "gcp-storage-emulator" },
|
||||||
@@ -2803,7 +2786,6 @@ requires-dist = [
|
|||||||
{ name = "fake-useragent", specifier = "==2.1.0" },
|
{ name = "fake-useragent", specifier = "==2.1.0" },
|
||||||
{ name = "fastapi", specifier = "==0.115.7" },
|
{ name = "fastapi", specifier = "==0.115.7" },
|
||||||
{ name = "faster-whisper", specifier = "==1.1.1" },
|
{ name = "faster-whisper", specifier = "==1.1.1" },
|
||||||
{ name = "firecrawl-py", specifier = "==1.12.0" },
|
|
||||||
{ name = "fpdf2", specifier = "==2.8.2" },
|
{ name = "fpdf2", specifier = "==2.8.2" },
|
||||||
{ name = "ftfy", specifier = "==6.2.3" },
|
{ name = "ftfy", specifier = "==6.2.3" },
|
||||||
{ name = "gcp-storage-emulator", specifier = ">=2024.8.3" },
|
{ name = "gcp-storage-emulator", specifier = ">=2024.8.3" },
|
||||||
|
|||||||
Reference in New Issue
Block a user