Files
open-webui/backend/open_webui/routers/retrieval.py
T

2701 lines
119 KiB
Python
Raw Normal View History

2024-08-28 00:10:27 +02:00
import json
import logging
import mimetypes
import os
import shutil
2025-05-10 17:54:41 +04:00
import asyncio
2025-10-07 16:20:27 -05:00
import re
2024-08-28 00:10:27 +02:00
import uuid
2024-06-07 21:18:04 -07:00
from datetime import datetime
2024-02-17 21:06:08 -08:00
from pathlib import Path
from typing import Iterator, List, Optional, Sequence, Union
2024-01-06 22:59:22 -08:00
2024-12-11 18:05:42 -08:00
from fastapi import (
Depends,
FastAPI,
2025-12-21 18:08:36 +04:00
Query,
2024-12-11 18:05:42 -08:00
File,
Form,
HTTPException,
UploadFile,
Request,
status,
APIRouter,
)
2024-09-10 02:27:50 +01:00
from fastapi.middleware.cors import CORSMiddleware
2025-02-14 07:05:10 +00:00
from fastapi.concurrency import run_in_threadpool
2024-09-10 02:27:50 +01:00
from pydantic import BaseModel
2024-10-25 21:46:14 -07:00
import tiktoken
2024-09-10 02:27:50 +01:00
2024-10-13 03:02:02 -07:00
2025-12-20 22:50:44 +09:00
from langchain_text_splitters import (
RecursiveCharacterTextSplitter,
TokenTextSplitter,
MarkdownHeaderTextSplitter,
)
2024-12-11 18:05:42 -08:00
from langchain_core.documents import Document
2025-11-09 21:06:21 -05:00
from open_webui.models.files import FileModel, FileUpdateForm, Files
2026-03-01 13:49:36 -06:00
from open_webui.utils.access_control.files import has_access_to_file
2024-12-10 00:54:13 -08:00
from open_webui.models.knowledge import Knowledges
2024-10-20 23:45:15 -07:00
from open_webui.storage.provider import Storage
2026-04-12 22:08:27 -05:00
from open_webui.internal.db import get_async_db, get_async_session
2026-04-12 14:22:11 -05:00
from sqlalchemy.ext.asyncio import AsyncSession
2024-12-11 18:05:42 -08:00
from open_webui.retrieval.vector.factory import VECTOR_DB_CLIENT
from open_webui.retrieval.vector.async_client import ASYNC_VECTOR_DB_CLIENT
2024-09-28 02:23:09 +02:00
# Document loaders
2026-04-21 15:47:32 +09:00
2024-12-11 18:05:42 -08:00
from open_webui.retrieval.loaders.youtube import YoutubeLoader
2024-09-28 02:23:09 +02:00
# Web search engines
2024-12-11 18:05:42 -08:00
from open_webui.retrieval.web.main import SearchResult
from open_webui.retrieval.web.utils import get_web_loader
2025-09-24 15:20:31 -05:00
from open_webui.retrieval.web.ollama import search_ollama_cloud
2025-09-25 14:02:46 -05:00
from open_webui.retrieval.web.perplexity_search import search_perplexity_search
2024-12-11 18:05:42 -08:00
from open_webui.retrieval.web.brave import search_brave
from open_webui.retrieval.web.kagi import search_kagi
from open_webui.retrieval.web.mojeek import search_mojeek
2025-02-10 16:44:47 +08:00
from open_webui.retrieval.web.bocha import search_bocha
2024-12-11 18:05:42 -08:00
from open_webui.retrieval.web.duckduckgo import search_duckduckgo
from open_webui.retrieval.web.google_pse import search_google_pse
from open_webui.retrieval.web.jina_search import search_jina
from open_webui.retrieval.web.searchapi import search_searchapi
2025-02-14 12:24:58 +08:00
from open_webui.retrieval.web.serpapi import search_serpapi
2024-12-11 18:05:42 -08:00
from open_webui.retrieval.web.searxng import search_searxng
from open_webui.retrieval.web.yacy import search_yacy
2024-12-11 18:05:42 -08:00
from open_webui.retrieval.web.serper import search_serper
from open_webui.retrieval.web.serply import search_serply
from open_webui.retrieval.web.serpstack import search_serpstack
from open_webui.retrieval.web.tavily import search_tavily
from open_webui.retrieval.web.bing import search_bing
2025-11-14 10:12:34 +10:00
from open_webui.retrieval.web.azure import search_azure
from open_webui.retrieval.web.exa import search_exa
from open_webui.retrieval.web.perplexity import search_perplexity
from open_webui.retrieval.web.sougou import search_sougou
2025-04-24 14:57:28 +08:00
from open_webui.retrieval.web.firecrawl import search_firecrawl
from open_webui.retrieval.web.external import search_external
2026-01-26 17:31:44 +05:00
from open_webui.retrieval.web.yandex import search_yandex
from open_webui.retrieval.web.ydc import search_youcom
2024-12-11 18:05:42 -08:00
from open_webui.retrieval.utils import (
2026-04-21 15:47:32 +09:00
build_loader_from_config,
2026-04-17 13:47:21 +09:00
filter_accessible_collections,
2025-10-07 16:20:27 -05:00
get_content_from_url,
2024-08-28 00:10:27 +02:00
get_embedding_function,
get_reranking_function,
2024-08-28 00:10:27 +02:00
get_model_path,
query_collection,
query_collection_with_hybrid_search,
query_doc,
query_doc_with_hybrid_search,
2024-02-17 21:06:08 -08:00
)
2025-09-28 20:17:27 -05:00
from open_webui.retrieval.vector.utils import filter_metadata
2024-12-11 18:05:42 -08:00
from open_webui.utils.misc import (
calculate_sha256_string,
sanitize_text_for_db,
2024-12-11 18:05:42 -08:00
)
from open_webui.utils.auth import get_admin_user, get_verified_user
from open_webui.utils.access_control import has_permission
2024-12-11 18:05:42 -08:00
2024-09-04 16:54:48 +02:00
from open_webui.config import (
2024-08-28 00:10:27 +02:00
ENV,
2024-04-25 07:49:59 -05:00
RAG_EMBEDDING_MODEL_AUTO_UPDATE,
2024-04-22 13:27:43 -05:00
RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
2024-04-25 07:49:59 -05:00
RAG_RERANKING_MODEL_AUTO_UPDATE,
2024-04-22 15:49:58 -05:00
RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
2024-08-28 00:10:27 +02:00
UPLOAD_DIR,
2024-10-28 11:33:52 +02:00
DEFAULT_LOCALE,
2025-03-30 21:55:20 -07:00
RAG_EMBEDDING_CONTENT_PREFIX,
RAG_EMBEDDING_QUERY_PREFIX,
2024-02-17 21:06:08 -08:00
)
from open_webui.env import (
DEVICE_TYPE,
DOCKER,
2026-02-12 15:25:24 -06:00
RAG_EMBEDDING_TIMEOUT,
2025-04-24 01:55:18 +09:00
SENTENCE_TRANSFORMERS_BACKEND,
SENTENCE_TRANSFORMERS_MODEL_KWARGS,
SENTENCE_TRANSFORMERS_CROSS_ENCODER_BACKEND,
SENTENCE_TRANSFORMERS_CROSS_ENCODER_MODEL_KWARGS,
SENTENCE_TRANSFORMERS_CROSS_ENCODER_SIGMOID_ACTIVATION_FUNCTION,
)
2025-04-24 01:55:18 +09:00
2024-12-11 18:05:42 -08:00
from open_webui.constants import ERROR_MESSAGES
2024-01-06 22:59:22 -08:00
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
2024-12-11 18:05:42 -08:00
##########################################
#
# Utility functions
2026-03-24 04:49:48 -05:00
# Give us this day our relevant chunks, and lead us
# not into hallucination, but deliver us from noise.
2024-12-11 18:05:42 -08:00
#
##########################################
2024-06-01 19:03:56 -07:00
2024-12-11 18:46:29 -08:00
def get_ef(
    engine: str,
    embedding_model: str,
    auto_update: bool = RAG_EMBEDDING_MODEL_AUTO_UPDATE,
):
    """Load the SentenceTransformer embedding model when one applies.

    A model is only loaded when ``engine`` is the empty string and
    ``embedding_model`` is non-empty. In every other case, and when loading
    raises, ``None`` is returned (a load failure is logged, not re-raised).

    Args:
        engine: Embedding engine identifier; only ``''`` triggers a load.
        embedding_model: Model name handed to ``get_model_path``.
        auto_update: Forwarded to ``get_model_path``.

    Returns:
        The ``SentenceTransformer`` instance, or ``None``.
    """
    wants_sentence_transformer = engine == ''
    if not (embedding_model and wants_sentence_transformer):
        return None

    # Imported here rather than at module level, so it only runs on this path.
    from sentence_transformers import SentenceTransformer

    try:
        return SentenceTransformer(
            get_model_path(embedding_model, auto_update),
            device=DEVICE_TYPE,
            trust_remote_code=RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
            backend=SENTENCE_TRANSFORMERS_BACKEND,
            model_kwargs=SENTENCE_TRANSFORMERS_MODEL_KWARGS,
        )
    except Exception as load_error:
        log.error(f'Error loading SentenceTransformer: {load_error}')
        return None
2024-04-25 07:49:59 -05:00
2024-12-11 18:46:29 -08:00
def get_rf(
    engine: str = '',
    reranking_model: Optional[str] = None,
    external_reranker_url: str = '',
    external_reranker_api_key: str = '',
    external_reranker_timeout: str = '',
    auto_update: bool = RAG_RERANKING_MODEL_AUTO_UPDATE,
):
    """Build the reranker object for the configured reranking model.

    One of three backends is chosen:

    * ``ColBERT`` when ``reranking_model`` contains ``'jinaai/jina-colbert-v2'``;
    * ``ExternalReranker`` when ``engine == 'external'``;
    * ``sentence_transformers.CrossEncoder`` otherwise.

    Args:
        engine: Reranking engine identifier; ``'external'`` selects the
            external reranker.
        reranking_model: Model name. When falsy no reranker is built.
        external_reranker_url: URL passed to ``ExternalReranker``.
        external_reranker_api_key: API key passed to ``ExternalReranker``.
        external_reranker_timeout: Timeout as a string; converted with
            ``int()``; an empty value becomes ``None``.
        auto_update: Forwarded to ``get_model_path``.

    Returns:
        The reranker instance, or ``None`` when ``reranking_model`` is falsy.

    Raises:
        ValueError: If ``external_reranker_timeout`` is non-empty but not an
            integer literal (raised by ``int()``).
        Exception: If constructing the selected backend fails; the original
            error is attached as ``__cause__``.
    """
    rf = None
    # Convert timeout string to int or None (system default)
    timeout_value = int(external_reranker_timeout) if external_reranker_timeout else None

    if not reranking_model:
        return rf

    if any(model in reranking_model for model in ['jinaai/jina-colbert-v2']):
        try:
            from open_webui.retrieval.models.colbert import ColBERT

            rf = ColBERT(
                get_model_path(reranking_model, auto_update),
                env='docker' if DOCKER else None,
            )
        except Exception as e:
            log.error(f'ColBERT: {e}')
            raise Exception(ERROR_MESSAGES.DEFAULT(e)) from e
    elif engine == 'external':
        try:
            from open_webui.retrieval.models.external import ExternalReranker

            rf = ExternalReranker(
                url=external_reranker_url,
                api_key=external_reranker_api_key,
                model=reranking_model,
                timeout=timeout_value,
            )
        except Exception as e:
            log.error(f'ExternalReranking: {e}')
            raise Exception(ERROR_MESSAGES.DEFAULT(e)) from e
    else:
        import sentence_transformers
        import torch

        try:
            rf = sentence_transformers.CrossEncoder(
                get_model_path(reranking_model, auto_update),
                device=DEVICE_TYPE,
                trust_remote_code=RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
                backend=SENTENCE_TRANSFORMERS_CROSS_ENCODER_BACKEND,
                model_kwargs=SENTENCE_TRANSFORMERS_CROSS_ENCODER_MODEL_KWARGS,
                activation_fn=(
                    torch.nn.Sigmoid()
                    if SENTENCE_TRANSFORMERS_CROSS_ENCODER_SIGMOID_ACTIVATION_FUNCTION
                    else None
                ),
            )
        except Exception as e:
            log.error(f'CrossEncoder: {e}')
            # The user-facing message stays generic; the real error is kept
            # as the chained cause.
            raise Exception(ERROR_MESSAGES.DEFAULT('CrossEncoder error')) from e

        # Safely adjust pad_token_id if missing as some models do not have this in config
        try:
            model_cfg = getattr(rf, 'model', None)
            if model_cfg and hasattr(model_cfg, 'config'):
                cfg = model_cfg.config
                if getattr(cfg, 'pad_token_id', None) is None:
                    # Fallback to eos_token_id when available
                    eos = getattr(cfg, 'eos_token_id', None)
                    if eos is not None:
                        cfg.pad_token_id = eos
                        log.debug(f'Missing pad_token_id detected; set to eos_token_id={eos}')
                    else:
                        log.warning('Neither pad_token_id nor eos_token_id present in model config')
        except Exception as e2:
            # Best effort: a failure here must not prevent returning the reranker.
            log.warning(f'Failed to adjust pad_token_id on CrossEncoder: {e2}')

    return rf
2024-04-25 07:49:59 -05:00
2024-02-17 22:29:52 -08:00
2024-12-11 18:05:42 -08:00
##########################################
#
# API routes
#
##########################################
2024-04-27 15:38:50 -04:00
2024-12-11 18:05:42 -08:00
# Router instance that the route decorators below register their handlers on.
router = APIRouter()
2024-01-06 22:07:20 -08:00
2024-01-06 23:40:51 -08:00
# Request body carrying an optional collection name.
class CollectionNameForm(BaseModel):
    # May be omitted; defaults to None.
    collection_name: Optional[str] = None
2024-01-06 22:59:22 -08:00
2024-09-28 02:29:08 +02:00
# Request body with a required URL, plus the optional collection name
# inherited from CollectionNameForm.
class ProcessUrlForm(CollectionNameForm):
    url: str
2024-03-25 23:47:08 -07:00
2025-04-06 15:45:48 -07:00
# Request body holding a required list of query strings.
class SearchForm(BaseModel):
    queries: List[str]
2024-05-06 16:39:25 +08:00
2026-03-17 17:58:01 -05:00
@router.get('/')
async def get_status(request: Request):
    # Returns {'status': True} followed by the current values of the
    # chunking / embedding / reranking settings listed below, read from
    # request.app.state.config. Key order matches the tuple order.
    config = request.app.state.config
    reported_settings = (
        'CHUNK_SIZE',
        'CHUNK_OVERLAP',
        'RAG_TEMPLATE',
        'RAG_EMBEDDING_ENGINE',
        'RAG_EMBEDDING_MODEL',
        'RAG_RERANKING_MODEL',
        'RAG_EMBEDDING_BATCH_SIZE',
        'ENABLE_ASYNC_EMBEDDING',
        'RAG_EMBEDDING_CONCURRENT_REQUESTS',
    )

    payload = {'status': True}
    for setting in reported_settings:
        payload[setting] = getattr(config, setting)
    return payload
2026-03-17 17:58:01 -05:00
@router.get('/embedding')
async def get_embedding_config(request: Request, user=Depends(get_admin_user)):
    # Admin-only (dependency: get_admin_user). Returns the embedding settings
    # plus the URL/key (and API version for Azure) of each provider, all read
    # from request.app.state.config.
    config = request.app.state.config

    payload = {'status': True}
    for setting in (
        'RAG_EMBEDDING_ENGINE',
        'RAG_EMBEDDING_MODEL',
        'RAG_EMBEDDING_BATCH_SIZE',
        'ENABLE_ASYNC_EMBEDDING',
        'RAG_EMBEDDING_CONCURRENT_REQUESTS',
    ):
        payload[setting] = getattr(config, setting)

    payload['openai_config'] = {
        'url': config.RAG_OPENAI_API_BASE_URL,
        'key': config.RAG_OPENAI_API_KEY,
    }
    payload['ollama_config'] = {
        'url': config.RAG_OLLAMA_BASE_URL,
        'key': config.RAG_OLLAMA_API_KEY,
    }
    payload['azure_openai_config'] = {
        'url': config.RAG_AZURE_OPENAI_BASE_URL,
        'key': config.RAG_AZURE_OPENAI_API_KEY,
        'version': config.RAG_AZURE_OPENAI_API_VERSION,
    }
    return payload
2024-04-14 19:15:39 -04:00
# URL and key pair; used as the `openai_config` field of
# EmbeddingModelUpdateForm. Both fields are required.
class OpenAIConfigForm(BaseModel):
    url: str
    key: str
2024-11-18 14:19:56 -08:00
# URL and key pair; used as the `ollama_config` field of
# EmbeddingModelUpdateForm. Both fields are required.
class OllamaConfigForm(BaseModel):
    url: str
    key: str
2025-05-19 22:58:04 -04:00
# URL, key and API version; used as the `azure_openai_config` field of
# EmbeddingModelUpdateForm. All three fields are required.
class AzureOpenAIConfigForm(BaseModel):
    url: str
    key: str
    version: str
2024-02-19 11:05:45 -08:00
# Request body for the '/embedding/update' route.
class EmbeddingModelUpdateForm(BaseModel):
    # Optional per-provider connection settings; a provider left as None is
    # not touched by the update handler.
    openai_config: Optional[OpenAIConfigForm] = None
    ollama_config: Optional[OllamaConfigForm] = None
    azure_openai_config: Optional[AzureOpenAIConfigForm] = None

    # Required engine / model selection.
    RAG_EMBEDDING_ENGINE: str
    RAG_EMBEDDING_MODEL: str

    # Tuning knobs with their defaults.
    RAG_EMBEDDING_BATCH_SIZE: Optional[int] = 1
    ENABLE_ASYNC_EMBEDDING: Optional[bool] = True
    RAG_EMBEDDING_CONCURRENT_REQUESTS: Optional[int] = 0
2024-02-19 11:05:45 -08:00
2025-11-25 02:05:27 -05:00
def unload_embedding_model(request: Request):
    """Drop the currently loaded embedding model and free cached memory.

    Does nothing unless ``RAG_EMBEDDING_ENGINE`` is the empty string.
    Otherwise it clears ``app.state.ef`` and ``app.state.EMBEDDING_FUNCTION``,
    runs a garbage collection and, when ``DEVICE_TYPE`` is ``'cuda'`` and CUDA
    is available, empties the torch CUDA cache.
    """
    app_state = request.app.state
    if app_state.config.RAG_EMBEDDING_ENGINE != '':
        return

    import gc

    # unloads current internal embedding model and clears VRAM cache
    app_state.ef = None
    app_state.EMBEDDING_FUNCTION = None
    gc.collect()

    if DEVICE_TYPE != 'cuda':
        return

    import torch

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
2025-11-25 02:05:27 -05:00
2026-03-17 17:58:01 -05:00
@router.post('/embedding/update')
async def update_embedding_config(request: Request, form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user)):
    # Admin-only (dependency: get_admin_user). Unloads the current embedding
    # model, stores the submitted settings on request.app.state.config,
    # rebuilds app.state.ef and app.state.EMBEDDING_FUNCTION, and returns the
    # resulting configuration. Any failure is logged and reported as HTTP 500.
    app_state = request.app.state
    config = app_state.config

    log.info(
        f'Updating embedding model: {config.RAG_EMBEDDING_MODEL} to {form_data.RAG_EMBEDDING_MODEL}'
    )
    unload_embedding_model(request)

    try:
        config.RAG_EMBEDDING_ENGINE = form_data.RAG_EMBEDDING_ENGINE
        config.RAG_EMBEDDING_MODEL = form_data.RAG_EMBEDDING_MODEL
        config.RAG_EMBEDDING_BATCH_SIZE = form_data.RAG_EMBEDDING_BATCH_SIZE
        config.ENABLE_ASYNC_EMBEDDING = form_data.ENABLE_ASYNC_EMBEDDING
        config.RAG_EMBEDDING_CONCURRENT_REQUESTS = form_data.RAG_EMBEDDING_CONCURRENT_REQUESTS

        # Provider connection settings are only stored for the remote engines.
        if config.RAG_EMBEDDING_ENGINE in ['ollama', 'openai', 'azure_openai']:
            if form_data.openai_config is not None:
                config.RAG_OPENAI_API_BASE_URL = form_data.openai_config.url
                config.RAG_OPENAI_API_KEY = form_data.openai_config.key

            if form_data.ollama_config is not None:
                config.RAG_OLLAMA_BASE_URL = form_data.ollama_config.url
                config.RAG_OLLAMA_API_KEY = form_data.ollama_config.key

            if form_data.azure_openai_config is not None:
                config.RAG_AZURE_OPENAI_BASE_URL = form_data.azure_openai_config.url
                config.RAG_AZURE_OPENAI_API_KEY = form_data.azure_openai_config.key
                config.RAG_AZURE_OPENAI_API_VERSION = form_data.azure_openai_config.version

        app_state.ef = get_ef(
            config.RAG_EMBEDDING_ENGINE,
            config.RAG_EMBEDDING_MODEL,
        )

        # Pick the URL/key pair for the active engine; anything that is not
        # 'openai' or 'ollama' falls through to the Azure OpenAI values.
        engine = config.RAG_EMBEDDING_ENGINE
        model = config.RAG_EMBEDDING_MODEL
        loaded_model = app_state.ef
        if engine == 'openai':
            base_url = config.RAG_OPENAI_API_BASE_URL
        elif engine == 'ollama':
            base_url = config.RAG_OLLAMA_BASE_URL
        else:
            base_url = config.RAG_AZURE_OPENAI_BASE_URL

        if engine == 'openai':
            api_key = config.RAG_OPENAI_API_KEY
        elif engine == 'ollama':
            api_key = config.RAG_OLLAMA_API_KEY
        else:
            api_key = config.RAG_AZURE_OPENAI_API_KEY

        batch_size = config.RAG_EMBEDDING_BATCH_SIZE
        if engine == 'azure_openai':
            azure_api_version = config.RAG_AZURE_OPENAI_API_VERSION
        else:
            azure_api_version = None

        app_state.EMBEDDING_FUNCTION = get_embedding_function(
            engine,
            model,
            loaded_model,
            base_url,
            api_key,
            batch_size,
            azure_api_version=azure_api_version,
            enable_async=config.ENABLE_ASYNC_EMBEDDING,
            concurrent_requests=config.RAG_EMBEDDING_CONCURRENT_REQUESTS,
        )

        payload = {'status': True}
        for setting in (
            'RAG_EMBEDDING_ENGINE',
            'RAG_EMBEDDING_MODEL',
            'RAG_EMBEDDING_BATCH_SIZE',
            'ENABLE_ASYNC_EMBEDDING',
            'RAG_EMBEDDING_CONCURRENT_REQUESTS',
        ):
            payload[setting] = getattr(config, setting)
        payload['openai_config'] = {
            'url': config.RAG_OPENAI_API_BASE_URL,
            'key': config.RAG_OPENAI_API_KEY,
        }
        payload['ollama_config'] = {
            'url': config.RAG_OLLAMA_BASE_URL,
            'key': config.RAG_OLLAMA_API_KEY,
        }
        payload['azure_openai_config'] = {
            'url': config.RAG_AZURE_OPENAI_BASE_URL,
            'key': config.RAG_AZURE_OPENAI_API_KEY,
            'version': config.RAG_AZURE_OPENAI_API_VERSION,
        }
        return payload
    except Exception as e:
        log.exception(f'Problem updating embedding model: {e}')
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
2024-02-17 22:29:52 -08:00
2026-03-17 17:58:01 -05:00
@router.get('/config')
async def get_rag_config(request: Request, user=Depends(get_admin_user)):
    # Admin-only (dependency: get_admin_user). Returns {'status': True}, the
    # top-level RAG settings listed below, and a nested 'web' dict with the
    # web search / web loader settings. Every value is read from
    # request.app.state.config, except YOUTUBE_LOADER_TRANSLATION, which is
    # read from request.app.state directly. Key order follows the tuples.
    app_state = request.app.state
    config = app_state.config

    def read_settings(names):
        return {name: getattr(config, name) for name in names}

    rag_setting_names = (
        # RAG settings
        'RAG_TEMPLATE',
        'TOP_K',
        'BYPASS_EMBEDDING_AND_RETRIEVAL',
        'RAG_FULL_CONTEXT',
        # Hybrid search settings
        'ENABLE_RAG_HYBRID_SEARCH',
        'ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS',
        'TOP_K_RERANKER',
        'RELEVANCE_THRESHOLD',
        'HYBRID_BM25_WEIGHT',
        # Content extraction settings
        'CONTENT_EXTRACTION_ENGINE',
        'PDF_EXTRACT_IMAGES',
        'PDF_LOADER_MODE',
        'DATALAB_MARKER_API_KEY',
        'DATALAB_MARKER_API_BASE_URL',
        'DATALAB_MARKER_ADDITIONAL_CONFIG',
        'DATALAB_MARKER_SKIP_CACHE',
        'DATALAB_MARKER_FORCE_OCR',
        'DATALAB_MARKER_PAGINATE',
        'DATALAB_MARKER_STRIP_EXISTING_OCR',
        'DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION',
        'DATALAB_MARKER_FORMAT_LINES',
        'DATALAB_MARKER_USE_LLM',
        'DATALAB_MARKER_OUTPUT_FORMAT',
        'EXTERNAL_DOCUMENT_LOADER_URL',
        'EXTERNAL_DOCUMENT_LOADER_API_KEY',
        'TIKA_SERVER_URL',
        'DOCLING_SERVER_URL',
        'DOCLING_API_KEY',
        'DOCLING_PARAMS',
        'DOCUMENT_INTELLIGENCE_ENDPOINT',
        'DOCUMENT_INTELLIGENCE_KEY',
        'DOCUMENT_INTELLIGENCE_MODEL',
        'MISTRAL_OCR_API_BASE_URL',
        'MISTRAL_OCR_API_KEY',
        'PADDLEOCR_VL_BASE_URL',
        'PADDLEOCR_VL_TOKEN',
        # MinerU settings
        'MINERU_API_MODE',
        'MINERU_API_URL',
        'MINERU_API_KEY',
        'MINERU_API_TIMEOUT',
        'MINERU_PARAMS',
        # Reranking settings
        'RAG_RERANKING_MODEL',
        'RAG_RERANKING_ENGINE',
        'RAG_RERANKING_BATCH_SIZE',
        'RAG_EXTERNAL_RERANKER_URL',
        'RAG_EXTERNAL_RERANKER_API_KEY',
        'RAG_EXTERNAL_RERANKER_TIMEOUT',
        # Chunking settings
        'TEXT_SPLITTER',
        'ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER',
        'CHUNK_SIZE',
        'CHUNK_MIN_SIZE_TARGET',
        'CHUNK_OVERLAP',
        # File upload settings
        'FILE_MAX_SIZE',
        'FILE_MAX_COUNT',
        'FILE_IMAGE_COMPRESSION_WIDTH',
        'FILE_IMAGE_COMPRESSION_HEIGHT',
        'ALLOWED_FILE_EXTENSIONS',
        # Integration settings
        'ENABLE_GOOGLE_DRIVE_INTEGRATION',
        'ENABLE_ONEDRIVE_INTEGRATION',
    )

    # Web search settings that precede YOUTUBE_LOADER_TRANSLATION.
    web_names_head = (
        'ENABLE_WEB_SEARCH',
        'WEB_SEARCH_ENGINE',
        'WEB_SEARCH_TRUST_ENV',
        'WEB_SEARCH_RESULT_COUNT',
        'WEB_SEARCH_CONCURRENT_REQUESTS',
        'WEB_FETCH_MAX_CONTENT_LENGTH',
        'WEB_LOADER_CONCURRENT_REQUESTS',
        'WEB_SEARCH_DOMAIN_FILTER_LIST',
        'BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL',
        'BYPASS_WEB_SEARCH_WEB_LOADER',
        'OLLAMA_CLOUD_WEB_SEARCH_API_KEY',
        'SEARXNG_QUERY_URL',
        'SEARXNG_LANGUAGE',
        'YACY_QUERY_URL',
        'YACY_USERNAME',
        'YACY_PASSWORD',
        'GOOGLE_PSE_API_KEY',
        'GOOGLE_PSE_ENGINE_ID',
        'BRAVE_SEARCH_API_KEY',
        'KAGI_SEARCH_API_KEY',
        'MOJEEK_SEARCH_API_KEY',
        'BOCHA_SEARCH_API_KEY',
        'SERPSTACK_API_KEY',
        'SERPSTACK_HTTPS',
        'SERPER_API_KEY',
        'SERPLY_API_KEY',
        'DDGS_BACKEND',
        'TAVILY_API_KEY',
        'SEARCHAPI_API_KEY',
        'SEARCHAPI_ENGINE',
        'SERPAPI_API_KEY',
        'SERPAPI_ENGINE',
        'JINA_API_KEY',
        'JINA_API_BASE_URL',
        'BING_SEARCH_V7_ENDPOINT',
        'BING_SEARCH_V7_SUBSCRIPTION_KEY',
        'EXA_API_KEY',
        'PERPLEXITY_API_KEY',
        'PERPLEXITY_MODEL',
        'PERPLEXITY_SEARCH_CONTEXT_USAGE',
        'PERPLEXITY_SEARCH_API_URL',
        'SOUGOU_API_SID',
        'SOUGOU_API_SK',
        'WEB_LOADER_ENGINE',
        'WEB_LOADER_TIMEOUT',
        'ENABLE_WEB_LOADER_SSL_VERIFICATION',
        'PLAYWRIGHT_WS_URL',
        'PLAYWRIGHT_TIMEOUT',
        'FIRECRAWL_API_KEY',
        'FIRECRAWL_API_BASE_URL',
        'FIRECRAWL_TIMEOUT',
        'TAVILY_EXTRACT_DEPTH',
        'EXTERNAL_WEB_SEARCH_URL',
        'EXTERNAL_WEB_SEARCH_API_KEY',
        'EXTERNAL_WEB_LOADER_URL',
        'EXTERNAL_WEB_LOADER_API_KEY',
        'YOUTUBE_LOADER_LANGUAGE',
        'YOUTUBE_LOADER_PROXY_URL',
    )
    # Web search settings that follow YOUTUBE_LOADER_TRANSLATION.
    web_names_tail = (
        'YANDEX_WEB_SEARCH_URL',
        'YANDEX_WEB_SEARCH_API_KEY',
        'YANDEX_WEB_SEARCH_CONFIG',
        'YOUCOM_API_KEY',
    )

    payload = {'status': True}
    payload.update(read_settings(rag_setting_names))

    web_payload = read_settings(web_names_head)
    # Read from app.state itself, not from app.state.config.
    web_payload['YOUTUBE_LOADER_TRANSLATION'] = app_state.YOUTUBE_LOADER_TRANSLATION
    web_payload.update(read_settings(web_names_tail))

    payload['web'] = web_payload
    return payload
2024-06-01 19:40:48 -07:00
# Web search / web loader settings as an all-optional model: every field
# defaults to None except WEB_SEARCH_DOMAIN_FILTER_LIST, which defaults to [].
# Field names mirror the keys of the 'web' dict returned by the '/config' route.
class WebConfig(BaseModel):
    # General web search behaviour
    ENABLE_WEB_SEARCH: Optional[bool] = None
    WEB_SEARCH_ENGINE: Optional[str] = None
    WEB_SEARCH_TRUST_ENV: Optional[bool] = None
    WEB_SEARCH_RESULT_COUNT: Optional[int] = None
    WEB_SEARCH_CONCURRENT_REQUESTS: Optional[int] = None
    WEB_SEARCH_DOMAIN_FILTER_LIST: Optional[List[str]] = []
    WEB_FETCH_MAX_CONTENT_LENGTH: Optional[int] = None
    WEB_LOADER_CONCURRENT_REQUESTS: Optional[int] = None
    BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
    BYPASS_WEB_SEARCH_WEB_LOADER: Optional[bool] = None

    # Search provider settings
    OLLAMA_CLOUD_WEB_SEARCH_API_KEY: Optional[str] = None
    SEARXNG_QUERY_URL: Optional[str] = None
    SEARXNG_LANGUAGE: Optional[str] = None
    YACY_QUERY_URL: Optional[str] = None
    YACY_USERNAME: Optional[str] = None
    YACY_PASSWORD: Optional[str] = None
    GOOGLE_PSE_API_KEY: Optional[str] = None
    GOOGLE_PSE_ENGINE_ID: Optional[str] = None
    BRAVE_SEARCH_API_KEY: Optional[str] = None
    KAGI_SEARCH_API_KEY: Optional[str] = None
    MOJEEK_SEARCH_API_KEY: Optional[str] = None
    BOCHA_SEARCH_API_KEY: Optional[str] = None
    SERPSTACK_API_KEY: Optional[str] = None
    SERPSTACK_HTTPS: Optional[bool] = None
    SERPER_API_KEY: Optional[str] = None
    SERPLY_API_KEY: Optional[str] = None
    DDGS_BACKEND: Optional[str] = None
    TAVILY_API_KEY: Optional[str] = None
    SEARCHAPI_API_KEY: Optional[str] = None
    SEARCHAPI_ENGINE: Optional[str] = None
    SERPAPI_API_KEY: Optional[str] = None
    SERPAPI_ENGINE: Optional[str] = None
    JINA_API_KEY: Optional[str] = None
    JINA_API_BASE_URL: Optional[str] = None
    BING_SEARCH_V7_ENDPOINT: Optional[str] = None
    BING_SEARCH_V7_SUBSCRIPTION_KEY: Optional[str] = None
    EXA_API_KEY: Optional[str] = None
    PERPLEXITY_API_KEY: Optional[str] = None
    PERPLEXITY_MODEL: Optional[str] = None
    PERPLEXITY_SEARCH_CONTEXT_USAGE: Optional[str] = None
    PERPLEXITY_SEARCH_API_URL: Optional[str] = None
    SOUGOU_API_SID: Optional[str] = None
    SOUGOU_API_SK: Optional[str] = None

    # Web loader settings (note: the two *_TIMEOUT fields typed as str are
    # declared that way in the model; PLAYWRIGHT_TIMEOUT is an int)
    WEB_LOADER_ENGINE: Optional[str] = None
    WEB_LOADER_TIMEOUT: Optional[str] = None
    ENABLE_WEB_LOADER_SSL_VERIFICATION: Optional[bool] = None
    PLAYWRIGHT_WS_URL: Optional[str] = None
    PLAYWRIGHT_TIMEOUT: Optional[int] = None
    FIRECRAWL_API_KEY: Optional[str] = None
    FIRECRAWL_API_BASE_URL: Optional[str] = None
    FIRECRAWL_TIMEOUT: Optional[str] = None
    TAVILY_EXTRACT_DEPTH: Optional[str] = None

    # External search / loader endpoints
    EXTERNAL_WEB_SEARCH_URL: Optional[str] = None
    EXTERNAL_WEB_SEARCH_API_KEY: Optional[str] = None
    EXTERNAL_WEB_LOADER_URL: Optional[str] = None
    EXTERNAL_WEB_LOADER_API_KEY: Optional[str] = None

    # YouTube loader settings
    YOUTUBE_LOADER_LANGUAGE: Optional[List[str]] = None
    YOUTUBE_LOADER_PROXY_URL: Optional[str] = None
    YOUTUBE_LOADER_TRANSLATION: Optional[str] = None

    # Yandex / You.com settings
    YANDEX_WEB_SEARCH_URL: Optional[str] = None
    YANDEX_WEB_SEARCH_API_KEY: Optional[str] = None
    YANDEX_WEB_SEARCH_CONFIG: Optional[str] = None
    YOUCOM_API_KEY: Optional[str] = None
2025-04-12 16:33:36 -07:00
class ConfigForm(BaseModel):
    # Request body for POST /config/update.
    #
    # Every field is optional and defaults to None. For the top-level fields,
    # update_rag_config treats None as "leave the stored value unchanged".

    # RAG settings
    RAG_TEMPLATE: Optional[str] = None
    TOP_K: Optional[int] = None
    BYPASS_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
    RAG_FULL_CONTEXT: Optional[bool] = None

    # Hybrid search settings
    ENABLE_RAG_HYBRID_SEARCH: Optional[bool] = None
    ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS: Optional[bool] = None
    TOP_K_RERANKER: Optional[int] = None
    RELEVANCE_THRESHOLD: Optional[float] = None
    HYBRID_BM25_WEIGHT: Optional[float] = None

    # Content extraction settings
    CONTENT_EXTRACTION_ENGINE: Optional[str] = None
    PDF_EXTRACT_IMAGES: Optional[bool] = None
    PDF_LOADER_MODE: Optional[str] = None

    DATALAB_MARKER_API_KEY: Optional[str] = None
    DATALAB_MARKER_API_BASE_URL: Optional[str] = None
    DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None
    DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None
    DATALAB_MARKER_FORCE_OCR: Optional[bool] = None
    DATALAB_MARKER_PAGINATE: Optional[bool] = None
    DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None
    DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None
    DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None
    DATALAB_MARKER_USE_LLM: Optional[bool] = None
    DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None

    EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None
    EXTERNAL_DOCUMENT_LOADER_API_KEY: Optional[str] = None

    TIKA_SERVER_URL: Optional[str] = None
    DOCLING_SERVER_URL: Optional[str] = None
    DOCLING_API_KEY: Optional[str] = None
    DOCLING_PARAMS: Optional[dict] = None
    DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
    DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
    DOCUMENT_INTELLIGENCE_MODEL: Optional[str] = None
    MISTRAL_OCR_API_BASE_URL: Optional[str] = None
    MISTRAL_OCR_API_KEY: Optional[str] = None
    PADDLEOCR_VL_BASE_URL: Optional[str] = None
    PADDLEOCR_VL_TOKEN: Optional[str] = None

    # MinerU settings
    MINERU_API_MODE: Optional[str] = None
    MINERU_API_URL: Optional[str] = None
    MINERU_API_KEY: Optional[str] = None
    MINERU_API_TIMEOUT: Optional[str] = None
    MINERU_PARAMS: Optional[dict] = None

    # Reranking settings
    RAG_RERANKING_MODEL: Optional[str] = None
    RAG_RERANKING_ENGINE: Optional[str] = None
    RAG_RERANKING_BATCH_SIZE: Optional[int] = None
    RAG_EXTERNAL_RERANKER_URL: Optional[str] = None
    RAG_EXTERNAL_RERANKER_API_KEY: Optional[str] = None
    RAG_EXTERNAL_RERANKER_TIMEOUT: Optional[str] = None

    # Chunking settings
    TEXT_SPLITTER: Optional[str] = None
    ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER: Optional[bool] = None
    CHUNK_SIZE: Optional[int] = None
    CHUNK_MIN_SIZE_TARGET: Optional[int] = None
    CHUNK_OVERLAP: Optional[int] = None

    # File upload settings
    # The four limits below also accept a string: update_rag_config stores
    # None when the value is the empty string '' and stores any other
    # non-None value as given.
    FILE_MAX_SIZE: Optional[Union[int, str]] = None
    FILE_MAX_COUNT: Optional[Union[int, str]] = None
    FILE_IMAGE_COMPRESSION_WIDTH: Optional[Union[int, str]] = None
    FILE_IMAGE_COMPRESSION_HEIGHT: Optional[Union[int, str]] = None
    ALLOWED_FILE_EXTENSIONS: Optional[List[str]] = None

    # Integration settings
    ENABLE_GOOGLE_DRIVE_INTEGRATION: Optional[bool] = None
    ENABLE_ONEDRIVE_INTEGRATION: Optional[bool] = None

    # Web search settings
    # When this nested form is present, update_rag_config copies every one of
    # its fields into the config as-is (a None field overwrites the stored
    # value); when it is None the web settings are left untouched.
    web: Optional[WebConfig] = None
2024-03-10 13:32:34 -07:00
2026-03-17 17:58:01 -05:00
# Top-level ``ConfigForm`` fields applied before the reranker is reloaded.
# ``None`` in the form means "keep the current value". The order matches the
# order in which the settings are written.
_RAG_CONFIG_FIELDS = (
    # RAG settings
    'RAG_TEMPLATE',
    'TOP_K',
    'BYPASS_EMBEDDING_AND_RETRIEVAL',
    'RAG_FULL_CONTEXT',
    # Hybrid search settings
    'ENABLE_RAG_HYBRID_SEARCH',
    'ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS',
    'TOP_K_RERANKER',
    'RELEVANCE_THRESHOLD',
    'HYBRID_BM25_WEIGHT',
    # Content extraction settings
    'CONTENT_EXTRACTION_ENGINE',
    'PDF_EXTRACT_IMAGES',
    'PDF_LOADER_MODE',
    'DATALAB_MARKER_API_KEY',
    'DATALAB_MARKER_API_BASE_URL',
    'DATALAB_MARKER_ADDITIONAL_CONFIG',
    'DATALAB_MARKER_SKIP_CACHE',
    'DATALAB_MARKER_FORCE_OCR',
    'DATALAB_MARKER_PAGINATE',
    'DATALAB_MARKER_STRIP_EXISTING_OCR',
    'DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION',
    'DATALAB_MARKER_FORMAT_LINES',
    'DATALAB_MARKER_OUTPUT_FORMAT',
    'DATALAB_MARKER_USE_LLM',
    'EXTERNAL_DOCUMENT_LOADER_URL',
    'EXTERNAL_DOCUMENT_LOADER_API_KEY',
    'TIKA_SERVER_URL',
    'DOCLING_SERVER_URL',
    'DOCLING_API_KEY',
    'DOCLING_PARAMS',
    'DOCUMENT_INTELLIGENCE_ENDPOINT',
    'DOCUMENT_INTELLIGENCE_KEY',
    'DOCUMENT_INTELLIGENCE_MODEL',
    'MISTRAL_OCR_API_BASE_URL',
    'MISTRAL_OCR_API_KEY',
    'PADDLEOCR_VL_BASE_URL',
    'PADDLEOCR_VL_TOKEN',
    # MinerU settings
    'MINERU_API_MODE',
    'MINERU_API_URL',
    'MINERU_API_KEY',
    'MINERU_API_TIMEOUT',
    'MINERU_PARAMS',
)

# Reranker settings written before the reranking model itself is updated.
_RERANKER_CONFIG_FIELDS = (
    'RAG_RERANKING_ENGINE',
    'RAG_EXTERNAL_RERANKER_URL',
    'RAG_EXTERNAL_RERANKER_API_KEY',
    'RAG_EXTERNAL_RERANKER_TIMEOUT',
    'RAG_RERANKING_BATCH_SIZE',
)

# Chunking settings (``None`` keeps the current value).
_CHUNKING_CONFIG_FIELDS = (
    'TEXT_SPLITTER',
    'ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER',
    'CHUNK_SIZE',
    'CHUNK_MIN_SIZE_TARGET',
    'CHUNK_OVERLAP',
)

# File upload limits with three-way semantics: ``None`` = don't change,
# ``''`` = clear to ``None``, anything else = store the value.
_CLEARABLE_FILE_CONFIG_FIELDS = (
    'FILE_MAX_SIZE',
    'FILE_MAX_COUNT',
    'FILE_IMAGE_COMPRESSION_WIDTH',
    'FILE_IMAGE_COMPRESSION_HEIGHT',
)

# Remaining file upload and integration settings (``None`` keeps the current value).
_FILE_AND_INTEGRATION_CONFIG_FIELDS = (
    'ALLOWED_FILE_EXTENSIONS',
    'ENABLE_GOOGLE_DRIVE_INTEGRATION',
    'ENABLE_ONEDRIVE_INTEGRATION',
)

# Fields of ``form_data.web`` copied verbatim onto ``app.state.config``.
# ``YOUTUBE_LOADER_TRANSLATION`` is deliberately absent: it is stored on
# ``app.state`` directly rather than on ``app.state.config``.
_WEB_CONFIG_FIELDS = (
    # Web search settings
    'ENABLE_WEB_SEARCH',
    'WEB_SEARCH_ENGINE',
    'WEB_SEARCH_TRUST_ENV',
    'WEB_SEARCH_RESULT_COUNT',
    'WEB_SEARCH_CONCURRENT_REQUESTS',
    'WEB_FETCH_MAX_CONTENT_LENGTH',
    'WEB_LOADER_CONCURRENT_REQUESTS',
    'WEB_SEARCH_DOMAIN_FILTER_LIST',
    'BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL',
    'BYPASS_WEB_SEARCH_WEB_LOADER',
    'OLLAMA_CLOUD_WEB_SEARCH_API_KEY',
    'SEARXNG_QUERY_URL',
    'SEARXNG_LANGUAGE',
    'YACY_QUERY_URL',
    'YACY_USERNAME',
    'YACY_PASSWORD',
    'GOOGLE_PSE_API_KEY',
    'GOOGLE_PSE_ENGINE_ID',
    'BRAVE_SEARCH_API_KEY',
    'KAGI_SEARCH_API_KEY',
    'MOJEEK_SEARCH_API_KEY',
    'BOCHA_SEARCH_API_KEY',
    'SERPSTACK_API_KEY',
    'SERPSTACK_HTTPS',
    'SERPER_API_KEY',
    'SERPLY_API_KEY',
    'DDGS_BACKEND',
    'TAVILY_API_KEY',
    'SEARCHAPI_API_KEY',
    'SEARCHAPI_ENGINE',
    'SERPAPI_API_KEY',
    'SERPAPI_ENGINE',
    'JINA_API_KEY',
    'JINA_API_BASE_URL',
    'BING_SEARCH_V7_ENDPOINT',
    'BING_SEARCH_V7_SUBSCRIPTION_KEY',
    'EXA_API_KEY',
    'PERPLEXITY_API_KEY',
    'PERPLEXITY_MODEL',
    'PERPLEXITY_SEARCH_CONTEXT_USAGE',
    'PERPLEXITY_SEARCH_API_URL',
    'SOUGOU_API_SID',
    'SOUGOU_API_SK',
    # Web loader settings
    'WEB_LOADER_ENGINE',
    'WEB_LOADER_TIMEOUT',
    'ENABLE_WEB_LOADER_SSL_VERIFICATION',
    'PLAYWRIGHT_WS_URL',
    'PLAYWRIGHT_TIMEOUT',
    'FIRECRAWL_API_KEY',
    'FIRECRAWL_API_BASE_URL',
    'FIRECRAWL_TIMEOUT',
    'EXTERNAL_WEB_SEARCH_URL',
    'EXTERNAL_WEB_SEARCH_API_KEY',
    'EXTERNAL_WEB_LOADER_URL',
    'EXTERNAL_WEB_LOADER_API_KEY',
    'TAVILY_EXTRACT_DEPTH',
    'YOUTUBE_LOADER_LANGUAGE',
    'YOUTUBE_LOADER_PROXY_URL',
    'YANDEX_WEB_SEARCH_URL',
    'YANDEX_WEB_SEARCH_API_KEY',
    'YANDEX_WEB_SEARCH_CONFIG',
    'YOUCOM_API_KEY',
)


def _apply_optional_settings(config, form, field_names) -> None:
    """Copy each named field from ``form`` onto ``config``, keeping the current value for ``None``."""
    for name in field_names:
        value = getattr(form, name)
        # The attribute is assigned on every call, even when unchanged,
        # exactly like the explicit per-field ternaries this loop replaces.
        setattr(config, name, value if value is not None else getattr(config, name))


@router.post('/config/update')
async def update_rag_config(request: Request, form_data: ConfigForm, user=Depends(get_admin_user)):
    """
    Apply a partial update to the retrieval configuration and return the result.

    Top-level fields of ``form_data`` that are ``None`` leave the stored value
    untouched. The four file-limit fields additionally treat the empty string
    as "clear to ``None``". When ``form_data.web`` is provided, every web
    field is copied as-is, so a ``None`` web field overwrites the stored value.

    If hybrid search is enabled and embedding/retrieval is not bypassed, the
    reranker is reloaded from the updated settings. A failure while loading it
    is logged and hybrid search is switched off rather than failing the request.

    Args:
        request: Incoming request; settings live on ``request.app.state``.
        form_data: The requested changes.
        user: Resolved by the ``get_admin_user`` dependency; not used in the body.

    Returns:
        A dict with ``'status': True`` followed by the current value of every
        setting this endpoint manages, with web settings nested under ``'web'``.

    Raises:
        HTTPException: 500 if updating the reranking model raises unexpectedly.
    """
    state = request.app.state
    config = state.config

    _apply_optional_settings(config, form_data, _RAG_CONFIG_FIELDS)

    # Reranking settings
    # The check runs against the engine configured *before* this update is applied.
    if config.RAG_RERANKING_ENGINE == '':
        # Unloading the internal reranker and clear VRAM memory
        state.rf = None
        state.RERANKING_FUNCTION = None
        import gc

        gc.collect()
        if DEVICE_TYPE == 'cuda':
            import torch

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    _apply_optional_settings(config, form_data, _RERANKER_CONFIG_FIELDS)

    log.info(f'Updating reranking model: {config.RAG_RERANKING_MODEL} to {form_data.RAG_RERANKING_MODEL}')
    try:
        _apply_optional_settings(config, form_data, ('RAG_RERANKING_MODEL',))

        try:
            if config.ENABLE_RAG_HYBRID_SEARCH and not config.BYPASS_EMBEDDING_AND_RETRIEVAL:
                state.rf = get_rf(
                    config.RAG_RERANKING_ENGINE,
                    config.RAG_RERANKING_MODEL,
                    config.RAG_EXTERNAL_RERANKER_URL,
                    config.RAG_EXTERNAL_RERANKER_API_KEY,
                    config.RAG_EXTERNAL_RERANKER_TIMEOUT,
                )

                state.RERANKING_FUNCTION = get_reranking_function(
                    config.RAG_RERANKING_ENGINE,
                    config.RAG_RERANKING_MODEL,
                    state.rf,
                    reranking_batch_size=config.RAG_RERANKING_BATCH_SIZE,
                )
        except Exception as e:
            # Best effort: a reranker that fails to load disables hybrid
            # search instead of rejecting the whole configuration update.
            log.error(f'Error loading reranking model: {e}')
            config.ENABLE_RAG_HYBRID_SEARCH = False
    except Exception as e:
        log.exception(f'Problem updating reranking model: {e}')
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )

    # Chunking settings
    _apply_optional_settings(config, form_data, _CHUNKING_CONFIG_FIELDS)

    # File upload settings
    # Empty string means "clear to None" (unlimited/no compression),
    # None means "don't change", int means "set to this value"
    for name in _CLEARABLE_FILE_CONFIG_FIELDS:
        value = getattr(form_data, name)
        if value is not None:
            setattr(config, name, None if value == '' else value)

    # Remaining file upload settings and integration settings
    _apply_optional_settings(config, form_data, _FILE_AND_INTEGRATION_CONFIG_FIELDS)

    if form_data.web is not None:
        # Web search and web loader settings are overwritten unconditionally.
        for name in _WEB_CONFIG_FIELDS:
            setattr(config, name, getattr(form_data.web, name))
        # Stored on app.state itself, not on app.state.config.
        state.YOUTUBE_LOADER_TRANSLATION = form_data.web.YOUTUBE_LOADER_TRANSLATION

    # Build the response from the same field lists used for the update so that
    # every accepted setting is echoed back. The hand-written response used to
    # omit ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS, DATALAB_MARKER_FORMAT_LINES,
    # RAG_RERANKING_BATCH_SIZE and web.DDGS_BACKEND.
    top_level_fields = (
        *_RAG_CONFIG_FIELDS,
        'RAG_RERANKING_MODEL',
        *_RERANKER_CONFIG_FIELDS,
        *_CHUNKING_CONFIG_FIELDS,
        *_CLEARABLE_FILE_CONFIG_FIELDS,
        *_FILE_AND_INTEGRATION_CONFIG_FIELDS,
    )
    web_settings = {name: getattr(config, name) for name in _WEB_CONFIG_FIELDS}
    web_settings['YOUTUBE_LOADER_TRANSLATION'] = state.YOUTUBE_LOADER_TRANSLATION

    return {
        'status': True,
        **{name: getattr(config, name) for name in top_level_fields},
        # Web search settings
        'web': web_settings,
    }
2024-01-06 22:59:22 -08:00
2024-09-28 02:23:09 +02:00
####################################
#
# Document processing and retrieval
#
####################################
2024-02-01 13:35:41 -08:00
def can_merge_chunks(a: Document, b: Document) -> bool:
    """Tell whether two chunks are allowed to be combined into one.

    Two chunks are mergeable only when their ``source`` metadata values are
    equal. When both chunks additionally carry a non-``None`` ``file_id``,
    those ids must match as well; if either ``file_id`` is absent, the
    matching ``source`` alone is sufficient.
    """
    left_meta, right_meta = a.metadata, b.metadata

    if left_meta.get('source') != right_meta.get('source'):
        return False

    left_file_id = left_meta.get('file_id')
    right_file_id = right_meta.get('file_id')

    # A missing file_id on either side cannot contradict the source match.
    if left_file_id is None or right_file_id is None:
        return True

    return left_file_id == right_file_id
def merge_docs_to_target_size(
    request: Request,
    chunks: list[Document],
) -> list[Document]:
    """
    Best-effort normalization of chunk sizes.

    Walks ``chunks`` in order and greedily appends each chunk to the one
    being accumulated (joined by a blank line) while all of these hold:

    - ``can_merge_chunks`` allows combining the two chunks,
    - the accumulated text is still below ``CHUNK_MIN_SIZE_TARGET``,
    - the combined text does not exceed ``CHUNK_SIZE``.

    Size is measured in characters, or in tokens of the configured tiktoken
    encoding when ``TEXT_SPLITTER`` is ``'token'``. A merged chunk keeps a
    copy of the metadata of the first chunk that went into it.

    When ``CHUNK_MIN_SIZE_TARGET`` is not positive the input list is returned
    unchanged; otherwise a new list of new ``Document`` objects is returned.
    """
    config = request.app.state.config
    min_target = config.CHUNK_MIN_SIZE_TARGET
    max_size = config.CHUNK_SIZE

    if min_target <= 0:
        return chunks

    if config.TEXT_SPLITTER == 'token':
        encoding = tiktoken.get_encoding(str(config.TIKTOKEN_ENCODING_NAME))

        def size_of(text):
            return len(encoding.encode(text))

    else:
        size_of = len

    merged: list[Document] = []
    anchor: Document | None = None  # first chunk of the group being accumulated
    buffer: str = ''  # text accumulated for the current group

    for chunk in chunks:
        if anchor is not None:
            candidate = f'{buffer}\n\n{chunk.page_content}'
            if (
                can_merge_chunks(anchor, chunk)
                and size_of(buffer) < min_target
                and size_of(candidate) <= max_size
            ):
                buffer = candidate
                continue

            # The group cannot grow any further: emit it and start a new one.
            merged.append(Document(page_content=buffer, metadata=dict(anchor.metadata)))

        anchor = chunk
        buffer = chunk.page_content

    # Emit whatever is still being accumulated once the input is exhausted.
    if anchor is not None:
        merged.append(Document(page_content=buffer, metadata=dict(anchor.metadata)))

    return merged
2024-09-28 02:38:59 +02:00
def save_docs_to_vector_db(
    request: Request,
    docs,
    collection_name,
    metadata: Optional[dict] = None,
    overwrite: bool = False,
    split: bool = True,
    add: bool = False,
    user=None,
) -> bool:
    """Split, embed and insert documents into a vector DB collection.

    This is a synchronous function: the embedding coroutine is scheduled on
    ``request.app.state.main_loop`` and the calling thread blocks on its
    result, so it must be called from a worker thread, not from the loop.

    Args:
        request: Request whose ``app.state`` supplies configuration, the
            embedding backend (``ef``) and the main event loop.
        docs: Documents (``page_content`` + ``metadata``) to store.
        collection_name: Target collection.
        metadata: Extra metadata merged into every stored item. When it
            contains ``'hash'``, the collection is first checked for content
            with the same hash.
        overwrite: Delete an existing collection before inserting.
        split: Run the configured text splitters over ``docs`` first.
        add: Append to an existing collection instead of leaving it untouched.
        user: Forwarded to the embedding function.

    Returns:
        ``True`` after inserting, and also ``True`` without writing anything
        when the collection already exists and neither ``overwrite`` nor
        ``add`` is set.

    Raises:
        ValueError: Content with the same hash already exists under a
            different ``file_id``, the configured text splitter is unknown,
            or there are no documents left to store.
        Exception: Anything raised while embedding or writing to the vector
            DB is logged and re-raised unchanged.
    """

    def _get_docs_info(docs: list[Document]) -> str:
        # Pick the first non-empty identifying field of each document.
        docs_info = set()
        for doc in docs:
            doc_metadata = getattr(doc, 'metadata', {})
            doc_name = (
                doc_metadata.get('name', '')
                or doc_metadata.get('title', '')
                or doc_metadata.get('source', '')
            )
            if doc_name:
                docs_info.add(doc_name)
        return ', '.join(docs_info)

    config = request.app.state.config

    log.debug(f'save_docs_to_vector_db: document {_get_docs_info(docs)} {collection_name}')

    # Check if entries with the same hash (metadata.hash) already exist
    if metadata and 'hash' in metadata:
        result = VECTOR_DB_CLIENT.query(
            collection_name=collection_name,
            filter={'hash': metadata['hash']},
        )

        if result is not None and result.ids:
            existing_doc_ids = result.ids[0]
            if existing_doc_ids:
                # Same file_id  -> re-add/reindex of the same file: allowed.
                # Other file_id -> duplicate content: blocked.
                existing_file_id = None
                if result.metadatas and result.metadatas[0]:
                    # Guard against a stored item that has no metadata at all.
                    existing_file_id = (result.metadatas[0][0] or {}).get('file_id')

                if existing_file_id != metadata.get('file_id'):
                    log.info(f'Document with hash {metadata["hash"]} already exists')
                    raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT)

    if split:
        if config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER:
            log.info('Using markdown header text splitter')
            # Define headers to split on - covering most common markdown header levels
            markdown_splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=[
                    ('#', 'Header 1'),
                    ('##', 'Header 2'),
                    ('###', 'Header 3'),
                    ('####', 'Header 4'),
                    ('#####', 'Header 5'),
                    ('######', 'Header 6'),
                ],
                strip_headers=False,  # Keep headers in content for context
            )

            # Each section keeps a copy of its parent document's metadata.
            docs = [
                Document(
                    page_content=section.page_content,
                    metadata={**doc.metadata},
                )
                for doc in docs
                for section in markdown_splitter.split_text(doc.page_content)
            ]

            # Header splitting can yield very small sections; grow them back
            # towards the configured minimum before the size-based splitter.
            if config.CHUNK_MIN_SIZE_TARGET > 0:
                docs = merge_docs_to_target_size(request, docs)

        if config.TEXT_SPLITTER in ['', 'character']:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=config.CHUNK_SIZE,
                chunk_overlap=config.CHUNK_OVERLAP,
                add_start_index=True,
            )
            docs = text_splitter.split_documents(docs)
        elif config.TEXT_SPLITTER == 'token':
            log.info(f'Using token text splitter: {config.TIKTOKEN_ENCODING_NAME}')

            # Return value intentionally unused; presumably this surfaces an
            # invalid encoding name early - TODO confirm.
            tiktoken.get_encoding(str(config.TIKTOKEN_ENCODING_NAME))
            text_splitter = TokenTextSplitter(
                encoding_name=str(config.TIKTOKEN_ENCODING_NAME),
                chunk_size=config.CHUNK_SIZE,
                chunk_overlap=config.CHUNK_OVERLAP,
                add_start_index=True,
            )
            docs = text_splitter.split_documents(docs)
        else:
            raise ValueError(ERROR_MESSAGES.DEFAULT('Invalid text splitter'))

    if len(docs) == 0:
        raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)

    texts = [sanitize_text_for_db(doc.page_content) for doc in docs]
    metadatas = [
        {
            **doc.metadata,
            **(metadata or {}),
            'embedding_config': {
                'engine': config.RAG_EMBEDDING_ENGINE,
                'model': config.RAG_EMBEDDING_MODEL,
            },
        }
        for doc in docs
    ]

    try:
        if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
            log.info(f'collection {collection_name} already exists')

            if overwrite:
                log.info(f'deleting existing collection {collection_name}')
                VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
            elif add is False:
                log.info(f'collection {collection_name} already exists, overwrite is False and add is False')
                return True

        log.info(f'generating embeddings for {collection_name}')

        # Endpoint and key depend on the engine; anything other than
        # 'openai' / 'ollama' falls back to the Azure OpenAI settings.
        engine = config.RAG_EMBEDDING_ENGINE
        if engine == 'openai':
            embedding_url = config.RAG_OPENAI_API_BASE_URL
            embedding_key = config.RAG_OPENAI_API_KEY
        elif engine == 'ollama':
            embedding_url = config.RAG_OLLAMA_BASE_URL
            embedding_key = config.RAG_OLLAMA_API_KEY
        else:
            embedding_url = config.RAG_AZURE_OPENAI_BASE_URL
            embedding_key = config.RAG_AZURE_OPENAI_API_KEY

        embedding_function = get_embedding_function(
            engine,
            config.RAG_EMBEDDING_MODEL,
            request.app.state.ef,
            embedding_url,
            embedding_key,
            config.RAG_EMBEDDING_BATCH_SIZE,
            azure_api_version=(config.RAG_AZURE_OPENAI_API_VERSION if engine == 'azure_openai' else None),
            enable_async=config.ENABLE_ASYNC_EMBEDDING,
            concurrent_requests=config.RAG_EMBEDDING_CONCURRENT_REQUESTS,
        )

        # Run async embedding in sync context using the main event loop
        # This allows the main loop to stay responsive to health checks during long operations
        future = asyncio.run_coroutine_threadsafe(
            embedding_function(
                [text.replace('\n', ' ') for text in texts],
                prefix=RAG_EMBEDDING_CONTENT_PREFIX,
                user=user,
            ),
            request.app.state.main_loop,
        )
        try:
            embeddings = future.result(timeout=RAG_EMBEDDING_TIMEOUT)
        finally:
            # If we stopped waiting (e.g. timeout) while the coroutine is
            # still running, cancel it so it does not keep working on the
            # main loop for a caller that has already given up.
            if not future.done():
                future.cancel()

        log.info(f'embeddings generated {len(embeddings)} for {len(texts)} items')

        items = [
            {
                'id': str(uuid.uuid4()),
                'text': text,
                'vector': embeddings[idx],
                'metadata': item_metadata,
            }
            for idx, (text, item_metadata) in enumerate(zip(texts, metadatas))
        ]

        log.info(f'adding to collection {collection_name}')
        VECTOR_DB_CLIENT.insert(
            collection_name=collection_name,
            items=items,
        )

        log.info(f'added {len(items)} items to collection {collection_name}')
        return True
    except Exception as e:
        log.exception(e)
        # Bare raise keeps the original traceback intact.
        raise
2024-02-01 13:35:41 -08:00
2024-09-28 02:23:09 +02:00
class ProcessFileForm(BaseModel):
    """Request body for ``POST /process/file``."""

    # Id of the stored file to process.
    file_id: str
    # Replacement text content; when set it is used instead of loading the file.
    content: str | None = None
    # Target collection; the endpoint falls back to ``file-<file_id>`` when omitted.
    collection_name: str | None = None
2024-05-06 16:39:25 +08:00
2026-03-17 17:58:01 -05:00
def _process_file_doc_metadata(file, base: Optional[dict] = None) -> dict:
    """Overlay the fields identifying ``file`` on top of ``base`` metadata."""
    return {
        **(base or {}),
        'name': file.filename,
        'created_by': file.user_id,
        'file_id': file.id,
        'source': file.filename,
    }


@router.post('/process/file')
async def process_file(
    request: Request,
    form_data: ProcessFileForm,
    user=Depends(get_verified_user),
    db: AsyncSession = Depends(get_async_session),
):
    """
    Process a file and save its content to the vector database.

    The documents to index come from one of three places:

    - ``form_data.content`` is set: that text replaces the file content and
      the file's own ``file-<id>`` collection is dropped first.
    - only ``form_data.collection_name`` is set: chunks already stored in the
      file's own collection are reused, falling back to the content stored on
      the file record.
    - neither is set: the file is fetched from storage and run through the
      configured loader.

    Admins may process any file; other users only files looked up by their
    own user id. Responds 404 when the file is not found and 400 when
    processing fails (the file is then marked ``failed`` and its hash is
    cleared).

    Note: granular session management is used to prevent connection pool exhaustion.
    The session is committed before external API calls, and updates use a fresh session.

    NOTE(review): unlike process_text/process_web, a caller-supplied
    collection_name is not passed through _validate_collection_access here -
    confirm that callers enforce write access themselves.
    """
    if user.role == 'admin':
        file = await Files.get_file_by_id(form_data.file_id, db=db)
    else:
        file = await Files.get_file_by_id_and_user_id(form_data.file_id, user.id, db=db)

    if not file:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=ERROR_MESSAGES.NOT_FOUND)

    try:
        # Treat unset meta/data on the file record as empty instead of crashing.
        file_meta = file.meta or {}
        stored_content = (file.data or {}).get('content', '')
        file_collection_name = f'file-{file.id}'

        collection_name = form_data.collection_name
        if collection_name is None:
            collection_name = file_collection_name

        if form_data.content:
            # Update the content in the file
            # Usage: /files/{file_id}/data/content/update, /files/ (audio file upload pipeline)
            try:
                # /files/{file_id}/data/content/update
                await ASYNC_VECTOR_DB_CLIENT.delete_collection(collection_name=file_collection_name)
            except Exception as delete_error:
                # Audio file upload pipeline: there may be no collection yet.
                # Best effort only, but leave a trace instead of silence.
                log.debug(f'could not delete collection {file_collection_name}: {delete_error}')

            docs = [
                Document(
                    page_content=form_data.content.replace('<br/>', '\n'),
                    metadata=_process_file_doc_metadata(file, file_meta),
                )
            ]
            text_content = form_data.content
        elif form_data.collection_name:
            # Check if the file has already been processed and save the content
            # Usage: /knowledge/{id}/file/add, /knowledge/{id}/file/update
            result = await ASYNC_VECTOR_DB_CLIENT.query(
                collection_name=file_collection_name, filter={'file_id': file.id}
            )

            # An empty ``ids`` list counts as "nothing stored".
            if result is not None and result.ids and len(result.ids[0]) > 0:
                docs = [
                    Document(
                        page_content=result.documents[0][idx],
                        metadata=result.metadatas[0][idx],
                    )
                    for idx in range(len(result.ids[0]))
                ]
            else:
                docs = [
                    Document(
                        page_content=stored_content,
                        metadata=_process_file_doc_metadata(file, file_meta),
                    )
                ]

            text_content = stored_content
        else:
            # Process the file and save the content
            # Usage: /files/
            file_path = file.path
            if file_path:
                # Storage access is blocking, so keep it off the event loop.
                file_path = await asyncio.to_thread(Storage.get_file, file_path)
                loader = build_loader_from_config(request)
                loader.user = user
                loaded_docs = await loader.aload(file.filename, file_meta.get('content_type'), file_path)

                docs = [
                    Document(
                        page_content=doc.page_content,
                        metadata=_process_file_doc_metadata(file, filter_metadata(doc.metadata)),
                    )
                    for doc in loaded_docs
                ]
            else:
                docs = [
                    Document(
                        page_content=stored_content,
                        metadata=_process_file_doc_metadata(file, file_meta),
                    )
                ]
            text_content = ' '.join([doc.page_content for doc in docs])

        log.debug(f'text_content: {text_content}')
        await Files.update_file_data_by_id(
            file.id,
            {'content': text_content},
            db=db,
        )
        content_hash = calculate_sha256_string(text_content)

        if request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL:
            await Files.update_file_data_by_id(file.id, {'status': 'completed'}, db=db)
            await Files.update_file_hash_by_id(file.id, content_hash, db=db)
            return {
                'status': True,
                'collection_name': None,
                'filename': file.filename,
                'content': text_content,
            }

        # Commit any pending changes before the slow embedding step.
        # Note: file is already a Pydantic model (not ORM), so no expunge needed.
        await db.commit()

        # External embedding API takes time (5-60s+).
        # Subsequent updates use fresh async sessions.
        # NOTE: save_docs_to_vector_db is a sync function that
        # calls asyncio.run_coroutine_threadsafe(..., main_loop).result()
        # which blocks the calling thread. We MUST run it in a
        # worker thread to avoid deadlocking the event loop.
        result = await run_in_threadpool(
            save_docs_to_vector_db,
            request,
            docs=docs,
            collection_name=collection_name,
            metadata={
                'file_id': file.id,
                'name': file.filename,
                'hash': content_hash,
            },
            add=bool(form_data.collection_name),
            user=user,
        )
        log.info(f'added {len(docs)} items to collection {collection_name}')

        if not result:
            raise Exception('Error saving document to vector database')

        # Fresh session for the final update.
        async with get_async_db() as session:
            await Files.update_file_metadata_by_id(
                file.id,
                {
                    'collection_name': collection_name,
                },
                db=session,
            )
            await Files.update_file_data_by_id(
                file.id,
                {'status': 'completed'},
                db=session,
            )
            await Files.update_file_hash_by_id(file.id, content_hash, db=session)

        return {
            'status': True,
            'collection_name': collection_name,
            'filename': file.filename,
            'content': text_content,
        }
    except Exception as e:
        log.exception(e)

        # Fresh session for error status update.
        async with get_async_db() as session:
            await Files.update_file_data_by_id(
                file.id,
                {'status': 'failed'},
                db=session,
            )
            # Clear the hash so the file can be re-uploaded after fixing the issue
            await Files.update_file_hash_by_id(file.id, None, db=session)

        detail = ERROR_MESSAGES.PANDOC_NOT_INSTALLED if 'No pandoc was found' in str(e) else str(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=detail,
        ) from e
2024-09-28 02:23:09 +02:00
2024-09-28 02:29:08 +02:00
class ProcessTextForm(BaseModel):
    """Request body for ``POST /process/text``."""

    # Stored as the ``name`` metadata of the resulting document.
    name: str
    # Raw text to index.
    content: str
    # Target collection; the endpoint derives one from the content hash when omitted.
    collection_name: str | None = None
2026-03-17 17:58:01 -05:00
@router.post('/process/text')
async def process_text(
    request: Request,
    form_data: ProcessTextForm,
    user=Depends(get_verified_user),
):
    """Index a raw text snippet into the vector database.

    When no collection name is supplied, the SHA-256 of the content is used
    as the collection name. A caller-supplied name is first checked for write
    access. Responds with the collection name and the stored content, or a
    500 when saving reports failure.
    """
    text_content = form_data.content

    if form_data.collection_name is None:
        collection_name = calculate_sha256_string(text_content)
    else:
        collection_name = form_data.collection_name
        await _validate_collection_access([collection_name], user, access_type='write')

    document = Document(
        page_content=text_content,
        metadata={'name': form_data.name, 'created_by': user.id},
    )
    log.debug(f'text_content: {text_content}')

    # save_docs_to_vector_db is synchronous, so it runs in a worker thread.
    saved = await run_in_threadpool(save_docs_to_vector_db, request, [document], collection_name, user=user)
    if not saved:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=ERROR_MESSAGES.DEFAULT(),
        )

    return {
        'status': True,
        'collection_name': collection_name,
        'content': text_content,
    }
2026-03-17 17:58:01 -05:00
@router.post('/process/youtube')
@router.post('/process/web')
async def process_web(
    request: Request,
    form_data: ProcessUrlForm,
    process: bool = Query(True, description='Whether to process and save the content'),
    overwrite: bool = Query(True, description='Whether to overwrite existing collection'),
    user=Depends(get_verified_user),
):
    """Fetch the content behind a URL and optionally index it.

    With ``process`` false only the fetched content is returned. Otherwise
    the documents are saved to ``form_data.collection_name`` (write access is
    validated) or, when no name is given, to a collection named after the
    first 63 characters of the URL's SHA-256. When
    ``BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL`` is set nothing is saved and
    the returned ``collection_name`` is ``None``.

    Raises:
        HTTPException: Any ``HTTPException`` raised while handling the
            request is passed through unchanged; every other failure is
            logged and reported as 400.
    """
    try:
        content, docs = await run_in_threadpool(get_content_from_url, request, form_data.url)
        log.debug(f'text_content: {content}')

        if not process:
            return {
                'status': True,
                'content': content,
            }

        collection_name = form_data.collection_name
        if not collection_name:
            collection_name = calculate_sha256_string(form_data.url)[:63]
        else:
            await _validate_collection_access([collection_name], user, access_type='write')

        if not request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
            await run_in_threadpool(
                save_docs_to_vector_db,
                request,
                docs,
                collection_name,
                overwrite=overwrite,
                add=(not overwrite),
                user=user,
            )
        else:
            collection_name = None

        return {
            'status': True,
            'collection_name': collection_name,
            'filename': form_data.url,
            'file': {
                'data': {
                    'content': content,
                },
                'meta': {
                    'name': form_data.url,
                    'source': form_data.url,
                },
            },
        }
    except HTTPException:
        # Keep the original status code (e.g. an access-denied response)
        # instead of rewriting it as a generic 400 below.
        raise
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
2024-06-12 11:08:05 -07:00
2026-03-17 17:58:01 -05:00
def search_web(request: Request, engine: str, query: str, user=None) -> list[SearchResult]:
2024-06-01 19:52:12 -07:00
"""Search the web using a search engine and return the results as a list of SearchResult objects.
Will look for a search engine API key in environment variables in the following order:
- SEARXNG_QUERY_URL
- YACY_QUERY_URL + YACY_USERNAME + YACY_PASSWORD
2024-06-01 19:52:12 -07:00
- GOOGLE_PSE_API_KEY + GOOGLE_PSE_ENGINE_ID
- BRAVE_SEARCH_API_KEY
2024-12-08 00:21:10 -05:00
- KAGI_SEARCH_API_KEY
2024-10-29 16:45:38 +02:00
- MOJEEK_SEARCH_API_KEY
2025-02-10 16:44:47 +08:00
- BOCHA_SEARCH_API_KEY
2024-06-01 19:52:12 -07:00
- SERPSTACK_API_KEY
- SERPER_API_KEY
2024-06-09 20:44:34 -04:00
- SERPLY_API_KEY
- TAVILY_API_KEY
- EXA_API_KEY
- PERPLEXITY_API_KEY
- SOUGOU_API_SID + SOUGOU_API_SK
2024-08-27 13:15:17 +05:30
- SEARCHAPI_API_KEY + SEARCHAPI_ENGINE (by default `google`)
2025-02-14 12:24:58 +08:00
- SERPAPI_API_KEY + SERPAPI_ENGINE (by default `google`)
2024-06-01 19:52:12 -07:00
Args:
query (str): The query to search for
"""
# TODO: add playwright to search the web
2026-03-17 17:58:01 -05:00
if engine == 'ollama_cloud':
2025-09-24 15:20:31 -05:00
return search_ollama_cloud(
2026-03-17 17:58:01 -05:00
'https://ollama.com',
2025-09-24 15:19:05 -05:00
request.app.state.config.OLLAMA_CLOUD_WEB_SEARCH_API_KEY,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
)
2026-03-17 17:58:01 -05:00
elif engine == 'perplexity_search':
2025-09-25 14:02:46 -05:00
if request.app.state.config.PERPLEXITY_API_KEY:
return search_perplexity_search(
request.app.state.config.PERPLEXITY_API_KEY,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
request.app.state.config.PERPLEXITY_SEARCH_API_URL,
user,
2025-09-25 14:02:46 -05:00
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No PERPLEXITY_API_KEY found in environment variables')
elif engine == 'searxng':
2024-12-11 18:05:42 -08:00
if request.app.state.config.SEARXNG_QUERY_URL:
2026-03-17 17:58:01 -05:00
searxng_kwargs = {'language': request.app.state.config.SEARXNG_LANGUAGE}
2024-06-01 19:57:00 -07:00
return search_searxng(
2024-12-11 18:05:42 -08:00
request.app.state.config.SEARXNG_QUERY_URL,
2024-06-01 19:57:00 -07:00
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
**searxng_kwargs,
2024-06-01 19:57:00 -07:00
)
2024-06-01 19:52:12 -07:00
else:
2026-03-17 17:58:01 -05:00
raise Exception('No SEARXNG_QUERY_URL found in environment variables')
elif engine == 'yacy':
if request.app.state.config.YACY_QUERY_URL:
return search_yacy(
request.app.state.config.YACY_QUERY_URL,
request.app.state.config.YACY_USERNAME,
request.app.state.config.YACY_PASSWORD,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No YACY_QUERY_URL found in environment variables')
elif engine == 'google_pse':
if request.app.state.config.GOOGLE_PSE_API_KEY and request.app.state.config.GOOGLE_PSE_ENGINE_ID:
2024-06-01 19:52:12 -07:00
return search_google_pse(
2024-12-11 18:05:42 -08:00
request.app.state.config.GOOGLE_PSE_API_KEY,
request.app.state.config.GOOGLE_PSE_ENGINE_ID,
2024-06-01 19:52:12 -07:00
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2025-11-04 13:50:07 -05:00
referer=request.app.state.config.WEBUI_URL,
2024-06-01 19:52:12 -07:00
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No GOOGLE_PSE_API_KEY or GOOGLE_PSE_ENGINE_ID found in environment variables')
elif engine == 'brave':
2024-12-11 18:05:42 -08:00
if request.app.state.config.BRAVE_SEARCH_API_KEY:
2024-06-01 19:57:00 -07:00
return search_brave(
2024-12-11 18:05:42 -08:00
request.app.state.config.BRAVE_SEARCH_API_KEY,
2024-06-01 19:57:00 -07:00
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2024-06-01 19:57:00 -07:00
)
2024-06-01 19:52:12 -07:00
else:
2026-03-17 17:58:01 -05:00
raise Exception('No BRAVE_SEARCH_API_KEY found in environment variables')
elif engine == 'kagi':
2024-12-11 18:05:42 -08:00
if request.app.state.config.KAGI_SEARCH_API_KEY:
2024-12-08 00:21:10 -05:00
return search_kagi(
2024-12-11 18:05:42 -08:00
request.app.state.config.KAGI_SEARCH_API_KEY,
2024-12-08 00:21:10 -05:00
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2024-12-08 00:21:10 -05:00
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No KAGI_SEARCH_API_KEY found in environment variables')
elif engine == 'mojeek':
2024-12-11 18:05:42 -08:00
if request.app.state.config.MOJEEK_SEARCH_API_KEY:
2024-10-29 16:45:38 +02:00
return search_mojeek(
2024-12-11 18:05:42 -08:00
request.app.state.config.MOJEEK_SEARCH_API_KEY,
2024-10-29 16:45:38 +02:00
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2024-10-29 16:45:38 +02:00
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No MOJEEK_SEARCH_API_KEY found in environment variables')
elif engine == 'bocha':
2025-02-10 16:44:47 +08:00
if request.app.state.config.BOCHA_SEARCH_API_KEY:
return search_bocha(
request.app.state.config.BOCHA_SEARCH_API_KEY,
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2025-02-10 16:44:47 +08:00
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No BOCHA_SEARCH_API_KEY found in environment variables')
elif engine == 'serpstack':
2024-12-11 18:05:42 -08:00
if request.app.state.config.SERPSTACK_API_KEY:
2024-06-01 19:52:12 -07:00
return search_serpstack(
2024-12-11 18:05:42 -08:00
request.app.state.config.SERPSTACK_API_KEY,
2024-06-01 19:52:12 -07:00
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2024-12-11 18:05:42 -08:00
https_enabled=request.app.state.config.SERPSTACK_HTTPS,
2024-06-01 19:52:12 -07:00
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No SERPSTACK_API_KEY found in environment variables')
elif engine == 'serper':
2024-12-11 18:05:42 -08:00
if request.app.state.config.SERPER_API_KEY:
2024-06-01 19:57:00 -07:00
return search_serper(
2024-12-11 18:05:42 -08:00
request.app.state.config.SERPER_API_KEY,
2024-06-01 19:57:00 -07:00
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2024-06-01 19:57:00 -07:00
)
2024-06-01 19:52:12 -07:00
else:
2026-03-17 17:58:01 -05:00
raise Exception('No SERPER_API_KEY found in environment variables')
elif engine == 'serply':
2024-12-11 18:05:42 -08:00
if request.app.state.config.SERPLY_API_KEY:
2024-06-09 20:44:34 -04:00
return search_serply(
2024-12-11 18:05:42 -08:00
request.app.state.config.SERPLY_API_KEY,
2024-06-09 20:44:34 -04:00
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
2025-08-09 00:37:37 +04:00
filter_list=request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2024-06-09 20:44:34 -04:00
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No SERPLY_API_KEY found in environment variables')
elif engine == 'duckduckgo':
2024-06-17 14:32:23 -07:00
return search_duckduckgo(
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
concurrent_requests=request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS,
backend=request.app.state.config.DDGS_BACKEND,
2024-06-17 14:32:23 -07:00
)
2026-03-17 17:58:01 -05:00
elif engine == 'tavily':
2024-12-11 18:05:42 -08:00
if request.app.state.config.TAVILY_API_KEY:
return search_tavily(
2024-12-11 18:05:42 -08:00
request.app.state.config.TAVILY_API_KEY,
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No TAVILY_API_KEY found in environment variables')
elif engine == 'exa':
2025-06-19 13:52:58 +08:00
if request.app.state.config.EXA_API_KEY:
return search_exa(
request.app.state.config.EXA_API_KEY,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No EXA_API_KEY found in environment variables')
elif engine == 'searchapi':
2024-12-11 18:05:42 -08:00
if request.app.state.config.SEARCHAPI_API_KEY:
2024-08-27 13:15:17 +05:30
return search_searchapi(
2024-12-11 18:05:42 -08:00
request.app.state.config.SEARCHAPI_API_KEY,
request.app.state.config.SEARCHAPI_ENGINE,
2024-08-27 13:15:17 +05:30
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2024-08-27 13:15:17 +05:30
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No SEARCHAPI_API_KEY found in environment variables')
elif engine == 'serpapi':
2025-02-14 12:24:58 +08:00
if request.app.state.config.SERPAPI_API_KEY:
return search_serpapi(
request.app.state.config.SERPAPI_API_KEY,
request.app.state.config.SERPAPI_ENGINE,
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2025-02-14 12:24:58 +08:00
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No SERPAPI_API_KEY found in environment variables')
elif engine == 'jina':
2024-11-03 17:07:24 -08:00
return search_jina(
2024-12-11 18:05:42 -08:00
request.app.state.config.JINA_API_KEY,
2024-11-03 17:07:24 -08:00
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
2026-01-01 02:17:47 +04:00
request.app.state.config.JINA_API_BASE_URL,
2024-11-03 17:07:24 -08:00
)
2026-03-17 17:58:01 -05:00
elif engine == 'bing':
2024-10-28 11:33:52 +02:00
return search_bing(
2024-12-11 18:05:42 -08:00
request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY,
request.app.state.config.BING_SEARCH_V7_ENDPOINT,
2024-10-28 11:33:52 +02:00
str(DEFAULT_LOCALE),
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
2024-10-28 11:33:52 +02:00
)
2026-03-17 17:58:01 -05:00
elif engine == 'azure':
2025-11-14 10:12:34 +10:00
if (
request.app.state.config.AZURE_AI_SEARCH_API_KEY
and request.app.state.config.AZURE_AI_SEARCH_ENDPOINT
and request.app.state.config.AZURE_AI_SEARCH_INDEX_NAME
):
return search_azure(
request.app.state.config.AZURE_AI_SEARCH_API_KEY,
request.app.state.config.AZURE_AI_SEARCH_ENDPOINT,
request.app.state.config.AZURE_AI_SEARCH_INDEX_NAME,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
raise Exception(
2026-03-17 17:58:01 -05:00
'AZURE_AI_SEARCH_API_KEY, AZURE_AI_SEARCH_ENDPOINT, and AZURE_AI_SEARCH_INDEX_NAME are required for Azure AI Search'
2025-11-14 10:12:34 +10:00
)
2026-03-17 17:58:01 -05:00
elif engine == 'exa':
2025-06-24 21:24:53 -04:00
return search_exa(
request.app.state.config.EXA_API_KEY,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
)
2026-03-17 17:58:01 -05:00
elif engine == 'perplexity':
return search_perplexity(
request.app.state.config.PERPLEXITY_API_KEY,
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
model=request.app.state.config.PERPLEXITY_MODEL,
search_context_usage=request.app.state.config.PERPLEXITY_SEARCH_CONTEXT_USAGE,
)
2026-03-17 17:58:01 -05:00
elif engine == 'sougou':
if request.app.state.config.SOUGOU_API_SID and request.app.state.config.SOUGOU_API_SK:
return search_sougou(
request.app.state.config.SOUGOU_API_SID,
request.app.state.config.SOUGOU_API_SK,
query,
2025-04-12 16:33:36 -07:00
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
2026-03-17 17:58:01 -05:00
raise Exception('No SOUGOU_API_SID or SOUGOU_API_SK found in environment variables')
elif engine == 'firecrawl':
2025-04-24 14:57:28 +08:00
return search_firecrawl(
request.app.state.config.FIRECRAWL_API_BASE_URL,
request.app.state.config.FIRECRAWL_API_KEY,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
)
2026-03-17 17:58:01 -05:00
elif engine == 'external':
return search_external(
request,
request.app.state.config.EXTERNAL_WEB_SEARCH_URL,
request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
user=user,
)
2026-03-17 17:58:01 -05:00
elif engine == 'yandex':
2026-01-26 17:31:44 +05:00
return search_yandex(
request,
request.app.state.config.YANDEX_WEB_SEARCH_URL,
request.app.state.config.YANDEX_WEB_SEARCH_API_KEY,
request.app.state.config.YANDEX_WEB_SEARCH_CONFIG,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
user=user,
)
2026-03-17 17:58:01 -05:00
elif engine == 'youcom':
return search_youcom(
request.app.state.config.YOUCOM_API_KEY,
query,
request.app.state.config.WEB_SEARCH_RESULT_COUNT,
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
)
2024-06-01 19:52:12 -07:00
else:
2026-03-17 17:58:01 -05:00
raise Exception('No search engine API key found in environment variables')
2024-06-01 19:52:12 -07:00
2026-03-17 17:58:01 -05:00
@router.post('/process/web/search')
async def process_web_search(request: Request, form_data: SearchForm, user=Depends(get_verified_user)):
    """
    Run a web search for every query in the form, then load or index the results.

    Flow:
      1. Reject the request with 403 when web search is disabled, or when the
         caller is not an admin and lacks the 'features.web_search' permission.
      2. Call `search_web` once per query in the threadpool, optionally capped
         by WEB_SEARCH_CONCURRENT_REQUESTS, and collect the de-duplicated links.
      3. Build documents either directly from the result snippets
         (BYPASS_WEB_SEARCH_WEB_LOADER) or by fetching the pages with the
         web loader.
      4. Return the documents inline (BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL)
         or save them into a single 'web-search-<sha256>' collection.

    Raises:
        HTTPException: 403 when not permitted, 400 when searching or loading
            fails, 404 when no search result carries a link.
    """
    if not request.app.state.config.ENABLE_WEB_SEARCH:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=ERROR_MESSAGES.ACCESS_PROHIBITED,
        )

    if user.role != 'admin' and not await has_permission(
        user.id, 'features.web_search', request.app.state.config.USER_PERMISSIONS
    ):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=ERROR_MESSAGES.ACCESS_PROHIBITED,
        )

    urls = []
    result_items = []

    try:
        # Use the module logger (not the root `logging` module) so this line
        # honours the same logger configuration as the rest of the file.
        log.debug(f'trying to web search with {request.app.state.config.WEB_SEARCH_ENGINE, form_data.queries}')

        engine = request.app.state.config.WEB_SEARCH_ENGINE

        async def run_search(query):
            # search_web is synchronous; run it off the event loop.
            return await run_in_threadpool(search_web, request, engine, query, user)

        # Use semaphore to limit concurrent requests based on WEB_SEARCH_CONCURRENT_REQUESTS
        # 0 or None = unlimited (previous behavior), positive number = limited concurrency
        # Set to 1 for sequential execution (rate-limited APIs like Brave free tier)
        concurrent_limit = request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS

        if concurrent_limit:
            semaphore = asyncio.Semaphore(concurrent_limit)

            async def run_search_limited(query):
                async with semaphore:
                    return await run_search(query)

            search_tasks = [run_search_limited(query) for query in form_data.queries]
        else:
            # Unlimited parallel execution (previous behavior)
            search_tasks = [run_search(query) for query in form_data.queries]

        search_results = await asyncio.gather(*search_tasks)

        for result in search_results:
            if not result:
                continue
            for item in result:
                if item and item.link:
                    result_items.append(item)
                    urls.append(item.link)

        # dict.fromkeys de-duplicates while preserving first-seen order.
        urls = list(dict.fromkeys(urls))
        log.debug(f'urls: {urls}')

    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.WEB_SEARCH_ERROR(e),
        )

    if not urls:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.DEFAULT('No results found from web search'),
        )

    try:
        if request.app.state.config.BYPASS_WEB_SEARCH_WEB_LOADER:
            # The emptiness filter must come BEFORE the inner `for`: with the
            # filter placed last, a None result is iterated first and raises
            # TypeError, failing the whole request even though other queries
            # returned usable results.
            flat_results = [item for result in search_results if result for item in result]

            docs = [
                Document(
                    page_content=result.snippet,
                    metadata={
                        'source': result.link,
                        'title': result.title,
                        'snippet': result.snippet,
                        'link': result.link,
                    },
                )
                for result in flat_results
                if hasattr(result, 'snippet') and result.snippet is not None
            ]
        else:
            loader = get_web_loader(
                urls,
                verify_ssl=request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION,
                requests_per_second=request.app.state.config.WEB_LOADER_CONCURRENT_REQUESTS,
                trust_env=request.app.state.config.WEB_SEARCH_TRUST_ENV,
            )
            docs = await loader.aload()

        # only keep the urls returned by the loader
        urls = [doc.metadata.get('source') for doc in docs if doc.metadata.get('source')]
        # only keep the search results that have been loaded
        result_items = [dict(item) for item in result_items if item.link in urls]

        if request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
            return {
                'status': True,
                'collection_name': None,
                'filenames': urls,
                'items': result_items,
                'docs': [
                    {
                        'content': doc.page_content,
                        'metadata': doc.metadata,
                    }
                    for doc in docs
                ],
                'loaded_count': len(docs),
            }

        # Create a single collection for all documents
        collection_name = f'web-search-{calculate_sha256_string("-".join(form_data.queries))}'[:63]

        try:
            await run_in_threadpool(
                save_docs_to_vector_db,
                request,
                docs,
                collection_name,
                overwrite=True,
                user=user,
            )
        except Exception as e:
            # Best effort: a failed save is logged but the response is still
            # returned so the caller receives the search results.
            log.debug(f'error saving docs: {e}')

        return {
            'status': True,
            'collection_names': [collection_name],
            'items': result_items,
            'filenames': urls,
            'loaded_count': len(docs),
        }
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
2026-04-17 13:35:35 +09:00
async def _validate_collection_access(collection_names: list[str], user, access_type: str = 'read') -> None:
    """
    Ensure the user may access every one of the requested collections.

    The decision is delegated to the shared filter_accessible_collections
    utility so the access rules live in a single place.

    Raises:
        HTTPException: 403 if at least one requested collection is not in the
            set returned by filter_accessible_collections.
    """
    wanted = set(collection_names)
    permitted = await filter_accessible_collections(wanted, user, access_type=access_type)

    # Anything requested but not permitted is a denial.
    if wanted - permitted:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=ERROR_MESSAGES.ACCESS_PROHIBITED,
        )
2024-09-28 02:23:09 +02:00
class QueryDocForm(BaseModel):
    """Request body for querying a single collection via POST /query/doc."""

    collection_name: str
    query: str
    # Optional overrides; when unset (or falsy) the handler falls back to the
    # corresponding application config value.
    k: Optional[int] = None
    k_reranker: Optional[int] = None
    r: Optional[float] = None
    hybrid: Optional[bool] = None
    # The /query/doc handler reads this attribute on the hybrid-search path;
    # without the declaration that access raises AttributeError on the model.
    hybrid_bm25_weight: Optional[float] = None
2024-03-24 00:40:27 -07:00
2026-03-17 17:58:01 -05:00
@router.post('/query/doc')
async def query_doc_handler(
    request: Request,
    form_data: QueryDocForm,
    user=Depends(get_verified_user),
):
    """
    Query a single collection, using hybrid search when it is enabled.

    Hybrid search is used when ENABLE_RAG_HYBRID_SEARCH is set and the form
    does not explicitly disable it (`hybrid` is None or truthy); otherwise a
    plain embedding search is run through `query_doc`.

    Raises:
        HTTPException: 403 (from _validate_collection_access) when the user may
            not access the collection, 400 when the query itself fails.
    """
    await _validate_collection_access([form_data.collection_name], user)

    try:
        config = request.app.state.config
        top_k = form_data.k if form_data.k else config.TOP_K

        if config.ENABLE_RAG_HYBRID_SEARCH and (form_data.hybrid is None or form_data.hybrid):
            collection_result = await ASYNC_VECTOR_DB_CLIENT.get(collection_name=form_data.collection_name)

            # Read the weight defensively: if the form model does not declare
            # `hybrid_bm25_weight`, plain attribute access raises
            # AttributeError and every hybrid query would fail with a 400.
            bm25_weight = getattr(form_data, 'hybrid_bm25_weight', None)

            return await query_doc_with_hybrid_search(
                collection_name=form_data.collection_name,
                collection_result=collection_result,
                query=form_data.query,
                embedding_function=lambda query, prefix: request.app.state.EMBEDDING_FUNCTION(
                    query, prefix=prefix, user=user
                ),
                k=top_k,
                reranking_function=(
                    (lambda query, documents: request.app.state.RERANKING_FUNCTION(query, documents, user=user))
                    if request.app.state.RERANKING_FUNCTION
                    else None
                ),
                k_reranker=form_data.k_reranker or config.TOP_K_RERANKER,
                r=(form_data.r if form_data.r else config.RELEVANCE_THRESHOLD),
                hybrid_bm25_weight=(bm25_weight if bm25_weight else config.HYBRID_BM25_WEIGHT),
                user=user,
            )

        query_embedding = await request.app.state.EMBEDDING_FUNCTION(
            form_data.query, prefix=RAG_EMBEDDING_QUERY_PREFIX, user=user
        )
        # query_doc wraps a blocking VECTOR_DB_CLIENT.search call;
        # offload so the request's event loop stays responsive.
        return await asyncio.to_thread(
            query_doc,
            collection_name=form_data.collection_name,
            query_embedding=query_embedding,
            k=top_k,
            user=user,
        )
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
2024-03-24 00:40:27 -07:00
2024-09-28 02:23:09 +02:00
class QueryCollectionsForm(BaseModel):
    # Request body for querying several collections via POST /query/collection.
    collection_names: list[str]
    query: str

    # Optional per-request overrides; the handler substitutes the matching
    # application config value when one of these is left unset.
    k: Optional[int] = None
    k_reranker: Optional[int] = None
    r: Optional[float] = None
    hybrid: Optional[bool] = None
    hybrid_bm25_weight: Optional[float] = None
    enable_enriched_texts: Optional[bool] = None
2024-03-25 23:47:08 -07:00
2024-03-24 00:40:27 -07:00
2026-03-17 17:58:01 -05:00
@router.post('/query/collection')
async def query_collection_handler(
    request: Request,
    form_data: QueryCollectionsForm,
    user=Depends(get_verified_user),
):
    """
    Query several collections at once, using hybrid search when it is enabled.

    Raises:
        HTTPException: 403 (from _validate_collection_access) when any
            collection is not accessible to the user, 400 when the query fails.
    """
    await _validate_collection_access(form_data.collection_names, user)

    try:
        state = request.app.state
        config = state.config

        def embed(query, prefix):
            # Resolve the embedding function at call time and bind the user.
            return state.EMBEDDING_FUNCTION(query, prefix=prefix, user=user)

        top_k = form_data.k if form_data.k else config.TOP_K
        hybrid_requested = form_data.hybrid is None or form_data.hybrid

        if not (config.ENABLE_RAG_HYBRID_SEARCH and hybrid_requested):
            return await query_collection(
                request,
                collection_names=form_data.collection_names,
                queries=[form_data.query],
                embedding_function=embed,
                k=top_k,
            )

        if state.RERANKING_FUNCTION:

            def rerank(query, documents):
                return state.RERANKING_FUNCTION(query, documents, user=user)

        else:
            rerank = None

        if form_data.hybrid_bm25_weight:
            bm25_weight = form_data.hybrid_bm25_weight
        else:
            bm25_weight = config.HYBRID_BM25_WEIGHT

        # An explicit False must be honoured, hence the `is not None` test.
        if form_data.enable_enriched_texts is not None:
            enriched_texts = form_data.enable_enriched_texts
        else:
            enriched_texts = config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS

        return await query_collection_with_hybrid_search(
            collection_names=form_data.collection_names,
            queries=[form_data.query],
            embedding_function=embed,
            k=top_k,
            reranking_function=rerank,
            k_reranker=form_data.k_reranker or config.TOP_K_RERANKER,
            r=(form_data.r if form_data.r else config.RELEVANCE_THRESHOLD),
            hybrid_bm25_weight=bm25_weight,
            enable_enriched_texts=enriched_texts,
        )
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
2024-09-28 02:23:09 +02:00
####################################
#
# Vector DB operations
#
####################################
2024-02-17 21:06:08 -08:00
2024-10-03 06:44:17 -07:00
class DeleteForm(BaseModel):
    # Request body for POST /delete: the collection to purge entries from and
    # the file whose stored hash selects the entries.
    collection_name: str
    file_id: str
2026-03-17 17:58:01 -05:00
@router.post('/delete')
async def delete_entries_from_collection(
    form_data: DeleteForm,
    user=Depends(get_admin_user),
    db: AsyncSession = Depends(get_async_session),
):
    """
    Delete the vector entries of one file (matched by its hash) from a collection.

    Returns:
        {'status': True} after the delete call was issued; {'status': False}
        when the collection does not exist or an unexpected error occurred.

    Raises:
        HTTPException: 404 when the file does not exist, 400 when the file has
            no hash to match on.
    """
    try:
        collection_exists = await ASYNC_VECTOR_DB_CLIENT.has_collection(collection_name=form_data.collection_name)
        if not collection_exists:
            return {'status': False}

        file = await Files.get_file_by_id(form_data.file_id, db=db)
        if not file:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=ERROR_MESSAGES.NOT_FOUND,
            )

        file_hash = file.hash
        # Never issue a `filter={'hash': None}` query: how a null filter value
        # matches is backend-dependent (some backends ignore the key, some
        # match every row whose metadata lacks `hash`), which risks deleting
        # unrelated entries. Files without a hash are typically unprocessed,
        # failed or legacy records that cannot be targeted by hash anyway.
        if file_hash is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT('File has no hash; cannot delete vector entries by hash.'),
            )

        # The keyword must be `filter`: `metadata=` is not accepted by
        # `VectorDBBase.delete` or any backend, and the resulting TypeError
        # used to be swallowed below, so the endpoint always reported failure.
        await ASYNC_VECTOR_DB_CLIENT.delete(
            collection_name=form_data.collection_name,
            filter={'hash': file_hash},
        )
        return {'status': True}
    except HTTPException:
        # Caller-meaningful errors (404/400 above) must not be swallowed and
        # re-shaped as `{'status': False}`.
        raise
    except Exception as e:
        log.exception(e)
        return {'status': False}
2024-10-03 06:44:17 -07:00
2026-03-17 17:58:01 -05:00
@router.post('/reset/db')
async def reset_vector_db(
    user=Depends(get_admin_user),
    db: AsyncSession = Depends(get_async_session),
):
    """Reset the vector database, then delete all knowledge records (admin only)."""
    # Order matters: the vector store is reset first, then the knowledge rows.
    await ASYNC_VECTOR_DB_CLIENT.reset()
    await Knowledges.delete_all_knowledge(db=db)
2024-01-07 01:40:36 -08:00
2026-03-17 17:58:01 -05:00
@router.post('/reset/uploads')
async def reset_upload_dir(user=Depends(get_admin_user)) -> bool:
    """
    Remove every entry inside the upload directory (admin only).

    Failures are logged and never propagated, so the endpoint always
    returns True.
    """
    folder = f'{UPLOAD_DIR}'
    try:
        if not os.path.exists(folder):
            log.warning(f'The directory {folder} does not exist')
            return True

        # Walk the direct children only; directories are removed recursively.
        for entry_name in os.listdir(folder):
            file_path = os.path.join(folder, entry_name)
            try:
                is_file_or_link = os.path.isfile(file_path) or os.path.islink(file_path)
                if is_file_or_link:
                    os.unlink(file_path)  # Remove the file or link
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)  # Remove the directory
            except Exception as e:
                # One undeletable entry must not stop the rest of the cleanup.
                log.exception(f'Failed to delete {file_path}. Reason: {e}')
    except Exception as e:
        log.exception(f'Failed to process the directory {folder}. Reason: {e}')
    return True
2024-05-19 06:51:32 -07:00
2024-06-12 00:18:22 -07:00
2026-03-17 17:58:01 -05:00
# Debug-only route: registered solely when the app runs with ENV == 'dev'.
if ENV == 'dev':

    @router.get('/ef/{text}')
    async def get_embeddings(request: Request, text: Optional[str] = 'Hello World!'):
        """Return the query embedding of `text` as {'result': <embedding>}."""
        embedding = await request.app.state.EMBEDDING_FUNCTION(text, prefix=RAG_EMBEDDING_QUERY_PREFIX)
        return {'result': embedding}
2024-12-17 18:40:50 -08:00
class BatchProcessFilesForm(BaseModel):
    # Request body for POST /process/files/batch: the files to index and the
    # collection their documents are saved into.
    files: List[FileModel]
    collection_name: str
2024-12-17 18:40:50 -08:00
class BatchProcessFilesResult(BaseModel):
    # Per-file outcome of a batch run.
    file_id: str
    # The batch handler sets this to 'prepared', 'completed' or 'failed'.
    status: str
    # Failure reason; left as None when nothing went wrong.
    error: Optional[str] = None
2024-12-17 18:40:50 -08:00
class BatchProcessFilesResponse(BaseModel):
    # Response of POST /process/files/batch: the outcome of every file that
    # reached preparation, plus a separate list of failures.
    results: List[BatchProcessFilesResult]
    errors: List[BatchProcessFilesResult]
2024-12-17 18:40:50 -08:00
2026-03-17 17:58:01 -05:00
@router.post('/process/files/batch')
async def process_files_batch(
    request: Request,
    form_data: BatchProcessFilesForm,
    user=Depends(get_verified_user),
    db=None,
) -> BatchProcessFilesResponse:
    """
    Process a batch of files and save them to the vector database.

    NOTE: We intentionally do NOT use Depends(get_async_session) here.
    The save_docs_to_vector_db() call makes external embedding API calls which
    can take 5-60+ seconds for batch operations. Database operations after
    embedding (Files.update_file_by_id) manage their own short-lived sessions.

    Returns:
        BatchProcessFilesResponse whose `results` holds one entry per file that
        was prepared and whose `errors` holds one entry per failure.
    """
    collection_name = form_data.collection_name

    file_results: List[BatchProcessFilesResult] = []
    file_errors: List[BatchProcessFilesResult] = []
    pending_updates: List[FileUpdateForm] = []
    all_docs: List[Document] = []

    def failure(file_id, reason):
        # Build the error record for one file.
        return BatchProcessFilesResult(file_id=file_id, status='failed', error=reason)

    # Phase 1: validate each file and turn its content into a document.
    for file in form_data.files:
        try:
            # Ownership check: verify the requesting user owns the file or is an admin
            db_file = await Files.get_file_by_id(file.id, db=db)
            if not db_file:
                file_errors.append(failure(file.id, 'File not found'))
                continue

            is_owner = db_file.user_id == user.id
            if not is_owner and user.role != 'admin':
                file_errors.append(failure(file.id, 'Permission denied: not file owner'))
                continue

            text_content = file.data.get('content', '')
            metadata = {
                **file.meta,
                'name': file.filename,
                'created_by': file.user_id,
                'file_id': file.id,
                'source': file.filename,
            }
            all_docs.append(
                Document(
                    page_content=text_content.replace('<br/>', '\n'),
                    metadata=metadata,
                )
            )

            # pending_updates and file_results grow in lockstep so they can be
            # zipped together once the documents have been saved.
            pending_updates.append(
                FileUpdateForm(
                    hash=calculate_sha256_string(text_content),
                    data={'content': text_content},
                )
            )
            file_results.append(BatchProcessFilesResult(file_id=file.id, status='prepared'))
        except Exception as e:
            log.error(f'process_files_batch: Error processing file {file.id}: {str(e)}')
            file_errors.append(failure(file.id, str(e)))

    # Nothing was prepared, so there is nothing to embed or update.
    if not all_docs:
        return BatchProcessFilesResponse(results=file_results, errors=file_errors)

    # Phase 2: save all documents in one batch, then update the file records.
    try:
        await run_in_threadpool(
            save_docs_to_vector_db,
            request,
            all_docs,
            collection_name,
            add=True,
            user=user,
        )

        for update_form, prepared in zip(pending_updates, file_results):
            await Files.update_file_by_id(id=prepared.file_id, form_data=update_form, db=db)
            prepared.status = 'completed'
    except Exception as e:
        log.error(f'process_files_batch: Error saving documents to vector DB: {str(e)}')
        # A batch-level failure marks every prepared file as failed.
        for prepared in file_results:
            prepared.status = 'failed'
            file_errors.append(failure(prepared.file_id, str(e)))

    return BatchProcessFilesResponse(results=file_results, errors=file_errors)