""" NOTE: This vector database integration is community-supported and maintained on a best-effort basis. """ from elasticsearch import Elasticsearch, BadRequestError from typing import Optional import ssl from elasticsearch.helpers import bulk, scan from open_webui.retrieval.vector.utils import process_metadata from open_webui.retrieval.vector.main import ( VectorDBBase, VectorItem, SearchResult, GetResult, ) from open_webui.config import ( ELASTICSEARCH_URL, ELASTICSEARCH_CA_CERTS, ELASTICSEARCH_API_KEY, ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD, ELASTICSEARCH_CLOUD_ID, ELASTICSEARCH_INDEX_PREFIX, SSL_ASSERT_FINGERPRINT, ) class ElasticsearchClient(VectorDBBase): """ Important: in order to reduce the number of indexes and since the embedding vector length is fixed, we avoid creating an index for each file but store it as a text field, while seperating to different index baesd on the embedding length. """ def __init__(self): self.index_prefix = ELASTICSEARCH_INDEX_PREFIX self.client = Elasticsearch( hosts=[ELASTICSEARCH_URL], ca_certs=ELASTICSEARCH_CA_CERTS, api_key=ELASTICSEARCH_API_KEY, cloud_id=ELASTICSEARCH_CLOUD_ID, basic_auth=( (ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD) if ELASTICSEARCH_USERNAME and ELASTICSEARCH_PASSWORD else None ), ssl_assert_fingerprint=SSL_ASSERT_FINGERPRINT, ) # Status: works def _get_index_name(self, dimension: int) -> str: return f'{self.index_prefix}_d{str(dimension)}' # Status: works def _scan_result_to_get_result(self, result) -> GetResult: if not result: return None ids = [] documents = [] metadatas = [] for hit in result: ids.append(hit['_id']) documents.append(hit['_source'].get('text')) metadatas.append(hit['_source'].get('metadata')) return GetResult(ids=[ids], documents=[documents], metadatas=[metadatas]) # Status: works def _result_to_get_result(self, result) -> GetResult: if not result['hits']['hits']: return None ids = [] documents = [] metadatas = [] for hit in result['hits']['hits']: ids.append(hit['_id']) documents.append(hit['_source'].get('text')) metadatas.append(hit['_source'].get('metadata')) return GetResult(ids=[ids], documents=[documents], metadatas=[metadatas]) # Status: works def _result_to_search_result(self, result) -> SearchResult: ids = [] distances = [] documents = [] metadatas = [] for hit in result['hits']['hits']: ids.append(hit['_id']) distances.append(hit['_score']) documents.append(hit['_source'].get('text')) metadatas.append(hit['_source'].get('metadata')) return SearchResult( ids=[ids], distances=[distances], documents=[documents], metadatas=[metadatas], ) # Status: works def _create_index(self, dimension: int): body = { 'mappings': { 'dynamic_templates': [ { 'strings': { 'match_mapping_type': 'string', 'mapping': {'type': 'keyword'}, } } ], 'properties': { 'collection': {'type': 'keyword'}, 'id': {'type': 'keyword'}, 'vector': { 'type': 'dense_vector', 'dims': dimension, # Adjust based on your vector dimensions 'index': True, 'similarity': 'cosine', }, 'text': {'type': 'text'}, 'metadata': {'type': 'object'}, }, } } self.client.indices.create(index=self._get_index_name(dimension), body=body) # Status: works def _create_batches(self, items: list[VectorItem], batch_size=100): for i in range(0, len(items), batch_size): yield items[i : min(i + batch_size, len(items))] # Status: works def has_collection(self, collection_name) -> bool: query_body = {'query': {'bool': {'filter': []}}} query_body['query']['bool']['filter'].append({'term': {'collection': collection_name}}) try: result = self.client.count(index=f'{self.index_prefix}*', body=query_body) return result.body['count'] > 0 except Exception as e: return None def delete_collection(self, collection_name: str): query = {'query': {'term': {'collection': collection_name}}} self.client.delete_by_query(index=f'{self.index_prefix}*', body=query) # Status: works def search( self, collection_name: str, vectors: list[list[float]], filter: Optional[dict] = None, limit: int = 10, ) -> Optional[SearchResult]: query = { 'size': limit, '_source': ['text', 'metadata'], 'query': { 'script_score': { 'query': {'bool': {'filter': [{'term': {'collection': collection_name}}]}}, 'script': { 'source': "cosineSimilarity(params.vector, 'vector') + 1.0", 'params': {'vector': vectors[0]}, # Assuming single query vector }, } }, } result = self.client.search(index=self._get_index_name(len(vectors[0])), body=query) return self._result_to_search_result(result) # Status: only tested halfwat def query(self, collection_name: str, filter: dict, limit: Optional[int] = None) -> Optional[GetResult]: if not self.has_collection(collection_name): return None query_body = { 'query': {'bool': {'filter': []}}, '_source': ['text', 'metadata'], } for field, value in filter.items(): query_body['query']['bool']['filter'].append({'term': {field: value}}) query_body['query']['bool']['filter'].append({'term': {'collection': collection_name}}) size = limit if limit else 10 try: result = self.client.search( index=f'{self.index_prefix}*', body=query_body, size=size, ) return self._result_to_get_result(result) except Exception as e: return None # Status: works def _has_index(self, dimension: int): return self.client.indices.exists(index=self._get_index_name(dimension=dimension)) def get_or_create_index(self, dimension: int): if not self._has_index(dimension=dimension): self._create_index(dimension=dimension) # Status: works def get(self, collection_name: str) -> Optional[GetResult]: # Get all the items in the collection. query = { 'query': {'bool': {'filter': [{'term': {'collection': collection_name}}]}}, '_source': ['text', 'metadata'], } results = list(scan(self.client, index=f'{self.index_prefix}*', query=query)) return self._scan_result_to_get_result(results) # Status: works def insert(self, collection_name: str, items: list[VectorItem]): if not self._has_index(dimension=len(items[0]['vector'])): self._create_index(dimension=len(items[0]['vector'])) for batch in self._create_batches(items): actions = [ { '_index': self._get_index_name(dimension=len(items[0]['vector'])), '_id': item['id'], '_source': { 'collection': collection_name, 'vector': item['vector'], 'text': item['text'], 'metadata': process_metadata(item['metadata']), }, } for item in batch ] bulk(self.client, actions) # Upsert documents using the update API with doc_as_upsert=True. def upsert(self, collection_name: str, items: list[VectorItem]): if not self._has_index(dimension=len(items[0]['vector'])): self._create_index(dimension=len(items[0]['vector'])) for batch in self._create_batches(items): actions = [ { '_op_type': 'update', '_index': self._get_index_name(dimension=len(item['vector'])), '_id': item['id'], 'doc': { 'collection': collection_name, 'vector': item['vector'], 'text': item['text'], 'metadata': process_metadata(item['metadata']), }, 'doc_as_upsert': True, } for item in batch ] bulk(self.client, actions) # Delete specific documents from a collection by filtering on both collection and document IDs. def delete( self, collection_name: str, ids: Optional[list[str]] = None, filter: Optional[dict] = None, ): query = {'query': {'bool': {'filter': [{'term': {'collection': collection_name}}]}}} # logic based on chromaDB if ids: query['query']['bool']['filter'].append({'terms': {'_id': ids}}) elif filter: for field, value in filter.items(): query['query']['bool']['filter'].append({'term': {f'metadata.{field}': value}}) self.client.delete_by_query(index=f'{self.index_prefix}*', body=query) def reset(self): indices = self.client.indices.get(index=f'{self.index_prefix}*') for index in indices: self.client.indices.delete(index=index)