This commit is contained in:
Timothy Jaeryang Baek
2026-03-13 20:57:12 -05:00
parent 157ff57c40
commit 6862d618ee
3 changed files with 111 additions and 15 deletions
+105 -12
View File
@@ -13,12 +13,6 @@ from langchain_community.document_loaders import (
OutlookMessageLoader,
PyPDFLoader,
TextLoader,
UnstructuredEPubLoader,
UnstructuredExcelLoader,
UnstructuredODTLoader,
UnstructuredPowerPointLoader,
UnstructuredRSTLoader,
UnstructuredXMLLoader,
YoutubeLoader,
)
from langchain_core.documents import Document
@@ -92,6 +86,54 @@ known_source_ext = [
]
class ExcelLoader:
    """Fallback Excel loader using pandas when unstructured is not installed.

    Each worksheet is rendered as plain text and all worksheets are combined
    into a single Document.
    """

    def __init__(self, file_path):
        # Path of the workbook; also reported as the Document's "source".
        self.file_path = file_path

    def load(self) -> list[Document]:
        """Read every sheet of the workbook and return them as one Document.

        Returns:
            A single-element list. The Document's page_content holds one
            "Sheet: <name>" section per worksheet, separated by blank lines,
            and its metadata carries the file path under "source".
        """
        # Imported inside the method, so pandas is only loaded when load() runs.
        import pandas as pd

        text_parts = []
        # The context manager closes the underlying workbook handle even if
        # reading one of the sheets raises.
        with pd.ExcelFile(self.file_path) as xls:
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name)
                text_parts.append(
                    f"Sheet: {sheet_name}\n{df.to_string(index=False)}"
                )

        return [
            Document(
                page_content="\n\n".join(text_parts),
                metadata={"source": self.file_path},
            )
        ]
class PptxLoader:
    """Fallback PowerPoint loader using python-pptx when unstructured is not installed."""

    def __init__(self, file_path):
        # Path of the presentation; also reported as the Document's "source".
        self.file_path = file_path

    def load(self) -> list[Document]:
        """Collect the text of every slide into a single Document.

        Returns:
            A single-element list. The Document's page_content holds one
            "Slide <n>:" section (numbered from 1) for each slide that has at
            least one shape with a text frame, separated by blank lines.
        """
        from pptx import Presentation

        presentation = Presentation(self.file_path)
        sections = []
        for number, slide in enumerate(presentation.slides, start=1):
            texts = [
                shape.text_frame.text
                for shape in slide.shapes
                if shape.has_text_frame
            ]
            # Slides without any text-bearing shape contribute no section.
            if not texts:
                continue
            sections.append(f"Slide {number}:\n" + "\n".join(texts))

        content = "\n\n".join(sections)
        return [
            Document(
                page_content=content,
                metadata={"source": self.file_path},
            )
        ]
class TikaLoader:
def __init__(self, url, file_path, mime_type=None, extract_images=None):
self.url = url
@@ -371,15 +413,40 @@ class Loader:
elif file_ext == "csv":
loader = CSVLoader(file_path, autodetect_encoding=True)
elif file_ext == "rst":
loader = UnstructuredRSTLoader(file_path, mode="elements")
try:
from langchain_community.document_loaders import UnstructuredRSTLoader
loader = UnstructuredRSTLoader(file_path, mode="elements")
except ImportError:
log.warning(
"The 'unstructured' package is not installed. "
"Falling back to plain text loading for .rst file. "
"Install it with: pip install unstructured"
)
loader = TextLoader(file_path, autodetect_encoding=True)
elif file_ext == "xml":
loader = UnstructuredXMLLoader(file_path)
try:
from langchain_community.document_loaders import UnstructuredXMLLoader
loader = UnstructuredXMLLoader(file_path)
except ImportError:
log.warning(
"The 'unstructured' package is not installed. "
"Falling back to plain text loading for .xml file. "
"Install it with: pip install unstructured"
)
loader = TextLoader(file_path, autodetect_encoding=True)
elif file_ext in ["htm", "html"]:
loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
elif file_ext == "md":
loader = TextLoader(file_path, autodetect_encoding=True)
elif file_content_type == "application/epub+zip":
loader = UnstructuredEPubLoader(file_path)
try:
from langchain_community.document_loaders import UnstructuredEPubLoader
loader = UnstructuredEPubLoader(file_path)
except ImportError:
raise ValueError(
"Processing .epub files requires the 'unstructured' package. "
"Install it with: pip install unstructured"
)
elif (
file_content_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
@@ -390,19 +457,45 @@ class Loader:
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
] or file_ext in ["xls", "xlsx"]:
loader = UnstructuredExcelLoader(file_path)
try:
from langchain_community.document_loaders import UnstructuredExcelLoader
loader = UnstructuredExcelLoader(file_path)
except ImportError:
log.warning(
"The 'unstructured' package is not installed. "
"Falling back to pandas for Excel file loading. "
"Install unstructured for better results: pip install unstructured"
)
loader = ExcelLoader(file_path)
elif file_content_type in [
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
] or file_ext in ["ppt", "pptx"]:
loader = UnstructuredPowerPointLoader(file_path)
try:
from langchain_community.document_loaders import UnstructuredPowerPointLoader
loader = UnstructuredPowerPointLoader(file_path)
except ImportError:
log.warning(
"The 'unstructured' package is not installed. "
"Falling back to python-pptx for PowerPoint file loading. "
"Install unstructured for better results: pip install unstructured"
)
loader = PptxLoader(file_path)
elif file_ext == "msg":
loader = OutlookMessageLoader(file_path)
elif file_ext == "odt":
loader = UnstructuredODTLoader(file_path)
try:
from langchain_community.document_loaders import UnstructuredODTLoader
loader = UnstructuredODTLoader(file_path)
except ImportError:
raise ValueError(
"Processing .odt files requires the 'unstructured' package. "
"Install it with: pip install unstructured"
)
elif self._is_text_file(file_ext, file_content_type):
loader = TextLoader(file_path, autodetect_encoding=True)
else:
loader = TextLoader(file_path, autodetect_encoding=True)
return loader
+2 -1
View File
@@ -69,8 +69,9 @@ fpdf2==2.8.7
pymdown-extensions==10.21
docx2txt==0.9
python-pptx==1.0.2
unstructured==0.18.31
msoffcrypto-tool==6.0.0
unstructured==0.18.31
nltk==3.9.3
Markdown==3.10.2
beautifulsoup4
+4 -2
View File
@@ -77,7 +77,6 @@ dependencies = [
"pymdown-extensions==10.21",
"docx2txt==0.9",
"python-pptx==1.0.2",
"unstructured==0.18.31",
"msoffcrypto-tool==6.0.0",
"nltk==3.9.3",
"Markdown==3.10.2",
@@ -140,12 +139,14 @@ postgres = [
mariadb = [
"mariadb==1.1.14",
]
unstructured = [
"unstructured==0.18.31",
]
all = [
"pymongo",
"psycopg2-binary==2.9.11",
"pgvector==0.4.2",
"mariadb==1.1.14",
"moto[s3]>=5.0.26",
"gcp-storage-emulator>=2024.8.3",
"docker~=7.1.0",
@@ -164,6 +165,7 @@ all = [
"firecrawl-py==4.18.0",
"azure-search-documents==11.6.0",
"unstructured==0.18.31",
]
[project.scripts]