Skip to content

Commit cc880f6

Browse files
committed
Add a regex for archive source
1 parent b8aa43a commit cc880f6

File tree

2 files changed

+16
-2
lines changed

2 files changed

+16
-2
lines changed

llmstack/common/utils/text_extract.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,8 @@ def extract_text_elements(
177177
elif mime_type == "text/markdown":
178178
elements = partition_md(text=data.decode(charset), chunking_strategy=chunking_strategy)
179179
else:
180-
raise Exception("Unsupported file type")
180+
logger.error(f"Unsupported mime type: {mime_type}")
181+
elements = []
181182

182183
# Merge elements depending on metadata page number
183184
merged_elements = []

llmstack/data/sources/files/archive.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import io
33
import logging
44
import mimetypes
5+
import re
56
import tarfile
67
import uuid
78
import zipfile
@@ -30,7 +31,9 @@ def extract_archive_files(mime_type, file_name, file_data):
3031
continue
3132
with archive.open(file_info) as file:
3233
file_mime_type = mimetypes.guess_type(file_info.filename)[0]
33-
data_uri = f"data:{file_mime_type};name={file_info.filename};base64,{base64.b64encode(file.read()).decode()}"
34+
filename = file_info.filename
35+
filename = "/".join(filename.split("/")[1:])
36+
data_uri = f"data:{file_mime_type};name={filename};base64,{base64.b64encode(file.read()).decode()}"
3437
extracted_files.append(data_uri)
3538
elif mime_type in ["application/x-tar", "application/gzip", "application/x-bzip2"]:
3639
with tarfile.open(fileobj=io.BytesIO(file_data), mode="r:*") as archive:
@@ -65,6 +68,11 @@ class ArchiveFileSchema(BaseSource):
6568
description="Split the archive into individual files",
6669
json_schema_extra={"advanced_parameter": True},
6770
)
71+
file_regex: str = Field(
72+
default=None,
73+
description="Regex to filter files",
74+
json_schema_extra={"advanced_parameter": True},
75+
)
6876

6977
@classmethod
7078
def slug(cls):
@@ -89,6 +97,8 @@ def get_data_documents(self, **kwargs) -> List[DataDocument]:
8997
for file in files:
9098
file_id = str(uuid.uuid4())
9199
mime_type, file_name, file_data = validate_parse_data_uri(file)
100+
if self.split_files and self.file_regex and not re.match(self.file_regex, file_name):
101+
continue
92102
file_objref = create_source_document_asset(
93103
file, datasource_uuid=kwargs["datasource_uuid"], document_id=file_id
94104
)
@@ -103,6 +113,7 @@ def get_data_documents(self, **kwargs) -> List[DataDocument]:
103113
"mime_type": mime_type,
104114
"source": file_name,
105115
"datasource_uuid": kwargs["datasource_uuid"],
116+
"file_regex": self.file_regex,
106117
},
107118
datasource_uuid=kwargs["datasource_uuid"],
108119
extra_info={"extra_data": self.get_extra_data()},
@@ -121,6 +132,8 @@ def process_document(cls, document: DataDocument) -> DataDocument:
121132
text_content = ""
122133
for extracted_file in extracted_files:
123134
mime_type, file_name, extracted_file_data = validate_parse_data_uri(extracted_file)
135+
if document.metadata.get("file_regex") and not re.match(document.metadata["file_regex"], file_name):
136+
continue
124137
text_content += f"File: {file_name}\n"
125138
decoded_file_data = base64.b64decode(extracted_file_data)
126139
elements += extract_text_elements(

0 commit comments

Comments
 (0)