Skip to content

Commit 050c59e

Browse files
authored
Merge pull request #240 from jwchmodx/fix/bugs-processor-parser-config
fix: prevent crashes from uninitialized LightRAG, env-var stripping, and parser cleanup
2 parents 5959335 + ab9f739 commit 050c59e

6 files changed

Lines changed: 223 additions & 66 deletions

File tree

raganything/batch.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,11 @@ async def process_folder_complete(
7070
if max_workers is None:
7171
max_workers = self.config.max_concurrent_files
7272

73-
await self._ensure_lightrag_initialized()
73+
init_result = await self._ensure_lightrag_initialized()
74+
if not init_result or not init_result.get("success"):
75+
raise RuntimeError(
76+
f"LightRAG initialization failed: {(init_result or {}).get('error', 'unknown error')}"
77+
)
7478

7579
# Get all files in the folder
7680
folder_path_obj = Path(folder_path)
@@ -105,10 +109,9 @@ async def process_folder_complete(
105109
async def process_single_file(file_path: Path):
106110
async with semaphore:
107111
is_in_subdir = (
108-
lambda file_path, dir_path: len(
109-
file_path.relative_to(dir_path).parents
112+
lambda file_path, dir_path: (
113+
len(file_path.relative_to(dir_path).parents) > 1
110114
)
111-
> 1
112115
)(file_path, folder_path_obj)
113116

114117
try:
@@ -363,7 +366,11 @@ async def process_documents_with_rag_batch(
363366

364367
# Step 2: Process with RAG
365368
# Initialize RAG system
366-
await self._ensure_lightrag_initialized()
369+
init_result = await self._ensure_lightrag_initialized()
370+
if not init_result or not init_result.get("success"):
371+
raise RuntimeError(
372+
f"LightRAG initialization failed: {(init_result or {}).get('error', 'unknown error')}"
373+
)
367374

368375
# Then, process each successful file with RAG
369376
rag_results = {}

raganything/config.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,14 @@ class RAGAnythingConfig:
5959
"""Maximum number of files to process concurrently."""
6060

6161
supported_file_extensions: List[str] = field(
62-
default_factory=lambda: get_env_value(
63-
"SUPPORTED_FILE_EXTENSIONS",
64-
".pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md",
65-
str,
66-
).split(",")
62+
default_factory=lambda: [
63+
x.strip()
64+
for x in get_env_value(
65+
"SUPPORTED_FILE_EXTENSIONS",
66+
".pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md",
67+
str,
68+
).split(",")
69+
]
6770
)
6871
"""List of supported file extensions for batch processing."""
6972

@@ -94,9 +97,12 @@ class RAGAnythingConfig:
9497
"""Whether to include image/table captions in context."""
9598

9699
context_filter_content_types: List[str] = field(
97-
default_factory=lambda: get_env_value(
98-
"CONTEXT_FILTER_CONTENT_TYPES", "text", str
99-
).split(",")
100+
default_factory=lambda: [
101+
x.strip()
102+
for x in get_env_value("CONTEXT_FILTER_CONTENT_TYPES", "text", str).split(
103+
","
104+
)
105+
]
100106
)
101107
"""Content types to include in context extraction (e.g., 'text', 'image', 'table')."""
102108

raganything/parser.py

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626

2727
import os
28+
import platform
2829
import hashlib
2930
import json
3031
import argparse
@@ -45,10 +46,9 @@
4546
Tuple,
4647
Any,
4748
Iterator,
48-
TypeVar,
4949
)
5050

51-
T = TypeVar("T")
51+
_IS_WINDOWS: bool = platform.system() == "Windows"
5252

5353

5454
class MineruExecutionError(Exception):
@@ -228,8 +228,6 @@ def convert_office_to_pdf(
228228
)
229229

230230
# Prepare subprocess parameters to hide console window on Windows
231-
import platform
232-
233231
# Try LibreOffice commands in order of preference
234232
commands_to_try = ["libreoffice", "soffice"]
235233

@@ -258,7 +256,7 @@ def convert_office_to_pdf(
258256
}
259257

260258
# Hide console window on Windows
261-
if platform.system() == "Windows":
259+
if _IS_WINDOWS:
262260
convert_subprocess_kwargs["creationflags"] = (
263261
subprocess.CREATE_NO_WINDOW
264262
)
@@ -801,7 +799,6 @@ def _run_mineru_command(
801799

802800
try:
803801
# Prepare subprocess parameters to hide console window on Windows
804-
import platform
805802
import threading
806803
from queue import Queue, Empty
807804

@@ -824,7 +821,7 @@ def _run_mineru_command(
824821
}
825822

826823
# Hide console window on Windows
827-
if platform.system() == "Windows":
824+
if _IS_WINDOWS:
828825
subprocess_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
829826

830827
# Function to read output from subprocess and add to queue
@@ -1427,8 +1424,6 @@ def check_installation(self) -> bool:
14271424
"""
14281425
try:
14291426
# Prepare subprocess parameters to hide console window on Windows
1430-
import platform
1431-
14321427
subprocess_kwargs = {
14331428
"capture_output": True,
14341429
"text": True,
@@ -1438,7 +1433,7 @@ def check_installation(self) -> bool:
14381433
}
14391434

14401435
# Hide console window on Windows
1441-
if platform.system() == "Windows":
1436+
if _IS_WINDOWS:
14421437
subprocess_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
14431438

14441439
result = subprocess.run(["mineru", "--version"], **subprocess_kwargs)
@@ -1630,8 +1625,6 @@ def _run_docling_command(
16301625

16311626
try:
16321627
# Prepare subprocess parameters to hide console window on Windows
1633-
import platform
1634-
16351628
env = None
16361629
if custom_env:
16371630
env = os.environ.copy()
@@ -1647,7 +1640,7 @@ def _run_docling_command(
16471640
}
16481641

16491642
# Hide console window on Windows
1650-
if platform.system() == "Windows":
1643+
if _IS_WINDOWS:
16511644
docling_subprocess_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
16521645

16531646
result = subprocess.run(cmd, **docling_subprocess_kwargs)
@@ -1737,9 +1730,20 @@ def read_from_block_recursive(
17371730
for member in members:
17381731
cnt += 1
17391732
member_tag = member["$ref"]
1740-
member_type = member_tag.split("/")[1]
1741-
member_num = member_tag.split("/")[2]
1742-
member_block = docling_content[member_type][int(member_num)]
1733+
# JSON References follow the form "#/<type>/<index>" (e.g. "#/body/0")
1734+
ref_parts = member_tag.split("/")
1735+
if len(ref_parts) < 3:
1736+
self.logger.warning(
1737+
f"Unexpected $ref format (expected #/<type>/<index>): {member_tag!r}"
1738+
)
1739+
continue
1740+
member_type = ref_parts[1]
1741+
member_num = ref_parts[2]
1742+
try:
1743+
member_block = docling_content[member_type][int(member_num)]
1744+
except (KeyError, ValueError, IndexError) as e:
1745+
self.logger.warning(f"Could not resolve $ref {member_tag!r}: {e}")
1746+
continue
17431747
content_list.extend(
17441748
self.read_from_block_recursive(
17451749
member_block,
@@ -1773,7 +1777,10 @@ def read_from_block(
17731777
elif type == "pictures":
17741778
try:
17751779
base64_uri = block["image"]["uri"]
1776-
base64_str = base64_uri.split(",")[1]
1780+
# base64 data URIs have the form "data:<mime>;base64,<data>"
1781+
# but some exporters may omit the prefix
1782+
parts = base64_uri.split(",", 1)
1783+
base64_str = parts[1] if len(parts) == 2 else parts[0]
17771784
# Create images directory within the docling subdirectory
17781785
image_dir = output_dir / "images"
17791786
image_dir.mkdir(parents=True, exist_ok=True) # Ensure directory exists
@@ -1939,8 +1946,6 @@ def check_installation(self) -> bool:
19391946
"""
19401947
try:
19411948
# Prepare subprocess parameters to hide console window on Windows
1942-
import platform
1943-
19441949
subprocess_kwargs = {
19451950
"capture_output": True,
19461951
"text": True,
@@ -1950,7 +1955,7 @@ def check_installation(self) -> bool:
19501955
}
19511956

19521957
# Hide console window on Windows
1953-
if platform.system() == "Windows":
1958+
if _IS_WINDOWS:
19541959
subprocess_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
19551960

19561961
result = subprocess.run(["docling", "--version"], **subprocess_kwargs)

raganything/processor.py

Lines changed: 86 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,14 @@ async def _process_multimodal_content(
531531
doc_id=doc_id,
532532
)
533533

534+
# Ensure LightRAG is initialized before accessing its storages
535+
init_result = await self._ensure_lightrag_initialized()
536+
if not init_result or not init_result.get("success"):
537+
self.logger.error(
538+
"LightRAG initialization failed; skipping multimodal processing"
539+
)
540+
return
541+
534542
# Check multimodal processing status - handle LightRAG's early DocStatus.PROCESSED marking
535543
try:
536544
existing_doc_status = await self.lightrag.doc_status.get_by_id(doc_id)
@@ -573,9 +581,6 @@ async def _process_multimodal_content(
573581
pipeline_status["history_messages"].append(log_message)
574582

575583
try:
576-
# Ensure LightRAG is initialized
577-
await self._ensure_lightrag_initialized()
578-
579584
await self._process_multimodal_content_batch_type_aware(
580585
multimodal_items=multimodal_items, file_path=file_path, doc_id=doc_id
581586
)
@@ -1540,7 +1545,11 @@ async def process_document_complete(
15401545

15411546
try:
15421547
# Ensure LightRAG is initialized
1543-
await self._ensure_lightrag_initialized()
1548+
init_result = await self._ensure_lightrag_initialized()
1549+
if not init_result or not init_result.get("success"):
1550+
raise RuntimeError(
1551+
f"LightRAG initialization failed: {(init_result or {}).get('error', 'unknown error')}"
1552+
)
15441553

15451554
# Use config defaults if not provided
15461555
if output_dir is None:
@@ -1675,25 +1684,61 @@ async def process_document_complete_lightrag_api(
16751684
doc_pre_id = f"doc-pre-{file_name}"
16761685
pipeline_status = None
16771686
pipeline_status_lock = None
1687+
current_doc_status = {} # initialised here so the except block can always unpack it
1688+
1689+
async def mark_initialization_failed(error_msg: str) -> None:
1690+
"""Persist init failures when LightRAG doc_status is already available."""
1691+
lightrag = getattr(self, "lightrag", None)
1692+
doc_status = getattr(lightrag, "doc_status", None)
1693+
if doc_status is None:
1694+
self.logger.error(
1695+
"LightRAG initialization failed before doc_status was available; "
1696+
f"unable to persist failed status for {file_path}"
1697+
)
1698+
return
1699+
1700+
try:
1701+
existing_status = await doc_status.get_by_id(doc_pre_id)
1702+
failed_status = {
1703+
"status": DocStatus.FAILED,
1704+
"content": "",
1705+
"error_msg": error_msg,
1706+
"content_summary": "",
1707+
"multimodal_content": [],
1708+
"scheme_name": scheme_name,
1709+
"content_length": 0,
1710+
"created_at": "",
1711+
"updated_at": time.strftime("%Y-%m-%dT%H:%M:%S+00:00"),
1712+
"file_path": file_name,
1713+
}
1714+
if existing_status:
1715+
failed_status = {
1716+
**existing_status,
1717+
"status": DocStatus.FAILED,
1718+
"error_msg": error_msg,
1719+
"updated_at": time.strftime("%Y-%m-%dT%H:%M:%S+00:00"),
1720+
}
1721+
await doc_status.upsert({doc_pre_id: failed_status})
1722+
await doc_status.index_done_callback()
1723+
except Exception as status_error:
1724+
self.logger.error(
1725+
f"Failed to persist initialization failure status for {file_path}: "
1726+
f"{status_error}"
1727+
)
16781728

16791729
if parser:
16801730
self.config.parser = parser
16811731

1682-
current_doc_status = await self.lightrag.doc_status.get_by_id(doc_pre_id)
1683-
16841732
try:
1685-
# Ensure LightRAG is initialized
1733+
# Ensure LightRAG is initialized before accessing its storages
16861734
result = await self._ensure_lightrag_initialized()
1687-
if not result["success"]:
1688-
await self.lightrag.doc_status.upsert(
1689-
{
1690-
doc_pre_id: {
1691-
**current_doc_status,
1692-
"status": DocStatus.FAILED,
1693-
"error_msg": result["error"],
1694-
}
1695-
}
1735+
if not result or not result.get("success"):
1736+
error_msg = (result or {}).get("error", "unknown error")
1737+
self.logger.error(
1738+
f"LightRAG initialization failed: {error_msg}; "
1739+
f"skipping document processing for {file_path}"
16961740
)
1741+
await mark_initialization_failed(str(error_msg))
16971742
return False
16981743

16991744
# Use config defaults if not provided
@@ -1761,9 +1806,10 @@ async def process_document_complete_lightrag_api(
17611806
file_path, output_dir, parse_method, display_stats, **kwargs
17621807
)
17631808
except MineruExecutionError as e:
1764-
error_message = e.error_msg
17651809
if isinstance(e.error_msg, list):
1766-
error_message = "\n".join(e.error_msg)
1810+
error_message = "\n".join(str(m) for m in e.error_msg)
1811+
else:
1812+
error_message = str(e.error_msg)
17671813
await self.lightrag.doc_status.upsert(
17681814
{
17691815
doc_pre_id: {
@@ -1859,15 +1905,23 @@ async def process_document_complete_lightrag_api(
18591905
return False
18601906

18611907
finally:
1862-
async with pipeline_status_lock:
1863-
pipeline_status.update({"scan_disabled": False})
1864-
pipeline_status["latest_message"] = (
1865-
f"RAGAnything processing completed for {file_name}"
1866-
)
1867-
pipeline_status["history_messages"].append(
1868-
f"RAGAnything processing completed for {file_name}"
1869-
)
1870-
pipeline_status["history_messages"].append("Now is allowed to scan")
1908+
if pipeline_status_lock is not None and pipeline_status is not None:
1909+
try:
1910+
async with pipeline_status_lock:
1911+
pipeline_status.update({"scan_disabled": False})
1912+
pipeline_status["latest_message"] = (
1913+
f"RAGAnything processing completed for {file_name}"
1914+
)
1915+
pipeline_status["history_messages"].append(
1916+
f"RAGAnything processing completed for {file_name}"
1917+
)
1918+
pipeline_status["history_messages"].append(
1919+
"Now is allowed to scan"
1920+
)
1921+
except Exception as _finally_err:
1922+
self.logger.error(
1923+
f"Failed to update pipeline status in finally block: {_finally_err}"
1924+
)
18711925

18721926
async def insert_content_list(
18731927
self,
@@ -1907,7 +1961,11 @@ async def insert_content_list(
19071961
doc_start_time = time.time()
19081962

19091963
# Ensure LightRAG is initialized
1910-
await self._ensure_lightrag_initialized()
1964+
init_result = await self._ensure_lightrag_initialized()
1965+
if not init_result or not init_result.get("success"):
1966+
raise RuntimeError(
1967+
f"LightRAG initialization failed: {(init_result or {}).get('error', 'unknown error')}"
1968+
)
19111969

19121970
# Use config defaults if not provided
19131971
if display_stats is None:

0 commit comments

Comments
 (0)