Skip to content

Commit 1454ce2

Browse files
📝 Add docstrings to todo17
Docstring generation was requested by @domfahey.

* #1 (comment)

The following files were modified:

* `src/pdf_ocr_pipeline/__init__.py`
* `src/pdf_ocr_pipeline/cli.py`
* `src/pdf_ocr_pipeline/llm_client.py`
* `src/pdf_ocr_pipeline/logging_utils.py`
* `src/pdf_ocr_pipeline/segment_cli.py`
* `src/pdf_ocr_pipeline/segmentation.py`
1 parent bbb519c commit 1454ce2

6 files changed

Lines changed: 74 additions & 61 deletions

File tree

src/pdf_ocr_pipeline/__init__.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,19 @@ def process_pdf(
2323
path: Union[str, Path],
2424
settings: ProcessSettings = ProcessSettings(),
2525
) -> Union[SegmentationResult, OcrResult]:
26-
"""High‑level convenience wrapper combining OCR and optional analysis.
27-
28-
Parameters
29-
----------
30-
path:
31-
Path to the PDF file.
32-
analyze:
33-
When *True* the function sends the OCR result to the segmentation LLM
34-
and returns its JSON output. When *False* only OCR is performed.
35-
dpi, lang, prompt, model:
36-
Override defaults from :pymod:`pdf_ocr_pipeline.settings`.
26+
"""
27+
Processes a PDF file with OCR and optionally performs AI-based segmentation analysis.
28+
29+
Args:
30+
path: Path to the PDF file to process.
31+
settings: Optional processing settings, including OCR parameters and analysis options.
32+
33+
Returns:
34+
An OcrResult dictionary with the filename and extracted text if analysis is disabled,
35+
or a SegmentationResult dictionary with segmentation output if analysis is enabled.
36+
37+
Raises:
38+
FileNotFoundError: If the specified PDF file does not exist.
3739
"""
3840

3941
pdf_path = Path(path)

src/pdf_ocr_pipeline/cli.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,9 @@
4242

4343
def main() -> None:
4444
"""
45-
Parse arguments and perform OCR on one or more PDF files.
46-
Outputs a JSON array of {file, ocr_text} objects to stdout.
45+
Parses command-line arguments and performs OCR on one or more PDF files, outputting results as a JSON array to standard output.
46+
47+
Validates input files, manages logging verbosity, and processes PDFs in parallel. Each result includes the filename and either the extracted OCR text or an error message if processing fails. Exits with status code 1 on unrecoverable errors.
4748
"""
4849
# ------------------------------------------------------------------
4950
# CLI argument parsing

src/pdf_ocr_pipeline/llm_client.py

Lines changed: 15 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,10 @@ class MissingApiKeyError(RuntimeError):
5050

5151

5252
def _get_client() -> "OpenAI":
53-
"""Instantiate and cache an *OpenAI* client.
54-
55-
Environment variables inspected:
56-
* ``OPENAI_API_KEY`` **(required)**
57-
* ``OPENAI_BASE_URL`` / ``OPENAI_API_BASE`` (optional override)
58-
* ``OPENAI_API_VERSION`` (optional override)
53+
"""
54+
Returns a singleton OpenAI client instance configured from environment variables.
55+
56+
Reads the required API key and optional endpoint or version overrides from the environment. Raises MissingApiKeyError if the API key is missing or appears to be a placeholder, and RuntimeError if no supported OpenAI SDK is installed.
5957
"""
6058

6159
global _client
@@ -115,28 +113,17 @@ def send(
115113
client: Optional["OpenAI"] = None,
116114
**kwargs: Any,
117115
) -> Dict[str, Any]:
118-
"""Send *messages* to the chat completion endpoint and return JSON output.
119-
120-
Parameters
121-
----------
122-
messages:
123-
List of role/content dicts as expected by the OpenAI chat completion
124-
endpoint.
125-
model:
126-
Model name – defaults to ``gpt-4o``.
127-
client:
128-
Optional already‑initialised *OpenAI* client (mainly for tests). When
129-
*None* the module‑level singleton returned by :func:`_get_client` is
130-
used.
131-
**kwargs:
132-
Additional keyword arguments passed straight through to
133-
``chat.completions.create`` (e.g. ``max_tokens``).
134-
135-
Returns
136-
-------
137-
dict
138-
Parsed JSON object produced by the model *or* a dict with an ``error``
139-
key when something went wrong.
116+
"""
117+
Sends chat messages to a language model and returns the parsed JSON response.
118+
119+
Args:
120+
messages: List of dictionaries representing chat messages, each with a role and content.
121+
model: Name of the model to use. Defaults to "gpt-4o".
122+
client: Optional pre-initialized OpenAI client. If not provided, a singleton client is used.
123+
**kwargs: Additional keyword arguments passed to the chat completion API.
124+
125+
Returns:
126+
A dictionary containing the parsed JSON object from the model's response, or a dictionary with an "error" key if the request fails or the response is invalid.
140127
"""
141128

142129
cli = client or _get_client()

src/pdf_ocr_pipeline/logging_utils.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,10 @@
2828

2929

3030
def _initialise_root_logger() -> None:
31-
"""Attach a single *StreamHandler* to the root logger.
32-
33-
The handler is only added once per interpreter session. We deliberately do
34-
**not** rely on :pyfunc:`logging.basicConfig` because re‑invoking it from
35-
multiple modules is a common source of duplicate log lines.
31+
"""
32+
Attaches a single StreamHandler with a consistent formatter to the root logger.
33+
34+
Ensures the handler is only added once per interpreter session. logging.basicConfig is deliberately not used, because re-invoking it from multiple modules is a common source of duplicate log lines. Does not modify the root logger's level.
3635
"""
3736

3837
global _INITIALISED # noqa: WPS420 (module‑level state is fine here)
@@ -55,12 +54,17 @@ def _initialise_root_logger() -> None:
5554

5655

5756
def get_logger(name: str, *, level: int | None = None) -> logging.Logger: # noqa: D401
58-
"""Return a module-level logger with our global formatting applied.
59-
60-
The first call will attach the global handler to the root logger
61-
(without changing its level). Subsequent calls simply retrieve the
62-
named logger. A per-logger *level* may be provided but is rarely
63-
necessary—prefer using :func:`set_root_level` to adjust verbosity.
57+
"""
58+
Returns a logger with the specified name, ensuring global formatting is applied.
59+
60+
On the first call, attaches a single global handler with consistent formatting to the root logger without modifying its level. If a level is provided, sets it on the returned logger instance. Prefer adjusting the root logger's level using `set_root_level` for consistent verbosity control across modules.
61+
62+
Args:
63+
name: The name of the logger to retrieve.
64+
level: Optional log level to set on the returned logger.
65+
66+
Returns:
67+
A logger instance with the specified name and global formatting.
6468
"""
6569

6670
# Ensure the global handler is attached. Do not adjust root level here.
@@ -75,7 +79,12 @@ def get_logger(name: str, *, level: int | None = None) -> logging.Logger: # noq
7579

7680

7781
def set_root_level(level: int) -> None:
78-
"""Ensure the global handler is attached and set the root logger level."""
82+
"""
83+
Attaches the global logging handler if needed and sets the root logger's level.
84+
85+
Args:
86+
level: The logging level to set for the root logger (e.g., logging.INFO).
87+
"""
7988
# Attach handler if not already configured
8089
_initialise_root_logger()
8190
# Set root logger level

src/pdf_ocr_pipeline/segment_cli.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,11 @@
3131

3232

3333
def _read_input() -> List[Dict[str, Any]]:
34-
"""Read JSON array or raw text from *stdin*.
35-
36-
Returns a list of ``{"file": ..., "ocr_text": ...}`` dictionaries.
34+
"""
35+
Reads OCR input from stdin as either a JSON array or raw text.
36+
37+
Returns:
38+
A list of dictionaries, each containing "file" and "ocr_text" keys. If the input is not valid JSON, the entire input is treated as a single OCR text blob with an "unknown" file identifier. If the input is a JSON array, it is returned as-is. Non-list JSON input is wrapped as a single document.
3739
"""
3840

3941
raw = sys.stdin.read().strip()
@@ -60,7 +62,11 @@ def _read_input() -> List[Dict[str, Any]]:
6062

6163

6264
def main() -> None: # noqa: WPS231 – CLI assembly is inevitably imperative
63-
"""Segment OCR text(s) read from *stdin* and emit JSON to *stdout*."""
65+
"""
66+
Runs the CLI tool to segment OCR text from stdin and outputs the results as JSON.
67+
68+
Parses command-line arguments for custom prompt templates, JSON formatting, and logging verbosity. Reads OCR text input (raw or JSON) from stdin, segments each document using a segmentation function, and prints the segmentation results as a JSON array to stdout.
69+
"""
6470

6571
default_verbose = settings.verbose
6672
default_prompt = None # let segment_pdf load bundled template unless user overrides

src/pdf_ocr_pipeline/segmentation.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,18 @@ def segment_pdf(
2727
client: Optional[object] = None,
2828
model: str = "gpt-4o",
2929
) -> Dict[str, Any]:
30-
"""Return segmentation JSON for *text* using *prompt*.
31-
If no prompt is provided, the default segmentation template is used.
32-
33-
The implementation is intentionally minimal – real logic lives in the LLM.
30+
"""
31+
Segments OCR-extracted PDF text into structured JSON using an LLM.
32+
33+
If no prompt is provided, a default segmentation template is loaded and cached from the package resources. Returns the LLM's JSON output representing the segmented documents.
34+
35+
Args:
36+
text: The OCR text to segment.
37+
prompt: Optional custom prompt to instruct the LLM; if not provided, a default template is used.
38+
model: The LLM model identifier. Defaults to "gpt-4o".
39+
40+
Returns:
41+
A dictionary containing the JSON segmentation output from the LLM.
3442
"""
3543

3644
# ------------------------------------------------------------------

0 commit comments

Comments
 (0)