📝 Add docstrings to todo17

coderabbitai[bot] · web-flow · commit 1454ce28a184 · 2025-04-26T22:40:36.000Z
Docstring generation was requested by @domfahey in #1 (comment). The following files were modified: `src/pdf_ocr_pipeline/__init__.py`, `src/pdf_ocr_pipeline/cli.py`, `src/pdf_ocr_pipeline/llm_client.py`, `src/pdf_ocr_pipeline/logging_utils.py`, `src/pdf_ocr_pipeline/segment_cli.py`, `src/pdf_ocr_pipeline/segmentation.py`.
diff --git a/src/pdf_ocr_pipeline/__init__.py b/src/pdf_ocr_pipeline/__init__.py
@@ -23,17 +23,19 @@ def process_pdf(
     path: Union[str, Path],
     settings: ProcessSettings = ProcessSettings(),
 ) -> Union[SegmentationResult, OcrResult]:
-    """High‑level convenience wrapper combining OCR and optional analysis.
-
-    Parameters
-    ----------
-    path:
-        Path to the PDF file.
-    analyze:
-        When *True* the function sends the OCR result to the segmentation LLM
-        and returns its JSON output.  When *False* only OCR is performed.
-    dpi, lang, prompt, model:
-        Override defaults from :pymod:`pdf_ocr_pipeline.settings`.
+    """
+    Processes a PDF file with OCR and optionally performs AI-based segmentation analysis.
+    
+    Args:
+        path: Path to the PDF file to process.
+        settings: Optional processing settings, including OCR parameters and analysis options.
+    
+    Returns:
+        An OcrResult dictionary with the filename and extracted text if analysis is disabled,
+        or a SegmentationResult dictionary with segmentation output if analysis is enabled.
+    
+    Raises:
+        FileNotFoundError: If the specified PDF file does not exist.
     """
 
     pdf_path = Path(path)
diff --git a/src/pdf_ocr_pipeline/cli.py b/src/pdf_ocr_pipeline/cli.py
@@ -42,8 +42,9 @@
 
 def main() -> None:
     """
-    Parse arguments and perform OCR on one or more PDF files.
-    Outputs a JSON array of {file, ocr_text} objects to stdout.
+    Parses command-line arguments and performs OCR on one or more PDF files, outputting results as a JSON array to standard output.
+    
+    Validates input files, manages logging verbosity, and processes PDFs in parallel. Each result includes the filename and either the extracted OCR text or an error message if processing fails. Exits with status code 1 on unrecoverable errors.
     """
     # ------------------------------------------------------------------
     # CLI argument parsing
diff --git a/src/pdf_ocr_pipeline/llm_client.py b/src/pdf_ocr_pipeline/llm_client.py
@@ -50,12 +50,10 @@ class MissingApiKeyError(RuntimeError):
 
 
 def _get_client() -> "OpenAI":
-    """Instantiate and cache an *OpenAI* client.
-
-    Environment variables inspected:
-    * ``OPENAI_API_KEY`` **(required)**
-    * ``OPENAI_BASE_URL`` / ``OPENAI_API_BASE`` (optional override)
-    * ``OPENAI_API_VERSION``                (optional override)
+    """
+    Returns a singleton OpenAI client instance configured from environment variables.
+    
+    Reads the required API key (``OPENAI_API_KEY``) and the optional endpoint (``OPENAI_BASE_URL`` / ``OPENAI_API_BASE``) and version (``OPENAI_API_VERSION``) overrides from the environment. Raises MissingApiKeyError if the API key is missing or appears to be a placeholder, and RuntimeError if no supported OpenAI SDK is installed.
     """
 
     global _client
@@ -115,28 +113,17 @@ def send(
     client: Optional["OpenAI"] = None,
     **kwargs: Any,
 ) -> Dict[str, Any]:
-    """Send *messages* to the chat completion endpoint and return JSON output.
-
-    Parameters
-    ----------
-    messages:
-        List of role/content dicts as expected by the OpenAI chat completion
-        endpoint.
-    model:
-        Model name – defaults to ``gpt-4o``.
-    client:
-        Optional already‑initialised *OpenAI* client (mainly for tests).  When
-        *None* the module‑level singleton returned by :func:`_get_client` is
-        used.
-    **kwargs:
-        Additional keyword arguments passed straight through to
-        ``chat.completions.create`` (e.g. ``max_tokens``).
-
-    Returns
-    -------
-    dict
-        Parsed JSON object produced by the model *or* a dict with an ``error``
-        key when something went wrong.
+    """
+    Sends chat messages to a language model and returns the parsed JSON response.
+    
+    Args:
+        messages: List of dictionaries representing chat messages, each with a role and content.
+        model: Name of the model to use. Defaults to "gpt-4o".
+        client: Optional pre-initialized OpenAI client. If not provided, a singleton client is used.
+        **kwargs: Additional keyword arguments passed to the chat completion API.
+    
+    Returns:
+        A dictionary containing the parsed JSON object from the model's response, or a dictionary with an "error" key if the request fails or the response is invalid.
     """
 
     cli = client or _get_client()
diff --git a/src/pdf_ocr_pipeline/logging_utils.py b/src/pdf_ocr_pipeline/logging_utils.py
@@ -28,11 +28,10 @@
 
 
 def _initialise_root_logger() -> None:
-    """Attach a single *StreamHandler* to the root logger.
-
-    The handler is only added once per interpreter session.  We deliberately do
-    **not** rely on :pyfunc:`logging.basicConfig` because re‑invoking it from
-    multiple modules is a common source of duplicate log lines.
+    """
+    Attaches a single StreamHandler with a consistent formatter to the root logger.
+    
+    The handler is only added once per interpreter session. logging.basicConfig is deliberately not used, because re-invoking it from multiple modules is a common source of duplicate log lines. Does not modify the root logger's level.
     """
 
     global _INITIALISED  # noqa: WPS420 (module‑level state is fine here)
@@ -55,12 +54,17 @@ def _initialise_root_logger() -> None:
 
 
 def get_logger(name: str, *, level: int | None = None) -> logging.Logger:  # noqa: D401
-    """Return a module-level logger with our global formatting applied.
-
-    The first call will attach the global handler to the root logger
-    (without changing its level). Subsequent calls simply retrieve the
-    named logger. A per-logger *level* may be provided but is rarely
-    necessary—prefer using :func:`set_root_level` to adjust verbosity.
+    """
+    Returns a logger with the specified name, ensuring global formatting is applied.
+    
+    On the first call, attaches a single global handler with consistent formatting to the root logger without modifying its level. If a level is provided, sets it on the returned logger instance. Prefer adjusting the root logger's level using `set_root_level` for consistent verbosity control across modules.
+    
+    Args:
+        name: The name of the logger to retrieve.
+        level: Optional log level to set on the returned logger.
+    
+    Returns:
+        A logger instance with the specified name and global formatting.
     """
 
     # Ensure the global handler is attached.  Do not adjust root level here.
@@ -75,7 +79,12 @@ def get_logger(name: str, *, level: int | None = None) -> logging.Logger:  # noq
 
 
 def set_root_level(level: int) -> None:
-    """Ensure the global handler is attached and set the root logger level."""
+    """
+    Attaches the global logging handler if needed and sets the root logger's level.
+    
+    Args:
+        level: The logging level to set for the root logger (e.g., logging.INFO).
+    """
     # Attach handler if not already configured
     _initialise_root_logger()
     # Set root logger level
diff --git a/src/pdf_ocr_pipeline/segment_cli.py b/src/pdf_ocr_pipeline/segment_cli.py
@@ -31,9 +31,11 @@
 
 
 def _read_input() -> List[Dict[str, Any]]:
-    """Read JSON array or raw text from *stdin*.
-
-    Returns a list of ``{"file": ..., "ocr_text": ...}`` dictionaries.
+    """
+    Reads OCR input from stdin as either a JSON array or raw text.
+    
+    Returns:
+        A list of dictionaries, each containing "file" and "ocr_text" keys. If the input is not valid JSON, the entire input is treated as a single OCR text blob with an "unknown" file identifier. If the input is a JSON array, it is returned as-is. Non-list JSON input is wrapped as a single document.
     """
 
     raw = sys.stdin.read().strip()
@@ -60,7 +62,11 @@ def _read_input() -> List[Dict[str, Any]]:
 
 
 def main() -> None:  # noqa: WPS231 – CLI assembly is inevitably imperative
-    """Segment OCR text(s) read from *stdin* and emit JSON to *stdout*."""
+    """
+    Runs the CLI tool to segment OCR text from stdin and outputs the results as JSON.
+    
+    Parses command-line arguments for custom prompt templates, JSON formatting, and logging verbosity. Reads OCR text input (raw or JSON) from stdin, segments each document using a segmentation function, and prints the segmentation results as a JSON array to stdout.
+    """
 
     default_verbose = settings.verbose
     default_prompt = None  # let segment_pdf load bundled template unless user overrides
diff --git a/src/pdf_ocr_pipeline/segmentation.py b/src/pdf_ocr_pipeline/segmentation.py
@@ -27,10 +27,18 @@ def segment_pdf(
     client: Optional[object] = None,
     model: str = "gpt-4o",
 ) -> Dict[str, Any]:
-    """Return segmentation JSON for *text* using *prompt*.
-    If no prompt is provided, the default segmentation template is used.
-
-    The implementation is intentionally minimal – real logic lives in the LLM.
+    """
+    Segments OCR-extracted PDF text into structured JSON using an LLM.
+    
+    If no prompt is provided, a default segmentation template is loaded and cached from the package resources. Returns the LLM's JSON output representing the segmented documents.
+    
+    Args:
+        text: The OCR text to segment.
+        prompt: Optional custom prompt to instruct the LLM; if not provided, a default template is used.
+        model: The LLM model identifier.
+    
+    Returns:
+        A dictionary containing the JSON segmentation output from the LLM.
     """
 
     # ------------------------------------------------------------------