domfahey
diff --git a/‎.flake8‎
Lines changed: 0 additions & 3 deletions b/‎.flake8‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎CLAUDE.md‎
Lines changed: 21 additions & 0 deletions b/‎CLAUDE.md‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎examples/segment_pdf.sh‎
Lines changed: 37 additions & 0 deletions b/‎examples/segment_pdf.sh‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎ocroutput.txt‎
Lines changed: 6 additions & 0 deletions b/‎ocroutput.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎package-lock.json‎
Lines changed: 6 additions & 0 deletions b/‎package-lock.json‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎segment.json‎
Lines changed: 68 additions & 0 deletions b/‎segment.json‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎setup.cfg‎
Lines changed: 0 additions & 3 deletions b/‎setup.cfg‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎src/pdf_ocr_pipeline/__init__.py‎
Lines changed: 22 additions & 5 deletions b/‎src/pdf_ocr_pipeline/__init__.py‎
Lines changed: 22 additions & 5 deletions
diff --git a/‎src/pdf_ocr_pipeline/cli.py‎
Lines changed: 38 additions & 11 deletions b/‎src/pdf_ocr_pipeline/cli.py‎
Lines changed: 38 additions & 11 deletions
@@ -0,0 +1,21 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Build/Test Commands
+- Run all tests: `make test` or `pytest`
+- Run single test: `pytest tests/test_file.py::TestClass::test_name`
+- Test with coverage: `pytest --cov=pdf_ocr_pipeline tests/`
+- Format code: `make format`
+- Lint code: `make lint`
+- Combined check: `make check`
+
+## Code Style Guidelines
+- Python 3.8+ with strict type annotations (mypy)
+- Line length: 88 characters (Black)
+- Formatting: ruff format followed by Black
+- Imports: stdlib first, then third-party, grouped alphabetically
+- Types: Full annotations required; use `TypedDict`, `dataclasses`, and `from __future__ import annotations`
+- Naming: snake_case (variables/functions), PascalCase (classes), ALL_CAPS (constants)
+- Error handling: Use custom exceptions from `errors.py`, never use `sys.exit()` in library code
+- Docstrings: Google style format with clear parameter and return type documentation
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Simple wrapper to segment a PDF using the default real-estate template.
+# Usage: segment_pdf.sh <pdf_file> [output_file]
+
+# Make a pipeline fail if ANY stage fails, not only the last one.
+set -o pipefail
+
+if [ $# -lt 1 ]; then
+    echo "Usage: $(basename "$0") <pdf_file> [output_file]"
+    exit 1
+fi
+
+PDF_FILE="$1"
+OUTPUT_FILE="${2:-segment.json}"
+
+# Ensure input PDF exists
+if [ ! -f "$PDF_FILE" ]; then
+    echo "Error: File '$PDF_FILE' not found." >&2
+    exit 1
+fi
+
+echo "Segmenting PDF: $PDF_FILE"
+echo "Output will be saved to: $OUTPUT_FILE"
+
+# Put the local 'src' first on PYTHONPATH so the workspace code is what runs.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+export PYTHONPATH="$PROJECT_ROOT/src${PYTHONPATH:+:$PYTHONPATH}"
+
+# Run OCR, then segmentation (default prompt template); test the pipeline itself.
+if python3 -m pdf_ocr_pipeline "$PDF_FILE" \
+  | python3 -m pdf_ocr_pipeline.summarize --pretty > "$OUTPUT_FILE"; then
+    echo "Segmentation complete. Results saved to: $OUTPUT_FILE"
+else
+    echo "Error: Segmentation failed." >&2
+    exit 1
+fi
@@ -29,6 +29,7 @@ dependencies = ["openai>=1.75.0", "pydantic>=2.0"]
 [project.scripts]
 pdf-ocr = "pdf_ocr_pipeline.cli:main"
 pdf-ocr-summarize = "pdf_ocr_pipeline.summarize:main"
+pdf-ocr-segment = "pdf_ocr_pipeline.segment_cli:main"
 
 [project.optional-dependencies]
 dev = [
 
@@ -0,0 +1,68 @@
+[
+  {
+    "file": "Sample_Title_Search3.pdf",
+    "analysis": {
+      "documents": [
+        {
+          "title": "Notice of Settlement",
+          "pages": [
+            1,
+            3
+          ],
+          "summary": "Notice of Settlement for property at 408 Trinidad Boulevard, involving Michael and Donna Reilly as buyers.",
+          "recording_reference": "File No. 989-2260645"
+        },
+        {
+          "title": "American Land Title Association Owner's Policy",
+          "pages": [
+            4,
+            6
+          ],
+          "summary": "Title policy for George and Linda DiCugno, covering 413 Trinidad Boulevard, issued by First American Title Insurance.",
+          "recording_reference": "Policy No: 5011434-0131314e"
+        },
+        {
+          "title": "Deed",
+          "pages": [
+            7,
+            9
+          ],
+          "summary": "Conveys property at 408 Trinidad Boulevard from Holiday City at Monroe, Inc. to Joseph and Nancy Quieti."
+        },
+        {
+          "title": "Affidavit of Consideration",
+          "pages": [
+            10,
+            10
+          ],
+          "summary": "Affidavit confirming monetary consideration in transferring property at 405 Paradise Road."
+        },
+        {
+          "title": "Deed of Easement",
+          "pages": [
+            11,
+            13
+          ],
+          "summary": "Grants Atlantic City Electric Company easement for utility lines on properties within Monroe Township."
+        },
+        {
+          "title": "Stream Encroachment Permit",
+          "pages": [
+            14,
+            15
+          ],
+          "summary": "Permit for constructing stormwater structures within defined blocks in Monroe Township."
+        },
+        {
+          "title": "Bargain and Sale Deed",
+          "pages": [
+            16,
+            17
+          ],
+          "summary": "Conveys property at 413 Trinidad Boulevard from Hovsons Inc. to Harold and Elizabeth Throneberry."
+        }
+      ],
+      "total_pages": 17
+    }
+  }
+]
@@ -1,3 +0,0 @@
-[flake8]
-max-line-length = 88
-extend-ignore = E203
@@ -40,16 +40,33 @@ def process_pdf(
     if not pdf_path.is_file():
         raise FileNotFoundError(f"File not found: {pdf_path}")
 
-    dpi_val = settings.dpi or _settings.dpi
-    lang_val = settings.lang or _settings.lang
+    opts = settings  # local alias for brevity (does not shadow module)
+
+    dpi_val = opts.dpi or _settings.dpi
+    lang_val = opts.lang or _settings.lang
 
     ocr_text = ocr_pdf(pdf_path, dpi=dpi_val, lang=lang_val)
 
-    if not settings.analyze:
+    if not opts.analyze:
         return cast(OcrResult, {"file": pdf_path.name, "ocr_text": ocr_text})
 
-    prompt_val: str = settings.prompt or _settings.prompt
-    seg_json = segment_pdf(ocr_text, prompt_val, model=settings.model or "gpt-4o")
+    # Only forward a prompt if the caller explicitly supplied one **and** it
+    # is non-empty.  Otherwise let *segment_pdf* fall back to its built-in
+    # template to avoid accidentally re-using a generic summarization prompt
+    # that may be present in ``_settings.prompt`` (e.g. from a local INI file).
+
+    prompt_val = opts.prompt or ""
+
+    if prompt_val:
+        prompt_arg = prompt_val
+    else:
+        prompt_arg = None  # let segment_pdf pick default template
+
+    seg_json = segment_pdf(
+        ocr_text,
+        prompt_arg,
+        model=opts.model or "gpt-4o",
+    )
 
     return cast(SegmentationResult, seg_json)
 
 
@@ -11,7 +11,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 # Local imports
-from .logging_utils import get_logger
+from .logging_utils import get_logger, set_root_level
 from .ocr import ocr_pdf
 from .errors import PipelineError
 from .types import OcrResult
@@ -27,6 +27,12 @@
 """
 logger = get_logger(__name__)
 
+LOG_LEVELS: dict[str, int] = {
+    "DEBUG": logging.DEBUG,
+    "INFO": logging.INFO,
+    "WARNING": logging.WARNING,
+    "ERROR": logging.ERROR,
+}
 
 """
 Command-line interface entrypoint.
@@ -76,20 +82,32 @@ def main() -> None:
         default=False,
         help="suppress informational output; only warnings and errors are shown",
     )
+    parser.add_argument(
+        "--log-level",
+        choices=list(LOG_LEVELS.keys()),
+        help="set root log level",
+    )
     # ------------------------------------------------------------------
     # Apply logging level according to CLI flags
     # ------------------------------------------------------------------
     args = parser.parse_args()
 
-    root_logger = logging.getLogger()
+    # Determine flags for logging, guard against mocks in tests
+    log_level = args.log_level if args.log_level in LOG_LEVELS else None
+    verbose = args.verbose if isinstance(args.verbose, bool) else False
+    quiet = args.quiet if isinstance(args.quiet, bool) else False
 
-    if args.verbose and getattr(args, "quiet", False):
+    # Apply logging level from CLI flags (--log-level excludes --verbose/--quiet)
+    if verbose and quiet:
         parser.error("--verbose and --quiet are mutually exclusive")
-
-    if getattr(args, "quiet", False):
-        root_logger.setLevel(logging.WARNING)
-    elif args.verbose:
-        root_logger.setLevel(logging.DEBUG)
+    if log_level and (verbose or quiet):
+        parser.error("--log-level cannot be used with --verbose/--quiet")
+    if log_level:
+        set_root_level(LOG_LEVELS[log_level])
+    elif quiet:
+        set_root_level(logging.WARNING)
+    elif verbose:
+        set_root_level(logging.DEBUG)
         logger.debug("Verbose flag enabled – root log‑level set to DEBUG")
 
     try:
@@ -130,9 +148,18 @@ def main() -> None:
             for pdf_path in args.pdfs:
                 results.append(completed[pdf_path])
 
-        print(
-            json.dumps(results, ensure_ascii=False, indent=2 if args.verbose else None)
-        )
+        # Emit JSON to stdout.
+        # If the downstream pipe closes early (e.g. `| head`), writing to
+        # stdout raises BrokenPipeError.  Treat that as a normal termination
+        # and exit silently.
+        import contextlib
+
+        with contextlib.suppress(BrokenPipeError):
+            print(
+                json.dumps(
+                    results, ensure_ascii=False, indent=2 if args.verbose else None
+                )
+            )
 
     except PipelineError as exc:
         logger.error(str(exc))
Original file line number	Diff line number	Diff line change
`@@ -1,3 +0,0 @@`
`1`		`-[flake8]`
`2`		`-max-line-length = 88`
`3`		`-extend-ignore = E203`