Skip to content

Commit 236c63a

Browse files
authored
Merge pull request #1 from domfahey/todo17
Implement LLM client enhancements (#17): thread-safe singleton instan…
2 parents a3ffdc0 + bbb519c commit 236c63a

18 files changed

Lines changed: 524 additions & 79 deletions

.flake8

Lines changed: 0 additions & 3 deletions
This file was deleted.

CLAUDE.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# CLAUDE.md
2+
3+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4+
5+
## Build/Test Commands
6+
- Run all tests: `make test` or `pytest`
7+
- Run single test: `pytest tests/test_file.py::TestClass::test_name`
8+
- Test with coverage: `pytest --cov=pdf_ocr_pipeline tests/`
9+
- Format code: `make format`
10+
- Lint code: `make lint`
11+
- Combined check: `make check`
12+
13+
## Code Style Guidelines
14+
- Python 3.8+ with strict type annotations (mypy)
15+
- Line length: 88 characters (Black)
16+
- Formatting: ruff format followed by Black
17+
- Imports: stdlib first, then third-party, grouped alphabetically
18+
- Types: Full annotation required; use `TypedDict`, `dataclasses`, and `from __future__ import annotations`
19+
- Naming: snake_case (variables/functions), PascalCase (classes), ALL_CAPS (constants)
20+
- Error handling: Use custom exceptions from `errors.py`, never use `sys.exit()` in library code
21+
- Docstrings: Google style format with clear parameter and return type documentation

examples/segment_pdf.sh

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Simple wrapper to segment a PDF using the default real-estate template.
# Usage: segment_pdf.sh <pdf_file> [output_file]
#
# Pipes the output of `python3 -m pdf_ocr_pipeline` into
# `python3 -m pdf_ocr_pipeline.summarize --pretty` and writes the result to
# <output_file> (default: segment.json). Exits 1 on bad usage, a missing input
# file, or a failure in any stage of the pipeline.

# Report a pipeline as failed if ANY stage fails. Without this, only the exit
# status of the last stage is visible, so a failure in the first stage could be
# reported as success.
set -o pipefail

if [ $# -lt 1 ]; then
    echo "Usage: $(basename "$0") <pdf_file> [output_file]"
    exit 1
fi

PDF_FILE="$1"
OUTPUT_FILE="${2:-segment.json}"

# Ensure input PDF exists
if [ ! -f "$PDF_FILE" ]; then
    echo "Error: File '$PDF_FILE' not found." >&2
    exit 1
fi

echo "Segmenting PDF: $PDF_FILE"
echo "Output will be saved to: $OUTPUT_FILE"

# Ensure local 'src' is first on PYTHONPATH so we run the patched code
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
export PYTHONPATH="$PROJECT_ROOT/src${PYTHONPATH:+:$PYTHONPATH}"

# Run OCR then segmentation (uses default prompt template).
# Always invoke the modules with `python3 -m` so the current workspace code
# (found via PYTHONPATH above) is used.
# The pipeline is tested directly in the `if` so that, together with
# `pipefail`, a failure in either stage takes the error branch.
if python3 -m pdf_ocr_pipeline "$PDF_FILE" \
    | python3 -m pdf_ocr_pipeline.summarize --pretty > "$OUTPUT_FILE"; then
    echo "Segmentation complete. Results saved to: $OUTPUT_FILE"
else
    echo "Error: Segmentation failed." >&2
    exit 1
fi

ocroutput.txt

Lines changed: 6 additions & 0 deletions
Large diffs are not rendered by default.

package-lock.json

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ dependencies = ["openai>=1.75.0", "pydantic>=2.0"]
2929
[project.scripts]
3030
pdf-ocr = "pdf_ocr_pipeline.cli:main"
3131
pdf-ocr-summarize = "pdf_ocr_pipeline.summarize:main"
32+
pdf-ocr-segment = "pdf_ocr_pipeline.segment_cli:main"
3233

3334
[project.optional-dependencies]
3435
dev = [

segment.json

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
[
2+
{
3+
"file": "Sample_Title_Search3.pdf",
4+
"analysis": {
5+
"documents": [
6+
{
7+
"title": "Notice of Settlement",
8+
"pages": [
9+
1,
10+
3
11+
],
12+
"summary": "Notice of Settlement for property at 408 Trinidad Boulevard, involving Michael and Donna Reilly as buyers.",
13+
"recording_reference": "File No. 989-2260645"
14+
},
15+
{
16+
"title": "American Land Title Association Owner's Policy",
17+
"pages": [
18+
4,
19+
6
20+
],
21+
"summary": "Title policy for George and Linda DiCugno, covering 413 Trinidad Boulevard, issued by First American Title Insurance.",
22+
"recording_reference": "Policy No: 5011434-0131314e"
23+
},
24+
{
25+
"title": "Deed",
26+
"pages": [
27+
7,
28+
9
29+
],
30+
"summary": "Conveys property at 408 Trinidad Boulevard from Holiday City at Monroe, Inc. to Joseph and Nancy Quieti."
31+
},
32+
{
33+
"title": "Affidavit of Consideration",
34+
"pages": [
35+
10,
36+
10
37+
],
38+
"summary": "Affidavit confirming monetary consideration in transferring property at 405 Paradise Road."
39+
},
40+
{
41+
"title": "Deed of Easement",
42+
"pages": [
43+
11,
44+
13
45+
],
46+
"summary": "Grants Atlantic City Electric Company easement for utility lines on properties within Monroe Township."
47+
},
48+
{
49+
"title": "Stream Encroachment Permit",
50+
"pages": [
51+
14,
52+
15
53+
],
54+
"summary": "Permit for constructing stormwater structures within defined blocks in Monroe Township."
55+
},
56+
{
57+
"title": "Bargain and Sale Deed",
58+
"pages": [
59+
16,
60+
17
61+
],
62+
"summary": "Conveys property at 413 Trinidad Boulevard from Hovsons Inc. to Harold and Elizabeth Throneberry."
63+
}
64+
],
65+
"total_pages": 17
66+
}
67+
}
68+
]

setup.cfg

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +0,0 @@
1-
[flake8]
2-
max-line-length = 88
3-
extend-ignore = E203

src/pdf_ocr_pipeline/__init__.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,33 @@ def process_pdf(
4040
if not pdf_path.is_file():
4141
raise FileNotFoundError(f"File not found: {pdf_path}")
4242

43-
dpi_val = settings.dpi or _settings.dpi
44-
lang_val = settings.lang or _settings.lang
43+
opts = settings # local alias for brevity (does not shadow module)
44+
45+
dpi_val = opts.dpi or _settings.dpi
46+
lang_val = opts.lang or _settings.lang
4547

4648
ocr_text = ocr_pdf(pdf_path, dpi=dpi_val, lang=lang_val)
4749

48-
if not settings.analyze:
50+
if not opts.analyze:
4951
return cast(OcrResult, {"file": pdf_path.name, "ocr_text": ocr_text})
5052

51-
prompt_val: str = settings.prompt or _settings.prompt
52-
seg_json = segment_pdf(ocr_text, prompt_val, model=settings.model or "gpt-4o")
53+
# Only forward a prompt if the caller explicitly supplied one **and** it
54+
# is non‑empty. Otherwise let *segment_pdf* fall back to its built‑in
55+
# template to avoid accidentally re‑using a generic summarization prompt
56+
# that may be present in ``settings.prompt`` (e.g. from a local INI file).
57+
58+
prompt_val = opts.prompt or ""
59+
60+
if prompt_val:
61+
prompt_arg = prompt_val
62+
else:
63+
prompt_arg = None # let segment_pdf pick default template
64+
65+
seg_json = segment_pdf(
66+
ocr_text,
67+
prompt_arg,
68+
model=opts.model or "gpt-4o",
69+
)
5370

5471
return cast(SegmentationResult, seg_json)
5572

src/pdf_ocr_pipeline/cli.py

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from concurrent.futures import ThreadPoolExecutor, as_completed
1212

1313
# Local imports
14-
from .logging_utils import get_logger
14+
from .logging_utils import get_logger, set_root_level
1515
from .ocr import ocr_pdf
1616
from .errors import PipelineError
1717
from .types import OcrResult
@@ -27,6 +27,12 @@
2727
"""
2828
logger = get_logger(__name__)
2929

30+
LOG_LEVELS: dict[str, int] = {
31+
"DEBUG": logging.DEBUG,
32+
"INFO": logging.INFO,
33+
"WARNING": logging.WARNING,
34+
"ERROR": logging.ERROR,
35+
}
3036

3137
"""
3238
Command-line interface entrypoint.
@@ -76,20 +82,32 @@ def main() -> None:
7682
default=False,
7783
help="suppress informational output; only warnings and errors are shown",
7884
)
85+
parser.add_argument(
86+
"--log-level",
87+
choices=list(LOG_LEVELS.keys()),
88+
help="set root log level",
89+
)
7990
# ------------------------------------------------------------------
8091
# Apply logging level according to CLI flags
8192
# ------------------------------------------------------------------
8293
args = parser.parse_args()
8394

84-
root_logger = logging.getLogger()
95+
# Determine flags for logging, guard against mocks in tests
96+
log_level = args.log_level if args.log_level in LOG_LEVELS else None
97+
verbose = args.verbose if isinstance(args.verbose, bool) else False
98+
quiet = args.quiet if isinstance(args.quiet, bool) else False
8599

86-
if args.verbose and getattr(args, "quiet", False):
100+
# Apply logging level according to CLI flags (--log-level cannot be combined with --verbose/--quiet)
101+
if verbose and quiet:
87102
parser.error("--verbose and --quiet are mutually exclusive")
88-
89-
if getattr(args, "quiet", False):
90-
root_logger.setLevel(logging.WARNING)
91-
elif args.verbose:
92-
root_logger.setLevel(logging.DEBUG)
103+
if log_level and (verbose or quiet):
104+
parser.error("--log-level cannot be used with --verbose/--quiet")
105+
if log_level:
106+
set_root_level(LOG_LEVELS[log_level])
107+
elif quiet:
108+
set_root_level(logging.WARNING)
109+
elif verbose:
110+
set_root_level(logging.DEBUG)
93111
logger.debug("Verbose flag enabled – root log‑level set to DEBUG")
94112

95113
try:
@@ -130,9 +148,18 @@ def main() -> None:
130148
for pdf_path in args.pdfs:
131149
results.append(completed[pdf_path])
132150

133-
print(
134-
json.dumps(results, ensure_ascii=False, indent=2 if args.verbose else None)
135-
)
151+
# Emit JSON to stdout.
152+
# If the downstream pipe closes early (e.g. `| head`), writing to
153+
# stdout raises BrokenPipeError. Treat that as a normal termination
154+
# and exit silently.
155+
import contextlib
156+
157+
with contextlib.suppress(BrokenPipeError):
158+
print(
159+
json.dumps(
160+
results, ensure_ascii=False, indent=2 if args.verbose else None
161+
)
162+
)
136163

137164
except PipelineError as exc:
138165
logger.error(str(exc))

0 commit comments

Comments
 (0)