Skip to content

Commit f4f4131

Browse files
authored
Merge pull request #3 from domfahey/add_page_numbers_to_ocr_output
Add page numbers to ocr output
2 parents 236c63a + 2234cfc commit f4f4131

File tree

10 files changed

+256
-54
lines changed

10 files changed

+256
-54
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,4 @@ venv.bak/
6565
output.json
6666
*.ppm
6767
*.png
68+
.aider*

ocroutput.txt

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

ocroutput.txt.old

Lines changed: 6 additions & 0 deletions
Large diffs are not rendered by default.
File renamed without changes.

segments.json

Whitespace-only changes.

segments2.json

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
[
2+
{
3+
"file": "Sample_Title_Search3.pdf",
4+
"segmentation": {
5+
"documents": [
6+
{
7+
"title": "Assessment Postcard",
8+
"pages": [
9+
1,
10+
1
11+
],
12+
"summary": "Tax assessment details for 408 Trinidad Blvd over multiple years.",
13+
"recording_reference": "Book 2448 Page 67"
14+
},
15+
{
16+
"title": "Tax Map",
17+
"pages": [
18+
2,
19+
5
20+
],
21+
"summary": "A detailed tax map showing various blocks and lots including sectional overlays.",
22+
"recording_reference": null
23+
},
24+
{
25+
"title": "Title Insurance Policy",
26+
"pages": [
27+
6,
28+
8
29+
],
30+
"summary": "Owner’s policy covering 413 Trinidad Blvd and relevant coverage details.",
31+
"recording_reference": "Deed Book 6156 Page 249"
32+
},
33+
{
34+
"title": "Deed - Bargain & Sale",
35+
"pages": [
36+
9,
37+
12
38+
],
39+
"summary": "Transfer of ownership from Holiday City to Joseph & Nancy Quieti.",
40+
"recording_reference": "Book 2448 Page 67"
41+
},
42+
{
43+
"title": "Notice of Settlement",
44+
"pages": [
45+
13,
46+
15
47+
],
48+
"summary": "Planned transfer of 408 Trinidad Blvd from Estate of Nancy Quieti to Michael & Donna Reilly.",
49+
"recording_reference": "DOCKET# 36113"
50+
},
51+
{
52+
"title": "Declaration of Covenants",
53+
"pages": [
54+
16,
55+
17
56+
],
57+
"summary": "Covenants and restrictions for Holiday City at Monroe.",
58+
"recording_reference": "Book 1827 Page 103"
59+
},
60+
{
61+
"title": "Deed",
62+
"pages": [
63+
18,
64+
22
65+
],
66+
"summary": "Transfer of ownership from Holiday City to Elinor G Steele.",
67+
"recording_reference": "Book 2460 Page 299"
68+
},
69+
{
70+
"title": "Deed of Easement",
71+
"pages": [
72+
23,
73+
30
74+
],
75+
"summary": "Easement rights granted from Hovsons to Atlantic City Electric Company.",
76+
"recording_reference": "Book 1456 Page 843"
77+
},
78+
{
79+
"title": "Permit",
80+
"pages": [
81+
31,
82+
33
83+
],
84+
"summary": "Environmental protection permit issued for Holiday City at Monroe subdivision.",
85+
"recording_reference": "Book 1594 Page 340"
86+
},
87+
{
88+
"title": "Deed - Bargain & Sale",
89+
"pages": [
90+
34,
91+
37
92+
],
93+
"summary": "Transfer of property from Holiday City to Harold and Elizabeth Throneberry.",
94+
"recording_reference": "Book 1937 Page 192"
95+
}
96+
],
97+
"total_pages": 37
98+
}
99+
}
100+
]

src/pdf_ocr_pipeline/ocr.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,19 @@ def run_cmd(
159159
return proc
160160

161161

162+
def _wrap_page_text(text: str, page_num: int) -> str:
163+
"""Wrap page text with standardized page number tags.
164+
165+
Args:
166+
text: The OCR text for a single page
167+
page_num: The page number (1-based)
168+
169+
Returns:
170+
Text wrapped with page number tags
171+
"""
172+
return f"<page number {page_num}>\n{text}\n</page number {page_num}>"
173+
174+
162175
def ocr_pdf(pdf_path: Path, dpi: int = 300, lang: str = "eng") -> str:
163176
"""
164177
Perform OCR on a PDF file using pdftoppm and tesseract.
@@ -169,7 +182,8 @@ def ocr_pdf(pdf_path: Path, dpi: int = 300, lang: str = "eng") -> str:
169182
lang: Tesseract language code.
170183
171184
Returns:
172-
The recognized text as a Unicode string.
185+
The recognized text as a Unicode string, with each page wrapped in
186+
'<page number X>...</page number X>' tags.
173187
174188
Exits:
175189
1 if pdftoppm or tesseract fails.
@@ -346,7 +360,9 @@ def ocr_pdf(pdf_path: Path, dpi: int = 300, lang: str = "eng") -> str:
346360
)
347361
raise OcrError("tesseract failed during streaming fallback") from e
348362

349-
return (tess_res.stdout or b"").decode("utf-8", errors="replace")
363+
# Wrap single-page output in tags
364+
text = (tess_res.stdout or b"").decode("utf-8", errors="replace")
365+
return _wrap_page_text(text, 1)
350366

351367
if not images:
352368
logger.error("pdftoppm produced no images for %s", pdf_path)
@@ -390,4 +406,8 @@ def ocr_pdf(pdf_path: Path, dpi: int = 300, lang: str = "eng") -> str:
390406
(tess_res.stdout or b"").decode("utf-8", errors="replace")
391407
)
392408

393-
return "\n\f\n".join(ocr_text_parts)
409+
# Wrap each page's OCR text in page-number tags
410+
pages: List[str] = []
411+
for idx, part in enumerate(ocr_text_parts, start=1):
412+
pages.append(_wrap_page_text(part, idx))
413+
return "\n".join(pages)
Lines changed: 58 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,73 @@
11
Task Name: "Segment and Label Real‑Estate Documents Inside a Single PDF"
22

33
1. Your Role
4-
You are a senior real‑estate paralegal and title‑search specialist. You know the structure, phrasing, and recording conventions of:
5-
- Deeds (Warranty, Quit‑Claim, etc.)
6-
- Mortgages / Deeds of Trust
7-
- Assignments & Releases
8-
- Affidavits
9-
- Easements
10-
- Liens & Lien Releases
11-
- Title Commitments / Policies
12-
- Any other real‑estate instrument that may appear in a closing package
13-
14-
2. Input
15-
A single multi‑page PDF that may bundle several distinct instruments.
16-
17-
3. Output (required)
18-
Return exactly one valid JSON object with this shape (1‑based page numbers):
4+
System:
5+
You are **Segmenter-X**, an LLM that slices multi-page real-estate PDFs into discrete instruments and returns a single JSON payload. Any text outside the required tags will break downstream code—don’t do it.
196

7+
User:
8+
<task>
9+
{{TASK_NAME}}
10+
</task>
11+
12+
<doc>
13+
{{PDF_CONTENT # base64 or URL}}
14+
</doc>
15+
16+
### 📤 OUTPUT SPEC
17+
Return one—and only one—JSON object wrapped in `<json>` tags.
18+
19+
<json>
2020
{
2121
"documents": [
2222
{
23-
"title": "Warranty Deed",
24-
"pages": [1, 4],
25-
"summary": "Conveys fee simple from Grantor A to Grantee B; legal description in Exhibit A",
26-
"recording_reference": "OR Book 123 / Page 456" // omit if not visible
27-
},
28-
...
23+
"title": "Formal Instrument Name or \"Unknown\"",
24+
"pages": [start, end], // 1-based, inclusive
25+
"summary": "One-sentence gist",
26+
"recording_reference": "Book/Volume/Page etc. or null"
27+
}
28+
2929
],
30-
"total_pages": 37
30+
"total_pages": INT
3131
}
32+
</json>
33+
34+
Rules
35+
1. Every page 1..total_pages appears in exactly one range.
36+
2. If unsure about a boundary, merge into the earlier doc; explain why in analysis.
37+
3. Keep summaries under 40 words; omit filler (“this document”).
38+
4. If recording data isn’t on the face, set `"recording_reference": null`.
39+
5. When token usage exceeds 80% of the limit, stop scanning and return:
40+
```{"error": "context_exceeded", "processed_pages": N}```
3241

33-
Rules:
34-
1. pages is an array [start, end]; include every page once and only once.
35-
2. title must be the formal instrument name as it appears (fallback to your best guess).
36-
3. summary and recording_reference are optional but encouraged when information is available.
37-
4. Add "Unknown" as title if you cannot classify an instrument.
42+
### 🧐 ANALYSIS (for humans, parsers ignore)
43+
Wrap reasoning in `<analysis>` … `</analysis>` tags. Do not nest tags. Inside, follow this plan—bullet lists welcome:
3844

39-
4. Method (how you should think)
45+
1. **Page signals** – dump header/footer hits, page-x-of-y resets, title blocks.
46+
2. **Keyword scan** – report matches per page for deed, mortgage, assignment vocab.
47+
3. **Boundary hypotheses** – propose splits; note exhibits & signature drift.
48+
4. **Second pass** – resolve overlaps; show before/after page map.
49+
5. **Self-check** – confirm ∑(range lengths) == total_pages; if not, fix and explain.
4050

41-
1. Scan headers/footers for document names, internal page numbers, recorder stamps.
42-
2. Detect page‑number resets (“Page 1 of X” → new doc).
43-
3. Spot title blocks / opening clauses (“THIS DEED…”, “THIS MORTGAGE…”).
44-
4. Watch signature & notary pages – the next page after one often begins a new doc.
45-
5. Check vocabulary:
46-
- Deed → “grantor”, “grantee”, “consideration”, metes & bounds.
47-
- Mortgage → “borrower”, “lender”, “security instrument”.
48-
- Assignment/Release → cites prior instrument & recording data.
49-
6. Edge cases
50-
- If a document spills an exhibit into the next pages, treat exhibits as part of that doc.
51-
- Illegible pages: use context before/after to decide.
51+
Close the `<analysis>` block, then emit the `<json>` block and nothing else.
5252

53-
Finish by validating the sum of all page ranges equals total_pages. If it doesn’t, fix it.
53+
### 📝 MINI EXAMPLE (helps zero-shot)
54+
Suppose a 4-page PDF:
55+
p1 “THIS WARRANTY DEED …”
56+
p2 deed continues
57+
p3 “MORTGAGE … page 1 of 2”
58+
p4 mortgage page 2.
5459

55-
5. Tone & Formatting for the Response
56-
Respond only with the JSON object—no narrative, no Markdown.
60+
Expected:
61+
62+
<json>
63+
{
64+
"documents":[
65+
{"title":"Warranty Deed","pages":[1,2],"summary":"…","recording_reference":null},
66+
{"title":"Mortgage","pages":[3,4],"summary":"…","recording_reference":null}
67+
],
68+
"total_pages":4
69+
}
70+
</json>
5771

58-
That’s it. Go segment the PDF.
72+
### 🔒 REMEMBER
73+
*No stray text after the JSON block.* Any deviation causes the pipeline to fail with a 400 error.

tests/test_ocr.py

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,10 @@ def test_ocr_pdf_success_scanned(self, mock_run_cmd):
103103

104104
# Select pdf path
105105
result = ocr_pdf(self.scanned_pdf)
106-
106+
# Expected tagged output for a single page
107+
expected = "<page number 1>\nSample OCR text\n</page number 1>"
107108
# Assertions
108-
self.assertEqual(result, "Sample OCR text")
109+
self.assertEqual(result, expected)
109110
self.assertEqual(mock_run_cmd.call_count, 2)
110111

111112
def test_ocr_pdf_success_digital(self, mock_run_cmd):
@@ -125,9 +126,10 @@ def test_ocr_pdf_success_digital(self, mock_run_cmd):
125126
from pdf_ocr_pipeline.ocr import ocr_pdf
126127

127128
result = ocr_pdf(self.digital_pdf)
128-
129+
# Expected tagged output for a single page
130+
expected = "<page number 1>\nSample OCR text\n</page number 1>"
129131
# Assertions
130-
self.assertEqual(result, "Sample OCR text")
132+
self.assertEqual(result, expected)
131133
self.assertEqual(mock_run_cmd.call_count, 2)
132134

133135
def test_ocr_pdf_pdftoppm_error_scanned(self, mock_run_cmd):
@@ -228,6 +230,64 @@ def test_ocr_pdf_no_stdout_digital(self, mock_run_cmd):
228230
ocr_pdf(self.digital_pdf)
229231
self.mock_logger.error.assert_called_once()
230232

233+
def test_ocr_pdf_multiple_pages(self, mock_run_cmd):
234+
"""Test that multi-page PDFs are correctly processed with page number tags."""
235+
# Import here to apply patches properly
236+
from pdf_ocr_pipeline.ocr import ocr_pdf
237+
238+
# Mock pdftoppm to produce multiple image files
239+
ppm_result = MagicMock(spec=subprocess.CompletedProcess)
240+
ppm_result.stdout = b"ppm_image_data"
241+
ppm_result.returncode = 0
242+
243+
# Set up multiple test page results
244+
page1_text = "Page one content"
245+
page2_text = "Page two content"
246+
page3_text = "Page three content"
247+
248+
# Create a sequence of mock responses for each page
249+
tess_result1 = MagicMock(spec=subprocess.CompletedProcess)
250+
tess_result1.stdout = page1_text.encode("utf-8")
251+
tess_result1.returncode = 0
252+
253+
tess_result2 = MagicMock(spec=subprocess.CompletedProcess)
254+
tess_result2.stdout = page2_text.encode("utf-8")
255+
tess_result2.returncode = 0
256+
257+
tess_result3 = MagicMock(spec=subprocess.CompletedProcess)
258+
tess_result3.stdout = page3_text.encode("utf-8")
259+
tess_result3.returncode = 0
260+
261+
# Set up the sequence of responses
262+
mock_run_cmd.side_effect = [
263+
ppm_result,
264+
tess_result1,
265+
tess_result2,
266+
tess_result3,
267+
]
268+
269+
# Mock Path.glob to return multiple image file paths
270+
with patch("pathlib.Path.glob") as mock_glob:
271+
mock_glob.return_value = [
272+
Path("/tmp/page-01.ppm"),
273+
Path("/tmp/page-02.ppm"),
274+
Path("/tmp/page-03.ppm"),
275+
]
276+
277+
# Call the function under test
278+
result = ocr_pdf(self.digital_pdf)
279+
280+
# Expected output with correct page number tags
281+
expected = (
282+
f"<page number 1>\n{page1_text}\n</page number 1>\n"
283+
f"<page number 2>\n{page2_text}\n</page number 2>\n"
284+
f"<page number 3>\n{page3_text}\n</page number 3>"
285+
)
286+
287+
# Assertions
288+
self.assertEqual(result, expected)
289+
self.assertEqual(mock_run_cmd.call_count, 4) # 1 pdftoppm + 3 tesseract calls
290+
231291

232292
if __name__ == "__main__":
233293
unittest.main()

todos.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ Done‑when** to guide implementers.
5050

5151
### 17 LLM client
5252

53-
- [ ] Thread‑safe singleton lock.
54-
- [ ] Re‑raise `KeyboardInterrupt` / `BaseException`.
55-
- [ ] Validate response schema.
53+
- [x] Thread‑safe singleton lock.
54+
- [x] Re‑raise `KeyboardInterrupt` / `BaseException`.
55+
- [x] Validate response schema.
5656

5757
### 18 OCR path
5858

0 commit comments

Comments
 (0)