domfahey
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/ocroutput.txt b/ocroutput.txt
Lines changed: 1 addition & 1 deletion
diff --git a/ocroutput.txt.old b/ocroutput.txt.old
Lines changed: 6 additions & 0 deletions
diff --git a/segment.json b/segment.json.old
segment.json renamed to segment.json.old
diff --git a/segments.json b/segments.json
diff --git a/segments2.json b/segments2.json
Lines changed: 100 additions & 0 deletions
diff --git a/src/pdf_ocr_pipeline/ocr.py b/src/pdf_ocr_pipeline/ocr.py
Lines changed: 23 additions & 3 deletions
diff --git a/src/pdf_ocr_pipeline/templates/segment_prompt.txt b/src/pdf_ocr_pipeline/templates/segment_prompt.txt
Lines changed: 58 additions & 43 deletions
diff --git a/tests/test_ocr.py b/tests/test_ocr.py
Lines changed: 64 additions & 4 deletions
diff --git a/todos.md b/todos.md
Lines changed: 3 additions & 3 deletions
@@ -65,3 +65,4 @@ venv.bak/
 output.json
 *.ppm
 *.png
+.aider*
@@ -0,0 +1,100 @@
+[
+  {
+    "file": "Sample_Title_Search3.pdf",
+    "segmentation": {
+      "documents": [
+        {
+          "title": "Assessment Postcard",
+          "pages": [
+            1,
+            1
+          ],
+          "summary": "Tax assessment details for 408 Trinidad Blvd over multiple years.",
+          "recording_reference": "Book 2448 Page 67"
+        },
+        {
+          "title": "Tax Map",
+          "pages": [
+            2,
+            5
+          ],
+          "summary": "A detailed tax map showing various blocks and lots including sectional overlays.",
+          "recording_reference": null
+        },
+        {
+          "title": "Title Insurance Policy",
+          "pages": [
+            6,
+            8
+          ],
+          "summary": "Owner’s policy covering 413 Trinidad Blvd and relevant coverage details.",
+          "recording_reference": "Deed Book 6156 Page 249"
+        },
+        {
+          "title": "Deed - Bargain & Sale",
+          "pages": [
+            9,
+            12
+          ],
+          "summary": "Transfer of ownership from Holiday City to Joseph & Nancy Quieti.",
+          "recording_reference": "Book 2448 Page 67"
+        },
+        {
+          "title": "Notice of Settlement",
+          "pages": [
+            13,
+            15
+          ],
+          "summary": "Planned transfer of 408 Trinidad Blvd from Estate of Nancy Quieti to Michael & Donna Reilly.",
+          "recording_reference": "DOCKET# 36113"
+        },
+        {
+          "title": "Declaration of Covenants",
+          "pages": [
+            16,
+            17
+          ],
+          "summary": "Covenants and restrictions for Holiday City at Monroe.",
+          "recording_reference": "Book 1827 Page 103"
+        },
+        {
+          "title": "Deed",
+          "pages": [
+            18,
+            22
+          ],
+          "summary": "Transfer of ownership from Holiday City to Elinor G Steele.",
+          "recording_reference": "Book 2460 Page 299"
+        },
+        {
+          "title": "Deed of Easement",
+          "pages": [
+            23,
+            30
+          ],
+          "summary": "Easement rights granted from Hovsons to Atlantic City Electric Company.",
+          "recording_reference": "Book 1456 Page 843"
+        },
+        {
+          "title": "Permit",
+          "pages": [
+            31,
+            33
+          ],
+          "summary": "Environmental protection permit issued for Holiday City at Monroe subdivision.",
+          "recording_reference": "Book 1594 Page 340"
+        },
+        {
+          "title": "Deed - Bargain & Sale",
+          "pages": [
+            34,
+            37
+          ],
+          "summary": "Transfer of property from Holiday City to Harold and Elizabeth Throneberry.",
+          "recording_reference": "Book 1937 Page 192"
+        }
+      ],
+      "total_pages": 37
+    }
+  }
+]
@@ -159,6 +159,19 @@ def run_cmd(
     return proc
 
 
+def _wrap_page_text(text: str, page_num: int) -> str:
+    """Enclose one page of OCR text in opening and closing page-number tags.
+
+    Args:
+        text: The OCR text for a single page; inserted verbatim.
+        page_num: The page number (1-based) written into both tags.
+
+    Returns:
+        '<page number N>', the text, and '</page number N>' joined by newlines.
+    """
+    return f"<page number {page_num}>\n{text}\n</page number {page_num}>"
+
+
 def ocr_pdf(pdf_path: Path, dpi: int = 300, lang: str = "eng") -> str:
     """
     Perform OCR on a PDF file using pdftoppm and tesseract.
@@ -169,7 +182,8 @@ def ocr_pdf(pdf_path: Path, dpi: int = 300, lang: str = "eng") -> str:
         lang: Tesseract language code.
 
     Returns:
-        The recognized text as a Unicode string.
+        The recognized text as a Unicode string, with each page wrapped in
+        '<page number X>...</page number X>' tags.
 
     Exits:
         1 if pdftoppm or tesseract fails.
@@ -346,7 +360,9 @@ def ocr_pdf(pdf_path: Path, dpi: int = 300, lang: str = "eng") -> str:
                 )
                 raise OcrError("tesseract failed during streaming fallback") from e
 
-            return (tess_res.stdout or b"").decode("utf-8", errors="replace")
+            # Wrap single-page output in tags
+            text = (tess_res.stdout or b"").decode("utf-8", errors="replace")
+            return _wrap_page_text(text, 1)
 
         if not images:
             logger.error("pdftoppm produced no images for %s", pdf_path)
@@ -390,4 +406,8 @@ def ocr_pdf(pdf_path: Path, dpi: int = 300, lang: str = "eng") -> str:
                 (tess_res.stdout or b"").decode("utf-8", errors="replace")
             )
 
-    return "\n\f\n".join(ocr_text_parts)
+    # Wrap each page's OCR text in page-number tags
+    pages: List[str] = []
+    for idx, part in enumerate(ocr_text_parts, start=1):
+        pages.append(_wrap_page_text(part, idx))
+    return "\n".join(pages)
@@ -1,58 +1,73 @@
 Task Name: "Segment and Label Real‑Estate Documents Inside a Single PDF"
 
 1. Your Role
-You are a senior real‑estate paralegal and title‑search specialist. You know the structure, phrasing, and recording conventions of:
-- Deeds (Warranty, Quit‑Claim, etc.)
-- Mortgages / Deeds of Trust
-- Assignments & Releases
-- Affidavits
-- Easements
-- Liens & Lien Releases
-- Title Commitments / Policies
-- Any other real‑estate instrument that may appear in a closing package
-
-2. Input
-A single multi‑page PDF that may bundle several distinct instruments.
-
-3. Output (required)
-Return exactly one valid JSON object with this shape (1‑based page numbers):
+System:
+You are **Segmenter-X**, an LLM that slices multi-page real-estate PDFs into discrete instruments and returns a single JSON payload. Any text outside the required tags will break downstream code—don’t do it.
 
+User:
+<task>
+{{TASK_NAME}}
+</task>
+
+<doc>
+{{PDF_CONTENT   # base64 or URL}}
+</doc>
+
+### 📤 OUTPUT SPEC
+Return one—and only one—JSON object wrapped in `<json>` tags.
+
+<json>
 {
   "documents": [
     {
-      "title": "Warranty Deed",
-      "pages": [1, 4],
-      "summary": "Conveys fee simple from Grantor A to Grantee B; legal description in Exhibit A",
-      "recording_reference": "OR Book 123 / Page 456"  // omit if not visible
-    },
-    ...
+      "title": "Formal Instrument Name or \"Unknown\"",
+      "pages": [start, end],     // 1-based, inclusive
+      "summary": "One-sentence gist",
+      "recording_reference": "Book/Volume/Page etc. or null"
+    }
+    …
   ],
-  "total_pages": 37
+  "total_pages": INT
 }
+</json>
+
+Rules
+1. Every page 1..total_pages appears in exactly one range.
+2. If unsure about a boundary, merge into the earlier doc; explain why in analysis.
+3. Keep summaries under 40 words; omit filler (“this document”).
+4. If recording data isn’t on the face, set `"recording_reference": null`.
+5. When token usage > 80 % of limit, stop scanning and return this error object inside the `<json>` tags instead of the normal payload:
+   ```{"error": "context_exceeded", "processed_pages": N}```
 
-Rules:
-1. pages is an array [start, end]; include every page once and only once.
-2. title must be the formal instrument name as it appears (fallback to your best guess).
-3. summary and recording_reference are optional but encouraged when information is available.
-4. Add "Unknown" as title if you cannot classify an instrument.
+### 🧐 ANALYSIS (for humans; parsers ignore it)
+Wrap reasoning in `<analysis>` … `</analysis>` tags. Do not nest tags. Inside, follow this plan—bullet lists welcome:
 
-4. Method (how you should think)
+1. **Page signals** – dump header/footer hits, page-x-of-y resets, title blocks.
+2. **Keyword scan** – report matches per page for deed, mortgage, assignment vocab.
+3. **Boundary hypotheses** – propose splits; note exhibits & signature drift.
+4. **Second pass** – resolve overlaps; show before/after page map.
+5. **Self-check** – confirm ∑(range lengths) == total_pages; if not, fix and explain.
 
-1. Scan headers/footers for document names, internal page numbers, recorder stamps.
-2. Detect page‑number resets (“Page 1 of X” → new doc).
-3. Spot title blocks / opening clauses (“THIS DEED…”, “THIS MORTGAGE…”).
-4. Watch signature & notary pages – the next page after one often begins a new doc.
-5. Check vocabulary:
-   - Deed → “grantor”, “grantee”, “consideration”, metes & bounds.
-   - Mortgage → “borrower”, “lender”, “security instrument”.
-   - Assignment/Release → cites prior instrument & recording data.
-6. Edge cases
-   - If a document spills an exhibit into the next pages, treat exhibits as part of that doc.
-   - Illegible pages: use context before/after to decide.
+End `<analysis>` then emit the `<json>` block and nothing else.
 
-Finish by validating the sum of all page ranges equals total_pages. If it doesn’t, fix it.
+### 📝 MINI EXAMPLE  (helps zero-shot)
+Suppose a 4-page PDF:  
+p1 “THIS WARRANTY DEED …”  
+p2 deed continues  
+p3 “MORTGAGE … page 1 of 2”  
+p4 mortgage page 2.  
 
-5. Tone & Formatting for the Response
-Respond only with the JSON object—no narrative, no Markdown.
+Expected:
+
+<json>
+{
+  "documents":[
+    {"title":"Warranty Deed","pages":[1,2],"summary":"…","recording_reference":null},
+    {"title":"Mortgage","pages":[3,4],"summary":"…","recording_reference":null}
+  ],
+  "total_pages":4
+}
+</json>
 
-That’s it. Go segment the PDF.
+### 🔒 REMEMBER
+*No stray text after the JSON block.* Any deviation 400s the pipeline.
@@ -103,9 +103,10 @@ def test_ocr_pdf_success_scanned(self, mock_run_cmd):
 
         # Select pdf path
         result = ocr_pdf(self.scanned_pdf)
-
+        # Expected tagged output for a single page
+        expected = "<page number 1>\nSample OCR text\n</page number 1>"
         # Assertions
-        self.assertEqual(result, "Sample OCR text")
+        self.assertEqual(result, expected)
         self.assertEqual(mock_run_cmd.call_count, 2)
 
     def test_ocr_pdf_success_digital(self, mock_run_cmd):
@@ -125,9 +126,10 @@ def test_ocr_pdf_success_digital(self, mock_run_cmd):
         from pdf_ocr_pipeline.ocr import ocr_pdf
 
         result = ocr_pdf(self.digital_pdf)
-
+        # Expected tagged output for a single page
+        expected = "<page number 1>\nSample OCR text\n</page number 1>"
         # Assertions
-        self.assertEqual(result, "Sample OCR text")
+        self.assertEqual(result, expected)
         self.assertEqual(mock_run_cmd.call_count, 2)
 
     def test_ocr_pdf_pdftoppm_error_scanned(self, mock_run_cmd):
@@ -228,6 +230,64 @@ def test_ocr_pdf_no_stdout_digital(self, mock_run_cmd):
             ocr_pdf(self.digital_pdf)
         self.mock_logger.error.assert_called_once()
 
+    def test_ocr_pdf_multiple_pages(self, mock_run_cmd):
+        """Test that multi-page PDFs are correctly processed with page number tags."""
+        # Import inside the test so the active patches apply
+        from pdf_ocr_pipeline.ocr import ocr_pdf
+
+        # Mock a successful pdftoppm run; the page images come from the Path.glob mock below
+        ppm_result = MagicMock(spec=subprocess.CompletedProcess)
+        ppm_result.stdout = b"ppm_image_data"
+        ppm_result.returncode = 0
+
+        # OCR text that each mocked tesseract call will return
+        page1_text = "Page one content"
+        page2_text = "Page two content"
+        page3_text = "Page three content"
+
+        # One mock tesseract result per page, each with returncode 0
+        tess_result1 = MagicMock(spec=subprocess.CompletedProcess)
+        tess_result1.stdout = page1_text.encode("utf-8")
+        tess_result1.returncode = 0
+
+        tess_result2 = MagicMock(spec=subprocess.CompletedProcess)
+        tess_result2.stdout = page2_text.encode("utf-8")
+        tess_result2.returncode = 0
+
+        tess_result3 = MagicMock(spec=subprocess.CompletedProcess)
+        tess_result3.stdout = page3_text.encode("utf-8")
+        tess_result3.returncode = 0
+
+        # run_cmd returns these in call order: pdftoppm first, then one result per page
+        mock_run_cmd.side_effect = [
+            ppm_result,
+            tess_result1,
+            tess_result2,
+            tess_result3,
+        ]
+
+        # Mock Path.glob to return three page image paths
+        with patch("pathlib.Path.glob") as mock_glob:
+            mock_glob.return_value = [
+                Path("/tmp/page-01.ppm"),
+                Path("/tmp/page-02.ppm"),
+                Path("/tmp/page-03.ppm"),
+            ]
+
+            # Call the function under test
+            result = ocr_pdf(self.digital_pdf)
+
+        # Each page wrapped in its own 1-based tags; pages joined by a single newline
+        expected = (
+            f"<page number 1>\n{page1_text}\n</page number 1>\n"
+            f"<page number 2>\n{page2_text}\n</page number 2>\n"
+            f"<page number 3>\n{page3_text}\n</page number 3>"
+        )
+
+        # Assertions
+        self.assertEqual(result, expected)
+        self.assertEqual(mock_run_cmd.call_count, 4)  # 1 pdftoppm + 3 tesseract calls
+
 
 if __name__ == "__main__":
     unittest.main()
@@ -50,9 +50,9 @@ Done‑when** to guide implementers.
 
 ### 17 LLM client
 
-- [ ] Thread‑safe singleton lock.  
-- [ ] Re‑raise `KeyboardInterrupt` / `BaseException`.  
-- [ ] Validate response schema.
+- [x] Thread‑safe singleton lock.  
+- [x] Re‑raise `KeyboardInterrupt` / `BaseException`.  
+- [x] Validate response schema.
 
 ### 18 OCR path
-Original file line number
+Diff line change
 output.json
 *.ppm
 *.png
 +.aider*