Skip to content

Commit 82d0bf2

Browse files
committed
added test case for the bbox_no_intersection method
changed the test name to be more aligned with other tests
1 parent f432359 commit 82d0bf2

File tree

1 file changed

+56
-0
lines changed

1 file changed

+56
-0
lines changed

tests/test_utils.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"Test to check intersection logic when no intersection area returned"
2+
import os
3+
import sys
4+
5+
from pdfminer.pdfparser import PDFParser
6+
from pdfminer.pdfdocument import PDFDocument
7+
from pdfminer.pdfpage import PDFPage
8+
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
9+
from pdfminer.pdfinterp import PDFResourceManager
10+
from pdfminer.pdfinterp import PDFPageInterpreter
11+
from pdfminer.converter import PDFPageAggregator
12+
from pdfminer.layout import (
13+
LAParams,
14+
LTAnno,
15+
LTChar,
16+
LTTextLineHorizontal,
17+
LTTextLineVertical,
18+
LTImage,
19+
LTTextBoxHorizontal
20+
)
21+
22+
# Directory of PDF fixtures: the "files" folder that sits next to this module.
testdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
24+
25+
from camelot.utils import bbox_intersection_area
26+
27+
def get_text_from_pdf(filename):
    """Return the first horizontal text box found in a PDF file.

    Each page is run through pdfminer's layout analysis with default
    ``LAParams``; the first ``LTTextBoxHorizontal`` element encountered
    while iterating the pages is returned.

    Parameters
    ----------
    filename : str
        Path of the PDF file to read.

    Returns
    -------
    LTTextBoxHorizontal or None
        The first horizontal text box found, or ``None`` when no page
        yields one.
    """
    # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
    # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
    # The context manager closes the file handle on every exit path,
    # including the early return below and any parsing error.
    with open(filename, "rb") as document:
        # Create resource manager.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for the page.
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    return element
    return None
46+
47+
def test_bbox_intersection_text():
    """
    Check that text boxes taken from two different PDF fixtures yield an
    intersection area of 0.0.
    """
    # Extract the first horizontal text box from each fixture, in order.
    first_box, second_box = [
        get_text_from_pdf(os.path.join(testdir, fixture))
        for fixture in ("foo.pdf", "tabula/12s0324.pdf")
    ]

    assert bbox_intersection_area(first_box, second_box) == 0.0

0 commit comments

Comments
 (0)