|
| 1 | +"Test to check intersection logic when no intersection area returned" |
| 2 | +import os |
| 3 | +import sys |
| 4 | + |
| 5 | +from pdfminer.pdfparser import PDFParser |
| 6 | +from pdfminer.pdfdocument import PDFDocument |
| 7 | +from pdfminer.pdfpage import PDFPage |
| 8 | +from pdfminer.pdfpage import PDFTextExtractionNotAllowed |
| 9 | +from pdfminer.pdfinterp import PDFResourceManager |
| 10 | +from pdfminer.pdfinterp import PDFPageInterpreter |
| 11 | +from pdfminer.converter import PDFPageAggregator |
| 12 | +from pdfminer.layout import ( |
| 13 | + LAParams, |
| 14 | + LTAnno, |
| 15 | + LTChar, |
| 16 | + LTTextLineHorizontal, |
| 17 | + LTTextLineVertical, |
| 18 | + LTImage, |
| 19 | + LTTextBoxHorizontal |
| 20 | +) |
| 21 | + |
# Directory holding the PDF fixtures: the "files" folder next to this module.
testdir = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "files",
)
| 24 | + |
| 25 | +from camelot.utils import bbox_intersection_area |
| 26 | + |
def get_text_from_pdf(filename):
    """Return the first horizontal text box found in a PDF.

    Pages are run through pdfminer's layout analysis with default
    ``LAParams``; the first ``LTTextBoxHorizontal`` element encountered is
    returned.

    Parameters
    ----------
    filename : str
        Path of the PDF file to read.

    Returns
    -------
    LTTextBoxHorizontal or None
        The first horizontal text box in the layout, or ``None`` when no
        page yields one.

    """
    # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
    # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
    # Create resource manager.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The context manager closes the file handle on every exit path,
    # including the early return below (the original never closed it).
    with open(filename, "rb") as document:
        for page in PDFPage.get_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for the page.
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    return element
    # No horizontal text box on any page.
    return None
| 46 | + |
def test_bbox_intersection_text():
    """
    Text boxes extracted from two different PDFs must report an
    intersection area of zero.
    """
    fixture_paths = (
        os.path.join(testdir, "foo.pdf"),
        os.path.join(testdir, "tabula/12s0324.pdf"),
    )
    # Pull one horizontal text box out of each fixture, in order.
    first_box, second_box = [get_text_from_pdf(path) for path in fixture_paths]

    overlap = bbox_intersection_area(first_box, second_box)
    assert overlap == 0.0
0 commit comments