11"""Core routines for PDF extraction."""
2+
23import itertools
34import logging
45import os
1415from libpdf import process as pro
1516from libpdf .apiobjects import ApiObjects
1617from libpdf .catalog import catalog , extract_catalog
17- from libpdf .exceptions import LibpdfException
18+ from libpdf .exceptions import LibpdfError
1819from libpdf .log import logging_needed
1920from libpdf .models .figure import Figure
2021from libpdf .models .file import File
@@ -86,7 +87,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
8687 :param no_rects: flag triggering the exclusion of rects
8788 :param overall_pbar: total progress bar for whole libpdf run
8889 :return: instance of Objects class
89- :raise LibpdfException : PDF contains no pages
90+ :raise LibpdfError : PDF contains no pages
9091 """
9192 LOG .info ("PDF extraction started ..." )
9293
@@ -116,7 +117,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
116117 if len (pdf .pages ) == 0 :
117118 message = "Page range selection: no pages left in the PDF to analyze."
118119 LOG .critical (message )
119- raise LibpdfException (message )
120+ raise LibpdfError (message )
120121
121122 overall_pbar .update (5 )
122123 pdf = delete_page_ann (pdf )
@@ -131,7 +132,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
131132 pages_list = extract_page_metadata (pdf )
132133
133134 if not pages_list :
134- raise LibpdfException ("PDF contains no pages" )
135+ raise LibpdfError ("PDF contains no pages" )
135136
136137 overall_pbar .update (1 )
137138
@@ -530,15 +531,15 @@ def _get_datetime_format(date: str):
530531 if "CreationDate" in pdf .metadata :
531532 preprocessed_date = _time_preprocess (pdf .metadata ["CreationDate" ])
532533 time_format = _get_datetime_format (preprocessed_date )
533- file_meta_params .update (
534- { "creation_date" : datetime .strptime (preprocessed_date , time_format )}
535- )
534+ file_meta_params .update ({
535+ "creation_date" : datetime .strptime (preprocessed_date , time_format )
536+ } )
536537 if "ModDate" in pdf .metadata :
537538 preprocessed_date = _time_preprocess (pdf .metadata ["ModDate" ])
538539 time_format = _get_datetime_format (preprocessed_date )
539- file_meta_params .update (
540- { "modified_date" : datetime .strptime (preprocessed_date , time_format )}
541- )
540+ file_meta_params .update ({
541+ "modified_date" : datetime .strptime (preprocessed_date , time_format )
542+ } )
542543 if "Trapped" in pdf .metadata :
543544 file_meta_params .update ({"trapped" : pdf .metadata ["Trapped" ]})
544545
@@ -705,7 +706,7 @@ def extract_rects(
705706 )
706707
707708 LOG .info (
708- f"found rect at { rect_bbox } at page { idx_page + 1 } : color { non_stroking_color } "
709+ f"found rect at { rect_bbox } at page { idx_page + 1 } : color { non_stroking_color } "
709710 )
710711 lt_textbox = lt_textbox_crop (
711712 rect_bbox ,
@@ -722,7 +723,9 @@ def extract_rects(
722723 rect_list .append (rect )
723724
724725 else :
725- LOG .info (f"found no rects on page { idx_page + 1 } : { page_crop .objects .keys ()} " )
726+ LOG .info (
727+ f"found no rects on page { idx_page + 1 } : { page_crop .objects .keys ()} "
728+ )
726729
727730 # return figure_list
728731 return rect_list
0 commit comments