@@ -302,37 +302,16 @@ def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection:
302302
303303
304304@pytest .mark .parametrize (
305- "verbose, expected " ,
305+ "verbose" ,
306306 [
307- (
308- True ,
309- pd .Series (
310- [
311- {"status" : "File has not been decrypted" , "content" : "" },
312- {
313- "status" : "" ,
314- "content" : "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. " ,
315- },
316- ]
317- ),
318- ),
319- (
320- False ,
321- pd .Series (
322- [
323- "" ,
324- "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. " ,
325- ],
326- name = "pdf" ,
327- ),
328- ),
307+ (True ),
308+ (False ),
329309 ],
330310)
331311def test_blob_pdf_extract (
332312 pdf_mm_df : bpd .DataFrame ,
333313 verbose : bool ,
334314 bq_connection : str ,
335- expected : pd .Series ,
336315):
337316 actual = (
338317 pdf_mm_df ["pdf" ]
@@ -341,49 +320,44 @@ def test_blob_pdf_extract(
341320 .to_pandas ()
342321 )
343322
344- pd .testing .assert_series_equal (
345- actual ,
346- expected ,
347- check_dtype = False ,
348- check_index = False ,
323+ # check relative length
324+ expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
325+ expected_len = len (expected_text )
326+
327+ actual_text = ""
328+ if verbose :
329+ # The first entry is for an encrypted PDF that could not be extracted (non-empty status), so we keep only results whose status is empty
330+ successful_results = actual [actual .apply (lambda x : x ["status" ] == "" )]
331+ actual_text = successful_results .apply (lambda x : x ["content" ]).iloc [0 ]
332+ else :
333+ actual_text = actual [actual != "" ].iloc [0 ]
334+ actual_len = len (actual_text )
335+
336+ relative_length_tolerance = 0.25
337+ min_acceptable_len = expected_len * (1 - relative_length_tolerance )
338+ max_acceptable_len = expected_len * (1 + relative_length_tolerance )
339+ assert min_acceptable_len <= actual_len <= max_acceptable_len , (
340+ f"Item (verbose={ verbose } ): Extracted text length { actual_len } is outside the acceptable range "
341+ f"[{ min_acceptable_len :.0f} , { max_acceptable_len :.0f} ]. "
342+ f"Expected reference length was { expected_len } . "
349343 )
350344
345+ # check for major keywords
346+ major_keywords = ["Sample" , "PDF" , "testing" , "dummy" , "messages" ]
347+ for keyword in major_keywords :
348+ assert (
349+ keyword .lower () in actual_text .lower ()
350+ ), f"Item (verbose={ verbose } ): Expected keyword '{ keyword } ' not found in extracted text. "
351+
351352
352353@pytest .mark .parametrize (
353- "verbose, expected " ,
354+ "verbose" ,
354355 [
355- (
356- True ,
357- pd .Series (
358- [
359- {"status" : "File has not been decrypted" , "content" : []},
360- {
361- "status" : "" ,
362- "content" : [
363- "Sample PDF This is a testing file. Some " ,
364- "dummy messages are used for testing " ,
365- "purposes. " ,
366- ],
367- },
368- ]
369- ),
370- ),
371- (
372- False ,
373- pd .Series (
374- [
375- pd .NA ,
376- "Sample PDF This is a testing file. Some " ,
377- "dummy messages are used for testing " ,
378- "purposes. " ,
379- ],
380- ),
381- ),
356+ (True ),
357+ (False ),
382358 ],
383359)
384- def test_blob_pdf_chunk (
385- pdf_mm_df : bpd .DataFrame , verbose : bool , bq_connection : str , expected : pd .Series
386- ):
360+ def test_blob_pdf_chunk (pdf_mm_df : bpd .DataFrame , verbose : bool , bq_connection : str ):
387361 actual = (
388362 pdf_mm_df ["pdf" ]
389363 .blob .pdf_chunk (
@@ -397,13 +371,36 @@ def test_blob_pdf_chunk(
397371 .to_pandas ()
398372 )
399373
400- pd .testing .assert_series_equal (
401- actual ,
402- expected ,
403- check_dtype = False ,
404- check_index = False ,
374+ # check relative length
375+ expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
376+ expected_len = len (expected_text )
377+
378+ actual_text = ""
379+ if verbose :
380+ # The first entry is for an encrypted PDF that could not be chunked (non-empty status), so we keep only results whose status is empty
381+ successful_results = actual [actual .apply (lambda x : x ["status" ] == "" )]
382+ actual_text = "" .join (successful_results .apply (lambda x : x ["content" ]).iloc [0 ])
383+ else :
384+ # First entry is NA
385+ actual_text = "" .join (actual .dropna ())
386+ actual_len = len (actual_text )
387+
388+ relative_length_tolerance = 0.25
389+ min_acceptable_len = expected_len * (1 - relative_length_tolerance )
390+ max_acceptable_len = expected_len * (1 + relative_length_tolerance )
391+ assert min_acceptable_len <= actual_len <= max_acceptable_len , (
392+ f"Item (verbose={ verbose } ): Extracted text length { actual_len } is outside the acceptable range "
393+ f"[{ min_acceptable_len :.0f} , { max_acceptable_len :.0f} ]. "
394+ f"Expected reference length was { expected_len } . "
405395 )
406396
397+ # check for major keywords
398+ major_keywords = ["Sample" , "PDF" , "testing" , "dummy" , "messages" ]
399+ for keyword in major_keywords :
400+ assert (
401+ keyword .lower () in actual_text .lower ()
402+ ), f"Item (verbose={ verbose } ): Expected keyword '{ keyword } ' not found in extracted text. "
403+
407404
408405@pytest .mark .parametrize (
409406 "model_name, verbose" ,
0 commit comments