30
30
import eyecite
31
31
from enum import Enum
32
32
import sigfig
33
+ from typing import Any
33
34
import yaml
34
35
from .pdf_wrangling import (
35
36
get_existing_pdf_fields ,
@@ -884,7 +885,7 @@ def get_env_bool(name: str) -> bool:
884
885
DEBUG_LANGUAGE_DETECTION = get_env_bool ("DEBUG_LANGUAGE_DETECTION" )
885
886
DEBUG_LANGUAGE_DETECTION_PRINT_ALL = get_env_bool ("DEBUG_LANGUAGE_DETECTION_PRINT_ALL" )
886
887
# Supported values are: langdetect, langid, lingua
887
- LANGUAGE_DETECTION_PRIMARY_LIBRARY = (
888
+ LANGUAGE_DETECTION_PRIMARY_LIBRARY : str = (
888
889
os .getenv ("LANGUAGE_DETECTION_PRIMARY_LIBRARY" )
889
890
if os .getenv ("LANGUAGE_DETECTION_PRIMARY_LIBRARY" )
890
891
else "langdetect"
@@ -899,30 +900,30 @@ def get_env_bool(name: str) -> bool:
899
900
900
901
# Minimum lines to chunk together in a paragraph. The language detection will run when both this and the character
901
902
# minimums are met, or at the end of the text with whatever is leftover.
902
- LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES = (
903
+ LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES : int = (
903
904
int (os .getenv ("LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES" ))
904
905
if os .getenv ("LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES" )
905
906
else 3
906
907
)
907
908
# Minimum characters to be considered a paragraph. The language detection will run when both this and the line minimums
908
909
# are met, or at the end of the text with whatever is leftover.
909
- LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS = (
910
+ LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS : int = (
910
911
int (os .getenv ("LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS" ))
911
912
if os .getenv ("LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS" )
912
913
else 30
913
914
)
914
915
# Threshold percentage of non-English text before using the stripped text. This threshold avoids false positives.
915
916
# 1.0 = 100%
916
917
# 0.05 = 5%
917
- LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE = (
918
+ LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE : float = (
918
919
float (os .getenv ("LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE" ))
919
920
if os .getenv ("LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE" )
920
921
else 0.05
921
922
)
922
923
# Lingua-Py is the only one that requires specifying the language set beforehand, but seems the most accurate w/ this
923
924
# subset of languages. Initial language set was taken from the Venn diagram of common languages, the Mass Court
924
925
# Forms & CA Court Forms translation list, intersected with the 75 available languages in Lingua.
925
- LINGUA_LANGUAGES = [
926
+ LINGUA_LANGUAGES : List [ Any ] = [
926
927
Language .ENGLISH ,
927
928
Language .FRENCH ,
928
929
Language .GERMAN ,
@@ -952,13 +953,13 @@ def extract_english_only_text(original_text: str) -> Tuple[bool, int, float, str
952
953
"""
953
954
954
955
lines = original_text .split ("\n " )
955
- english_lines = []
956
+ english_lines : List [ str ] = []
956
957
chunk_size = LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES
957
958
min_len = LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS
958
959
any_skipped = False
959
960
skipped_count = 0
960
961
skipped_percentage = 0.0
961
- current_lines = []
962
+ current_lines : List [ str ] = []
962
963
for line in lines :
963
964
current_lines .append (line )
964
965
if len (current_lines ) >= chunk_size :
@@ -991,7 +992,9 @@ def extract_english_only_text(original_text: str) -> Tuple[bool, int, float, str
991
992
# Don't use the processed text if the minimum threshold is not met.
992
993
if skipped_percentage < LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE :
993
994
if DEBUG_LANGUAGE_DETECTION :
994
- print (f"\n Discarding skipped because min threshold was not met. percentage: { skipped_percentage } threshold: { LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE } " )
995
+ print (
996
+ f"\n Discarding skipped because min threshold was not met. percentage: { skipped_percentage } threshold: { LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE } "
997
+ )
995
998
return False , 0 , 0.0 , original_text
996
999
997
1000
return any_skipped , skipped_count , skipped_percentage , english_only_text
@@ -1014,9 +1017,7 @@ def detect_english_only_paragraph(paragraph: List[str]) -> bool:
1014
1017
if DEBUG_LANGUAGE_DETECTION :
1015
1018
if all_english and not DEBUG_LANGUAGE_DETECTION_PRINT_ALL :
1016
1019
return primary_is_english
1017
- print (
1018
- f"\n \n ===== Start Paragraph len: { len (paragraph )} "
1019
- )
1020
+ print (f"\n \n ===== Start Paragraph len: { len (paragraph )} " )
1020
1021
print (f"{ paragraph } " )
1021
1022
print (
1022
1023
f"===== End Paragraph primary: { primary_is_english } votes: { is_english } langdetect: { langdetect_confidence } langid: { langid_langs } lingua: { lingua_lang } "
0 commit comments