Skip to content

Commit 82ee2b8

Browse files
committed
Add some types for mypy.
1 parent 2cee5b2 commit 82ee2b8

File tree

1 file changed

+12
-11
lines changed

1 file changed

+12
-11
lines changed

formfyxer/lit_explorer.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import eyecite
3131
from enum import Enum
3232
import sigfig
33+
from typing import Any
3334
import yaml
3435
from .pdf_wrangling import (
3536
get_existing_pdf_fields,
@@ -884,7 +885,7 @@ def get_env_bool(name: str) -> bool:
884885
DEBUG_LANGUAGE_DETECTION = get_env_bool("DEBUG_LANGUAGE_DETECTION")
885886
DEBUG_LANGUAGE_DETECTION_PRINT_ALL = get_env_bool("DEBUG_LANGUAGE_DETECTION_PRINT_ALL")
886887
# Supported values are: langdetect, langid, lingua
887-
LANGUAGE_DETECTION_PRIMARY_LIBRARY = (
888+
LANGUAGE_DETECTION_PRIMARY_LIBRARY: str = (
888889
os.getenv("LANGUAGE_DETECTION_PRIMARY_LIBRARY")
889890
if os.getenv("LANGUAGE_DETECTION_PRIMARY_LIBRARY")
890891
else "langdetect"
@@ -899,30 +900,30 @@ def get_env_bool(name: str) -> bool:
899900

900901
# Minimum lines to chunk together in a paragraph. The language detection will run when both this and the character
901902
# minimums are met, or at the end of the text with whatever is leftover.
902-
LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES = (
903+
LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES: int = (
903904
int(os.getenv("LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES"))
904905
if os.getenv("LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES")
905906
else 3
906907
)
907908
# Minimum characters to be considered a paragraph. The language detection will run when both this and the line minimums
908909
# are met, or at the end of the text with whatever is leftover.
909-
LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS = (
910+
LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS: int = (
910911
int(os.getenv("LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS"))
911912
if os.getenv("LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS")
912913
else 30
913914
)
914915
# Threshold percentage of non-English text before using the stripped text. This threshold avoids false positives.
915916
# 1.0 = 100%
916917
# 0.05 = 5%
917-
LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE = (
918+
LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE: float = (
918919
float(os.getenv("LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE"))
919920
if os.getenv("LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE")
920921
else 0.05
921922
)
922923
# Lingua-Py is the only one that requires specifying the language set beforehand, but seems the most accurate w/ this
923924
# subset of languages. Initial language set was taken from the Venn diagram of common languages, the Mass Court
924925
# Forms & CA Court Forms translation list, intersected with the 75 available languages in Lingua.
925-
LINGUA_LANGUAGES = [
926+
LINGUA_LANGUAGES: List[Any] = [
926927
Language.ENGLISH,
927928
Language.FRENCH,
928929
Language.GERMAN,
@@ -952,13 +953,13 @@ def extract_english_only_text(original_text: str) -> Tuple[bool, int, float, str
952953
"""
953954

954955
lines = original_text.split("\n")
955-
english_lines = []
956+
english_lines:List[str] = []
956957
chunk_size = LANGUAGE_DETECTION_PARAGRAPH_MIN_LINES
957958
min_len = LANGUAGE_DETECTION_PARAGRAPH_MIN_CHARS
958959
any_skipped = False
959960
skipped_count = 0
960961
skipped_percentage = 0.0
961-
current_lines = []
962+
current_lines:List[str] = []
962963
for line in lines:
963964
current_lines.append(line)
964965
if len(current_lines) >= chunk_size:
@@ -991,7 +992,9 @@ def extract_english_only_text(original_text: str) -> Tuple[bool, int, float, str
991992
# Don't use the processed text if the minimum threshold is not met.
992993
if skipped_percentage < LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE:
993994
if DEBUG_LANGUAGE_DETECTION:
994-
print(f"\nDiscarding skipped because min threshold was not met. percentage: {skipped_percentage} threshold: {LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE}")
995+
print(
996+
f"\nDiscarding skipped because min threshold was not met. percentage: {skipped_percentage} threshold: {LANGUAGE_DETECTION_THRESHOLD_PERCENTAGE}"
997+
)
995998
return False, 0, 0.0, original_text
996999

9971000
return any_skipped, skipped_count, skipped_percentage, english_only_text
@@ -1014,9 +1017,7 @@ def detect_english_only_paragraph(paragraph: List[str]) -> bool:
10141017
if DEBUG_LANGUAGE_DETECTION:
10151018
if all_english and not DEBUG_LANGUAGE_DETECTION_PRINT_ALL:
10161019
return primary_is_english
1017-
print(
1018-
f"\n\n===== Start Paragraph len: {len(paragraph)}"
1019-
)
1020+
print(f"\n\n===== Start Paragraph len: {len(paragraph)}")
10201021
print(f"{paragraph}")
10211022
print(
10221023
f"===== End Paragraph primary: {primary_is_english} votes: {is_english} langdetect: {langdetect_confidence} langid: {langid_langs} lingua: {lingua_lang}"

0 commit comments

Comments
 (0)