Skip to content

Commit 146aa3c

Browse files
authored
Merge branch 'main' into python-bigquery-reservation-migration
2 parents d6b7ad1 + 2b0fd48 commit 146aa3c

File tree

2 files changed

+177
-0
lines changed

2 files changed

+177
-0
lines changed

dlp/snippets/deid.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,137 @@ def write_data(data: types.storage.Value) -> str:
914914
# [END dlp_deidentify_date_shift]
915915

916916

917+
# [START dlp_deidentify_time_extract]
918+
import csv # noqa: F811, E402, I100
919+
from datetime import datetime # noqa: F811, E402, I100
920+
from typing import List # noqa: F811, E402
921+
922+
import google.cloud.dlp # noqa: F811, E402
923+
924+
925+
def deidentify_with_time_extract(
    project: str,
    date_fields: List[str],
    input_csv_file: str,
    output_csv_file: str,
) -> None:
    """Uses the Data Loss Prevention API to deidentify dates in a CSV file
    through time part extraction.

    Only the year of each matched date survives; the ``time_part_config``
    transformation discards all other date parts.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        date_fields: A list of (date) fields in CSV file to de-identify
            through time extraction. Example: ['birth_date', 'register_date'].
            Date values in format: mm/DD/YYYY are considered as part of this
            sample.
        input_csv_file: The path to the CSV file to deidentify. The first row
            of the file must specify column names, and all other rows must
            contain valid values.
        output_csv_file: The output file path to save the time extracted data.

    Raises:
        ValueError: If the input CSV file is empty (no header row).
    """

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the date field names to protobuf-compatible FieldId dicts.
    # Materialize a list (not a lazy ``map`` iterator) so the value is
    # reusable and safe to embed in the request.
    field_ids = [{"name": field} for field in date_fields] if date_fields else []

    # Read the whole CSV into memory; the first row must be the header.
    with open(input_csv_file) as csvfile:
        csv_lines = list(csv.reader(csvfile))

    if not csv_lines:
        raise ValueError(f"Input CSV file is empty: {input_csv_file}")

    # Helper for converting a single CSV cell to a Protobuf-compatible value.
    def map_data(value: str) -> dict:
        # Cells parseable as mm/DD/YYYY become DateValue dicts so the
        # time-part transformation can act on them; anything else is
        # passed through untouched as a plain string.
        try:
            date = datetime.strptime(value, "%m/%d/%Y")
            return {
                "date_value": {
                    "year": date.year, "month": date.month, "day": date.day
                }
            }
        except ValueError:
            return {"string_value": value}

    # Convert CSV rows to protobuf-compatible dictionaries.
    csv_headers = [{"name": header} for header in csv_lines[0]]
    csv_rows = [
        {"values": [map_data(value) for value in row]} for row in csv_lines[1:]
    ]

    # Construct the `item` (a table) to de-identify.
    item = {"table": {"headers": csv_headers, "rows": csv_rows}}

    # Construct deidentify configuration dictionary: extract only the YEAR
    # from every value in the requested date fields.
    deidentify_config = {
        "record_transformations": {
            "field_transformations": [
                {
                    "primitive_transformation": {
                        "time_part_config": {
                            "part_to_extract": "YEAR"
                        }
                    },
                    "fields": field_ids,
                }
            ]
        }
    }

    # Write-to-CSV helpers for the de-identified response table.
    def write_header(header) -> str:
        return header.name

    def write_data(data) -> str:
        # Date cells come back as DateValue protos; string cells pass through.
        return data.string_value or "{}/{}/{}".format(
            data.date_value.month,
            data.date_value.day,
            data.date_value.year,
        )

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Call the API.
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "item": item,
        }
    )

    # Print the result.
    print("Table after de-identification: {}".format(response.item.table))

    # Write results to CSV file.
    with open(output_csv_file, "w") as csvfile:
        write_file = csv.writer(csvfile, delimiter=",")
        write_file.writerow(map(write_header, response.item.table.headers))
        for row in response.item.table.rows:
            write_file.writerow(map(write_data, row.values))

    # Print status.
    print(f"Successfully saved date-extracted output to {output_csv_file}")
1043+
1044+
1045+
# [END dlp_deidentify_time_extract]
1046+
1047+
9171048
# [START dlp_deidentify_replace_infotype]
9181049
from typing import List # noqa: F811, E402, I100
9191050

@@ -2124,6 +2255,30 @@ def deidentify_table_with_multiple_crypto_hash(
21242255
"key_name.",
21252256
)
21262257

2258+
    # Sub-command wiring for `deidentify_with_time_extract`: de-identifies
    # dates in a CSV file by extracting a single date part (the year).
    time_extract_parser = subparsers.add_parser(
        "deid_time_extract",
        help="Deidentify dates in a CSV file by extracting a date part.",
    )
    time_extract_parser.add_argument(
        "project",
        help="The Google Cloud project id to use as a parent resource.",
    )
    time_extract_parser.add_argument(
        "input_csv_file",
        help="The path to the CSV file to deidentify. The first row of the "
        "file must specify column names, and all other rows must contain "
        "valid values.",
    )
    time_extract_parser.add_argument(
        "date_fields",
        # One or more field names are accepted on the command line.
        nargs="+",
        help="The list of date fields in the CSV file to de-identify. Example: "
        "['birth_date', 'register_date']",
    )
    time_extract_parser.add_argument(
        "output_csv_file", help="The path to save the time-extracted data."
    )
2281+
21272282
replace_with_infotype_parser = subparsers.add_parser(
21282283
"replace_with_infotype",
21292284
help="Deidentify sensitive data in a string by replacing it with the "
@@ -2485,6 +2640,13 @@ def deidentify_table_with_multiple_crypto_hash(
24852640
wrapped_key=args.wrapped_key,
24862641
key_name=args.key_name,
24872642
)
2643+
    # Dispatch for the `deid_time_extract` sub-command (see parser above).
    elif args.content == "deid_time_extract":
        deidentify_with_time_extract(
            args.project,
            date_fields=args.date_fields,
            input_csv_file=args.input_csv_file,
            output_csv_file=args.output_csv_file,
        )
24882650
elif args.content == "replace_with_infotype":
24892651
deidentify_with_replace_infotype(
24902652
args.project,

dlp/snippets/deid_test.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,21 @@ def test_deidentify_with_date_shift_using_context_field(
230230
assert "Successful" in out
231231

232232

233+
def test_deidentify_with_time_extract(tempdir: TextIO, capsys: pytest.CaptureFixture) -> None:
    """Run time-extract de-identification end to end and check it reports success."""
    destination = os.path.join(str(tempdir), "year-extracted.csv")

    deid.deidentify_with_time_extract(
        GCLOUD_PROJECT,
        date_fields=DATE_FIELDS,
        input_csv_file=CSV_FILE,
        output_csv_file=destination,
    )

    captured, _ = capsys.readouterr()

    assert "Successful" in captured
246+
247+
233248
def test_reidentify_with_fpe(capsys: pytest.CaptureFixture) -> None:
234249
labeled_fpe_string = "My SSN is SSN_TOKEN(9):731997681"
235250

0 commit comments

Comments
 (0)