@@ -914,6 +914,137 @@ def write_data(data: types.storage.Value) -> str:
914
914
# [END dlp_deidentify_date_shift]
915
915
916
916
917
+ # [START dlp_deidentify_time_extract]
918
+ import csv # noqa: F811, E402, I100
919
+ from datetime import datetime # noqa: F811, E402, I100
920
+ from typing import List # noqa: F811, E402
921
+
922
+ import google .cloud .dlp # noqa: F811, E402
923
+
924
+
925
def deidentify_with_time_extract(
    project: str,
    date_fields: List[str],
    input_csv_file: str,
    output_csv_file: str,
) -> None:
    """Uses the Data Loss Prevention API to deidentify dates in a CSV file through
    time part extraction.

    The input CSV is read into a table, the listed date columns are sent
    through a `time_part_config` transformation that extracts the YEAR part,
    and the transformed table is printed and written to `output_csv_file`.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        date_fields: A list of (date) fields in CSV file to de-identify
            through time extraction. Example: ['birth_date', 'register_date'].
            Date values in format: mm/DD/YYYY are considered as part of this
            sample.
        input_csv_file: The path to the CSV file to deidentify. The first row
            of the file must specify column names, and all other rows must
            contain valid values.
        output_csv_file: The output file path to save the time extracted data.

    Raises:
        ValueError: If `input_csv_file` contains no rows at all (and therefore
            no header row).
    """

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the date field names to field-id dictionaries. A concrete list
    # is used (rather than a lazy `map`) so the value can be iterated more
    # than once, e.g. when debugging the request.
    field_ids = [{"name": field} for field in date_fields or []]

    # The csv module requires files to be opened with newline="" so that
    # newlines embedded in quoted fields are interpreted correctly.
    with open(input_csv_file, newline="") as csvfile:
        csv_lines = list(csv.reader(csvfile))

    # Fail early, before calling the API, when there is no header row.
    if not csv_lines:
        raise ValueError(
            f"Input CSV file {input_csv_file!r} is empty; a header row is required."
        )

    def map_data(value):
        """Converts one CSV cell to a Value dictionary.

        Cells that parse as mm/DD/YYYY become `date_value`; everything else
        is passed through unchanged as `string_value`.
        """
        try:
            date = datetime.strptime(value, "%m/%d/%Y")
        except ValueError:
            return {"string_value": value}
        return {
            "date_value": {
                "year": date.year,
                "month": date.month,
                "day": date.day,
            }
        }

    # Convert CSV rows to protobuf-compatible dictionaries.
    csv_headers = [{"name": header} for header in csv_lines[0]]
    csv_rows = [
        {"values": [map_data(value) for value in row]} for row in csv_lines[1:]
    ]

    # Construct the table dictionary.
    table = {"headers": csv_headers, "rows": csv_rows}

    # Construct the `item` for table to de-identify.
    item = {"table": table}

    # Construct deidentify configuration dictionary.
    deidentify_config = {
        "record_transformations": {
            "field_transformations": [
                {
                    "primitive_transformation": {
                        "time_part_config": {"part_to_extract": "YEAR"}
                    },
                    "fields": field_ids,
                }
            ]
        }
    }

    # Write to CSV helper methods.
    def write_header(header):
        return header.name

    def write_data(data):
        """Renders one returned Value as the text for a CSV cell."""
        if data.string_value:
            return data.string_value
        # NOTE(review): the extracted time part may come back as an
        # `integer_value` rather than a string or date - confirm against the
        # API response. Without this branch such a cell is written as "0/0/0".
        if data.integer_value:
            return str(data.integer_value)
        return "{}/{}/{}".format(
            data.date_value.month,
            data.date_value.day,
            data.date_value.year,
        )

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Call the API
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "item": item,
        }
    )

    # Print the result.
    print("Table after de-identification: {}".format(response.item.table))

    # Write results to CSV file. newline="" prevents the csv writer's own
    # "\r\n" terminator from being translated again on Windows, which would
    # otherwise produce a blank line after every row.
    with open(output_csv_file, "w", newline="") as csvfile:
        write_file = csv.writer(csvfile, delimiter=",")
        write_file.writerow(
            [write_header(header) for header in response.item.table.headers]
        )
        for row in response.item.table.rows:
            write_file.writerow([write_data(value) for value in row.values])

    # Print status.
    print(f"Successfully saved date-extracted output to {output_csv_file}")
1043
+
1044
+
1045
+ # [END dlp_deidentify_time_extract]
1046
+
1047
+
917
1048
# [START dlp_deidentify_replace_infotype]
918
1049
from typing import List # noqa: F811, E402, I100
919
1050
@@ -2124,6 +2255,30 @@ def deidentify_table_with_multiple_crypto_hash(
2124
2255
"key_name." ,
2125
2256
)
2126
2257
2258
+ time_extract_parser = subparsers .add_parser (
2259
+ "deid_time_extract" ,
2260
+ help = "Deidentify dates in a CSV file by extracting a date part." ,
2261
+ )
2262
+ time_extract_parser .add_argument (
2263
+ "project" ,
2264
+ help = "The Google Cloud project id to use as a parent resource." ,
2265
+ )
2266
+ time_extract_parser .add_argument (
2267
+ "input_csv_file" ,
2268
+ help = "The path to the CSV file to deidentify. The first row of the "
2269
+ "file must specify column names, and all other rows must contain "
2270
+ "valid values." ,
2271
+ )
2272
+ time_extract_parser .add_argument (
2273
+ "date_fields" ,
2274
+ nargs = "+" ,
2275
+ help = "The list of date fields in the CSV file to de-identify. Example: "
2276
+ "['birth_date', 'register_date']" ,
2277
+ )
2278
+ time_extract_parser .add_argument (
2279
+ "output_csv_file" , help = "The path to save the time-extracted data."
2280
+ )
2281
+
2127
2282
replace_with_infotype_parser = subparsers .add_parser (
2128
2283
"replace_with_infotype" ,
2129
2284
help = "Deidentify sensitive data in a string by replacing it with the "
@@ -2485,6 +2640,13 @@ def deidentify_table_with_multiple_crypto_hash(
2485
2640
wrapped_key = args .wrapped_key ,
2486
2641
key_name = args .key_name ,
2487
2642
)
2643
+ elif args .content == "deid_time_extract" :
2644
+ deidentify_with_time_extract (
2645
+ args .project ,
2646
+ date_fields = args .date_fields ,
2647
+ input_csv_file = args .input_csv_file ,
2648
+ output_csv_file = args .output_csv_file ,
2649
+ )
2488
2650
elif args .content == "replace_with_infotype" :
2489
2651
deidentify_with_replace_infotype (
2490
2652
args .project ,
0 commit comments