-
Notifications
You must be signed in to change notification settings - Fork 6.5k
Add code sample for string replacement based deidentification. #3956
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4b0d97e
debe98e
8c033c1
b0770b1
998af67
ca1eea7
bb4b2c4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,13 +21,13 @@ | |
|
||
# [START dlp_deidentify_masking] | ||
def deidentify_with_mask( | ||
project, string, info_types, masking_character=None, number_to_mask=0 | ||
project, input_str, info_types, masking_character=None, number_to_mask=0 | ||
): | ||
"""Uses the Data Loss Prevention API to deidentify sensitive data in a | ||
string by masking it with a character. | ||
Args: | ||
project: The Google Cloud project id to use as a parent resource. | ||
item: The string to deidentify (will be treated as text). | ||
input_str: The string to deidentify (will be treated as text). | ||
masking_character: The character to mask matching sensitive data with. | ||
number_to_mask: The maximum number of sensitive characters to mask in | ||
a match. If omitted or set to zero, the API will default to no | ||
|
@@ -67,7 +67,7 @@ def deidentify_with_mask( | |
} | ||
|
||
# Construct item | ||
item = {"value": string} | ||
item = {"value": input_str} | ||
|
||
# Call the API | ||
response = dlp.deidentify_content( | ||
|
@@ -83,11 +83,76 @@ def deidentify_with_mask( | |
|
||
# [END dlp_deidentify_masking] | ||
|
||
# [START dlp_deidentify_replace] | ||
def deidentify_with_replace( | ||
project, | ||
input_str, | ||
info_types, | ||
replacement_str="REPLACEMENT_STR", | ||
): | ||
"""Uses the Data Loss Prevention API to deidentify sensitive data in a | ||
string by replacing matched input values with a value you specify. | ||
Args: | ||
project: The Google Cloud project id to use as a parent resource. | ||
input_str: The string to deidentify (will be treated as text). | ||
info_types: A list of strings representing info types to look for. | ||
replacement_str: The string to replace all values that match given | ||
info types. | ||
Returns: | ||
None; the response from the API is printed to the terminal. | ||
""" | ||
import google.cloud.dlp | ||
|
||
# Instantiate a client | ||
dlp = google.cloud.dlp_v2.DlpServiceClient() | ||
|
||
# Convert the project id into a full resource id. | ||
parent = dlp.project_path(project) | ||
|
||
# Construct inspect configuration dictionary | ||
inspect_config = { | ||
"info_types": [{"name": info_type} for info_type in info_types] | ||
} | ||
|
||
# Construct deidentify configuration dictionary | ||
deidentify_config = { | ||
"info_type_transformations": { | ||
"transformations": [ | ||
{ | ||
"primitive_transformation": { | ||
"replace_config": { | ||
"new_value": { | ||
"string_value": replacement_str, | ||
} | ||
} | ||
} | ||
} | ||
] | ||
} | ||
} | ||
Comment on lines
+118
to
+132
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's hard for me to wrap my head around this because of how deeply this is nested. Could we perhaps use two dictionaries? transformation = {
"primitive_transformation": {
"replace_config": {
"new_value": {
"string_value": replacement_str,
}
}
}
}
deidentify_config = {
"info_type_transformations": {
"transformations": [transformation]
}
} There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the review. I was trying to follow surrounding code for consistency. Do you feel strongly about the nesting? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nope, I didn't notice the existing function. I'm fine with keeping this as is. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 to breaking this out - I think there is significantly more nesting going on here than in the other function that it feels necessary |
||
|
||
# Construct item | ||
item = {"value": input_str} | ||
|
||
# Call the API | ||
response = dlp.deidentify_content( | ||
parent, | ||
inspect_config=inspect_config, | ||
deidentify_config=deidentify_config, | ||
item=item, | ||
) | ||
|
||
# Print out the results. | ||
print(response.item.value) | ||
|
||
# [END dlp_deidentify_replace] | ||
|
||
# [START dlp_deidentify_fpe] | ||
|
||
|
||
def deidentify_with_fpe( | ||
project, | ||
string, | ||
input_str, | ||
info_types, | ||
alphabet=None, | ||
surrogate_type=None, | ||
|
@@ -98,7 +163,7 @@ def deidentify_with_fpe( | |
string using Format Preserving Encryption (FPE). | ||
Args: | ||
project: The Google Cloud project id to use as a parent resource. | ||
item: The string to deidentify (will be treated as text). | ||
input_str: The string to deidentify (will be treated as text). | ||
alphabet: The set of characters to replace sensitive ones with. For | ||
more information, see https://cloud.google.com/dlp/docs/reference/ | ||
rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet | ||
|
@@ -166,7 +231,7 @@ def deidentify_with_fpe( | |
} | ||
|
||
# Convert string to item | ||
item = {"value": string} | ||
item = {"value": input_str} | ||
|
||
# Call the API | ||
response = dlp.deidentify_content( | ||
|
@@ -186,7 +251,7 @@ def deidentify_with_fpe( | |
# [START dlp_reidentify_fpe] | ||
def reidentify_with_fpe( | ||
project, | ||
string, | ||
input_str, | ||
alphabet=None, | ||
surrogate_type=None, | ||
key_name=None, | ||
|
@@ -196,7 +261,7 @@ def reidentify_with_fpe( | |
string that was encrypted by Format Preserving Encryption (FPE). | ||
Args: | ||
project: The Google Cloud project id to use as a parent resource. | ||
item: The string to deidentify (will be treated as text). | ||
input_str: The string to deidentify (will be treated as text). | ||
alphabet: The set of characters to replace sensitive ones with. For | ||
more information, see https://cloud.google.com/dlp/docs/reference/ | ||
rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet | ||
|
@@ -255,7 +320,7 @@ def reidentify_with_fpe( | |
} | ||
|
||
# Convert string to item | ||
item = {"value": string} | ||
item = {"value": input_str} | ||
|
||
# Call the API | ||
response = dlp.reidentify_content( | ||
|
@@ -531,6 +596,28 @@ def redact_sensitive_data(project, item, info_types): | |
help="The character to mask matching sensitive data with.", | ||
) | ||
|
||
replace_parser = subparsers.add_parser( | ||
"deid_replace", | ||
help="Deidentify sensitive data in a string by replacing it with " | ||
"another string.", | ||
) | ||
replace_parser.add_argument( | ||
"--info_types", | ||
nargs="+", | ||
help="Strings representing info types to look for. A full list of " | ||
"info categories and types is available from the API. Examples " | ||
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' | ||
"If unspecified, the three above examples will be used.", | ||
default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], | ||
) | ||
replace_parser.add_argument( | ||
"project", | ||
help="The Google Cloud project id to use as a parent resource.", | ||
) | ||
replace_parser.add_argument("item", help="The string to deidentify.") | ||
replace_parser.add_argument("replacement_str", help="The string to " | ||
"replace all matched values with.") | ||
|
||
fpe_parser = subparsers.add_parser( | ||
"deid_fpe", | ||
help="Deidentify sensitive data in a string using Format Preserving " | ||
|
@@ -715,6 +802,13 @@ def redact_sensitive_data(project, item, info_types): | |
masking_character=args.masking_character, | ||
number_to_mask=args.number_to_mask, | ||
) | ||
elif args.content == "deid_replace": | ||
deidentify_with_replace( | ||
args.project, | ||
args.item, | ||
args.info_types, | ||
replacement_str=args.replacement_str, | ||
) | ||
elif args.content == "deid_fpe": | ||
deidentify_with_fpe( | ||
args.project, | ||
|
Uh oh!
There was an error while loading. Please reload this page.