Skip to content

Commit 2b508e6

Browse files
authored
Add code sample for string replacement based deidentification. (#3956)
Adds a code sample demonstrating replacement-based deidentification with the Cloud DLP API: each detected sensitive value is replaced with a specified surrogate string.
1 parent 66825c2 commit 2b508e6

File tree

2 files changed

+113
-9
lines changed

2 files changed

+113
-9
lines changed

dlp/deid.py

Lines changed: 103 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@
2121

2222
# [START dlp_deidentify_masking]
2323
def deidentify_with_mask(
24-
project, string, info_types, masking_character=None, number_to_mask=0
24+
project, input_str, info_types, masking_character=None, number_to_mask=0
2525
):
2626
"""Uses the Data Loss Prevention API to deidentify sensitive data in a
2727
string by masking it with a character.
2828
Args:
2929
project: The Google Cloud project id to use as a parent resource.
30-
item: The string to deidentify (will be treated as text).
30+
input_str: The string to deidentify (will be treated as text).
3131
masking_character: The character to mask matching sensitive data with.
3232
number_to_mask: The maximum number of sensitive characters to mask in
3333
a match. If omitted or set to zero, the API will default to no
@@ -67,7 +67,7 @@ def deidentify_with_mask(
6767
}
6868

6969
# Construct item
70-
item = {"value": string}
70+
item = {"value": input_str}
7171

7272
# Call the API
7373
response = dlp.deidentify_content(
@@ -83,11 +83,76 @@ def deidentify_with_mask(
8383

8484
# [END dlp_deidentify_masking]
8585

86+
# [START dlp_deidentify_replace]
87+
def deidentify_with_replace(
    project,
    input_str,
    info_types,
    replacement_str="REPLACEMENT_STR",
):
    """Deidentify a string by swapping each matched value for a fixed string.

    Sends ``input_str`` to the Data Loss Prevention API together with an
    inspect configuration built from ``info_types`` and a deidentify
    configuration that uses a ``replace_config`` primitive transformation.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
        replacement_str: The string to replace all values that match given
            info types.

    Returns:
        None; the deidentified text from the API response is printed to the
        terminal.
    """
    import google.cloud.dlp

    # Create the DLP service client.
    client = google.cloud.dlp_v2.DlpServiceClient()

    # The API call takes the full resource path, not the bare project id.
    parent = client.project_path(project)

    # Inspection settings: one {"name": ...} entry per requested info type.
    wanted_types = []
    for type_name in info_types:
        wanted_types.append({"name": type_name})
    inspect_config = {"info_types": wanted_types}

    # Deidentification settings, assembled from the innermost piece outwards.
    replace_config = {"new_value": {"string_value": replacement_str}}
    transformation = {
        "primitive_transformation": {"replace_config": replace_config}
    }
    deidentify_config = {
        "info_type_transformations": {"transformations": [transformation]}
    }

    # Wrap the raw text in a content item and send the request.
    response = client.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item={"value": input_str},
    )

    # Show the transformed text.
    print(response.item.value)
147+
148+
# [END dlp_deidentify_replace]
86149

87150
# [START dlp_deidentify_fpe]
151+
152+
88153
def deidentify_with_fpe(
89154
project,
90-
string,
155+
input_str,
91156
info_types,
92157
alphabet=None,
93158
surrogate_type=None,
@@ -98,7 +163,7 @@ def deidentify_with_fpe(
98163
string using Format Preserving Encryption (FPE).
99164
Args:
100165
project: The Google Cloud project id to use as a parent resource.
101-
item: The string to deidentify (will be treated as text).
166+
input_str: The string to deidentify (will be treated as text).
102167
alphabet: The set of characters to replace sensitive ones with. For
103168
more information, see https://cloud.google.com/dlp/docs/reference/
104169
rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
@@ -166,7 +231,7 @@ def deidentify_with_fpe(
166231
}
167232

168233
# Convert string to item
169-
item = {"value": string}
234+
item = {"value": input_str}
170235

171236
# Call the API
172237
response = dlp.deidentify_content(
@@ -186,7 +251,7 @@ def deidentify_with_fpe(
186251
# [START dlp_reidentify_fpe]
187252
def reidentify_with_fpe(
188253
project,
189-
string,
254+
input_str,
190255
alphabet=None,
191256
surrogate_type=None,
192257
key_name=None,
@@ -196,7 +261,7 @@ def reidentify_with_fpe(
196261
string that was encrypted by Format Preserving Encryption (FPE).
197262
Args:
198263
project: The Google Cloud project id to use as a parent resource.
199-
item: The string to deidentify (will be treated as text).
264+
input_str: The string to deidentify (will be treated as text).
200265
alphabet: The set of characters to replace sensitive ones with. For
201266
more information, see https://cloud.google.com/dlp/docs/reference/
202267
rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
@@ -255,7 +320,7 @@ def reidentify_with_fpe(
255320
}
256321

257322
# Convert string to item
258-
item = {"value": string}
323+
item = {"value": input_str}
259324

260325
# Call the API
261326
response = dlp.reidentify_content(
@@ -531,6 +596,28 @@ def redact_sensitive_data(project, item, info_types):
531596
help="The character to mask matching sensitive data with.",
532597
)
533598

599+
replace_parser = subparsers.add_parser(
600+
"deid_replace",
601+
help="Deidentify sensitive data in a string by replacing it with "
602+
"another string.",
603+
)
604+
replace_parser.add_argument(
605+
"--info_types",
606+
nargs="+",
607+
help="Strings representing info types to look for. A full list of "
608+
"info categories and types is available from the API. Examples "
609+
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
610+
"If unspecified, the three above examples will be used.",
611+
default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
612+
)
613+
replace_parser.add_argument(
614+
"project",
615+
help="The Google Cloud project id to use as a parent resource.",
616+
)
617+
replace_parser.add_argument("item", help="The string to deidentify.")
618+
replace_parser.add_argument("replacement_str", help="The string to "
619+
"replace all matched values with.")
620+
534621
fpe_parser = subparsers.add_parser(
535622
"deid_fpe",
536623
help="Deidentify sensitive data in a string using Format Preserving "
@@ -715,6 +802,13 @@ def redact_sensitive_data(project, item, info_types):
715802
masking_character=args.masking_character,
716803
number_to_mask=args.number_to_mask,
717804
)
805+
elif args.content == "deid_replace":
806+
deidentify_with_replace(
807+
args.project,
808+
args.item,
809+
args.info_types,
810+
replacement_str=args.replacement_str,
811+
)
718812
elif args.content == "deid_fpe":
719813
deidentify_with_fpe(
720814
args.project,

dlp/deid_test.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,16 @@ def test_deidentify_with_mask_masking_number_specified(capsys):
8888
assert "My SSN is *******27" in out
8989

9090

91+
def test_deidentify_with_replace(capsys):
    """Check that a detected SSN is replaced by the given surrogate string."""
    deid.deidentify_with_replace(
        GCLOUD_PROJECT,
        HARMFUL_STRING,
        ["US_SOCIAL_SECURITY_NUMBER"],
        replacement_str="REPLACEMENT_STR",
    )

    # The sample prints its result, so inspect the captured stdout.
    captured = capsys.readouterr()
    assert "My SSN is REPLACEMENT_STR" in captured.out
99+
100+
91101
def test_deidentify_with_fpe(capsys):
92102
deid.deidentify_with_fpe(
93103
GCLOUD_PROJECT,

0 commit comments

Comments
 (0)