Skip to content

update redact_image, quickstart samples #1399

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 16 additions & 10 deletions dlp/quickstart.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,22 @@ def quickstart():

# [START quickstart]
# Import the client library
import google.cloud.dlp_v2beta1
import google.cloud.dlp

# Edit this with your Google Cloud Project ID.
project = 'your-project'

# Instantiate a client.
dlp = google.cloud.dlp_v2beta1.DlpServiceClient()
dlp = google.cloud.dlp.DlpServiceClient()

# The string to inspect
content = 'Robert Frost'

# Construct the list of content items to inspect; in this case, only one.
items = [{'type': 'text/plain', 'value': content}]
# Construct the item to inspect.
item = {'value': content}

# The info types to search for in the content.
info_types = [{'name': 'US_MALE_NAME'}, {'name': 'US_FEMALE_NAME'}]
# The info types to search for in the content. Required.
info_types = [{'name': 'FIRST_NAME'}, {'name': 'LAST_NAME'}]

# The minimum likelihood to constitute a match. Optional.
min_likelihood = 'LIKELIHOOD_UNSPECIFIED'
Expand All @@ -51,16 +54,19 @@ def quickstart():
inspect_config = {
'info_types': info_types,
'min_likelihood': min_likelihood,
'max_findings': max_findings,
'include_quote': include_quote,
'limits': {'max_findings_per_request': max_findings},
}

# Convert the project id into a full resource id.
parent = dlp.project_path(project)

# Call the API.
response = dlp.inspect_content(inspect_config, items)
response = dlp.inspect_content(parent, inspect_config, item)

# Print out the results.
if response.results[0].findings:
for finding in response.results[0].findings:
if response.result.findings:
for finding in response.result.findings:
try:
print('Quote: {}'.format(finding.quote))
except AttributeError:
Expand Down
72 changes: 46 additions & 26 deletions dlp/redact.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import argparse
import mimetypes
import os


# [START redact_string]
Expand Down Expand Up @@ -83,8 +84,8 @@ def redact_string(item, replace_string, info_types=None, min_likelihood=None):


# [START redact_image]
def redact_image(filename, output_filename,
info_types=None, min_likelihood=None, mime_type=None):
def redact_image(project, filename, output_filename,
info_types, min_likelihood=None, mime_type=None):
"""Uses the Data Loss Prevention API to redact protected data in an image.
Args:
project: The Google Cloud project id to use as a parent resource.
filename: The path to the file to inspect.
Expand All @@ -101,17 +102,14 @@ def redact_image(filename, output_filename,
None; the response from the API is printed to the terminal.
"""
# Import the client library
import google.cloud.dlp_v2beta1
import google.cloud.dlp

# Instantiate a client.
dlp = google.cloud.dlp_v2beta1.DlpServiceClient()
dlp = google.cloud.dlp.DlpServiceClient()

# Prepare info_types by converting the list of strings into a list of
# dictionaries (protos are also accepted). The info_types are not submitted
# directly in this example, but are used in the construction of
# image_redaction_configs.
if info_types is not None:
info_types = [{'name': info_type} for info_type in info_types]
# dictionaries (protos are also accepted).
info_types = [{'name': info_type} for info_type in info_types]

# Prepare image_redaction_configs, a list of dictionaries. Each dictionary
# contains an info_type and optionally the color used for the replacement.
Expand All @@ -124,39 +122,57 @@ def redact_image(filename, output_filename,

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
redact_config = {
inspect_config = {
'min_likelihood': min_likelihood,
'info_types': info_types,
}

# If mime_type is not specified, guess it from the filename.
if mime_type is None:
mime_guess = mimetypes.MimeTypes().guess_type(filename)
mime_type = mime_guess[0] or 'application/octet-stream'

# Construct the items list (in this case, only one item, containing the
# image file's byte data).
# Select the content type index from the list of supported types.
supported_content_types = {
None: 0, # "Unspecified"
'image/jpeg': 1,
'image/bmp': 2,
'image/png': 3,
'image/svg': 4,
'text/plain': 5,
}
content_type_index = supported_content_types.get(mime_type, 0)

# Construct the byte_item, containing the file's byte data.
with open(filename, mode='rb') as f:
items = [{'type': mime_type, 'data': f.read()}]
byte_item = {'type': content_type_index, 'data': f.read()}

# Convert the project id into a full resource id.
parent = dlp.project_path(project)

# Call the API.
response = dlp.redact_content(
redact_config, items, None,
image_redaction_configs=image_redaction_configs)
response = dlp.redact_image(
parent, inspect_config=inspect_config,
image_redaction_configs=image_redaction_configs,
byte_item=byte_item)

# Write out the results.
with open(output_filename, mode='wb') as f:
f.write(response.items[0].data)
f.write(response.redacted_image)
print("Wrote {byte_count} to {filename}".format(
byte_count=len(response.items[0].data), filename=output_filename))
byte_count=len(response.redacted_image), filename=output_filename))
# [END redact_image]


if __name__ == '__main__':
default_project = os.environ.get('GCLOUD_PROJECT')

parser = argparse.ArgumentParser(description=__doc__)
subparsers = parser.add_subparsers(
dest='content', help='Select how to submit content to the API.')
subparsers.required = True

parser_string = subparsers.add_parser('string', help='Inspect a string.')
parser_string = subparsers.add_parser('string', help='Redact a string.')
parser_string.add_argument('item', help='The string to inspect.')
parser_string.add_argument(
'replace_string',
Expand All @@ -177,20 +193,23 @@ def redact_image(filename, output_filename,
help='A string representing the minimum likelihood threshold that '
'constitutes a match.')

parser_file = subparsers.add_parser('image', help='Inspect an image file.')
parser_file = subparsers.add_parser('image', help='Redact an image file.')
parser_file.add_argument(
'filename', help='The path to the file to inspect.')
parser_file.add_argument(
'output_filename',
help='The path to which the redacted image will be written.')
parser_file.add_argument(
'--project',
help='The Google Cloud project id to use as a parent resource.',
default=default_project)
parser_file.add_argument(
'--info_types', action='append',
help='Strings representing info types to look for. A full list of '
'info categories and types is available from the API. Examples '
'include "US_MALE_NAME", "US_FEMALE_NAME", "EMAIL_ADDRESS", '
'"CANADA_SOCIAL_INSURANCE_NUMBER", "JAPAN_PASSPORT". If omitted, '
'the API will use a limited default set. Specify this flag '
'multiple times to specify multiple info types.')
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_file.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
Expand All @@ -210,5 +229,6 @@ def redact_image(filename, output_filename,
min_likelihood=args.min_likelihood)
elif args.content == 'image':
redact_image(
args.filename, args.output_filename, info_types=args.info_types,
min_likelihood=args.min_likelihood, mime_type=args.mime_type)
args.project, args.filename, args.output_filename,
args.info_types, min_likelihood=args.min_likelihood,
mime_type=args.mime_type)
17 changes: 5 additions & 12 deletions dlp/redact_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import redact

GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT')
RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), 'resources')


Expand Down Expand Up @@ -63,19 +64,11 @@ def test_redact_image_file(tempdir, capsys):
test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.png')
output_filepath = os.path.join(tempdir, 'redacted.png')

redact.redact_image(test_filepath, output_filepath)

out, _ = capsys.readouterr()
assert output_filepath in out


def test_redact_image_file_with_infotype(tempdir, capsys):
test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.png')
output_filepath = os.path.join(tempdir, 'redacted_with_infotype.png')

redact.redact_image(
test_filepath, output_filepath,
info_types=['EMAIL_ADDRESS', 'US_MALE_NAME'])
GCLOUD_PROJECT,
test_filepath,
output_filepath,
['FIRST_NAME', 'EMAIL_ADDRESS'])

out, _ = capsys.readouterr()
assert output_filepath in out