Skip to content

Commit d66e3a7

Browse files
dalequarknnegreytelpirion
authored
feat: Document AI code snippets for beta
* first pass v1beta2 analyze_form.py * Update document/cloud-client/analyze_form.py Move region tag to the top, set sensible defaults Co-Authored-By: Noah Negrey <[email protected]> * updated form code * changed naming to be consistent with node * added parse table files * style updates * added quickstart * added batch samples * added set endpoint * renamed set endpoint fn name * feat: adds AutoML model sample * feat: adds requirements files * fix: linter issues * chore: changes to GCS output * fix: linter issues * fix: changes format for AutoML model * fix: per reviewer * fix: added bounding poly comments * fix: adjusts locations, reviewer feedback * fix: reviewer feedback * fix: linter issues * fix: moved comment * fix: per reviewer * fix: per reviewer * fix: region tag bracket * fix: test assert Co-authored-by: Noah Negrey <[email protected]> Co-authored-by: Eric Schmidt <[email protected]>
1 parent b221fbf commit d66e3a7

16 files changed

+784
-0
lines changed
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# [START documentai_batch_parse_form_beta]
17+
from google.cloud import documentai_v1beta2 as documentai
18+
from google.cloud import storage
19+
import re
20+
21+
22+
def batch_parse_form(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Batch-parse the form fields of a document stored in Cloud Storage.

    Submits a single-document batch request with form extraction enabled,
    blocks until the long-running operation completes, then prints the
    names of the objects found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to process.
        destination_uri: ``gs://bucket/prefix`` URI under which the results
            are written.

    Raises:
        ValueError: If ``destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """
    # Split the destination into bucket and object prefix before doing any
    # remote work, so a malformed URI fails fast instead of surfacing as an
    # AttributeError after the batch operation has already run.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://bucket/prefix, got: '
            '{!r}'.format(destination_uri))
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (e.g. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact',
            value_types=['NAME']),
        documentai.types.KeyValuePairHint(
            key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params)

    # A batch request carries a list of ProcessDocumentRequest messages;
    # this sample submits just one.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print('Output files:')
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)
94+
95+
96+
# [END documentai_batch_parse_form_beta]
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import batch_parse_form_beta
16+
import os
17+
import pytest
18+
import uuid
19+
from google.cloud import storage
20+
21+
BUCKET = 'document-ai-{}'.format(uuid.uuid4())
22+
OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4())
23+
PROJECT_ID = os.environ['GCLOUD_PROJECT']
24+
INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf'
25+
BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)
26+
27+
28+
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provision a throwaway bucket for the sample's output, then remove it."""
    temp_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True: the sample wrote objects into the bucket, and they have to
    # be deleted along with it.
    temp_bucket.delete(force=True)
37+
38+
39+
def test_batch_parse_form(capsys):
    """Run the batch form sample and check that it reports its output files."""
    batch_parse_form_beta.batch_parse_form(
        PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
    captured = capsys.readouterr()
    assert 'Output files' in captured.out
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# [START documentai_batch_parse_table_beta]
17+
from google.cloud import documentai_v1beta2 as documentai
18+
from google.cloud import storage
19+
import re
20+
21+
22+
def batch_parse_table(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Batch-parse the tables of a document stored in Cloud Storage.

    Submits a single-document batch request with table extraction enabled,
    blocks until the long-running operation completes, then prints the
    names of the objects found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to process.
        destination_uri: ``gs://bucket/prefix`` URI under which the results
            are written.

    Raises:
        ValueError: If ``destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """
    # Split the destination into bucket and object prefix before doing any
    # remote work, so a malformed URI fails fast instead of surfacing as an
    # AttributeError after the batch operation has already run.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://bucket/prefix, got: '
            '{!r}'.format(destination_uri))
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve table parsing results by providing bounding boxes
    # specifying where the table appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertex coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(
                        x=0,
                        y=0
                    ),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(
                        x=1,
                        y=0
                    ),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(
                        x=1,
                        y=1
                    ),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(
                        x=0,
                        y=1
                    )
                ]
            )
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params)

    # A batch request carries a list of ProcessDocumentRequest messages;
    # this sample submits just one.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print('Output files:')
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)
113+
114+
# [END documentai_batch_parse_table_beta]
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import batch_parse_table_beta
16+
import os
17+
import pytest
18+
import uuid
19+
from google.cloud import storage
20+
21+
BUCKET = 'document-ai-{}'.format(uuid.uuid4())
22+
OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4())
23+
PROJECT_ID = os.environ['GCLOUD_PROJECT']
24+
INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf'
25+
BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)
26+
27+
28+
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provision a throwaway bucket for the sample's output, then remove it."""
    temp_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True: the sample wrote objects into the bucket, and they have to
    # be deleted along with it.
    temp_bucket.delete(force=True)
37+
38+
39+
def test_batch_parse_table(capsys):
    """Run the batch table sample and check that it reports its output files."""
    batch_parse_table_beta.batch_parse_table(
        PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
    captured = capsys.readouterr()
    assert 'Output files:' in captured.out
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START documentai_parse_form_beta]
16+
from google.cloud import documentai_v1beta2 as documentai
17+
18+
19+
def parse_form(project_id='YOUR_PROJECT_ID',
               input_uri='gs://cloud-samples-data/documentai/form.pdf'):
    """Extract form fields from a document in Cloud Storage and print them.

    Sends one synchronous process request with form extraction enabled,
    then prints every detected field name and value with its confidence.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to process.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    # mime_type can be application/pdf, image/tiff,
    # image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=documentai.types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Key-value pair hints improve form parsing results. Each hint's key is
    # text likely to appear in the document as a form field name
    # (e.g. "DOB"). Value types are optional and may be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact', value_types=['NAME']),
        documentai.types.KeyValuePairHint(key='Referred By'),
    ]

    request = documentai.types.ProcessDocumentRequest(
        # Location can be 'us' or 'eu'
        parent='projects/{}/locations/us'.format(project_id),
        input_config=input_config,
        # enabled=True switches form extraction on
        form_extraction_params=documentai.types.FormExtractionParams(
            enabled=True, key_value_pair_hints=hints))

    document = client.process_document(request=request)

    def _get_text(el):
        """Return the document text covered by ``el``'s text anchor.

        Form fields are reported as offsets into ``document.text``; an
        element may span several segments, which are concatenated in order.
        """
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)

    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for form_field in page.form_fields:
            name = form_field.field_name
            value = form_field.field_value
            print('Field Name: {}\tConfidence: {}'.format(
                _get_text(name), name.confidence))
            print('Field Value: {}\tConfidence: {}'.format(
                _get_text(value), value.confidence))
81+
82+
# [END documentai_parse_form_beta]

0 commit comments

Comments
 (0)