Language classify (#1095)

dizcology · Jon Wayne Parrott · commit b91f9afe933c · 2017-09-15T13:36:20.000-07:00
* add classify text samples and tests

* use longer text

* move entity sentiment to v1

* flake

* year when first written

* year first written
diff --git a/language/cloud-client/v1/snippets.py b/language/cloud-client/v1/snippets.py
@@ -22,10 +22,12 @@
 """
 
 import argparse
+import sys
 
 from google.cloud import language
 from google.cloud.language import enums
 from google.cloud.language import types
+
 import six
 
 
@@ -192,12 +194,80 @@ def syntax_file(gcs_uri):
 # [END def_syntax_file]
 
 
+# [START def_entity_sentiment_text]
+def entity_sentiment_text(text):
+    """Detects entity sentiment in the provided text."""
+    # The one-line docstring above is reused verbatim as the argparse help
+    # string for the `sentiment-entities-text` subcommand.
+    client = language.LanguageServiceClient()
+
+    # Normalize byte strings to text first, then send UTF-8 bytes to the API.
+    if isinstance(text, six.binary_type):
+        text = text.decode('utf-8')
+    utf8_content = text.encode('utf-8')
+
+    document = types.Document(
+        content=utf8_content, type=enums.Document.Type.PLAIN_TEXT)
+
+    # Ask for offsets in the interpreter's native string encoding (UTF-16
+    # when sys.maxunicode is 65535, UTF-32 otherwise) so that the reported
+    # word offsets are correct for Python strings.
+    is_narrow_build = sys.maxunicode == 65535
+    if is_narrow_build:
+        encoding = enums.EncodingType.UTF16
+    else:
+        encoding = enums.EncodingType.UTF32
+
+    response = client.analyze_entity_sentiment(document, encoding)
+
+    for entity in response.entities:
+        print('Mentions: ')
+        print(u'Name: "{}"'.format(entity.name))
+        for mention in entity.mentions:
+            # Format every field of the mention, then emit one per line.
+            mention_lines = (
+                u'  Begin Offset : {}'.format(mention.text.begin_offset),
+                u'  Content : {}'.format(mention.text.content),
+                u'  Magnitude : {}'.format(mention.sentiment.magnitude),
+                u'  Sentiment : {}'.format(mention.sentiment.score),
+                u'  Type : {}'.format(mention.type),
+            )
+            for line in mention_lines:
+                print(line)
+        print(u'Salience: {}'.format(entity.salience))
+        print(u'Sentiment: {}\n'.format(entity.sentiment))
+# [END def_entity_sentiment_text]
+
+
+def entity_sentiment_file(gcs_uri):
+    """Detects entity sentiment in a Google Cloud Storage file."""
+    # The one-line docstring above is reused verbatim as the argparse help
+    # string for the `sentiment-entities-file` subcommand.
+    client = language.LanguageServiceClient()
+
+    plain_text = enums.Document.Type.PLAIN_TEXT
+    document = types.Document(gcs_content_uri=gcs_uri, type=plain_text)
+
+    # Ask for offsets in the interpreter's native string encoding (UTF-16
+    # when sys.maxunicode is 65535, UTF-32 otherwise) so that the reported
+    # word offsets are correct for Python strings.
+    encoding = (
+        enums.EncodingType.UTF16 if sys.maxunicode == 65535
+        else enums.EncodingType.UTF32)
+
+    response = client.analyze_entity_sentiment(document, encoding)
+
+    for entity in response.entities:
+        print(u'Name: "{}"'.format(entity.name))
+        for mention in entity.mentions:
+            span = mention.text
+            mention_sentiment = mention.sentiment
+            print(u'  Begin Offset : {}'.format(span.begin_offset))
+            print(u'  Content : {}'.format(span.content))
+            print(u'  Magnitude : {}'.format(mention_sentiment.magnitude))
+            print(u'  Sentiment : {}'.format(mention_sentiment.score))
+            print(u'  Type : {}'.format(mention.type))
+        print(u'Salience: {}'.format(entity.salience))
+        print(u'Sentiment: {}\n'.format(entity.sentiment))
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description=__doc__,
         formatter_class=argparse.RawDescriptionHelpFormatter)
     subparsers = parser.add_subparsers(dest='command')
 
+    sentiment_entities_text_parser = subparsers.add_parser(
+        'sentiment-entities-text', help=entity_sentiment_text.__doc__)
+    sentiment_entities_text_parser.add_argument('text')
+
+    sentiment_entities_file_parser = subparsers.add_parser(
+        'sentiment-entities-file', help=entity_sentiment_file.__doc__)
+    sentiment_entities_file_parser.add_argument('gcs_uri')
+
     sentiment_text_parser = subparsers.add_parser(
         'sentiment-text', help=sentiment_text.__doc__)
     sentiment_text_parser.add_argument('text')
@@ -236,3 +306,7 @@ def syntax_file(gcs_uri):
         syntax_text(args.text)
     elif args.command == 'syntax-file':
         syntax_file(args.gcs_uri)
+    elif args.command == 'sentiment-entities-text':
+        entity_sentiment_text(args.text)
+    elif args.command == 'sentiment-entities-file':
+        entity_sentiment_file(args.gcs_uri)
diff --git a/language/cloud-client/v1/snippets_test.py b/language/cloud-client/v1/snippets_test.py
@@ -1,4 +1,5 @@
-# Copyright 2016 Google, Inc.
+# -*- coding: utf-8 -*-
+# Copyright 2017 Google, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -56,3 +57,23 @@ def test_syntax_file(capsys):
     snippets.syntax_file(TEST_FILE_URL)
     out, _ = capsys.readouterr()
     assert 'NOUN: President' in out
+
+
+def test_sentiment_entities_text(capsys):
+    # The sample sentence names the White House, so that mention must show
+    # up in the printed entity listing.
+    sample = 'President Obama is speaking at the White House.'
+    snippets.entity_sentiment_text(sample)
+    captured_stdout = capsys.readouterr()[0]
+    assert 'Content : White House' in captured_stdout
+
+
+def test_sentiment_entities_file(capsys):
+    # NOTE(review): presumably the object at TEST_FILE_URL mentions the
+    # White House; confirm against the test bucket contents.
+    snippets.entity_sentiment_file(TEST_FILE_URL)
+    captured_stdout = capsys.readouterr()[0]
+    assert 'Content : White House' in captured_stdout
+
+
+def test_sentiment_entities_utf(capsys):
+    # In 'foo→bar' the arrow is a single character at index 3, so text after
+    # it starts at index 4 when offsets count characters; counted in UTF-8
+    # bytes the arrow would take three positions instead.
+    snippets.entity_sentiment_text('foo→bar')
+    captured_stdout = capsys.readouterr()[0]
+    assert 'Begin Offset : 4' in captured_stdout
diff --git a/language/cloud-client/v1beta2/resources/android_text.txt b/language/cloud-client/v1beta2/resources/android_text.txt
@@ -0,0 +1 @@
+Android is a mobile operating system developed by Google, based on the Linux kernel and designed primarily for touchscreen mobile devices such as smartphones and tablets.
diff --git a/language/cloud-client/v1beta2/snippets.py b/language/cloud-client/v1beta2/snippets.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-# Copyright 2017 Google, Inc.
+# Copyright 2016 Google, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@
 """
 
 import argparse
-import sys
 
 # [START beta_import]
 from google.cloud import language_v1beta2
@@ -173,9 +172,9 @@ def syntax_file(gcs_uri):
                                token.text.content))
 
 
-# [START def_entity_sentiment_text]
-def entity_sentiment_text(text):
-    """Detects entity sentiment in the provided text."""
+# [START def_classify_text]
+def classify_text(text):
+    """Classifies the provided text."""
     # [START beta_client]
     client = language_v1beta2.LanguageServiceClient()
     # [END beta_client]
@@ -187,52 +186,31 @@ def entity_sentiment_text(text):
         content=text.encode('utf-8'),
         type=enums.Document.Type.PLAIN_TEXT)
 
-    # Pass in encoding type to get useful offsets in the response.
-    encoding = enums.EncodingType.UTF32
-    if sys.maxunicode == 65535:
-        encoding = enums.EncodingType.UTF16
-
-    result = client.analyze_entity_sentiment(document, encoding)
-
-    for entity in result.entities:
-        print('Mentions: ')
-        print(u'Name: "{}"'.format(entity.name))
-        for mention in entity.mentions:
-            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
-            print(u'  Content : {}'.format(mention.text.content))
-            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
-            print(u'  Sentiment : {}'.format(mention.sentiment.score))
-            print(u'  Type : {}'.format(mention.type))
-        print(u'Salience: {}'.format(entity.salience))
-        print(u'Sentiment: {}\n'.format(entity.sentiment))
-# [END def_entity_sentiment_text]
-
-
-def entity_sentiment_file(gcs_uri):
-    """Detects entity sentiment in a Google Cloud Storage file."""
+    categories = client.classify_text(document).categories
+
+    for category in categories:
+        print(u'=' * 20)
+        print(u'{:<16}: {}'.format('name', category.name))
+        print(u'{:<16}: {}'.format('confidence', category.confidence))
+# [END def_classify_text]
+
+
+# [START def_classify_file]
+def classify_file(gcs_uri):
+    """Classifies the text in a Google Cloud Storage file."""
+    # The one-line docstring above is reused verbatim as the argparse help
+    # string for the `classify-file` subcommand.
     client = language_v1beta2.LanguageServiceClient()
 
     document = types.Document(
         gcs_content_uri=gcs_uri,
         type=enums.Document.Type.PLAIN_TEXT)
 
-    # Pass in encoding type to get useful offsets in the response.
-    encoding = enums.EncodingType.UTF32
-    if sys.maxunicode == 65535:
-        encoding = enums.EncodingType.UTF16
-
-    result = client.analyze_entity_sentiment(document, encoding)
+    # Only the `categories` field of the classification response is used.
+    categories = client.classify_text(document).categories
 
-    for entity in result.entities:
-        print(u'Name: "{}"'.format(entity.name))
-        for mention in entity.mentions:
-            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
-            print(u'  Content : {}'.format(mention.text.content))
-            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
-            print(u'  Sentiment : {}'.format(mention.sentiment.score))
-            print(u'  Type : {}'.format(mention.type))
-        print(u'Salience: {}'.format(entity.salience))
-        print(u'Sentiment: {}\n'.format(entity.sentiment))
+    # Each category prints as a 20-character '=' rule followed by its name
+    # and confidence, with the labels left-aligned in a 16-column field.
+    for category in categories:
+        print(u'=' * 20)
+        print(u'{:<16}: {}'.format('name', category.name))
+        print(u'{:<16}: {}'.format('confidence', category.confidence))
+# [END def_classify_file]
 
 
 if __name__ == '__main__':
@@ -241,13 +219,13 @@ def entity_sentiment_file(gcs_uri):
         formatter_class=argparse.RawDescriptionHelpFormatter)
     subparsers = parser.add_subparsers(dest='command')
 
-    sentiment_entities_text_parser = subparsers.add_parser(
-        'sentiment-entities-text', help=entity_sentiment_text.__doc__)
-    sentiment_entities_text_parser.add_argument('text')
+    classify_text_parser = subparsers.add_parser(
+        'classify-text', help=classify_text.__doc__)
+    classify_text_parser.add_argument('text')
 
-    sentiment_entities_file_parser = subparsers.add_parser(
-        'sentiment-entities-file', help=entity_sentiment_file.__doc__)
-    sentiment_entities_file_parser.add_argument('gcs_uri')
+    # Bind the `classify-file` subparser to its own name so the
+    # `classify-text` parser created above is not rebound.
+    classify_file_parser = subparsers.add_parser(
+        'classify-file', help=classify_file.__doc__)
+    classify_file_parser.add_argument('gcs_uri')
 
     sentiment_text_parser = subparsers.add_parser(
         'sentiment-text', help=sentiment_text.__doc__)
@@ -287,7 +265,7 @@ def entity_sentiment_file(gcs_uri):
         syntax_text(args.text)
     elif args.command == 'syntax-file':
         syntax_file(args.gcs_uri)
-    elif args.command == 'sentiment-entities-text':
-        entity_sentiment_text(args.text)
-    elif args.command == 'sentiment-entities-file':
-        entity_sentiment_file(args.gcs_uri)
+    elif args.command == 'classify-text':
+        classify_text(args.text)
+    elif args.command == 'classify-file':
+        classify_file(args.gcs_uri)
diff --git a/language/cloud-client/v1beta2/snippets_test.py b/language/cloud-client/v1beta2/snippets_test.py
@@ -19,6 +19,7 @@
 
 BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
 TEST_FILE_URL = 'gs://{}/text.txt'.format(BUCKET)
+LONG_TEST_FILE_URL = 'gs://{}/android_text.txt'.format(BUCKET)
 
 
 def test_sentiment_text(capsys):
@@ -68,21 +69,18 @@ def test_syntax_file(capsys):
     assert 'NOUN: President' in out
 
 
-def test_sentiment_entities_text(capsys):
-    snippets.entity_sentiment_text(
-        'President Obama is speaking at the White House.')
+def test_classify_text(capsys):
+    # Classify an inline sample about Android and check the printed result.
+    snippets.classify_text(
+        'Android is a mobile operating system developed by Google, '
+        'based on the Linux kernel and designed primarily for touchscreen '
+        'mobile devices such as smartphones and tablets.')
     out, _ = capsys.readouterr()
-    assert 'Content : White House' in out
-
-
-def test_sentiment_entities_file(capsys):
-    snippets.entity_sentiment_file(TEST_FILE_URL)
-    out, _ = capsys.readouterr()
-    assert 'Content : White House' in out
+    # 'name' is the label classify_text prints for every category; the
+    # second assertion pins the category path expected for this sample.
+    assert 'name' in out
+    assert '/Computers & Electronics' in out
 
 
-def test_sentiment_entities_utf(capsys):
-    snippets.entity_sentiment_text(
-        'foo→bar')
+def test_classify_file(capsys):
+    # LONG_TEST_FILE_URL points at android_text.txt in the test bucket.
+    # NOTE(review): presumably the bucket copy matches
+    # resources/android_text.txt; confirm it has been uploaded.
+    snippets.classify_file(LONG_TEST_FILE_URL)
     out, _ = capsys.readouterr()
-    assert 'Begin Offset : 4' in out
+    # 'name' is the label classify_file prints for every category; the
+    # second assertion pins the category path expected for this file.
+    assert 'name' in out
+    assert '/Computers & Electronics' in out

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Android is a mobile operating system developed by Google, based on the Linux kernel and designed primarily for touchscreen mobile devices such as smartphones and tablets.`