diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 3d050a7..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/README.md b/README.md index bee010e..4f1b363 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,8 @@ For examples that use S3 bucket, upload sample images to an S3 bucket and update | [10-tables.py](./python/10-tables.py) | Example showing table processing. | | [11-tables-expense.py](./python/11-tables-expense.py) | Example showing validation of table data. | | [12-pdf-text.py](./python/12-pdf-text.py) | Example showing PDF document processing. | +| [13-signature.py](./python/13-signature.py) | Example showing Signature detection. | +| [14-pdf-numeric-table-detection.ipynb](./python/14-pdf-numeric-table.ipynb) | Example showing detection of numeric tables in pdf files. | ## .NET Usage diff --git a/python/.DS_Store b/python/.DS_Store deleted file mode 100644 index 6989235..0000000 Binary files a/python/.DS_Store and /dev/null differ diff --git a/python/14-pdf-numeric-table.ipynb b/python/14-pdf-numeric-table.ipynb new file mode 100644 index 0000000..c33764c --- /dev/null +++ b/python/14-pdf-numeric-table.ipynb @@ -0,0 +1,343 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Numeric Table Detection\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The objective is to demonstrate detection of tables in raw texxt to use targeted table extraction and lower the cost of using Textract.\n", + "Exmple for 1M pages per month containing 19% of pages with at leaqst 1 table. \n", + "\n", + "https://calculator.aws/#/estimate?id=fb01232c5a8b8304d1c1169b2911af087467ea82\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Setup\n", + "\n", + "In this step, we will import some necessary libraries that will be used throughout this notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "!python -m pip install -q amazon-textract-response-parser==0.1.44\n", + "!python -m pip install -q amazon-textract-caller==0.0.28\n", + "!python -m pip install -q amazon-textract-prettyprinter==0.0.16\n", + "!python -m pip install -q amazon-textract-textractor==1.1.1\n", + "!python -m pip install -q pypdf==3.1.0\n", + "!python -m pip install -q pandas==1.3.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import boto3\n", + "import botocore\n", + "from IPython.display import Image, display, JSON, HTML\n", + "from textractcaller.t_call import call_textract, Textract_Features, call_textract_expense\n", + "from textractprettyprinter.t_pretty_print import convert_table_to_list\n", + "from trp import Document\n", + "import os\n", + "\n", + "\n", + "# variables\n", + "\n", + "s3=boto3.client('s3')\n", + "# because the reference document is stored in us-east-2, we run Textract in us-east-2 as well\n", + "textract = boto3.client('textract', region_name='us-east-2')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "file=\"./DemoTable.pdf\"\n", + "s3url=\"s3://amazon-textract-public-content/code-samples/DemoTable.pdf\"\n", + "!aws s3 cp s3://amazon-textract-public-content/code-samples/DemoTable.pdf .\n", + "!aws s3 cp s3://amazon-textract-public-content/code-samples/DemoTableDocument-ExtractedTablePages.pdf ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from IPython.display import IFrame\n", + "IFrame(file, width=1500, height=600)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "Extract unstructured data with Amazon Textract \n", + "\n", + "Amazon Textract is an ML powered OCR service that is capable of detecting and extracting text from documents. Text data in the form of WORDS and LINES can be extracted from documents using Amazon Textract `DetectDocumentText` API. Let's extract the words and lines from the demonstration document." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Call Amazon Textract\n", + "response = call_textract(input_document=s3url, boto3_textract_client=textract)\n", + "\n", + "text=\"\"\n", + "# Print detected text\n", + "for item in response[\"Blocks\"]:\n", + " if item[\"BlockType\"] == \"LINE\":\n", + " text=text+\"\\n\"+item[\"Text\"]\n", + "print (text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Detect Tables in raw text\n", + "\n", + "In this example using a threshold of 4 characters out of the 15 most used characters being digits works well to identify if the page is mostly constituted of numeric tables.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "import io\n", + "\n", + "def containsatable(textcontent):\n", + " counter = Counter(textcontent)\n", + " mostcom = counter.most_common(15)\n", + " countdigit=0\n", + " for i in mostcom :\n", + " if (i[0].isdigit()):\n", + " countdigit=countdigit+1\n", + " return(countdigit>=4)\n", + "\n", + "# Alternatively, we can identify if 50% of the lines satisfy the criteria (to identify tables embedded in text)\n", + "\n", + "def containsatableperline(textcontent):\n", + " countlines=0\n", + " tablelines=0\n", + " for line in io.StringIO(textcontent):\n", + " countlines=countlines+1\n", + " counter = Counter(line)\n", + " mostcom = counter.most_common(15)\n", + " countdigit=0\n", + " for i in mostcom :\n", + " if (i[0].isdigit()):\n", + " countdigit=countdigit+1\n", + " if (countdigit>=4):\n", + " tablelines=tablelines+1\n", + " \n", + " return(tablelines>(countlines/2))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from trp import Document\n", + "import math\n", + "doc = Document(response)\n", + "#print(doc.pages[0])\n", + "# Iterate over elements in the document\n", + "pagenum=0\n", + "pageswithtables=[]\n", + "pageswithtables2=[]\n", + "for page in doc.pages:\n", + " pagenum=pagenum+1\n", + " text=''\n", + " for line in page.lines:\n", + " text=text+line.text\n", + " if (containsatable(text)):\n", + " pageswithtables.append(pagenum)\n", + " if (containsatableperline(text)):\n", + " pageswithtables2.append(pagenum)\n", + "\n", + "print(\"List of pages with Tables:\") \n", + "print (pageswithtables)\n", + "print (pageswithtables2)\n", + "print(\"Percentage of pages with tables:\") \n", + "print (str(len(pageswithtables))+\"/\"+str(pagenum))\n", + "print (str(math.trunc(len(pageswithtables)/pagenum*100))+\"%\")\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Call Analyse Documents only on the pages with tables\n", + "First, we build a smaller PDF document containing only the pages identified as containing tables and then we run the AnalyzeDocument call on the new document." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from pypdf import PdfReader, PdfWriter\n", + "\n", + "pdf_file_path = file\n", + "file_base_name = pdf_file_path.replace('.pdf', '')\n", + "\n", + "pdf = PdfReader(pdf_file_path)\n", + "\n", + "pages = pageswithtables\n", + "pdfWriter = PdfWriter()\n", + "\n", + "for page_num in pages:\n", + " pdfWriter.add_page(pdf.pages[page_num-1])\n", + "\n", + "pdf_out = open(\"temp.pdf\", 'wb')\n", + "pdfWriter.write(pdf_out)\n", + "pdf_out.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from IPython.display import IFrame\n", + "IFrame('temp.pdf', width=1500, height=600)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# When you are using your own document, you need to copy the result to S3 and process that.\n", + "# In our sample case, we already uploaded the output to S3, so the next line is commented out\n", + "#!aws s3 cp temp.pdf s3://{data_bucket}/temp.pdf --only-show-errors\n", + "s3url=\"s3://amazon-textract-public-content/code-samples/DemoTableDocument-ExtractedTablePages.pdf\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "resp = call_textract(input_document=s3url, features=[Textract_Features.TABLES])\n", + "tdoc = Document(resp)\n", + "dfs = list()\n", + "\n", + "for page in tdoc.pages:\n", + " for table in page.tables:\n", + " tab_list = convert_table_to_list(trp_table=table)\n", + " print(tab_list)\n", + " dfs.append(pd.DataFrame(tab_list))\n", + "\n", + "df1 = dfs[0]\n", + "df2 = dfs[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (Data Science)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}