-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6f4c17e
commit 885f019
Showing
1 changed file
with
326 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,326 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"# Document AI\n", | ||
"Google service (API) to parse structured information from unstructured or semi-structured documents using state-of-the-art Google AI such as natural language, computer vision, translation, and AutoML." | ||
], | ||
"metadata": { | ||
"id": "AMyDKXCjygvE" | ||
} | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Install gCloud CLI" | ||
], | ||
"metadata": { | ||
"id": "NcWRFfYj24dp" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"#!curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-435.0.1-linux-x86_64.tar.gz # unnecessary for Google Colab Notebooks" | ||
], | ||
"metadata": { | ||
"id": "9v5bDEU13-KI" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Initialize gCloud CLI" | ||
], | ||
"metadata": { | ||
"id": "PqnlhaL55Mjw" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!gcloud init # Follow prompts to configure and log in. Project is \"dk-ocr-test\"" | ||
], | ||
"metadata": { | ||
"id": "AoSU4BXA4XxQ" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Setting up Application Default Credentials (ADC) for Google API client libraries" | ||
], | ||
"metadata": { | ||
"id": "QFHcT3JbzNzu" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!gcloud auth application-default login" | ||
], | ||
"metadata": { | ||
"id": "INdY4oSJ6Ca1" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Import GCP libraries" | ||
], | ||
"metadata": { | ||
"id": "a65_t7d226qJ" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!pip install google # unnecessary for Google Colab Notebooks\n", | ||
"!pip install google.cloud\n", | ||
"!pip install google-cloud-documentai" | ||
], | ||
"metadata": { | ||
"id": "HZzo7-S76gYQ" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"from __future__ import annotations\n", | ||
"\n", | ||
"from collections.abc import Sequence\n", | ||
"\n", | ||
"from google.api_core.client_options import ClientOptions\n", | ||
"import google.cloud.documentai as documentai # type: ignore" | ||
], | ||
"metadata": { | ||
"id": "NtnvIPp0624y" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Environmental variable assignment" | ||
], | ||
"metadata": { | ||
"id": "8iTnt82N8jCx" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"project_id = 'YOUR_PROJECT_ID'\n", | ||
"location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'\n", | ||
"processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample\n", | ||
"processor_version = 'rc' # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information\n", | ||
"file_path = '/ocr-data/dataset/the.jpg'\n", | ||
"mime_type = 'image/jpeg' # Refer to https://cloud.google.com/document-ai/docs/file-types" | ||
], | ||
"metadata": { | ||
"id": "b68RexNH8cXq" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Define `process_document_ocr_sample` function for Document AI API requests" | ||
], | ||
"metadata": { | ||
"id": "urjMqOep9XHV" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "5wLUIa6ffhDF" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"def process_document_ocr_sample(\n", | ||
" project_id: str,\n", | ||
" location: str,\n", | ||
" processor_id: str,\n", | ||
" processor_version: str,\n", | ||
" file_path: str,\n", | ||
" mime_type: str,\n", | ||
") -> None:\n", | ||
" # Online processing request to Document AI\n", | ||
" document = process_document(\n", | ||
" project_id, location, processor_id, processor_version, file_path, mime_type\n", | ||
" )\n", | ||
"\n", | ||
" # For a full list of Document object attributes, please reference this page:\n", | ||
" # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document\n", | ||
"\n", | ||
" text = document.text\n", | ||
" print(f\"Full document text: {text}\\n\")\n", | ||
" print(f\"There are {len(document.pages)} page(s) in this document.\\n\")\n", | ||
"\n", | ||
" for page in document.pages:\n", | ||
" print(f\"Page {page.page_number}:\")\n", | ||
" print_page_dimensions(page.dimension)\n", | ||
" print_detected_langauges(page.detected_languages)\n", | ||
" print_paragraphs(page.paragraphs, text)\n", | ||
" print_blocks(page.blocks, text)\n", | ||
" print_lines(page.lines, text)\n", | ||
" print_tokens(page.tokens, text)\n", | ||
"\n", | ||
" # Currently supported in version pretrained-ocr-v1.1-2022-09-12\n", | ||
" if page.image_quality_scores:\n", | ||
" print_image_quality_scores(page.image_quality_scores)\n", | ||
"\n", | ||
"\n", | ||
"def process_document(\n", | ||
" project_id: str,\n", | ||
" location: str,\n", | ||
" processor_id: str,\n", | ||
" processor_version: str,\n", | ||
" file_path: str,\n", | ||
" mime_type: str,\n", | ||
") -> documentai.Document:\n", | ||
" # You must set the api_endpoint if you use a location other than 'us'.\n", | ||
" opts = ClientOptions(api_endpoint=f\"{location}-documentai.googleapis.com\")\n", | ||
"\n", | ||
" client = documentai.DocumentProcessorServiceClient(client_options=opts)\n", | ||
"\n", | ||
" # The full resource name of the processor version\n", | ||
" # e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}\n", | ||
" # You must create processors before running sample code.\n", | ||
" name = client.processor_version_path(\n", | ||
" project_id, location, processor_id, processor_version\n", | ||
" )\n", | ||
"\n", | ||
" # Read the file into memory\n", | ||
" with open(file_path, \"rb\") as image:\n", | ||
" image_content = image.read()\n", | ||
"\n", | ||
" # Load Binary Data into Document AI RawDocument Object\n", | ||
" raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)\n", | ||
"\n", | ||
" # Configure the process request\n", | ||
" request = documentai.ProcessRequest(name=name, raw_document=raw_document)\n", | ||
"\n", | ||
" result = client.process_document(request=request)\n", | ||
"\n", | ||
" return result.document\n", | ||
"\n", | ||
"\n", | ||
"def print_page_dimensions(dimension: documentai.Document.Page.Dimension) -> None:\n", | ||
" print(f\" Width: {str(dimension.width)}\")\n", | ||
" print(f\" Height: {str(dimension.height)}\")\n", | ||
"\n", | ||
"\n", | ||
"def print_detected_langauges(\n", | ||
" detected_languages: Sequence[documentai.Document.Page.DetectedLanguage],\n", | ||
") -> None:\n", | ||
" print(\" Detected languages:\")\n", | ||
" for lang in detected_languages:\n", | ||
" code = lang.language_code\n", | ||
" print(f\" {code} ({lang.confidence:.1%} confidence)\")\n", | ||
"\n", | ||
"\n", | ||
"def print_paragraphs(\n", | ||
" paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str\n", | ||
") -> None:\n", | ||
" print(f\" {len(paragraphs)} paragraphs detected:\")\n", | ||
" first_paragraph_text = layout_to_text(paragraphs[0].layout, text)\n", | ||
" print(f\" First paragraph text: {repr(first_paragraph_text)}\")\n", | ||
" last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)\n", | ||
" print(f\" Last paragraph text: {repr(last_paragraph_text)}\")\n", | ||
"\n", | ||
"\n", | ||
"def print_blocks(blocks: Sequence[documentai.Document.Page.Block], text: str) -> None:\n", | ||
" print(f\" {len(blocks)} blocks detected:\")\n", | ||
" first_block_text = layout_to_text(blocks[0].layout, text)\n", | ||
" print(f\" First text block: {repr(first_block_text)}\")\n", | ||
" last_block_text = layout_to_text(blocks[-1].layout, text)\n", | ||
" print(f\" Last text block: {repr(last_block_text)}\")\n", | ||
"\n", | ||
"\n", | ||
"def print_lines(lines: Sequence[documentai.Document.Page.Line], text: str) -> None:\n", | ||
" print(f\" {len(lines)} lines detected:\")\n", | ||
" first_line_text = layout_to_text(lines[0].layout, text)\n", | ||
" print(f\" First line text: {repr(first_line_text)}\")\n", | ||
" last_line_text = layout_to_text(lines[-1].layout, text)\n", | ||
" print(f\" Last line text: {repr(last_line_text)}\")\n", | ||
"\n", | ||
"\n", | ||
"def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) -> None:\n", | ||
" print(f\" {len(tokens)} tokens detected:\")\n", | ||
" first_token_text = layout_to_text(tokens[0].layout, text)\n", | ||
" first_token_break_type = tokens[0].detected_break.type_.name\n", | ||
" print(f\" First token text: {repr(first_token_text)}\")\n", | ||
" print(f\" First token break type: {repr(first_token_break_type)}\")\n", | ||
" last_token_text = layout_to_text(tokens[-1].layout, text)\n", | ||
" last_token_break_type = tokens[-1].detected_break.type_.name\n", | ||
" print(f\" Last token text: {repr(last_token_text)}\")\n", | ||
" print(f\" Last token break type: {repr(last_token_break_type)}\")\n", | ||
"\n", | ||
"\n", | ||
"def print_image_quality_scores(\n", | ||
" image_quality_scores: documentai.Document.Page.ImageQualityScores,\n", | ||
") -> None:\n", | ||
" print(f\" Quality score: {image_quality_scores.quality_score:.1%}\")\n", | ||
" print(\" Detected defects:\")\n", | ||
"\n", | ||
" for detected_defect in image_quality_scores.detected_defects:\n", | ||
" print(f\" {detected_defect.type_}: {detected_defect.confidence:.1%}\")\n", | ||
"\n", | ||
"\n", | ||
"def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:\n", | ||
" \"\"\"\n", | ||
" Document AI identifies text in different parts of the document by their\n", | ||
" offsets in the entirety of the document's text. This function converts\n", | ||
" offsets to a string.\n", | ||
" \"\"\"\n", | ||
" response = \"\"\n", | ||
" # If a text segment spans several lines, it will\n", | ||
" # be stored in different text segments.\n", | ||
" for segment in layout.text_anchor.text_segments:\n", | ||
" start_index = int(segment.start_index)\n", | ||
" end_index = int(segment.end_index)\n", | ||
" response += text[start_index:end_index]\n", | ||
" return response" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.4" | ||
}, | ||
"orig_nbformat": 4, | ||
"colab": { | ||
"provenance": [] | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |