Commit

Add files via upload
Christine-DK authored Jun 15, 2023
1 parent 6f4c17e commit 885f019
Showing 1 changed file with 326 additions and 0 deletions.
326 changes: 326 additions & 0 deletions Google/Copy_of_Doc_AI_test.ipynb
@@ -0,0 +1,326 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Document AI\n",
"Google service (API) to parse structured information from unstructured or semi-structured documents using state-of-the-art Google AI such as natural language, computer vision, translation, and AutoML."
],
"metadata": {
"id": "AMyDKXCjygvE"
}
},
{
"cell_type": "markdown",
"source": [
"Install gCloud CLI"
],
"metadata": {
"id": "NcWRFfYj24dp"
}
},
{
"cell_type": "code",
"source": [
"#!curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-435.0.1-linux-x86_64.tar.gz # unnecessary for Google Colab Notebooks"
],
"metadata": {
"id": "9v5bDEU13-KI"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Initialize gCloud CLI"
],
"metadata": {
"id": "PqnlhaL55Mjw"
}
},
{
"cell_type": "code",
"source": [
"!gcloud init # Follow prompts to configure and log in. Project is \"dk-ocr-test\""
],
"metadata": {
"id": "AoSU4BXA4XxQ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Setting up Application Default Credentials (ADC) for Google API client libraries"
],
"metadata": {
"id": "QFHcT3JbzNzu"
}
},
{
"cell_type": "code",
"source": [
"!gcloud auth application-default login"
],
"metadata": {
"id": "INdY4oSJ6Ca1"
},
"execution_count": null,
"outputs": []
},
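{
"cell_type": "markdown",
"source": [
"Optional Colab-only alternative (not part of the original upload): inside Google Colab, the `google.colab.auth` helper can authenticate the runtime user, which is often simpler than running `gcloud auth application-default login`. The commented-out cell below is a minimal sketch; skip it when running outside Colab."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional Colab-only sketch: authenticate the runtime user so the Google Cloud\n",
"# client libraries can pick up credentials. Assumes this notebook is running in\n",
"# Google Colab; otherwise use the gcloud ADC login above.\n",
"# from google.colab import auth\n",
"# auth.authenticate_user()"
],
"metadata": {},
"execution_count": null,
"outputs": []
},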
{
"cell_type": "markdown",
"source": [
"Import GCP libraries"
],
"metadata": {
"id": "a65_t7d226qJ"
}
},
{
"cell_type": "code",
"source": [
"!pip install google # unnecessary for Google Colab Notebooks\n",
"!pip install google.cloud\n",
"!pip install google-cloud-documentai"
],
"metadata": {
"id": "HZzo7-S76gYQ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from __future__ import annotations\n",
"\n",
"from collections.abc import Sequence\n",
"\n",
"from google.api_core.client_options import ClientOptions\n",
"import google.cloud.documentai as documentai # type: ignore"
],
"metadata": {
"id": "NtnvIPp0624y"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Environmental variable assignment"
],
"metadata": {
"id": "8iTnt82N8jCx"
}
},
{
"cell_type": "code",
"source": [
"project_id = 'YOUR_PROJECT_ID'\n",
"location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'\n",
"processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample\n",
"processor_version = 'rc' # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information\n",
"file_path = '/ocr-data/dataset/the.jpg'\n",
"mime_type = 'image/jpeg' # Refer to https://cloud.google.com/document-ai/docs/file-types"
],
"metadata": {
"id": "b68RexNH8cXq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Define `process_document_ocr_sample` function for Document AI API requests"
],
"metadata": {
"id": "urjMqOep9XHV"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5wLUIa6ffhDF"
},
"outputs": [],
"source": [
"def process_document_ocr_sample(\n",
" project_id: str,\n",
" location: str,\n",
" processor_id: str,\n",
" processor_version: str,\n",
" file_path: str,\n",
" mime_type: str,\n",
") -> None:\n",
" # Online processing request to Document AI\n",
" document = process_document(\n",
" project_id, location, processor_id, processor_version, file_path, mime_type\n",
" )\n",
"\n",
" # For a full list of Document object attributes, please reference this page:\n",
" # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document\n",
"\n",
" text = document.text\n",
" print(f\"Full document text: {text}\\n\")\n",
" print(f\"There are {len(document.pages)} page(s) in this document.\\n\")\n",
"\n",
" for page in document.pages:\n",
" print(f\"Page {page.page_number}:\")\n",
" print_page_dimensions(page.dimension)\n",
" print_detected_langauges(page.detected_languages)\n",
" print_paragraphs(page.paragraphs, text)\n",
" print_blocks(page.blocks, text)\n",
" print_lines(page.lines, text)\n",
" print_tokens(page.tokens, text)\n",
"\n",
" # Currently supported in version pretrained-ocr-v1.1-2022-09-12\n",
" if page.image_quality_scores:\n",
" print_image_quality_scores(page.image_quality_scores)\n",
"\n",
"\n",
"def process_document(\n",
" project_id: str,\n",
" location: str,\n",
" processor_id: str,\n",
" processor_version: str,\n",
" file_path: str,\n",
" mime_type: str,\n",
") -> documentai.Document:\n",
" # You must set the api_endpoint if you use a location other than 'us'.\n",
" opts = ClientOptions(api_endpoint=f\"{location}-documentai.googleapis.com\")\n",
"\n",
" client = documentai.DocumentProcessorServiceClient(client_options=opts)\n",
"\n",
" # The full resource name of the processor version\n",
" # e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}\n",
" # You must create processors before running sample code.\n",
" name = client.processor_version_path(\n",
" project_id, location, processor_id, processor_version\n",
" )\n",
"\n",
" # Read the file into memory\n",
" with open(file_path, \"rb\") as image:\n",
" image_content = image.read()\n",
"\n",
" # Load Binary Data into Document AI RawDocument Object\n",
" raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)\n",
"\n",
" # Configure the process request\n",
" request = documentai.ProcessRequest(name=name, raw_document=raw_document)\n",
"\n",
" result = client.process_document(request=request)\n",
"\n",
" return result.document\n",
"\n",
"\n",
"def print_page_dimensions(dimension: documentai.Document.Page.Dimension) -> None:\n",
" print(f\" Width: {str(dimension.width)}\")\n",
" print(f\" Height: {str(dimension.height)}\")\n",
"\n",
"\n",
"def print_detected_langauges(\n",
" detected_languages: Sequence[documentai.Document.Page.DetectedLanguage],\n",
") -> None:\n",
" print(\" Detected languages:\")\n",
" for lang in detected_languages:\n",
" code = lang.language_code\n",
" print(f\" {code} ({lang.confidence:.1%} confidence)\")\n",
"\n",
"\n",
"def print_paragraphs(\n",
" paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str\n",
") -> None:\n",
" print(f\" {len(paragraphs)} paragraphs detected:\")\n",
" first_paragraph_text = layout_to_text(paragraphs[0].layout, text)\n",
" print(f\" First paragraph text: {repr(first_paragraph_text)}\")\n",
" last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)\n",
" print(f\" Last paragraph text: {repr(last_paragraph_text)}\")\n",
"\n",
"\n",
"def print_blocks(blocks: Sequence[documentai.Document.Page.Block], text: str) -> None:\n",
" print(f\" {len(blocks)} blocks detected:\")\n",
" first_block_text = layout_to_text(blocks[0].layout, text)\n",
" print(f\" First text block: {repr(first_block_text)}\")\n",
" last_block_text = layout_to_text(blocks[-1].layout, text)\n",
" print(f\" Last text block: {repr(last_block_text)}\")\n",
"\n",
"\n",
"def print_lines(lines: Sequence[documentai.Document.Page.Line], text: str) -> None:\n",
" print(f\" {len(lines)} lines detected:\")\n",
" first_line_text = layout_to_text(lines[0].layout, text)\n",
" print(f\" First line text: {repr(first_line_text)}\")\n",
" last_line_text = layout_to_text(lines[-1].layout, text)\n",
" print(f\" Last line text: {repr(last_line_text)}\")\n",
"\n",
"\n",
"def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) -> None:\n",
" print(f\" {len(tokens)} tokens detected:\")\n",
" first_token_text = layout_to_text(tokens[0].layout, text)\n",
" first_token_break_type = tokens[0].detected_break.type_.name\n",
" print(f\" First token text: {repr(first_token_text)}\")\n",
" print(f\" First token break type: {repr(first_token_break_type)}\")\n",
" last_token_text = layout_to_text(tokens[-1].layout, text)\n",
" last_token_break_type = tokens[-1].detected_break.type_.name\n",
" print(f\" Last token text: {repr(last_token_text)}\")\n",
" print(f\" Last token break type: {repr(last_token_break_type)}\")\n",
"\n",
"\n",
"def print_image_quality_scores(\n",
" image_quality_scores: documentai.Document.Page.ImageQualityScores,\n",
") -> None:\n",
" print(f\" Quality score: {image_quality_scores.quality_score:.1%}\")\n",
" print(\" Detected defects:\")\n",
"\n",
" for detected_defect in image_quality_scores.detected_defects:\n",
" print(f\" {detected_defect.type_}: {detected_defect.confidence:.1%}\")\n",
"\n",
"\n",
"def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:\n",
" \"\"\"\n",
" Document AI identifies text in different parts of the document by their\n",
" offsets in the entirety of the document's text. This function converts\n",
" offsets to a string.\n",
" \"\"\"\n",
" response = \"\"\n",
" # If a text segment spans several lines, it will\n",
" # be stored in different text segments.\n",
" for segment in layout.text_anchor.text_segments:\n",
" start_index = int(segment.start_index)\n",
" end_index = int(segment.end_index)\n",
" response += text[start_index:end_index]\n",
" return response"
]
},
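{
"cell_type": "markdown",
"source": [
"Run the sample. The cell below is a minimal usage sketch (not part of the original upload): it calls `process_document_ocr_sample` with the configuration variables assigned above, assuming the processor exists and `file_path` points to a readable file of type `mime_type`."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Minimal usage sketch: invoke the sample end to end with the variables\n",
"# configured earlier. Assumes a valid project_id, location, processor_id,\n",
"# processor_version, and a readable file at file_path.\n",
"process_document_ocr_sample(\n",
"    project_id=project_id,\n",
"    location=location,\n",
"    processor_id=processor_id,\n",
"    processor_version=processor_version,\n",
"    file_path=file_path,\n",
"    mime_type=mime_type,\n",
")"
],
"metadata": {},
"execution_count": null,
"outputs": []
}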
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4,
"colab": {
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 0
}
