Add files via upload

datakind · Jun 15, 2023 · 885f019 · 885f019
1 parent 6f4c17e
commit 885f019
Showing 1 changed file with 326 additions and 0 deletions.
diff --git a/Google/Copy_of_Doc_AI_test.ipynb b/Google/Copy_of_Doc_AI_test.ipynb
@@ -0,0 +1,326 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Document AI\n",
+        "Google service (API) to parse structured information from unstructured or semi-structured documents using state-of-the-art Google AI such as natural language, computer vision, translation, and AutoML."
+      ],
+      "metadata": {
+        "id": "AMyDKXCjygvE"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Install gCloud CLI"
+      ],
+      "metadata": {
+        "id": "NcWRFfYj24dp"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#!curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-435.0.1-linux-x86_64.tar.gz # unnecessary for Google Colab Notebooks"
+      ],
+      "metadata": {
+        "id": "9v5bDEU13-KI"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Initialize gCloud CLI"
+      ],
+      "metadata": {
+        "id": "PqnlhaL55Mjw"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!gcloud init # Follow prompts to configure and log in. Project is \"dk-ocr-test\""
+      ],
+      "metadata": {
+        "id": "AoSU4BXA4XxQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Setting up Application Default Credentials (ADC) for Google API client libraries"
+      ],
+      "metadata": {
+        "id": "QFHcT3JbzNzu"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!gcloud auth application-default login"
+      ],
+      "metadata": {
+        "id": "INdY4oSJ6Ca1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Import GCP libraries"
+      ],
+      "metadata": {
+        "id": "a65_t7d226qJ"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install google # unnecessary for Google Colab Notebooks\n",
+        "!pip install google.cloud\n",
+        "!pip install google-cloud-documentai"
+      ],
+      "metadata": {
+        "id": "HZzo7-S76gYQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from __future__ import annotations\n",
+        "\n",
+        "from collections.abc import Sequence\n",
+        "\n",
+        "from google.api_core.client_options import ClientOptions\n",
+        "import google.cloud.documentai as documentai  # type: ignore"
+      ],
+      "metadata": {
+        "id": "NtnvIPp0624y"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Environmental variable assignment"
+      ],
+      "metadata": {
+        "id": "8iTnt82N8jCx"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "project_id = 'YOUR_PROJECT_ID'\n",
+        "location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'\n",
+        "processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample\n",
+        "processor_version = 'rc' # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information\n",
+        "file_path = '/ocr-data/dataset/the.jpg'\n",
+        "mime_type = 'image/jpeg' # Refer to https://cloud.google.com/document-ai/docs/file-types"
+      ],
+      "metadata": {
+        "id": "b68RexNH8cXq"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Define `process_document_ocr_sample` function for Document AI API requests"
+      ],
+      "metadata": {
+        "id": "urjMqOep9XHV"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "5wLUIa6ffhDF"
+      },
+      "outputs": [],
+      "source": [
+        "def process_document_ocr_sample(\n",
+        "    project_id: str,\n",
+        "    location: str,\n",
+        "    processor_id: str,\n",
+        "    processor_version: str,\n",
+        "    file_path: str,\n",
+        "    mime_type: str,\n",
+        ") -> None:\n",
+        "    # Online processing request to Document AI\n",
+        "    document = process_document(\n",
+        "        project_id, location, processor_id, processor_version, file_path, mime_type\n",
+        "    )\n",
+        "\n",
+        "    # For a full list of Document object attributes, please reference this page:\n",
+        "    # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document\n",
+        "\n",
+        "    text = document.text\n",
+        "    print(f\"Full document text: {text}\\n\")\n",
+        "    print(f\"There are {len(document.pages)} page(s) in this document.\\n\")\n",
+        "\n",
+        "    for page in document.pages:\n",
+        "        print(f\"Page {page.page_number}:\")\n",
+        "        print_page_dimensions(page.dimension)\n",
+        "        print_detected_langauges(page.detected_languages)\n",
+        "        print_paragraphs(page.paragraphs, text)\n",
+        "        print_blocks(page.blocks, text)\n",
+        "        print_lines(page.lines, text)\n",
+        "        print_tokens(page.tokens, text)\n",
+        "\n",
+        "        # Currently supported in version pretrained-ocr-v1.1-2022-09-12\n",
+        "        if page.image_quality_scores:\n",
+        "            print_image_quality_scores(page.image_quality_scores)\n",
+        "\n",
+        "\n",
+        "def process_document(\n",
+        "    project_id: str,\n",
+        "    location: str,\n",
+        "    processor_id: str,\n",
+        "    processor_version: str,\n",
+        "    file_path: str,\n",
+        "    mime_type: str,\n",
+        ") -> documentai.Document:\n",
+        "    # You must set the api_endpoint if you use a location other than 'us'.\n",
+        "    opts = ClientOptions(api_endpoint=f\"{location}-documentai.googleapis.com\")\n",
+        "\n",
+        "    client = documentai.DocumentProcessorServiceClient(client_options=opts)\n",
+        "\n",
+        "    # The full resource name of the processor version\n",
+        "    # e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}\n",
+        "    # You must create processors before running sample code.\n",
+        "    name = client.processor_version_path(\n",
+        "        project_id, location, processor_id, processor_version\n",
+        "    )\n",
+        "\n",
+        "    # Read the file into memory\n",
+        "    with open(file_path, \"rb\") as image:\n",
+        "        image_content = image.read()\n",
+        "\n",
+        "    # Load Binary Data into Document AI RawDocument Object\n",
+        "    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)\n",
+        "\n",
+        "    # Configure the process request\n",
+        "    request = documentai.ProcessRequest(name=name, raw_document=raw_document)\n",
+        "\n",
+        "    result = client.process_document(request=request)\n",
+        "\n",
+        "    return result.document\n",
+        "\n",
+        "\n",
+        "def print_page_dimensions(dimension: documentai.Document.Page.Dimension) -> None:\n",
+        "    print(f\"    Width: {str(dimension.width)}\")\n",
+        "    print(f\"    Height: {str(dimension.height)}\")\n",
+        "\n",
+        "\n",
+        "def print_detected_langauges(\n",
+        "    detected_languages: Sequence[documentai.Document.Page.DetectedLanguage],\n",
+        ") -> None:\n",
+        "    print(\"    Detected languages:\")\n",
+        "    for lang in detected_languages:\n",
+        "        code = lang.language_code\n",
+        "        print(f\"        {code} ({lang.confidence:.1%} confidence)\")\n",
+        "\n",
+        "\n",
+        "def print_paragraphs(\n",
+        "    paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str\n",
+        ") -> None:\n",
+        "    print(f\"    {len(paragraphs)} paragraphs detected:\")\n",
+        "    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)\n",
+        "    print(f\"        First paragraph text: {repr(first_paragraph_text)}\")\n",
+        "    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)\n",
+        "    print(f\"        Last paragraph text: {repr(last_paragraph_text)}\")\n",
+        "\n",
+        "\n",
+        "def print_blocks(blocks: Sequence[documentai.Document.Page.Block], text: str) -> None:\n",
+        "    print(f\"    {len(blocks)} blocks detected:\")\n",
+        "    first_block_text = layout_to_text(blocks[0].layout, text)\n",
+        "    print(f\"        First text block: {repr(first_block_text)}\")\n",
+        "    last_block_text = layout_to_text(blocks[-1].layout, text)\n",
+        "    print(f\"        Last text block: {repr(last_block_text)}\")\n",
+        "\n",
+        "\n",
+        "def print_lines(lines: Sequence[documentai.Document.Page.Line], text: str) -> None:\n",
+        "    print(f\"    {len(lines)} lines detected:\")\n",
+        "    first_line_text = layout_to_text(lines[0].layout, text)\n",
+        "    print(f\"        First line text: {repr(first_line_text)}\")\n",
+        "    last_line_text = layout_to_text(lines[-1].layout, text)\n",
+        "    print(f\"        Last line text: {repr(last_line_text)}\")\n",
+        "\n",
+        "\n",
+        "def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) -> None:\n",
+        "    print(f\"    {len(tokens)} tokens detected:\")\n",
+        "    first_token_text = layout_to_text(tokens[0].layout, text)\n",
+        "    first_token_break_type = tokens[0].detected_break.type_.name\n",
+        "    print(f\"        First token text: {repr(first_token_text)}\")\n",
+        "    print(f\"        First token break type: {repr(first_token_break_type)}\")\n",
+        "    last_token_text = layout_to_text(tokens[-1].layout, text)\n",
+        "    last_token_break_type = tokens[-1].detected_break.type_.name\n",
+        "    print(f\"        Last token text: {repr(last_token_text)}\")\n",
+        "    print(f\"        Last token break type: {repr(last_token_break_type)}\")\n",
+        "\n",
+        "\n",
+        "def print_image_quality_scores(\n",
+        "    image_quality_scores: documentai.Document.Page.ImageQualityScores,\n",
+        ") -> None:\n",
+        "    print(f\"    Quality score: {image_quality_scores.quality_score:.1%}\")\n",
+        "    print(\"    Detected defects:\")\n",
+        "\n",
+        "    for detected_defect in image_quality_scores.detected_defects:\n",
+        "        print(f\"        {detected_defect.type_}: {detected_defect.confidence:.1%}\")\n",
+        "\n",
+        "\n",
+        "def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:\n",
+        "    \"\"\"\n",
+        "    Document AI identifies text in different parts of the document by their\n",
+        "    offsets in the entirety of the document's text. This function converts\n",
+        "    offsets to a string.\n",
+        "    \"\"\"\n",
+        "    response = \"\"\n",
+        "    # If a text segment spans several lines, it will\n",
+        "    # be stored in different text segments.\n",
+        "    for segment in layout.text_anchor.text_segments:\n",
+        "        start_index = int(segment.start_index)\n",
+        "        end_index = int(segment.end_index)\n",
+        "        response += text[start_index:end_index]\n",
+        "    return response"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.4"
+    },
+    "orig_nbformat": 4,
+    "colab": {
+      "provenance": []
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}