create ocr-data

datakind · Jun 13, 2023 · 746520f · 746520f
1 parent 60e3167
commit 746520f
Show file tree

Hide file tree

Showing 5 changed files with 224 additions and 0 deletions.
diff --git a/ocr-data/DDI-100.ipynb b/ocr-data/DDI-100.ipynb
diff --git a/ocr-data/scripts/examples.py b/ocr-data/scripts/examples.py
@@ -0,0 +1,49 @@
+import cv2
+from generator import Generator
+from utils import draw_word_boxes, combine_masks
+
+
+def test_doc():
+    """
+    Shows a random document, its masks and the document with word boxes drawn on it.
+    Blocks until a key is pressed.
+    """
+    # Pass the path by keyword: the first positional argument of Generator
+    # is pickle_path, so a positional dataset directory would be opened as a pickle file.
+    gen = Generator(dataset_path="../data/pdf_dataset")
+    img, masks, data = gen.get_doc()
+    # get_doc returns a list of masks; a single array is accepted as well.
+    if not isinstance(masks, (list, tuple)):
+        masks = [masks]
+    cv2.imshow("image", cv2.resize(img, (0, 0), fx=0.2, fy=0.2))
+    for idx, mask in enumerate(masks):
+        # Every mask needs its own window name, otherwise only the last one stays visible.
+        window = "mask" if idx == 0 else f"mask {idx}"
+        cv2.imshow(window, cv2.resize(mask, (0, 0), fx=0.2, fy=0.2))
+    # draw_word_boxes draws in place, so img is displayed once more afterwards.
+    draw_word_boxes(img, data, word_color=0)
+    cv2.imshow("image with boxes", cv2.resize(img, (0, 0), fx=0.2, fy=0.2))
+    cv2.waitKey()
+
+
+def test_str():
+    """
+    Shows a random word image, prints its text and shows the image again
+    with the char delimiters drawn on it. Blocks until a key is pressed.
+    """
+    # Pass the path by keyword: the first positional argument of Generator
+    # is pickle_path, so a positional dataset directory would be opened as a pickle file.
+    gen = Generator(dataset_path="../data/pdf_dataset")
+    img, data, delimiters = gen.get_string()
+    cv2.imshow("image", cv2.resize(img, (0, 0), fx=2, fy=2))
+    print(data)
+    # Span the whole image height instead of assuming it is 32 px.
+    bottom = img.shape[0]
+    for delim in delimiters:
+        cv2.line(img, (delim, 0), (delim, bottom), color=0, thickness=2)
+    cv2.imshow("image with delims", cv2.resize(img, (0, 0), fx=2, fy=2))
+    cv2.waitKey()
+
+
+def test_char():
+    """
+    Shows a random char image and prints the char.
+    Blocks until a key is pressed.
+    """
+    # Pass the path by keyword: the first positional argument of Generator
+    # is pickle_path, so a positional dataset directory would be opened as a pickle file.
+    gen = Generator(dataset_path="../data/pdf_dataset")
+    img, data = gen.get_char()
+    cv2.imshow("image", cv2.resize(img, (0, 0), fx=2, fy=2))
+    print(data)
+    cv2.waitKey()
+
+
+def test_mask():
+    """
+    Shows the combination of the masks of two random documents.
+    Blocks until a key is pressed.
+    """
+    # Pass the path by keyword: the first positional argument of Generator
+    # is pickle_path, so a positional dataset directory would be opened as a pickle file.
+    gen = Generator(dataset_path="../data/pdf_dataset")
+    _, masks1, _ = gen.get_doc()
+    _, masks2, _ = gen.get_doc()
+    # get_doc returns a list of masks; compare the first mask of each document.
+    mask1 = masks1[0] if isinstance(masks1, (list, tuple)) else masks1
+    mask2 = masks2[0] if isinstance(masks2, (list, tuple)) else masks2
+    # combine_masks requires equal shapes; cv2.resize takes the size as (width, height).
+    mask2 = cv2.resize(mask2, (mask1.shape[1], mask1.shape[0]))
+    img = combine_masks(mask1, mask2)
+    cv2.imshow("image", cv2.resize(img, (0, 0), fx=.2, fy=.2))
+    cv2.waitKey()
+
+
+if __name__ == "__main__":
+    # Run every visual demo in turn, in the same order as they are defined.
+    for demo in (test_doc, test_str, test_char, test_mask):
+        demo()
diff --git a/ocr-data/scripts/generator.py b/ocr-data/scripts/generator.py
@@ -0,0 +1,77 @@
+import cv2
+import numpy as np
+import pickle
+
+from glob import glob
+from pathlib import Path
+from random import choice
+
+from .utils import get_image_from_box
+
+
+class Generator:
+    def __init__(self, pickle_path=None, dataset_path=None, book_paths=None):
+        """
+        Class Generator implements simple random sample choice.
+        At least one of the arguments must be specified. If several are given,
+        dataset_path overrides book_paths and pickle_path overrides both.
+
+        :param pickle_path: str - path to pickle file with image paths
+        :param dataset_path: str - path to full dataset (directory with book directories)
+        :param book_paths: list - paths to book directories
+        :raises ValueError: if no argument is specified
+        """
+        if dataset_path is None and book_paths is None and pickle_path is None:
+            raise ValueError("At least one argument must be specified")
+        if dataset_path is not None:
+            book_paths = glob(f"{dataset_path}/*")
+        if book_paths is not None:
+            self.paths = []
+            for book in book_paths:
+                self.paths.extend(glob(f"{book}/gen_imgs/*"))
+        if pickle_path is not None:
+            # NOTE: unpickling can execute arbitrary code - only load trusted files.
+            with open(pickle_path, "rb") as f:
+                self.paths = pickle.load(f)
+
+    def get_doc(self):
+        """
+        Returns random document sample from dataset.
+
+        Boxes are read from "gen_boxes/<stem>.pickle" and masks from
+        "gen_masks/<stem>_*" next to the directory of the chosen image.
+
+        :return: (img, masks, data) - grayscale image, list of grayscale masks
+                 sorted by path, unpickled boxes data
+        """
+        # A single random draw; the sample is fully defined by the image path.
+        img_path = Path(choice(self.paths))
+        book_dir = img_path.parent.parent
+        boxes_path = book_dir.joinpath('gen_boxes').joinpath(img_path.stem + ".pickle")
+        mask_paths = sorted(book_dir.joinpath('gen_masks').glob(img_path.stem + '_*'))
+        # NOTE: unpickling can execute arbitrary code - only load trusted files.
+        with open(boxes_path, "rb") as f:
+            data = pickle.load(f)
+        img = cv2.imread(str(img_path), 0)
+        masks = [cv2.imread(str(mask_path), 0) for mask_path in mask_paths]
+        return img, masks, data
+
+    def get_string(self):
+        """
+        Returns sample with random single word string from dataset.
+
+        :return: (img, str, list) - img of word, string representation, list of char x axis delimiters
+        """
+        img, _, data = self.get_doc()
+        word = choice(data)
+        # The char boxes are not part of the result, so they are left untouched.
+        cut_img, delimiters = get_image_from_box(img, word)
+        return cut_img, word['text'], delimiters
+
+    def get_char(self):
+        """
+        Returns sample with random single char string from dataset.
+
+        :return: (img, str) - img with letter and letter
+        """
+        img, _, boxes = self.get_doc()
+        word = choice(boxes)
+        char = choice(word['chars'])
+        # get_image_from_box expects a dict with 'box' and 'chars' keys and
+        # returns (image, delimiters); a single char has no inner delimiters.
+        cut_img, _ = get_image_from_box(img, {'box': char['box'], 'chars': []})
+        return cut_img, char['text']
diff --git a/ocr-data/scripts/utils.py b/ocr-data/scripts/utils.py
@@ -0,0 +1,68 @@
+import cv2
+import numpy as np
+
+
+def _box_to_polygon(box):
+    """
+    Converts a four-corner box to the point array passed to the OpenCV drawing calls.
+    :param box: np.array - box corners
+    :return: np.array - int32 array of shape (4, 1, 2)
+    """
+    # Rows are reordered to 0, 1, 3, 2 to walk the outline and the two
+    # coordinate columns are swapped (presumably (y, x) -> (x, y) for OpenCV).
+    corners = np.asarray(box)[[0, 1, 3, 2], ::-1]
+    # OpenCV polygon functions only accept 32-bit integer points.
+    return corners.reshape((-1, 1, 2)).astype(np.int32)
+
+
+def _draw_box(img, box, color, fill):
+    """
+    Draws a single box on the image. Inplace
+    """
+    polygon = [_box_to_polygon(box)]
+    if fill:
+        # cv2.polylines rejects a negative thickness, filling needs fillPoly.
+        cv2.fillPoly(img, polygon, color)
+    else:
+        cv2.polylines(img, polygon, True, color, thickness=2)
+
+
+def draw_word_boxes(img, word_box_list, word_color=(255, 0, 0), letter_color=None, fill=False):
+    """
+    Draws boxes on the image. Inplace
+    :param img: np.array - image
+    :param word_box_list: list - boxes (dicts with 'box' and 'chars' keys)
+    :param word_color: tuple or int or None - boundary color for words, None to skip words
+    :param letter_color: tuple or list or None - boundary color for letters, None to skip letters
+    :param fill: bool - whether to fill the boxes with color or just draw a quadrilateral
+    """
+    for word in word_box_list:
+        if word_color is not None:
+            _draw_box(img, word['box'], word_color, fill)
+
+        if letter_color is not None:
+            for char in word['chars']:
+                _draw_box(img, char['box'], letter_color, fill)
+
+
+def get_image_from_box(image, data, height=32):
+    """
+    Cuts the box out of the image with a perspective transform and computes
+    the x positions separating neighbouring chars in the cut image.
+    :param image: numpy.ndarray: image
+    :param data: dict: word data with 'box' (four corners) and 'chars' (list of dicts with 'box')
+    :param height: int: height of the result image
+    :return: (np.ndarray, list): cut image, list of char x axis delimiters
+    """
+    corners = data['box']
+    # The 0-1 edge is mapped onto the output height, which fixes the scale.
+    edge_01 = np.sqrt((corners[0, 1] - corners[1, 1])**2 + (corners[0, 0] - corners[1, 0])**2)
+    scale = edge_01 / height
+    # NOTE(review): draw_word_boxes walks the outline as 0-1-3-2, which would make
+    # corners 1 and 2 opposite ones, i.e. this would be a diagonal - confirm intended.
+    span_12 = np.sqrt((corners[1, 1] - corners[2, 1])**2 + (corners[1, 0] - corners[2, 0])**2)
+    w = int(span_12 / scale)
+
+    # Coordinate columns are swapped and corners reordered to pair up with the target rectangle.
+    source = np.float32(corners)[:, ::-1][[1, 0, 3, 2]]
+    target = np.float32([[0, 0], [height, 0], [0, w],  [height, w]])[:, ::-1]
+    transform = cv2.getPerspectiveTransform(source, target)
+    cut = cv2.warpPerspective(image, transform, (w, height))
+
+    origin = corners[0][1]
+    extent = (corners[2][1] - origin)
+    chars = data['chars']
+    delimiters = []
+    for current, following in zip(chars, chars[1:]):
+        # Delimiter lies halfway between the end of one char and the start of the next.
+        end_of_current = (current['box'][3][1] - origin) / extent * w
+        start_of_next = (following['box'][0][1] - origin) / extent * w
+        delimiters.append(int((end_of_current + start_of_next) / 2))
+    return cut, delimiters
+
+
+def combine_masks(true_mask, predicted_mask):
+    """
+    Combines true and predicted masks into one image for convenient comparison.
+    The inverted true mask goes to channel 1 and the inverted predicted mask
+    to channel 2, channel 0 stays zero.
+    :param true_mask: 2D np.ndarray - gray image with true mask
+    :param predicted_mask: 2D np.ndarray - gray image with predicted mask
+    :return: 3D np.ndarray - colored uint8 image with both masks
+    :raises ValueError: if shapes differ or masks are not 2D
+    """
+    if true_mask.shape != predicted_mask.shape:
+        raise ValueError("Shapes do not match")
+    if true_mask.ndim != 2:
+        raise ValueError("Masks should be greyscaled")
+
+    # uint8 keeps the 0..255 values displayable: cv2.imshow scales float
+    # images by 255, which would saturate almost every pixel.
+    img = np.zeros(true_mask.shape + (3,), dtype=np.uint8)
+    img[:, :, 2] = 255 - predicted_mask
+    img[:, :, 1] = 255 - true_mask
+    return img
diff --git a/ocr-data/scripts/visualization.py b/ocr-data/scripts/visualization.py
@@ -0,0 +1,30 @@
+import cv2
+import random
+import pickle
+
+from glob import glob
+from pathlib import Path
+from utils import draw_word_boxes
+from generator import Generator
+
+
+def random_show(gen):
+    """
+    Shows a random document from the generator: the image, its masks and
+    the image with word boxes. Waits up to one second for a key press.
+    :param gen: Generator - source of document samples
+    """
+    img, masks, data = gen.get_doc()
+    # get_doc returns a list of masks; a single array is accepted as well.
+    if not isinstance(masks, (list, tuple)):
+        masks = [masks]
+    cv2.imshow("image", cv2.resize(img, (0, 0), fx=0.2, fy=0.2))
+    for idx, mask in enumerate(masks):
+        # Every mask needs its own window name, otherwise only the last one stays visible.
+        window = "mask" if idx == 0 else f"mask {idx}"
+        cv2.imshow(window, cv2.resize(mask, (0, 0), fx=0.2, fy=0.2))
+
+    # draw_word_boxes draws in place, so img is displayed once more afterwards.
+    draw_word_boxes(img, data, word_color=0)
+    cv2.imshow("image with boxes", cv2.resize(img, (0, 0), fx=0.2, fy=0.2))
+    cv2.waitKey(1000)
+
+
+def show_dataset(dataset_path):
+    """
+    Shows random documents from the dataset one after another.
+    Loops forever; interrupt the process to stop.
+    :param dataset_path: str - path to full dataset
+    """
+    # Pass the path by keyword: the first positional argument of Generator
+    # is pickle_path, so a positional dataset directory would be opened as a pickle file.
+    gen = Generator(dataset_path=dataset_path)
+    while True:
+        random_show(gen)
+
+
+if __name__ == "__main__":
+    # Browse the dataset stored at the default relative path.
+    show_dataset("../data/pdf_dataset")