
Commit

add documentation
rkcosmos committed Jul 16, 2020
1 parent c7b394a commit b626a25
Showing 6 changed files with 118 additions and 60 deletions.
86 changes: 79 additions & 7 deletions README.md
@@ -1,11 +1,15 @@
# Easy OCR
# EasyOCR

[![PyPI Status](https://badge.fury.io/py/easyocr.svg)](https://badge.fury.io/py/easyocr)
[![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/JaidedAI/EasyOCR/blob/master/LICENSE)
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.fan/easyocr)[![GitHub stars](https://img.shields.io/github/stars/JaidedAI/EasyOCR.svg?style=social&label=Star&maxAge=2592000)](https://GitHub.com/JaidedAI/EasyOCR/stargazers/)

Ready-to-use OCR with 40+ supported languages, including Chinese, Japanese, Korean and Thai.

## Examples

See this [Colab Demo](https://colab.fan/easyocr). You can run it in the browser.
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.fan/easyocr)


![example](examples/example.png)

@@ -101,17 +105,85 @@ reader = easyocr.Reader(['th','en'], gpu = False)

There are several optional arguments for the `readtext` function. `decoder` can be 'greedy' (default), 'beamsearch', or 'wordbeamsearch'; for 'beamsearch' and 'wordbeamsearch' you can also set `beamWidth` (default = 5), where a bigger number is slower but can be more accurate. For multiprocessing, you can set `workers` and `batch_size`. The current version converts the image into greyscale for the recognition model, so contrast can be an issue; you can try playing with `contrast_ths`, `adjust_contrast` and `filter_ths`. `allowlist` and `blocklist` accept a string of characters (like this: blocklist = '!&$%').
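
As an illustration, here is a minimal sketch of a call that exercises a few of these options (the option values are placeholders, not recommendations):

```python
import easyocr

# English-only model on CPU
reader = easyocr.Reader(['en'], gpu=False)

# beam-search decoding with a wider beam, larger batch, digits only
result = reader.readtext('examples/example.png',
                         decoder='beamsearch', beamWidth=10,
                         batch_size=4, workers=2,
                         allowlist='0123456789')
```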

### Run on command line
#### Run on command line

```shell
$ easyocr -l ch_sim en -f chinese.jpg --detail=1 --gpu=True
```

## To be implemented
## API Documentation

#### `Reader` class
> Base class for EasyOCR
>
> **Parameters**
> * **lang_list** (list) - list of language codes you want to recognize, for example ['ch_sim','en']. The list of supported language codes is [here](#Supported-Languages).
> * **gpu** (bool, string, default = True)
>
> **Attribute**
> * **lang_char** - Shows all available characters in the current model
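
For example, a minimal sketch of constructing a reader and inspecting this attribute (the language choice is arbitrary):

```python
import easyocr

# Thai + English recognizer on CPU; gpu also accepts a device string
reader = easyocr.Reader(['th', 'en'], gpu=False)

# all characters the current model can recognize
print(reader.lang_char)
```
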
#### `readtext` method
> Main method for the Reader object. There are four groups of parameters: General, Contrast, Text Detection and Bounding Box Merging.
>
> **Parameters 1: General**
> * **image** (string, numpy array, byte stream) - can be a file path, a numpy array, or a byte stream object
> * **decoder** (string, default = 'greedy') - options are 'greedy', 'beamsearch' and 'wordbeamsearch'.
> * **beamWidth** (int, default = 5)
> * **batch_size** (int, default = 1) - batch_size > 1 makes EasyOCR faster but uses more memory
> * **workers** (int, default = 0)
> * **allowlist** (string) - Force EasyOCR to recognize only a subset of characters
> * **blocklist** (string) - Will be ignored if allowlist is given
> * **detail** (int, default = 1) - Set this to 0 for simple output
>
> **Parameters 2: Contrast**
> * **contrast_ths** (float, default = 0.1)
> * **adjust_contrast** (float, default = 0.5)
> * **filter_ths** (float, default = 0.003)
>
> **Parameters 3: Text Detection (from CRAFT)**
> * **text_threshold** (float, default = 0.7)
> * **low_text** (float, default = 0.4)
> * **link_threshold** (float, default = 0.4)
> * **canvas_size** (int, default = 2560)
> * **mag_ratio** (float, default = 1)
>
> **Parameters 4: Bounding Box Merging**
> * **slope_ths** (float, default = 0.1)
> * **ycenter_ths** (float, default = 0.5)
> * **height_ths** (float, default = 0.5)
> * **width_ths** (float, default = 0.5)
> * **add_margin** (float, default = 0.1)
>
> **Return** (list)
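
As a hedged usage sketch of the two detail levels (based on the `readtext` code in this commit; 'chinese.jpg' is the example image used in the command-line section above):

```python
import easyocr

reader = easyocr.Reader(['ch_sim', 'en'])

# detail = 1 (default): each item is (bounding box, text, confidence)
full = reader.readtext('chinese.jpg')

# detail = 0: a plain list of recognized strings
texts = reader.readtext('chinese.jpg', detail=0)
```
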
## Implementation Roadmap

#### Phase 1 (Now - October, 2020)

1. Language packs: Hindi, Arabic, Cyrillic alphabet, etc. (aiming to cover more than 80-90% of the world's population). See the [current development list](https://github.com/JaidedAI/EasyOCR/issues/91).
2. Better documentation and API
3. Language model for better decoding

#### Phase 2 (After October, 2020)

1. Handwritten support: network architecture should not matter; the key is using a GAN to generate a realistic handwritten dataset.
2. Faster processing time: model pruning/quantization/export to other platforms
3. Data generation script and model training pipeline
4. Restructure code to support swappable detection and recognition algorithms.
The API should be as easy as
``` python
reader = easyocr.Reader(['en'], detection='pixellink', recognition = 'ReXNet_LSTM_Attention')
```
The idea is to be able to plug any state-of-the-art model into EasyOCR. There are a lot of geniuses trying to make better detection/recognition models. We are not trying to be a genius here, just to make geniuses' work quickly accessible to the public ... for free. (Well, I believe most geniuses want their work to create a positive impact as fast and as widely as possible.) The pipeline should be something like the diagram below. Grey slots are placeholders for changeable light-blue modules.

![plan](examples/easyocr_framework.jpeg)

Personal note: I think any next-generation open platform should have a default module that works out of the box but also allow changeable modules. The key here is SIMPLICITY in allowing anyone to plug in their choice of module and have their own version of the application without hassle. Think of social media where you can have your own (or your community's) version of the front-end, ranking algorithm, payment system, transparent blocklist, etc. The world can be a better place with this kind of freedom.



## Acknowledgement and References

2 changes: 1 addition & 1 deletion easyocr/__init__.py
@@ -1,3 +1,3 @@
from .easyocr import Reader

__version__ = '1.1.3'
__version__ = '1.1.4'
14 changes: 8 additions & 6 deletions easyocr/easyocr.py
@@ -201,10 +201,12 @@ def __init__(self, lang_list, gpu=True):
dict_list, MODEL_PATH, device = self.device)

def readtext(self, image, decoder = 'greedy', beamWidth= 5, batch_size = 1,\
text_threshold = 0.7, low_text = 0.4, link_threshold = 0.4,\
canvas_size = 2560, mag_ratio = 1., poly = False,\
workers = 0, allowlist = None, blocklist = None, detail = 1,\
contrast_ths = 0.1,adjust_contrast = 0.5, filter_ths = 0.003,\
workers = 0, allowlist = None, blocklist = None, detail = 1):
text_threshold = 0.7, low_text = 0.4, link_threshold = 0.4,\
canvas_size = 2560, mag_ratio = 1.,\
slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5,\
width_ths = 0.5, add_margin = 0.1):
'''
Parameters:
file: file path or numpy-array or a byte stream object
@@ -229,8 +231,8 @@ def readtext(self, image, decoder = 'greedy', beamWidth= 5, batch_size = 1,\
img_cv_grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

text_box = get_textbox(self.detector, img, canvas_size, mag_ratio, text_threshold,\
link_threshold, low_text, poly, self.device)
horizontal_list, free_list = group_text_box(text_box, width_ths = 0.5, add_margin = 0.1)
link_threshold, low_text, False, self.device)
horizontal_list, free_list = group_text_box(text_box, slope_ths, ycenter_ths, height_ths, width_ths, add_margin)

# should add filter to screen small box out

@@ -248,7 +250,7 @@ def readtext(self, image, decoder = 'greedy', beamWidth= 5, batch_size = 1,\
ignore_char, decoder, beamWidth, batch_size, contrast_ths, adjust_contrast, filter_ths,\
workers, self.device)

if detail == 0:
if detail == 0:
return [item[1] for item in result]
else:
return result
53 changes: 26 additions & 27 deletions easyocr/recognition.py
@@ -3,7 +3,6 @@
import torch.backends.cudnn as cudnn
import torch.utils.data
import torch.nn.functional as F
import torch.utils.data
import torchvision.transforms as transforms
import numpy as np
from collections import OrderedDict
@@ -15,13 +14,13 @@
def contrast_grey(img):
high = np.percentile(img, 90)
low = np.percentile(img, 10)
return (high-low)/(high+low), high, low
return (high-low)/np.maximum(10, high+low), high, low

def adjust_contrast_grey(img, target = 0.4):
contrast, high, low = contrast_grey(img)
contrast, high, low = contrast_grey(img)
if contrast < target:
img = img.astype(int)
ratio = 200./(high-low)
ratio = 200./np.maximum(10, high-low)
img = (img - low + 25)*ratio
img = np.maximum(np.full(img.shape, 0) ,np.minimum(np.full(img.shape, 255), img)).astype(np.uint8)
return img
@@ -48,7 +47,7 @@ def __call__(self, img):
class ListDataset(torch.utils.data.Dataset):

def __init__(self, image_list):
self.image_list = image_list
self.image_list = image_list
self.nSamples = len(image_list)

def __len__(self):
@@ -58,7 +57,7 @@ def __getitem__(self, index):
img = self.image_list[index]

return Image.fromarray(img, 'L')

class AlignCollate(object):

def __init__(self, imgH=32, imgW=100, keep_ratio_with_pad=False, adjust_contrast = 0.):
@@ -80,7 +79,7 @@ def __call__(self, batch):
w, h = image.size
#### augmentation here - change contrast
if self.adjust_contrast > 0:
image = np.array(image.convert("L"))
image = np.array(image.convert("L"))
image = adjust_contrast_grey(image, target = self.adjust_contrast)
image = Image.fromarray(image, 'L')

@@ -94,10 +93,10 @@ def __call__(self, batch):
resized_images.append(transform(resized_image))

image_tensors = torch.cat([t.unsqueeze(0) for t in resized_images], 0)
return image_tensors
return image_tensors

def recognizer_predict(model, converter, test_loader, batch_max_length,\
ignore_idx, char_group_idx, decoder = 'greedy', beamWidth= 5, device = 'cpu'):
ignore_idx, char_group_idx, decoder = 'greedy', beamWidth= 5, device = 'cpu'):
model.eval()
result = []
with torch.no_grad():
@@ -107,7 +106,7 @@ def recognizer_predict(model, converter, test_loader, batch_max_length,\
# For max length prediction
length_for_pred = torch.IntTensor([batch_max_length] * batch_size).to(device)
text_for_pred = torch.LongTensor(batch_size, batch_max_length + 1).fill_(0).to(device)

preds = model(image, text_for_pred)

# Select max probabilty (greedy decoding) then decode index to character
@@ -120,7 +119,7 @@ def recognizer_predict(model, converter, test_loader, batch_max_length,\
pred_norm = preds_prob.sum(axis=2)
preds_prob = preds_prob/np.expand_dims(pred_norm, axis=-1)
preds_prob = torch.from_numpy(preds_prob).float().to(device)

if decoder == 'greedy':
# Select max probabilty (greedy decoding) then decode index to character
_, preds_index = preds_prob.max(2)
@@ -135,21 +134,21 @@ def recognizer_predict(model, converter, test_loader, batch_max_length,\
preds_str = converter.decode_wordbeamsearch(k, beamWidth=beamWidth)

preds_max_prob, _ = preds_prob.max(dim=2)

for pred, pred_max_prob in zip(preds_str, preds_max_prob):
confidence_score = pred_max_prob.cumprod(dim=0)[-1]
result.append([pred, confidence_score.item()])

return result

def get_recognizer(input_channel, output_channel, hidden_size, character,\
separator_list, dict_list, model_path, device = 'cpu'):

converter = CTCLabelConverter(character, separator_list, dict_list)
num_class = len(converter.character)
model = Model(input_channel, output_channel, hidden_size, num_class)
if device == 'cpu':

if device == 'cpu':
state_dict = torch.load(model_path, map_location=device)
new_state_dict = OrderedDict()
for key, value in state_dict.items():
@@ -159,56 +158,56 @@ def get_recognizer(input_channel, output_channel, hidden_size, character,\
else:
model = torch.nn.DataParallel(model).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))

return model, converter

def get_text(character, imgH, imgW, recognizer, converter, image_list,\
ignore_char = '',decoder = 'greedy', beamWidth =5, batch_size=1, contrast_ths=0.1,\
adjust_contrast=0.5, filter_ths = 0.003, workers = 1, device = 'cpu'):
adjust_contrast=0.5, filter_ths = 0.003, workers = 1, device = 'cpu'):
batch_max_length = int(imgW/10)

char_group_idx = {}
ignore_idx = []
for char in ignore_char:
try: ignore_idx.append(character.index(char)+1)
except: pass

coord = [item[0] for item in image_list]
img_list = [item[1] for item in image_list]
AlignCollate_normal = AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True)
test_data = ListDataset(img_list)
test_data = ListDataset(img_list)
test_loader = torch.utils.data.DataLoader(
test_data, batch_size=batch_size, shuffle=False,
num_workers=int(workers), collate_fn=AlignCollate_normal, pin_memory=True)

# predict first round
result1 = recognizer_predict(recognizer, converter, test_loader,batch_max_length,\
ignore_idx, char_group_idx, decoder, beamWidth, device = device)

# predict second round
low_confident_idx = [i for i,item in enumerate(result1) if (item[1] < contrast_ths)]
if len(low_confident_idx) > 0:
img_list2 = [img_list[i] for i in low_confident_idx]
if len(low_confident_idx) > 0:
img_list2 = [img_list[i] for i in low_confident_idx]
AlignCollate_contrast = AlignCollate(imgH=imgH, imgW=imgW, keep_ratio_with_pad=True, adjust_contrast=adjust_contrast)
test_data = ListDataset(img_list2)
test_data = ListDataset(img_list2)
test_loader = torch.utils.data.DataLoader(
test_data, batch_size=batch_size, shuffle=False,
num_workers=int(workers), collate_fn=AlignCollate_contrast, pin_memory=True)
result2 = recognizer_predict(recognizer, converter, test_loader, batch_max_length,\
ignore_idx, char_group_idx, decoder, beamWidth, device = device)

result = []
for i, zipped in enumerate(zip(coord, result1)):
box, pred1 = zipped
if i in low_confident_idx:
if i in low_confident_idx:
pred2 = result2[low_confident_idx.index(i)]
if pred1[1]>pred2[1]:
result.append( (box, pred1[0], pred1[1]) )
else:
result.append( (box, pred2[0], pred2[1]) )
else:
result.append( (box, pred1[0], pred1[1]) )

#confidence_score = pred_max_prob.cumprod(dim=0)[-1]
#if confidence_score.item() > filter_ths:
# print(pred, confidence_score.item())
23 changes: 4 additions & 19 deletions easyocr/utils.py
@@ -352,28 +352,13 @@ def four_point_transform(image, rect):

return warped

def contrast_grey(img):
high = np.percentile(img, 90)
low = np.percentile(img, 10)
return (high-low)/(high+low), high, low

def adjust_contrast_grey(img, target = 0.7):
contrast, high, low = contrast_grey(img)
if contrast < target:
img = img.astype(int)
ratio = 200./(high-low)
img = (img - low + 25)*ratio
img = np.maximum(np.full(img.shape, 0) ,np.minimum(np.full(img.shape, 255), img)).astype(np.uint8)
return img

def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5, width_ths = 1.0, add_margin = 0.05):
# poly top-left, top-right, low-right, low-left

horizontal_list, free_list,combined_list, merged_list = [],[],[],[]

for poly in polys:
slope_up = (poly[3]-poly[1])/(poly[2]-poly[0])
slope_down = (poly[5]-poly[7])/(poly[4]-poly[6])
slope_up = (poly[3]-poly[1])/np.maximum(10, (poly[2]-poly[0]))
slope_down = (poly[5]-poly[7])/np.maximum(10, (poly[4]-poly[6]))
if max(abs(slope_up), abs(slope_down)) < slope_ths:
x_max = max([poly[0],poly[2],poly[4],poly[6]])
x_min = min([poly[0],poly[2],poly[4],poly[6]])
@@ -384,8 +369,8 @@ def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5,
height = np.linalg.norm( [poly[6]-poly[0],poly[7]-poly[1]])
margin = int(1.44*add_margin*height)

theta13 = abs(np.arctan( (poly[1]-poly[5])/(poly[0]-poly[4]) ))
theta24 = abs(np.arctan( (poly[3]-poly[7])/(poly[2]-poly[6]) ))
theta13 = abs(np.arctan( (poly[1]-poly[5])/np.maximum(10, (poly[0]-poly[4]))))
theta24 = abs(np.arctan( (poly[3]-poly[7])/np.maximum(10, (poly[2]-poly[6]))))
# do I need to clip minimum, maximum value here?
x1 = poly[0] - np.cos(theta13)*margin
y1 = poly[1] - np.sin(theta13)*margin
Binary file added examples/easyocr_framework.jpeg
