From 104e4a23f16589c8754103e00456426ecdcc82b1 Mon Sep 17 00:00:00 2001 From: Rodrigo V Honorato Date: Thu, 26 May 2022 16:53:03 +0200 Subject: [PATCH] Implement v2.0.0 (#16) * major code update * add missing docstring * linting :goat: --- .flake8 | 3 + .gitignore | 161 +++++++- .isort.cfg | 2 + CONTRIBUTE.md => CONTRIBUTING.md | 9 +- README.md | 106 ++--- README.rst | 2 +- cazy_parser/create_cazy_db.py | 352 ----------------- cazy_parser/extract_cazy_ids.py | 182 --------- pyproject.toml | 2 + requirements.txt | 2 + setup.py | 70 +--- src/cazy_parser/__init__.py | 7 + src/cazy_parser/cli.py | 119 ++++++ .../cazy_parser/modules}/__init__.py | 0 src/cazy_parser/modules/fasta.py | 54 +++ src/cazy_parser/modules/html.py | 374 ++++++++++++++++++ src/cazy_parser/utils.py | 23 ++ src/cazy_parser/version.py | 1 + tests/test_fasta.py | 26 ++ tests/test_html.py | 106 +++++ tests/test_utils.py | 7 + 21 files changed, 922 insertions(+), 686 deletions(-) create mode 100644 .flake8 create mode 100644 .isort.cfg rename CONTRIBUTE.md => CONTRIBUTING.md (60%) delete mode 100755 cazy_parser/create_cazy_db.py delete mode 100755 cazy_parser/extract_cazy_ids.py create mode 100644 pyproject.toml create mode 100644 src/cazy_parser/__init__.py create mode 100644 src/cazy_parser/cli.py rename {cazy_parser => src/cazy_parser/modules}/__init__.py (100%) create mode 100644 src/cazy_parser/modules/fasta.py create mode 100644 src/cazy_parser/modules/html.py create mode 100644 src/cazy_parser/utils.py create mode 100644 src/cazy_parser/version.py create mode 100644 tests/test_fasta.py create mode 100644 tests/test_html.py create mode 100644 tests/test_utils.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8dd399a --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203 diff --git a/.gitignore b/.gitignore index ddd21ed..5a14f6b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ dist/ -paper/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ -build/ -cazy_parser.egg-info/ + +# VScode +.vscode/ + +# Project-specific +*.csv +*.chk +*.fasta diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..f238bf7 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = black diff --git a/CONTRIBUTE.md b/CONTRIBUTING.md similarity index 60% rename from CONTRIBUTE.md rename to CONTRIBUTING.md index 0c264ef..9cf83f0 100644 --- a/CONTRIBUTE.md +++ b/CONTRIBUTING.md @@ -1,17 +1,10 @@ -## cazy-parser -*A way to extract specific information from the Carbohydrate-Active enZYmes.* - -# How to contribute? +# How to contribute to cazy-parser? There are still a few features that could be implemented, such as: * Organism specific selection * Retrieve three dimensional structures for each family -and specially - -* **Retrieve fasta sequences from NCBIs servers** - ___ Feel free to contact me with **suggestions**, **bugs reports** or if you need any **assistance** running the software. diff --git a/README.md b/README.md index b3a1be4..1b37bbd 100644 --- a/README.md +++ b/README.md @@ -18,105 +18,57 @@ License: [GNU GPLv3](https://www.gnu.org/licenses/gpl-3.0.html) doi: 10.21105/joss.00053 -# IMPORTANT - -Due to changes in the CAZy database, the parser is no longer functional, I will try to revive the code and update it soon. (: ## Introduction *cazy-parser* is a tool that extract information from [CAZy](http://www.cazy.org/) in a more usable and readable format. Firstly, a script reads the HTML structure and creates a mirror of the database as a tab delimited file. Secondly, information is extracted from the database according to user inputted parameters and presented to the user as a set of accession codes. ## Install / Upgrade -`$ pip install --upgrade cazy-parser` - -or - -Download latest source from [this link](https://pypi.python.org/pypi/cazy-parser) - ``` -$ tar -zxvf cazy-parser-x.x.x.tar.gz -$ cd cazy-parser-x.x.x -$ python setup.py install - +$ pip install --upgrade cazy-parser ``` -Note: It my be necessary to open a new terminal. 
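Once installed, you can sanity-check the new `cazy-parser` console entry point (wired up in `setup.py` as `cazy_parser.cli:maincli`). The exact output below is an assumption based on the `-v`/`--version` flag defined in `cli.py`:

```
$ cazy-parser -v
Running cazy-parser v2.0.0
```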
## Usage  -*Internet connection required*  -1) Database creation - -`$ create_cazy_db` - -(-h for help) -* This script will parse the [CAZy](http://www.cazy.org/) database website and create a comma separated table containing the following information: - * domain - * protein_name - * family - * tag *(characterized status)* - * organism_code - * [EC](http://www.enzyme-database.org/) number (ec stands for enzyme comission number) - * [GENBANK](https://www.ncbi.nlm.nih.gov/genbank/) id - * [UNIPROT](https://www.uniprot.org) code - * subfamily - * organism - * [PDB](http://www.rcsb.org/) code - -2) Extract accession codes - -* Based on the previously generated csv table, extract accession codes for a given protein family. - -`$ extract_cazy_ids --db --family ` - -(-h for help) -* Optional: - -`--subfamilies` Create a file for each subfamily, default = False - -`--characterized` Create a file containing only characterized enzymes, default = False - -## Usage examples - -1) Extract all accession codes from family 9 of Glycosyl Transferases. - -`$ extract_cazy_ids --db CAZy_DB_xx-xx-xxxx.csv --family GT9` - -This will generate the following files: -``` -GT9.csv -``` - -2) Extract all accession codes from family 43 of Glycoside Hydrolase, including subfamilies - -`$ extract_cazy_ids --db CAZy_DB_xx-xx-xxxx.csv --family GH43 --subfamilies` - -This will generate the following files: ``` -GH43.csv -GH43_sub1.csv -GH43_sub2.csv -GH43_sub3.csv -(...) -GH43_sub37.csv +cazy-parser -h +usage: cazy-parser [-h] [-f FAMILY] [-s SUBFAMILY] [-c CHARACTERIZED] [-v] {GH,GT,PL,CA,AA} + +positional arguments: + {GH,GT,PL,CA,AA} + +optional arguments: + -h, --help show this help message and exit + -f FAMILY, --family FAMILY + -s SUBFAMILY, --subfamily SUBFAMILY + -c CHARACTERIZED, --characterized CHARACTERIZED + -v, --version show version ``` -3) Extract all accession codes from family 42 of Polysaccharide Lyases including characterized entries - -`$ extract_cazy_ids --db CAZy_DB_xx-xx-xxxx.csv --family PL42 --characterized` +### Example -This will generate the following files: +Extract all fasta sequences from subfamily 1 of Glycoside Hydrolase family 43 ``` -PL42.csv -PL42_characterized.csv +$ cazy-parser GH -f 43 -s 1 + [2022-05-26 16:39:21,511 91 INFO] ------------------------------------------ + [2022-05-26 16:39:21,511 92 INFO] + [2022-05-26 16:39:21,511 93 INFO] ┌─┐┌─┐┌─┐┬ ┬ ┌─┐┌─┐┬─┐┌─┐┌─┐┬─┐ + [2022-05-26 16:39:21,511 94 INFO] │ ├─┤┌─┘└┬┘───├─┘├─┤├┬┘└─┐├┤ ├┬┘ + [2022-05-26 16:39:21,511 95 INFO] └─┘┴ ┴└─┘ ┴ ┴ ┴ ┴┴└─└─┘└─┘┴└─ v2.0.0 + [2022-05-26 16:39:21,511 96 INFO] + [2022-05-26 16:39:21,511 97 INFO] ------------------------------------------ + [2022-05-26 16:39:21,511 183 INFO] Fetching links for Glycoside-Hydrolases, url: http://www.cazy.org/Glycoside-Hydrolases.html + [2022-05-26 16:39:22,454 189 INFO] Only using links of family 43 subfamily 1 + [2022-05-26 16:39:23,029 26 INFO] Downloading 1415 fasta sequences... + [2022-05-26 16:40:32,187 51 INFO] Dumping fasta sequences to file GH43_1_26052022.fasta ``` -### Download fasta sequences - -Go to [NCBI's Batch Entrez](https://www.ncbi.nlm.nih.gov/sites/batchentrez) change the database to protein and submit the generated `.csv`. +This will generate a file named `GH43_1_DDMMYYYY.fasta` (date-stamped, as in the run above) containing the fasta sequences.
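The same workflow the CLI drives can also be scripted against the modules added in this patch. A minimal sketch using `retrieve_genbank_ids` and `dump_fastas` as defined under `src/cazy_parser/modules/` (the output filename here is illustrative, not the CLI's dated default):

```python
from cazy_parser.modules.fasta import dump_fastas
from cazy_parser.modules.html import retrieve_genbank_ids

# Use the full class name, i.e. the value side of cazy_parser.ENZYME_LIST
# ("GH" -> "Glycoside-Hydrolases")
id_list = retrieve_genbank_ids("Glycoside-Hydrolases", family=43, subfamily=1)

# Downloads the sequences from NCBI via Entrez, then writes them to the given file
dump_fastas(id_list, "GH43_1.fasta")
```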
## To-do and how to contribute -Please refer to CONTRIBUTE.md +Please refer to [CONTRIBUTING](CONTRIBUTING.md) (: diff --git a/README.rst b/README.rst index f5cb2b1..2efceed 100644 --- a/README.rst +++ b/README.rst @@ -3,7 +3,7 @@ cazy-parser The `Carbohydrate-Active enZYmes Database (CAZy) `_ provides access to a sequence based classification of enzyme that are responsible for the assembly, modification and breakdown of oligo and polysaccharides. -This database has been online for eighteen years providing relevant genomic, structural and biochemical data on carbohydrate-active enzymes, such asglycoside hydrolases, glycosyl transferases, polysaccharide lyases, carbohydrateesterases and similar enzymes with auxiliary activities. The database isorganized and presented to the user as a series of highly annotated HTML tables. +This database has been online for several years providing relevant genomic, structural and biochemical data on carbohydrate-active enzymes, such as glycoside hydrolases, glycosyl transferases, polysaccharide lyases, carbohydrate esterases and similar enzymes with auxiliary activities. The database is organized and presented to the user as a series of highly annotated HTML tables. This script provides a way to extract information from the database according to user need. diff --git a/cazy_parser/create_cazy_db.py b/cazy_parser/create_cazy_db.py deleted file mode 100755 index 92af863..0000000 --- a/cazy_parser/create_cazy_db.py +++ /dev/null @@ -1,352 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# ==============================================================================# -# Copyright (C) 2016 Rodrigo Honorato -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program.
If not, see -# ==============================================================================# - -# ==============================================================================# -# create a parsed database using CAZY html structure -import os -import re -import string -import sys -import time -import urllib - -import progressbar -from bs4 import BeautifulSoup - -# ==============================================================================# - -checkpoint_f = "cp.chk" -fam_checkpoint_f = "fam.chk" - - -def init_bar(): - """Initilize progress bar""" - bar = progressbar.ProgressBar( - widgets=[ - " ", - progressbar.Timer(), - " ", - progressbar.Percentage(), - " ", - progressbar.Bar("█", "[", "]"), - " ", - progressbar.ETA(), - " ", - ] - ) - return bar - - -def fetch_species(): - """Gather species names and IDs (full genome only)""" - print("> Gathering species with full genomes") - # a = archea // b = bacteria // e = eukaryota // v = virus - species_domain_list = ["a", "b", "e", "v"] - species_dic = {} - - bar = init_bar() - bar.max_value = len(string.ascii_uppercase) * len(species_domain_list) - bar.start() - - counter = 0 - for domain in species_domain_list: - for initial in string.ascii_uppercase: - counter += 1 - bar.update(counter) - link = "http://www.cazy.org/%s%s.html" % (domain, initial) - f = urllib.request.urlopen(link) - species_list_hp = f.read().decode("utf-8") - # parse webpage - index_list = re.findall( - '"http://www.cazy.org/{}(\d.*).html" class="nav">(.*)'.format( - species_domain_list - ), - species_list_hp, - ) - for entry in index_list: - index, species = entry - try: - species_dic[species].append(index) - except KeyError: - species_dic[species] = [index] - bar.finish() - - # Double check to see which of the species codes are valid - for j, species in enumerate(species_dic.keys()): - entry_list = species_dic[species] - if len(entry_list) > 1: - # More than one entry for this species - # > This is (likely) a duplicate - # > Use the higher number, should be the newer page - newer_entry = max([int(i.split("b")[-1]) for i in entry_list]) - selected_entry = "b%i" % newer_entry - - species_dic[species] = selected_entry - else: - species_dic[species] = species_dic[species][0] - - return species_dic - - -def fetch_families(main_link): - """Identify family link structure and return a list""" - soup = BeautifulSoup(urllib.request.urlopen(main_link), features="html.parser") - family_table = soup.findAll(name="table")[0] - rows = family_table.findAll(name="td") - family_list = [ - str(r.find("a")["href"].split("/")[-1].split(".html")[0]) for r in rows - ] - - return family_list - - -def fetch_links(enzyme_class): - """Fetch link structure for an enzyme class""" - - # Links need to be fetched everytime since there is no way (?) to guarantee - # that all were previously fetched without actually fetching them. 
- main_class_link = "http://www.cazy.org/%s.html" % enzyme_class - - family_list = fetch_families(main_class_link) - - bar = init_bar() - bar.max_value = len(family_list) - bar.start() - - page_list = [] - for j, family in enumerate(family_list): - bar.update(j + 1) - main_link = "http://www.cazy.org/%s.html" % family - - # TODO: Implement checkpoint for link fetching - - family_soup = BeautifulSoup( - urllib.request.urlopen(main_link), features="html.parser" - ) - superfamily_list = [ - l.find("a")["href"] - for l in family_soup.findAll("span", attrs={"class": "choix"}) - ][1:] - - superfamily_list = [ - f for f in superfamily_list if "structure" not in f - ] # remove structure tab for parsing - for main_link in superfamily_list: - page_zero = main_link - soup = BeautifulSoup( - urllib.request.urlopen(main_link), features="html.parser" - ) - - # Get page list for the family - page_index_list = soup.findAll(name="a", attrs={"class": "lien_pagination"}) - if bool(page_index_list): - - # =====================# - # be careful with this # - first_page_idx = int(re.findall("=(\d*)#", str(page_index_list[0]))[0]) - last_page_idx = int(re.findall("=(\d*)#", str(page_index_list[-2]))[0]) - # =====================# - - page_list.append(page_zero) - for i in range( - first_page_idx, last_page_idx + first_page_idx, first_page_idx - ): - link = ( - "http://www.cazy.org/" - + page_index_list[0]["href"].split("=")[0] - + "=" - + str(i) - ) - page_list.append(link) - - else: - page_list.append(page_zero) - - return page_list - - -def check_status(chk_f, link): - """Check if contents of this link have been downloaded""" - - if os.path.isfile(chk_f): - for l in open(chk_f): - if link in l: - return True - - -def save_status(chk_f, link): - """Save link address in checkpoint file""" - open(chk_f, "a").write("{}\n".format(link)) - - -def fetch_data(link_list, species_dic, out_f): - """Parse list of links and extract information""" - - bar = init_bar() - bar.max_value = len(link_list) - bar.start() - - protein_counter = 0 - for j, link in enumerate(link_list): - - db_dic = {} - bar.update(j + 1) - - if check_status(checkpoint_f, link): - continue - - # tr = rows // # td = cells - soup = BeautifulSoup(urllib.request.urlopen(link), features="html.parser") - table = soup.find("table", attrs={"class": "listing"}) - domain = "" - family = link.split(".org/")[-1].split("_")[0] - - # consistency check to look for deleted families. i.e. 
GH21 - try: - table.findAll("tr") - except AttributeError: - # not a valid link, move on - continue - for row in table.findAll("tr"): - try: - if row["class"] == "royaume" and row.text != "Top": - domain = str(row.text).lower() - except KeyError: - pass - tds = row.findAll("td") - if len(tds) > 1 and tds[0].text != "Protein Name": - # valid line - db_dic[protein_counter] = {} - db_dic[protein_counter]["protein_name"] = tds[0].text.replace( - " ", "" - ) - db_dic[protein_counter]["family"] = family - db_dic[protein_counter]["domain"] = domain - db_dic[protein_counter]["ec"] = tds[1].text.replace(" ", "") - db_dic[protein_counter]["organism"] = tds[2].text.replace(" ", "") - try: - db_dic[protein_counter]["genbank"] = ( - tds[3].find("a").text.replace(" ", "") - ) # get latest entry - except AttributeError: - # there is a crazy aberration when there is no genbank available - db_dic[protein_counter]["genbank"] = "unavailable" - - db_dic[protein_counter]["uniprot"] = tds[4].text.replace(" ", "") - db_dic[protein_counter]["pdb"] = tds[5].text.replace(" ", "") - - # check if this is species has a complete genome - try: - db_dic[protein_counter]["organism_code"] = species_dic[ - tds[2].text.replace(" ", "") - ] - except KeyError: - db_dic[protein_counter]["organism_code"] = "invalid" - - # check if there are subfamilies - try: - db_dic[protein_counter]["subfamily"] = tds[6].text.replace( - " ", "" - ) - except KeyError: - db_dic[protein_counter]["subfamily"] = "" - - # if 'characterized' in main_link: - if "characterized" in link: - db_dic[protein_counter]["tag"] = "characterized" - else: - db_dic[protein_counter]["tag"] = "" - - protein_counter += 1 - - save_output(out_f, db_dic) - save_status(checkpoint_f, link) - - bar.finish() - - -def save_output(output_f, d_dic): - """Save information as .csv file""" - if not os.path.isfile(output_f): - out = open(output_f, "w") - header = ",".join(d_dic[0].keys()) + "\n" - out.write(header) - else: - out = open(output_f, "a") - - for p in d_dic: - tbw = ",".join(list(d_dic[p].values())).replace("\n", "") - tbw += "\n" - out.write(tbw) - out.close() - - -def clean(out_f): - """Remove duplicates from output file""" - file_l = open(out_f).readlines() - new_file_l = list(set(file_l)) - open(out_f, "w").write("".join(new_file_l)) - del file_l - del new_file_l - - -def logo(): - version = "1.4.2" - print("") - print("┌─┐┌─┐┌─┐┬ ┬ ┌─┐┌─┐┬─┐┌─┐┌─┐┬─┐") - print("│ ├─┤┌─┘└┬┘───├─┘├─┤├┬┘└─┐├┤ ├┬┘") - print("└─┘┴ ┴└─┘ ┴ ┴ ┴ ┴┴└─└─┘└─┘┴└─ v{}".format(version)) - print("") - print("This is the database creator script") - print("") - print("(get a coffee, this will take a while)") - print("") - - -def main(): - - logo() - - species_dic = fetch_species() - - enzyme_list = [ - "Glycoside-Hydrolases", - "GlycosylTransferases", - "Polysaccharide-Lyases", - "Carbohydrate-Esterases", - "Auxiliary-Activities", - ] - - for enzyme_name in enzyme_list: - print(">> {}".format(enzyme_name)) - output_f = "CAZy_DB_{}_{}.csv".format(enzyme_name, time.strftime("%d-%m-%Y")) - - print("> Fetching links") - page_list = fetch_links(enzyme_name) - - print("> Gathering data") - fetch_data(page_list, species_dic, output_f) - - clean(output_f) - - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/cazy_parser/extract_cazy_ids.py b/cazy_parser/extract_cazy_ids.py deleted file mode 100755 index d0c8ca9..0000000 --- a/cazy_parser/extract_cazy_ids.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python -# 
==============================================================================# -# Copyright (C) 2016 Rodrigo Honorato -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# ==============================================================================# - -import argparse -import sys - - -def logo(): - version = "1.4.2" - print("") - print("┌─┐┌─┐┌─┐┬ ┬ ┌─┐┌─┐┬─┐┌─┐┌─┐┬─┐") - print("│ ├─┤┌─┘└┬┘───├─┘├─┤├┬┘└─┐├┤ ├┬┘") - print("└─┘┴ ┴└─┘ ┴ ┴ ┴ ┴┴└─└─┘└─┘┴└─ v{}".format(version)) - print("") - print("This is the accession code retrieval script") - print("") - - -def load_db(f): - db_f = open(f).readlines() - sep = "," - header = db_f[0].split(sep) - - print(">> Loading {}".format(f)) - db = {} - for i, l in enumerate(db_f[1:]): - db[i] = {} - data = l.split(sep) - # init dictionary - for c, idx in enumerate(header): - try: - db[i][idx] = data[c] - except IndexError: - db[i][idx] = "" - return db - - -def select_fam(fam, db): - selection_list = [] - for e in db: - if db[e]["family"] == fam: - selection_list.append(db[e]["genbank"]) - - print( - ">> Retrieving all {} accession codes for Family {}".format( - len(selection_list), fam - ) - ) - - sele_l = list(set(selection_list)) - out_f = "%s.csv" % fam - print(">> Creating {}".format(out_f)) - out = open(out_f, "w") - out.write("\n".join(sele_l)) - out.close() - - -def select_subfam(fam, db): - selection_dic = {} - for e in db: - if db[e]["family"] == fam and bool(db[e]["subfamily"]): - try: - selection_dic[db[e]["subfamily"]].append(db[e]["genbank"]) - except KeyError: - selection_dic[db[e]["subfamily"]] = [db[e]["genbank"]] - - # output - print( - ">> Retrieving accession codes for {} subfamilies for Family {}".format( - len(selection_dic), fam - ) - ) - for sub in selection_dic: - s_l = list(set(selection_dic[sub])) - out_f = "{}_sub{}.csv".format(fam, sub) - print(">> Creating {}".format(out_f)) - out = open(out_f, "w") - out.write("\n".join(s_l)) - out.close() - - -def select_char(fam, db): - selection_list = [] - for e in db: - if db[e]["family"] == fam and bool(db[e]["tag"]): - selection_list.append(db[e]["genbank"]) - - print( - ">> Selecting {} CHARACTERIZED proteins for Family {}".format( - len(selection_list), fam - ) - ) - - out_f = "{}_characterized.csv".format(fam) - out = open(out_f, "w") - print(">> Creating {}".format(out_f)) - out.write("\n".join(selection_list)) - out.close() - - -def main(): - logo() - - parser = argparse.ArgumentParser( - description="Select accession codes for a given protein family. Optional: Select subfamilies and/or " - "characterized enzymes" - ) - - parser.add_argument( - "--db", - action="store", - dest="db_file", - help="Database file generated by cazy-parser", - ) - - parser.add_argument( - "--family", - action="store", - dest="target_family", - help="Family to be searched ex. 
GH5", - ) - - parser.add_argument( - "--subfamily", - action="store_true", - default=False, - help="(Optional) Create a file with accession codes for each subfamily", - ) - - parser.add_argument( - "--characterized", - action="store_true", - default=False, - help="(Optional) Create a file with accession codes only for characterized enzymes", - ) - - results = parser.parse_args() - - check = False - if results.db_file is None: - print("\n>> [ERROR] Missing database file\n") - check = True - - if results.target_family is None: - print("\n>> [ERROR] Missing target family\n") - check = True - - if check: - parser.print_help() - sys.exit(0) - - db = load_db(results.db_file) - - if bool(results.target_family): - select_fam(results.target_family, db) - - if bool(results.subfamily): - select_subfam(results.target_family, db) - - if bool(results.characterized): - select_char(results.target_family, db) - - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a15b339 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.pytest.ini_options] +pythonpath = ["src"] diff --git a/requirements.txt b/requirements.txt index 091857c..cb47bbe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ certifi==2022.5.18.1 progressbar2==4.0.0 python-utils==3.2.3 soupsieve==2.3.2.post1 +requests==2.27.1 +biopython==1.79 diff --git a/setup.py b/setup.py index eb41e68..8f33578 100644 --- a/setup.py +++ b/setup.py @@ -1,17 +1,7 @@ -"""A setuptools based setup module. - -See: -https://packaging.python.org/en/latest/distributing.html -https://github.com/pypa/sampleproject -""" - -# To use a consistent encoding from codecs import open from os import path -# Always prefer setuptools over distutils from setuptools import find_packages, setup -from setuptools.command.install import install here = path.abspath(path.dirname(__file__)) @@ -23,78 +13,32 @@ required = f.read().splitlines() -class CazyParser(install): - def run(self): - install.run(self) - - setup( name="cazy-parser", - # Versions should comply with PEP440. For a discussion on single-sourcing - # the version across setup.py and the project code, see - # https://packaging.python.org/en/latest/single_source_version.html - version="1.4.2", + version="2.0.0", description="A way to extract specific information from CAZy", long_description=long_description, - # The project's main homepage. url="https://github.com/rodrigovrgs/cazy-parser", - # Author details author="Rodrigo Honorato", - author_email="rvhonorato@gmail.com", - # Choose your license + author_email="rvhonorato@protonmail.com", license="GPL3", # See https://pypi.python.org/pypi?%3Aaction=list_classifiers classifiers=[ - # How mature is this project? Common values are - # 3 - Alpha - # 4 - Beta - # 5 - Production/Stable "Development Status :: 5 - Production/Stable", - # Indicate who your project is intended for "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Bio-Informatics", - # Pick your license as you wish (should match "license" above) "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", - # Specify the Python versions you support here. In particular, ensure - "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.9", ], - # What does your project relate to? keywords="cazy database datamining", - # You can just specify the packages manually here if your project is - # simple. Or you can use find_packages(). 
- packages=find_packages(exclude=["contrib", "docs", "tests"]), - # Alternatively, if you want to distribute just a my_module.py, uncomment - # this: - # py_modules=["my_module"], - # List run-time dependencies here. These will be installed by pip when - # your project is installed. For an analysis of "install_requires" vs pip's - # requirements files see: - # https://packaging.python.org/en/latest/requirements.html + packages=find_packages("src"), install_requires=required, - # List additional groups of dependencies here (e.g. development - # dependencies). You can install these using the following syntax, - # for example: - # $ pip install -e .[dev,test] extras_require={}, - # If there are data files included in your packages that need to be - # installed, specify them here. If using Python 2.6 or less, then these - # have to be included in MANIFEST.in as well. - package_data={ - "cazy-parser": [], - }, - # Although 'package_data' is the preferred approach, in some case you may - # need to place data files outside of your packages. See: - # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa - # In this case, 'data_file' will be installed into '/my_data' - # data_files=[], - # To provide executable scripts, use entry points in preference to the - # "scripts" keyword. Entry points provide cross-platform support and allow - # pip to create the appropriate form of executable for the target platform. + package_data={"cazy-parser": []}, + package_dir={"": "src"}, entry_points={ "console_scripts": [ - "create_cazy_db=cazy_parser.create_cazy_db:main", - "extract_cazy_ids=cazy_parser.extract_cazy_ids:main", + "cazy-parser=cazy_parser.cli:maincli", ], }, - cmdclass={"install": CazyParser}, ) diff --git a/src/cazy_parser/__init__.py b/src/cazy_parser/__init__.py new file mode 100644 index 0000000..59879c7 --- /dev/null +++ b/src/cazy_parser/__init__.py @@ -0,0 +1,7 @@ +ENZYME_LIST = { + "GH": "Glycoside-Hydrolases", + "GT": "GlycosylTransferases", + "PL": "Polysaccharide-Lyases", + "CA": "Carbohydrate-Esterases", + "AA": "Auxiliary-Activities", +} diff --git a/src/cazy_parser/cli.py b/src/cazy_parser/cli.py new file mode 100644 index 0000000..074fcb0 --- /dev/null +++ b/src/cazy_parser/cli.py @@ -0,0 +1,119 @@ +import argparse +import logging +import sys +import time + +from cazy_parser import ENZYME_LIST +from cazy_parser.modules.fasta import dump_fastas +from cazy_parser.modules.html import retrieve_genbank_ids +from cazy_parser.version import VERSION + +log = logging.getLogger("cazylog") +ch = logging.StreamHandler() +formatter = logging.Formatter(" [%(asctime)s %(lineno)d %(levelname)s] %(message)s") +ch.setFormatter(formatter) +log.addHandler(ch) +log.setLevel("DEBUG") + + +# CONFIG = json.load(open(Path(Path(__file__).parents[2], "etc/config.json"))) + +# =========================================================================================================== +# Define arguments +ap = argparse.ArgumentParser() + +ap.add_argument( + "enzyme_class", + choices=["GH", "GT", "PL", "CA", "AA"], +) + +ap.add_argument("-f", "--family", type=int) + +ap.add_argument("-s", "--subfamily") + +ap.add_argument("-c", "--characterized") + +ap.add_argument( + "-v", + "--version", + help="show version", + action="version", + version=f"Running {ap.prog} v{VERSION}", +) + + +def load_args(ap): + """ + Load argument parser. + + Parameters + ---------- + ap : argparse.ArgumentParser + Argument parser. 
+ + Returns + ------- + cmd : argparse.Namespace + Parsed command-line arguments. + + """ + return ap.parse_args() + + +# ====================================================================================# +# Define CLI +def cli(ap, main): + """ + Command-line interface entry point. + + Parameters + ---------- + ap : argparse.ArgumentParser + Argument parser. + main : function + Main function. + + """ + cmd = load_args(ap) + main(**vars(cmd)) + + +def maincli(): + """Execute main client.""" + cli(ap, main) + + +# ====================================================================================# +# Main code +def main(enzyme_class, family, subfamily, characterized): + """Main function.""" + + log.info("-" * 42) + log.info("") + log.info("┌─┐┌─┐┌─┐┬ ┬ ┌─┐┌─┐┬─┐┌─┐┌─┐┬─┐") + log.info("│ ├─┤┌─┘└┬┘───├─┘├─┤├┬┘└─┐├┤ ├┬┘") + log.info(f"└─┘┴ ┴└─┘ ┴ ┴ ┴ ┴┴└─└─┘└─┘┴└─ v{VERSION}") + log.info("") + log.info("-" * 42) + + if enzyme_class not in ENZYME_LIST: + log.error(f"Enzyme class {enzyme_class} not supported") + sys.exit() + else: + enzyme_name = ENZYME_LIST[enzyme_class] + + id_list = retrieve_genbank_ids(enzyme_name, family, subfamily, characterized) + + output_fname = f"{enzyme_class}" + if family: + output_fname += f"{family}" + if subfamily: + output_fname += f"_{subfamily}" + + today = time.strftime("%d%m%Y") + output_fname += f"_{today}.fasta" + dump_fastas(id_list, output_fname) + + +if __name__ == "__main__": + sys.exit(maincli()) diff --git a/cazy_parser/__init__.py b/src/cazy_parser/modules/__init__.py similarity index 100% rename from cazy_parser/__init__.py rename to src/cazy_parser/modules/__init__.py diff --git a/src/cazy_parser/modules/fasta.py b/src/cazy_parser/modules/fasta.py new file mode 100644 index 0000000..14bdc9b --- /dev/null +++ b/src/cazy_parser/modules/fasta.py @@ -0,0 +1,54 @@ +import logging +import os + +from Bio import Entrez, SeqIO + +Entrez.email = "your-email-here@example.org" + +log = logging.getLogger("cazylog") + + +def download_fastas(id_list): + """ + Download fasta files from NCBI. + + Parameters + ---------- + id_list : list + List of genbank ids. + + Returns + ------- + fasta_list : list + List of fasta strings. + + """ + log.info(f"Downloading {len(id_list)} fasta sequences...") + fasta_list = [] + handle = Entrez.efetch(db="protein", id=id_list, rettype="fasta", retmode="text") + for seq_record in SeqIO.parse(handle, "fasta"): + fasta_str = ( + f">{seq_record.description}{os.linesep}" f"{seq_record.seq}{os.linesep*2}" + ) + fasta_list.append(fasta_str) + + return fasta_list + + +def dump_fastas(id_list, output_f): + """ + Save the fasta strings to a file. + + Parameters + ---------- + id_list : list + List of genbank ids. + output_f : str + Path to the output file. + + """ + fasta_list = download_fastas(id_list) + log.info(f"Dumping fasta sequences to file {output_f}") + with open(output_f, "w") as fh: + for fasta in fasta_list: + fh.write(fasta) diff --git a/src/cazy_parser/modules/html.py b/src/cazy_parser/modules/html.py new file mode 100644 index 0000000..d065366 --- /dev/null +++ b/src/cazy_parser/modules/html.py @@ -0,0 +1,374 @@ +import itertools +import logging +import os +import re +import string +import sys +import urllib + +import requests +from bs4 import BeautifulSoup + +from cazy_parser.utils import init_bar + +log = logging.getLogger("cazylog") + + +def fetch_data(link_list): + """ + Parse list of links and extract information.
+ + Parameters + ---------- + link_list : list + List of links to parse + + + Returns + ------- + data : list + List of dictionaries containing information about each entry. + + """ + + data = [] + for link in link_list: + + if ".txt" in link: + link_data = get_data_from_txt(link) + else: + link_data = get_data_from_html(link) + + data.append(link_data) + + return list(itertools.chain(*data)) + + +def parse_td(td_list): + """ + Parse a table from the HTML page. + + Parameters + ---------- + td_list : list + List of table elements. + + Returns + ------- + data_dict : dict + Dictionary of data. + + """ + data_dict = {} + if len(td_list) <= 1 or td_list[0].text == "Protein Name": + # this is likely the header + return data_dict + + protein_name = td_list[0].text + ec = td_list[1].text + # reference = td_list[2].text + organism = td_list[3].text + try: + genbank = td_list[4].find("a").text + except AttributeError: + genbank = "unavailable" + + uniprot = td_list[5].text + pdb = td_list[6].text + + data_dict["protein_name"] = protein_name + data_dict["ec"] = ec + data_dict["organism"] = organism + data_dict["genbank"] = genbank + data_dict["uniprot"] = uniprot + data_dict["pdb"] = pdb + + return data_dict + + +def parse_table(table): + """ + Parse a beautiful soup table and retrieve information. + + Parameters + ---------- + table : bs4.element.Tag + Beautiful soup table. + + Returns + ------- + table_data : list + List of dictionaries containing information from the table. + + """ + table_data = [] + for tr in table.findAll("tr"): + tds = tr.findAll("td") + td_dic = parse_td(tds) + table_data.append(td_dic) + return table_data + + +def get_data_from_html(link): + """ + Retrieve information from the HTML page. + + Parameters + ---------- + link : str + Link to the page. + + Returns + ------- + data_list : list + List of dictionaries containing information from the page. + + """ + soup = BeautifulSoup(urllib.request.urlopen(link), features="html.parser") + table = soup.find("table", attrs={"class": "listing"}) + domain = "" + family = link.split(".org/")[-1].split("_")[0] + data_list = parse_table(table) + + tag = "characterized" if "characterized" in link else "" + # add more information to the data + for data in data_list: + data["tag"] = tag + data["family"] = family + data["domain"] = domain + + return data_list + + +def get_data_from_txt(link): + """ + Retrieve information from the TXT file. + + Parameters + ---------- + link : str + Link to the page. + + Returns + ------- + data_list : list + List of dictionaries containing information from the TXT file. + + """ + data_list = [] + response = requests.get(link) + tag = "characterized" if "characterized" in link else "" + for line in response.text.split(os.linesep): + data_dict = {} + data = line.split("\t") + + try: + family = data[0] + domain = data[1] + species = data[2] + gene = data[3] + + data_dict["family"] = family + data_dict["domain"] = domain + data_dict["species"] = species + data_dict["genbank"] = gene + data_dict["tag"] = tag + except IndexError: + # no data for this line + continue + + data_list.append(data_dict) + + return data_list + + +def fetch_links(enzyme_class, family, subfamily): + """ + Fetch link structure for an enzyme class. + + Parameters + ---------- + enzyme_class : str + Enzyme class to fetch links for. + family : int + Family number to filter the links by, if given. + subfamily : int + Subfamily number to filter the links by, if given. + + Returns + ------- + page_list : list + List of links to the pages.
+ + """ + + main_class_link = f"http://www.cazy.org/{enzyme_class}.html" + log.info(f"Fetching links for {enzyme_class}, url: {main_class_link}") + family_list = fetch_families(main_class_link) + + # Filter by family + if family: + if subfamily: + log.info(f"Only using links of family {family} subfamily {subfamily}") + family_list = [e for e in family_list if e[2:] == f"{family}_{subfamily}"] + else: + log.info(f"Only using links of family {family}") + family_list = [e for e in family_list if int(e[2:]) == family] + + if not family_list: + log.error("No links were found.") + sys.exit() + + page_list = [] + for j, family in enumerate(family_list): + # bar.update(j + 1) + family_link = f"http://www.cazy.org/{family}.html" + + # TODO: Implement checkpoint for link fetching + + family_soup = BeautifulSoup( + urllib.request.urlopen(family_link), features="html.parser" + ) + + # Find the links to the individual pages + superfamily_links = [] + for line in family_soup.findAll("span", attrs={"class": "choix"}): + _link = line.find("a")["href"] + if "krona" not in _link and "structure" not in _link: + superfamily_links.append(_link) + + for link in superfamily_links: + page_zero = link + try: + soup = BeautifulSoup( + urllib.request.urlopen(link), features="html.parser" + ) + except ValueError: + # This is a link to a text file + page_list.append(f"http://www.cazy.org/{link}") + continue + + # Get page list for the family + page_index_list = soup.findAll(name="a", attrs={"class": "lien_pagination"}) + if bool(page_index_list): + + # =====================# + # be careful with this # + first_page_idx = int(re.findall(r"=(\d*)#", str(page_index_list[0]))[0]) + last_page_idx = int(re.findall(r"=(\d*)#", str(page_index_list[-2]))[0]) + # =====================# + + page_list.append(page_zero) + page_range = range( + first_page_idx, last_page_idx + first_page_idx, first_page_idx + ) + for i in page_range: + sub_str = page_index_list[0]["href"].split("=")[0] + link = f"http://www.cazy.org/{sub_str}={i}" + page_list.append(link) + + else: + page_list.append(page_zero) + + return page_list + + +def fetch_families(link): + """ + Identify family link structure and return a list. + + Parameters + ---------- + link : str + Link to the page. + + Returns + ------- + family_link_list : list + List of family links. 
+ + """ + enzyme_regex = r"([A-Z]{2}\d*_?\d?).html" + soup = BeautifulSoup(urllib.request.urlopen(link), features="html.parser") + all_links = soup.find_all("a") + family_link_list = [] + for link in all_links: + try: + href = re.findall(enzyme_regex, link.attrs["href"])[0] + family_link_list.append(href) + except (IndexError, KeyError): + continue + + return list(set(family_link_list)) + + +def fetch_species(): + """Gather species names and IDs (full genome only).""" + log.info("> Gathering species with full genomes") + # a = archea // b = bacteria // e = eukaryota // v = virus + species_domain_list = ["a", "b", "e", "v"] + species_dic = {} + + bar = init_bar() + bar.max_value = len(string.ascii_uppercase) * len(species_domain_list) + bar.start() + + counter = 0 + for domain in species_domain_list: + for initial in string.ascii_uppercase: + counter += 1 + bar.update(counter) + link = f"http://www.cazy.org/{domain}{initial}.html" + f = urllib.request.urlopen(link) + species_list_hp = f.read().decode("utf-8") + # parse webpage + index_list = re.findall( + rf'"http://www.cazy.org/{species_domain_list}(\d.*).html"' + r' class="nav">(.*)', + species_list_hp, + ) + for entry in index_list: + index, species = entry + try: + species_dic[species].append(index) + except KeyError: + species_dic[species] = [index] + bar.finish() + + # Double check to see which of the species codes are valid + for j, species in enumerate(species_dic.keys()): + entry_list = species_dic[species] + if len(entry_list) > 1: + # More than one entry for this species + # > This is (likely) a duplicate + # > Use the higher number, should be the newer page + newer_entry = max([int(i.split("b")[-1]) for i in entry_list]) + selected_entry = "b%i" % newer_entry + + species_dic[species] = selected_entry + else: + species_dic[species] = species_dic[species][0] + + return species_dic + + +def retrieve_genbank_ids(enzyme_name, family=None, subfamily=None, characterized=None): + """ + Retrieve genbank IDs for a given enzyme. + + Parameters + ---------- + enzyme_name : str + Enzyme name to retrieve genbank IDs for. + family : int + Family number to retrieve genbank IDs for. + subfamily : int + Subfamily number to retrieve genbank IDs for. + characterized : bool + Whether to retrieve genbank IDs for characterized enzymes. + + Returns + ------- + genbank_id_list : list + List of genbank IDs. 
+ + """ + page_list = fetch_links(enzyme_name, family, subfamily) + data = fetch_data(page_list) + genbank_id_list = [] + for element in data: + if "genbank" in element: + genbank_id_list.append(element["genbank"]) + + return genbank_id_list diff --git a/src/cazy_parser/utils.py b/src/cazy_parser/utils.py new file mode 100644 index 0000000..2465209 --- /dev/null +++ b/src/cazy_parser/utils.py @@ -0,0 +1,23 @@ +import logging + +import progressbar + +log = logging.getLogger("cazylog") + + +def init_bar(): + """Initilize progress bar.""" + bar = progressbar.ProgressBar( + widgets=[ + " ", + progressbar.Timer(), + " ", + progressbar.Percentage(), + " ", + progressbar.Bar("█", "[", "]"), + " ", + progressbar.ETA(), + " ", + ] + ) + return bar diff --git a/src/cazy_parser/version.py b/src/cazy_parser/version.py new file mode 100644 index 0000000..2101409 --- /dev/null +++ b/src/cazy_parser/version.py @@ -0,0 +1 @@ +VERSION = "2.0.0" diff --git a/tests/test_fasta.py b/tests/test_fasta.py new file mode 100644 index 0000000..db2ec26 --- /dev/null +++ b/tests/test_fasta.py @@ -0,0 +1,26 @@ +import tempfile +from pathlib import Path + +import pytest + +from cazy_parser.modules.fasta import download_fastas, dump_fastas + + +@pytest.fixture +def id_list(): + return ["WP_010249057.1", "ABN51772.1"] + + +def test_download_fastas(id_list): + observed_fasta_list = download_fastas(id_list) + + assert len(observed_fasta_list) == 2 + assert observed_fasta_list[0][0] == ">" + + +def test_dump_fastas(id_list): + temp_f = tempfile.NamedTemporaryFile(delete=False) + dump_fastas(id_list, temp_f.name) + + assert Path(temp_f.name).exists() + assert Path(temp_f.name).stat().st_size != 0 diff --git a/tests/test_html.py b/tests/test_html.py new file mode 100644 index 0000000..eafb449 --- /dev/null +++ b/tests/test_html.py @@ -0,0 +1,106 @@ +import urllib + +import pytest +from bs4 import BeautifulSoup + +from cazy_parser.modules.html import ( + fetch_data, + fetch_families, + fetch_links, + fetch_species, + get_data_from_html, + get_data_from_txt, + parse_table, + parse_td, + retrieve_genbank_ids, +) + + +@pytest.fixture +def soup(): + soup = BeautifulSoup( + urllib.request.urlopen("http://www.cazy.org/GH173_characterized.html"), + features="html.parser", + ) + return soup + + +@pytest.fixture +def table(soup): + return soup.find("table", attrs={"class": "listing"}) + + +@pytest.fixture +def td_list(table): + td_list = [tr.findAll("td") for tr in table.findAll("tr")][3] + return td_list + + +def test_fetch_data(): + observed_data = fetch_data( + [ + "http://www.cazy.org/GH173_characterized.html", + "http://www.cazy.org/IMG/cazy_data/GH173.txt", + ] + ) + + assert [e for e in observed_data if "tag" in e] + assert [e for e in observed_data if "family" in e] + + +def test_parse_td(td_list): + observed_data_dic = parse_td(td_list) + + assert "protein_name" in observed_data_dic + assert "pdb" in observed_data_dic + assert "ec" in observed_data_dic + + +def test_parse_table(table): + observed_data = parse_table(table) + + assert [e for e in observed_data if "protein_name" in e] + + +def test_get_data_from_html(): + observed_data = get_data_from_html( + "http://www.cazy.org/GH173" "_characterized.html" + ) + + assert observed_data + assert [e for e in observed_data if "protein_name" in e] + + +def test_get_data_from_txt(): + observed_data = get_data_from_txt("http://www.cazy.org/IMG/cazy_data/GH173.txt") + + assert [e for e in observed_data if "family" in e] + + +def test_fetch_links(): + observed_links = 
fetch_links("Carbohydrate-Esterases", family=None, subfamily=None) + + assert "http://www.cazy.org/CE20_characterized.html" in observed_links + assert "http://www.cazy.org/IMG/cazy_data/CE20.txt" in observed_links + + +def test_fetch_families(): + observed_families = fetch_families("http://www.cazy.org/Glycoside-Hydrolases.html") + + assert "GH1" in observed_families + + +def test_fetch_species(): + observed_species = fetch_species() + + assert observed_species + assert isinstance(observed_species, dict) + + +def test_retrieve_genbank_ids(): + observed_id_list = retrieve_genbank_ids( + enzyme_name="Glycoside-Hydrolases", family=5, subfamily=1 + ) + + assert observed_id_list + assert len(observed_id_list) == 1163 diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..de390c8 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,7 @@ +from cazy_parser.utils import init_bar + + +def test_init_bar(): + bar = init_bar() + + assert bar
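A closing note on `modules/fasta.py`: Biopython's Entrez interface expects a valid contact e-mail, and the module hard-codes a placeholder (`your-email-here@example.org`) at import time. A minimal sketch of overriding it before downloading, assuming only what the patch itself defines (the address below is a stand-in; the accession IDs are borrowed from `tests/test_fasta.py`):

```python
from Bio import Entrez

from cazy_parser.modules.fasta import download_fastas

# NCBI asks for a real contact address; override the placeholder set in fasta.py
Entrez.email = "you@example.org"

fasta_list = download_fastas(["WP_010249057.1", "ABN51772.1"])
print(fasta_list[0].splitlines()[0])  # the ">..." description line of the first record
```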