From 104e4a23f16589c8754103e00456426ecdcc82b1 Mon Sep 17 00:00:00 2001 From: Rodrigo V Honorato Date: Thu, 26 May 2022 16:53:03 +0200 Subject: [PATCH] Implement v2.0.0 (#16) * major code update * add missing docstring * linting :goat: --- .flake8 | 3 + .gitignore | 161 +++++++- .isort.cfg | 2 + CONTRIBUTE.md => CONTRIBUTING.md | 9 +- README.md | 106 ++--- README.rst | 2 +- cazy_parser/create_cazy_db.py | 352 ----------------- cazy_parser/extract_cazy_ids.py | 182 --------- pyproject.toml | 2 + requirements.txt | 2 + setup.py | 70 +--- src/cazy_parser/__init__.py | 7 + src/cazy_parser/cli.py | 119 ++++++ .../cazy_parser/modules}/__init__.py | 0 src/cazy_parser/modules/fasta.py | 54 +++ src/cazy_parser/modules/html.py | 374 ++++++++++++++++++ src/cazy_parser/utils.py | 23 ++ src/cazy_parser/version.py | 1 + tests/test_fasta.py | 26 ++ tests/test_html.py | 106 +++++ tests/test_utils.py | 7 + 21 files changed, 922 insertions(+), 686 deletions(-) create mode 100644 .flake8 create mode 100644 .isort.cfg rename CONTRIBUTE.md => CONTRIBUTING.md (60%) delete mode 100755 cazy_parser/create_cazy_db.py delete mode 100755 cazy_parser/extract_cazy_ids.py create mode 100644 pyproject.toml create mode 100644 src/cazy_parser/__init__.py create mode 100644 src/cazy_parser/cli.py rename {cazy_parser => src/cazy_parser/modules}/__init__.py (100%) create mode 100644 src/cazy_parser/modules/fasta.py create mode 100644 src/cazy_parser/modules/html.py create mode 100644 src/cazy_parser/utils.py create mode 100644 src/cazy_parser/version.py create mode 100644 tests/test_fasta.py create mode 100644 tests/test_html.py create mode 100644 tests/test_utils.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8dd399a --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203 diff --git a/.gitignore b/.gitignore index ddd21ed..5a14f6b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ dist/ -paper/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ -build/ -cazy_parser.egg-info/ + +# VScode +.vscode/ + +# Project-specific +*.csv +*.chk +*.fasta diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..f238bf7 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = black diff --git a/CONTRIBUTE.md b/CONTRIBUTING.md similarity index 60% rename from CONTRIBUTE.md rename to CONTRIBUTING.md index 0c264ef..9cf83f0 100644 --- a/CONTRIBUTE.md +++ b/CONTRIBUTING.md @@ -1,17 +1,10 @@ -## cazy-parser -*A way to extract specific information from the Carbohydrate-Active enZYmes.* - -# How to contribute? +# How to contribute to cazy-parser? There are still a few features that could be implemented, such as: * Organism specific selection * Retrieve three dimensional structures for each family -and specially - -* **Retrieve fasta sequences from NCBIs servers** - ___ Feel free to contact me with **suggestions**, **bugs reports** or if you need any **assistance** running the software. diff --git a/README.md b/README.md index b3a1be4..1b37bbd 100644 --- a/README.md +++ b/README.md @@ -18,105 +18,57 @@ License: [GNU GPLv3](https://www.gnu.org/licenses/gpl-3.0.html) doi: 10.21105/joss.00053 -# IMPORTANT - -Due to changes in the CAZy database, the parser is no longer functional, I will try to revive the code and update it soon. (: ## Introduction *cazy-parser* is a tool that extract information from [CAZy](http://www.cazy.org/) in a more usable and readable format. Firstly, a script reads the HTML structure and creates a mirror of the database as a tab delimited file. Secondly, information is extracted from the database according to user inputted parameters and presented to the user as a set of accession codes. ## Install / Upgrade -`$ pip install --upgrade cazy-parser` - -or - -Download latest source from [this link](https://pypi.python.org/pypi/cazy-parser) - ``` -$ tar -zxvf cazy-parser-x.x.x.tar.gz -$ cd cazy-parser-x.x.x -$ python setup.py install - +$ pip install --upgrade cazy-parser ``` -Note: It my be necessary to open a new terminal. 
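Once installed, you can sanity-check the new `cazy-parser` console entry point (wired up in `setup.py` as `cazy_parser.cli:maincli`). The exact output below is an assumption based on the `-v`/`--version` flag defined in `cli.py`:

```
$ cazy-parser -v
Running cazy-parser v2.0.0
```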
## Usage  -*Internet connection required*  -1) Database creation - -`$ create_cazy_db` - -(-h for help) -* This script will parse the [CAZy](http://www.cazy.org/) database website and create a comma separated table containing the following information: - * domain - * protein_name - * family - * tag *(characterized status)* - * organism_code - * [EC](http://www.enzyme-database.org/) number (ec stands for enzyme comission number) - * [GENBANK](https://www.ncbi.nlm.nih.gov/genbank/) id - * [UNIPROT](https://www.uniprot.org) code - * subfamily - * organism - * [PDB](http://www.rcsb.org/) code - -2) Extract accession codes - -* Based on the previously generated csv table, extract accession codes for a given protein family. - -`$ extract_cazy_ids --db --family ` - -(-h for help) -* Optional: - -`--subfamilies` Create a file for each subfamily, default = False - -`--characterized` Create a file containing only characterized enzymes, default = False - -## Usage examples - -1) Extract all accession codes from family 9 of Glycosyl Transferases. - -`$ extract_cazy_ids --db CAZy_DB_xx-xx-xxxx.csv --family GT9` - -This will generate the following files: -``` -GT9.csv -``` - -2) Extract all accession codes from family 43 of Glycoside Hydrolase, including subfamilies - -`$ extract_cazy_ids --db CAZy_DB_xx-xx-xxxx.csv --family GH43 --subfamilies` - -This will generate the following files: ``` -GH43.csv -GH43_sub1.csv -GH43_sub2.csv -GH43_sub3.csv -(...) -GH43_sub37.csv +cazy-parser -h +usage: cazy-parser [-h] [-f FAMILY] [-s SUBFAMILY] [-c CHARACTERIZED] [-v] {GH,GT,PL,CA,AA} + +positional arguments: + {GH,GT,PL,CA,AA} + +optional arguments: + -h, --help show this help message and exit + -f FAMILY, --family FAMILY + -s SUBFAMILY, --subfamily SUBFAMILY + -c CHARACTERIZED, --characterized CHARACTERIZED + -v, --version show version ``` -3) Extract all accession codes from family 42 of Polysaccharide Lyases including characterized entries - -`$ extract_cazy_ids --db CAZy_DB_xx-xx-xxxx.csv --family PL42 --characterized` +### Example -This will generate the following files: +Extract all fasta sequences from subfamily 1 of Glycoside Hydrolase family 43 ``` -PL42.csv -PL42_characterized.csv +$ cazy-parser GH -f 43 -s 1 + [2022-05-26 16:39:21,511 91 INFO] ------------------------------------------ + [2022-05-26 16:39:21,511 92 INFO] + [2022-05-26 16:39:21,511 93 INFO] ┌─┐┌─┐┌─┐┬ ┬ ┌─┐┌─┐┬─┐┌─┐┌─┐┬─┐ + [2022-05-26 16:39:21,511 94 INFO] │ ├─┤┌─┘└┬┘───├─┘├─┤├┬┘└─┐├┤ ├┬┘ + [2022-05-26 16:39:21,511 95 INFO] └─┘┴ ┴└─┘ ┴ ┴ ┴ ┴┴└─└─┘└─┘┴└─ v2.0.0 + [2022-05-26 16:39:21,511 96 INFO] + [2022-05-26 16:39:21,511 97 INFO] ------------------------------------------ + [2022-05-26 16:39:21,511 183 INFO] Fetching links for Glycoside-Hydrolases, url: http://www.cazy.org/Glycoside-Hydrolases.html + [2022-05-26 16:39:22,454 189 INFO] Only using links of family 43 subfamily 1 + [2022-05-26 16:39:23,029 26 INFO] Downloading 1415 fasta sequences... + [2022-05-26 16:40:32,187 51 INFO] Dumping fasta sequences to file GH43_1_26052022.fasta ``` -### Download fasta sequences - -Go to [NCBI's Batch Entrez](https://www.ncbi.nlm.nih.gov/sites/batchentrez) change the database to protein and submit the generated `.csv`. +This will generate a file named `GH43_1_DDMMYYYY.fasta` (date-stamped, as in the run above) containing the fasta sequences.
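The same workflow the CLI drives can also be scripted against the modules added in this patch. A minimal sketch using `retrieve_genbank_ids` and `dump_fastas` as defined under `src/cazy_parser/modules/` (the output filename here is illustrative, not the CLI's dated default):

```python
from cazy_parser.modules.fasta import dump_fastas
from cazy_parser.modules.html import retrieve_genbank_ids

# Use the full class name, i.e. the value side of cazy_parser.ENZYME_LIST
# ("GH" -> "Glycoside-Hydrolases")
id_list = retrieve_genbank_ids("Glycoside-Hydrolases", family=43, subfamily=1)

# Downloads the sequences from NCBI via Entrez, then writes them to the given file
dump_fastas(id_list, "GH43_1.fasta")
```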
## To-do and how to contribute -Please refer to CONTRIBUTE.md +Please refer to [CONTRIBUTING](CONTRIBUTING.md) (: diff --git a/README.rst b/README.rst index f5cb2b1..2efceed 100644 --- a/README.rst +++ b/README.rst @@ -3,7 +3,7 @@ cazy-parser The `Carbohydrate-Active enZYmes Database (CAZy) `_ provides access to a sequence based classification of enzyme that are responsible for the assembly, modification and breakdown of oligo and polysaccharides. -This database has been online for eighteen years providing relevant genomic, structural and biochemical data on carbohydrate-active enzymes, such asglycoside hydrolases, glycosyl transferases, polysaccharide lyases, carbohydrateesterases and similar enzymes with auxiliary activities. The database isorganized and presented to the user as a series of highly annotated HTML tables. +This database has been online for several years providing relevant genomic, structural and biochemical data on carbohydrate-active enzymes, such as glycoside hydrolases, glycosyl transferases, polysaccharide lyases, carbohydrate esterases and similar enzymes with auxiliary activities. The database is organized and presented to the user as a series of highly annotated HTML tables. This script provides a way to extract information from the database according to user need. diff --git a/cazy_parser/create_cazy_db.py b/cazy_parser/create_cazy_db.py deleted file mode 100755 index 92af863..0000000 --- a/cazy_parser/create_cazy_db.py +++ /dev/null @@ -1,352 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# ==============================================================================# -# Copyright (C) 2016 Rodrigo Honorato -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program.
If not, see -# ==============================================================================# - -# ==============================================================================# -# create a parsed database using CAZY html structure -import os -import re -import string -import sys -import time -import urllib - -import progressbar -from bs4 import BeautifulSoup - -# ==============================================================================# - -checkpoint_f = "cp.chk" -fam_checkpoint_f = "fam.chk" - - -def init_bar(): - """Initilize progress bar""" - bar = progressbar.ProgressBar( - widgets=[ - " ", - progressbar.Timer(), - " ", - progressbar.Percentage(), - " ", - progressbar.Bar("█", "[", "]"), - " ", - progressbar.ETA(), - " ", - ] - ) - return bar - - -def fetch_species(): - """Gather species names and IDs (full genome only)""" - print("> Gathering species with full genomes") - # a = archea // b = bacteria // e = eukaryota // v = virus - species_domain_list = ["a", "b", "e", "v"] - species_dic = {} - - bar = init_bar() - bar.max_value = len(string.ascii_uppercase) * len(species_domain_list) - bar.start() - - counter = 0 - for domain in species_domain_list: - for initial in string.ascii_uppercase: - counter += 1 - bar.update(counter) - link = "http://www.cazy.org/%s%s.html" % (domain, initial) - f = urllib.request.urlopen(link) - species_list_hp = f.read().decode("utf-8") - # parse webpage - index_list = re.findall( - '"http://www.cazy.org/{}(\d.*).html" class="nav">(.*)'.format( - species_domain_list - ), - species_list_hp, - ) - for entry in index_list: - index, species = entry - try: - species_dic[species].append(index) - except KeyError: - species_dic[species] = [index] - bar.finish() - - # Double check to see which of the species codes are valid - for j, species in enumerate(species_dic.keys()): - entry_list = species_dic[species] - if len(entry_list) > 1: - # More than one entry for this species - # > This is (likely) a duplicate - # > Use the higher number, should be the newer page - newer_entry = max([int(i.split("b")[-1]) for i in entry_list]) - selected_entry = "b%i" % newer_entry - - species_dic[species] = selected_entry - else: - species_dic[species] = species_dic[species][0] - - return species_dic - - -def fetch_families(main_link): - """Identify family link structure and return a list""" - soup = BeautifulSoup(urllib.request.urlopen(main_link), features="html.parser") - family_table = soup.findAll(name="table")[0] - rows = family_table.findAll(name="td") - family_list = [ - str(r.find("a")["href"].split("/")[-1].split(".html")[0]) for r in rows - ] - - return family_list - - -def fetch_links(enzyme_class): - """Fetch link structure for an enzyme class""" - - # Links need to be fetched everytime since there is no way (?) to guarantee - # that all were previously fetched without actually fetching them. 
- main_class_link = "http://www.cazy.org/%s.html" % enzyme_class - - family_list = fetch_families(main_class_link) - - bar = init_bar() - bar.max_value = len(family_list) - bar.start() - - page_list = [] - for j, family in enumerate(family_list): - bar.update(j + 1) - main_link = "http://www.cazy.org/%s.html" % family - - # TODO: Implement checkpoint for link fetching - - family_soup = BeautifulSoup( - urllib.request.urlopen(main_link), features="html.parser" - ) - superfamily_list = [ - l.find("a")["href"] - for l in family_soup.findAll("span", attrs={"class": "choix"}) - ][1:] - - superfamily_list = [ - f for f in superfamily_list if "structure" not in f - ] # remove structure tab for parsing - for main_link in superfamily_list: - page_zero = main_link - soup = BeautifulSoup( - urllib.request.urlopen(main_link), features="html.parser" - ) - - # Get page list for the family - page_index_list = soup.findAll(name="a", attrs={"class": "lien_pagination"}) - if bool(page_index_list): - - # =====================# - # be careful with this # - first_page_idx = int(re.findall("=(\d*)#", str(page_index_list[0]))[0]) - last_page_idx = int(re.findall("=(\d*)#", str(page_index_list[-2]))[0]) - # =====================# - - page_list.append(page_zero) - for i in range( - first_page_idx, last_page_idx + first_page_idx, first_page_idx - ): - link = ( - "http://www.cazy.org/" - + page_index_list[0]["href"].split("=")[0] - + "=" - + str(i) - ) - page_list.append(link) - - else: - page_list.append(page_zero) - - return page_list - - -def check_status(chk_f, link): - """Check if contents of this link have been downloaded""" - - if os.path.isfile(chk_f): - for l in open(chk_f): - if link in l: - return True - - -def save_status(chk_f, link): - """Save link address in checkpoint file""" - open(chk_f, "a").write("{}\n".format(link)) - - -def fetch_data(link_list, species_dic, out_f): - """Parse list of links and extract information""" - - bar = init_bar() - bar.max_value = len(link_list) - bar.start() - - protein_counter = 0 - for j, link in enumerate(link_list): - - db_dic = {} - bar.update(j + 1) - - if check_status(checkpoint_f, link): - continue - - # tr = rows // # td = cells - soup = BeautifulSoup(urllib.request.urlopen(link), features="html.parser") - table = soup.find("table", attrs={"class": "listing"}) - domain = "" - family = link.split(".org/")[-1].split("_")[0] - - # consistency check to look for deleted families. i.e. 
GH21 - try: - table.findAll("tr") - except AttributeError: - # not a valid link, move on - continue - for row in table.findAll("tr"): - try: - if row["class"] == "royaume" and row.text != "Top": - domain = str(row.text).lower() - except KeyError: - pass - tds = row.findAll("td") - if len(tds) > 1 and tds[0].text != "Protein Name": - # valid line - db_dic[protein_counter] = {} - db_dic[protein_counter]["protein_name"] = tds[0].text.replace( - " ", "" - ) - db_dic[protein_counter]["family"] = family - db_dic[protein_counter]["domain"] = domain - db_dic[protein_counter]["ec"] = tds[1].text.replace(" ", "") - db_dic[protein_counter]["organism"] = tds[2].text.replace(" ", "") - try: - db_dic[protein_counter]["genbank"] = ( - tds[3].find("a").text.replace(" ", "") - ) # get latest entry - except AttributeError: - # there is a crazy aberration when there is no genbank available - db_dic[protein_counter]["genbank"] = "unavailable" - - db_dic[protein_counter]["uniprot"] = tds[4].text.replace(" ", "") - db_dic[protein_counter]["pdb"] = tds[5].text.replace(" ", "") - - # check if this is species has a complete genome - try: - db_dic[protein_counter]["organism_code"] = species_dic[ - tds[2].text.replace(" ", "") - ] - except KeyError: - db_dic[protein_counter]["organism_code"] = "invalid" - - # check if there are subfamilies - try: - db_dic[protein_counter]["subfamily"] = tds[6].text.replace( - " ", "" - ) - except KeyError: - db_dic[protein_counter]["subfamily"] = "" - - # if 'characterized' in main_link: - if "characterized" in link: - db_dic[protein_counter]["tag"] = "characterized" - else: - db_dic[protein_counter]["tag"] = "" - - protein_counter += 1 - - save_output(out_f, db_dic) - save_status(checkpoint_f, link) - - bar.finish() - - -def save_output(output_f, d_dic): - """Save information as .csv file""" - if not os.path.isfile(output_f): - out = open(output_f, "w") - header = ",".join(d_dic[0].keys()) + "\n" - out.write(header) - else: - out = open(output_f, "a") - - for p in d_dic: - tbw = ",".join(list(d_dic[p].values())).replace("\n", "") - tbw += "\n" - out.write(tbw) - out.close() - - -def clean(out_f): - """Remove duplicates from output file""" - file_l = open(out_f).readlines() - new_file_l = list(set(file_l)) - open(out_f, "w").write("".join(new_file_l)) - del file_l - del new_file_l - - -def logo(): - version = "1.4.2" - print("") - print("┌─┐┌─┐┌─┐┬ ┬ ┌─┐┌─┐┬─┐┌─┐┌─┐┬─┐") - print("│ ├─┤┌─┘└┬┘───├─┘├─┤├┬┘└─┐├┤ ├┬┘") - print("└─┘┴ ┴└─┘ ┴ ┴ ┴ ┴┴└─└─┘└─┘┴└─ v{}".format(version)) - print("") - print("This is the database creator script") - print("") - print("(get a coffee, this will take a while)") - print("") - - -def main(): - - logo() - - species_dic = fetch_species() - - enzyme_list = [ - "Glycoside-Hydrolases", - "GlycosylTransferases", - "Polysaccharide-Lyases", - "Carbohydrate-Esterases", - "Auxiliary-Activities", - ] - - for enzyme_name in enzyme_list: - print(">> {}".format(enzyme_name)) - output_f = "CAZy_DB_{}_{}.csv".format(enzyme_name, time.strftime("%d-%m-%Y")) - - print("> Fetching links") - page_list = fetch_links(enzyme_name) - - print("> Gathering data") - fetch_data(page_list, species_dic, output_f) - - clean(output_f) - - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/cazy_parser/extract_cazy_ids.py b/cazy_parser/extract_cazy_ids.py deleted file mode 100755 index d0c8ca9..0000000 --- a/cazy_parser/extract_cazy_ids.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python -# 
==============================================================================# -# Copyright (C) 2016 Rodrigo Honorato -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# ==============================================================================# - -import argparse -import sys - - -def logo(): - version = "1.4.2" - print("") - print("┌─┐┌─┐┌─┐┬ ┬ ┌─┐┌─┐┬─┐┌─┐┌─┐┬─┐") - print("│ ├─┤┌─┘└┬┘───├─┘├─┤├┬┘└─┐├┤ ├┬┘") - print("└─┘┴ ┴└─┘ ┴ ┴ ┴ ┴┴└─└─┘└─┘┴└─ v{}".format(version)) - print("") - print("This is the accession code retrieval script") - print("") - - -def load_db(f): - db_f = open(f).readlines() - sep = "," - header = db_f[0].split(sep) - - print(">> Loading {}".format(f)) - db = {} - for i, l in enumerate(db_f[1:]): - db[i] = {} - data = l.split(sep) - # init dictionary - for c, idx in enumerate(header): - try: - db[i][idx] = data[c] - except IndexError: - db[i][idx] = "" - return db - - -def select_fam(fam, db): - selection_list = [] - for e in db: - if db[e]["family"] == fam: - selection_list.append(db[e]["genbank"]) - - print( - ">> Retrieving all {} accession codes for Family {}".format( - len(selection_list), fam - ) - ) - - sele_l = list(set(selection_list)) - out_f = "%s.csv" % fam - print(">> Creating {}".format(out_f)) - out = open(out_f, "w") - out.write("\n".join(sele_l)) - out.close() - - -def select_subfam(fam, db): - selection_dic = {} - for e in db: - if db[e]["family"] == fam and bool(db[e]["subfamily"]): - try: - selection_dic[db[e]["subfamily"]].append(db[e]["genbank"]) - except KeyError: - selection_dic[db[e]["subfamily"]] = [db[e]["genbank"]] - - # output - print( - ">> Retrieving accession codes for {} subfamilies for Family {}".format( - len(selection_dic), fam - ) - ) - for sub in selection_dic: - s_l = list(set(selection_dic[sub])) - out_f = "{}_sub{}.csv".format(fam, sub) - print(">> Creating {}".format(out_f)) - out = open(out_f, "w") - out.write("\n".join(s_l)) - out.close() - - -def select_char(fam, db): - selection_list = [] - for e in db: - if db[e]["family"] == fam and bool(db[e]["tag"]): - selection_list.append(db[e]["genbank"]) - - print( - ">> Selecting {} CHARACTERIZED proteins for Family {}".format( - len(selection_list), fam - ) - ) - - out_f = "{}_characterized.csv".format(fam) - out = open(out_f, "w") - print(">> Creating {}".format(out_f)) - out.write("\n".join(selection_list)) - out.close() - - -def main(): - logo() - - parser = argparse.ArgumentParser( - description="Select accession codes for a given protein family. Optional: Select subfamilies and/or " - "characterized enzymes" - ) - - parser.add_argument( - "--db", - action="store", - dest="db_file", - help="Database file generated by cazy-parser", - ) - - parser.add_argument( - "--family", - action="store", - dest="target_family", - help="Family to be searched ex. 
GH5", - ) - - parser.add_argument( - "--subfamily", - action="store_true", - default=False, - help="(Optional) Create a file with accession codes for each subfamily", - ) - - parser.add_argument( - "--characterized", - action="store_true", - default=False, - help="(Optional) Create a file with accession codes only for characterized enzymes", - ) - - results = parser.parse_args() - - check = False - if results.db_file is None: - print("\n>> [ERROR] Missing database file\n") - check = True - - if results.target_family is None: - print("\n>> [ERROR] Missing target family\n") - check = True - - if check: - parser.print_help() - sys.exit(0) - - db = load_db(results.db_file) - - if bool(results.target_family): - select_fam(results.target_family, db) - - if bool(results.subfamily): - select_subfam(results.target_family, db) - - if bool(results.characterized): - select_char(results.target_family, db) - - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a15b339 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.pytest.ini_options] +pythonpath = ["src"] diff --git a/requirements.txt b/requirements.txt index 091857c..cb47bbe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ certifi==2022.5.18.1 progressbar2==4.0.0 python-utils==3.2.3 soupsieve==2.3.2.post1 +requests==2.27.1 +biopython==1.79 diff --git a/setup.py b/setup.py index eb41e68..8f33578 100644 --- a/setup.py +++ b/setup.py @@ -1,17 +1,7 @@ -"""A setuptools based setup module. - -See: -https://packaging.python.org/en/latest/distributing.html -https://github.com/pypa/sampleproject -""" - -# To use a consistent encoding from codecs import open from os import path -# Always prefer setuptools over distutils from setuptools import find_packages, setup -from setuptools.command.install import install here = path.abspath(path.dirname(__file__)) @@ -23,78 +13,32 @@ required = f.read().splitlines() -class CazyParser(install): - def run(self): - install.run(self) - - setup( name="cazy-parser", - # Versions should comply with PEP440. For a discussion on single-sourcing - # the version across setup.py and the project code, see - # https://packaging.python.org/en/latest/single_source_version.html - version="1.4.2", + version="2.0.0", description="A way to extract specific information from CAZy", long_description=long_description, - # The project's main homepage. url="https://github.com/rodrigovrgs/cazy-parser", - # Author details author="Rodrigo Honorato", - author_email="rvhonorato@gmail.com", - # Choose your license + author_email="rvhonorato@protonmail.com", license="GPL3", # See https://pypi.python.org/pypi?%3Aaction=list_classifiers classifiers=[ - # How mature is this project? Common values are - # 3 - Alpha - # 4 - Beta - # 5 - Production/Stable "Development Status :: 5 - Production/Stable", - # Indicate who your project is intended for "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Bio-Informatics", - # Pick your license as you wish (should match "license" above) "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", - # Specify the Python versions you support here. In particular, ensure - "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.9", ], - # What does your project relate to? keywords="cazy database datamining", - # You can just specify the packages manually here if your project is - # simple. Or you can use find_packages(). 
- packages=find_packages(exclude=["contrib", "docs", "tests"]), - # Alternatively, if you want to distribute just a my_module.py, uncomment - # this: - # py_modules=["my_module"], - # List run-time dependencies here. These will be installed by pip when - # your project is installed. For an analysis of "install_requires" vs pip's - # requirements files see: - # https://packaging.python.org/en/latest/requirements.html + packages=find_packages("src"), install_requires=required, - # List additional groups of dependencies here (e.g. development - # dependencies). You can install these using the following syntax, - # for example: - # $ pip install -e .[dev,test] extras_require={}, - # If there are data files included in your packages that need to be - # installed, specify them here. If using Python 2.6 or less, then these - # have to be included in MANIFEST.in as well. - package_data={ - "cazy-parser": [], - }, - # Although 'package_data' is the preferred approach, in some case you may - # need to place data files outside of your packages. See: - # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa - # In this case, 'data_file' will be installed into '/my_data' - # data_files=[], - # To provide executable scripts, use entry points in preference to the - # "scripts" keyword. Entry points provide cross-platform support and allow - # pip to create the appropriate form of executable for the target platform. + package_data={"cazy-parser": []}, + package_dir={"": "src"}, entry_points={ "console_scripts": [ - "create_cazy_db=cazy_parser.create_cazy_db:main", - "extract_cazy_ids=cazy_parser.extract_cazy_ids:main", + "cazy-parser=cazy_parser.cli:maincli", ], }, - cmdclass={"install": CazyParser}, ) diff --git a/src/cazy_parser/__init__.py b/src/cazy_parser/__init__.py new file mode 100644 index 0000000..59879c7 --- /dev/null +++ b/src/cazy_parser/__init__.py @@ -0,0 +1,7 @@ +ENZYME_LIST = { + "GH": "Glycoside-Hydrolases", + "GT": "GlycosylTransferases", + "PL": "Polysaccharide-Lyases", + "CA": "Carbohydrate-Esterases", + "AA": "Auxiliary-Activities", +} diff --git a/src/cazy_parser/cli.py b/src/cazy_parser/cli.py new file mode 100644 index 0000000..074fcb0 --- /dev/null +++ b/src/cazy_parser/cli.py @@ -0,0 +1,119 @@ +import argparse +import logging +import sys +import time + +from cazy_parser import ENZYME_LIST +from cazy_parser.modules.fasta import dump_fastas +from cazy_parser.modules.html import retrieve_genbank_ids +from cazy_parser.version import VERSION + +log = logging.getLogger("cazylog") +ch = logging.StreamHandler() +formatter = logging.Formatter(" [%(asctime)s %(lineno)d %(levelname)s] %(message)s") +ch.setFormatter(formatter) +log.addHandler(ch) +log.setLevel("DEBUG") + + +# CONFIG = json.load(open(Path(Path(__file__).parents[2], "etc/config.json"))) + +# =========================================================================================================== +# Define arguments +ap = argparse.ArgumentParser() + +ap.add_argument( + "enzyme_class", + choices=["GH", "GT", "PL", "CA", "AA"], +) + +ap.add_argument("-f", "--family", type=int) + +ap.add_argument("-s", "--subfamily") + +ap.add_argument("-c", "--characterized") + +ap.add_argument( + "-v", + "--version", + help="show version", + action="version", + version=f"Running {ap.prog} v{VERSION}", +) + + +def load_args(ap): + """ + Load argument parser. + + Parameters + ---------- + ap : argparse.ArgumentParser + Argument parser. 
+ + Returns + ------- + cmd : argparse.Namespace + Parsed command-line arguments. + + """ + return ap.parse_args() + + +# ====================================================================================# +# Define CLI +def cli(ap, main): + """ + Command-line interface entry point. + + Parameters + ---------- + ap : argparse.ArgumentParser + Argument parser. + main : function + Main function. + + """ + cmd = load_args(ap) + main(**vars(cmd)) + + +def maincli(): + """Execute main client.""" + cli(ap, main) + + +# ====================================================================================# +# Main code +def main(enzyme_class, family, subfamily, characterized): + """Main function.""" + + log.info("-" * 42) + log.info("") + log.info("┌─┐┌─┐┌─┐┬ ┬ ┌─┐┌─┐┬─┐┌─┐┌─┐┬─┐") + log.info("│ ├─┤┌─┘└┬┘───├─┘├─┤├┬┘└─┐├┤ ├┬┘") + log.info(f"└─┘┴ ┴└─┘ ┴ ┴ ┴ ┴┴└─└─┘└─┘┴└─ v{VERSION}") + log.info("") + log.info("-" * 42) + + if enzyme_class not in ENZYME_LIST: + log.error(f"Enzyme class {enzyme_class} not supported") + sys.exit() + else: + enzyme_name = ENZYME_LIST[enzyme_class] + + id_list = retrieve_genbank_ids(enzyme_name, family, subfamily, characterized) + + output_fname = f"{enzyme_class}" + if family: + output_fname += f"{family}" + if subfamily: + output_fname += f"_{subfamily}" + + today = time.strftime("%d%m%Y") + output_fname += f"_{today}.fasta" + dump_fastas(id_list, output_fname) + + +if __name__ == "__main__": + sys.exit(maincli()) diff --git a/cazy_parser/__init__.py b/src/cazy_parser/modules/__init__.py similarity index 100% rename from cazy_parser/__init__.py rename to src/cazy_parser/modules/__init__.py diff --git a/src/cazy_parser/modules/fasta.py b/src/cazy_parser/modules/fasta.py new file mode 100644 index 0000000..14bdc9b --- /dev/null +++ b/src/cazy_parser/modules/fasta.py @@ -0,0 +1,54 @@ +import logging +import os + +from Bio import Entrez, SeqIO + +Entrez.email = "your-email-here@example.org" + +log = logging.getLogger("cazylog") + + +def download_fastas(id_list): + """ + Download fasta files from NCBI. + + Parameters + ---------- + id_list : list + List of genbank ids. + + Returns + ------- + fasta_list : list + List of fasta strings. + + """ + log.info(f"Downloading {len(id_list)} fasta sequences...") + fasta_list = [] + handle = Entrez.efetch(db="protein", id=id_list, rettype="fasta", retmode="text") + for seq_record in SeqIO.parse(handle, "fasta"): + fasta_str = ( + f">{seq_record.description}{os.linesep}" f"{seq_record.seq}{os.linesep*2}" + ) + fasta_list.append(fasta_str) + + return fasta_list + + +def dump_fastas(id_list, output_f): + """ + Save the fasta strings to a file. + + Parameters + ---------- + id_list : list + List of genbank ids. + output_f : str + Path to the output file. + + """ + fasta_list = download_fastas(id_list) + log.info(f"Dumping fasta sequences to file {output_f}") + with open(output_f, "w") as fh: + for fasta in fasta_list: + fh.write(fasta) diff --git a/src/cazy_parser/modules/html.py b/src/cazy_parser/modules/html.py new file mode 100644 index 0000000..d065366 --- /dev/null +++ b/src/cazy_parser/modules/html.py @@ -0,0 +1,374 @@ +import itertools +import logging +import os +import re +import string +import sys +import urllib + +import requests +from bs4 import BeautifulSoup + +from cazy_parser.utils import init_bar + +log = logging.getLogger("cazylog") + + +def fetch_data(link_list): + """ + Parse list of links and extract information.
+ + Parameters + ---------- + link_list : list + List of links to parse + + + Returns + ------- + data : list + List of dictionaries containing information about each entry. + + """ + + data = [] + for link in link_list: + + if ".txt" in link: + link_data = get_data_from_txt(link) + else: + link_data = get_data_from_html(link) + + data.append(link_data) + + return list(itertools.chain(*data)) + + +def parse_td(td_list): + """ + Parse a table from the HTML page. + + Parameters + ---------- + td_list : list + List of table elements. + + Returns + ------- + data_dict : dict + Dictionary of data. + + """ + data_dict = {} + if len(td_list) <= 1 or td_list[0].text == "Protein Name": + # this is likely the header + return data_dict + + protein_name = td_list[0].text + ec = td_list[1].text + # reference = td_list[2].text + organism = td_list[3].text + try: + genbank = td_list[4].find("a").text + except AttributeError: + genbank = "unavailable" + + uniprot = td_list[5].text + pdb = td_list[6].text + + data_dict["protein_name"] = protein_name + data_dict["ec"] = ec + data_dict["organism"] = organism + data_dict["genbank"] = genbank + data_dict["uniprot"] = uniprot + data_dict["pdb"] = pdb + + return data_dict + + +def parse_table(table): + """ + Parse a beautiful soup table and retrieve information. + + Parameters + ---------- + table : bs4.element.Tag + Beautiful soup table. + + Returns + ------- + table_data : list + List of dictionaries containing information from the table. + + """ + table_data = [] + for tr in table.findAll("tr"): + tds = tr.findAll("td") + td_dic = parse_td(tds) + table_data.append(td_dic) + return table_data + + +def get_data_from_html(link): + """ + Retrieve information from the HTML page. + + Parameters + ---------- + link : str + Link to the page. + + Returns + ------- + data_list : list + List of dictionaries containing information from the page. + + """ + soup = BeautifulSoup(urllib.request.urlopen(link), features="html.parser") + table = soup.find("table", attrs={"class": "listing"}) + domain = "" + family = link.split(".org/")[-1].split("_")[0] + data_list = parse_table(table) + + tag = "characterized" if "characterized" in link else "" + # add more information to the data + for data in data_list: + data["tag"] = tag + data["family"] = family + data["domain"] = domain + + return data_list + + +def get_data_from_txt(link): + """ + Retrieve information from the TXT file. + + Parameters + ---------- + link : str + Link to the page. + + Returns + ------- + data_list : list + List of dictionaries containing information from the TXT file. + + """ + data_list = [] + response = requests.get(link) + tag = "characterized" if "characterized" in link else "" + for line in response.text.split(os.linesep): + data_dict = {} + data = line.split("\t") + + try: + family = data[0] + domain = data[1] + species = data[2] + gene = data[3] + + data_dict["family"] = family + data_dict["domain"] = domain + data_dict["species"] = species + data_dict["genbank"] = gene + data_dict["tag"] = tag + except IndexError: + # no data for this line + continue + + data_list.append(data_dict) + + return data_list + + +def fetch_links(enzyme_class, family, subfamily): + """ + Fetch link structure for an enzyme class. + + Parameters + ---------- + enzyme_class : str + Enzyme class to fetch links for. + family : int + Family number to filter the links by, if given. + subfamily : int + Subfamily number to filter the links by, if given. + + Returns + ------- + page_list : list + List of links to the pages.
+ + """ + + main_class_link = f"http://www.cazy.org/{enzyme_class}.html" + log.info(f"Fetching links for {enzyme_class}, url: {main_class_link}") + family_list = fetch_families(main_class_link) + + # Filter by family + if family: + if subfamily: + log.info(f"Only using links of family {family} subfamily {subfamily}") + family_list = [e for e in family_list if e[2:] == f"{family}_{subfamily}"] + else: + log.info(f"Only using links of family {family}") + family_list = [e for e in family_list if int(e[2:]) == family] + + if not family_list: + log.error("No links were found.") + sys.exit() + + page_list = [] + for j, family in enumerate(family_list): + # bar.update(j + 1) + family_link = f"http://www.cazy.org/{family}.html" + + # TODO: Implement checkpoint for link fetching + + family_soup = BeautifulSoup( + urllib.request.urlopen(family_link), features="html.parser" + ) + + # Find the links to the individual pages + superfamily_links = [] + for line in family_soup.findAll("span", attrs={"class": "choix"}): + _link = line.find("a")["href"] + if "krona" not in _link and "structure" not in _link: + superfamily_links.append(_link) + + for link in superfamily_links: + page_zero = link + try: + soup = BeautifulSoup( + urllib.request.urlopen(link), features="html.parser" + ) + except ValueError: + # This is a link to a text file + page_list.append(f"http://www.cazy.org/{link}") + continue + + # Get page list for the family + page_index_list = soup.findAll(name="a", attrs={"class": "lien_pagination"}) + if bool(page_index_list): + + # =====================# + # be careful with this # + first_page_idx = int(re.findall(r"=(\d*)#", str(page_index_list[0]))[0]) + last_page_idx = int(re.findall(r"=(\d*)#", str(page_index_list[-2]))[0]) + # =====================# + + page_list.append(page_zero) + page_range = range( + first_page_idx, last_page_idx + first_page_idx, first_page_idx + ) + for i in page_range: + sub_str = page_index_list[0]["href"].split("=")[0] + link = f"http://www.cazy.org/{sub_str}={i}" + page_list.append(link) + + else: + page_list.append(page_zero) + + return page_list + + +def fetch_families(link): + """ + Identify family link structure and return a list. + + Parameters + ---------- + link : str + Link to the page. + + Returns + ------- + family_link_list : list + List of family links. 
+ + """ + enzyme_regex = r"([A-Z]{2}\d*_?\d?).html" + soup = BeautifulSoup(urllib.request.urlopen(link), features="html.parser") + all_links = soup.find_all("a") + family_link_list = [] + for link in all_links: + try: + href = re.findall(enzyme_regex, link.attrs["href"])[0] + family_link_list.append(href) + except (IndexError, KeyError): + continue + + return list(set(family_link_list)) + + +def fetch_species(): + """Gather species names and IDs (full genome only).""" + log.info("> Gathering species with full genomes") + # a = archea // b = bacteria // e = eukaryota // v = virus + species_domain_list = ["a", "b", "e", "v"] + species_dic = {} + + bar = init_bar() + bar.max_value = len(string.ascii_uppercase) * len(species_domain_list) + bar.start() + + counter = 0 + for domain in species_domain_list: + for initial in string.ascii_uppercase: + counter += 1 + bar.update(counter) + link = f"http://www.cazy.org/{domain}{initial}.html" + f = urllib.request.urlopen(link) + species_list_hp = f.read().decode("utf-8") + # parse webpage + index_list = re.findall( + rf'"http://www.cazy.org/{species_domain_list}(\d.*).html"' + r' class="nav">(.*)', + species_list_hp, + ) + for entry in index_list: + index, species = entry + try: + species_dic[species].append(index) + except KeyError: + species_dic[species] = [index] + bar.finish() + + # Double check to see which of the species codes are valid + for j, species in enumerate(species_dic.keys()): + entry_list = species_dic[species] + if len(entry_list) > 1: + # More than one entry for this species + # > This is (likely) a duplicate + # > Use the higher number, should be the newer page + newer_entry = max([int(i.split("b")[-1]) for i in entry_list]) + selected_entry = "b%i" % newer_entry + + species_dic[species] = selected_entry + else: + species_dic[species] = species_dic[species][0] + + return species_dic + + +def retrieve_genbank_ids(enzyme_name, family=None, subfamily=None, characterized=None): + """ + Retrieve genbank IDs for a given enzyme. + + Parameters + ---------- + enzyme_name : str + Enzyme name to retrieve genbank IDs for. + family : int + Family number to retrieve genbank IDs for. + subfamily : int + Subfamily number to retrieve genbank IDs for. + characterized : bool + Whether to retrieve genbank IDs for characterized enzymes. + + Returns + ------- + genbank_id_list : list + List of genbank IDs. 
+ + """ + page_list = fetch_links(enzyme_name, family, subfamily) + data = fetch_data(page_list) + genbank_id_list = [] + for element in data: + if "genbank" in element: + genbank_id_list.append(element["genbank"]) + + return genbank_id_list diff --git a/src/cazy_parser/utils.py b/src/cazy_parser/utils.py new file mode 100644 index 0000000..2465209 --- /dev/null +++ b/src/cazy_parser/utils.py @@ -0,0 +1,23 @@ +import logging + +import progressbar + +log = logging.getLogger("cazylog") + + +def init_bar(): + """Initilize progress bar.""" + bar = progressbar.ProgressBar( + widgets=[ + " ", + progressbar.Timer(), + " ", + progressbar.Percentage(), + " ", + progressbar.Bar("█", "[", "]"), + " ", + progressbar.ETA(), + " ", + ] + ) + return bar diff --git a/src/cazy_parser/version.py b/src/cazy_parser/version.py new file mode 100644 index 0000000..2101409 --- /dev/null +++ b/src/cazy_parser/version.py @@ -0,0 +1 @@ +VERSION = "2.0.0" diff --git a/tests/test_fasta.py b/tests/test_fasta.py new file mode 100644 index 0000000..db2ec26 --- /dev/null +++ b/tests/test_fasta.py @@ -0,0 +1,26 @@ +import tempfile +from pathlib import Path + +import pytest + +from cazy_parser.modules.fasta import download_fastas, dump_fastas + + +@pytest.fixture +def id_list(): + return ["WP_010249057.1", "ABN51772.1"] + + +def test_download_fastas(id_list): + observed_fasta_list = download_fastas(id_list) + + assert len(observed_fasta_list) == 2 + assert observed_fasta_list[0][0] == ">" + + +def test_dump_fastas(id_list): + temp_f = tempfile.NamedTemporaryFile(delete=False) + dump_fastas(id_list, temp_f.name) + + assert Path(temp_f.name).exists() + assert Path(temp_f.name).stat().st_size != 0 diff --git a/tests/test_html.py b/tests/test_html.py new file mode 100644 index 0000000..eafb449 --- /dev/null +++ b/tests/test_html.py @@ -0,0 +1,106 @@ +import urllib + +import pytest +from bs4 import BeautifulSoup + +from cazy_parser.modules.html import ( + fetch_data, + fetch_families, + fetch_links, + fetch_species, + get_data_from_html, + get_data_from_txt, + parse_table, + parse_td, + retrieve_genbank_ids, +) + + +@pytest.fixture +def soup(): + soup = BeautifulSoup( + urllib.request.urlopen("http://www.cazy.org/GH173_characterized.html"), + features="html.parser", + ) + return soup + + +@pytest.fixture +def table(soup): + return soup.find("table", attrs={"class": "listing"}) + + +@pytest.fixture +def td_list(table): + td_list = [tr.findAll("td") for tr in table.findAll("tr")][3] + return td_list + + +def test_fetch_data(): + observed_data = fetch_data( + [ + "http://www.cazy.org/GH173_characterized.html", + "http://www.cazy.org/IMG/cazy_data/GH173.txt", + ] + ) + + assert [e for e in observed_data if "tag" in e] + assert [e for e in observed_data if "family" in e] + + +def test_parse_td(td_list): + observed_data_dic = parse_td(td_list) + + assert "protein_name" in observed_data_dic + assert "pdb" in observed_data_dic + assert "ec" in observed_data_dic + + +def test_parse_table(table): + observed_data = parse_table(table) + + assert [e for e in observed_data if "protein_name" in e] + + +def test_get_data_from_html(): + observed_data = get_data_from_html( + "http://www.cazy.org/GH173" "_characterized.html" + ) + + assert observed_data + assert [e for e in observed_data if "protein_name" in e] + + +def test_get_data_from_txt(): + observed_data = get_data_from_txt("http://www.cazy.org/IMG/cazy_data/GH173.txt") + + assert [e for e in observed_data if "family" in e] + + +def test_fetch_links(): + observed_links = 
fetch_links("Carbohydrate-Esterases", family=None, subfamily=None) + + assert "http://www.cazy.org/CE20_characterized.html" in observed_links + assert "http://www.cazy.org/IMG/cazy_data/CE20.txt" in observed_links + + +def test_fetch_families(): + observed_families = fetch_families("http://www.cazy.org/Glycoside-Hydrolases.html") + + assert "GH1" in observed_families + + +def test_fetch_species(): + observed_species = fetch_species() + + assert observed_species + assert isinstance(observed_species, dict) + + +def test_retrieve_genbank_ids(): + observed_id_list = retrieve_genbank_ids( + enzyme_name="Glycoside-Hydrolases", family=5, subfamily=1 + ) + + assert observed_id_list + assert len(observed_id_list) == 1163 diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..de390c8 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,7 @@ +from cazy_parser.utils import init_bar + + +def test_init_bar(): + bar = init_bar() + + assert bar
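A closing note on `modules/fasta.py`: Biopython's Entrez interface expects a valid contact e-mail, and the module hard-codes a placeholder (`your-email-here@example.org`) at import time. A minimal sketch of overriding it before downloading, assuming only what the patch itself defines (the address below is a stand-in; the accession IDs are borrowed from `tests/test_fasta.py`):

```python
from Bio import Entrez

from cazy_parser.modules.fasta import download_fastas

# NCBI asks for a real contact address; override the placeholder set in fasta.py
Entrez.email = "you@example.org"

fasta_list = download_fastas(["WP_010249057.1", "ABN51772.1"])
print(fasta_list[0].splitlines()[0])  # the ">..." description line of the first record
```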