From f267586491bf97cc9eb11292da540c94ef449c09 Mon Sep 17 00:00:00 2001 From: Rodrigo V Honorato Date: Thu, 12 Oct 2023 12:11:08 +0200 Subject: [PATCH] Retrieve only the experimentally characterized cazy (#43) * upgrade poetry * simplify cli * add missing characterized option * update tests * update version --- .trunk/trunk.yaml | 20 +++---- cazy_parser/cli.py | 109 ++++++++++++------------------------ cazy_parser/modules/html.py | 57 +++++++------------ cazy_parser/version.py | 2 +- pyproject.toml | 6 +- tests/test_html.py | 18 +++++- 6 files changed, 85 insertions(+), 127 deletions(-) diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 4282d8c..d30aab0 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -1,23 +1,23 @@ version: 0.1 cli: - version: 1.6.1 + version: 1.16.2 plugins: sources: - id: trunk - ref: v0.0.13 + ref: v1.2.1 uri: https://github.com/trunk-io/plugins lint: enabled: - - actionlint@1.6.23 - - black@23.1.0 + - actionlint@1.6.26 + - black@23.9.1 - git-diff-check - - gitleaks@8.16.1 + - gitleaks@8.18.0 - isort@5.12.0 - - markdownlint@0.33.0 - - prettier@2.8.4 - - ruff@0.0.256 - - taplo@0.7.0 - - yamllint@1.29.0 + - markdownlint@0.37.0 + - prettier@3.0.3 + - ruff@0.0.292 + - taplo@0.8.1 + - yamllint@1.32.0 ignore: - linters: [ALL] paths: diff --git a/cazy_parser/cli.py b/cazy_parser/cli.py index 53ae324..05b601e 100644 --- a/cazy_parser/cli.py +++ b/cazy_parser/cli.py @@ -16,75 +16,52 @@ log.addHandler(ch) log.setLevel("DEBUG") -# =========================================================================================================== -# Define arguments -ap = argparse.ArgumentParser() -ap.add_argument( - "enzyme_class", - choices=["GH", "GT", "PL", "CA", "AA"], -) - -ap.add_argument("-f", "--family", type=int) - -ap.add_argument("-s", "--subfamily") - -ap.add_argument("-c", "--characterized") - -ap.add_argument( - "-v", - "--version", - help="show version", - action="version", - version=f"Running {ap.prog} v{VERSION}", -) - - -def load_args(ap): - """ - Load argument parser. - - Parameters - ---------- - ap : argparse.ArgumentParser - Argument parser. +# ====================================================================================# +# Main code +def main(): + """Main function.""" - Returns - ------- - cmd : argparse.Namespace - Parsed command-line arguments. + ap = argparse.ArgumentParser() - """ - return ap.parse_args() + ap.add_argument( + "enzyme_class", + choices=["GH", "GT", "PL", "CA", "AA"], + ) + ap.add_argument("-f", "--family", type=int, default=None) -# ====================================================================================# -# Define CLI -def cli(ap, main): - """ - Command-line interface entry point. + ap.add_argument("-s", "--subfamily", type=int, default=None) - Parameters - ---------- - ap : argparse.ArgumentParser - Argument parser. - main : function - Main function. + ap.add_argument("-c", "--characterized", action="store_true", default=False) - """ - cmd = load_args(ap) - main(**vars(cmd)) + ap.add_argument( + "-v", + "--version", + help="show version", + action="version", + version=f"Running {ap.prog} v{VERSION}", + ) + args = ap.parse_args() -def maincli(): - """Execute main client.""" - cli(ap, main) + if args.enzyme_class not in ENZYME_LIST: + logging.error(f"Enzyme class {args.enzyme_class} not supported") + sys.exit() + else: + enzyme_name = ENZYME_LIST[args.enzyme_class] + id_list = retrieve_genbank_ids( + enzyme_name, args.family, args.subfamily, args.characterized + ) -# ====================================================================================# -# Main code -def main(enzyme_class, family, subfamily, characterized): - """Main function.""" + output_fname = f"{args.enzyme_class}" + if args.family: + output_fname += f"{args.family}" + if args.subfamily: + output_fname += f"_{args.subfamily}" + if args.characterized: + output_fname += "_characterized" log.info("-" * 42) log.info("") @@ -94,20 +71,6 @@ def main(enzyme_class, family, subfamily, characterized): log.info("") log.info("-" * 42) - if enzyme_class not in ENZYME_LIST: - logging.error(f"Enzyme class {enzyme_class} not supported") - sys.exit() - else: - enzyme_name = ENZYME_LIST[enzyme_class] - - id_list = retrieve_genbank_ids(enzyme_name, family, subfamily, characterized) - - output_fname = f"{enzyme_class}" - if family: - output_fname += f"{family}" - if subfamily: - output_fname += f"_{subfamily}" - today = time.strftime("%d%m%Y") output_fname += f"_{today}.fasta" try: @@ -128,4 +91,4 @@ def main(enzyme_class, family, subfamily, characterized): if __name__ == "__main__": - sys.exit(maincli()) + sys.exit(main()) diff --git a/cazy_parser/modules/html.py b/cazy_parser/modules/html.py index 66bba06..b358108 100644 --- a/cazy_parser/modules/html.py +++ b/cazy_parser/modules/html.py @@ -5,6 +5,8 @@ import string import sys import urllib +import urllib.request +from typing import Optional import requests from bs4 import BeautifulSoup @@ -176,21 +178,13 @@ def get_data_from_txt(link): return data_list -def fetch_links(enzyme_class, family, subfamily): - """ - Fetch link structure for an enzyme class. - - Parameters - ---------- - enzyme_class : str - Enzyme class to fetch links for. - - Returns - ------- - page_list : list - List of links to the pages. - - """ +def fetch_links( + enzyme_class: str, + family: Optional[int] = None, + subfamily: Optional[int] = None, + characterized: Optional[bool] = None, +) -> list[str]: + """Fetch link structure for an enzyme class.""" main_class_link = f"http://www.cazy.org/{enzyme_class}.html" log.info(f"Fetching links for {enzyme_class}, url: {main_class_link}") @@ -205,6 +199,9 @@ def fetch_links(enzyme_class, family, subfamily): log.info(f"Only using links of family {family}") family_list = [e for e in family_list if int(e[2:]) == family] + if characterized: + log.info("Only using characterized links") + if not family_list: log.error("No links were found.") sys.exit() @@ -259,6 +256,9 @@ def fetch_links(enzyme_class, family, subfamily): else: page_list.append(page_zero) + if characterized: + page_list = [e for e in page_list if "characterized" in e] + return page_list @@ -341,28 +341,11 @@ def fetch_species(): return species_dic -def retrieve_genbank_ids(enzyme_name, family=None, subfamily=None, characterized=None): - """ - Retrieve genbank IDs for a given enzyme. - - Parameters - ---------- - enzyme_name : str - Enzyme name to retrieve genbank IDs for. - family : int - Family number to retrieve genbank IDs for. - subfamily : int - Subfamily number to retrieve genbank IDs for. - characterized : bool - Whether to retrieve genbank IDs for characterized enzymes. - - Returns - ------- - genbank_id_list : list - List of genbank IDs. - - """ - page_list = fetch_links(enzyme_name, family, subfamily) +def retrieve_genbank_ids( + enzyme_name: str, family: int, subfamily: int, characterized: bool +) -> list[str]: + """Retrieve genbank IDs for a given enzyme.""" + page_list = fetch_links(enzyme_name, family, subfamily, characterized) data = fetch_data(page_list) genbank_id_list = [] for element in data: diff --git a/cazy_parser/version.py b/cazy_parser/version.py index 311aff2..c129f68 100644 --- a/cazy_parser/version.py +++ b/cazy_parser/version.py @@ -1 +1 @@ -VERSION = "2.0.2" +VERSION = "2.0.3" diff --git a/pyproject.toml b/pyproject.toml index f68a6f0..031b16c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,9 +3,9 @@ pythonpath = ["src"] [tool.poetry] name = "cazy-parser" -version = "2.0.2" +version = "2.0.3" description = "A way to extract specific information from CAZy" -authors = ["Rodrigo V. Honorato "] +authors = ["Rodrigo V. Honorato "] readme = "README.md" packages = [{ include = "cazy_parser" }] classifiers = [ @@ -34,7 +34,7 @@ coverage = "^7.2.5" hypothesis = "^6.75.1" [tool.poetry.scripts] -cazy-parser = 'cazy_parser.cli:maincli' +cazy-parser = 'cazy_parser.cli:main' [build-system] requires = ["poetry-core"] diff --git a/tests/test_html.py b/tests/test_html.py index 769677f..89997ee 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,4 +1,4 @@ -import urllib +import urllib.request import pytest from bs4 import BeautifulSoup @@ -78,7 +78,12 @@ def test_get_data_from_txt(): def test_fetch_links(): - observed_links = fetch_links("Carbohydrate-Esterases", family=None, subfamily=None) + observed_links = fetch_links("Carbohydrate-Esterases", characterized=True) + + assert "http://www.cazy.org/CE20_characterized.html" in observed_links + assert "http://www.cazy.org/IMG/cazy_data/CE20.txt" not in observed_links + + observed_links = fetch_links("Carbohydrate-Esterases", characterized=False) assert "http://www.cazy.org/CE20_characterized.html" in observed_links assert "http://www.cazy.org/IMG/cazy_data/CE20.txt" in observed_links @@ -99,8 +104,15 @@ def test_fetch_species(): def test_retrieve_genbank_ids(): observed_id_list = retrieve_genbank_ids( - enzyme_name="Glycoside-Hydrolases", family=5, subfamily=1 + enzyme_name="Glycoside-Hydrolases", family=5, subfamily=1, characterized=False ) assert observed_id_list assert len(observed_id_list) >= 1223 + + observed_id_list = retrieve_genbank_ids( + enzyme_name="Glycoside-Hydrolases", family=5, subfamily=1, characterized=True + ) + + assert observed_id_list + assert 36 <= len(observed_id_list) <= 1000