diff --git a/src/indra_cogex/sources/disgenet/__init__.py b/src/indra_cogex/sources/disgenet/__init__.py
index 1dc6782d5..1ce14e24a 100644
--- a/src/indra_cogex/sources/disgenet/__init__.py
+++ b/src/indra_cogex/sources/disgenet/__init__.py
@@ -24,12 +24,10 @@ CURATED_DISEASE_GENES_ASSOCIATIONS_URL = (
     f"{DOWNLOAD_BASE}/curated_gene_disease_associations.tsv.gz"
 )
 
-# TODO Tenzin!
 CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL = (
     f"{DOWNLOAD_BASE}/curated_variant_disease_associations.tsv.gz"
 )
 
-
 TARGET_KEYS = {
     "NofSnps": int,
     "DSI": str,
@@ -65,19 +63,22 @@ class DisgenetGeneProcessor(Processor):
 
     def __init__(self):
         """Initialize the DisGeNet processor."""
-        self.df = load_disgenet_disease_gene(CURATED_DISEASE_GENES_ASSOCIATIONS_URL)
+        self.df = load_disgenet_disease_gene(
+            CURATED_DISEASE_GENES_ASSOCIATIONS_URL)
 
     def get_nodes(self):  # noqa:D102
         diseases = {
             tuple(row)
-            for row in self.df[["disease_prefix", "disease_id", "disease_name"]].values
+            for row in
+            self.df[["disease_prefix", "disease_id", "disease_name"]].values
         }
         for prefix, identifier, name in diseases:
             yield Node.standardized(
                 db_ns=prefix, db_id=identifier, name=name, labels=["BioEntity"]
             )
         for hgnc_id in self.df["hgnc_id"].unique():
-            yield Node.standardized(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"])
+            yield Node.standardized(db_ns="HGNC", db_id=hgnc_id,
+                                    labels=["BioEntity"])
 
     def get_relations(self):  # noqa:D102
         columns = [
@@ -90,35 +91,14 @@ def get_relations(self):  # noqa:D102
         for hgnc_id, disease_prefix, disease_id, snps, papers in self.df[
             columns
         ].values:
-            data = {"snps:int": snps, "source": self.name, "papers:int": papers}
+            data = {"snps:int": snps, "source": self.name,
+                    "papers:int": papers}
             yield Relation(
-                "HGNC", hgnc_id, disease_prefix, disease_id, self.relation, data
+                "HGNC", hgnc_id, disease_prefix, disease_id, self.relation,
+                data
             )
 
 
-def load_disgenet_disease_gene(url, force: bool = False) -> pd.DataFrame:
-    """Export disease-gene association file."""
-    df = SUBMODULE.ensure_csv(
-        url=url,
-        read_csv_kwargs=dict(dtype={"geneId": str}),
-        force=force,
-    )
-
-    df["hgnc_id"] = df["geneId"].map(
-        lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
-    )
-    df = df[df["hgnc_id"].notna()]
-
-    umls_mapper = UmlsMapper()
-    (
-        df["disease_prefix"],
-        df["disease_id"],
-        df["disease_name"],
-    ) = zip(*df["diseaseId"].map(umls_mapper.standardize))
-    df = df[df["disease_prefix"].notna()]
-    return df
-
-
 class DisgenetVariantProcessor(Processor):
     """Processor for the DisGeNet database."""
 
@@ -127,4 +107,79 @@ class DisgenetVariantProcessor(Processor):
     name = "disgenet"
     df: pd.DataFrame
     node_types = ["BioEntity"]
     relation = "variant_disease_association"
 
-    ...
+    def __init__(self):
+        """Initialize the DisGeNet processor."""
+        self.df = load_disgenet_disease_gene(
+            CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL, variant=True)
+
+    def get_nodes(self):  # noqa:D102
+        # The disease-variant association TSV has no column describing a
+        # disease prefix, so diseaseClass is used in its place
+        diseases = {
+            tuple(row)
+            for row in
+            self.df[["diseaseClass", "diseaseId", "diseaseName"]].values
+        }
+        for disease_class, identifier, name in diseases:
+            yield Node.standardized(
+                db_ns=disease_class, db_id=identifier, name=name,
+                labels=["BioEntity"]
+            )
+        for snp_id in self.df["snpId"].unique():
+            yield Node.standardized(db_ns="DBSNP", db_id=snp_id,
+                                    labels=["BioEntity"])
+
+    def get_relations(self):  # noqa:D102
+        # Use diseaseName instead of disease_prefix because the
+        # disease-variant TSV does not provide a prefix column
+        columns = [
+            "snpId",
+            "DSI",
+            "DPI",
+            "diseaseName",
+            "diseaseId",
+            "NofPmids",
+        ]
+
+        for snp_id, dsi, dpi, disease_name, disease_id, papers in self.df[
+            columns
+        ].values:
+            data = {"snp_id:str": snp_id, "source": self.name,
+                    "dsi:float": dsi, "dpi:float": dpi,
+                    "papers:int": papers}
+            yield Relation(
+                "DBSNP", snp_id, disease_name, disease_id, self.relation,
+                data
+            )
+
+
+def load_disgenet_disease_gene(url, force: bool = False,
+                               variant: bool = False) -> pd.DataFrame:
+    """Load and process a DisGeNet association file."""
+    df = SUBMODULE.ensure_csv(
+        url=url,
+        read_csv_kwargs=dict(dtype={"geneId": str,
+                                    "snpId": str}),
+        force=force,
+    )
+
+    if not variant:
+        df["hgnc_id"] = df["geneId"].map(
+            lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
+        )
+        df = df[df["hgnc_id"].notna()]
+        umls_mapper = UmlsMapper()
+        (
+            df["disease_prefix"],
+            df["disease_id"],
+            df["disease_name"],
+        ) = zip(*df["diseaseId"].map(umls_mapper.standardize))
+        df = df[df["disease_prefix"].notna()]
+        return df
+    else:
+        # Several DSI and DPI fields are NaN; drop those rows
+        df = df[df["DSI"].notna()]
+        df = df[df["DPI"].notna()]
+        return df
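
A minimal usage sketch (not part of the diff) of how the new variant processor might be exercised; it is an assumption about the driver code and relies only on names introduced above: the indra_cogex.sources.disgenet module path, the DisgenetVariantProcessor class, and its get_nodes/get_relations generators. Constructing the processor is expected to download and filter the curated variant-disease TSV.

from itertools import islice

from indra_cogex.sources.disgenet import DisgenetVariantProcessor

# Instantiating the processor loads the curated variant-disease
# associations and drops rows with missing DSI/DPI values.
processor = DisgenetVariantProcessor()

# Peek at a handful of the generated graph nodes and relations.
for node in islice(processor.get_nodes(), 5):
    print(node)

for relation in islice(processor.get_relations(), 5):
    print(relation)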