Skip to content

Commit

Permalink
Add initial implementation for adding disease-variant tsv file
Browse files Browse the repository at this point in the history
  • Loading branch information
nanglo123 committed Dec 12, 2023
1 parent b8ae9da commit 248b00c
Showing 1 changed file with 86 additions and 31 deletions.
117 changes: 86 additions & 31 deletions src/indra_cogex/sources/disgenet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,10 @@
# Download location of the curated gene-disease association table
# (gzipped TSV); loaded by DisgenetGeneProcessor.
CURATED_DISEASE_GENES_ASSOCIATIONS_URL = (
f"{DOWNLOAD_BASE}/curated_gene_disease_associations.tsv.gz"
)
# Download location of the curated variant-disease association table
# (gzipped TSV); loaded by DisgenetVariantProcessor.
CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL = (
f"{DOWNLOAD_BASE}/curated_variant_disease_associations.tsv.gz"
)


TARGET_KEYS = {
"NofSnps": int,
"DSI": str,
Expand Down Expand Up @@ -65,19 +63,22 @@ class DisgenetGeneProcessor(Processor):

def __init__(self):
    """Initialize the DisGeNet gene processor.

    The curated disease-gene association table is loaded exactly once and
    stored on ``self.df``, which ``get_nodes`` and ``get_relations`` read.
    """
    self.df = load_disgenet_disease_gene(
        CURATED_DISEASE_GENES_ASSOCIATIONS_URL)

def get_nodes(self):  # noqa:D102
    # Reduce the table to its distinct (prefix, id, name) disease triples so
    # that every disease is emitted exactly once.
    disease_columns = ["disease_prefix", "disease_id", "disease_name"]
    diseases = {tuple(row) for row in self.df[disease_columns].values}
    for prefix, identifier, name in diseases:
        yield Node.standardized(
            db_ns=prefix, db_id=identifier, name=name, labels=["BioEntity"]
        )
    # One gene node per distinct HGNC identifier (a single yield per gene).
    for hgnc_id in self.df["hgnc_id"].unique():
        yield Node.standardized(db_ns="HGNC", db_id=hgnc_id,
                                labels=["BioEntity"])

def get_relations(self): # noqa:D102
columns = [
Expand All @@ -90,35 +91,14 @@ def get_relations(self): # noqa:D102
for hgnc_id, disease_prefix, disease_id, snps, papers in self.df[
columns
].values:
data = {"snps:int": snps, "source": self.name, "papers:int": papers}
data = {"snps:int": snps, "source": self.name,
"papers:int": papers}
yield Relation(
"HGNC", hgnc_id, disease_prefix, disease_id, self.relation, data
"HGNC", hgnc_id, disease_prefix, disease_id, self.relation,
data
)


def load_disgenet_disease_gene(url, force: bool = False) -> pd.DataFrame:
    """Load the curated disease-gene association table as a data frame.

    Parameters
    ----------
    url :
        Location of the association file; handed to ``SUBMODULE.ensure_csv``.
    force :
        Passed through unchanged to ``SUBMODULE.ensure_csv``.

    Returns
    -------
    :
        The association rows that could be mapped to an HGNC identifier and
        to a standardized disease, with the extra columns ``hgnc_id``,
        ``disease_prefix``, ``disease_id`` and ``disease_name``.
    """
    df = SUBMODULE.ensure_csv(
        url=url,
        read_csv_kwargs=dict(dtype={"geneId": str}),
        force=force,
    )

    df["hgnc_id"] = df["geneId"].map(
        lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
    )
    # Take an explicit copy: the new columns below are written onto the
    # filtered frame, and writing onto a bare slice is a chained assignment.
    df = df[df["hgnc_id"].notna()].copy()

    umls_mapper = UmlsMapper()
    # Build the columns from a list of triples instead of ``zip(*...)`` so an
    # empty frame yields three empty columns rather than an unpacking error.
    standardized = [umls_mapper.standardize(cui) for cui in df["diseaseId"]]
    new_columns = ("disease_prefix", "disease_id", "disease_name")
    for position, column in enumerate(new_columns):
        df[column] = [triple[position] for triple in standardized]
    return df[df["disease_prefix"].notna()]


class DisgenetVariantProcessor(Processor):
"""Processor for the DisGeNet database."""

Expand All @@ -127,4 +107,79 @@ class DisgenetVariantProcessor(Processor):
node_types = ["BioEntity"]
relation = "variant_disease_association"

...
def __init__(self):
    """Load the curated variant-disease associations into ``self.df``."""
    associations = load_disgenet_disease_gene(
        url=CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL,
        variant=True,
    )
    self.df = associations

def get_nodes(self):  # noqa:D102
    # The variant table carries no disease-prefix column. ``diseaseId`` is
    # the same column the gene loader hands to ``UmlsMapper.standardize``,
    # so diseases are emitted under the UMLS namespace here.
    # ``diseaseClass`` is not used as ``db_ns``: it is presumably a
    # classification of the disease rather than an identifier prefix.
    # NOTE(review): confirm "UMLS" is the namespace spelling expected by
    # ``Node.standardized``.
    diseases = {
        tuple(row)
        for row in self.df[["diseaseId", "diseaseName"]].values
    }
    for identifier, name in diseases:
        yield Node.standardized(
            db_ns="UMLS", db_id=identifier, name=name,
            labels=["BioEntity"]
        )
    # One variant node per distinct dbSNP identifier.
    for snp_id in self.df["snpId"].unique():
        yield Node.standardized(db_ns="DBSNP", db_id=snp_id,
                                labels=["BioEntity"])

def get_relations(self):  # noqa:D102
    # The third positional argument of ``Relation`` is the target namespace
    # (the gene processor passes its disease prefix there), so a disease
    # name must not be used in that slot. ``diseaseId`` is the column the
    # gene loader feeds to ``UmlsMapper.standardize``, hence the UMLS
    # namespace for the relation target.
    # NOTE(review): confirm "UMLS" is the namespace spelling expected
    # downstream.
    columns = [
        "snpId",
        "DSI",
        "DPI",
        "diseaseId",
        "NofPmids",
    ]

    for snp_id, dsi, dpi, disease_id, papers in self.df[columns].values:
        data = {"snp_id:str": snp_id, "source": self.name,
                "dsi:float": dsi, "dpi:float": dpi,
                "papers:int": papers}
        yield Relation(
            "DBSNP", snp_id, "UMLS", disease_id, self.relation,
            data
        )


def load_disgenet_disease_gene(url, force: bool = False,
                               variant: bool = False) -> pd.DataFrame:
    """Load a curated DisGeNet association table as a data frame.

    Parameters
    ----------
    url :
        Location of the association file; handed to ``SUBMODULE.ensure_csv``.
    force :
        Passed through unchanged to ``SUBMODULE.ensure_csv``.
    variant :
        If False (the default), the table is treated as disease-gene
        associations: rows are kept only when the gene maps to an HGNC
        identifier and the disease can be standardized. If True, the table
        is treated as variant-disease associations and only rows with both
        ``DSI`` and ``DPI`` present are kept.

    Returns
    -------
    :
        The filtered association table. In gene mode it additionally has the
        columns ``hgnc_id``, ``disease_prefix``, ``disease_id`` and
        ``disease_name``.
    """
    df = SUBMODULE.ensure_csv(
        url=url,
        read_csv_kwargs=dict(dtype={"geneId": str,
                                    "snpId": str}),
        force=force,
    )

    if variant:
        # Several DSI and DPI fields are NaN; keep only rows that have both.
        return df.dropna(subset=["DSI", "DPI"])

    df["hgnc_id"] = df["geneId"].map(
        lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
    )
    # Take an explicit copy: the new columns below are written onto the
    # filtered frame, and writing onto a bare slice is a chained assignment.
    df = df[df["hgnc_id"].notna()].copy()

    umls_mapper = UmlsMapper()
    # Build the columns from a list of triples instead of ``zip(*...)`` so an
    # empty frame yields three empty columns rather than an unpacking error.
    standardized = [umls_mapper.standardize(cui) for cui in df["diseaseId"]]
    new_columns = ("disease_prefix", "disease_id", "disease_name")
    for position, column in enumerate(new_columns):
        df[column] = [triple[position] for triple in standardized]
    return df[df["disease_prefix"].notna()]


0 comments on commit 248b00c

Please sign in to comment.