Skip to content

Commit

Permalink
Merge pull request #149 from kkaris/validate-data
Browse files Browse the repository at this point in the history
Dumping updates and data validation
  • Loading branch information
bgyori authored Feb 5, 2024
2 parents c0d3493 + 7ef4314 commit fa9a217
Show file tree
Hide file tree
Showing 43 changed files with 1,344 additions and 561 deletions.
14 changes: 12 additions & 2 deletions src/indra_cogex/apps/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
from typing import Union
from indra.util.statement_presentation import db_sources, reader_sources
from indra.config import get_config
from pusher import pusher

try:
from pusher import pusher
except ImportError:
pusher = None


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -32,6 +36,12 @@
"has_patent": "Project Patents",
"has_marker": "Cell Markers",
"has_domain": "Protein Domains",
"gene_disease_association": "Gene Disease Associations",
# Links Publications to Journals
"published_in": "Journal Associations",
"variant_disease_association": "Variant Disease Associations",
"variant_gene_association": "Variant Gene Associations",
"variant_phenotype_association": "Variant Phenotype Associations",
}

INDRA_COGEX_WEB_LOCAL = (get_config("INDRA_COGEX_WEB_LOCAL") or "").lower() in {
Expand Down Expand Up @@ -82,7 +92,7 @@
pusher_cluster = get_config("CLARE_PUSHER_CLUSTER")

# Pusher app
if pusher_app_id and pusher_key and pusher_secret and pusher_cluster:
if pusher is not None and pusher_app_id and pusher_key and pusher_secret and pusher_cluster:
pusher_app = pusher.Pusher(
app_id=pusher_app_id,
key=pusher_key,
Expand Down
24 changes: 15 additions & 9 deletions src/indra_cogex/apps/curator/curator_blueprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,17 @@
from indra_cogex.client.curation import (
get_disprot_statements,
get_dub_statements,
get_entity_evidence_counts,
get_goa_evidence_counts,
get_entity_source_counts,
get_goa_source_counts,
get_kinase_statements,
get_mirna_statements,
get_phosphatase_statements,
get_ppi_evidence_counts,
get_ppi_source_counts,
get_tf_statements,
)
from indra_cogex.client.queries import get_stmts_for_mesh, get_stmts_for_stmt_hashes

from .utils import get_conflict_evidence_counts
from .utils import get_conflict_source_counts
from ..utils import (
remove_curated_evidences,
remove_curated_pa_hashes,
Expand Down Expand Up @@ -237,7 +237,7 @@ def _render_func(
----------
func :
A function that takes a ``client`` and any arbitrary arguments
(passed through ``func_kwargs``) and returns an evidence count
(passed through ``func_kwargs``) and returns a source count
dictionary
func_kwargs :
Keyword arguments to pass to the function
Expand Down Expand Up @@ -312,7 +312,7 @@ def _render_evidence_counts(
def ppi():
"""The PPI curator looks for the highest-evidence PPIs that don't appear in a database."""
return _render_func(
get_ppi_evidence_counts,
get_ppi_source_counts,
title="PPI Curator",
description=f"""\
The protein-protein interaction (PPI) curator identifies INDRA
Expand All @@ -330,7 +330,7 @@ def ppi():
def goa():
"""The GO Annotation curator looks for the highest evidence gene-GO term relations that don't appear in GOA."""
return _render_func(
get_goa_evidence_counts,
get_goa_source_counts,
title="GO Annotation Curator",
description=f"""\
The Gene Ontology annotation curator identifies INDRA statements
Expand All @@ -349,7 +349,13 @@ def goa():
def conflicts():
"""Curate statements with conflicting prior curations."""
return _render_func(
get_conflict_evidence_counts, title="Conflict Resolver", filter_curated=False
get_conflict_source_counts, title="Conflict Resolver",
filter_curated=False,
description=f"""\
The conflict resolver identifies INDRA statements that have
conflicting prior curations. {_database_text("Pathway Commons")}
{EVIDENCE_TEXT}
""",
)


Expand Down Expand Up @@ -471,7 +477,7 @@ def entity(prefix: str, identifier: str):
return _curate_paper(prefix, identifier, filter_curated=proxies.filter_curated)
if prefix in {"hgnc"}:
return _render_func(
get_entity_evidence_counts,
get_entity_source_counts,
func_kwargs=dict(
prefix=prefix,
identifier=identifier,
Expand Down
20 changes: 10 additions & 10 deletions src/indra_cogex/apps/curator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from indra_cogex.apps.proxies import curation_cache

__all__ = [
"get_conflict_evidence_counts",
"get_conflict_source_counts",
"unfinished",
"Curation",
"iterate_conflicts",
Expand All @@ -17,13 +17,13 @@


@autoclient()
def get_conflict_evidence_counts(
def get_conflict_source_counts(
*, curations: List[Curation] = None, client: Neo4jClient
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get hashes of statements whose curations need resolving."""
return {
stmt_hash: evidence_count
for stmt_hash, evidence_count, status in iterate_conflicts(
stmt_hash: source_counts
for stmt_hash, source_counts, status in iterate_conflicts(
curations=curations, client=client
)
if status
Expand All @@ -33,7 +33,7 @@ def get_conflict_evidence_counts(
@autoclient()
def iterate_conflicts(
*, curations: List[Curation] = None, client: Neo4jClient
) -> Iterable[Tuple[int, int, bool]]:
) -> Iterable[Tuple[int, Mapping[str, int], bool]]:
"""Iterate hashes of statements and their resolution status."""
if curations is None:
curations = curation_cache.get_curation_cache()
Expand All @@ -42,13 +42,13 @@ def iterate_conflicts(
MATCH (:BioEntity)-[r:indra_rel]->(:BioEntity)
WHERE
r.stmt_hash IN {sorted(stmt_hash_to_counter)!r}
RETURN r.stmt_hash, r.evidence_count
RETURN r.stmt_hash, r.source_counts
"""
for stmt_hash, evidence_count in client.query_tx(query):
yield stmt_hash, evidence_count, unfinished(
for stmt_hash, source_counts in client.query_dict_value_json(query).items():
yield stmt_hash, source_counts, unfinished(
correct=stmt_hash_to_counter[stmt_hash][True],
incorrect=stmt_hash_to_counter[stmt_hash][False],
evidences=evidence_count,
evidences=sum(source_counts.values()),
)


Expand Down
9 changes: 8 additions & 1 deletion src/indra_cogex/apps/data_display/cli.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
from flask import Flask
from flask_bootstrap import Bootstrap4
from indra_cogex.apps.curation_cache import CurationCache
from indralab_auth_tools.auth import auth, config_auth
from more_click import make_web_command

from indra_cogex.apps.constants import INDRA_COGEX_EXTENSION, STATIC_DIR, TEMPLATES_DIR
from indra_cogex.apps.constants import (
INDRA_COGEX_EXTENSION,
STATIC_DIR,
TEMPLATES_DIR,
STATEMENT_CURATION_CACHE
)
from indra_cogex.apps.data_display import data_display_blueprint
from indra_cogex.client.neo4j_client import Neo4jClient

app = Flask(__name__, template_folder=TEMPLATES_DIR, static_folder=STATIC_DIR)
bootstrap = Bootstrap4(app)
app.extensions[INDRA_COGEX_EXTENSION] = Neo4jClient()
app.extensions[STATEMENT_CURATION_CACHE] = CurationCache()
app.register_blueprint(auth)
app.register_blueprint(data_display_blueprint)
SC, jwt = config_auth(app)
Expand Down
2 changes: 1 addition & 1 deletion src/indra_cogex/apps/home/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

"""Run the landing page app with ``python -m indra_cogex.apps.home``."""

from . import cli
from indra_cogex.apps.home.cli import cli

if __name__ == "__main__":
cli()
2 changes: 1 addition & 1 deletion src/indra_cogex/apps/queries_web/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"mesh_term": ["MESH", "D015002"],
"pmid_term": ["PUBMED", "34634383"],
"paper_term": ["PUBMED", "34634383"],
"pubmeds": ["20861832", "19503834"],
"pmids": ["20861832", "19503834"],
"include_child_terms": True,
# NOTE: statement hashes are too large to be int for JavaScript
"stmt_hash": "12198579805553967",
Expand Down
17 changes: 1 addition & 16 deletions src/indra_cogex/apps/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from indra.assemblers.html.assembler import _format_evidence_text, _format_stmt_text
from indra.statements import Statement
from indra.util.statement_presentation import _get_available_ev_source_counts
from indra_cogex.util import unicode_escape, UnicodeEscapeError
from indra_cogex.apps.constants import VUE_SRC_JS, VUE_SRC_CSS, sources_dict
from indra_cogex.apps.curation_cache.curation_cache import Curations
from indra_cogex.apps.proxies import curation_cache
Expand Down Expand Up @@ -242,33 +241,19 @@ def _stmt_to_row(
if not ev_array:
return None

unicode_errors = 0
for ev in ev_array:
# Translate OrderedDict to dict
org_json = ev["original_json"]
ev["original_json"] = dict(org_json)

# Fix unicode escaping: the text will be JSON serialized, so we need to
# remove extra escapes or we will have strings like '\\\\\\\\u....'
# in the final data.
text = ev["text"]
if text:
try:
ev["text"] = unicode_escape(text)
except UnicodeEscapeError:
unicode_errors += 1

if unicode_errors:
logger.warning(f"{unicode_errors} unicode errors in {stmt.get_hash()}")

english = _format_stmt_text(stmt)
hash_int = stmt.get_hash()
if source_counts is None:
sources = _get_available_ev_source_counts(stmt.evidence)
else:
sources = source_counts

# Remove medscan from the sources count and decrement the total count.
# Remove medscan from the sources count
if remove_medscan and "medscan" in sources:
del sources["medscan"]

Expand Down
30 changes: 15 additions & 15 deletions src/indra_cogex/client/curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@
"get_prioritized_stmt_hashes",
"get_curation_df",
"get_go_curation_hashes",
"get_ppi_evidence_counts",
"get_goa_evidence_counts",
"get_ppi_source_counts",
"get_goa_source_counts",
"get_tf_statements",
"get_kinase_statements",
"get_phosphatase_statements",
"get_conflicting_statements",
"get_dub_statements",
"get_entity_evidence_counts",
"get_entity_source_counts",
"get_mirna_statements",
"get_disprot_statements",
]
Expand Down Expand Up @@ -164,7 +164,7 @@ def _limit_line(limit: Optional[int] = None) -> str:


@autoclient()
def get_ppi_evidence_counts(
def get_ppi_source_counts(
*,
client: Neo4jClient,
minimum_evidences: int = 20,
Expand Down Expand Up @@ -203,7 +203,7 @@ def get_ppi_evidence_counts(


@autoclient()
def get_goa_evidence_counts(
def get_goa_source_counts(
*,
client: Neo4jClient,
minimum_evidences: int = 10,
Expand Down Expand Up @@ -247,7 +247,7 @@ def _get_symbol_curies(symbols: Iterable[str]) -> List[str]:
@autoclient()
def get_tf_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get transcription factor increase amount / decrease amount."""
return _help(
sources=TF_CURIES,
Expand All @@ -268,7 +268,7 @@ def get_tf_statements(
@autoclient()
def get_kinase_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get kinase statements."""
return _help(
sources=KINASE_CURIES,
Expand All @@ -285,7 +285,7 @@ def get_kinase_statements(
@autoclient()
def get_phosphatase_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get phosphatase statements."""
return _help(
sources=PHOSPHATASE_CURIES,
Expand All @@ -311,7 +311,7 @@ def _get_dub_curies():
@autoclient()
def get_dub_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get deubiquitinase statements."""
return _help(
sources=_get_dub_curies(),
Expand All @@ -336,7 +336,7 @@ def _get_mirnas() -> List[str]:
@autoclient()
def get_mirna_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get miRNA statements."""
return _help(
sources=MIRNA_CURIES,
Expand All @@ -360,7 +360,7 @@ def get_disprot_statements(
client: Neo4jClient,
limit: Optional[int] = None,
object_prefix: Optional[str] = None,
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get statements about disordered proteins."""
return _help(
sources=DISPROT_CURIES,
Expand Down Expand Up @@ -409,24 +409,24 @@ def _help(


@autoclient()
def get_entity_evidence_counts(
def get_entity_source_counts(
prefix: str,
identifier: str,
*,
client: Neo4jClient,
limit: Optional[int] = None,
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
query = f"""\
MATCH p=(a:BioEntity)-[r:indra_rel]->(b:BioEntity)
WHERE
a.id = "{prefix}:{identifier}"
AND NOT r.has_database_evidence
AND a.id <> b.id
RETURN r.stmt_hash, r.evidence_count
RETURN r.stmt_hash, r.source_counts
ORDER BY r.evidence_count DESC
{_limit_line(limit)}
"""
return client.query_dict(query)
return client.query_dict_value_json(query)


@autoclient()
Expand Down
2 changes: 1 addition & 1 deletion src/indra_cogex/client/enrichment/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def count_human_genes(*, client: Neo4jClient) -> int:
query = """\
MATCH (n:BioEntity)
WHERE n.id STARTS WITH 'hgnc'
AND NOT n.obsolete = "True"
AND NOT n.obsolete
RETURN count(n) as count
"""
results = client.query_tx(query)
Expand Down
Loading

0 comments on commit fa9a217

Please sign in to comment.