Merge pull request #149 from kkaris/validate-data

Dumping updates and data validation
gyorilab · Feb 5, 2024 · fa9a217 · fa9a217
2 parents c0d3493 + 7ef4314
commit fa9a217
Show file tree

Hide file tree

Showing 43 changed files with 1,344 additions and 561 deletions.
diff --git a/src/indra_cogex/apps/constants.py b/src/indra_cogex/apps/constants.py
@@ -3,7 +3,11 @@
 from typing import Union
 from indra.util.statement_presentation import db_sources, reader_sources
 from indra.config import get_config
-from pusher import pusher
+
+try:
+    from pusher import pusher
+except ImportError:
+    pusher = None
 
 
 logger = logging.getLogger(__name__)
@@ -32,6 +36,12 @@
     "has_patent": "Project Patents",
     "has_marker": "Cell Markers",
     "has_domain": "Protein Domains",
+    "gene_disease_association": "Gene Disease Associations",
+    # Links Publications to Journals
+    "published_in": "Journal Associations",
+    "variant_disease_association": "Variant Disease Associations",
+    "variant_gene_association": "Variant Gene Associations",
+    "variant_phenotype_association": "Variant Phenotype Associations",
 }
 
 INDRA_COGEX_WEB_LOCAL = (get_config("INDRA_COGEX_WEB_LOCAL") or "").lower() in {
@@ -82,7 +92,7 @@
 pusher_cluster = get_config("CLARE_PUSHER_CLUSTER")
 
 # Pusher app
-if pusher_app_id and pusher_key and pusher_secret and pusher_cluster:
+if pusher is not None and pusher_app_id and pusher_key and pusher_secret and pusher_cluster:
     pusher_app = pusher.Pusher(
         app_id=pusher_app_id,
         key=pusher_key,

diff --git a/src/indra_cogex/apps/curator/curator_blueprint.py b/src/indra_cogex/apps/curator/curator_blueprint.py
@@ -17,17 +17,17 @@
 from indra_cogex.client.curation import (
     get_disprot_statements,
     get_dub_statements,
-    get_entity_evidence_counts,
-    get_goa_evidence_counts,
+    get_entity_source_counts,
+    get_goa_source_counts,
     get_kinase_statements,
     get_mirna_statements,
     get_phosphatase_statements,
-    get_ppi_evidence_counts,
+    get_ppi_source_counts,
     get_tf_statements,
 )
 from indra_cogex.client.queries import get_stmts_for_mesh, get_stmts_for_stmt_hashes
 
-from .utils import get_conflict_evidence_counts
+from .utils import get_conflict_source_counts
 from ..utils import (
     remove_curated_evidences,
     remove_curated_pa_hashes,
@@ -237,7 +237,7 @@ def _render_func(
     ----------
     func :
         A function that takes a ``client`` and any arbitrary arguments
-        (passed through ``func_kwargs``) and returns an evidence count
+        (passed through ``func_kwargs``) and returns a source count
         dictionary
     func_kwargs :
         Keyword arguments to pass to the function
@@ -312,7 +312,7 @@ def _render_evidence_counts(
 def ppi():
     """The PPI curator looks for the highest evidences for PPIs that don't appear in a database."""
     return _render_func(
-        get_ppi_evidence_counts,
+        get_ppi_source_counts,
         title="PPI Curator",
         description=f"""\
             The protein-protein interaction (PPI) curator identifies INDRA
@@ -330,7 +330,7 @@ def ppi():
 def goa():
     """The GO Annotation curator looks for the highest evidence gene-GO term relations that don't appear in GOA."""
     return _render_func(
-        get_goa_evidence_counts,
+        get_goa_source_counts,
         title="GO Annotation Curator",
         description=f"""\
             The Gene Ontology annotation curator identifiers INDRA statements
@@ -349,7 +349,13 @@ def goa():
 def conflicts():
     """Curate statements with conflicting prior curations."""
     return _render_func(
-        get_conflict_evidence_counts, title="Conflict Resolver", filter_curated=False
+        get_conflict_source_counts, title="Conflict Resolver",
+        filter_curated=False,
+        description=f"""\
+            The conflict resolver identifies INDRA statements that have
+            conflicting prior curations. {_database_text("Pathway Commons")}
+            {EVIDENCE_TEXT}
+        """,
     )
 
 
@@ -471,7 +477,7 @@ def entity(prefix: str, identifier: str):
         return _curate_paper(prefix, identifier, filter_curated=proxies.filter_curated)
     if prefix in {"hgnc"}:
         return _render_func(
-            get_entity_evidence_counts,
+            get_entity_source_counts,
             func_kwargs=dict(
                 prefix=prefix,
                 identifier=identifier,

diff --git a/src/indra_cogex/apps/curator/utils.py b/src/indra_cogex/apps/curator/utils.py
@@ -7,7 +7,7 @@
 from indra_cogex.apps.proxies import curation_cache
 
 __all__ = [
-    "get_conflict_evidence_counts",
+    "get_conflict_source_counts",
     "unfinished",
     "Curation",
     "iterate_conflicts",
@@ -17,13 +17,13 @@
 
 
 @autoclient()
-def get_conflict_evidence_counts(
+def get_conflict_source_counts(
     *, curations: List[Curation] = None, client: Neo4jClient
-) -> Mapping[int, int]:
+) -> Mapping[int, Mapping[str, int]]:
     """Get hashes of statements whose curations need resolving."""
     return {
-        stmt_hash: evidence_count
-        for stmt_hash, evidence_count, status in iterate_conflicts(
+        stmt_hash: source_counts
+        for stmt_hash, source_counts, status in iterate_conflicts(
             curations=curations, client=client
         )
         if status
@@ -33,7 +33,7 @@ def get_conflict_evidence_counts(
 @autoclient()
 def iterate_conflicts(
     *, curations: List[Curation] = None, client: Neo4jClient
-) -> Iterable[Tuple[int, int, bool]]:
+) -> Iterable[Tuple[int, Mapping[str, int], bool]]:
     """Iterate hashes of statements and their resolution status."""
     if curations is None:
         curations = curation_cache.get_curation_cache()
@@ -42,13 +42,13 @@ def iterate_conflicts(
         MATCH (:BioEntity)-[r:indra_rel]->(:BioEntity)
         WHERE
             r.stmt_hash IN {sorted(stmt_hash_to_counter)!r}
-        RETURN r.stmt_hash, r.evidence_count
+        RETURN r.stmt_hash, r.source_counts
     """
-    for stmt_hash, evidence_count in client.query_tx(query):
-        yield stmt_hash, evidence_count, unfinished(
+    for stmt_hash, source_counts in client.query_dict_value_json(query).items():
+        yield stmt_hash, source_counts, unfinished(
             correct=stmt_hash_to_counter[stmt_hash][True],
             incorrect=stmt_hash_to_counter[stmt_hash][False],
-            evidences=evidence_count,
+            evidences=sum(source_counts.values()),
         )
 
 

diff --git a/src/indra_cogex/apps/data_display/cli.py b/src/indra_cogex/apps/data_display/cli.py
@@ -1,15 +1,22 @@
 from flask import Flask
 from flask_bootstrap import Bootstrap4
+from indra_cogex.apps.curation_cache import CurationCache
 from indralab_auth_tools.auth import auth, config_auth
 from more_click import make_web_command
 
-from indra_cogex.apps.constants import INDRA_COGEX_EXTENSION, STATIC_DIR, TEMPLATES_DIR
+from indra_cogex.apps.constants import (
+    INDRA_COGEX_EXTENSION,
+    STATIC_DIR,
+    TEMPLATES_DIR,
+    STATEMENT_CURATION_CACHE
+)
 from indra_cogex.apps.data_display import data_display_blueprint
 from indra_cogex.client.neo4j_client import Neo4jClient
 
 app = Flask(__name__, template_folder=TEMPLATES_DIR, static_folder=STATIC_DIR)
 bootstrap = Bootstrap4(app)
 app.extensions[INDRA_COGEX_EXTENSION] = Neo4jClient()
+app.extensions[STATEMENT_CURATION_CACHE] = CurationCache()
 app.register_blueprint(auth)
 app.register_blueprint(data_display_blueprint)
 SC, jwt = config_auth(app)

diff --git a/src/indra_cogex/apps/home/__main__.py b/src/indra_cogex/apps/home/__main__.py
@@ -2,7 +2,7 @@
 
 """Run the landing page app with ``python -m indra_cogex.apps.home``."""
 
-from . import cli
+from indra_cogex.apps.home.cli import cli
 
 if __name__ == "__main__":
     cli()
diff --git a/src/indra_cogex/apps/queries_web/__init__.py b/src/indra_cogex/apps/queries_web/__init__.py
@@ -45,7 +45,7 @@
     "mesh_term": ["MESH", "D015002"],
     "pmid_term": ["PUBMED", "34634383"],
     "paper_term": ["PUBMED", "34634383"],
-    "pubmeds": ["20861832", "19503834"],
+    "pmids": ["20861832", "19503834"],
     "include_child_terms": True,
     # NOTE: statement hashes are too large to be int for JavaScript
     "stmt_hash": "12198579805553967",

diff --git a/src/indra_cogex/apps/utils.py b/src/indra_cogex/apps/utils.py
@@ -20,7 +20,6 @@
 from indra.assemblers.html.assembler import _format_evidence_text, _format_stmt_text
 from indra.statements import Statement
 from indra.util.statement_presentation import _get_available_ev_source_counts
-from indra_cogex.util import unicode_escape, UnicodeEscapeError
 from indra_cogex.apps.constants import VUE_SRC_JS, VUE_SRC_CSS, sources_dict
 from indra_cogex.apps.curation_cache.curation_cache import Curations
 from indra_cogex.apps.proxies import curation_cache
@@ -242,33 +241,19 @@ def _stmt_to_row(
         if not ev_array:
             return None
 
-    unicode_errors = 0
     for ev in ev_array:
         # Translate OrderedDict to dict
         org_json = ev["original_json"]
         ev["original_json"] = dict(org_json)
 
-        # Fix unicode escaping: the text will be JSON serialized, so we need to
-        # remove extra escapes or we will have strings like '\\\\\\\\u....'
-        # in the final data.
-        text = ev["text"]
-        if text:
-            try:
-                ev["text"] = unicode_escape(text)
-            except UnicodeEscapeError:
-                unicode_errors += 1
-
-    if unicode_errors:
-        logger.warning(f"{unicode_errors} unicode errors in {stmt.get_hash()}")
-
     english = _format_stmt_text(stmt)
     hash_int = stmt.get_hash()
     if source_counts is None:
         sources = _get_available_ev_source_counts(stmt.evidence)
     else:
         sources = source_counts
 
-    # Remove medscan from the sources count and decrement the total count.
+    # Remove medscan from the sources count
     if remove_medscan and "medscan" in sources:
         del sources["medscan"]
 

diff --git a/src/indra_cogex/client/curation.py b/src/indra_cogex/client/curation.py
@@ -34,14 +34,14 @@
     "get_prioritized_stmt_hashes",
     "get_curation_df",
     "get_go_curation_hashes",
-    "get_ppi_evidence_counts",
-    "get_goa_evidence_counts",
+    "get_ppi_source_counts",
+    "get_goa_source_counts",
     "get_tf_statements",
     "get_kinase_statements",
     "get_phosphatase_statements",
     "get_conflicting_statements",
     "get_dub_statements",
-    "get_entity_evidence_counts",
+    "get_entity_source_counts",
     "get_mirna_statements",
     "get_disprot_statements",
 ]
@@ -164,7 +164,7 @@ def _limit_line(limit: Optional[int] = None) -> str:
 
 
 @autoclient()
-def get_ppi_evidence_counts(
+def get_ppi_source_counts(
     *,
     client: Neo4jClient,
     minimum_evidences: int = 20,
@@ -203,7 +203,7 @@ def get_ppi_evidence_counts(
 
 
 @autoclient()
-def get_goa_evidence_counts(
+def get_goa_source_counts(
     *,
     client: Neo4jClient,
     minimum_evidences: int = 10,
@@ -247,7 +247,7 @@ def _get_symbol_curies(symbols: Iterable[str]) -> List[str]:
 @autoclient()
 def get_tf_statements(
     *, client: Neo4jClient, limit: Optional[int] = None
-) -> Mapping[int, int]:
+) -> Mapping[int, Mapping[str, int]]:
     """Get transcription factor increase amount / decrease amount."""
     return _help(
         sources=TF_CURIES,
@@ -268,7 +268,7 @@ def get_tf_statements(
 @autoclient()
 def get_kinase_statements(
     *, client: Neo4jClient, limit: Optional[int] = None
-) -> Mapping[int, int]:
+) -> Mapping[int, Mapping[str, int]]:
     """Get kinase statements."""
     return _help(
         sources=KINASE_CURIES,
@@ -285,7 +285,7 @@ def get_kinase_statements(
 @autoclient()
 def get_phosphatase_statements(
     *, client: Neo4jClient, limit: Optional[int] = None
-) -> Mapping[int, int]:
+) -> Mapping[int, Mapping[str, int]]:
     """Get phosphatase statements."""
     return _help(
         sources=PHOSPHATASE_CURIES,
@@ -311,7 +311,7 @@ def _get_dub_curies():
 @autoclient()
 def get_dub_statements(
     *, client: Neo4jClient, limit: Optional[int] = None
-) -> Mapping[int, int]:
+) -> Mapping[int, Mapping[str, int]]:
     """Get deubiquitinase statements."""
     return _help(
         sources=_get_dub_curies(),
@@ -336,7 +336,7 @@ def _get_mirnas() -> List[str]:
 @autoclient()
 def get_mirna_statements(
     *, client: Neo4jClient, limit: Optional[int] = None
-) -> Mapping[int, int]:
+) -> Mapping[int, Mapping[str, int]]:
     """Get miRNA statements."""
     return _help(
         sources=MIRNA_CURIES,
@@ -360,7 +360,7 @@ def get_disprot_statements(
     client: Neo4jClient,
     limit: Optional[int] = None,
     object_prefix: Optional[str] = None,
-) -> Mapping[int, int]:
+) -> Mapping[int, Mapping[str, int]]:
     """Get statements about disordered proteins."""
     return _help(
         sources=DISPROT_CURIES,
@@ -409,24 +409,24 @@ def _help(
 
 
 @autoclient()
-def get_entity_evidence_counts(
+def get_entity_source_counts(
     prefix: str,
     identifier: str,
     *,
     client: Neo4jClient,
     limit: Optional[int] = None,
-) -> Mapping[int, int]:
+) -> Mapping[int, Mapping[str, int]]:
     query = f"""\
         MATCH p=(a:BioEntity)-[r:indra_rel]->(b:BioEntity)
         WHERE
             a.id = "{prefix}:{identifier}"
             AND NOT r.has_database_evidence
             AND a.id <> b.id
-        RETURN r.stmt_hash, r.evidence_count
+        RETURN r.stmt_hash, r.source_counts
         ORDER BY r.evidence_count DESC
         {_limit_line(limit)}
     """
-    return client.query_dict(query)
+    return client.query_dict_value_json(query)
 
 
 @autoclient()

diff --git a/src/indra_cogex/client/enrichment/discrete.py b/src/indra_cogex/client/enrichment/discrete.py
@@ -96,7 +96,7 @@ def count_human_genes(*, client: Neo4jClient) -> int:
     query = """\
         MATCH (n:BioEntity)
         WHERE n.id STARTS WITH 'hgnc'
-        AND NOT n.obsolete = "True"
+        AND NOT n.obsolete
         RETURN count(n) as count
     """
     results = client.query_tx(query)