Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dumping updates and data validation #149

Merged
merged 110 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
110 commits
Select commit Hold shift + click to select a range
4809093
Make Py 3.7, 3.8 compatible
kkaris Dec 17, 2023
6f33bdc
Extract article PublicationType
kkaris Dec 19, 2023
c7bfe69
Put PublicationType tags in pmid year file. Update file name.
kkaris Dec 19, 2023
faaa266
Add PublicationType tags to Publication Nodes
kkaris Dec 19, 2023
30ba860
Fix file path in wikidata processor
kkaris Dec 20, 2023
1ab476c
Add retraction boolean to Publication Nodes
kkaris Jan 2, 2024
469a864
Check for principal DB connection before starting raw export script
kkaris Dec 12, 2023
22bf10a
Log successful detection of prerequisite resources
kkaris Dec 12, 2023
ce42148
Make pmid year types file tab delimited
kkaris Jan 2, 2024
dc9b4fe
Handle missing main issn value
kkaris Jan 2, 2024
07dc37d
Add shorthand for getting neo4j boolean
kkaris Jan 3, 2024
d170939
Use boolean helper in all boolean exports
kkaris Jan 3, 2024
9fa8c53
Update docstring for helper
kkaris Jan 3, 2024
f651905
Circular import fix for indra_db, pubmed
kkaris Jan 4, 2024
fd33276
Set default file paths for process_mesh_xml_to_csv
kkaris Jan 4, 2024
be1e602
Set retraction boolean, year, pubtypes in Evidence, Publication nodes
kkaris Jan 4, 2024
fd6252a
Set retraction boolean for statement relations
kkaris Jan 4, 2024
5215c3d
Set None if no pubtypes
kkaris Jan 4, 2024
fdae175
Better docstring for get_bool
kkaris Jan 4, 2024
b260fd1
Fix retraction lookup
kkaris Jan 4, 2024
1235d48
Fix comments, log messages
kkaris Jan 8, 2024
325c504
Warm up bio ontology before for loop for better tqdm estimate
kkaris Jan 8, 2024
37b9a20
Fix bug not checking indra evidence
kkaris Jan 8, 2024
c3f2b65
Fix data type in disgenet relations
kkaris Jan 20, 2024
0437ff8
Update tqdm counts, set unit scale.
kkaris Jan 20, 2024
c4f4459
Fix order of snps and score
kkaris Jan 20, 2024
920ff09
Handle NaN in JournalPublisherProcessor
kkaris Jan 22, 2024
6f76554
Make a static version of _dump_nodes_to_path to allow for calling fro…
kkaris Jan 22, 2024
8f6fe13
Add Journal and Publisher nodes in indexing
kkaris Jan 22, 2024
49234ee
Also check for empty strings and literal nan in JournalPublisherProce…
kkaris Jan 22, 2024
86c287d
Set name instead of getting from Processor
kkaris Jan 22, 2024
0f977e1
Add util for processor checking data types
kkaris Jan 3, 2024
64734cd
Check 10 first data dicts of nodes and edges
kkaris Jan 3, 2024
24e9b75
Add test for data validator
kkaris Jan 3, 2024
7eabe7b
Add error class for unknown data type
kkaris Jan 3, 2024
f8a64b7
Clearly log bad types for values
kkaris Jan 3, 2024
2a91dbf
Better logic
kkaris Jan 3, 2024
a20219b
Clearly specify which type is Neo4j in error message
kkaris Jan 3, 2024
29aa478
Better error message, consolidate checks or same python type
kkaris Jan 3, 2024
ce989aa
Fix docstring
kkaris Jan 3, 2024
2a0f52d
Update URL to docs
kkaris Jan 3, 2024
2335a88
Import constant from util
kkaris Jan 3, 2024
fc203ca
Add test for bad data value
kkaris Jan 3, 2024
71b5869
Handle arrays
kkaris Jan 4, 2024
0afba04
Add link to Neo4j python API docs
kkaris Jan 4, 2024
0e8d523
Fix error type
kkaris Jan 4, 2024
6e8937f
Add more test cases
kkaris Jan 4, 2024
d714c40
Rename test file
kkaris Jan 4, 2024
8f7cfea
Keep array type when checking data
kkaris Jan 4, 2024
5d9f594
WIP cache todos
kkaris Jan 4, 2024
4ab4d71
More clear error message
kkaris Jan 8, 2024
fc5c1cd
Improve docstring
kkaris Jan 8, 2024
4bd97ae
Validate node data until all available fields are checked
kkaris Jan 8, 2024
87fad15
Add docstrings to validate_nodes and validate_relations
kkaris Jan 8, 2024
2aba99f
Get header before checking validity of relations
kkaris Jan 8, 2024
c915535
Only pass metadata headers for nodes validation
kkaris Jan 8, 2024
a9f9a81
Fix array data type validator test
kkaris Jan 9, 2024
92d9989
Extend and fix basic data validator test
kkaris Jan 9, 2024
248ece2
Fix logic of when to flag data as checked
kkaris Jan 9, 2024
af2fab0
Make sure relations is not generator
kkaris Jan 9, 2024
0721404
Fix bug getting dict items
kkaris Jan 9, 2024
0cebbf3
Log when data was checked last time
kkaris Jan 10, 2024
e42ac5e
Add missing check for exception
kkaris Jan 10, 2024
1366acd
Actually load evidence json
kkaris Jan 10, 2024
9e5e851
Fix bug due to rebase conflict
kkaris Jan 22, 2024
ca61d31
Set neo4j type for version
kkaris Jan 22, 2024
bfb42fd
Standardize nodes in GoaProcessor
kkaris Jan 22, 2024
deeb1af
Set neo4j types for ontology
kkaris Jan 22, 2024
8fe7b03
Add names in clinicaltrial BioEntity nodes
kkaris Jan 23, 2024
9ad52e1
Standardize nodes in cbioportal processors
kkaris Jan 23, 2024
abc6dfa
Standardize interpro BioEntity nodes
kkaris Jan 23, 2024
82ecb93
Add argument for checking all data in node/edges validation, true by …
kkaris Jan 23, 2024
fda1a2c
Compress clinicaltrials csv
kkaris Jan 23, 2024
add25ad
Allow None
kkaris Jan 23, 2024
ac618b7
Handle NaN for start year
kkaris Jan 23, 2024
fce902e
Reset version to string in PyOboProcessor
kkaris Jan 23, 2024
280f785
Add ping method to client
kkaris Jan 23, 2024
0ac2491
Undo some of the data parsing in ontology
kkaris Jan 23, 2024
80befa8
De-duplicate clinicaltrials nodes
kkaris Jan 23, 2024
6b4fc46
Update argument name
kkaris Jan 22, 2024
7ccb140
Use boolean properly after data update
kkaris Jan 23, 2024
ff8c4d8
Fix import of cli
kkaris Jan 24, 2024
186bcb7
Add new mappings to edge labels
kkaris Jan 24, 2024
d84151b
Align functions and expected return values in curation pages
kkaris Jan 24, 2024
5cf2810
Fix comment
kkaris Jan 24, 2024
e818d87
Update so data display can be run on its own
kkaris Jan 24, 2024
b2ed050
Remove now unnecessary unicode escaping
kkaris Jan 24, 2024
7145b71
Remove unused functions, tests related to old unicode escaping
kkaris Jan 24, 2024
af0150f
Filter out duplicates for ResearchProject nodes in NihReporter
kkaris Jan 25, 2024
e6d29dd
Fix warning
kkaris Jan 25, 2024
0fdf26c
Set node labels to assemble automatically
kkaris Jan 26, 2024
efd0419
Fix f-string
kkaris Jan 26, 2024
1f960e6
Add pusher checks
cthoyt Jan 26, 2024
fc96467
Set evidence codes as string array
kkaris Jan 26, 2024
28c17e2
Remove redundant name setting
kkaris Feb 1, 2024
e325615
Restore to un-standardized nodes in GoaProcessor
kkaris Feb 1, 2024
1f2d561
Make ec-codes string array
kkaris Feb 1, 2024
fcdc00b
Catch and raise error when no nodes/relations are generated from proc…
kkaris Feb 1, 2024
fc40e08
Straighten out node standardization in ClinicaltrialsProcessor
kkaris Feb 1, 2024
fb6b100
Unstandardize GO, HGNC in InterproProcessor
kkaris Feb 1, 2024
f03d012
Unstandardize HGNC nodes for CCLE Mutations and Cna processors
kkaris Feb 1, 2024
a51d706
Unstandardize drug mapping nodes, use provided name instead
kkaris Feb 1, 2024
caaf350
Handle null values in data validation
kkaris Feb 1, 2024
b948836
Handle non-string values in data validation
kkaris Feb 1, 2024
05d5494
Filter out null data in arrays
kkaris Feb 1, 2024
05cfbc0
Add mock nodes and relation in mock processor testing
kkaris Feb 1, 2024
8778435
Return hgnc entries only
kkaris Feb 2, 2024
620a94f
Update test for get genes for go term
kkaris Feb 2, 2024
8364132
Update drug in clinicaltrial
kkaris Feb 2, 2024
7ef4314
Compress isni in relations as well
kkaris Feb 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/indra_cogex/apps/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
from typing import Union
from indra.util.statement_presentation import db_sources, reader_sources
from indra.config import get_config
from pusher import pusher

try:
from pusher import pusher
except ImportError:
pusher = None


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -32,6 +36,12 @@
"has_patent": "Project Patents",
"has_marker": "Cell Markers",
"has_domain": "Protein Domains",
"gene_disease_association": "Gene Disease Associations",
# Links Publications to Journals
"published_in": "Journal Associations",
"variant_disease_association": "Variant Disease Associations",
"variant_gene_association": "Variant Gene Associations",
"variant_phenotype_association": "Variant Phenotype Associations",
}

INDRA_COGEX_WEB_LOCAL = (get_config("INDRA_COGEX_WEB_LOCAL") or "").lower() in {
Expand Down Expand Up @@ -82,7 +92,7 @@
pusher_cluster = get_config("CLARE_PUSHER_CLUSTER")

# Pusher app
if pusher_app_id and pusher_key and pusher_secret and pusher_cluster:
if pusher is not None and pusher_app_id and pusher_key and pusher_secret and pusher_cluster:
pusher_app = pusher.Pusher(
app_id=pusher_app_id,
key=pusher_key,
Expand Down
24 changes: 15 additions & 9 deletions src/indra_cogex/apps/curator/curator_blueprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,17 @@
from indra_cogex.client.curation import (
get_disprot_statements,
get_dub_statements,
get_entity_evidence_counts,
get_goa_evidence_counts,
get_entity_source_counts,
get_goa_source_counts,
get_kinase_statements,
get_mirna_statements,
get_phosphatase_statements,
get_ppi_evidence_counts,
get_ppi_source_counts,
get_tf_statements,
)
from indra_cogex.client.queries import get_stmts_for_mesh, get_stmts_for_stmt_hashes

from .utils import get_conflict_evidence_counts
from .utils import get_conflict_source_counts
from ..utils import (
remove_curated_evidences,
remove_curated_pa_hashes,
Expand Down Expand Up @@ -237,7 +237,7 @@ def _render_func(
----------
func :
A function that takes a ``client`` and any arbitrary arguments
(passed through ``func_kwargs``) and returns an evidence count
(passed through ``func_kwargs``) and returns a source count
dictionary
func_kwargs :
Keyword arguments to pass to the function
Expand Down Expand Up @@ -312,7 +312,7 @@ def _render_evidence_counts(
def ppi():
"""The PPI curator looks for the highest evidences for PPIs that don't appear in a database."""
return _render_func(
get_ppi_evidence_counts,
get_ppi_source_counts,
title="PPI Curator",
description=f"""\
The protein-protein interaction (PPI) curator identifies INDRA
Expand All @@ -330,7 +330,7 @@ def ppi():
def goa():
"""The GO Annotation curator looks for the highest evidence gene-GO term relations that don't appear in GOA."""
return _render_func(
get_goa_evidence_counts,
get_goa_source_counts,
title="GO Annotation Curator",
description=f"""\
The Gene Ontology annotation curator identifiers INDRA statements
Expand All @@ -349,7 +349,13 @@ def goa():
def conflicts():
"""Curate statements with conflicting prior curations."""
return _render_func(
get_conflict_evidence_counts, title="Conflict Resolver", filter_curated=False
get_conflict_source_counts, title="Conflict Resolver",
filter_curated=False,
description=f"""\
The conflict resolver identifies INDRA statements that have
conflicting prior curations. {_database_text("Pathway Commons")}
{EVIDENCE_TEXT}
""",
)


Expand Down Expand Up @@ -471,7 +477,7 @@ def entity(prefix: str, identifier: str):
return _curate_paper(prefix, identifier, filter_curated=proxies.filter_curated)
if prefix in {"hgnc"}:
return _render_func(
get_entity_evidence_counts,
get_entity_source_counts,
func_kwargs=dict(
prefix=prefix,
identifier=identifier,
Expand Down
20 changes: 10 additions & 10 deletions src/indra_cogex/apps/curator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from indra_cogex.apps.proxies import curation_cache

__all__ = [
"get_conflict_evidence_counts",
"get_conflict_source_counts",
"unfinished",
"Curation",
"iterate_conflicts",
Expand All @@ -17,13 +17,13 @@


@autoclient()
def get_conflict_evidence_counts(
def get_conflict_source_counts(
*, curations: List[Curation] = None, client: Neo4jClient
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get hashes of statements whose curations need resolving."""
return {
stmt_hash: evidence_count
for stmt_hash, evidence_count, status in iterate_conflicts(
stmt_hash: source_counts
for stmt_hash, source_counts, status in iterate_conflicts(
curations=curations, client=client
)
if status
Expand All @@ -33,7 +33,7 @@ def get_conflict_evidence_counts(
@autoclient()
def iterate_conflicts(
*, curations: List[Curation] = None, client: Neo4jClient
) -> Iterable[Tuple[int, int, bool]]:
) -> Iterable[Tuple[int, Mapping[str, int], bool]]:
"""Iterate hashes of statements and their resolution status."""
if curations is None:
curations = curation_cache.get_curation_cache()
Expand All @@ -42,13 +42,13 @@ def iterate_conflicts(
MATCH (:BioEntity)-[r:indra_rel]->(:BioEntity)
WHERE
r.stmt_hash IN {sorted(stmt_hash_to_counter)!r}
RETURN r.stmt_hash, r.evidence_count
RETURN r.stmt_hash, r.source_counts
"""
for stmt_hash, evidence_count in client.query_tx(query):
yield stmt_hash, evidence_count, unfinished(
for stmt_hash, source_counts in client.query_dict_value_json(query).items():
yield stmt_hash, source_counts, unfinished(
correct=stmt_hash_to_counter[stmt_hash][True],
incorrect=stmt_hash_to_counter[stmt_hash][False],
evidences=evidence_count,
evidences=sum(source_counts.values()),
)


Expand Down
9 changes: 8 additions & 1 deletion src/indra_cogex/apps/data_display/cli.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
from flask import Flask
from flask_bootstrap import Bootstrap4
from indra_cogex.apps.curation_cache import CurationCache
from indralab_auth_tools.auth import auth, config_auth
from more_click import make_web_command

from indra_cogex.apps.constants import INDRA_COGEX_EXTENSION, STATIC_DIR, TEMPLATES_DIR
from indra_cogex.apps.constants import (
INDRA_COGEX_EXTENSION,
STATIC_DIR,
TEMPLATES_DIR,
STATEMENT_CURATION_CACHE
)
from indra_cogex.apps.data_display import data_display_blueprint
from indra_cogex.client.neo4j_client import Neo4jClient

app = Flask(__name__, template_folder=TEMPLATES_DIR, static_folder=STATIC_DIR)
bootstrap = Bootstrap4(app)
app.extensions[INDRA_COGEX_EXTENSION] = Neo4jClient()
app.extensions[STATEMENT_CURATION_CACHE] = CurationCache()
app.register_blueprint(auth)
app.register_blueprint(data_display_blueprint)
SC, jwt = config_auth(app)
Expand Down
2 changes: 1 addition & 1 deletion src/indra_cogex/apps/home/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

"""Run the landing page app with ``python -m indra_cogex.apps.home``."""

from . import cli
from indra_cogex.apps.home.cli import cli

if __name__ == "__main__":
cli()
2 changes: 1 addition & 1 deletion src/indra_cogex/apps/queries_web/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"mesh_term": ["MESH", "D015002"],
"pmid_term": ["PUBMED", "34634383"],
"paper_term": ["PUBMED", "34634383"],
"pubmeds": ["20861832", "19503834"],
"pmids": ["20861832", "19503834"],
"include_child_terms": True,
# NOTE: statement hashes are too large to be int for JavaScript
"stmt_hash": "12198579805553967",
Expand Down
17 changes: 1 addition & 16 deletions src/indra_cogex/apps/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from indra.assemblers.html.assembler import _format_evidence_text, _format_stmt_text
from indra.statements import Statement
from indra.util.statement_presentation import _get_available_ev_source_counts
from indra_cogex.util import unicode_escape, UnicodeEscapeError
from indra_cogex.apps.constants import VUE_SRC_JS, VUE_SRC_CSS, sources_dict
from indra_cogex.apps.curation_cache.curation_cache import Curations
from indra_cogex.apps.proxies import curation_cache
Expand Down Expand Up @@ -242,33 +241,19 @@ def _stmt_to_row(
if not ev_array:
return None

unicode_errors = 0
for ev in ev_array:
# Translate OrderedDict to dict
org_json = ev["original_json"]
ev["original_json"] = dict(org_json)

# Fix unicode escaping: the text will be JSON serialized, so we need to
# remove extra escapes or we will have strings like '\\\\\\\\u....'
# in the final data.
text = ev["text"]
if text:
try:
ev["text"] = unicode_escape(text)
except UnicodeEscapeError:
unicode_errors += 1

if unicode_errors:
logger.warning(f"{unicode_errors} unicode errors in {stmt.get_hash()}")

english = _format_stmt_text(stmt)
hash_int = stmt.get_hash()
if source_counts is None:
sources = _get_available_ev_source_counts(stmt.evidence)
else:
sources = source_counts

# Remove medscan from the sources count and decrement the total count.
# Remove medscan from the sources count
if remove_medscan and "medscan" in sources:
del sources["medscan"]

Expand Down
30 changes: 15 additions & 15 deletions src/indra_cogex/client/curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@
"get_prioritized_stmt_hashes",
"get_curation_df",
"get_go_curation_hashes",
"get_ppi_evidence_counts",
"get_goa_evidence_counts",
"get_ppi_source_counts",
"get_goa_source_counts",
"get_tf_statements",
"get_kinase_statements",
"get_phosphatase_statements",
"get_conflicting_statements",
"get_dub_statements",
"get_entity_evidence_counts",
"get_entity_source_counts",
"get_mirna_statements",
"get_disprot_statements",
]
Expand Down Expand Up @@ -164,7 +164,7 @@ def _limit_line(limit: Optional[int] = None) -> str:


@autoclient()
def get_ppi_evidence_counts(
def get_ppi_source_counts(
*,
client: Neo4jClient,
minimum_evidences: int = 20,
Expand Down Expand Up @@ -203,7 +203,7 @@ def get_ppi_evidence_counts(


@autoclient()
def get_goa_evidence_counts(
def get_goa_source_counts(
*,
client: Neo4jClient,
minimum_evidences: int = 10,
Expand Down Expand Up @@ -247,7 +247,7 @@ def _get_symbol_curies(symbols: Iterable[str]) -> List[str]:
@autoclient()
def get_tf_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get transcription factor increase amount / decrease amount."""
return _help(
sources=TF_CURIES,
Expand All @@ -268,7 +268,7 @@ def get_tf_statements(
@autoclient()
def get_kinase_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get kinase statements."""
return _help(
sources=KINASE_CURIES,
Expand All @@ -285,7 +285,7 @@ def get_kinase_statements(
@autoclient()
def get_phosphatase_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get phosphatase statements."""
return _help(
sources=PHOSPHATASE_CURIES,
Expand All @@ -311,7 +311,7 @@ def _get_dub_curies():
@autoclient()
def get_dub_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get deubiquitinase statements."""
return _help(
sources=_get_dub_curies(),
Expand All @@ -336,7 +336,7 @@ def _get_mirnas() -> List[str]:
@autoclient()
def get_mirna_statements(
*, client: Neo4jClient, limit: Optional[int] = None
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get miRNA statements."""
return _help(
sources=MIRNA_CURIES,
Expand All @@ -360,7 +360,7 @@ def get_disprot_statements(
client: Neo4jClient,
limit: Optional[int] = None,
object_prefix: Optional[str] = None,
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
"""Get statements about disordered proteins."""
return _help(
sources=DISPROT_CURIES,
Expand Down Expand Up @@ -409,24 +409,24 @@ def _help(


@autoclient()
def get_entity_evidence_counts(
def get_entity_source_counts(
prefix: str,
identifier: str,
*,
client: Neo4jClient,
limit: Optional[int] = None,
) -> Mapping[int, int]:
) -> Mapping[int, Mapping[str, int]]:
query = f"""\
MATCH p=(a:BioEntity)-[r:indra_rel]->(b:BioEntity)
WHERE
a.id = "{prefix}:{identifier}"
AND NOT r.has_database_evidence
AND a.id <> b.id
RETURN r.stmt_hash, r.evidence_count
RETURN r.stmt_hash, r.source_counts
ORDER BY r.evidence_count DESC
{_limit_line(limit)}
"""
return client.query_dict(query)
return client.query_dict_value_json(query)


@autoclient()
Expand Down
2 changes: 1 addition & 1 deletion src/indra_cogex/client/enrichment/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def count_human_genes(*, client: Neo4jClient) -> int:
query = """\
MATCH (n:BioEntity)
WHERE n.id STARTS WITH 'hgnc'
AND NOT n.obsolete = "True"
AND NOT n.obsolete
RETURN count(n) as count
"""
results = client.query_tx(query)
Expand Down
Loading
Loading