-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
44 changed files
with
1,788 additions
and
858 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
class Constants:
    """Relative file paths and database schema names shared by the loader scripts.

    Every path is a plain string built by concatenation from one of the
    top-level folder constants, so the values are relative to whatever
    directory the scripts are run from.
    """

    # Top-level folders, one per data set.
    GRN_FOLDER_PATH = 'network-database'
    PPI_FOLDER_PATH = 'protein-protein-database'
    EXPRESSION_FOLDER_PATH = 'expression-database'
    # Trailing slash is intentional: the union file names below are appended directly.
    UNION_GENE_FOLDER_PATH = 'union-gene-data/'

    # Gene data source file path
    GRN_GENE_SOURCE = GRN_FOLDER_PATH + "/script-results/processed-loader-files/gene.csv"
    PPI_GENE_SOURCE = PPI_FOLDER_PATH + "/script-results/processed-loader-files/gene.csv"
    EXPRESSION_GENE_SOURCE = EXPRESSION_FOLDER_PATH + "/script-results/processed-expression/genes.csv"

    # Union gene data
    GENE_DATA_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union_genes.csv'
    MISSING_GENE_UNION_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union-missing-genes.csv'
    UPDATE_GENE_UNION_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union-update-genes.csv'

    # Naming convention for table files: <NETWORK>_<table_name>_TABLE_DATA_DIRECTORY
    # Gene regulatory network
    GRN_DATABASE_NAMESPACE = 'gene_regulatory_network'
    GRN_SOURCE_TABLE_DATA_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/source.csv'
    GRN_NETWORK_TABLE_DATA_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/network.csv'

    # Protein-protein-interactions
    PPI_DATABASE_NAMESPACE = 'protein_protein_interactions'
    PPI_SOURCE_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/source.csv'
    PPI_NETWORK_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/physical_interaction_no_dupe.csv'
    PPI_PROTEIN_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/protein.csv'

    # Expression data
    EXPRESSION_DATABASE_NAMESPACE = 'gene_expression'
    # Misspelled original name, kept as an alias because existing callers use it.
    EXPRESISON_DATABASE_NAMESPACE = EXPRESSION_DATABASE_NAMESPACE
    EXPRESSION_REFS_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/refs.csv'
    EXPRESSION_METADATA_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/expression-metadata.csv'
    EXPRESSION_EXPRESSION_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/expression-data.csv'
    EXPRESSION_PRODUCTION_RATE_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/production-rates.csv'
    EXPRESSION_DEGRADATION_RATE_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/degradation-rates.csv'

    # Paths for update files
    PPI_MISSING_GENE_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/missing-genes.csv'
    PPI_UPDATE_GENE_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/update-genes.csv'
    PPI_MISSING_PROTEIN_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/missing-proteins.csv'
    PPI_UPDATE_PROTEIN_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/update-proteins.csv'
    GRN_MISSING_GENE_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/missing-genes.csv'
    GRN_UPDATE_GENE_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/update-genes.csv'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
import os | ||
import csv | ||
from sqlalchemy import create_engine | ||
from sqlalchemy import text | ||
from constants import Constants | ||
from utils import Utils | ||
|
||
# Tab-separated header rows for the generated gene files; the GRN variant
# carries one extra "Regulator" column after the shared four.
PROTEIN_GENE_HEADER = '\t'.join(('Gene ID', 'Display Gene ID', 'Species', 'Taxon ID'))
GRN_GENE_HEADER = PROTEIN_GENE_HEADER + '\tRegulator'
|
||
def _get_all_data_from_database_table(database_namespace, table_name):
    """Return every row of ``<database_namespace>.<table_name>``.

    The connection URL is read from the ``DB_URL`` environment variable
    (a ``KeyError`` is raised when it is not set). The rows are whatever
    ``fetchall()`` yields for ``SELECT *`` on the table.
    """
    engine = create_engine(os.environ['DB_URL'])
    # The identifiers are interpolated directly into the statement.
    query = text(f"SELECT * FROM {database_namespace}.{table_name}")
    with engine.connect() as conn:
        return conn.execute(query).fetchall()
|
||
def _get_all_db_genes(database_namespace):
    """Index the ``gene`` table of *database_namespace* by a two-column key.

    Each record is keyed by ``(column 0, column 3)`` -- presumably gene ID
    and taxon ID, judging by how the keys are used elsewhere in this file.
    The value is ``(column 1, column 2)``, extended with column 4 when the
    record has more than four columns.
    """
    indexed = {}
    for record in _get_all_data_from_database_table(database_namespace, "gene"):
        # Only wider records contribute a third value element.
        trailing = (record[4],) if len(record) > 4 else ()
        indexed[(record[0], record[3])] = (record[1], record[2]) + trailing
    return indexed
|
||
def _get_all_db_grn_genes():
    """Return the indexed gene table of the GRN database namespace."""
    namespace = Constants.GRN_DATABASE_NAMESPACE
    return _get_all_db_genes(namespace)
|
||
def _get_all_db_ppi_genes():
    """Return the indexed gene table of the PPI database namespace."""
    namespace = Constants.PPI_DATABASE_NAMESPACE
    return _get_all_db_genes(namespace)
|
||
def _get_all_genes():
    """Merge the GRN and PPI database genes with the union gene file.

    The union file is regenerated from the PPI and GRN gene source files
    via ``Utils.create_union_file`` and then read back as tab-separated
    text with a header row.

    Returns:
        dict mapping ``(gene_id, taxon_id)`` to
        ``(display_gene_id, species, regulator)``. Entries start from the
        GRN database genes, are extended with PPI-only genes (regulator
        ``False``), and are finally added to or overridden by rows of the
        union file. An existing entry is only overridden when the file's
        display gene ID differs and is not the literal ``"None"``.
    """
    db_grn_genes = _get_all_db_grn_genes()
    db_ppi_genes = _get_all_db_ppi_genes()

    # exist_ok avoids the check-then-create race; the constant names the
    # same folder that GENE_DATA_DIRECTORY is built from.
    os.makedirs(Constants.UNION_GENE_FOLDER_PATH, exist_ok=True)
    Utils.create_union_file([Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE], Constants.GENE_DATA_DIRECTORY)

    genes = db_grn_genes
    for gene, ppi_value in db_ppi_genes.items():
        if gene not in genes:
            display_gene_id, species = ppi_value
            # Stored as a tuple like every other entry in this mapping.
            genes[gene] = (display_gene_id, species, False)

    # Read-only pass, so plain 'r'. Splitting on tabs in the csv reader
    # itself keeps fields that contain commas intact.
    with open(Constants.GENE_DATA_DIRECTORY, 'r', encoding="UTF-8", newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                # Blank line: nothing to merge.
                continue
            gene_id, display_gene_id, species, taxon_id = row[0], row[1], row[2], row[3]
            regulator = row[4].capitalize()
            key = (gene_id, taxon_id)
            value = (display_gene_id, species, regulator)
            if key not in genes:
                genes[key] = value
            elif genes[key][0] != display_gene_id and display_gene_id != "None":
                genes[key] = value
    return genes
|
||
|
||
def get_all_proteins():
    """Index the PPI ``protein`` table by ``(column 0, column 5)``.

    Each value is the tuple of columns 1 through 4. Presumably the columns
    follow the layout parsed by ``processing_protein_file`` (standard name,
    gene systematic name, length, molecular weight, pI, taxon ID) -- verify
    against the table definition.
    """
    rows = _get_all_data_from_database_table(Constants.PPI_DATABASE_NAMESPACE, "protein")
    return {
        (row[0], row[5]): (row[1], row[2], row[3], row[4])
        for row in rows
    }
|
||
def processing_grn_gene_file():
    """Return ``(missing_genes, genes_to_update)`` relative to the GRN gene table."""
    grn_db_genes = _get_all_db_grn_genes()
    return _processing_gene_file(grn_db_genes, is_protein=False)
|
||
def processing_ppi_gene_file():
    """Return ``(missing_genes, genes_to_update)`` relative to the PPI gene table."""
    ppi_db_genes = _get_all_db_ppi_genes()
    return _processing_gene_file(ppi_db_genes, is_protein=True)
|
||
def _processing_gene_file(db_genes, is_protein=True):
    """Split the merged gene set into genes to insert and genes to update.

    Args:
        db_genes: mapping of gene key to a value tuple whose first element
            is the display gene ID currently stored in the database.
        is_protein: when true, the emitted values are
            ``(display_gene_id, species)``; otherwise they also carry the
            regulator as a third element.

    Returns:
        ``(missing_genes, genes_to_update)``: two dicts keyed like the
        mapping returned by ``_get_all_genes``.
    """
    print('Processing gene')
    to_insert = {}
    to_update = {}
    merged = _get_all_genes()
    for gene_key, (display_gene_id, species, regulator) in merged.items():
        record = (display_gene_id, species) if is_protein else (display_gene_id, species, regulator)
        if gene_key not in db_genes:
            to_insert[gene_key] = record
            continue
        stored_display_id = db_genes[gene_key][0]
        # Only a stored ID that differs and is not the literal "None" triggers an update.
        if stored_display_id != display_gene_id and stored_display_id != "None":
            to_update[gene_key] = record
    return to_insert, to_update
|
||
def processing_protein_file(file_path, db_proteins):
    """Compare a tab-separated protein file with the proteins already stored.

    The file must start with a header row. Each data row is read as
    standard name, gene systematic name, length, molecular weight, pI and
    taxon ID. The literal ``"None"`` in a numeric column is read as ``0``.
    Blank lines are skipped.

    Args:
        file_path: path of the protein file to read (UTF-8).
        db_proteins: mapping of ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.

    Returns:
        ``(missing, to_update)``: proteins whose key is absent from
        *db_proteins*, and proteins whose stored value differs from the
        file. Both use the same key/value layout as *db_proteins*.

    Raises:
        IndexError: a non-blank row has fewer than six columns.
        ValueError: a numeric column holds neither a number nor ``"None"``.
    """
    print(f'Processing file {file_path}')
    ppi_missing_proteins = {}
    ppi_proteins_to_update = {}

    def _number(cell):
        # "None" marks a missing measurement in the source file.
        return float(cell) if cell != "None" else 0

    # Read-only pass, so plain 'r'. Splitting on tabs in the csv reader
    # itself keeps fields that contain commas intact.
    with open(file_path, 'r', encoding="UTF-8", newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue
            standard_name = row[0]
            gene_systematic_name = row[1]
            length = _number(row[2])
            molecular_weight = _number(row[3])
            pi = _number(row[4])
            taxon_id = row[5]
            key = (standard_name, taxon_id)
            value = (gene_systematic_name, length, molecular_weight, pi)
            if key not in db_proteins:
                ppi_missing_proteins[key] = value
            elif db_proteins[key] != value:
                ppi_proteins_to_update[key] = value
    return ppi_missing_proteins, ppi_proteins_to_update
|
||
def create_grn_gene_file(file_path, data):
    """Write *data* to *file_path* as a GRN gene file (includes the regulator column)."""
    _create_gene_file(file_path, headers=GRN_GENE_HEADER, data=data, is_protein=False)
|
||
def create_ppi_gene_file(file_path, data):
    """Write *data* to *file_path* as a PPI gene file (no regulator column)."""
    _create_gene_file(file_path, headers=PROTEIN_GENE_HEADER, data=data, is_protein=True)
|
||
def _create_gene_file(file_path, headers, data, is_protein=True): | ||
print(f'Creating {file_path}\n') | ||
gene_file = open(file_path, 'w') | ||
gene_file.write(f'{headers}\n') | ||
for gene in data: | ||
if is_protein: | ||
gene_file.write(f'{gene[0]}\t{data[gene][0]}\t{data[gene][1]}\t{gene[1]}\n') | ||
else: | ||
gene_file.write(f'{gene[0]}\t{data[gene][0]}\t{data[gene][1]}\t{gene[1]}\t{data[gene][2]}\n') | ||
gene_file.close() | ||
|
||
def create_ppi_protein_file(file_path, data):
    """Write a tab-separated protein file with a fixed header row.

    Args:
        file_path: destination path; an existing file is overwritten.
        data: mapping of ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.
    """
    print(f'Creating {file_path}\n')
    headers = 'Standard Name\tGene Systematic Name\tLength\tMolecular Weight\tPI\tTaxon ID'
    # The context manager closes the handle even if a write fails; UTF-8
    # matches the encoding this module uses when reading its input files.
    with open(file_path, 'w', encoding="UTF-8") as protein_file:
        protein_file.write(f'{headers}\n')
        for protein, values in data.items():
            protein_file.write(
                f'{protein[0]}\t{values[0]}\t{values[1]}\t{values[2]}\t{values[3]}\t{protein[1]}\n'
            )
|
||
# Work out which genes and proteins are absent from, or differ from, the database.
ppi_missing_genes, ppi_genes_to_update = processing_ppi_gene_file()
grn_missing_genes, grn_genes_to_update = processing_grn_gene_file()
ppi_missing_proteins, ppi_proteins_to_update = processing_protein_file(
    Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY,
    get_all_proteins(),
)

# Write one "missing" and one "update" file per table: GRN genes first,
# then PPI genes, then PPI proteins.
create_grn_gene_file(Constants.GRN_MISSING_GENE_DIRECTORY, grn_missing_genes)
create_grn_gene_file(Constants.GRN_UPDATE_GENE_DIRECTORY, grn_genes_to_update)
create_ppi_gene_file(Constants.PPI_MISSING_GENE_DIRECTORY, ppi_missing_genes)
create_ppi_gene_file(Constants.PPI_UPDATE_GENE_DIRECTORY, ppi_genes_to_update)
create_ppi_protein_file(Constants.PPI_MISSING_PROTEIN_DIRECTORY, ppi_missing_proteins)
create_ppi_protein_file(Constants.PPI_UPDATE_PROTEIN_DIRECTORY, ppi_proteins_to_update)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import csv | ||
from utils import * | ||
from constants import Constants | ||
# python3 loader.py | psql postgresql://localhost/postgres | ||
import os | ||
|
||
# Make sure the folder for the union gene file exists. exist_ok avoids the
# check-then-create race, and the constant names the same folder that
# Constants.GENE_DATA_DIRECTORY is built from.
os.makedirs(Constants.UNION_GENE_FOLDER_PATH, exist_ok=True)

# Get union gene data
Utils.create_union_file([Constants.EXPRESSION_GENE_SOURCE, Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE], Constants.GENE_DATA_DIRECTORY)

# Regulatory Network
Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
Utils.load_grn_genes(Constants.GRN_GENE_SOURCE, Constants.GRN_DATABASE_NAMESPACE)
Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)

# Protein-protein-interactions
Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
Utils.load_ppi_genes(Constants.PPI_GENE_SOURCE, Constants.PPI_DATABASE_NAMESPACE)
Utils.load_proteins(Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)

# Expression data
Utils.load_refs(Constants.EXPRESSION_REFS_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_expression_genes(Constants.EXPRESSION_GENE_SOURCE, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_expression_metadata(Constants.EXPRESSION_METADATA_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_expression_data(Constants.EXPRESSION_EXPRESSION_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_production_rates(Constants.EXPRESSION_PRODUCTION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_degradation_rates(Constants.EXPRESSION_DEGRADATION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import argparse | ||
from constants import Constants | ||
from utils import Utils | ||
|
||
def load_grn_data_into_database():
    """Run the GRN load steps in order: sources, gene updates, missing genes, network."""
    namespace = Constants.GRN_DATABASE_NAMESPACE
    Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, namespace)
    Utils.update_grn_genes(Constants.GRN_UPDATE_GENE_DIRECTORY, namespace)
    Utils.load_grn_genes(Constants.GRN_MISSING_GENE_DIRECTORY, namespace)
    Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, namespace)
|
||
def load_ppi_data_into_database():
    """Run the PPI load steps in order.

    Sources are loaded first, then gene and protein updates are applied,
    then missing genes and proteins are loaded, and finally the network.
    """
    namespace = Constants.PPI_DATABASE_NAMESPACE
    Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, namespace)
    Utils.update_ppi_genes(Constants.PPI_UPDATE_GENE_DIRECTORY, namespace)
    Utils.update_ppi_proteins(Constants.PPI_UPDATE_PROTEIN_DIRECTORY, namespace)
    Utils.load_ppi_genes(Constants.PPI_MISSING_GENE_DIRECTORY, namespace)
    Utils.load_proteins(Constants.PPI_MISSING_PROTEIN_DIRECTORY, namespace)
    Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, namespace)
|
||
def main():
    """Parse ``--network`` from the command line and run the matching loader.

    ``--network`` is required and must be ``GRN`` or ``PPI``; argparse exits
    with a usage error for any other value.
    """
    # One entry per supported network; the keys double as the CLI choices.
    loaders = {
        'GRN': load_grn_data_into_database,
        'PPI': load_ppi_data_into_database,
    }

    # Set up argument parsing
    parser = argparse.ArgumentParser(description="Load data into database for GRN or PPI networks.")
    parser.add_argument('--network', choices=list(loaders), required=True, help="Specify the network type to load data for")

    # Parse arguments
    args = parser.parse_args()

    # argparse has already rejected values outside ``choices``, so the
    # lookup cannot miss and no fallback branch is needed.
    loaders[args.network]()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.