Skip to content

Commit

Permalink
Merge branch 'beta'
Browse files Browse the repository at this point in the history
  • Loading branch information
dondi committed Sep 24, 2024
2 parents f4b4c72 + feb269b commit 265d7e3
Show file tree
Hide file tree
Showing 44 changed files with 1,788 additions and 858 deletions.
356 changes: 207 additions & 149 deletions database/README.md

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions database/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
class Constants:
    """Relative file paths and database schema names shared by the loader scripts.

    Every path is a plain relative string, so it is resolved against the
    directory the scripts are run from.
    """

    # Top-level data folders, one per database.
    GRN_FOLDER_PATH = 'network-database'
    PPI_FOLDER_PATH = 'protein-protein-database'
    EXPRESSION_FOLDER_PATH = 'expression-database'
    # NOTE: unlike the folders above, this one carries a trailing slash.
    UNION_GENE_FOLDER_PATH = 'union-gene-data/'

    # Gene data source file path
    GRN_GENE_SOURCE = GRN_FOLDER_PATH + "/script-results/processed-loader-files/gene.csv"
    PPI_GENE_SOURCE = PPI_FOLDER_PATH + "/script-results/processed-loader-files/gene.csv"
    EXPRESSION_GENE_SOURCE = EXPRESSION_FOLDER_PATH + "/script-results/processed-expression/genes.csv"

    # Union gene data
    GENE_DATA_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union_genes.csv'
    MISSING_GENE_UNION_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union-missing-genes.csv'
    UPDATE_GENE_UNION_DIRECTORY = UNION_GENE_FOLDER_PATH + 'union-update-genes.csv'

    # Naming pattern for table data files: <NETWORK>_<table_name>_TABLE_DATA_DIRECTORY
    # Gene regulatory network
    GRN_DATABASE_NAMESPACE = 'gene_regulatory_network'
    GRN_SOURCE_TABLE_DATA_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/source.csv'
    GRN_NETWORK_TABLE_DATA_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/network.csv'

    # Protein-protein-interactions
    PPI_DATABASE_NAMESPACE = 'protein_protein_interactions'
    PPI_SOURCE_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/source.csv'
    PPI_NETWORK_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/physical_interaction_no_dupe.csv'
    PPI_PROTEIN_TABLE_DATA_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/protein.csv'

    # Expression data
    EXPRESSION_DATABASE_NAMESPACE = 'gene_expression'
    # Misspelled original name, kept as an alias so existing callers keep working.
    EXPRESISON_DATABASE_NAMESPACE = EXPRESSION_DATABASE_NAMESPACE
    EXPRESSION_REFS_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/refs.csv'
    EXPRESSION_METADATA_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/expression-metadata.csv'
    EXPRESSION_EXPRESSION_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/expression-data.csv'
    EXPRESSION_PRODUCTION_RATE_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/production-rates.csv'
    EXPRESSION_DEGRADATION_RATE_TABLE_DATA_DIRECTORY = EXPRESSION_FOLDER_PATH + '/script-results/processed-expression/degradation-rates.csv'

    # Paths for update files
    PPI_MISSING_GENE_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/missing-genes.csv'
    PPI_UPDATE_GENE_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/update-genes.csv'
    PPI_MISSING_PROTEIN_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/missing-proteins.csv'
    PPI_UPDATE_PROTEIN_DIRECTORY = PPI_FOLDER_PATH + '/script-results/processed-loader-files/update-proteins.csv'
    GRN_MISSING_GENE_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/missing-genes.csv'
    GRN_UPDATE_GENE_DIRECTORY = GRN_FOLDER_PATH + '/script-results/processed-loader-files/update-genes.csv'
168 changes: 168 additions & 0 deletions database/filter_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import os
import csv
from sqlalchemy import create_engine
from sqlalchemy import text
from constants import Constants
from utils import Utils

# Tab-separated header rows written at the top of the generated gene files.
# The GRN header is the PPI header plus one trailing "Regulator" column.
PROTEIN_GENE_HEADER = '\t'.join(('Gene ID', 'Display Gene ID', 'Species', 'Taxon ID'))
GRN_GENE_HEADER = '\t'.join((PROTEIN_GENE_HEADER, 'Regulator'))

def _get_all_data_from_database_table(database_namespace, table_name):
    """Return every row of ``<database_namespace>.<table_name>``.

    The connection URL is read from the ``DB_URL`` environment variable.

    Args:
        database_namespace: Schema name placed before the dot in the query.
        table_name: Table name placed after the dot in the query.

    Returns:
        The list of rows produced by ``fetchall()`` on the query result.

    Raises:
        KeyError: If ``DB_URL`` is not set in the environment.
    """
    # Both names are interpolated directly into the SQL text (identifiers
    # cannot be supplied as bind parameters), so only trusted, hard-coded
    # names may be passed in.
    query = text(f"SELECT * FROM {database_namespace}.{table_name}")
    db = create_engine(os.environ['DB_URL'])
    try:
        with db.connect() as connection:
            return connection.execute(query).fetchall()
    finally:
        # A fresh engine is built on every call; dispose of it so its
        # connection pool does not linger after the rows are fetched.
        db.dispose()

def _get_all_db_genes(database_namespace):
    """Load the ``gene`` table of a schema into a dictionary.

    Args:
        database_namespace: Schema whose ``gene`` table is read.

    Returns:
        dict keyed by ``(row[0], row[3])``. Each value is ``(row[1], row[2])``,
        extended with ``row[4]`` when the row has more than four columns.
        Callers in this file treat the key as (gene ID, taxon ID) and the
        value as (display gene ID, species[, regulator]).
    """
    rows = _get_all_data_from_database_table(database_namespace, "gene")
    genes_by_key = {}
    for row in rows:
        # Only tables with a fifth column contribute a third value item.
        trailing = (row[4],) if len(row) > 4 else ()
        genes_by_key[(row[0], row[3])] = (row[1], row[2]) + trailing
    return genes_by_key

def _get_all_db_grn_genes():
    """Return the gene dictionary for the gene regulatory network schema."""
    grn_namespace = Constants.GRN_DATABASE_NAMESPACE
    return _get_all_db_genes(grn_namespace)

def _get_all_db_ppi_genes():
    """Return the gene dictionary for the protein-protein interactions schema."""
    ppi_namespace = Constants.PPI_DATABASE_NAMESPACE
    return _get_all_db_genes(ppi_namespace)

def _get_all_genes():
    """Merge the GRN and PPI database genes with the regenerated union gene file.

    Steps, in order:
      1. Start from the genes stored in the GRN schema.
      2. Add PPI-schema genes that are not already present, with a
         regulator flag of ``False``.
      3. Rebuild the union gene file from the PPI and GRN gene source files
         and fold its rows in: unknown keys are added, and a known key is
         replaced when the file's display gene ID differs and is not the
         literal string ``"None"``.

    Returns:
        dict mapping ``(gene_id, taxon_id)`` to a 3-item
        ``(display_gene_id, species, regulator)`` tuple.
    """
    # Copy so the dictionary returned by the database helper is not mutated.
    genes = dict(_get_all_db_grn_genes())
    db_ppi_genes = _get_all_db_ppi_genes()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(Constants.UNION_GENE_FOLDER_PATH, exist_ok=True)
    Utils.create_union_file([Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE], Constants.GENE_DATA_DIRECTORY)

    for gene in db_ppi_genes:
        if gene not in genes:
            display_gene_id, species = db_ppi_genes[gene]
            # Tuple (not list) so every value in the result has the same type.
            genes[gene] = (display_gene_id, species, False)

    # The union file is tab-separated. Parsing it with the tab delimiter keeps
    # fields that contain commas intact; QUOTE_NONE takes quote characters
    # literally, exactly like a plain split on tabs.
    with open(Constants.GENE_DATA_DIRECTORY, 'r', encoding="UTF-8", newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # tolerate blank lines
            gene_id = row[0]
            display_gene_id = row[1]
            species = row[2]
            taxon_id = row[3]
            # "true"/"false" in any casing becomes "True"/"False".
            regulator = row[4].capitalize()
            # NOTE(review): keys read from the file are strings; this assumes
            # the database rows expose gene and taxon IDs as strings too.
            key = (gene_id, taxon_id)
            value = (display_gene_id, species, regulator)
            if key not in genes:
                genes[key] = value
            elif genes[key][0] != display_gene_id and display_gene_id != "None":
                genes[key] = value
    return genes


def get_all_proteins():
    """Load the PPI schema's ``protein`` table into a dictionary.

    Returns:
        dict keyed by ``(row[0], row[5])`` with values
        ``(row[1], row[2], row[3], row[4])``. ``processing_protein_file``
        compares these against (standard name, taxon ID) keys and
        (gene systematic name, length, molecular weight, pI) values.
    """
    rows = _get_all_data_from_database_table(Constants.PPI_DATABASE_NAMESPACE, "protein")
    return {
        (row[0], row[5]): (row[1], row[2], row[3], row[4])
        for row in rows
    }

def processing_grn_gene_file():
    """Return ``(missing_genes, genes_to_update)`` for the GRN schema.

    Values in both dictionaries carry the regulator column.
    """
    grn_db_genes = _get_all_db_grn_genes()
    return _processing_gene_file(grn_db_genes, is_protein=False)

def processing_ppi_gene_file():
    """Return ``(missing_genes, genes_to_update)`` for the PPI schema.

    Values in both dictionaries omit the regulator column.
    """
    ppi_db_genes = _get_all_db_ppi_genes()
    return _processing_gene_file(ppi_db_genes, is_protein=True)

def _processing_gene_file(db_genes, is_protein=True):
    """Split the merged gene set into genes to insert and genes to update.

    Args:
        db_genes: dict of genes currently in the target schema, keyed like
            the result of ``_get_all_genes`` with the display gene ID as the
            first value item.
        is_protein: When true, emitted values are ``(display_gene_id, species)``;
            otherwise they also include the regulator flag.

    Returns:
        ``(missing_genes, genes_to_update)``: genes absent from ``db_genes``,
        and genes whose stored display gene ID differs from the merged one.
    """
    print('Processing gene')
    missing_genes = {}
    genes_to_update = {}
    for gene, (display_gene_id, species, regulator) in _get_all_genes().items():
        if is_protein:
            entry = (display_gene_id, species)
        else:
            entry = (display_gene_id, species, regulator)

        if gene not in db_genes:
            missing_genes[gene] = entry
            continue

        stored_display_id = db_genes[gene][0]
        # NOTE(review): the "None" test is applied to the value already in the
        # database, not to the new value; _get_all_genes tests the new value
        # instead. Confirm which of the two is intended.
        if stored_display_id != display_gene_id and stored_display_id != "None":
            genes_to_update[gene] = entry
    return missing_genes, genes_to_update

def _float_or_zero(field):
    """Convert a numeric text field to float, mapping the literal "None" to 0."""
    return float(field) if field != "None" else 0


def processing_protein_file(file_path, db_proteins):
    """Compare a tab-separated protein file against the proteins in the database.

    The file must start with a header row, followed by rows of
    standard name, gene systematic name, length, molecular weight, pI and
    taxon ID. Numeric fields holding the literal ``"None"`` are read as 0.

    Args:
        file_path: Path of the tab-separated protein file (UTF-8).
        db_proteins: dict mapping ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.

    Returns:
        ``(ppi_missing_proteins, ppi_proteins_to_update)``: rows whose key is
        absent from ``db_proteins``, and rows whose value differs from the
        stored one. Both use the same key/value layout as ``db_proteins``.

    Raises:
        IndexError: If a non-blank row has fewer than six fields.
        ValueError: If a numeric field is neither a number nor ``"None"``.
    """
    print(f'Processing file {file_path}')
    ppi_missing_proteins = {}
    ppi_proteins_to_update = {}
    # Read-only access is enough. Parsing with the tab delimiter keeps fields
    # that contain commas intact, and QUOTE_NONE takes quote characters
    # literally, exactly like a plain split on tabs.
    with open(file_path, 'r', encoding="UTF-8", newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # tolerate blank lines
            standard_name = row[0]
            gene_systematic_name = row[1]
            length = _float_or_zero(row[2])
            molecular_weight = _float_or_zero(row[3])
            pi = _float_or_zero(row[4])
            taxon_id = row[5]
            key = (standard_name, taxon_id)
            # File values are floats, so stored values must compare equal to
            # floats for a protein to count as unchanged.
            value = (gene_systematic_name, length, molecular_weight, pi)
            if key not in db_proteins:
                ppi_missing_proteins[key] = value
            elif db_proteins[key] != value:
                ppi_proteins_to_update[key] = value
    return ppi_missing_proteins, ppi_proteins_to_update

def create_grn_gene_file(file_path, data):
    """Write ``data`` to ``file_path`` as a GRN gene file (with the Regulator column)."""
    _create_gene_file(file_path, headers=GRN_GENE_HEADER, data=data, is_protein=False)

def create_ppi_gene_file(file_path, data):
    """Write ``data`` to ``file_path`` as a PPI gene file (no Regulator column)."""
    _create_gene_file(file_path, headers=PROTEIN_GENE_HEADER, data=data, is_protein=True)

def _create_gene_file(file_path, headers, data, is_protein=True):
    """Write a tab-separated gene file.

    Args:
        file_path: Destination path; an existing file is overwritten.
        headers: Header line to write first (without the trailing newline).
        data: dict mapping ``(gene_id, taxon_id)`` to
            ``(display_gene_id, species[, regulator])``.
        is_protein: When true, each row has four columns (gene ID, display
            gene ID, species, taxon ID); otherwise the regulator value is
            appended as a fifth column.
    """
    print(f'Creating {file_path}\n')
    # The context manager closes the file even if a row fails to format, and
    # UTF-8 matches the encoding the readers in this module use.
    with open(file_path, 'w', encoding="UTF-8") as gene_file:
        gene_file.write(f'{headers}\n')
        for gene in data:
            fields = data[gene]
            columns = [gene[0], fields[0], fields[1], gene[1]]
            if not is_protein:
                columns.append(fields[2])
            gene_file.write('\t'.join(f'{column}' for column in columns) + '\n')

def create_ppi_protein_file(file_path, data):
    """Write a tab-separated protein file.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: dict mapping ``(standard_name, taxon_id)`` to
            ``(gene_systematic_name, length, molecular_weight, pi)``.
    """
    print(f'Creating {file_path}\n')
    headers = 'Standard Name\tGene Systematic Name\tLength\tMolecular Weight\tPI\tTaxon ID'
    # The context manager closes the file even if a row fails to format, and
    # UTF-8 matches the encoding processing_protein_file reads with.
    with open(file_path, 'w', encoding="UTF-8") as protein_file:
        protein_file.write(f'{headers}\n')
        for protein in data:
            fields = data[protein]
            columns = (protein[0], fields[0], fields[1], fields[2], fields[3], protein[1])
            protein_file.write('\t'.join(f'{column}' for column in columns) + '\n')

# Work out which genes and proteins are missing from the database or differ
# from what is stored there.
ppi_missing_genes, ppi_genes_to_update = processing_ppi_gene_file()
grn_missing_genes, grn_genes_to_update = processing_grn_gene_file()
ppi_missing_proteins, ppi_proteins_to_update = processing_protein_file(
    Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY,
    get_all_proteins(),
)

# Write one tab-separated file per result set: GRN genes first, then PPI
# genes, then PPI proteins.
for _write_file, _target_path, _records in (
    (create_grn_gene_file, Constants.GRN_MISSING_GENE_DIRECTORY, grn_missing_genes),
    (create_grn_gene_file, Constants.GRN_UPDATE_GENE_DIRECTORY, grn_genes_to_update),
    (create_ppi_gene_file, Constants.PPI_MISSING_GENE_DIRECTORY, ppi_missing_genes),
    (create_ppi_gene_file, Constants.PPI_UPDATE_GENE_DIRECTORY, ppi_genes_to_update),
    (create_ppi_protein_file, Constants.PPI_MISSING_PROTEIN_DIRECTORY, ppi_missing_proteins),
    (create_ppi_protein_file, Constants.PPI_UPDATE_PROTEIN_DIRECTORY, ppi_proteins_to_update),
):
    _write_file(_target_path, _records)
30 changes: 30 additions & 0 deletions database/loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import csv
from utils import *
from constants import Constants
# python3 loader.py | psql postgresql://localhost/postgres
import os

# Make sure the folder that receives the union gene file exists. exist_ok
# avoids the check-then-create race, and the path comes from Constants so it
# cannot drift from GENE_DATA_DIRECTORY.
os.makedirs(Constants.UNION_GENE_FOLDER_PATH, exist_ok=True)

# Get union gene data
Utils.create_union_file(
    [Constants.EXPRESSION_GENE_SOURCE, Constants.PPI_GENE_SOURCE, Constants.GRN_GENE_SOURCE],
    Constants.GENE_DATA_DIRECTORY,
)

# Regulatory Network
Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)
Utils.load_grn_genes(Constants.GRN_GENE_SOURCE, Constants.GRN_DATABASE_NAMESPACE)
Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, Constants.GRN_DATABASE_NAMESPACE)

# Protein-protein-interactions
Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
Utils.load_ppi_genes(Constants.PPI_GENE_SOURCE, Constants.PPI_DATABASE_NAMESPACE)
Utils.load_proteins(Constants.PPI_PROTEIN_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)
Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, Constants.PPI_DATABASE_NAMESPACE)

# Expression data
# (EXPRESISON is the spelling of the constant as declared in constants.py.)
Utils.load_refs(Constants.EXPRESSION_REFS_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_expression_genes(Constants.EXPRESSION_GENE_SOURCE, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_expression_metadata(Constants.EXPRESSION_METADATA_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_expression_data(Constants.EXPRESSION_EXPRESSION_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_production_rates(Constants.EXPRESSION_PRODUCTION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
Utils.load_degradation_rates(Constants.EXPRESSION_DEGRADATION_RATE_TABLE_DATA_DIRECTORY, Constants.EXPRESISON_DATABASE_NAMESPACE)
36 changes: 36 additions & 0 deletions database/loader_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import argparse
from constants import Constants
from utils import Utils

def load_grn_data_into_database():
    """Run the GRN loaders in order: sources, gene updates, missing genes, network."""
    namespace = Constants.GRN_DATABASE_NAMESPACE
    Utils.load_sources(Constants.GRN_SOURCE_TABLE_DATA_DIRECTORY, namespace)
    # Existing genes are updated before the missing ones are inserted.
    Utils.update_grn_genes(Constants.GRN_UPDATE_GENE_DIRECTORY, namespace)
    Utils.load_grn_genes(Constants.GRN_MISSING_GENE_DIRECTORY, namespace)
    Utils.load_grn_network(Constants.GRN_NETWORK_TABLE_DATA_DIRECTORY, namespace)

def load_ppi_data_into_database():
    """Run the PPI loaders in order: sources, gene and protein updates,
    missing genes and proteins, then the network."""
    namespace = Constants.PPI_DATABASE_NAMESPACE
    Utils.load_sources(Constants.PPI_SOURCE_TABLE_DATA_DIRECTORY, namespace)
    # Existing rows are updated before the missing ones are inserted.
    Utils.update_ppi_genes(Constants.PPI_UPDATE_GENE_DIRECTORY, namespace)
    Utils.update_ppi_proteins(Constants.PPI_UPDATE_PROTEIN_DIRECTORY, namespace)
    Utils.load_ppi_genes(Constants.PPI_MISSING_GENE_DIRECTORY, namespace)
    Utils.load_proteins(Constants.PPI_MISSING_PROTEIN_DIRECTORY, namespace)
    Utils.load_ppi_network(Constants.PPI_NETWORK_TABLE_DATA_DIRECTORY, namespace)

def main():
    """Parse ``--network`` from the command line and run the matching loader.

    ``--network`` is required and must be ``GRN`` or ``PPI``; argparse itself
    rejects any other value (usage message on stderr, exit status 2), so no
    separate "invalid network" branch is needed.
    """
    # Set up argument parsing
    parser = argparse.ArgumentParser(description="Load data into database for GRN or PPI networks.")
    parser.add_argument('--network', choices=['GRN', 'PPI'], required=True, help="Specify the network type to load data for")

    # Parse arguments
    args = parser.parse_args()

    # Dispatch to the loader for the chosen network. A choice added above
    # without a loader here fails loudly with KeyError.
    loaders = {
        'GRN': load_grn_data_into_database,
        'PPI': load_ppi_data_into_database,
    }
    loaders[args.network]()


if __name__ == "__main__":
    main()
4 changes: 2 additions & 2 deletions database/network-database/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ CREATE TABLE gene_regulatory_network.network (
time_stamp TIMESTAMP WITH TIME ZONE,
source VARCHAR,
FOREIGN KEY (regulator_gene_id, taxon_id) REFERENCES gene_regulatory_network.gene(gene_id, taxon_id),
FOREIGN KEY (target_gene_id, taxon_id) REFERENCES gene_regulatory_network.gene(gene_id, taxon_id),
FOREIGN KEY (time_stamp, source) REFERENCES gene_regulatory_network.source(time_stamp, source)
FOREIGN KEY (target_gene_id, taxon_id) REFERENCES gene_regulatory_network_testing.gene(gene_id, taxon_id),
FOREIGN KEY (time_stamp, source) REFERENCES gene_regulatory_network_testing.source(time_stamp, source)
);
76 changes: 0 additions & 76 deletions database/network-database/scripts/filter_genes.py

This file was deleted.

Loading

0 comments on commit 265d7e3

Please sign in to comment.