filtering vertices by UniProtID

ding-lab · Oct 4, 2017 · 745d22c · 745d22c
1 parent d23084d
commit 745d22c
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 39 deletions.
diff --git a/HotSpot3D-1.8.0.tar.gz b/HotSpot3D-1.8.0.tar.gz
diff --git a/lib/TGI/Mutpro/Main/Cluster.pm b/lib/TGI/Mutpro/Main/Cluster.pm
@@ -115,6 +115,7 @@ sub new {
 	$this->{'mutations_json_hash'} = undef;
 	$this->{'distance_matrix_json_hash'} = undef;
 	$this->{'siteVertexMap_json_hash'} = undef;
+	$this->{'hup_file'} = undef;
 
     bless $this, $class;
     $this->process();
@@ -262,6 +263,7 @@ sub setOptions {
 		'mutations-hash-json-file=s' => \$this->{'mutations_json_hash'},
 		'distance-matrix-json-file=s' => \$this->{'distance_matrix_json_hash'},
 		'siteVertexMap-json-file=s' => \$this->{'siteVertexMap_json_hash'},
+		'hup-file=s'	=> \$this->{'hup_file'},
 
         'help' => \$help,
     );
@@ -406,6 +408,12 @@ sub setOptions {
 	if ( not exists $tempMericHash->{$this->{'meric_type'}} ) {
 		die "Error: meric-type should be one of the following: intra, monomer, homomer, inter, heteromer, multimer, unspecified\n";
 	}
+	if ( $this->{'vertex_type'} eq $SITE or $this->{'clustering'} eq $DENSITY ) {
+		if ( not defined $this->{'hup_file'} or not -e $this->{'hup_file'} ) {
+			warn "If you're using SITE and/or DENSITY, you must provide a valid hugo.uniprot.pdb.csv file location using --hup-file option\n";
+			die $this->help_text();
+		}
+	}
 
 	##### START gene-list and structure-list options
 	if ( defined $this->geneListFile() ) {
@@ -554,24 +562,30 @@ sub vertexFilter {
 		print STDOUT "Filtering vertices\n";
 #TODO if using a different .maf from search step, then some mutations can be missed
 		my $vertexMap = {}; #a hash to map isSameProteinPosition vertices (and others to their selves)-- map=f()
-		my @mKeys = sort keys %{$temp_mutations}; #an array to store all the mutation keys
-
-		#filter same vertices and generate a map
-		foreach my $mutationKey1 ( @mKeys ) {
-			next if not exists $temp_mutations->{$mutationKey1};
-			foreach my $mutationKey2 ( @mKeys ) {
-				next if not exists $temp_mutations->{$mutationKey2};
-				if ( $mutationKey1 eq $mutationKey2 ) { # this if condition is important to capture mk2=mk1 cases to $siteVertexMap hash
-					$vertexMap->{$mutationKey2} = $mutationKey1;
-					$siteVertexMap->{$mutationKey1}->{$mutationKey2} = $temp_mutations->{$mutationKey2};
-					# print "ACSW::VertexFilter::Equal $mutationKey2 \=\=\> $mutationKey1\n";
-					next;
-				}
-				elsif ( $this->isSameProteinPosition( $temp_mutations , $mutationKey1 , $mutationKey2 ) ) { #if same site
-					$vertexMap->{$mutationKey2} = $mutationKey1;
-					$siteVertexMap->{$mutationKey1}->{$mutationKey2} = $temp_mutations->{$mutationKey2};
-					print "ACSW::VertexFilter::SameSite $mutationKey2 \=\=\> $mutationKey1\n";
-					delete $temp_mutations->{$mutationKey2};
+		my $vfHash = makeVertexFilterHash( $this , $temp_mutations , $temp_distance_matrix ); # a hash with mutationKeys by uniprot id
+		print "vfHash\n";
+		print Dumper $vfHash;
+
+		foreach my $uniprotID ( keys %{$vfHash} ) {
+			my @mKeys = sort keys %{$vfHash->{$uniprotID}}; #an array to store all the mutation keys corresponding to the uniprotID under consideration
+
+			#filter same vertices and generate a map
+			foreach my $mutationKey1 ( @mKeys ) {
+				next if not exists $temp_mutations->{$mutationKey1};
+				foreach my $mutationKey2 ( @mKeys ) {
+					next if not exists $temp_mutations->{$mutationKey2};
+					if ( $mutationKey1 eq $mutationKey2 ) { # this if condition is important to capture mk2=mk1 cases to $siteVertexMap hash
+						$vertexMap->{$mutationKey2} = $mutationKey1;
+						$siteVertexMap->{$mutationKey1}->{$mutationKey2} = $temp_mutations->{$mutationKey2};
+						# print "ACSW::VertexFilter::Equal $mutationKey2 \=\=\> $mutationKey1\n";
+						next;
+					}
+					elsif ( $this->isSameProteinPosition( $temp_mutations , $mutationKey1 , $mutationKey2 ) ) { #if same site
+						$vertexMap->{$mutationKey2} = $mutationKey1;
+						$siteVertexMap->{$mutationKey1}->{$mutationKey2} = $temp_mutations->{$mutationKey2};
+						print "ACSW::VertexFilter::SameSite $mutationKey2 \=\=\> $mutationKey1\n";
+						delete $temp_mutations->{$mutationKey2};
+					}
 				}
 			}
 		}
@@ -610,8 +624,8 @@ sub vertexFilter {
 				}
 			}
 		}
-		# print "vertex_map\n";
-		# print Dumper $vertexMap;
+		print "vertex_map\n";
+		print Dumper $vertexMap;
 	} else {
 		%{$mutations} = %{$temp_mutations};
 		%{$distance_matrix} = %{$temp_distance_matrix};
@@ -622,6 +636,32 @@ sub vertexFilter {
 	return;
 }
 
+sub makeVertexFilterHash { # a hash to do vertex filtering by UniprotID
+	my ( $this , $temp_mutations , $temp_distance_matrix ) = @_;
+
+	my $vfHash = {};
+	my $hugoToUniprot = {};
+	# read hugo.uniprot.pdb.transcript file
+	my $fh = new FileHandle;
+	die "Could not open hugo.uniprot.pdb.csv file\n" unless( $fh->open( $this->{'hup_file'} , "r" ) );
+
+	while ( my $line = <$fh> ) {
+		chomp( $line ); 
+		my ( $hugo, $uniprot ) = ( split /\t/, $line )[0,1];
+		$hugoToUniprot->{$hugo} = $uniprot;		
+	}
+	$fh->close();
+
+	# go through each mutation key and make a hash with uniprotID->transcript->{mutations}
+	foreach my $mutationKey ( keys %{$temp_mutations} ) {
+		my $hugo = ( split /:/,$mutationKey )[0];
+		my $uniprot = $hugoToUniprot->{$hugo};
+		# add this part of the mutation hash to vfHash
+		$vfHash->{$uniprot}->{$mutationKey} = 0;
+	}
+	return $vfHash;
+}
+
 sub getARepresentativeAnnotation { # choose a representative out of all the mutations detected as same protein position
 	my ( $this , $mutations , $mutationKey, $siteVertexMap ) = @_;
 	# my $ra = ".:.";
@@ -1535,7 +1575,7 @@ sub isSameProteinPosition { # shared
 				$aaPosition1 =  $1;
 			} else {
 				unless ( $refAlt1 =~ m/$PTM/ ) {
-					print "...next, no match aaChange1\n";
+					# print "...next, no match aaChange1\n";
 					next;
 				}
 			}
@@ -1549,7 +1589,7 @@ sub isSameProteinPosition { # shared
 						$aaPosition2 =  $1;
 					} else {
 						unless ( $refAlt2 =~ m/$PTM/ ) {
-							print "...next, no match aaChange2\n";
+							# print "...next, no match aaChange2\n";
 							next;
 						}
 					}
@@ -1917,15 +1957,22 @@ sub density_help_text{
 Usage: hotspot3d density [options]
 
                              REQUIRED
---pairwise-file              3D pairwise data file
+--pairwise-file              A .pairwise file with mutation-mutation pairs (provide maf-file)
+
+                             CONDITIONAL REQUIREMENT
+--maf-file                   .maf file used in proximity search step
+                                 necessary for pairwise, drug-clean, or musites pair data
 
                              OPTIONAL
 --Epsilon                    Epsilon value, default: 10
 --MinPts                     MinPts, default: 4
 --number-of-runs             Number of density clustering runs to perform before the cluster membership probability being calculated, default: 10
 --probability-cut-off        Clusters will be formed with variants having at least this probability, default: 100
 --distance-measure           Pair distance to use (shortest or average), default: average
---structure-dependent        Clusters for each structure or across all structures (dependent or independent), default: independent 
+--structure-dependent        Clusters for each structure or across all structures (dependent or independent), default: independent
+--use-JSON                   Use pre-encoded mutations and distance-matrix hashes in json format, default (no flag): do not use json
+--mutations-hash-json-file   JSON encoded mutations hash file produced by a previous cluster run
+--distance-matrix-json-file  JSON encoded distance-matrix hash file produced by a previous cluster run 
 
 --help                       this message
 
@@ -1948,22 +1995,16 @@ Usage: hotspot3d cluster [options]
                              CONDITIONAL REQUIREMENT
 --maf-file                   .maf file used in proximity search step
                                  necessary for pairwise, drug-clean, or musites pair data
+--hup-file                   hugo.uniprot.pdb.csv file location (this file is generated inside the preprocess data directory)
+                                 required if --vertex-type=site or --clustering=density
 
-                             OPTIONAL
---output-prefix              Output prefix, default: 3D_Proximity
---p-value-cutoff             P_value cutoff (<), default: 0.05 (if 3d-distance-cutoff also not set)
---3d-distance-cutoff         3D distance cutoff (<), default: 100 (if p-value-cutoff also not set)
---linear-cutoff              Linear distance cutoff (> peptides), default: 0
---max-radius                 Maximum cluster radius (max network geodesic from centroid, <= Angstroms), default: 10
+                             OPTIONAL (General)
 --clustering                 Cluster using network or density-based methods (network or density), default: network
 --vertex-type                Graph vertex type for network-based clustering (recurrence, unique, site or weight), default: site
                                  recurrence vertices are the genomic mutations for each sample from the given .maf
                                  unique vertices are the specific genomic changes
                                  site vertices are the affected protein positions
                                  weight vertices are the genomic mutations with a numerical weighting
---weight-scale               Weight scale used to control scoring of vertices, default: 20
---length-scale               Length scale used to control scoring of vertices, default: 10
---vertex-score               Vertex score system to use (centrality, exponentials), default: centrality
 --distance-measure           Pair distance to use (shortest or average), default: average
 --structure-dependent        Clusters for each structure or across all structures, default (no flag): independent
 --subunit-dependent          Clusters for each subunit or across all subunits, default (no flag): independent
@@ -1977,9 +2018,26 @@ Usage: hotspot3d cluster [options]
 --max-processes              Set if using parallel type local (CAUTION: make sure you know your max CPU processes)
 --gene-list-file             Choose mutations from the genes given in this list
 --structure-list-file        Choose mutations from the structures given in this list
+
+                              OPTIONAL (Network)
+--output-prefix              Output prefix, default: 3D_Proximity
+--p-value-cutoff             P_value cutoff (<), default: 0.05 (if 3d-distance-cutoff also not set)
+--3d-distance-cutoff         3D distance cutoff (<), default: 100 (if p-value-cutoff also not set)
+--linear-cutoff              Linear distance cutoff (> peptides), default: 0
+--max-radius                 Maximum cluster radius (max network geodesic from centroid, <= Angstroms), default: 10
+--weight-scale               Weight scale used to control scoring of vertices, default: 20
+--length-scale               Length scale used to control scoring of vertices, default: 10
+--vertex-score               Vertex score system to use (centrality, exponentials), default: centrality
+
+                              OPTIONAL (Density)
 --use-JSON                   Use pre-encoded mutations and distance-matrix hashes in json format, default (no flag): do not use json
 --mutations-hash-json-file   JSON encoded mutations hash file produced by a previous cluster run
 --distance-matrix-json-file  JSON encoded distance-matrix hash file produced by a previous cluster run
+--Epsilon                    Epsilon value, default: 10
+--MinPts                     MinPts, default: 4
+--number-of-runs             Number of density clustering runs to perform before the cluster membership probability being calculated, default: 10
+--probability-cut-off        Clusters will be formed with variants having at least this probability, default: 100
+
 
 
 

diff --git a/lib/TGI/Mutpro/Main/Density.pm b/lib/TGI/Mutpro/Main/Density.pm
@@ -104,12 +104,12 @@ sub process {
 		printHash( $mutations, "mutations_hash" );
         printHash( $siteVertexMap, "siteVertexMap_hash" );
 
-        # print "mutations\n";
-        # print Dumper $mutations;
-        # print "distance_matrix\n";
-        # print Dumper $distance_matrix;
-        # print "siteVertexMap\n";
-        # print Dumper $siteVertexMap;
+        print "mutations\n";
+        print Dumper $mutations;
+        print "distance_matrix\n";
+        print Dumper $distance_matrix;
+        print "siteVertexMap\n";
+        print Dumper $siteVertexMap;
 	}
 
     $this->{"siteVertexMap"} = $siteVertexMap; # store the reference to siteVertexMap
@@ -1014,6 +1014,10 @@ sub getClusterProbabilities{
                             $variant =~ /(\w+)\:(\D\.\D+\d+\D+)/g;
                             # print OUT "$SCID.$levelID.$SubID\t$1\t$2\t0\t0\t0\t0\t$CurrentEpsilon\t$CurrentAvgDensity\t$CoveringClusters\t$genomicAnnotation\n";
 
+                            if ( scalar keys %{$this->{Memberships}->{$SCID}->{$levelID}->{$SubID}} == 1 and $CurrentAvgDensity == 0.1 ) { # artificially resetting epsilon-prime to 0.1 for singleton clusters
+                                $CurrentEpsilon = 0.1;
+                            }
+
                             # print clusters with other mutations (not only representative vertices, but all)
 
                             my $gene = $1;