Skip to content

Commit

Permalink
Adding recurrence and weight based clustering to the Density Module
Browse files Browse the repository at this point in the history
  • Loading branch information
amilacsw committed Sep 28, 2017
1 parent e399f75 commit 6bf3a23
Show file tree
Hide file tree
Showing 5 changed files with 255 additions and 77 deletions.
Binary file modified HotSpot3D-1.7.3.tar.gz
Binary file not shown.
76 changes: 56 additions & 20 deletions lib/TGI/Mutpro/Main/Cluster.pm
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ sub new {
$this->{'JSON_status'} = undef;
$this->{'mutations_json_hash'} = undef;
$this->{'distance_matrix_json_hash'} = undef;
$this->{'siteVertexMap_json_hash'} = undef;

bless $this, $class;
$this->process();
Expand Down Expand Up @@ -250,6 +251,7 @@ sub setOptions {
'use-JSON' => \$this->{'JSON_status'},
'mutations-hash-json-file=s' => \$this->{'mutations_json_hash'},
'distance-matrix-json-file=s' => \$this->{'distance_matrix_json_hash'},
'siteVertexMap-json-file=s' => \$this->{'siteVertexMap_json_hash'},

'help' => \$help,
);
Expand Down Expand Up @@ -349,10 +351,10 @@ sub setOptions {
if ( defined $this->{'JSON_status'} ) {
$this->{'JSON_status'} = 1;
warn "HotSpot3D::Cluster::setOptions warning: use-JSON flag used (will not look for pairwise data or maf file)\n";
if ( not defined $this->{'mutations_json_hash'} or not defined $this->{'distance_matrix_json_hash'} ) {
if ( not defined $this->{'mutations_json_hash'} or not defined $this->{'distance_matrix_json_hash'} or not defined $this->{'siteVertexMap_json_hash'} ) {
die "HotSpot3D::Cluster::setOptions Error: use-JSON flag is used, but json file locations are not provided!\n";
}
elsif ( not -e $this->{'mutations_json_hash'} or not -e $this->{'mutations_json_hash'} ) {
elsif ( not -e $this->{'mutations_json_hash'} or not -e $this->{'mutations_json_hash'} or not -e $this->{'siteVertexMap_json_hash'} ) {
die "HotSpot3D::Cluster::setOptions Error: use-JSON flag is used, but the provided JSON files do not exist!\n";
}
}
Expand Down Expand Up @@ -530,8 +532,8 @@ sub launchClustering {

sub vertexFilter {
#$this->vertexFilter( $temp_mutations , $temp_distance_matrix , $mutations , $distance_matrix );
my ( $this , $temp_mutations , $temp_distance_matrix , $mutations , $distance_matrix ) = @_;
if ( $this->{'vertex_type'} eq $SITE ) {
my ( $this , $temp_mutations , $temp_distance_matrix , $mutations , $distance_matrix, $siteVertexMap ) = @_;
if ( $this->{'vertex_type'} eq $SITE or $this->{'clustering'} eq $DENSITY ) {
print STDOUT "Filtering vertices\n";
#TODO if using a different .maf from search step, then some mutations can be missed
my $vertexMap = {}; #a hash to map isSameProteinPosition vertices (and others to their selves)-- map=f()
Expand All @@ -542,13 +544,15 @@ sub vertexFilter {
next if not exists $temp_mutations->{$mutationKey1};
foreach my $mutationKey2 ( @mKeys ) {
next if not exists $temp_mutations->{$mutationKey2};
if ( $mutationKey1 eq $mutationKey2 ) {
if ( $mutationKey1 eq $mutationKey2 ) { # this if condition is important to capture mk2=mk1 cases to $siteVertexMap hash
$vertexMap->{$mutationKey2} = $mutationKey1;
$siteVertexMap->{$mutationKey1}->{$mutationKey2} = $temp_mutations->{$mutationKey2};
# print "ACSW::VertexFilter::Equal $mutationKey2 \=\=\> $mutationKey1\n";
next;
}
elsif ( $this->isSameProteinPosition( $temp_mutations , $mutationKey1 , $mutationKey2 ) ) { #if same site
$vertexMap->{$mutationKey2} = $mutationKey1;
$siteVertexMap->{$mutationKey1}->{$mutationKey2} = $temp_mutations->{$mutationKey2};
print "ACSW::VertexFilter::SameSite $mutationKey2 \=\=\> $mutationKey1\n";
delete $temp_mutations->{$mutationKey2};
}
Expand All @@ -558,8 +562,14 @@ sub vertexFilter {

#generate representative annotations
foreach my $mutationKey ( keys %{$temp_mutations} ) {
my ( $ra , $pk ) = $this->getARepresentativeAnnotation( $temp_mutations , $mutationKey );
$mutations->{$mutationKey}->{$ra}->{$pk} = 1;
my ( $ra , $pk, $highestRecurrence, $totalRecurrence ) = $this->getARepresentativeAnnotation( $temp_mutations , $mutationKey, $siteVertexMap );
if ( $this->{'vertex_type'} eq $SITE ) {
$mutations->{$mutationKey}->{$ra}->{$pk} = 1; # weight = 1 for SITE
}
else {
$mutations->{$mutationKey}->{$ra}->{$pk} = $totalRecurrence; # weight = total recurrence/weight for RECURRENCE/WEIGHT
}

}
print "ACSW::VertexFilter::mutations representative annotation done\n";

Expand All @@ -583,30 +593,56 @@ sub vertexFilter {
}
}
}
# print "vertex_map\n";
# print Dumper $vertexMap;
} else {
%{$mutations} = %{$temp_mutations};
%{$distance_matrix} = %{$temp_distance_matrix};
}
$temp_mutations = undef;
$temp_distance_matrix = undef;
# print "distance_matrix\n";
# print Dumper $distance_matrix;

return;
}

sub getARepresentativeAnnotation {
my ( $this , $mutations , $mutationKey ) = @_;
my $ra = ".:.";
my $pk = ".:p.";
foreach my $refAlt ( keys %{$mutations->{$mutationKey}} ) {
$ra = $refAlt;
foreach my $proteinKey ( keys %{$mutations->{$mutationKey}->{$refAlt}} ) {
$pk = $proteinKey;
last;
sub getARepresentativeAnnotation { # choose a representative out of all the mutations detected as same protein position
my ( $this , $mutations , $mutationKey, $siteVertexMap ) = @_;
# my $ra = ".:.";
# my $pk = ".:p.";
my ($ra, $pk, $highestRecurrence ) = undef;
my $totalRecurrence = 0;

foreach my $mk ( keys %{$siteVertexMap->{$mutationKey}} ) {
foreach my $refAlt ( keys %{$siteVertexMap->{$mutationKey}->{$mk}} ) {
my $lastPK; # to store one proteinKey from one ref:alt
foreach my $proteinKey ( keys %{$siteVertexMap->{$mutationKey}->{$mk}->{$refAlt}} ) {
$lastPK = $proteinKey;
if ( $mk eq $mutationKey ) { # to make sure our selected refAlt and pKey are members of the mutationKey used in downstream
if ( not defined $highestRecurrence ) { # assigning to the first entry
$ra = $refAlt;
$pk = $proteinKey;
$highestRecurrence = $siteVertexMap->{$mutationKey}->{$mk}->{$refAlt}->{$proteinKey};
}
elsif ( $highestRecurrence < $siteVertexMap->{$mutationKey}->{$mk}->{$refAlt}->{$proteinKey} ) { # this recurrence is larger than everything seen so far
$ra = $refAlt;
$pk = $proteinKey;
$highestRecurrence = $siteVertexMap->{$mutationKey}->{$mk}->{$refAlt}->{$proteinKey};
}
}
}
$totalRecurrence += $siteVertexMap->{$mutationKey}->{$mk}->{$refAlt}->{$lastPK}; # add the recurrence of one proteinKey per ref:alt
}
last;
}
return ( $ra , $pk );

# foreach my $refAlt ( keys %{$mutations->{$mutationKey}} ) {
# $ra = $refAlt;
# foreach my $proteinKey ( keys %{$mutations->{$mutationKey}->{$refAlt}} ) {
# $pk = $proteinKey;
# last;
# }
# last;
# }
return ( $ra , $pk, $highestRecurrence, $totalRecurrence );
}

sub checkPartners {
Expand Down
37 changes: 36 additions & 1 deletion lib/TGI/Mutpro/Main/ColorScore.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,49 @@
#!/usr/bin/env Rscript

# Authors : Amila Weerasinghe
# Generates a reachability plot colored by weight (*.Horiz.weights.pdf)

args = commandArgs(trailingOnly=TRUE)
d <- read.table(args[1], header=FALSE, sep = "\t") # data table with variant,RD,genomic_annotations,weight

segData = read.table(args[2], sep = "\t") # data table with start,stop,epsilon',clusterID

#################################################################
### process the segData so that cluster labels do not overlap ###
#################################################################

# make a column to show process status
segData$Processed = 0

# display singleton clusters at RD=0.1
segData[segData$V1==segData$V2,"V3"] = 0.1
segData[segData$V1==segData$V2,"Processed"] = 1

segData$textOffset = 0

# now start from the first un-processed row and see if there are nearby levels
unprocessed = segData[segData$Processed==0,]

while ( length(unprocessed$V1) != 0 ) {
# take the first un-processed one and the nearby stuff
tab = segData[segData$V3<unprocessed$V3[1]+0.5 & segData$V3>=unprocessed$V3[1] & segData$V2==unprocessed$V2[1],]
offset = 0.1
# go through each row in tab and add offset
for (i in c(1:length(tab$V1))) {
segData[segData$V3<unprocessed$V3[1]+0.5 & segData$V3>=unprocessed$V3[1] & segData$V2==unprocessed$V2[1],][i,"textOffset"] = offset
offset = offset + 0.1
}
segData[segData$V3<unprocessed$V3[1]+0.5 & segData$V3>=unprocessed$V3[1] & segData$V2==unprocessed$V2[1],]$Processed = 1
unprocessed = segData[segData$Processed==0,] # update the unprocessed table
}
#################################################################

y0<-segData[[3]]
x0<-segData[[1]]+1
y1<-segData[[3]]
x1<-segData[[2]]+1
Cluster<-segData[[5]]
labelOffset <- segData$textOffset

names(d)[1] <- "variant"
names(d)[2] <- "RD"
Expand Down Expand Up @@ -37,7 +72,7 @@ for (i in values) {
p <- p + geom_segment(x=x0[i], y=y0[i], xend=x1[i], yend=y1[i])
}

p <- p + annotate("text", x = x1+1, y = y0, label = Cluster, size = 2)
p <- p + annotate("text", x = x1+labelOffset, y = y0, label = Cluster, size = 2)

p <- p + ggtitle(paste("Reachability Plot with weights: Epsilon=",args[4],"MinPts=",args[5]))

Expand Down
Loading

0 comments on commit 6bf3a23

Please sign in to comment.