forked from mskcc/vcf2maf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf2maf.pl
386 lines (332 loc) · 15.3 KB
/
vcf2maf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
#!/usr/bin/env perl
use strict;
use warnings;
use IO::File;
use File::Temp qw( tempdir );
use Getopt::Long qw( GetOptions );
use Pod::Usage qw( pod2usage );
# Set any default paths and constants
my $snpeff_cmd = "java -Xmx2g -jar ~/snpEff/snpEff.jar eff -config ~/snpEff/snpEff.config -noStats -hgvs GRCh37.73";
# Check for missing or crappy arguments
unless( @ARGV and $ARGV[0]=~m/^-/ ) {
pod2usage( -verbose => 0, -message => "$0: Missing or invalid arguments!\n", -exitval => 2 );
}
# Parse options and print usage if there is a syntax error, or if usage was explicitly requested
my ( $man, $help ) = ( 0, 0 );
my ( $input_vcf, $output_maf, $tumor_id, $normal_id );
GetOptions(
'help!' => \$help,
'man!' => \$man,
'input-vcf=s' => \$input_vcf,
'output-maf=s' => \$output_maf,
'tumor-id=s' => \$tumor_id,
'normal-id=s' => \$normal_id,
'snpeff-cmd=s' => \$snpeff_cmd
) or pod2usage( -verbose => 1, -input => \*DATA, -exitval => 2 );
pod2usage( -verbose => 1, -input => \*DATA, -exitval => 0 ) if( $help );
pod2usage( -verbose => 2, -input => \*DATA, -exitval => 0 ) if( $man );
# Make sure all our input files exist and are non-zero
( -s $input_vcf ) or die "Input VCF file missing or empty!\nPath: $input_vcf\n";
# Run snpEff to annotate the VCF variants to all possible isoforms, unless it was done earlier
my $snpeff_vcf = $input_vcf;
$snpeff_vcf =~ s/.vcf$//;
$snpeff_vcf .= ".anno.vcf";
if( -s $snpeff_vcf ) {
print "Reusing this snpEff annotated VCF: $snpeff_vcf\n";
}
else {
print "Running snpEff to annotate each variant to all possible isoforms...\n";
print `$snpeff_cmd $input_vcf > $snpeff_vcf`;
}
# Use STDIN/STDOUT if input/output filenames are not defined
my ( $vcf_fh, $maf_fh ) = ( *STDIN, *STDOUT );
$vcf_fh = IO::File->new( $snpeff_vcf ) or die "Couldn't open file $snpeff_vcf! $!" if( $snpeff_vcf );
$maf_fh = IO::File->new( $output_maf, ">" ) or die "Couldn't open file $output_maf! $!" if( $output_maf );
# Predefine the columns of the MAF Header, and add 4 columns to the end for protein changes
# Ref: https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+%28MAF%29+Specification
my @mafHeader = qw(
Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand
Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode
Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2
Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status
Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer
Tumor_Sample_UUID Matched_Norm_Sample_UUID Codon_Change Protein_Change Transcript_ID Exon_Number
);
# Parse through each variant in the VCF, pull out the snpEff effects, and choose one isoform per
# variant whose annotation will be used in the MAF
my ( $num_snvs, $num_indels, $num_svs ) = ( 0, 0, 0 );
$maf_fh->print( join( "\t", @mafHeader ), "\n" ); # Print MAF header
while( my $line = $vcf_fh->getline ) {
# Skip all comment lines and the header cuz we know what we're doing!
next if( $line =~ m/^#/ );
# We don't need any of the data from the formatted columnms. Fetch everything else
chomp( $line );
my ( $chrom, $pos, $ids, $ref, $alt, $qual, $filter, $info_line ) = split( /\t/, $line );
# Parse out the data in the info column, and store into a hash
my %info = map {(m/=/ ? (split(/=/)) : ($_,1))} split( /\;/, $info_line );
# Figure out the appropriate start/stop loci and var type/allele to report in the MAF
my $start = my $stop = my $var = my $var_type = "";
my ( $ref_length, $alt_length, $indel_size ) = ( length( $ref ), length( $alt ), 0 );
if( $ref_length == $alt_length ) { # Handle SNP, DNP, TNP, or ONP
( $start, $stop ) = ( $pos, $pos + $alt_length - 1 );
$var = $alt;
$num_snvs++;
my %np_type = qw( 1 SNP 2 DNP 3 TNP );
$var_type = ( $alt_length > 3 ? "ONP" : $np_type{$alt_length} );
}
elsif( $ref_length < $alt_length ) { # Handle insertions
$indel_size = length( $alt ) - $ref_length;
( $ref, $var ) = ( "-", substr( $alt, $ref_length, $indel_size ));
( $start, $stop ) = ( $pos, $pos + 1 );
$num_indels++;
$var_type = "INS";
}
elsif( $ref_length > $alt_length ) { # Handle deletions
$indel_size = length( $ref ) - $alt_length;
( $ref, $var ) = ( substr( $ref, $alt_length, $indel_size ), "-" );
( $start, $stop ) = ( $pos + 1, $pos + $indel_size );
$num_indels++;
$var_type = "DEL";
}
else { # Poop-out on unknown variant types
die "Unhandled variant type in VCF:\n$line\nPlease check/update this script!";
}
# Parse through each snpEff effect, which is stored in this format:
# Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon | GenotypeNum [ | ERRORS | WARNINGS ] )
my $effLine = $info{EFF};
my @effList = split( /,/, $effLine ) if( $effLine );
# Load all effects & details into lists, skipping those with errors/warnings, and find the highest rank of their effect types, as defined in sub GetEffectPriority
my $bestEffectRank;
my @allEffects = ();
my @allEffectDetails = ();
foreach my $eff ( @effList ) {
if( $eff =~ /^(\w+)\((.+)\)$/ ) {
my $effect = $1;
my @effectDetails = split( /\|/, $2 );
my ( $warnings, $errors ) = @effectDetails[11,12];
unless( $warnings or $errors ) {
$bestEffectRank = GetEffectPriority( $effect ) if( !$bestEffectRank or $bestEffectRank > GetEffectPriority( $effect ));
push( @allEffects, $effect );
push( @allEffectDetails, \@effectDetails );
}
}
}
# Among effects with the bestEffectRank, find the highest rank of their biotypes, as defined in sub GetBiotypePriority
my $bestBiotypeRank;
for( my $i = 0; $i < scalar( @allEffectDetails ); $i++ ) {
my $effect = $allEffects[$i];
my @effectDetails = @{$allEffectDetails[$i]};
my $biotype = $effectDetails[6];
if( GetEffectPriority( $effect ) == $bestEffectRank ) {
$bestBiotypeRank = GetBiotypePriority( $biotype ) if( !$bestBiotypeRank or $bestBiotypeRank > GetBiotypePriority( $biotype ));
}
}
# Among effects with the bestEffectRank and bestBiotypeRank, choose the longest isoform as the one we annotate our variant to
my $longest_aa_length = 0;
my ( $bestEffect, @bestEffectDetails ) = ( "", () );
for( my $i = 0; $i < scalar( @allEffectDetails ); $i++ ) {
my $effect = $allEffects[$i];
my @effectDetails = @{$allEffectDetails[$i]};
my ( $aa_length, $biotype ) = @effectDetails[4,6];
$aa_length = 0 unless( $aa_length );
if( GetEffectPriority( $effect ) == $bestEffectRank and GetBiotypePriority( $biotype ) == $bestBiotypeRank and $longest_aa_length <= $aa_length ) {
$longest_aa_length = $aa_length;
$bestEffect = $effect;
@bestEffectDetails = @effectDetails;
}
}
# Construct the MAF columns and print to file
my %mafLine = map{($_,'')} @mafHeader;
$mafLine{'Hugo_Symbol'} = ( $bestEffectDetails[5] ? $bestEffectDetails[5] : 'Unknown' );
$mafLine{'Entrez_Gene_Id'} = '0';
$mafLine{'Center'} = '.';
$mafLine{'NCBI_Build'} = 37;
$mafLine{'Chromosome'} = $chrom;
$mafLine{'Start_Position'} = $start;
$mafLine{'End_Position'} = $stop;
$mafLine{'Strand'} = '+';
$mafLine{'Variant_Classification'} = GetVariantClassification($bestEffect, \@bestEffectDetails, $var_type);
$mafLine{'Variant_Type'} = $var_type;
$mafLine{'Reference_Allele'} = $ref;
$mafLine{'Tumor_Seq_Allele1'} = $var;
$mafLine{'Tumor_Seq_Allele2'} = $var;
$mafLine{'dbSNP_RS'} = GetrsIDs( $ids );
$mafLine{'Tumor_Sample_Barcode'} = $tumor_id;
$mafLine{'Matched_Norm_Sample_Barcode'} = $normal_id;
$mafLine{'Codon_Change'} = ( $bestEffectDetails[2] ? $bestEffectDetails[2] : "." );
$mafLine{'Protein_Change'} = ( $bestEffectDetails[3] ? $bestEffectDetails[3] : "." );
$mafLine{'Transcript_ID'} = ( $bestEffectDetails[8] ? $bestEffectDetails[8] : "." );
$mafLine{'Exon_Number'} = ( $bestEffectDetails[9] ? $bestEffectDetails[9] : "." );
foreach my $col (@mafHeader) {
$maf_fh->print( "\t" ) if ( $col ne $mafHeader[0] );
$maf_fh->print( $mafLine{$col} );
$maf_fh->print( "\n" ) if ( $col eq $mafHeader[$#mafHeader] );
}
}
# Prioritize snpEff effects, in the following generalized order:
# 1. Truncating
# 2. Missense
# 3. 5'UTR
# 4. 3'UTR
# 5. Upstream
# 6. Downstream
# 7. Synonymous
# 8. Non-protein
# 9. Intron - Conserved
# 10. Intron
# 11. Intergenic - Conserved
# 12. Intergenic/intragenic
sub GetEffectPriority {
my ($effect) = @_;
my %effectPriority = (
'SPLICE_SITE_ACCEPTOR' => 1,
'SPLICE_SITE_DONOR' => 1,
'START_LOST' => 1,
'EXON_DELETED' => 1,
'FRAME_SHIFT' => 1,
'STOP_GAINED' => 1,
'STOP_LOST' => 1,
'START_GAINED' => 2,
'RARE_AMINO_ACID' => 2,
'NON_SYNONYMOUS_START' => 2,
'NON_SYNONYMOUS_CODING' => 2,
'CODON_CHANGE' => 2,
'CODON_INSERTION' => 2,
'CODON_CHANGE_PLUS_CODON_INSERTION' => 2,
'CODON_DELETION' => 2,
'CODON_CHANGE_PLUS_CODON_DELETION' => 2,
'UTR_5_PRIME' => 3,
'UTR_5_DELETED' => 3,
'UTR_3_PRIME' => 4,
'UTR_3_DELETED' => 4,
'UPSTREAM' => 5,
'DOWNSTREAM' => 6,
'SYNONYMOUS_START' => 7,
'CDS' => 7,
'SYNONYMOUS_CODING' => 7,
'SYNONYMOUS_STOP' => 7,
'GENE' => 8,
'TRANSCRIPT' => 8,
'EXON' => 8,
'INTRON_CONSERVED' => 9,
'INTRON' => 10,
'INTERGENIC_CONSERVED' => 11,
'INTRAGENIC' => 12,
'INTERGENIC' => 12
);
$effectPriority{$effect} or die "ERROR: Unrecognized effect \"$effect\". Please update your hashes!";
return $effectPriority{$effect};
}
# Prioritize the transcript biotypes that variants are annotated to
sub GetBiotypePriority {
my ( $biotype ) = @_;
my %biotypePriority = (
'protein_coding' => 1,
'IG_C_gene' => 2,
'IG_D_gene' => 2,
'IG_J_gene' => 2,
'IG_V_gene' => 2,
'TR_C_gene' => 2,
'TR_D_gene' => 2,
'TR_J_gene' => 2,
'TR_V_gene' => 2,
'miRNA' => 3,
'snRNA' => 3,
'snoRNA' => 3,
'rRNA' => 3,
'lincRNA' => 3,
'Mt_tRNA' => 4,
'Mt_rRNA' => 4,
'antisense' => 5,
'sense_intronic' => 5,
'sense_overlapping' => 5,
'3prime_overlapping_ncrna' => 5,
'misc_RNA' => 5,
'processed_transcript' => 6,
'TEC' => 6,
'retained_intron' => 7,
'nonsense_mediated_decay' => 7,
'non_stop_decay' => 7,
'pseudogene' => 8,
'processed_pseudogene' => 8,
'polymorphic_pseudogene' => 8,
'transcribed_processed_pseudogene' => 8,
'transcribed_unprocessed_pseudogene' => 8,
'unitary_pseudogene' => 8,
'unprocessed_pseudogene' => 8,
'IG_C_pseudogene' => 8,
'IG_J_pseudogene' => 8,
'IG_V_pseudogene' => 8,
'TR_J_pseudogene' => 8,
'TR_V_pseudogene' => 8,
'' => 9
);
$biotypePriority{$biotype} or die "ERROR: Unrecognized biotype \"$biotype\". Please update your hashes!";
return $biotypePriority{$biotype};
}
# Converts snpEff effect types to MAF variant classifications
sub GetVariantClassification {
my ( $effect, $ref_bestEffectDetails, $var_type ) = @_;
return "5'UTR" if( $effect =~ /UTR_5/ );
return "3'UTR" if( $effect =~ /UTR_3/ );
return "IGR" if( $effect =~ /INTERGENIC/ );
return "Intron" if ( $effect =~ /INTRON/ );
return "Splice_Site" if( $effect =~ /^SPLICE_SITE/ );
return "5'Flank" if( $effect eq 'UPSTREAM' );
return "3'Flank" if ( $effect eq 'DOWNSTREAM' );
return "Missense_Mutation" if( $effect eq 'NON_SYNONYMOUS_CODING' or $effect eq 'CODON_CHANGE' or $effect eq 'RARE_AMINO_ACID' );
return "Translation_Start_Site" if( $effect eq 'START_LOST' );
return "De_novo_Start_InFrame" if( $effect eq 'START_GAINED' );
return "Nonsense_Mutation" if( $effect eq 'STOP_GAINED' );
return "Nonstop_Mutation" if( $effect eq 'STOP_LOST' );
return "Frame_Shift_Ins" if( $effect eq 'FRAME_SHIFT' and $var_type eq 'INS' );
return "Frame_Shift_Del" if ( $effect eq 'FRAME_SHIFT' and $var_type eq 'DEL' );
return "In_Frame_Ins" if( $effect eq 'CODON_INSERTION' or $effect eq 'CODON_CHANGE_PLUS_CODON_INSERTION' );
return "In_Frame_Del" if( $effect eq 'CODON_DELETION' or $effect eq 'CODON_CHANGE_PLUS_CODON_DELETION' );
return "Silent" if( $effect =~ /^SYNONYMOUS_/ );
return "IGR" if( $effect eq 'INTRAGENIC' );
# All non-coding RNA genes are grouped into one classification
return "RNA" if( $effect eq 'EXON' and ${$ref_bestEffectDetails}[6]=~m/RNA/ and ${$ref_bestEffectDetails}[7]=~m/^NON_CODING$/ );
# Annotate everything else simply as a targeted region
return "Targeted_Region";
}
# If the VCF variant ID contains multiple entries, return the ones that look like dbSNP rsIDs
sub GetrsIDs {
my ( $id_line ) = @_;
return '' if( !$id_line or $id_line eq '.' or $id_line=~m/^\s*$/ ); # Handle null data
my @rs_ids = grep( /^rs\d+$/, split( /;/, $id_line )); # Pull out rsIDs into a list
# If no rsIDs were found, return null. Else return a semicolon delimited list
return '' unless( scalar( @rs_ids ) > 0 );
return join( ";", @rs_ids );
}
__DATA__
=head1 NAME
vcf2maf.pl - Convert VCF to MAF. Use snpEff to annotate calls to all possible isoforms and choose one to report.
=head1 SYNOPSIS
perl vcf2maf.pl \
--input-vcf WD1309_vs_NB1308.vcf --output-maf WD1309_vs_NB1308.maf \
--tumor-id WD1309 --normal-id NB1308 \
--snpeff-cmd "java -Xmx2g -jar /srv/java/snpEff/snpEff.jar eff -cancer -lof -hgvs -config /srv/java/snpEff/snpEff.config -noStats GRCh37.73"
=head1 OPTIONS
=over 8
=item B<--tumor-id>
The tumor sample ID to report in the 16th column of the MAF
=item B<--normal-id>
The normal sample ID to report in the 17th column of the MAF
=item B<--snpeff-cmd>
Command to run snpEff annotation on a VCF file
=item B<--help>
Prints a brief help message and quits
=item B<--man>
Prints the detailed manual
=back
=head1 DESCRIPTION
To convert a VCF into a MAF, each variant must be annotated to only one of all possible gene transcripts/isoforms that it might affect. This selection of a single affected transcript/isoform per variant, is often subjective. For now, we try to follow best-practices, but over time, the selection process will be made smarter and more configurable.
This script needs snpEff (snpeff.sourceforge.net), a variant annotator that can quickly map each variant to all possible transcripts in a database. snpEff is downloadable as a java archive, so make sure you also have Java installed.
=head1 AUTHORS
Cyriac Kandoth (ckandoth@gmail.com)
William Lee (leew1@cbio.mskcc.org)
=head1 LICENSE
LGPLv3, Memorial Sloan-Kettering Cancer Center, New York, NY 10065, USA
=cut