0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 use warnings;
|
|
4 use strict;
|
|
5
|
|
6 use Getopt::Long;
|
|
7 use IO::File;
|
|
8 use File::Temp qw( tempdir );
|
|
9 use File::Spec;
|
|
10 use Cwd qw( abs_path cwd);
|
|
11
|
|
12 ## Define filtering parameters ##
|
|
13
|
|
14 my $min_read_pos = 0.10;
|
|
15 my $max_read_pos = 1 - $min_read_pos;
|
|
16 my $min_var_freq = 0.05;
|
|
17 my $min_var_count = 4;
|
|
18
|
|
19 my $min_strandedness = 0.01;
|
|
20 my $max_strandedness = 1 - $min_strandedness;
|
|
21
|
|
22 my $max_mm_qualsum_diff = 50;
|
|
23 my $max_mapqual_diff = 30;
|
|
24 my $max_readlen_diff = 25;
|
|
25 my $min_var_dist_3 = 0.20;
|
|
26 my $max_var_mm_qualsum = 100;
|
|
27
|
|
28
|
|
29 ## Parse arguments ##
|
|
30
|
|
31 my $output_basename;
|
|
32
|
|
33 my $vcf_file;
|
|
34 my $output_file;
|
|
35 my $bam_file;
|
|
36 my $bam_index;
|
|
37 my $sample;
|
|
38 my $bam_readcount_path = 'bam-readcount';
|
|
39 my $samtools_path = 'samtools';
|
|
40 my $ref_fasta;
|
|
41 my $help;
|
|
42
|
|
43
|
|
44 my $opt_result;
|
|
45 my @params = @ARGV;
|
|
46
|
|
47 $opt_result = GetOptions(
|
|
48 'vcf-file=s' => \$vcf_file,
|
|
49 'bam-file=s' => \$bam_file,
|
|
50 'bam-index=s' => \$bam_index,
|
|
51 'sample=s' => \$sample,
|
|
52 'bam-readcount=s' => \$bam_readcount_path,
|
|
53 'samtools=s' => \$samtools_path,
|
|
54 'reference=s' => \$ref_fasta,
|
|
55 'output=s' => \$output_file,
|
|
56 'min-read-pos=f' => \$min_read_pos,
|
|
57 'min-var-freq=f' => \$min_var_freq,
|
|
58 'min-var-count=f' => \$min_var_count,
|
|
59 'min-strandedness=f' => \$min_strandedness,
|
|
60 'max-mm-qualsum-diff=f' => \$max_mm_qualsum_diff,
|
|
61 'max-mapqual-diff=f' => \$max_mapqual_diff,
|
|
62 'max-readlen-diff=f' => \$max_readlen_diff,
|
|
63 'min-var-dist-3=f' => \$min_var_dist_3,
|
|
64 'max-var-mm-qualsum=f' => \$max_var_mm_qualsum,
|
|
65 'help' => \$help,
|
|
66 );
|
|
67
|
|
68 unless($opt_result) {
|
|
69 die help_text();
|
|
70 }
|
|
71
|
|
72 if($help) {
|
|
73 print STDOUT help_text();
|
|
74 exit 0;
|
|
75 }
|
|
76
|
|
77 unless($vcf_file) {
|
|
78 warn "You must provide a file to be filtered\n";
|
|
79 die help_text();
|
|
80 }
|
|
81
|
|
82 unless(-s $vcf_file) {
|
|
83 die "Can not find VCF file: $vcf_file\n";
|
|
84 }
|
|
85
|
|
86 unless($ref_fasta) {
|
|
87 warn "You must provide a reference fasta for generating readcounts to use for filtering\n";
|
|
88 die help_text();
|
|
89 }
|
|
90
|
|
91 unless(-s $ref_fasta) {
|
|
92 die "Can not find valid reference fasta: $ref_fasta\n";
|
|
93 }
|
|
94
|
|
95 unless($bam_file) {
|
|
96 die "You must provide a BAM file for generating readcounts\n";
|
|
97 die help_text();
|
|
98 }
|
|
99
|
|
100 unless(-s $bam_file) {
|
|
101 die "Can not find valid BAM file: $bam_file\n";
|
|
102 }
|
|
103
|
|
104 if($bam_index && !-s $bam_index) {
|
|
105 die "Can not find valid BAM index file: $bam_index\n";
|
|
106 }
|
|
107
|
|
108 unless($output_file) {
|
|
109 warn "You must provide an output file name: $output_file\n";
|
|
110 die help_text();
|
|
111 }
|
|
112 else {
|
|
113 $output_file = abs_path($output_file); #make sure we have full path as manipulating cwd below
|
|
114 }
|
|
115
|
|
116 unless($sample) {
|
|
117 warn "You must provide a sample name\n";
|
|
118 die help_text();
|
|
119 }
|
|
120
|
|
121 my %filters;
|
|
122 $filters{'position'} = [sprintf("PB%0.f",$min_read_pos*100), "Average position on read less than " . $min_read_pos . " or greater than " . $max_read_pos . " fraction of the read length"];
|
|
123 $filters{'strand_bias'} = [sprintf("SB%0.f",$min_strandedness*100), "Reads supporting the variant have less than " . $min_strandedness . " fraction of the reads on one strand, but reference supporting reads are not similarly biased"];
|
|
124 $filters{'min_var_count'} = [ "MVC".$min_var_count, "Less than " . $min_var_count . " high quality reads support the variant"];
|
|
125 $filters{'min_var_freq'} = [ sprintf("MVF%0.f",$min_var_freq*100), "Variant allele frequency is less than " . $min_var_freq];
|
|
126 $filters{'mmqs_diff'} = [ sprintf("MMQSD%d",$max_mm_qualsum_diff), "Difference in average mismatch quality sum between variant and reference supporting reads is greater than " . $max_mm_qualsum_diff];
|
|
127 $filters{'mq_diff'} = [ sprintf("MQD%d",$max_mapqual_diff), "Difference in average mapping quality sum between variant and reference supporting reads is greater than " . $max_mapqual_diff];
|
|
128 $filters{'read_length'} = [ sprintf("RLD%d",$max_readlen_diff), "Difference in average clipped read length between variant and reference supporting reads is greater than " . $max_readlen_diff];
|
|
129 $filters{'dist3'} = [ sprintf("DETP%0.f",$min_var_dist_3*100), "Average distance of the variant base to the effective 3' end is less than " . $min_var_dist_3];
|
|
130 $filters{'var_mmqs'} = [ sprintf("MMQS%d",$max_var_mm_qualsum), "The average mismatch quality sum of reads supporting the variant is greater than " . $max_var_mm_qualsum] if($max_var_mm_qualsum);
|
|
131 $filters{'no_var_readcount'} = [ "NRC", "Unable to grab readcounts for variant allele"];
|
|
132 $filters{'incomplete_readcount'} = [ "IRC", "Unable to grab any sort of readcount for either the reference or the variant allele"];
|
|
133
|
|
134 my $vcf_header;
|
|
135 my @vcf_lines;
|
|
136
|
|
137 my %rc_for_snp; # store info on snp positions and VCF line
|
|
138 my %rc_for_indel; #store info on indel positions and VCF line
|
|
139
|
|
140 my ($workdir, $working_fasta, $working_bam) = setup_workdir($ref_fasta, $bam_file, $bam_index);
|
|
141
|
|
142 my $starting_dir = cwd();
|
|
143
|
|
144 my $input = IO::File->new($vcf_file) or die "Unable to open input file $vcf_file: $!\n";
|
|
145 $vcf_header = parse_vcf_header($input);
|
|
146 add_filters_to_vcf_header($vcf_header, values %filters);
|
|
147 add_process_log_to_header($vcf_header, $vcf_file, @params);
|
|
148
|
|
149 my $header_line = $vcf_header->[-1];
|
|
150 chomp $header_line;
|
|
151 my @header_fields = split "\t", $header_line;
|
|
152
|
|
153 while(my $entry = $input->getline) {
|
|
154 push @vcf_lines, $entry;
|
|
155 chomp $entry;
|
|
156
|
|
157 my %fields;
|
|
158 @fields{@header_fields} = split("\t", $entry);
|
|
159
|
|
160 my $filter_sample = $fields{$sample};
|
|
161 unless($filter_sample) {
|
|
162 die "Unable to find field for $sample\n";
|
|
163 }
|
|
164 my @sample_fields = split /:/, $filter_sample;
|
|
165 unless(@sample_fields) {
|
|
166 die "Unable to parse field for $sample\n";
|
|
167 }
|
|
168 my $index = 0;
|
|
169 my %format_keys = map { $_ => $sample_fields[$index++] } split /:/, $fields{FORMAT};
|
|
170 #these are in order ACGT
|
|
171 my @alleles = ($fields{REF}, split /,/, $fields{ALT});
|
|
172 my %gt_alleles = map {$_ => 1} grep { $_ > 0 } split /\//, $format_keys{GT};
|
|
173 my @used_alleles;
|
|
174 for my $allele_index (keys %gt_alleles) {
|
|
175 push @used_alleles, $alleles[$allele_index];
|
|
176 }
|
|
177 my ($var) = sort @used_alleles; #follow existing convention of fp filter using alphabetical order to choose a single base on triallelic sites
|
|
178 $var = q{} unless defined $var; #in the case where there is no variant allele, set this to the empty string. Later it will be filtered as NRC or IRC
|
|
179 $var = uc($var);
|
|
180 my $ref = uc($fields{REF});
|
|
181 my $chrom = $fields{'#CHROM'};
|
|
182 my $pos = $fields{POS};
|
|
183
|
|
184 if(length($ref) > 1 || length($var) > 1) {
|
|
185 #it's an indel or mnp
|
|
186 if(length($ref) == length($var)) {
|
|
187 die "MNPs unsupported\n";
|
|
188 }
|
|
189 elsif(length($ref) > length($var)) {
|
|
190 #it's a deletion
|
|
191 $pos += 1;
|
|
192 ($ref, $var) = ($var, $ref);
|
|
193 $ref = substr($var, 1, 1);
|
|
194 $var = "-" . substr($var, 1);
|
|
195 }
|
|
196 else {
|
|
197 #it's an insertion
|
|
198 substr($var, 0, 1, "+");
|
|
199 }
|
|
200 $rc_for_indel{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
|
|
201 }
|
|
202 else {
|
|
203 #it's a SNP
|
|
204 $rc_for_snp{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
|
|
205 }
|
|
206 }
|
|
207
|
|
208 if(%rc_for_snp) {
|
|
209 filter_sites_in_hash(\%rc_for_snp, $bam_readcount_path, $working_bam, $working_fasta, $workdir);
|
|
210 }
|
|
211 else {
|
|
212 print STDERR "No SNP sites identified\n";
|
|
213 }
|
|
214
|
|
215 if(%rc_for_indel) {
|
|
216 filter_sites_in_hash(\%rc_for_indel, $bam_readcount_path, $working_bam, $working_fasta, $workdir, '-i');
|
|
217 }
|
|
218 else {
|
|
219 print STDERR "No Indel sites identified\n";
|
|
220 }
|
|
221
|
|
222 ## Open the output files ##
|
|
223 my $filtered_vcf = IO::File->new("$output_file","w") or die "Can't open output file $output_file: $!\n";
|
|
224 print $filtered_vcf @$vcf_header;
|
|
225 print $filtered_vcf @vcf_lines;
|
|
226
|
|
227 chdir $starting_dir or die "Unable to go back to starting dir\n";
|
|
228 exit(0);
|
|
229
|
|
230 ################################################################################
|
|
231
|
|
232 =head3 read_counts_by_allele
|
|
233
|
|
234 Retrieve relevant read counts for a certain allele
|
|
235
|
|
236
|
|
237 =cut
|
|
238
|
|
239 sub read_counts_by_allele {
|
|
240 (my $line, my $allele) = @_;
|
|
241
|
|
242 my @lineContents = split(/\t/, $line);
|
|
243 my $numContents = @lineContents;
|
|
244
|
|
245 for(my $colCounter = 5; $colCounter < $numContents; $colCounter++) {
|
|
246 my $this_allele = $lineContents[$colCounter];
|
|
247 my @alleleContents = split(/\:/, $this_allele);
|
|
248 if($alleleContents[0] eq $allele) {
|
|
249 my $numAlleleContents = @alleleContents;
|
|
250
|
|
251 return("") if($numAlleleContents < 8);
|
|
252
|
|
253 my $return_string = "";
|
|
254 my $return_sum = 0;
|
|
255 for(my $printCounter = 1; $printCounter < $numAlleleContents; $printCounter++) {
|
|
256 $return_sum += $alleleContents[$printCounter];
|
|
257 $return_string .= "\t" if($return_string);
|
|
258 $return_string .= $alleleContents[$printCounter];
|
|
259 }
|
|
260
|
|
261 return($return_string);
|
|
262
|
|
263 }
|
|
264 }
|
|
265
|
|
266 return("");
|
|
267 }
|
|
268
|
|
269
|
|
270 sub help_text {
|
|
271 return <<HELP;
|
|
272 fpfilter - Filtering for Illumina Sequencing
|
|
273
|
|
274 SYNOPSIS
|
|
275 fpfilter [options] [file ...]
|
|
276
|
|
277 OPTIONS
|
|
278 --vcf-file the input VCF file. Must have a GT field.
|
|
279 --bam-file the BAM file of the sample you are filtering on. Typically the tumor.
|
|
280 --sample the sample name of the sample you want to filter on in the VCF file.
|
|
281 --reference a fasta containing the reference sequence the BAM file was aligned to.
|
|
282 --output the filename of the output VCF file
|
|
283 --min-read-pos minimum average relative distance from start/end of read
|
|
284 --min-var-freq minimum variant allele frequency
|
|
285 --min-var-count minimum number of variant-supporting reads
|
|
286 --min-strandedness minimum representation of variant allele on each strand
|
|
287 --max-mm-qualsum-diff maximum difference of mismatch quality sum between variant and reference reads (paralog filter)
|
|
288 --max_var_mm_qualsum maximum mismatch quality sum of reference-supporting reads
|
|
289 --max-mapqual-diff maximum difference of mapping quality between variant and reference reads
|
|
290 --max-readlen-diff maximum difference of average supporting read length between variant and reference reads (paralog filter)
|
|
291 --min-var-dist-3 minimum average distance to effective 3prime end of read (real end or Q2) for variant-supporting reads
|
|
292 --help this message
|
|
293
|
|
294 DESCRIPTION
|
|
295 This program will filter a VCF with a variety of filters as detailed in the VarScan2 paper (http://www.ncbi.nlm.nih.gov/pubmed/22300766). It requires the bam-readcount utility (https://github.com/genome/bam-readcount).
|
|
296
|
|
297 This filter was calibrated on 100bp PE Illumina reads. It is likely to be overly stringent for longer reads and may be less effective on shorter reads.
|
|
298
|
|
299 AUTHORS
|
|
300 Dan Koboldt Original code
|
|
301 Dave Larson Modifications for VCF and exportation.
|
|
302
|
|
303 HELP
|
|
304 }
|
|
305
|
|
306 ### methods copied from elsewhere begin here...
|
|
307 sub parse_vcf_header {
|
|
308 my $input_fh = shift;
|
|
309
|
|
310 my @header;
|
|
311 my $header_end = 0;
|
|
312 while (!$header_end) {
|
|
313 my $line = $input_fh->getline;
|
|
314 if ($line =~ m/^##/) {
|
|
315 push @header, $line;
|
|
316 } elsif ($line =~ m/^#/) {
|
|
317 push @header, $line;
|
|
318 $header_end = 1;
|
|
319 } else {
|
|
320 die "Missed the final header line with the sample list? Last line examined: $line Header so far: " . join("\n", @header) . "\n";
|
|
321 }
|
|
322 }
|
|
323 return \@header;
|
|
324 }
|
|
325
|
|
326 sub generate_region_list {
|
|
327 my ($hash, $region_fh) = @_; #input_fh should be a filehandle to the VCF
|
|
328 print STDERR "Printing variants to temporary region_list file...\n";
|
|
329 for my $chr (keys %$hash) {
|
|
330 for my $pos (sort { $a <=> $b } keys %{$hash->{$chr}}) {
|
|
331 print $region_fh "$chr\t$pos\t$pos\n";
|
|
332 }
|
|
333 }
|
|
334 }
|
|
335
|
|
336 sub _simplify_indel_allele {
|
|
337 my ($ref, $var) = @_;
|
|
338 #these could be padded e.g. G, GT for a T insertion or GCC G for a 2bp deletion
|
|
339 #they could also be complex e.g. GCCCGT, GCGT for a 2bp deletion
|
|
340 #they could also represent an insertion deletion event e.g. GCCCGT GCGGGGT; these cannot be represented in genome bed. Throw an error or warn.
|
|
341 #
|
|
342 #I think the algorithm should be trim end (no updating of coords required)
|
|
343 #trim beginning and return number of bases trimmed
|
|
344
|
|
345 my @ref_array = map { uc } split //, $ref;
|
|
346 my @var_array = map { uc } split //, $var;
|
|
347
|
|
348 while(@ref_array and @var_array and $ref_array[-1] eq $var_array[-1]) {
|
|
349 pop @ref_array;
|
|
350 pop @var_array;
|
|
351 }
|
|
352
|
|
353 my $right_shift = 0;
|
|
354 while(@ref_array and @var_array and $ref_array[0] eq $var_array[0]) {
|
|
355 shift @ref_array;
|
|
356 shift @var_array;
|
|
357 $right_shift++;
|
|
358 }
|
|
359
|
|
360 return (join("",@ref_array), join("",@var_array), $right_shift);
|
|
361 }
|
|
362
|
|
363 sub filter_site {
|
|
364 my ($ref_result, $var_result) = @_;
|
|
365 #this will return a list of filter names
|
|
366 my @filter_names;
|
|
367 if($ref_result && $var_result) {
|
|
368 ## Parse out the bam-readcounts details for each allele. The fields should be: ##
|
|
369 #num_reads : avg_mapqual : avg_basequal : avg_semq : reads_plus : reads_minus : avg_clip_read_pos : avg_mmqs : reads_q2 : avg_dist_to_q2 : avgRLclipped : avg_eff_3'_dist
|
|
370 my ($ref_count, $ref_map_qual, $ref_base_qual, $ref_semq, $ref_plus, $ref_minus, $ref_pos, $ref_subs, $ref_mmqs, $ref_q2_reads, $ref_q2_dist, $ref_avg_rl, $ref_dist_3) = split(/\t/, $ref_result);
|
|
371 my ($var_count, $var_map_qual, $var_base_qual, $var_semq, $var_plus, $var_minus, $var_pos, $var_subs, $var_mmqs, $var_q2_reads, $var_q2_dist, $var_avg_rl, $var_dist_3) = split(/\t/, $var_result);
|
|
372
|
|
373 my $ref_strandedness = my $var_strandedness = 0.50;
|
|
374 $ref_dist_3 = 0.5 if(!$ref_dist_3);
|
|
375
|
|
376 ## Use conservative defaults if we can't get mismatch quality sums ##
|
|
377 $ref_mmqs = 50 if(!$ref_mmqs);
|
|
378 $var_mmqs = 0 if(!$var_mmqs);
|
|
379 my $mismatch_qualsum_diff = $var_mmqs - $ref_mmqs;
|
|
380
|
|
381 ## Determine map qual diff ##
|
|
382
|
|
383 my $mapqual_diff = $ref_map_qual - $var_map_qual;
|
|
384
|
|
385
|
|
386 ## Determine difference in average supporting read length ##
|
|
387
|
|
388 my $readlen_diff = $ref_avg_rl - $var_avg_rl;
|
|
389
|
|
390
|
|
391 ## Determine ref strandedness ##
|
|
392
|
|
393 if(($ref_plus + $ref_minus) > 0) {
|
|
394 $ref_strandedness = $ref_plus / ($ref_plus + $ref_minus);
|
|
395 $ref_strandedness = sprintf("%.2f", $ref_strandedness);
|
|
396 }
|
|
397
|
|
398 ## Determine var strandedness ##
|
|
399
|
|
400 if(($var_plus + $var_minus) > 0) {
|
|
401 $var_strandedness = $var_plus / ($var_plus + $var_minus);
|
|
402 $var_strandedness = sprintf("%.2f", $var_strandedness);
|
|
403 }
|
|
404
|
|
405 if($var_count && ($var_plus + $var_minus)) {
|
|
406 ## We must obtain variant read counts to proceed ##
|
|
407
|
|
408 my $var_freq = $var_count / ($ref_count + $var_count);
|
|
409
|
|
410 ## FAILURE 1: READ POSITION ##
|
|
411 if(($var_pos < $min_read_pos) || ($var_pos > $max_read_pos)) {
|
|
412 #$stats{'num_fail_pos'}++;
|
|
413 push @filter_names, $filters{'position'}->[0];
|
|
414 }
|
|
415
|
|
416 ## FAILURE 2: Variant is strand-specific but reference is NOT strand-specific ##
|
|
417 if(($var_strandedness < $min_strandedness || $var_strandedness > $max_strandedness) && ($ref_strandedness >= $min_strandedness && $ref_strandedness <= $max_strandedness)) {
|
|
418 #$stats{'num_fail_strand'}++;
|
|
419 push @filter_names, $filters{'strand_bias'}->[0];
|
|
420 }
|
|
421
|
|
422 ## FAILURE : Variant allele count does not meet minimum ##
|
|
423 if($var_count < $min_var_count) {
|
|
424 #$stats{'num_fail_varcount'}++;
|
|
425 push @filter_names, $filters{'min_var_count'}->[0];
|
|
426 }
|
|
427
|
|
428 ## FAILURE : Variant allele frequency does not meet minimum ##
|
|
429 if($var_freq < $min_var_freq) {
|
|
430 #$stats{'num_fail_varfreq'}++;
|
|
431 push @filter_names, $filters{'min_var_freq'}->[0];
|
|
432 }
|
|
433
|
|
434 ## FAILURE 3: Paralog filter for sites where variant allele mismatch-quality-sum is significantly higher than reference allele mmqs
|
|
435 if($mismatch_qualsum_diff> $max_mm_qualsum_diff) {
|
|
436 #$stats{'num_fail_mmqs'}++;
|
|
437 push @filter_names, $filters{'mmqs_diff'}->[0];
|
|
438 }
|
|
439
|
|
440 ## FAILURE 4: Mapping quality difference exceeds allowable maximum ##
|
|
441 if($mapqual_diff > $max_mapqual_diff) {
|
|
442 #$stats{'num_fail_mapqual'}++;
|
|
443 push @filter_names, $filters{'mq_diff'}->[0];
|
|
444 }
|
|
445
|
|
446 ## FAILURE 5: Read length difference exceeds allowable maximum ##
|
|
447 if($readlen_diff > $max_readlen_diff) {
|
|
448 #$stats{'num_fail_readlen'}++;
|
|
449 push @filter_names, $filters{'read_length'}->[0];
|
|
450 }
|
|
451
|
|
452 ## FAILURE 5: Read length difference exceeds allowable maximum ##
|
|
453 if($var_dist_3 < $min_var_dist_3) {
|
|
454 #$stats{'num_fail_dist3'}++;
|
|
455 push @filter_names, $filters{'dist3'}->[0];
|
|
456 }
|
|
457
|
|
458 if($max_var_mm_qualsum && $var_mmqs > $max_var_mm_qualsum) {
|
|
459 #$stats{'num_fail_var_mmqs'}++;
|
|
460 push @filter_names, $filters{'var_mmqs'}->[0];
|
|
461 }
|
|
462
|
|
463 ## SUCCESS: Pass Filter ##
|
|
464 if(@filter_names == 0) {
|
|
465 #$stats{'num_pass_filter'}++;
|
|
466 ## Print output, and append strandedness information ##
|
|
467 @filter_names = ('PASS');
|
|
468 }
|
|
469
|
|
470 }
|
|
471 else {
|
|
472 push @filter_names, $filters{'no_var_readcount'}->[0];
|
|
473 }
|
|
474 }
|
|
475 else {
|
|
476 #$stats{'num_no_readcounts'}++;
|
|
477 #print $fail_fh "$line\tno_readcounts\n";
|
|
478 push @filter_names, $filters{'incomplete_readcount'}->[0];
|
|
479 }
|
|
480 return @filter_names;
|
|
481 }
|
|
482
|
|
483 sub add_filters_to_vcf_header {
|
|
484 my ($parsed_header, @filter_refs) = @_;
|
|
485 my $column_header = pop @$parsed_header;
|
|
486 for my $filter_ref (@filter_refs) {
|
|
487 my ($filter_name, $filter_description) = @$filter_ref;
|
|
488 my $filter_line = qq{##FILTER=<ID=$filter_name,Description="$filter_description">\n};
|
|
489 push @$parsed_header, $filter_line;
|
|
490 }
|
|
491 push @$parsed_header, $column_header;
|
|
492 }
|
|
493
|
|
494 sub add_process_log_to_header {
|
|
495 my ($parsed_header, $input, @params) = @_;
|
|
496 my $column_header = pop @$parsed_header;
|
|
497 my $param_string = join(" ", @params);
|
|
498 push @$parsed_header, qq{##vcfProcessLog=<InputVCF=<$input>, InputVCFSource=<fpfilter>, InputVCFVer=<6.0>, InputVCFParam=<"$param_string"> InputVCFgeneAnno=<.>>\n}, $column_header;
|
|
499 }
|
|
500
|
|
501 sub filter_sites_in_hash {
|
|
502 my ($hash, $bam_readcount_path, $bam_file, $ref_fasta, $working_dir, $optional_param) = @_;
|
|
503 #done parsing vcf
|
|
504 $optional_param ||= '';
|
|
505 my $list_name = File::Spec->catfile($working_dir, "regions.txt");
|
|
506 my $list_fh = IO::File->new($list_name,"w") or die "Unable to open file for coordinates\n";
|
|
507 generate_region_list($hash, $list_fh);
|
|
508 $list_fh->close();
|
|
509
|
|
510 ## run bam-readcount
|
|
511 my $bam_readcount_cmd = "$bam_readcount_path -f $ref_fasta -l $list_name -w 0 -b 20 $optional_param $bam_file|";
|
|
512 my $rc_results = IO::File->new($bam_readcount_cmd) or die "Unable to open pipe to bam-readcount cmd: $bam_readcount_cmd\n";
|
|
513 while(my $rc_line = $rc_results->getline) {
|
|
514 chomp $rc_line;
|
|
515 my ($chrom, $position) = split(/\t/, $rc_line);
|
|
516 if($hash->{$chrom}{$position}) {
|
|
517 for my $ref (keys %{$hash->{$chrom}{$position}}) {
|
|
518 for my $var (keys %{$hash->{$chrom}{$position}{$ref}}) {
|
|
519 my $ref_result = read_counts_by_allele($rc_line, $ref);
|
|
520 my $var_result = read_counts_by_allele($rc_line, $var);
|
|
521 my @filters = filter_site($ref_result, $var_result);
|
|
522
|
|
523 my $vcf_line_ref = $hash->{$chrom}{$position}{$ref}{$var};
|
|
524 my @fields = split "\t", $$vcf_line_ref;
|
|
525 if($fields[6] eq '.' || $fields[6] eq 'PASS') {
|
|
526 $fields[6] = join(";", @filters);
|
|
527 }
|
|
528 else {
|
|
529 $fields[6] = join(";", $fields[6], @filters) if($filters[0] ne 'PASS');
|
|
530 }
|
|
531 $$vcf_line_ref = join("\t", @fields);
|
|
532 }
|
|
533 }
|
|
534 }
|
|
535 else {
|
|
536 die "Unknown site for rc\n";
|
|
537 }
|
|
538 }
|
|
539 unless($rc_results->close) {
|
|
540 die "Error running bam-readcount\n";
|
|
541 }
|
|
542 }
|
|
543
|
|
544 sub setup_workdir {
|
|
545 my ($reference, $bam_file, $bam_index) = @_;
|
|
546 $reference = abs_path($reference);
|
|
547 $bam_file = abs_path($bam_file);
|
|
548 $bam_index = abs_path($bam_index) if $bam_index;
|
|
549
|
|
550 my $dir = File::Temp->newdir('fpfilterXXXXX', TMPDIR => 1, CLEANUP => 1, DIR => './') or
|
|
551 die "Unable to create working directory\n";
|
|
552
|
|
553 #symlink in necessary files to run
|
|
554 my $working_reference = File::Spec->catfile($dir, "reference.fa");
|
|
555 symlink $reference, $working_reference;
|
|
556
|
|
557 my $fa_index = $reference . ".fai";
|
|
558 unless(-e $fa_index) {
|
|
559 index_fasta($working_reference);
|
|
560 }
|
|
561 else {
|
|
562 symlink $fa_index, File::Spec->catfile($dir, "reference.fa.fai");
|
|
563 }
|
|
564
|
|
565 my $working_bam = File::Spec->catfile($dir, "tumor.bam");
|
|
566 my $working_bam_index = File::Spec->catfile($dir, "tumor.bam.bai");
|
|
567 symlink $bam_file, $working_bam;
|
|
568 if($bam_index) {
|
|
569 symlink $bam_index, $working_bam_index;
|
|
570 }
|
|
571 elsif(-e $bam_file . ".bai") {
|
|
572 symlink $bam_file . ".bai", $working_bam_index;
|
|
573 }
|
|
574 else {
|
|
575 index_bam($working_bam);
|
|
576 }
|
|
577 return ($dir, $working_reference, $working_bam);
|
|
578 }
|
|
579
|
|
580 sub index_fasta {
|
|
581 my ($fasta) = @_;
|
|
582
|
|
583 print STDERR "Indexing fasta...\n";
|
|
584 my @args = ($samtools_path, "faidx", $fasta);
|
|
585 system(@args) == 0
|
|
586 or die "Unable to index $fasta: $?\n";
|
|
587 }
|
|
588
|
|
589 sub index_bam {
|
|
590 my ($bam) = @_;
|
|
591
|
|
592 print STDERR "Indexing BAM...\n";
|
|
593 my @args = ($samtools_path, "index", $bam);
|
|
594 system(@args) == 0
|
|
595 or die "Unable to index $bam: $?\n";
|
|
596 }
|