annotate fpfilter.pl @ 0:276875076be1 draft

Uploaded
author elixir-it
date Tue, 03 Jul 2018 06:05:44 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
276875076be1 Uploaded
elixir-it
parents:
diff changeset
1 #!/usr/bin/env perl
276875076be1 Uploaded
elixir-it
parents:
diff changeset
2
276875076be1 Uploaded
elixir-it
parents:
diff changeset
3 use warnings;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
4 use strict;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
5
276875076be1 Uploaded
elixir-it
parents:
diff changeset
6 use Getopt::Long;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
7 use IO::File;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
8 use File::Temp qw( tempdir );
276875076be1 Uploaded
elixir-it
parents:
diff changeset
9 use File::Spec;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
10 use Cwd qw( abs_path cwd);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
11
276875076be1 Uploaded
elixir-it
parents:
diff changeset
12 ## Define filtering parameters ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
13
276875076be1 Uploaded
elixir-it
parents:
diff changeset
14 my $min_read_pos = 0.10;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
15 my $max_read_pos = 1 - $min_read_pos;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
16 my $min_var_freq = 0.05;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
17 my $min_var_count = 4;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
18
276875076be1 Uploaded
elixir-it
parents:
diff changeset
19 my $min_strandedness = 0.01;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
20 my $max_strandedness = 1 - $min_strandedness;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
21
276875076be1 Uploaded
elixir-it
parents:
diff changeset
22 my $max_mm_qualsum_diff = 50;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
23 my $max_mapqual_diff = 30;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
24 my $max_readlen_diff = 25;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
25 my $min_var_dist_3 = 0.20;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
26 my $max_var_mm_qualsum = 100;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
27
276875076be1 Uploaded
elixir-it
parents:
diff changeset
28
276875076be1 Uploaded
elixir-it
parents:
diff changeset
29 ## Parse arguments ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
30
276875076be1 Uploaded
elixir-it
parents:
diff changeset
31 my $output_basename;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
32
276875076be1 Uploaded
elixir-it
parents:
diff changeset
33 my $vcf_file;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
34 my $output_file;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
35 my $bam_file;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
36 my $bam_index;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
37 my $sample;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
38 my $bam_readcount_path = 'bam-readcount';
276875076be1 Uploaded
elixir-it
parents:
diff changeset
39 my $samtools_path = 'samtools';
276875076be1 Uploaded
elixir-it
parents:
diff changeset
40 my $ref_fasta;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
41 my $help;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
42
276875076be1 Uploaded
elixir-it
parents:
diff changeset
43
276875076be1 Uploaded
elixir-it
parents:
diff changeset
44 my $opt_result;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
45 my @params = @ARGV;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
46
276875076be1 Uploaded
elixir-it
parents:
diff changeset
47 $opt_result = GetOptions(
276875076be1 Uploaded
elixir-it
parents:
diff changeset
48 'vcf-file=s' => \$vcf_file,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
49 'bam-file=s' => \$bam_file,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
50 'bam-index=s' => \$bam_index,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
51 'sample=s' => \$sample,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
52 'bam-readcount=s' => \$bam_readcount_path,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
53 'samtools=s' => \$samtools_path,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
54 'reference=s' => \$ref_fasta,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
55 'output=s' => \$output_file,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
56 'min-read-pos=f' => \$min_read_pos,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
57 'min-var-freq=f' => \$min_var_freq,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
58 'min-var-count=f' => \$min_var_count,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
59 'min-strandedness=f' => \$min_strandedness,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
60 'max-mm-qualsum-diff=f' => \$max_mm_qualsum_diff,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
61 'max-mapqual-diff=f' => \$max_mapqual_diff,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
62 'max-readlen-diff=f' => \$max_readlen_diff,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
63 'min-var-dist-3=f' => \$min_var_dist_3,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
64 'max-var-mm-qualsum=f' => \$max_var_mm_qualsum,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
65 'help' => \$help,
276875076be1 Uploaded
elixir-it
parents:
diff changeset
66 );
276875076be1 Uploaded
elixir-it
parents:
diff changeset
67
276875076be1 Uploaded
elixir-it
parents:
diff changeset
68 unless($opt_result) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
69 die help_text();
276875076be1 Uploaded
elixir-it
parents:
diff changeset
70 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
71
276875076be1 Uploaded
elixir-it
parents:
diff changeset
72 if($help) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
73 print STDOUT help_text();
276875076be1 Uploaded
elixir-it
parents:
diff changeset
74 exit 0;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
75 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
76
276875076be1 Uploaded
elixir-it
parents:
diff changeset
77 unless($vcf_file) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
78 warn "You must provide a file to be filtered\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
79 die help_text();
276875076be1 Uploaded
elixir-it
parents:
diff changeset
80 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
81
276875076be1 Uploaded
elixir-it
parents:
diff changeset
82 unless(-s $vcf_file) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
83 die "Can not find VCF file: $vcf_file\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
84 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
85
276875076be1 Uploaded
elixir-it
parents:
diff changeset
86 unless($ref_fasta) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
87 warn "You must provide a reference fasta for generating readcounts to use for filtering\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
88 die help_text();
276875076be1 Uploaded
elixir-it
parents:
diff changeset
89 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
90
276875076be1 Uploaded
elixir-it
parents:
diff changeset
91 unless(-s $ref_fasta) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
92 die "Can not find valid reference fasta: $ref_fasta\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
93 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
94
276875076be1 Uploaded
elixir-it
parents:
diff changeset
95 unless($bam_file) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
96 die "You must provide a BAM file for generating readcounts\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
97 die help_text();
276875076be1 Uploaded
elixir-it
parents:
diff changeset
98 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
99
276875076be1 Uploaded
elixir-it
parents:
diff changeset
100 unless(-s $bam_file) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
101 die "Can not find valid BAM file: $bam_file\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
102 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
103
276875076be1 Uploaded
elixir-it
parents:
diff changeset
104 if($bam_index && !-s $bam_index) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
105 die "Can not find valid BAM index file: $bam_index\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
106 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
107
276875076be1 Uploaded
elixir-it
parents:
diff changeset
108 unless($output_file) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
109 warn "You must provide an output file name: $output_file\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
110 die help_text();
276875076be1 Uploaded
elixir-it
parents:
diff changeset
111 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
112 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
113 $output_file = abs_path($output_file); #make sure we have full path as manipulating cwd below
276875076be1 Uploaded
elixir-it
parents:
diff changeset
114 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
115
276875076be1 Uploaded
elixir-it
parents:
diff changeset
116 unless($sample) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
117 warn "You must provide a sample name\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
118 die help_text();
276875076be1 Uploaded
elixir-it
parents:
diff changeset
119 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
120
276875076be1 Uploaded
elixir-it
parents:
diff changeset
121 my %filters;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
122 $filters{'position'} = [sprintf("PB%0.f",$min_read_pos*100), "Average position on read less than " . $min_read_pos . " or greater than " . $max_read_pos . " fraction of the read length"];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
123 $filters{'strand_bias'} = [sprintf("SB%0.f",$min_strandedness*100), "Reads supporting the variant have less than " . $min_strandedness . " fraction of the reads on one strand, but reference supporting reads are not similarly biased"];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
124 $filters{'min_var_count'} = [ "MVC".$min_var_count, "Less than " . $min_var_count . " high quality reads support the variant"];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
125 $filters{'min_var_freq'} = [ sprintf("MVF%0.f",$min_var_freq*100), "Variant allele frequency is less than " . $min_var_freq];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
126 $filters{'mmqs_diff'} = [ sprintf("MMQSD%d",$max_mm_qualsum_diff), "Difference in average mismatch quality sum between variant and reference supporting reads is greater than " . $max_mm_qualsum_diff];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
127 $filters{'mq_diff'} = [ sprintf("MQD%d",$max_mapqual_diff), "Difference in average mapping quality sum between variant and reference supporting reads is greater than " . $max_mapqual_diff];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
128 $filters{'read_length'} = [ sprintf("RLD%d",$max_readlen_diff), "Difference in average clipped read length between variant and reference supporting reads is greater than " . $max_readlen_diff];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
129 $filters{'dist3'} = [ sprintf("DETP%0.f",$min_var_dist_3*100), "Average distance of the variant base to the effective 3' end is less than " . $min_var_dist_3];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
130 $filters{'var_mmqs'} = [ sprintf("MMQS%d",$max_var_mm_qualsum), "The average mismatch quality sum of reads supporting the variant is greater than " . $max_var_mm_qualsum] if($max_var_mm_qualsum);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
131 $filters{'no_var_readcount'} = [ "NRC", "Unable to grab readcounts for variant allele"];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
132 $filters{'incomplete_readcount'} = [ "IRC", "Unable to grab any sort of readcount for either the reference or the variant allele"];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
133
276875076be1 Uploaded
elixir-it
parents:
diff changeset
134 my $vcf_header;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
135 my @vcf_lines;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
136
276875076be1 Uploaded
elixir-it
parents:
diff changeset
137 my %rc_for_snp; # store info on snp positions and VCF line
276875076be1 Uploaded
elixir-it
parents:
diff changeset
138 my %rc_for_indel; #store info on indel positions and VCF line
276875076be1 Uploaded
elixir-it
parents:
diff changeset
139
276875076be1 Uploaded
elixir-it
parents:
diff changeset
140 my ($workdir, $working_fasta, $working_bam) = setup_workdir($ref_fasta, $bam_file, $bam_index);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
141
276875076be1 Uploaded
elixir-it
parents:
diff changeset
142 my $starting_dir = cwd();
276875076be1 Uploaded
elixir-it
parents:
diff changeset
143
276875076be1 Uploaded
elixir-it
parents:
diff changeset
144 my $input = IO::File->new($vcf_file) or die "Unable to open input file $vcf_file: $!\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
145 $vcf_header = parse_vcf_header($input);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
146 add_filters_to_vcf_header($vcf_header, values %filters);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
147 add_process_log_to_header($vcf_header, $vcf_file, @params);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
148
276875076be1 Uploaded
elixir-it
parents:
diff changeset
149 my $header_line = $vcf_header->[-1];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
150 chomp $header_line;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
151 my @header_fields = split "\t", $header_line;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
152
276875076be1 Uploaded
elixir-it
parents:
diff changeset
153 while(my $entry = $input->getline) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
154 push @vcf_lines, $entry;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
155 chomp $entry;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
156
276875076be1 Uploaded
elixir-it
parents:
diff changeset
157 my %fields;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
158 @fields{@header_fields} = split("\t", $entry);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
159
276875076be1 Uploaded
elixir-it
parents:
diff changeset
160 my $filter_sample = $fields{$sample};
276875076be1 Uploaded
elixir-it
parents:
diff changeset
161 unless($filter_sample) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
162 die "Unable to find field for $sample\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
163 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
164 my @sample_fields = split /:/, $filter_sample;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
165 unless(@sample_fields) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
166 die "Unable to parse field for $sample\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
167 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
168 my $index = 0;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
169 my %format_keys = map { $_ => $sample_fields[$index++] } split /:/, $fields{FORMAT};
276875076be1 Uploaded
elixir-it
parents:
diff changeset
170 #these are in order ACGT
276875076be1 Uploaded
elixir-it
parents:
diff changeset
171 my @alleles = ($fields{REF}, split /,/, $fields{ALT});
276875076be1 Uploaded
elixir-it
parents:
diff changeset
172 my %gt_alleles = map {$_ => 1} grep { $_ > 0 } split /\//, $format_keys{GT};
276875076be1 Uploaded
elixir-it
parents:
diff changeset
173 my @used_alleles;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
174 for my $allele_index (keys %gt_alleles) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
175 push @used_alleles, $alleles[$allele_index];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
176 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
177 my ($var) = sort @used_alleles; #follow existing convention of fp filter using alphabetical order to choose a single base on triallelic sites
276875076be1 Uploaded
elixir-it
parents:
diff changeset
178 $var = q{} unless defined $var; #in the case where there is no variant allele, set this to the empty string. Later it will be filtered as NRC or IRC
276875076be1 Uploaded
elixir-it
parents:
diff changeset
179 $var = uc($var);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
180 my $ref = uc($fields{REF});
276875076be1 Uploaded
elixir-it
parents:
diff changeset
181 my $chrom = $fields{'#CHROM'};
276875076be1 Uploaded
elixir-it
parents:
diff changeset
182 my $pos = $fields{POS};
276875076be1 Uploaded
elixir-it
parents:
diff changeset
183
276875076be1 Uploaded
elixir-it
parents:
diff changeset
184 if(length($ref) > 1 || length($var) > 1) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
185 #it's an indel or mnp
276875076be1 Uploaded
elixir-it
parents:
diff changeset
186 if(length($ref) == length($var)) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
187 die "MNPs unsupported\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
188 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
189 elsif(length($ref) > length($var)) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
190 #it's a deletion
276875076be1 Uploaded
elixir-it
parents:
diff changeset
191 $pos += 1;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
192 ($ref, $var) = ($var, $ref);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
193 $ref = substr($var, 1, 1);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
194 $var = "-" . substr($var, 1);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
195 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
196 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
197 #it's an insertion
276875076be1 Uploaded
elixir-it
parents:
diff changeset
198 substr($var, 0, 1, "+");
276875076be1 Uploaded
elixir-it
parents:
diff changeset
199 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
200 $rc_for_indel{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
201 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
202 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
203 #it's a SNP
276875076be1 Uploaded
elixir-it
parents:
diff changeset
204 $rc_for_snp{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
205 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
206 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
207
276875076be1 Uploaded
elixir-it
parents:
diff changeset
208 if(%rc_for_snp) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
209 filter_sites_in_hash(\%rc_for_snp, $bam_readcount_path, $working_bam, $working_fasta, $workdir);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
210 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
211 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
212 print STDERR "No SNP sites identified\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
213 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
214
276875076be1 Uploaded
elixir-it
parents:
diff changeset
215 if(%rc_for_indel) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
216 filter_sites_in_hash(\%rc_for_indel, $bam_readcount_path, $working_bam, $working_fasta, $workdir, '-i');
276875076be1 Uploaded
elixir-it
parents:
diff changeset
217 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
218 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
219 print STDERR "No Indel sites identified\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
220 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
221
276875076be1 Uploaded
elixir-it
parents:
diff changeset
222 ## Open the output files ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
223 my $filtered_vcf = IO::File->new("$output_file","w") or die "Can't open output file $output_file: $!\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
224 print $filtered_vcf @$vcf_header;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
225 print $filtered_vcf @vcf_lines;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
226
276875076be1 Uploaded
elixir-it
parents:
diff changeset
227 chdir $starting_dir or die "Unable to go back to starting dir\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
228 exit(0);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
229
276875076be1 Uploaded
elixir-it
parents:
diff changeset
230 ################################################################################
276875076be1 Uploaded
elixir-it
parents:
diff changeset
231
276875076be1 Uploaded
elixir-it
parents:
diff changeset
232 =head3 read_counts_by_allele
276875076be1 Uploaded
elixir-it
parents:
diff changeset
233
276875076be1 Uploaded
elixir-it
parents:
diff changeset
234 Retrieve relevant read counts for a certain allele
276875076be1 Uploaded
elixir-it
parents:
diff changeset
235
276875076be1 Uploaded
elixir-it
parents:
diff changeset
236
276875076be1 Uploaded
elixir-it
parents:
diff changeset
237 =cut
276875076be1 Uploaded
elixir-it
parents:
diff changeset
238
276875076be1 Uploaded
elixir-it
parents:
diff changeset
239 sub read_counts_by_allele {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
240 (my $line, my $allele) = @_;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
241
276875076be1 Uploaded
elixir-it
parents:
diff changeset
242 my @lineContents = split(/\t/, $line);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
243 my $numContents = @lineContents;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
244
276875076be1 Uploaded
elixir-it
parents:
diff changeset
245 for(my $colCounter = 5; $colCounter < $numContents; $colCounter++) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
246 my $this_allele = $lineContents[$colCounter];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
247 my @alleleContents = split(/\:/, $this_allele);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
248 if($alleleContents[0] eq $allele) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
249 my $numAlleleContents = @alleleContents;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
250
276875076be1 Uploaded
elixir-it
parents:
diff changeset
251 return("") if($numAlleleContents < 8);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
252
276875076be1 Uploaded
elixir-it
parents:
diff changeset
253 my $return_string = "";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
254 my $return_sum = 0;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
255 for(my $printCounter = 1; $printCounter < $numAlleleContents; $printCounter++) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
256 $return_sum += $alleleContents[$printCounter];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
257 $return_string .= "\t" if($return_string);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
258 $return_string .= $alleleContents[$printCounter];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
259 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
260
276875076be1 Uploaded
elixir-it
parents:
diff changeset
261 return($return_string);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
262
276875076be1 Uploaded
elixir-it
parents:
diff changeset
263 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
264 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
265
276875076be1 Uploaded
elixir-it
parents:
diff changeset
266 return("");
276875076be1 Uploaded
elixir-it
parents:
diff changeset
267 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
268
276875076be1 Uploaded
elixir-it
parents:
diff changeset
269
276875076be1 Uploaded
elixir-it
parents:
diff changeset
270 sub help_text {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
271 return <<HELP;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
272 fpfilter - Filtering for Illumina Sequencing
276875076be1 Uploaded
elixir-it
parents:
diff changeset
273
276875076be1 Uploaded
elixir-it
parents:
diff changeset
274 SYNOPSIS
276875076be1 Uploaded
elixir-it
parents:
diff changeset
275 fpfilter [options] [file ...]
276875076be1 Uploaded
elixir-it
parents:
diff changeset
276
276875076be1 Uploaded
elixir-it
parents:
diff changeset
277 OPTIONS
276875076be1 Uploaded
elixir-it
parents:
diff changeset
278 --vcf-file the input VCF file. Must have a GT field.
276875076be1 Uploaded
elixir-it
parents:
diff changeset
279 --bam-file the BAM file of the sample you are filtering on. Typically the tumor.
276875076be1 Uploaded
elixir-it
parents:
diff changeset
280 --sample the sample name of the sample you want to filter on in the VCF file.
276875076be1 Uploaded
elixir-it
parents:
diff changeset
281 --reference a fasta containing the reference sequence the BAM file was aligned to.
276875076be1 Uploaded
elixir-it
parents:
diff changeset
282 --output the filename of the output VCF file
276875076be1 Uploaded
elixir-it
parents:
diff changeset
283 --min-read-pos minimum average relative distance from start/end of read
276875076be1 Uploaded
elixir-it
parents:
diff changeset
284 --min-var-freq minimum variant allele frequency
276875076be1 Uploaded
elixir-it
parents:
diff changeset
285 --min-var-count minimum number of variant-supporting reads
276875076be1 Uploaded
elixir-it
parents:
diff changeset
286 --min-strandedness minimum representation of variant allele on each strand
276875076be1 Uploaded
elixir-it
parents:
diff changeset
287 --max-mm-qualsum-diff maximum difference of mismatch quality sum between variant and reference reads (paralog filter)
276875076be1 Uploaded
elixir-it
parents:
diff changeset
288 --max_var_mm_qualsum maximum mismatch quality sum of reference-supporting reads
276875076be1 Uploaded
elixir-it
parents:
diff changeset
289 --max-mapqual-diff maximum difference of mapping quality between variant and reference reads
276875076be1 Uploaded
elixir-it
parents:
diff changeset
290 --max-readlen-diff maximum difference of average supporting read length between variant and reference reads (paralog filter)
276875076be1 Uploaded
elixir-it
parents:
diff changeset
291 --min-var-dist-3 minimum average distance to effective 3prime end of read (real end or Q2) for variant-supporting reads
276875076be1 Uploaded
elixir-it
parents:
diff changeset
292 --help this message
276875076be1 Uploaded
elixir-it
parents:
diff changeset
293
276875076be1 Uploaded
elixir-it
parents:
diff changeset
294 DESCRIPTION
276875076be1 Uploaded
elixir-it
parents:
diff changeset
295 This program will filter a VCF with a variety of filters as detailed in the VarScan2 paper (http://www.ncbi.nlm.nih.gov/pubmed/22300766). It requires the bam-readcount utility (https://github.com/genome/bam-readcount).
276875076be1 Uploaded
elixir-it
parents:
diff changeset
296
276875076be1 Uploaded
elixir-it
parents:
diff changeset
297 This filter was calibrated on 100bp PE Illumina reads. It is likely to be overly stringent for longer reads and may be less effective on shorter reads.
276875076be1 Uploaded
elixir-it
parents:
diff changeset
298
276875076be1 Uploaded
elixir-it
parents:
diff changeset
299 AUTHORS
276875076be1 Uploaded
elixir-it
parents:
diff changeset
300 Dan Koboldt Original code
276875076be1 Uploaded
elixir-it
parents:
diff changeset
301 Dave Larson Modifications for VCF and exportation.
276875076be1 Uploaded
elixir-it
parents:
diff changeset
302
276875076be1 Uploaded
elixir-it
parents:
diff changeset
303 HELP
276875076be1 Uploaded
elixir-it
parents:
diff changeset
304 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
305
276875076be1 Uploaded
elixir-it
parents:
diff changeset
306 ### methods copied from elsewhere begin here...
276875076be1 Uploaded
elixir-it
parents:
diff changeset
307 sub parse_vcf_header {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
308 my $input_fh = shift;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
309
276875076be1 Uploaded
elixir-it
parents:
diff changeset
310 my @header;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
311 my $header_end = 0;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
312 while (!$header_end) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
313 my $line = $input_fh->getline;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
314 if ($line =~ m/^##/) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
315 push @header, $line;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
316 } elsif ($line =~ m/^#/) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
317 push @header, $line;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
318 $header_end = 1;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
319 } else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
320 die "Missed the final header line with the sample list? Last line examined: $line Header so far: " . join("\n", @header) . "\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
321 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
322 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
323 return \@header;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
324 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
325
276875076be1 Uploaded
elixir-it
parents:
diff changeset
326 sub generate_region_list {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
327 my ($hash, $region_fh) = @_; #input_fh should be a filehandle to the VCF
276875076be1 Uploaded
elixir-it
parents:
diff changeset
328 print STDERR "Printing variants to temporary region_list file...\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
329 for my $chr (keys %$hash) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
330 for my $pos (sort { $a <=> $b } keys %{$hash->{$chr}}) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
331 print $region_fh "$chr\t$pos\t$pos\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
332 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
333 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
334 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
335
276875076be1 Uploaded
elixir-it
parents:
diff changeset
336 sub _simplify_indel_allele {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
337 my ($ref, $var) = @_;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
338 #these could be padded e.g. G, GT for a T insertion or GCC G for a 2bp deletion
276875076be1 Uploaded
elixir-it
parents:
diff changeset
339 #they could also be complex e.g. GCCCGT, GCGT for a 2bp deletion
276875076be1 Uploaded
elixir-it
parents:
diff changeset
340 #they could also represent an insertion deletion event e.g. GCCCGT GCGGGGT; these cannot be represented in genome bed. Throw an error or warn.
276875076be1 Uploaded
elixir-it
parents:
diff changeset
341 #
276875076be1 Uploaded
elixir-it
parents:
diff changeset
342 #I think the algorithm should be trim end (no updating of coords required)
276875076be1 Uploaded
elixir-it
parents:
diff changeset
343 #trim beginning and return number of bases trimmed
276875076be1 Uploaded
elixir-it
parents:
diff changeset
344
276875076be1 Uploaded
elixir-it
parents:
diff changeset
345 my @ref_array = map { uc } split //, $ref;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
346 my @var_array = map { uc } split //, $var;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
347
276875076be1 Uploaded
elixir-it
parents:
diff changeset
348 while(@ref_array and @var_array and $ref_array[-1] eq $var_array[-1]) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
349 pop @ref_array;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
350 pop @var_array;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
351 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
352
276875076be1 Uploaded
elixir-it
parents:
diff changeset
353 my $right_shift = 0;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
354 while(@ref_array and @var_array and $ref_array[0] eq $var_array[0]) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
355 shift @ref_array;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
356 shift @var_array;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
357 $right_shift++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
358 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
359
276875076be1 Uploaded
elixir-it
parents:
diff changeset
360 return (join("",@ref_array), join("",@var_array), $right_shift);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
361 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
362
276875076be1 Uploaded
elixir-it
parents:
diff changeset
363 sub filter_site {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
364 my ($ref_result, $var_result) = @_;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
365 #this will return a list of filter names
276875076be1 Uploaded
elixir-it
parents:
diff changeset
366 my @filter_names;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
367 if($ref_result && $var_result) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
368 ## Parse out the bam-readcounts details for each allele. The fields should be: ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
369 #num_reads : avg_mapqual : avg_basequal : avg_semq : reads_plus : reads_minus : avg_clip_read_pos : avg_mmqs : reads_q2 : avg_dist_to_q2 : avgRLclipped : avg_eff_3'_dist
276875076be1 Uploaded
elixir-it
parents:
diff changeset
370 my ($ref_count, $ref_map_qual, $ref_base_qual, $ref_semq, $ref_plus, $ref_minus, $ref_pos, $ref_subs, $ref_mmqs, $ref_q2_reads, $ref_q2_dist, $ref_avg_rl, $ref_dist_3) = split(/\t/, $ref_result);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
371 my ($var_count, $var_map_qual, $var_base_qual, $var_semq, $var_plus, $var_minus, $var_pos, $var_subs, $var_mmqs, $var_q2_reads, $var_q2_dist, $var_avg_rl, $var_dist_3) = split(/\t/, $var_result);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
372
276875076be1 Uploaded
elixir-it
parents:
diff changeset
373 my $ref_strandedness = my $var_strandedness = 0.50;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
374 $ref_dist_3 = 0.5 if(!$ref_dist_3);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
375
276875076be1 Uploaded
elixir-it
parents:
diff changeset
376 ## Use conservative defaults if we can't get mismatch quality sums ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
377 $ref_mmqs = 50 if(!$ref_mmqs);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
378 $var_mmqs = 0 if(!$var_mmqs);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
379 my $mismatch_qualsum_diff = $var_mmqs - $ref_mmqs;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
380
276875076be1 Uploaded
elixir-it
parents:
diff changeset
381 ## Determine map qual diff ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
382
276875076be1 Uploaded
elixir-it
parents:
diff changeset
383 my $mapqual_diff = $ref_map_qual - $var_map_qual;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
384
276875076be1 Uploaded
elixir-it
parents:
diff changeset
385
276875076be1 Uploaded
elixir-it
parents:
diff changeset
386 ## Determine difference in average supporting read length ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
387
276875076be1 Uploaded
elixir-it
parents:
diff changeset
388 my $readlen_diff = $ref_avg_rl - $var_avg_rl;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
389
276875076be1 Uploaded
elixir-it
parents:
diff changeset
390
276875076be1 Uploaded
elixir-it
parents:
diff changeset
391 ## Determine ref strandedness ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
392
276875076be1 Uploaded
elixir-it
parents:
diff changeset
393 if(($ref_plus + $ref_minus) > 0) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
394 $ref_strandedness = $ref_plus / ($ref_plus + $ref_minus);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
395 $ref_strandedness = sprintf("%.2f", $ref_strandedness);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
396 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
397
276875076be1 Uploaded
elixir-it
parents:
diff changeset
398 ## Determine var strandedness ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
399
276875076be1 Uploaded
elixir-it
parents:
diff changeset
400 if(($var_plus + $var_minus) > 0) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
401 $var_strandedness = $var_plus / ($var_plus + $var_minus);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
402 $var_strandedness = sprintf("%.2f", $var_strandedness);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
403 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
404
276875076be1 Uploaded
elixir-it
parents:
diff changeset
405 if($var_count && ($var_plus + $var_minus)) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
406 ## We must obtain variant read counts to proceed ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
407
276875076be1 Uploaded
elixir-it
parents:
diff changeset
408 my $var_freq = $var_count / ($ref_count + $var_count);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
409
276875076be1 Uploaded
elixir-it
parents:
diff changeset
410 ## FAILURE 1: READ POSITION ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
411 if(($var_pos < $min_read_pos) || ($var_pos > $max_read_pos)) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
412 #$stats{'num_fail_pos'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
413 push @filter_names, $filters{'position'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
414 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
415
276875076be1 Uploaded
elixir-it
parents:
diff changeset
416 ## FAILURE 2: Variant is strand-specific but reference is NOT strand-specific ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
417 if(($var_strandedness < $min_strandedness || $var_strandedness > $max_strandedness) && ($ref_strandedness >= $min_strandedness && $ref_strandedness <= $max_strandedness)) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
418 #$stats{'num_fail_strand'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
419 push @filter_names, $filters{'strand_bias'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
420 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
421
276875076be1 Uploaded
elixir-it
parents:
diff changeset
422 ## FAILURE : Variant allele count does not meet minimum ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
423 if($var_count < $min_var_count) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
424 #$stats{'num_fail_varcount'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
425 push @filter_names, $filters{'min_var_count'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
426 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
427
276875076be1 Uploaded
elixir-it
parents:
diff changeset
428 ## FAILURE : Variant allele frequency does not meet minimum ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
429 if($var_freq < $min_var_freq) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
430 #$stats{'num_fail_varfreq'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
431 push @filter_names, $filters{'min_var_freq'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
432 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
433
276875076be1 Uploaded
elixir-it
parents:
diff changeset
434 ## FAILURE 3: Paralog filter for sites where variant allele mismatch-quality-sum is significantly higher than reference allele mmqs
276875076be1 Uploaded
elixir-it
parents:
diff changeset
435 if($mismatch_qualsum_diff> $max_mm_qualsum_diff) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
436 #$stats{'num_fail_mmqs'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
437 push @filter_names, $filters{'mmqs_diff'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
438 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
439
276875076be1 Uploaded
elixir-it
parents:
diff changeset
440 ## FAILURE 4: Mapping quality difference exceeds allowable maximum ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
441 if($mapqual_diff > $max_mapqual_diff) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
442 #$stats{'num_fail_mapqual'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
443 push @filter_names, $filters{'mq_diff'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
444 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
445
276875076be1 Uploaded
elixir-it
parents:
diff changeset
446 ## FAILURE 5: Read length difference exceeds allowable maximum ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
447 if($readlen_diff > $max_readlen_diff) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
448 #$stats{'num_fail_readlen'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
449 push @filter_names, $filters{'read_length'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
450 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
451
276875076be1 Uploaded
elixir-it
parents:
diff changeset
452 ## FAILURE 5: Read length difference exceeds allowable maximum ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
453 if($var_dist_3 < $min_var_dist_3) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
454 #$stats{'num_fail_dist3'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
455 push @filter_names, $filters{'dist3'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
456 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
457
276875076be1 Uploaded
elixir-it
parents:
diff changeset
458 if($max_var_mm_qualsum && $var_mmqs > $max_var_mm_qualsum) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
459 #$stats{'num_fail_var_mmqs'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
460 push @filter_names, $filters{'var_mmqs'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
461 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
462
276875076be1 Uploaded
elixir-it
parents:
diff changeset
463 ## SUCCESS: Pass Filter ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
464 if(@filter_names == 0) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
465 #$stats{'num_pass_filter'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
466 ## Print output, and append strandedness information ##
276875076be1 Uploaded
elixir-it
parents:
diff changeset
467 @filter_names = ('PASS');
276875076be1 Uploaded
elixir-it
parents:
diff changeset
468 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
469
276875076be1 Uploaded
elixir-it
parents:
diff changeset
470 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
471 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
472 push @filter_names, $filters{'no_var_readcount'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
473 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
474 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
475 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
476 #$stats{'num_no_readcounts'}++;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
477 #print $fail_fh "$line\tno_readcounts\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
478 push @filter_names, $filters{'incomplete_readcount'}->[0];
276875076be1 Uploaded
elixir-it
parents:
diff changeset
479 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
480 return @filter_names;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
481 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
482
276875076be1 Uploaded
elixir-it
parents:
diff changeset
483 sub add_filters_to_vcf_header {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
484 my ($parsed_header, @filter_refs) = @_;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
485 my $column_header = pop @$parsed_header;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
486 for my $filter_ref (@filter_refs) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
487 my ($filter_name, $filter_description) = @$filter_ref;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
488 my $filter_line = qq{##FILTER=<ID=$filter_name,Description="$filter_description">\n};
276875076be1 Uploaded
elixir-it
parents:
diff changeset
489 push @$parsed_header, $filter_line;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
490 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
491 push @$parsed_header, $column_header;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
492 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
493
276875076be1 Uploaded
elixir-it
parents:
diff changeset
494 sub add_process_log_to_header {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
495 my ($parsed_header, $input, @params) = @_;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
496 my $column_header = pop @$parsed_header;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
497 my $param_string = join(" ", @params);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
498 push @$parsed_header, qq{##vcfProcessLog=<InputVCF=<$input>, InputVCFSource=<fpfilter>, InputVCFVer=<6.0>, InputVCFParam=<"$param_string"> InputVCFgeneAnno=<.>>\n}, $column_header;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
499 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
500
276875076be1 Uploaded
elixir-it
parents:
diff changeset
501 sub filter_sites_in_hash {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
502 my ($hash, $bam_readcount_path, $bam_file, $ref_fasta, $working_dir, $optional_param) = @_;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
503 #done parsing vcf
276875076be1 Uploaded
elixir-it
parents:
diff changeset
504 $optional_param ||= '';
276875076be1 Uploaded
elixir-it
parents:
diff changeset
505 my $list_name = File::Spec->catfile($working_dir, "regions.txt");
276875076be1 Uploaded
elixir-it
parents:
diff changeset
506 my $list_fh = IO::File->new($list_name,"w") or die "Unable to open file for coordinates\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
507 generate_region_list($hash, $list_fh);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
508 $list_fh->close();
276875076be1 Uploaded
elixir-it
parents:
diff changeset
509
276875076be1 Uploaded
elixir-it
parents:
diff changeset
510 ## run bam-readcount
276875076be1 Uploaded
elixir-it
parents:
diff changeset
511 my $bam_readcount_cmd = "$bam_readcount_path -f $ref_fasta -l $list_name -w 0 -b 20 $optional_param $bam_file|";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
512 my $rc_results = IO::File->new($bam_readcount_cmd) or die "Unable to open pipe to bam-readcount cmd: $bam_readcount_cmd\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
513 while(my $rc_line = $rc_results->getline) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
514 chomp $rc_line;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
515 my ($chrom, $position) = split(/\t/, $rc_line);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
516 if($hash->{$chrom}{$position}) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
517 for my $ref (keys %{$hash->{$chrom}{$position}}) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
518 for my $var (keys %{$hash->{$chrom}{$position}{$ref}}) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
519 my $ref_result = read_counts_by_allele($rc_line, $ref);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
520 my $var_result = read_counts_by_allele($rc_line, $var);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
521 my @filters = filter_site($ref_result, $var_result);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
522
276875076be1 Uploaded
elixir-it
parents:
diff changeset
523 my $vcf_line_ref = $hash->{$chrom}{$position}{$ref}{$var};
276875076be1 Uploaded
elixir-it
parents:
diff changeset
524 my @fields = split "\t", $$vcf_line_ref;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
525 if($fields[6] eq '.' || $fields[6] eq 'PASS') {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
526 $fields[6] = join(";", @filters);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
527 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
528 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
529 $fields[6] = join(";", $fields[6], @filters) if($filters[0] ne 'PASS');
276875076be1 Uploaded
elixir-it
parents:
diff changeset
530 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
531 $$vcf_line_ref = join("\t", @fields);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
532 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
533 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
534 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
535 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
536 die "Unknown site for rc\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
537 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
538 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
539 unless($rc_results->close) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
540 die "Error running bam-readcount\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
541 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
542 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
543
276875076be1 Uploaded
elixir-it
parents:
diff changeset
544 sub setup_workdir {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
545 my ($reference, $bam_file, $bam_index) = @_;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
546 $reference = abs_path($reference);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
547 $bam_file = abs_path($bam_file);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
548 $bam_index = abs_path($bam_index) if $bam_index;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
549
276875076be1 Uploaded
elixir-it
parents:
diff changeset
550 my $dir = File::Temp->newdir('fpfilterXXXXX', TMPDIR => 1, CLEANUP => 1, DIR => './') or
276875076be1 Uploaded
elixir-it
parents:
diff changeset
551 die "Unable to create working directory\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
552
276875076be1 Uploaded
elixir-it
parents:
diff changeset
553 #symlink in necessary files to run
276875076be1 Uploaded
elixir-it
parents:
diff changeset
554 my $working_reference = File::Spec->catfile($dir, "reference.fa");
276875076be1 Uploaded
elixir-it
parents:
diff changeset
555 symlink $reference, $working_reference;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
556
276875076be1 Uploaded
elixir-it
parents:
diff changeset
557 my $fa_index = $reference . ".fai";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
558 unless(-e $fa_index) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
559 index_fasta($working_reference);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
560 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
561 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
562 symlink $fa_index, File::Spec->catfile($dir, "reference.fa.fai");
276875076be1 Uploaded
elixir-it
parents:
diff changeset
563 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
564
276875076be1 Uploaded
elixir-it
parents:
diff changeset
565 my $working_bam = File::Spec->catfile($dir, "tumor.bam");
276875076be1 Uploaded
elixir-it
parents:
diff changeset
566 my $working_bam_index = File::Spec->catfile($dir, "tumor.bam.bai");
276875076be1 Uploaded
elixir-it
parents:
diff changeset
567 symlink $bam_file, $working_bam;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
568 if($bam_index) {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
569 symlink $bam_index, $working_bam_index;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
570 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
571 elsif(-e $bam_file . ".bai") {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
572 symlink $bam_file . ".bai", $working_bam_index;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
573 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
574 else {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
575 index_bam($working_bam);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
576 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
577 return ($dir, $working_reference, $working_bam);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
578 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
579
276875076be1 Uploaded
elixir-it
parents:
diff changeset
580 sub index_fasta {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
581 my ($fasta) = @_;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
582
276875076be1 Uploaded
elixir-it
parents:
diff changeset
583 print STDERR "Indexing fasta...\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
584 my @args = ($samtools_path, "faidx", $fasta);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
585 system(@args) == 0
276875076be1 Uploaded
elixir-it
parents:
diff changeset
586 or die "Unable to index $fasta: $?\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
587 }
276875076be1 Uploaded
elixir-it
parents:
diff changeset
588
276875076be1 Uploaded
elixir-it
parents:
diff changeset
589 sub index_bam {
276875076be1 Uploaded
elixir-it
parents:
diff changeset
590 my ($bam) = @_;
276875076be1 Uploaded
elixir-it
parents:
diff changeset
591
276875076be1 Uploaded
elixir-it
parents:
diff changeset
592 print STDERR "Indexing BAM...\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
593 my @args = ($samtools_path, "index", $bam);
276875076be1 Uploaded
elixir-it
parents:
diff changeset
594 system(@args) == 0
276875076be1 Uploaded
elixir-it
parents:
diff changeset
595 or die "Unable to index $bam: $?\n";
276875076be1 Uploaded
elixir-it
parents:
diff changeset
596 }