Mercurial > repos > elixir-it > fpfilter
view fpfilter.pl @ 0:276875076be1 draft
Uploaded
author | elixir-it |
---|---|
date | Tue, 03 Jul 2018 06:05:44 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use warnings; use strict; use Getopt::Long; use IO::File; use File::Temp qw( tempdir ); use File::Spec; use Cwd qw( abs_path cwd); ## Define filtering parameters ## my $min_read_pos = 0.10; my $max_read_pos = 1 - $min_read_pos; my $min_var_freq = 0.05; my $min_var_count = 4; my $min_strandedness = 0.01; my $max_strandedness = 1 - $min_strandedness; my $max_mm_qualsum_diff = 50; my $max_mapqual_diff = 30; my $max_readlen_diff = 25; my $min_var_dist_3 = 0.20; my $max_var_mm_qualsum = 100; ## Parse arguments ## my $output_basename; my $vcf_file; my $output_file; my $bam_file; my $bam_index; my $sample; my $bam_readcount_path = 'bam-readcount'; my $samtools_path = 'samtools'; my $ref_fasta; my $help; my $opt_result; my @params = @ARGV; $opt_result = GetOptions( 'vcf-file=s' => \$vcf_file, 'bam-file=s' => \$bam_file, 'bam-index=s' => \$bam_index, 'sample=s' => \$sample, 'bam-readcount=s' => \$bam_readcount_path, 'samtools=s' => \$samtools_path, 'reference=s' => \$ref_fasta, 'output=s' => \$output_file, 'min-read-pos=f' => \$min_read_pos, 'min-var-freq=f' => \$min_var_freq, 'min-var-count=f' => \$min_var_count, 'min-strandedness=f' => \$min_strandedness, 'max-mm-qualsum-diff=f' => \$max_mm_qualsum_diff, 'max-mapqual-diff=f' => \$max_mapqual_diff, 'max-readlen-diff=f' => \$max_readlen_diff, 'min-var-dist-3=f' => \$min_var_dist_3, 'max-var-mm-qualsum=f' => \$max_var_mm_qualsum, 'help' => \$help, ); unless($opt_result) { die help_text(); } if($help) { print STDOUT help_text(); exit 0; } unless($vcf_file) { warn "You must provide a file to be filtered\n"; die help_text(); } unless(-s $vcf_file) { die "Can not find VCF file: $vcf_file\n"; } unless($ref_fasta) { warn "You must provide a reference fasta for generating readcounts to use for filtering\n"; die help_text(); } unless(-s $ref_fasta) { die "Can not find valid reference fasta: $ref_fasta\n"; } unless($bam_file) { die "You must provide a BAM file for generating readcounts\n"; die help_text(); } unless(-s $bam_file) { die "Can not find valid BAM file: $bam_file\n"; } if($bam_index && !-s $bam_index) { die "Can not find valid BAM index file: $bam_index\n"; } unless($output_file) { warn "You must provide an output file name: $output_file\n"; die help_text(); } else { $output_file = abs_path($output_file); #make sure we have full path as manipulating cwd below } unless($sample) { warn "You must provide a sample name\n"; die help_text(); } my %filters; $filters{'position'} = [sprintf("PB%0.f",$min_read_pos*100), "Average position on read less than " . $min_read_pos . " or greater than " . $max_read_pos . " fraction of the read length"]; $filters{'strand_bias'} = [sprintf("SB%0.f",$min_strandedness*100), "Reads supporting the variant have less than " . $min_strandedness . " fraction of the reads on one strand, but reference supporting reads are not similarly biased"]; $filters{'min_var_count'} = [ "MVC".$min_var_count, "Less than " . $min_var_count . " high quality reads support the variant"]; $filters{'min_var_freq'} = [ sprintf("MVF%0.f",$min_var_freq*100), "Variant allele frequency is less than " . $min_var_freq]; $filters{'mmqs_diff'} = [ sprintf("MMQSD%d",$max_mm_qualsum_diff), "Difference in average mismatch quality sum between variant and reference supporting reads is greater than " . $max_mm_qualsum_diff]; $filters{'mq_diff'} = [ sprintf("MQD%d",$max_mapqual_diff), "Difference in average mapping quality sum between variant and reference supporting reads is greater than " . $max_mapqual_diff]; $filters{'read_length'} = [ sprintf("RLD%d",$max_readlen_diff), "Difference in average clipped read length between variant and reference supporting reads is greater than " . $max_readlen_diff]; $filters{'dist3'} = [ sprintf("DETP%0.f",$min_var_dist_3*100), "Average distance of the variant base to the effective 3' end is less than " . $min_var_dist_3]; $filters{'var_mmqs'} = [ sprintf("MMQS%d",$max_var_mm_qualsum), "The average mismatch quality sum of reads supporting the variant is greater than " . $max_var_mm_qualsum] if($max_var_mm_qualsum); $filters{'no_var_readcount'} = [ "NRC", "Unable to grab readcounts for variant allele"]; $filters{'incomplete_readcount'} = [ "IRC", "Unable to grab any sort of readcount for either the reference or the variant allele"]; my $vcf_header; my @vcf_lines; my %rc_for_snp; # store info on snp positions and VCF line my %rc_for_indel; #store info on indel positions and VCF line my ($workdir, $working_fasta, $working_bam) = setup_workdir($ref_fasta, $bam_file, $bam_index); my $starting_dir = cwd(); my $input = IO::File->new($vcf_file) or die "Unable to open input file $vcf_file: $!\n"; $vcf_header = parse_vcf_header($input); add_filters_to_vcf_header($vcf_header, values %filters); add_process_log_to_header($vcf_header, $vcf_file, @params); my $header_line = $vcf_header->[-1]; chomp $header_line; my @header_fields = split "\t", $header_line; while(my $entry = $input->getline) { push @vcf_lines, $entry; chomp $entry; my %fields; @fields{@header_fields} = split("\t", $entry); my $filter_sample = $fields{$sample}; unless($filter_sample) { die "Unable to find field for $sample\n"; } my @sample_fields = split /:/, $filter_sample; unless(@sample_fields) { die "Unable to parse field for $sample\n"; } my $index = 0; my %format_keys = map { $_ => $sample_fields[$index++] } split /:/, $fields{FORMAT}; #these are in order ACGT my @alleles = ($fields{REF}, split /,/, $fields{ALT}); my %gt_alleles = map {$_ => 1} grep { $_ > 0 } split /\//, $format_keys{GT}; my @used_alleles; for my $allele_index (keys %gt_alleles) { push @used_alleles, $alleles[$allele_index]; } my ($var) = sort @used_alleles; #follow existing convention of fp filter using alphabetical order to choose a single base on triallelic sites $var = q{} unless defined $var; #in the case where there is no variant allele, set this to the empty string. Later it will be filtered as NRC or IRC $var = uc($var); my $ref = uc($fields{REF}); my $chrom = $fields{'#CHROM'}; my $pos = $fields{POS}; if(length($ref) > 1 || length($var) > 1) { #it's an indel or mnp if(length($ref) == length($var)) { die "MNPs unsupported\n"; } elsif(length($ref) > length($var)) { #it's a deletion $pos += 1; ($ref, $var) = ($var, $ref); $ref = substr($var, 1, 1); $var = "-" . substr($var, 1); } else { #it's an insertion substr($var, 0, 1, "+"); } $rc_for_indel{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1]; } else { #it's a SNP $rc_for_snp{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1]; } } if(%rc_for_snp) { filter_sites_in_hash(\%rc_for_snp, $bam_readcount_path, $working_bam, $working_fasta, $workdir); } else { print STDERR "No SNP sites identified\n"; } if(%rc_for_indel) { filter_sites_in_hash(\%rc_for_indel, $bam_readcount_path, $working_bam, $working_fasta, $workdir, '-i'); } else { print STDERR "No Indel sites identified\n"; } ## Open the output files ## my $filtered_vcf = IO::File->new("$output_file","w") or die "Can't open output file $output_file: $!\n"; print $filtered_vcf @$vcf_header; print $filtered_vcf @vcf_lines; chdir $starting_dir or die "Unable to go back to starting dir\n"; exit(0); ################################################################################ =head3 read_counts_by_allele Retrieve relevant read counts for a certain allele =cut sub read_counts_by_allele { (my $line, my $allele) = @_; my @lineContents = split(/\t/, $line); my $numContents = @lineContents; for(my $colCounter = 5; $colCounter < $numContents; $colCounter++) { my $this_allele = $lineContents[$colCounter]; my @alleleContents = split(/\:/, $this_allele); if($alleleContents[0] eq $allele) { my $numAlleleContents = @alleleContents; return("") if($numAlleleContents < 8); my $return_string = ""; my $return_sum = 0; for(my $printCounter = 1; $printCounter < $numAlleleContents; $printCounter++) { $return_sum += $alleleContents[$printCounter]; $return_string .= "\t" if($return_string); $return_string .= $alleleContents[$printCounter]; } return($return_string); } } return(""); } sub help_text { return <<HELP; fpfilter - Filtering for Illumina Sequencing SYNOPSIS fpfilter [options] [file ...] OPTIONS --vcf-file the input VCF file. Must have a GT field. --bam-file the BAM file of the sample you are filtering on. Typically the tumor. --sample the sample name of the sample you want to filter on in the VCF file. --reference a fasta containing the reference sequence the BAM file was aligned to. --output the filename of the output VCF file --min-read-pos minimum average relative distance from start/end of read --min-var-freq minimum variant allele frequency --min-var-count minimum number of variant-supporting reads --min-strandedness minimum representation of variant allele on each strand --max-mm-qualsum-diff maximum difference of mismatch quality sum between variant and reference reads (paralog filter) --max_var_mm_qualsum maximum mismatch quality sum of reference-supporting reads --max-mapqual-diff maximum difference of mapping quality between variant and reference reads --max-readlen-diff maximum difference of average supporting read length between variant and reference reads (paralog filter) --min-var-dist-3 minimum average distance to effective 3prime end of read (real end or Q2) for variant-supporting reads --help this message DESCRIPTION This program will filter a VCF with a variety of filters as detailed in the VarScan2 paper (http://www.ncbi.nlm.nih.gov/pubmed/22300766). It requires the bam-readcount utility (https://github.com/genome/bam-readcount). This filter was calibrated on 100bp PE Illumina reads. It is likely to be overly stringent for longer reads and may be less effective on shorter reads. AUTHORS Dan Koboldt Original code Dave Larson Modifications for VCF and exportation. HELP } ### methods copied from elsewhere begin here... sub parse_vcf_header { my $input_fh = shift; my @header; my $header_end = 0; while (!$header_end) { my $line = $input_fh->getline; if ($line =~ m/^##/) { push @header, $line; } elsif ($line =~ m/^#/) { push @header, $line; $header_end = 1; } else { die "Missed the final header line with the sample list? Last line examined: $line Header so far: " . join("\n", @header) . "\n"; } } return \@header; } sub generate_region_list { my ($hash, $region_fh) = @_; #input_fh should be a filehandle to the VCF print STDERR "Printing variants to temporary region_list file...\n"; for my $chr (keys %$hash) { for my $pos (sort { $a <=> $b } keys %{$hash->{$chr}}) { print $region_fh "$chr\t$pos\t$pos\n"; } } } sub _simplify_indel_allele { my ($ref, $var) = @_; #these could be padded e.g. G, GT for a T insertion or GCC G for a 2bp deletion #they could also be complex e.g. GCCCGT, GCGT for a 2bp deletion #they could also represent an insertion deletion event e.g. GCCCGT GCGGGGT; these cannot be represented in genome bed. Throw an error or warn. # #I think the algorithm should be trim end (no updating of coords required) #trim beginning and return number of bases trimmed my @ref_array = map { uc } split //, $ref; my @var_array = map { uc } split //, $var; while(@ref_array and @var_array and $ref_array[-1] eq $var_array[-1]) { pop @ref_array; pop @var_array; } my $right_shift = 0; while(@ref_array and @var_array and $ref_array[0] eq $var_array[0]) { shift @ref_array; shift @var_array; $right_shift++; } return (join("",@ref_array), join("",@var_array), $right_shift); } sub filter_site { my ($ref_result, $var_result) = @_; #this will return a list of filter names my @filter_names; if($ref_result && $var_result) { ## Parse out the bam-readcounts details for each allele. The fields should be: ## #num_reads : avg_mapqual : avg_basequal : avg_semq : reads_plus : reads_minus : avg_clip_read_pos : avg_mmqs : reads_q2 : avg_dist_to_q2 : avgRLclipped : avg_eff_3'_dist my ($ref_count, $ref_map_qual, $ref_base_qual, $ref_semq, $ref_plus, $ref_minus, $ref_pos, $ref_subs, $ref_mmqs, $ref_q2_reads, $ref_q2_dist, $ref_avg_rl, $ref_dist_3) = split(/\t/, $ref_result); my ($var_count, $var_map_qual, $var_base_qual, $var_semq, $var_plus, $var_minus, $var_pos, $var_subs, $var_mmqs, $var_q2_reads, $var_q2_dist, $var_avg_rl, $var_dist_3) = split(/\t/, $var_result); my $ref_strandedness = my $var_strandedness = 0.50; $ref_dist_3 = 0.5 if(!$ref_dist_3); ## Use conservative defaults if we can't get mismatch quality sums ## $ref_mmqs = 50 if(!$ref_mmqs); $var_mmqs = 0 if(!$var_mmqs); my $mismatch_qualsum_diff = $var_mmqs - $ref_mmqs; ## Determine map qual diff ## my $mapqual_diff = $ref_map_qual - $var_map_qual; ## Determine difference in average supporting read length ## my $readlen_diff = $ref_avg_rl - $var_avg_rl; ## Determine ref strandedness ## if(($ref_plus + $ref_minus) > 0) { $ref_strandedness = $ref_plus / ($ref_plus + $ref_minus); $ref_strandedness = sprintf("%.2f", $ref_strandedness); } ## Determine var strandedness ## if(($var_plus + $var_minus) > 0) { $var_strandedness = $var_plus / ($var_plus + $var_minus); $var_strandedness = sprintf("%.2f", $var_strandedness); } if($var_count && ($var_plus + $var_minus)) { ## We must obtain variant read counts to proceed ## my $var_freq = $var_count / ($ref_count + $var_count); ## FAILURE 1: READ POSITION ## if(($var_pos < $min_read_pos) || ($var_pos > $max_read_pos)) { #$stats{'num_fail_pos'}++; push @filter_names, $filters{'position'}->[0]; } ## FAILURE 2: Variant is strand-specific but reference is NOT strand-specific ## if(($var_strandedness < $min_strandedness || $var_strandedness > $max_strandedness) && ($ref_strandedness >= $min_strandedness && $ref_strandedness <= $max_strandedness)) { #$stats{'num_fail_strand'}++; push @filter_names, $filters{'strand_bias'}->[0]; } ## FAILURE : Variant allele count does not meet minimum ## if($var_count < $min_var_count) { #$stats{'num_fail_varcount'}++; push @filter_names, $filters{'min_var_count'}->[0]; } ## FAILURE : Variant allele frequency does not meet minimum ## if($var_freq < $min_var_freq) { #$stats{'num_fail_varfreq'}++; push @filter_names, $filters{'min_var_freq'}->[0]; } ## FAILURE 3: Paralog filter for sites where variant allele mismatch-quality-sum is significantly higher than reference allele mmqs if($mismatch_qualsum_diff> $max_mm_qualsum_diff) { #$stats{'num_fail_mmqs'}++; push @filter_names, $filters{'mmqs_diff'}->[0]; } ## FAILURE 4: Mapping quality difference exceeds allowable maximum ## if($mapqual_diff > $max_mapqual_diff) { #$stats{'num_fail_mapqual'}++; push @filter_names, $filters{'mq_diff'}->[0]; } ## FAILURE 5: Read length difference exceeds allowable maximum ## if($readlen_diff > $max_readlen_diff) { #$stats{'num_fail_readlen'}++; push @filter_names, $filters{'read_length'}->[0]; } ## FAILURE 5: Read length difference exceeds allowable maximum ## if($var_dist_3 < $min_var_dist_3) { #$stats{'num_fail_dist3'}++; push @filter_names, $filters{'dist3'}->[0]; } if($max_var_mm_qualsum && $var_mmqs > $max_var_mm_qualsum) { #$stats{'num_fail_var_mmqs'}++; push @filter_names, $filters{'var_mmqs'}->[0]; } ## SUCCESS: Pass Filter ## if(@filter_names == 0) { #$stats{'num_pass_filter'}++; ## Print output, and append strandedness information ## @filter_names = ('PASS'); } } else { push @filter_names, $filters{'no_var_readcount'}->[0]; } } else { #$stats{'num_no_readcounts'}++; #print $fail_fh "$line\tno_readcounts\n"; push @filter_names, $filters{'incomplete_readcount'}->[0]; } return @filter_names; } sub add_filters_to_vcf_header { my ($parsed_header, @filter_refs) = @_; my $column_header = pop @$parsed_header; for my $filter_ref (@filter_refs) { my ($filter_name, $filter_description) = @$filter_ref; my $filter_line = qq{##FILTER=<ID=$filter_name,Description="$filter_description">\n}; push @$parsed_header, $filter_line; } push @$parsed_header, $column_header; } sub add_process_log_to_header { my ($parsed_header, $input, @params) = @_; my $column_header = pop @$parsed_header; my $param_string = join(" ", @params); push @$parsed_header, qq{##vcfProcessLog=<InputVCF=<$input>, InputVCFSource=<fpfilter>, InputVCFVer=<6.0>, InputVCFParam=<"$param_string"> InputVCFgeneAnno=<.>>\n}, $column_header; } sub filter_sites_in_hash { my ($hash, $bam_readcount_path, $bam_file, $ref_fasta, $working_dir, $optional_param) = @_; #done parsing vcf $optional_param ||= ''; my $list_name = File::Spec->catfile($working_dir, "regions.txt"); my $list_fh = IO::File->new($list_name,"w") or die "Unable to open file for coordinates\n"; generate_region_list($hash, $list_fh); $list_fh->close(); ## run bam-readcount my $bam_readcount_cmd = "$bam_readcount_path -f $ref_fasta -l $list_name -w 0 -b 20 $optional_param $bam_file|"; my $rc_results = IO::File->new($bam_readcount_cmd) or die "Unable to open pipe to bam-readcount cmd: $bam_readcount_cmd\n"; while(my $rc_line = $rc_results->getline) { chomp $rc_line; my ($chrom, $position) = split(/\t/, $rc_line); if($hash->{$chrom}{$position}) { for my $ref (keys %{$hash->{$chrom}{$position}}) { for my $var (keys %{$hash->{$chrom}{$position}{$ref}}) { my $ref_result = read_counts_by_allele($rc_line, $ref); my $var_result = read_counts_by_allele($rc_line, $var); my @filters = filter_site($ref_result, $var_result); my $vcf_line_ref = $hash->{$chrom}{$position}{$ref}{$var}; my @fields = split "\t", $$vcf_line_ref; if($fields[6] eq '.' || $fields[6] eq 'PASS') { $fields[6] = join(";", @filters); } else { $fields[6] = join(";", $fields[6], @filters) if($filters[0] ne 'PASS'); } $$vcf_line_ref = join("\t", @fields); } } } else { die "Unknown site for rc\n"; } } unless($rc_results->close) { die "Error running bam-readcount\n"; } } sub setup_workdir { my ($reference, $bam_file, $bam_index) = @_; $reference = abs_path($reference); $bam_file = abs_path($bam_file); $bam_index = abs_path($bam_index) if $bam_index; my $dir = File::Temp->newdir('fpfilterXXXXX', TMPDIR => 1, CLEANUP => 1, DIR => './') or die "Unable to create working directory\n"; #symlink in necessary files to run my $working_reference = File::Spec->catfile($dir, "reference.fa"); symlink $reference, $working_reference; my $fa_index = $reference . ".fai"; unless(-e $fa_index) { index_fasta($working_reference); } else { symlink $fa_index, File::Spec->catfile($dir, "reference.fa.fai"); } my $working_bam = File::Spec->catfile($dir, "tumor.bam"); my $working_bam_index = File::Spec->catfile($dir, "tumor.bam.bai"); symlink $bam_file, $working_bam; if($bam_index) { symlink $bam_index, $working_bam_index; } elsif(-e $bam_file . ".bai") { symlink $bam_file . ".bai", $working_bam_index; } else { index_bam($working_bam); } return ($dir, $working_reference, $working_bam); } sub index_fasta { my ($fasta) = @_; print STDERR "Indexing fasta...\n"; my @args = ($samtools_path, "faidx", $fasta); system(@args) == 0 or die "Unable to index $fasta: $?\n"; } sub index_bam { my ($bam) = @_; print STDERR "Indexing BAM...\n"; my @args = ($samtools_path, "index", $bam); system(@args) == 0 or die "Unable to index $bam: $?\n"; }