Mercurial > repos > stef > falco
changeset 54:036033e74757 draft
Uploaded
author | stef |
---|---|
date | Wed, 31 Dec 2014 07:16:40 -0500 |
parents | b4e2b4efae61 |
children | 55a9c30fc11c |
files | README.rst falco/lib/perl/perAmpliconAnalysis.pl tool-data/fasta_index.loc.sample tool_data_table_conf.xml.sample |
diffstat | 4 files changed, 68 insertions(+), 92 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Wed Dec 31 07:16:40 2014 -0500 @@ -0,0 +1,33 @@ +FALCO Amplicon Analysis Pipeline & Galaxy wrapper +======================================== + +FALCO variant-caller is part of the Amplicon Analysis Pipeline (AAP). + +The typical workflow is as follows: +* paired-end amplicon sequencing +* merge pairs by overlapping them +* map the single read fastq with BWA +* perform variant calling with FALCO +* create (html) report of the results + +FALCO uses samtools and straight-forward statistics to determine wether a +potential variant is likely a (technical) artifact or not. + +Input / Output + +Input of the caller is a BAM file, output VCF + + +History + +============== ================================================================ +Date Changes +-------------- ---------------------------------------------------------------- +December 2014 * first release in Test Tool Shed +============== ================================================================ + + +Bug Reports & other questions +============================= + +Issues can be reported via http://www.tgac.nl/
--- a/falco/lib/perl/perAmpliconAnalysis.pl Wed Dec 31 06:53:48 2014 -0500 +++ b/falco/lib/perl/perAmpliconAnalysis.pl Wed Dec 31 07:16:40 2014 -0500 @@ -3,13 +3,13 @@ use strict; my $bam = shift; -my $ref = shift || "/gsdata/tmp/TSACP/ampliconPipeLine/ref/hg19/bwa-5/hg19.fa"; -my $manifest = shift || "/gsdata/tmp/TSACP/ampliconPipeLine/ref/manifests/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt"; +my $ref = shift; +my $manifest = shift; my $base = shift || "output"; -my $samtools = shift || "/gsdata/tmp/TSACP/ampliconPipeLine/bin/samtools/samtools-0.1.18/samtools"; +my $samtools = shift || `which samtools`; my $fai = $ref . ".fai"; -my $Qthreshold = 10; -my $QavgLim = 20; +my $Qthreshold = 30; +my $QavgLim = 0; open FAI, "<$fai"; my %idx = (); @@ -52,7 +52,6 @@ s/\s+$//; my @row = split(/\t/, $_); push @{$targets{$row[3]}{$_}}, $row[0] foreach ($row[4] .. $row[5]); - # $manif{$aTarget} = [@row]; my $aTarget = $row[0] . "-" . $row[3] . ":". "$row[4]-$row[5]"; if (not exists($stats{$aTarget}{target})) { $stats{$aTarget}{target} = [@row]; @@ -69,40 +68,23 @@ $stats{$aTarget}{assayStart} = $row[4] + length($probe{$row[0]}{ULSO}); $stats{$aTarget}{assayEnd} = $row[5] - length($probe{$row[0]}{DLSO}); } -# print $row[0] . "\n"; foreach my $i (0 .. 5) { foreach my $j (0 .. 5) { -# print $row[4] + $i .":". ($row[5] - $row[4] + $j) .":". $row[0] . "\n"; -# print $row[4] + $i .":". ($row[5] - $row[4] - $j) .":". $row[0] . "\n"; -# print $row[4] - $i .":". ($row[5] - $row[4] + $j) .":". $row[0] . "\n"; -# print $row[4] - $i .":". ($row[5] - $row[4] - $j) .":". $row[0] . "\n"; $hash{$row[3]}{$row[4] + $i}{$row[5] - $row[4] + $j} = $aTarget; $hash{$row[3]}{$row[4] + $i}{$row[5] - $row[4] - $j} = $aTarget; $hash{$row[3]}{$row[4] - $i}{$row[5] - $row[4] + $j} = $aTarget; $hash{$row[3]}{$row[4] - $i}{$row[5] - $row[4] - $j} = $aTarget; } } -# sleep 1; } close M; -# samtools view -b 25MUT5.bam chr9:133738302-133738491 | samtools calmd - /gsdata/tmp/TSACP/ampliconPipeLine/ref/hg19/bwa-5/hg19.fa | head - -#foreach my $amp (keys(%stats)) { - -# print $amp . "\n"; -# my @SAM = (); - # Fetch reads -# my $coord = $stats{$amp}{chr} .":".$stats{$amp}{start}."-".$stats{$amp}{end}; -# my $ln = $stats{$amp}{ln}; -# print $coord . " $ln\n"; - # Open output open QC, ">$base.qc.ann.txt"; open QC2, ">$base.qc2.ann.txt"; open TARGET, ">$base.qc.targets.txt"; -#TODO Remove stampy check +# Stampy check my $stampy = 0; open SAMH, "$samtools view -H $bam |"; while (<SAMH>) { @@ -112,8 +94,6 @@ close SAMH; open SAM, "$samtools view $bam |"; -#open SAM, "$samtools view -b $bam | $samtools calmd - $ref |"; -# open SAM, "$samtools view -b $bam $coord | $samtools calmd - $ref |"; while (<SAM>) { last unless (/^@/); } @@ -128,7 +108,6 @@ print TARGET "#" . join("\t", qw/amp chr start end assayStart assayEnd depth/) . "\n"; while (<SAM>) { # Skip stampy reads? Nope, stampy is no longer needed - #TODO Remove stampy check # next if ($stampy == 1 && ! /XP:Z:BWA/); chomp; my @row = split(/\t/, $_); @@ -137,13 +116,11 @@ $cig =~ s/(\d+)[MD]/$l+=$1/eg; my $amp = $hash{$row[2]}{$row[3]}{$l} || next; - #print "$row[2] $row[3] $l $amp\n"; push @{$stats{$amp}{sam}}, [@row]; $stats{$amp}{depth}++; if (($pamp ne ".") && ($amp ne $pamp)) { print STDOUT "Flushing $pamp $stats{$pamp}{depth}\n"; -# print TARGET join("\t", $pamp, map { $stats{$pamp}{$_} } qw/chr start end assayStart assayEnd depth/) . "\n"; my $data = &call($stats{$pamp}{sam}); &flush($data, $stats{$pamp}{chr}, $pamp, $stats{$pamp}{assayStart}, $stats{$pamp}{assayEnd}); $stats{$pamp}{sam} = []; @@ -152,27 +129,18 @@ $cnt++; } +print STDOUT "Flushing $pamp $stats{$pamp}{depth}\n"; +my $data = &call($stats{$pamp}{sam}); +&flush($data, $stats{$pamp}{chr}, $pamp, $stats{$pamp}{assayStart}, $stats{$pamp}{assayEnd}); +$stats{$pamp}{sam} = []; + close QC; close QC2; foreach my $pamp (keys %stats) { print TARGET join("\t", $pamp, map { $stats{$pamp}{$_} || "NA"} qw/chr start end assayStart assayEnd depth/) . "\n"; } close TARGET; - - -# print join("\n", map {$hsh{$_} . ":" . $_} keys(%hsh)) . "\n"; -# print $cnt . "\n"; sleep 1; - close SAM; -# $stats{$amp}{depth} = $cnt; -# my $data = &call(\@SAM); -# &flush($data, $stats{$amp}{chr}, $amp); - -#} - -#foreach my $amp (keys(%stats)) { - -# print $amp . ":" . scalar(@{$stats{$amp}{sam}}) . "\n"; -#} +close SAM; sub call { my $SAM = shift; @@ -187,7 +155,6 @@ my $col_qual = 10; my $col_pos = 3; my $col_cigar = 5; - my $refSeqPad = "P"; my %cooc = (); my $cnt = 1; foreach my $r (0 .. $#$SAM) { @@ -199,41 +166,39 @@ my $aln = 0; # Calculate mean read quality and skip if it's below threshold - my $QSum = 0; - my $Qlen = length($qual); + if ($QavgLim > 0) { + my $QSum = 0; + my $Qlen = length($qual); - $QSum += ord($qual) foreach 1 .. $Qlen; - - my $Qavg = $QSum / $Qlen - 33; - next if ($Qavg < $QavgLim); + $QSum += ord($qual) foreach 1 .. $Qlen; + + my $Qavg = $QSum / $Qlen - 33; + next if ($Qavg < $QavgLim); + } while ($row[$col_cigar] =~ /(\d+)(\D)/g) { push @cig, [$1, $2]; $aln += $1; } if ($ppos != $pos) { - my $offset = $idx{$row[$col_chr]}->[2]; - my $nl = int (($row[$col_pos] - 1) / $idx{$row[$col_chr]}->[3]); - $offset += $row[$col_pos] + $nl - 1; + my $offset = $idx{$row[$col_chr]}->[2]; # Byte offset for chromosome + my $nl = int (($row[$col_pos] - 1) / $idx{$row[$col_chr]}->[3]); # Newline bytes to offset + $offset += $row[$col_pos] + $nl - 50 - 1; # Read in 50 nt prior to start of amplicon seek REF, $offset, 0; - $refSeqPad = substr($refSeq, -1, 1); - read REF, $refSeq, 1000;# length($read) * 5; + read REF, $refSeq, 350; $refSeq =~ s/\s//g; - $data{$row[$col_chr]}{$row[$col_pos] + $_}{"refNT"} = substr($refSeq, $_, 1) foreach (0 .. $aln); + $data{$row[$col_chr]}{$row[$col_pos] + $_ - 50 + 1}{"refNT"} = substr($refSeq, $_, 1) foreach (0 .. 300); $ppos = $pos; } foreach my $cigE (@cig) { my $n = $cigE->[0]; if ($cigE->[1] eq "I") { - my $rpos = $pos - $row[$col_pos]; - my $insert = substr($refSeq, $rpos - 1, 1) or print STDERR "Outside?" . $rpos . ":" . length($refSeq) . " $pos\n"; + my $insert = $data{$row[$col_chr]}{$pos - 1}{"refNT"}; $insert .= substr($read, 0, $n, ""); my $qinsert .= substr($qual, 0, $n, ""); push @{$data{$row[$col_chr]}{$pos - 1}{"IR"}{$insert}}, $r; $data{$row[$col_chr]}{$pos - 1}{"I"}{$insert}++; -# push @{$cooc{$pos - 1 }{$insert}}, $cnt; - # $cooc{($pos - 1) .":".$insert}{$cnt} = 0; } elsif ($cigE->[1] eq "M") { my $mseq = substr($read, 0, $n, ""); @@ -242,33 +207,24 @@ $data{$row[$col_chr]}{$pos + $nt}{"DP"}++; my $ntq = substr($mqual, $nt, 1); my $ntn = substr($mseq, $nt, 1); - # if (ord($ntq) - 33 < $Qthreshold) { - # $ntn = "N"; - # } + if (ord($ntq) - 33 < $Qthreshold) { + $ntn = "N"; + } $data{$row[$col_chr]}{$pos + $nt}{"NT"}{$ntn}++; push @{$data{$row[$col_chr]}{$pos + $nt}{"NTR"}{$ntn}}, $r if (uc($ntn) ne uc($data{$row[$col_chr]}{$pos + $nt}{refNT})); -# if (($ntn ne $data{$row[$col_chr]}{$pos + $nt}{"refNT"}) && ($ntn ne "N")) { - #push @{$cooc{$pos + $nt}{$ntn}}, $cnt; - # $cooc{($pos + $nt) .":".$ntn}{$cnt} = 0; -# } } $pos += $n; } elsif ($cigE->[1] eq "D") { - my $rpos = $pos - $row[$col_pos]; - my $deletion = substr($refSeq, $rpos - 1, $n + 1); # Substring out of string error ? + my $deletion = join("", map { $data{$row[$col_chr]}{$pos + $_ -1}{"refNT"} } (0 .. ($n))); push @{$data{$row[$col_chr]}{$pos - 1}{"DR"}{$deletion}}, $r; $data{$row[$col_chr]}{$pos - 1}{"D"}{$deletion}++; - #push @{$cooc{$pos - 1 }{$deletion}}, $cnt; - # $cooc{($pos - 1) . ":" . $deletion}{$cnt} = 0; $pos += $n; } } $cnt++; } - #my $thresh = scalar(@$SAM) * .05; - #&coocCalc(\%cooc, \%data, $thresh); return \%data; } @@ -287,7 +243,6 @@ } my @muts = keys(%{$cooc}); foreach my $i (0 .. $#muts) { -# my %hsh = map { $_ => 0 } keys(%{$cooc{$muts[$i]}}); foreach my $j (($i + 1) .. $#muts) { my $score = 0; foreach my $m (keys(%{$cooc->{$muts[$i]}})) { @@ -299,8 +254,6 @@ } } - #{read}{mutation} - #{mutation}{read} sleep 1; } @@ -313,21 +266,14 @@ return unless (exists($data->{$chr})); print STDOUT "printing ... "; my %reads = (); # read->pos->nt -# foreach my $p (sort {$a <=> $b} keys(%{$data->{$chr}})) { -# unless (($p >= $s) && ($p <= $e)) { -# next; -# } foreach my $p ($s .. $e) { my $posdat = $data->{$chr}{$p} || next; next unless (exists($posdat->{DP})); my $refNT = uc($posdat->{refNT}); my @varOrd = sort { $posdat->{NT}{$b} <=> $posdat->{NT}{$a} } grep {$_ ne "N"} grep { $_ ne $refNT } keys(%{$posdat->{NT}}); my $seqNT = join("/", @varOrd); - #my $refCNT = scalar(@{$posdat->{NT}{$refNT} || []}); my $refCNT = $posdat->{NT}{$refNT} || 0; my $varCNT = 0; - #$varCNT += scalar(@{$posdat->{NT}{$_}}) foreach (@varOrd); - #$varCNT += $posdat->{NT}{$_} foreach (@varOrd); $varCNT = ($varOrd[0])?$posdat->{NT}{$varOrd[0]}:0; my $Istr = "."; my $Dstr = "."; @@ -346,16 +292,13 @@ $reads{$_}{$p} = "$D/$refNT" foreach (@{$posdat->{DR}{$D}}); } } - #$Dstr = join("|", map { $_ . ":" . $posdat->{D}{$_} } keys(%{$posdat->{D}})) if exists($posdat->{D}); print QC join("\t", $chr, $p, $amp, $posdat->{DP} || 0, $posdat->{refNT} || ".", $seqNT || ".", $refCNT, $varCNT, - #(map { scalar(@{$posdat->{NT}{$_} || []})} qw(A C G T N)), $Istr, $Dstr, (map { $posdat->{NT}{$_} || 0 } qw(A C G T N)), $Istr, $Dstr, ) . "\n"; - # delete $data->{$chr}{$p}; } delete $data->{$chr}; foreach my $r (sort { $a <=> $b } keys(%reads)) {
--- a/tool-data/fasta_index.loc.sample Wed Dec 31 06:53:48 2014 -0500 +++ b/tool-data/fasta_index.loc.sample Wed Dec 31 07:16:40 2014 -0500 @@ -1,12 +1,12 @@ #This is a sample file distributed with Galaxy that enables tools -#to use a directory with indexed fasta files. The _index.loc file has +#to use a directory with indexed fasta files. The all_fasta.loc file has #this format (longer white space characters are TAB characters): # #<unique_build_id> <dbkey> <display_name> <file_path> # #So, for example, if you had phiX indexed stored in #/example/path/ -#then the _index.loc entry would look like this: +#then the all_fasta.loc entry would look like this: # #phiX174 phiX phiX Pretty /example/path/phiX.fa # @@ -16,7 +16,7 @@ # phiX.fa.fai # phiX.fa.dict # -#Your _index.loc file should include an entry per line for each +#Your all_fasta.loc file should include an entry per line for each #index set you have stored. For example: # #phiX174 phiX phiX174 /path/to/phiX.fa
--- a/tool_data_table_conf.xml.sample Wed Dec 31 06:53:48 2014 -0500 +++ b/tool_data_table_conf.xml.sample Wed Dec 31 07:16:40 2014 -0500 @@ -1,6 +1,6 @@ <tables> - <table name="fasta_indexes" comment_char="#"> + <table name="all_fasta" comment_char="#"> <columns>value, dbkey, name, path</columns> - <file path="tool-data/fasta_index.loc" /> + <file path="tool-data/all_fasta.loc" /> </table> </tables>