changeset 54:036033e74757 draft

Uploaded
author stef
date Wed, 31 Dec 2014 07:16:40 -0500
parents b4e2b4efae61
children 55a9c30fc11c
files README.rst falco/lib/perl/perAmpliconAnalysis.pl tool-data/fasta_index.loc.sample tool_data_table_conf.xml.sample
diffstat 4 files changed, 68 insertions(+), 92 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Wed Dec 31 07:16:40 2014 -0500
@@ -0,0 +1,33 @@
+FALCO Amplicon Analysis Pipeline & Galaxy wrapper
+========================================
+
+FALCO variant-caller is part of the Amplicon Analysis Pipeline (AAP).
+
+The typical workflow is as follows:
+* paired-end amplicon sequencing
+* merge pairs by overlapping them
+* map the single read fastq with BWA
+* perform variant calling with FALCO
+* create (html) report of the results
+
+FALCO uses samtools and straight-forward statistics to determine wether a
+potential variant is likely a (technical) artifact or not.
+
+Input / Output
+
+Input of the caller is a BAM file, output VCF
+
+
+History
+
+============== ================================================================
+Date            Changes
+-------------- ----------------------------------------------------------------
+December 2014    * first release in Test Tool Shed
+============== ================================================================
+
+
+Bug Reports & other questions
+=============================
+
+Issues can be reported via http://www.tgac.nl/
--- a/falco/lib/perl/perAmpliconAnalysis.pl	Wed Dec 31 06:53:48 2014 -0500
+++ b/falco/lib/perl/perAmpliconAnalysis.pl	Wed Dec 31 07:16:40 2014 -0500
@@ -3,13 +3,13 @@
 use strict;
 
 my $bam = shift;
-my $ref = shift || "/gsdata/tmp/TSACP/ampliconPipeLine/ref/hg19/bwa-5/hg19.fa";
-my $manifest = shift || "/gsdata/tmp/TSACP/ampliconPipeLine/ref/manifests/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt";
+my $ref = shift;
+my $manifest = shift;
 my $base = shift || "output";
-my $samtools = shift || "/gsdata/tmp/TSACP/ampliconPipeLine/bin/samtools/samtools-0.1.18/samtools";
+my $samtools = shift || `which samtools`;
 my $fai = $ref . ".fai";
-my $Qthreshold = 10;
-my $QavgLim = 20;
+my $Qthreshold = 30;
+my $QavgLim = 0;
 
 open FAI, "<$fai";
 my %idx = ();
@@ -52,7 +52,6 @@
         s/\s+$//;
         my @row = split(/\t/, $_);
         push @{$targets{$row[3]}{$_}}, $row[0] foreach ($row[4] .. $row[5]);
-    #    $manif{$aTarget} = [@row];
 	my $aTarget = $row[0] . "-" . $row[3] . ":". "$row[4]-$row[5]";
         if (not exists($stats{$aTarget}{target})) {
                 $stats{$aTarget}{target} = [@row];
@@ -69,40 +68,23 @@
                 $stats{$aTarget}{assayStart} = $row[4] + length($probe{$row[0]}{ULSO});
                 $stats{$aTarget}{assayEnd} = $row[5] - length($probe{$row[0]}{DLSO});
         }
-#	print $row[0] . "\n";
 	foreach my $i (0 .. 5) {
 		foreach my $j (0 .. 5) {
-#			print $row[4] + $i .":". ($row[5] - $row[4] + $j) .":". $row[0] . "\n";
-#			print $row[4] + $i .":". ($row[5] - $row[4] - $j) .":". $row[0] . "\n";
-#			print $row[4] - $i .":". ($row[5] - $row[4] + $j) .":". $row[0] . "\n";
-#			print $row[4] - $i .":". ($row[5] - $row[4] - $j) .":". $row[0] . "\n";
 			$hash{$row[3]}{$row[4] + $i}{$row[5] - $row[4] + $j} = $aTarget;
 			$hash{$row[3]}{$row[4] + $i}{$row[5] - $row[4] - $j} = $aTarget;
 			$hash{$row[3]}{$row[4] - $i}{$row[5] - $row[4] + $j} = $aTarget;
 			$hash{$row[3]}{$row[4] - $i}{$row[5] - $row[4] - $j} = $aTarget;
 		}
 	}
-#	sleep 1;
 }
 close M;
 
-# samtools view -b 25MUT5.bam chr9:133738302-133738491 | samtools calmd - /gsdata/tmp/TSACP/ampliconPipeLine/ref/hg19/bwa-5/hg19.fa | head
-
-#foreach my $amp (keys(%stats)) {
-
-#	print $amp . "\n";
-#	my @SAM = ();
-	# Fetch reads
-#	my $coord = $stats{$amp}{chr} .":".$stats{$amp}{start}."-".$stats{$amp}{end};
-#	my $ln = $stats{$amp}{ln};
-#	print $coord . " $ln\n";
-
 # Open output 
 open QC, ">$base.qc.ann.txt";
 open QC2, ">$base.qc2.ann.txt";
 open TARGET, ">$base.qc.targets.txt";
 
-#TODO Remove stampy check
+# Stampy check
 my $stampy = 0;
 open SAMH, "$samtools view -H $bam |";
 while (<SAMH>) {
@@ -112,8 +94,6 @@
 close SAMH;
 
 open SAM, "$samtools view $bam |";
-#open SAM, "$samtools view -b $bam | $samtools calmd - $ref |";
-#	open SAM, "$samtools view -b $bam $coord  | $samtools calmd - $ref |";
 while (<SAM>) {
 	last unless (/^@/);
 }
@@ -128,7 +108,6 @@
 print TARGET "#" . join("\t", qw/amp chr start end assayStart assayEnd depth/) . "\n";
 while (<SAM>) {
 	# Skip stampy reads? Nope, stampy is no longer needed
-	#TODO Remove stampy check
 #	next if ($stampy == 1 && ! /XP:Z:BWA/);
 	chomp;
 	my @row = split(/\t/, $_);
@@ -137,13 +116,11 @@
 	$cig =~ s/(\d+)[MD]/$l+=$1/eg;
 		
 	my $amp = $hash{$row[2]}{$row[3]}{$l} || next;
-	#print "$row[2] $row[3] $l $amp\n"; 
 
 	push @{$stats{$amp}{sam}}, [@row];
 	$stats{$amp}{depth}++;
 	if (($pamp ne ".") && ($amp ne $pamp)) {
 		print STDOUT "Flushing $pamp $stats{$pamp}{depth}\n";
-#		print TARGET join("\t", $pamp, map { $stats{$pamp}{$_} } qw/chr start end assayStart assayEnd depth/) . "\n";
 		my $data = &call($stats{$pamp}{sam});
 		&flush($data, $stats{$pamp}{chr}, $pamp, $stats{$pamp}{assayStart}, $stats{$pamp}{assayEnd});
 		$stats{$pamp}{sam} = [];
@@ -152,27 +129,18 @@
 	$cnt++;
 }
 
+print STDOUT "Flushing $pamp $stats{$pamp}{depth}\n";
+my $data = &call($stats{$pamp}{sam});
+&flush($data, $stats{$pamp}{chr}, $pamp, $stats{$pamp}{assayStart}, $stats{$pamp}{assayEnd});
+$stats{$pamp}{sam} = [];
+
 close QC;
 close QC2;
 foreach my $pamp (keys %stats) {
 		print TARGET join("\t", $pamp, map { $stats{$pamp}{$_} || "NA"} qw/chr start end assayStart assayEnd depth/) . "\n";
 }
 close TARGET;
-	
-
-#	print join("\n", map {$hsh{$_} . ":" . $_} keys(%hsh)) . "\n";
-#	print $cnt . "\n"; sleep 1;
-	close SAM;
-#	$stats{$amp}{depth} = $cnt;
-#	my $data = &call(\@SAM);
-#	&flush($data, $stats{$amp}{chr}, $amp);
-
-#}
-
-#foreach my $amp (keys(%stats)) {
-
-#	print $amp . ":" . scalar(@{$stats{$amp}{sam}}) . "\n";
-#}
+close SAM;
 
 sub call {
 	my $SAM = shift;
@@ -187,7 +155,6 @@
 	my $col_qual = 10;
 	my $col_pos = 3;
 	my $col_cigar = 5;
-	my $refSeqPad = "P";
 	my %cooc = ();
 	my $cnt = 1;
 	foreach my $r (0 .. $#$SAM) {
@@ -199,41 +166,39 @@
 		my $aln = 0;
 
 		# Calculate mean read quality and skip if it's below threshold
-		my $QSum = 0;
-		my $Qlen = length($qual);
+		if ($QavgLim > 0) {
+			my $QSum = 0;
+			my $Qlen = length($qual);
 
-		$QSum += ord($qual) foreach 1 .. $Qlen;
-		
-		my $Qavg = $QSum / $Qlen - 33;
-		next if ($Qavg < $QavgLim);
+			$QSum += ord($qual) foreach 1 .. $Qlen;
+
+			my $Qavg = $QSum / $Qlen - 33;
+			next if ($Qavg < $QavgLim);
+		}
 
 		while ($row[$col_cigar] =~ /(\d+)(\D)/g) {
 			push @cig, [$1, $2];
 			$aln += $1;
 		}
 		if ($ppos != $pos) {
-			my $offset = $idx{$row[$col_chr]}->[2];
-			my $nl = int (($row[$col_pos] - 1) / $idx{$row[$col_chr]}->[3]);
-			$offset += $row[$col_pos] + $nl - 1;
+			my $offset = $idx{$row[$col_chr]}->[2]; # Byte offset for chromosome
+			my $nl = int (($row[$col_pos] - 1) / $idx{$row[$col_chr]}->[3]); # Newline bytes to offset
+			$offset += $row[$col_pos] + $nl - 50 - 1; # Read in 50 nt prior to start of amplicon
 			seek REF, $offset, 0;
-			$refSeqPad = substr($refSeq, -1, 1);
-			read REF, $refSeq, 1000;#  length($read) * 5;
+			read REF, $refSeq, 350;
 			$refSeq =~ s/\s//g;
-			$data{$row[$col_chr]}{$row[$col_pos] + $_}{"refNT"} = substr($refSeq, $_, 1) foreach (0 .. $aln);
+			$data{$row[$col_chr]}{$row[$col_pos] + $_ - 50 + 1}{"refNT"} = substr($refSeq, $_, 1) foreach (0 .. 300);
 			$ppos = $pos;
 		}
 
 		foreach my $cigE (@cig) {
 			my $n = $cigE->[0];
 			if ($cigE->[1] eq "I") {
-				my $rpos = $pos - $row[$col_pos];
-				my $insert = substr($refSeq, $rpos - 1, 1) or print STDERR "Outside?" . $rpos . ":" . length($refSeq) . " $pos\n";
+				my $insert = $data{$row[$col_chr]}{$pos - 1}{"refNT"};
 				$insert .= substr($read, 0, $n, "");
 				my $qinsert .= substr($qual, 0, $n, "");
 				push @{$data{$row[$col_chr]}{$pos - 1}{"IR"}{$insert}}, $r;
 				$data{$row[$col_chr]}{$pos - 1}{"I"}{$insert}++;
-#				push @{$cooc{$pos - 1 }{$insert}}, $cnt;
-	#			$cooc{($pos - 1) .":".$insert}{$cnt} = 0;
 			}
 			elsif ($cigE->[1] eq "M") {
 				my $mseq = substr($read, 0, $n, "");
@@ -242,33 +207,24 @@
 					$data{$row[$col_chr]}{$pos + $nt}{"DP"}++;
 					my $ntq = substr($mqual, $nt, 1);
 					my $ntn = substr($mseq, $nt, 1);
-				#	if (ord($ntq) - 33  < $Qthreshold) {
-				#		$ntn = "N";
-				#	}
+					if (ord($ntq) - 33  < $Qthreshold) {
+						$ntn = "N";
+					}
 					$data{$row[$col_chr]}{$pos + $nt}{"NT"}{$ntn}++;
 					push @{$data{$row[$col_chr]}{$pos + $nt}{"NTR"}{$ntn}}, $r if (uc($ntn) ne uc($data{$row[$col_chr]}{$pos + $nt}{refNT}));
-#					if (($ntn ne $data{$row[$col_chr]}{$pos + $nt}{"refNT"}) && ($ntn ne "N")) {
-						#push @{$cooc{$pos + $nt}{$ntn}}, $cnt;
-	#					$cooc{($pos + $nt) .":".$ntn}{$cnt} = 0;
-#					}
 				}
 				$pos += $n;
 
 			}
 			elsif ($cigE->[1] eq "D") {
-				my $rpos = $pos - $row[$col_pos];
-				my $deletion = substr($refSeq, $rpos - 1, $n + 1); # Substring out of string error ?
+				my $deletion = join("", map { $data{$row[$col_chr]}{$pos + $_ -1}{"refNT"} } (0 .. ($n)));
 				push @{$data{$row[$col_chr]}{$pos - 1}{"DR"}{$deletion}}, $r;
 				$data{$row[$col_chr]}{$pos - 1}{"D"}{$deletion}++;
-				#push @{$cooc{$pos - 1 }{$deletion}}, $cnt;
-	#			$cooc{($pos - 1) . ":" . $deletion}{$cnt} = 0;
 				$pos += $n;
 			}
 		}
 		$cnt++;
 	}
-	#my $thresh = scalar(@$SAM) * .05;
-	#&coocCalc(\%cooc, \%data, $thresh);	
 	return \%data;
 }
 
@@ -287,7 +243,6 @@
 	}
 	my @muts = keys(%{$cooc});
 	foreach my $i (0 .. $#muts) {
-#		my %hsh = map { $_ => 0 } keys(%{$cooc{$muts[$i]}});
 		foreach my $j (($i + 1) .. $#muts) {
 			my $score = 0;
 			foreach my $m (keys(%{$cooc->{$muts[$i]}})) {
@@ -299,8 +254,6 @@
 		}	
 	}		
 
-	#{read}{mutation}	
-	#{mutation}{read}	
 	sleep 1;
 }
 
@@ -313,21 +266,14 @@
         return unless (exists($data->{$chr}));
         print STDOUT "printing ... ";
 	my %reads = (); # read->pos->nt
-#        foreach my $p (sort {$a <=> $b} keys(%{$data->{$chr}})) {
-#        	unless (($p >=  $s) && ($p <= $e)) {
-#			next;	
-#		}
 	foreach my $p ($s .. $e) {
 	        my $posdat = $data->{$chr}{$p} || next;
 	        next unless (exists($posdat->{DP}));
                 my $refNT = uc($posdat->{refNT});
                 my @varOrd = sort { $posdat->{NT}{$b} <=> $posdat->{NT}{$a} } grep {$_ ne "N"} grep { $_ ne $refNT  } keys(%{$posdat->{NT}});
                 my $seqNT = join("/", @varOrd);
-                #my $refCNT = scalar(@{$posdat->{NT}{$refNT} || []});
 		my $refCNT = $posdat->{NT}{$refNT} || 0;
 		my $varCNT = 0;
-		#$varCNT += scalar(@{$posdat->{NT}{$_}}) foreach (@varOrd);
-		#$varCNT += $posdat->{NT}{$_} foreach (@varOrd);
 		$varCNT = ($varOrd[0])?$posdat->{NT}{$varOrd[0]}:0;
 		my $Istr = ".";
 		my $Dstr = ".";
@@ -346,16 +292,13 @@
 				$reads{$_}{$p} = "$D/$refNT" foreach (@{$posdat->{DR}{$D}});
 			}
 		}
-		#$Dstr = join("|", map { $_ . ":" . $posdat->{D}{$_} } keys(%{$posdat->{D}})) if exists($posdat->{D});
 		print QC join("\t", $chr, $p, $amp,
 				$posdat->{DP} || 0,
 				$posdat->{refNT} || ".",
 				$seqNT || ".",
 				$refCNT, $varCNT,
-				#(map { scalar(@{$posdat->{NT}{$_} || []})} qw(A C G T N)), $Istr, $Dstr,
 				(map { $posdat->{NT}{$_} || 0 } qw(A C G T N)), $Istr, $Dstr,
 			  ) . "\n";
-		# delete $data->{$chr}{$p};
 	}
 	delete $data->{$chr};
 	foreach my $r (sort { $a <=> $b } keys(%reads)) {
--- a/tool-data/fasta_index.loc.sample	Wed Dec 31 06:53:48 2014 -0500
+++ b/tool-data/fasta_index.loc.sample	Wed Dec 31 07:16:40 2014 -0500
@@ -1,12 +1,12 @@
 #This is a sample file distributed with Galaxy that enables tools
-#to use a directory with indexed fasta files. The _index.loc file has 
+#to use a directory with indexed fasta files. The all_fasta.loc file has 
 #this format (longer white space characters are TAB characters):
 #
 #<unique_build_id>   <dbkey>   <display_name>   <file_path>
 #
 #So, for example, if you had phiX indexed stored in 
 #/example/path/
-#then the _index.loc entry would look like this:
+#then the all_fasta.loc entry would look like this:
 #
 #phiX174   phiX   phiX Pretty   /example/path/phiX.fa
 #
@@ -16,7 +16,7 @@
 # phiX.fa.fai
 # phiX.fa.dict
 #
-#Your _index.loc file should include an entry per line for each
+#Your all_fasta.loc file should include an entry per line for each
 #index set you have stored. For example:
 #
 #phiX174	phiX	phiX174			/path/to/phiX.fa
--- a/tool_data_table_conf.xml.sample	Wed Dec 31 06:53:48 2014 -0500
+++ b/tool_data_table_conf.xml.sample	Wed Dec 31 07:16:40 2014 -0500
@@ -1,6 +1,6 @@
 <tables>
-    <table name="fasta_indexes" comment_char="#">
+    <table name="all_fasta" comment_char="#">
         <columns>value, dbkey, name, path</columns>
-        <file path="tool-data/fasta_index.loc" />
+        <file path="tool-data/all_fasta.loc" />
     </table>
 </tables>