view falco/lib/perl/perAmpliconAnalysis.pl @ 59:73eda23fb8fd draft

Uploaded
author stef
date Wed, 25 Feb 2015 08:50:56 -0500
parents 036033e74757
children
line wrap: on
line source

#!/usr/bin/perl -w

use strict;

my $bam = shift;
my $ref = shift;
my $manifest = shift;
my $base = shift || "output";
my $samtools = shift || `which samtools`;
my $fai = $ref . ".fai";
my $Qthreshold = 30;
my $QavgLim = 0;

open FAI, "<$fai";
my %idx = ();
print STDOUT "Reading fa index ... ";
while (<FAI>) {
        chomp;
        my @row = split(/\t/, $_);
        $idx{$row[0]} = [@row];
}
print STDOUT "done\n";

close FAI;
open REF, "<$ref";

my %stats = ();
my %manif = ();
my %targets = ();
my %probe = ();
my %hash = ();

open M, "<$manifest";
while (<M>) { last if (/\[Probes\]/) }
readline(M);
while (<M>) {
        chomp;
        if (/\[Targets\]/) {
                my $h = readline M;
                chomp $h;
                my @H = split(/\t/, $h);
                last;
        }
        my @row = split(/\t/, $_);
        $probe{$row[2]}{probe} = [@row];
        $probe{$row[2]}{ULSO} = $row[9];
        $probe{$row[2]}{DLSO} = $row[11];
}

while (<M>) {
        chomp;
        s/\s+$//;
        my @row = split(/\t/, $_);
        push @{$targets{$row[3]}{$_}}, $row[0] foreach ($row[4] .. $row[5]);
	my $aTarget = $row[0] . "-" . $row[3] . ":". "$row[4]-$row[5]";
        if (not exists($stats{$aTarget}{target})) {
                $stats{$aTarget}{target} = [@row];
                $stats{$aTarget}{ln} = $row[5] - $row[4] + 1;
                $stats{$aTarget}{n} = 0;
                $stats{$aTarget}{sam} = [];
                $stats{$aTarget}{depth} = 0;
                $stats{$aTarget}{covs} = [];
                $stats{$aTarget}{chr} = $row[3];
                $stats{$aTarget}{start} = $row[4];
                $stats{$aTarget}{end} = $row[5];
		$stats{$aTarget}{ULSO} = $probe{$row[0]}{ULSO};
		$stats{$aTarget}{DLSO} = $probe{$row[0]}{DLSO};
                $stats{$aTarget}{assayStart} = $row[4] + length($probe{$row[0]}{ULSO});
                $stats{$aTarget}{assayEnd} = $row[5] - length($probe{$row[0]}{DLSO});
        }
	foreach my $i (0 .. 5) {
		foreach my $j (0 .. 5) {
			$hash{$row[3]}{$row[4] + $i}{$row[5] - $row[4] + $j} = $aTarget;
			$hash{$row[3]}{$row[4] + $i}{$row[5] - $row[4] - $j} = $aTarget;
			$hash{$row[3]}{$row[4] - $i}{$row[5] - $row[4] + $j} = $aTarget;
			$hash{$row[3]}{$row[4] - $i}{$row[5] - $row[4] - $j} = $aTarget;
		}
	}
}
close M;

# Open output 
open QC, ">$base.qc.ann.txt";
open QC2, ">$base.qc2.ann.txt";
open TARGET, ">$base.qc.targets.txt";

# Stampy check
my $stampy = 0;
open SAMH, "$samtools view -H $bam |";
while (<SAMH>) {
	$stampy = 1 if (/PN:stampy/);
	if (/PN:stampy/) {
		print STDERR "[WARNING]: Stampy aligned reads is no longer supported.\n";
	}
	if (/SO:unsorted/) {
		print STDERR "[Warning]: Alignments seem to be unsorted.\n";
	}
}
close SAMH;

open SAM, "$samtools view -F 4 $bam |";
while (<SAM>) {
	last unless (/^@/);
}
my $cnt = 0;
my %hsh = ();
my $pamp = ".";

#Print header

print QC "#" . join("\t", qw/chr pos amp dp ntRef ntVar nRef nVar nA nC nG nT nN ins del/) . "\n";
print QC2 "#" . join("\t", qw/read chr pos amp event/) . "\n";
print TARGET "#" . join("\t", qw/amp chr start end assayStart assayEnd depth/) . "\n";
while (<SAM>) {
	# Skip stampy reads? Nope, stampy is no longer needed
#	next if ($stampy == 1 && ! /XP:Z:BWA/);
	chomp;
	my @row = split(/\t/, $_);
	my $l = 0;
	my $cig = $row[5];
	$cig =~ s/(\d+)[MD]/$l+=$1/eg;
	
	# Check if alignment fits amplicon probes / primers.
	my $amp = $hash{$row[2]}{$row[3]}{$l} || next; 

	push @{$stats{$amp}{sam}}, [@row];
	$stats{$amp}{depth}++;
	if (($pamp ne ".") && ($amp ne $pamp)) {
		print STDOUT "Flushing $pamp $stats{$pamp}{depth}\n";
		my $data = &call($stats{$pamp}{sam});
		&flush($data, $stats{$pamp}{chr}, $pamp, $stats{$pamp}{assayStart}, $stats{$pamp}{assayEnd});
		$stats{$pamp}{sam} = [];
	}
	$pamp = $amp;
	$cnt++;
}

print STDOUT "Flushing $pamp $stats{$pamp}{depth}\n";
my $data = &call($stats{$pamp}{sam});
&flush($data, $stats{$pamp}{chr}, $pamp, $stats{$pamp}{assayStart}, $stats{$pamp}{assayEnd});
$stats{$pamp}{sam} = [];

close QC;
close QC2;
foreach my $pamp (keys %stats) {
		print TARGET join("\t", $pamp, map { $stats{$pamp}{$_} || "NA"} qw/chr start end assayStart assayEnd depth/) . "\n";
}
close TARGET;
close SAM;

sub call {
	my $SAM = shift;
	my $ppos = 0;
	my $pchr = "";
	my %data = ();
	my $refSeq = "";
	my $c = 0;
	my $col_chr = 2;
	my $col_read = 9;
	my $col_qual = 10;
	my $col_pos = 3;
	my $col_cigar = 5;
	my %cooc = ();
	my $cnt = 1;
	foreach my $r (0 .. $#$SAM) {
		my @row = @{$SAM->[$r]};
		my $read = $row[$col_read];
		my $qual = $row[$col_qual];
		my $pos = $row[$col_pos];
		my @cig = ();
		my $aln = 0;

		# Calculate mean read quality and skip if it's below threshold
		if ($QavgLim > 0) {
			my $QSum = 0;
			my $Qlen = length($qual);

			$QSum += ord($qual) foreach 1 .. $Qlen;

			my $Qavg = $QSum / $Qlen - 33;
			next if ($Qavg < $QavgLim);
		}

		while ($row[$col_cigar] =~ /(\d+)(\D)/g) {
			push @cig, [$1, $2];
			$aln += $1;
		}
		if ($ppos != $pos) {
			my $offset = $idx{$row[$col_chr]}->[2]; # Byte offset for chromosome
			my $nl = int (($row[$col_pos] - 1) / $idx{$row[$col_chr]}->[3]); # Newline bytes to offset
			$offset += $row[$col_pos] + $nl - 50 - 1; # Read in 50 nt prior to start of amplicon
			seek REF, $offset, 0;
			read REF, $refSeq, 350;
			$refSeq =~ s/\s//g;
			$data{$row[$col_chr]}{$row[$col_pos] + $_ - 50 + 1}{"refNT"} = substr($refSeq, $_, 1) foreach (0 .. 300);
			$ppos = $pos;
		}

		foreach my $cigE (@cig) {
			my $n = $cigE->[0];
			if ($cigE->[1] eq "I") {
				my $insert = $data{$row[$col_chr]}{$pos - 1}{"refNT"};
				$insert .= substr($read, 0, $n, "");
				my $qinsert .= substr($qual, 0, $n, "");
				push @{$data{$row[$col_chr]}{$pos - 1}{"IR"}{$insert}}, $r;
				$data{$row[$col_chr]}{$pos - 1}{"I"}{$insert}++;
			}
			elsif ($cigE->[1] eq "M") {
				my $mseq = substr($read, 0, $n, "");
				my $mqual = substr($qual, 0, $n, "");
				foreach my $nt (0 .. (length($mseq) - 1)) {
					$data{$row[$col_chr]}{$pos + $nt}{"DP"}++;
					my $ntq = substr($mqual, $nt, 1);
					my $ntn = substr($mseq, $nt, 1);
					if (ord($ntq) - 33  < $Qthreshold) {
						$ntn = "N";
					}
					$data{$row[$col_chr]}{$pos + $nt}{"NT"}{$ntn}++;
					push @{$data{$row[$col_chr]}{$pos + $nt}{"NTR"}{$ntn}}, $r if (uc($ntn) ne uc($data{$row[$col_chr]}{$pos + $nt}{refNT}));
				}
				$pos += $n;

			}
			elsif ($cigE->[1] eq "D") {
				my $deletion = join("", map { $data{$row[$col_chr]}{$pos + $_ -1}{"refNT"} } (0 .. ($n)));
				push @{$data{$row[$col_chr]}{$pos - 1}{"DR"}{$deletion}}, $r;
				$data{$row[$col_chr]}{$pos - 1}{"D"}{$deletion}++;
				$pos += $n;
			}
		}
		$cnt++;
	}
	return \%data;
}

sub coocCalc {
	my $cooc = shift;
	my $data = shift;
	my $thresh = shift;
	my %rev = ();
	# Filter low frequency variants 
	foreach my $k (keys(%$cooc)) {
		if (scalar(keys(%{$cooc->{$k}})) < $thresh) {
				delete($cooc->{$k});
				next;
		}
		print STDERR "$k : $thresh : " . join(":", sort {$a cmp $b} keys(%{$cooc->{$k}})) . "\n";
	}
	my @muts = keys(%{$cooc});
	foreach my $i (0 .. $#muts) {
		foreach my $j (($i + 1) .. $#muts) {
			my $score = 0;
			foreach my $m (keys(%{$cooc->{$muts[$i]}})) {
				if (exists($cooc->{$muts[$j]}{$m})) {
					$score++;	
				}
			}
			print STDERR $muts[$i] .":". $muts[$j] . ":" . $score . "\n"; sleep 1;
		}	
	}		

	sleep 1;
}

sub flush {
        my $data = shift;
        my $chr = shift;
	my $amp = shift;
	my $s = shift;
	my $e = shift;
        return unless (exists($data->{$chr}));
        print STDOUT "printing ... ";
	my %reads = (); # read->pos->nt
	foreach my $p ($s .. $e) {
	        my $posdat = $data->{$chr}{$p} || next;
	        next unless (exists($posdat->{DP}));
                my $refNT = uc($posdat->{refNT});
                my @varOrd = sort { $posdat->{NT}{$b} <=> $posdat->{NT}{$a} } grep {$_ ne "N"} grep { $_ ne $refNT  } keys(%{$posdat->{NT}});
                my $seqNT = join("/", @varOrd);
		my $refCNT = $posdat->{NT}{$refNT} || 0;
		my $varCNT = 0;
		$varCNT = ($varOrd[0])?$posdat->{NT}{$varOrd[0]}:0;
		my $Istr = ".";
		my $Dstr = ".";
		if ($varOrd[0]) {
			$reads{$_}{$p} = "$refNT/$varOrd[0]" foreach (@{$posdat->{NTR}{$varOrd[0]}});
		}
		if (exists($posdat->{I})) {
			$Istr = join("|", map { $_ . ":" . $posdat->{I}{$_} } keys(%{$posdat->{I}}));
			foreach my $I (keys(%{$posdat->{IR}})) {
				$reads{$_}{$p} = "$refNT/$I" foreach (@{$posdat->{IR}{$I}});
			}
		}
		if (exists($posdat->{D})) {
			$Dstr = join("|", map { $_ . ":" . $posdat->{D}{$_} } keys(%{$posdat->{D}}));
			foreach my $D (keys(%{$posdat->{DR}})) {
				$reads{$_}{$p} = "$D/$refNT" foreach (@{$posdat->{DR}{$D}});
			}
		}
		print QC join("\t", $chr, $p, $amp,
				$posdat->{DP} || 0,
				$posdat->{refNT} || ".",
				$seqNT || ".",
				$refCNT, $varCNT,
				(map { $posdat->{NT}{$_} || 0 } qw(A C G T N)), $Istr, $Dstr,
			  ) . "\n";
	}
	delete $data->{$chr};
	foreach my $r (sort { $a <=> $b } keys(%reads)) {
		print QC2 join("\t", $r, $chr, $_, $amp, $reads{$r}{$_}) . "\n" foreach (sort {$a <=> $b} keys(%{$reads{$r}})); 
	}
	print STDOUT "done\n";
}