view calc_cnsegements_titan.pl @ 0:c6e48c6ac606 draft default tip

planemo upload for repository https://github.com/morinlab/tools-morinlab/tree/master/tools/calc_cnsegments commit 4ef2d91b7c1686a2696b92fe538d4aec51d05e40-dirty
author morinlab
date Tue, 11 Oct 2016 14:16:32 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl -w
=head1 NAME        

createTITANsegmentfiles.pl

-head1 SYNOPSIS

=head1 OPTIONS 
    -id <string>
    -outfile|o <string>
    -symmetric|s <boolean>  {1 - true; 0 - false}; Default: 1
    -outIGV|igv <string> Required: 
    -infile|i <string>   Required: TITAN *titan.txt file
  
=head1 DESCRIPTION

=head1 CONTACT

Gavin Ha <gha@bccrc.ca>

=cut

use strict;
use List::Util qw[min max];
use File::Basename;
use Getopt::Long;


my $version = '0.1.3';

sub usage () {
    exec('perldoc', $0);
    exit;
}

my ($id, $outfile, $symmetric, $outIGV, $infile, $calls, $help);
$id=0;
$calls = 1;
$symmetric = 1;
GetOptions (
		'id=s' => \$id,
	    'infile|i=s' => \$infile,
	    'symmetric|s=i' => \$symmetric,
        'outfile|o=s' => \$outfile,
        'outIGV|igv=s' => \$outIGV,
        'help|?' => \$help
            );

if($help) {
    &usage();
}
print "Parameters:\nid=$id\ninfile=$infile\noutfile=$outfile\noutIGV=$outIGV\n";

my ($name,$path,$suffix) = fileparse($infile);
#my @jnk;
#($id,@jnk) = split(/\_/,$name) if ~defined($id);
my $nameOut = $id;

open OUTFILE, ">$outfile" || die("Cannot write to $outfile\n");
open IGV, ">$outIGV" || die("Cannot write to $outIGV\n");
print OUTFILE "Sample\tChromosome\tStart_Position(bp)\tEnd_Position(bp)\tLength(bp)\tMedian_Ratio\tMedian_logR\tTITAN_state\tTITAN_call\tCopy_Number\tMinorCN\tMajorCN\tClonal_Cluster\tClonal_Frequency\n";
print IGV "sample\tchr\tstart\tend\tnum.mark\tmedian\n";
open(SEGFILE, $infile) || die("Can't open $infile!\n");
my $header = <SEGFILE>;
my $line = <SEGFILE>; chomp($line);
my($chr, $start, $ref, $nRef, $N, $ratio, $logR, $cn, $state, $call, $clust, $clustFreq, @rest) = split(/\t/,$line);
my $end = $start;
my @totalRatio=();
my @totalLogR=();
push(@totalRatio,max($ratio,1-$ratio));
push(@totalLogR,$logR);
while($line=<SEGFILE>) {
chomp($line);
my ($chrS, $startS, $refS, $nRefS, $NS, $ratioS, $logRS, $cnS, $stateS, $callS, $clustS, $clustFreqS, @rest) = split(/\t/,$line);	
if ($chrS ne $chr || $callS ne $call || $clustS ne $clust){
	my $medianRatio = max($ratio,1-$ratio); #1 position only
	my $medianLogR = $logR;
	$medianRatio = median(@totalRatio) if (($end-$start+1)>1); #more than 1 position
	$medianLogR = median(@totalLogR) if (($end-$start+1)>1);
	my @majmin;
	if ($symmetric == 0){
		@majmin = &getMinorMajorCN($state);
	}elsif ($symmetric == 1){
		@majmin = &getMinorMajorCNSymmetric($state);
	}
	my $output = $nameOut . "\t" . $chr . "\t" . $start . "\t" . $end . "\t" . ($end-$start+1). "\t" . sprintf("%.2f",$medianRatio) . "\t" . sprintf("%.2f",$medianLogR);	
	$output = $output . "\t" .  $state . "\t" . $call if ($calls);
	$output = $output . "\t" . $cn . "\t" . $majmin[0] . "\t" . $majmin[1] . "\t" . $clust . "\t" . $clustFreq;
	print OUTFILE $output . "\n";
	my $output2 = $nameOut . "\t" . $chr . "\t" . $start . "\t" . $end . "\t" . scalar(@totalRatio). "\t" . sprintf("%.2f",$medianLogR);
	print IGV $output2 . "\n";
	#reset
	@totalRatio = ();
	@totalLogR = ();
	$start = $startS; 
}else{
	
	#push(@totalRatio,max($ratioS,1-$ratioS));
	#assign current state to previous variables
	#new end, but start still the same
}
($chr, $end, $ref, $nRef, $N, $ratio, $logR, $cn, $state, $call, $clust, $clustFreq) = ($chrS, $startS, $refS, $nRefS, $NS, $ratioS, $logRS, $cnS, $stateS, $callS, $clustS, $clustFreqS);
push(@totalRatio,max($ratioS,1-$ratioS));
push(@totalLogR,$logRS);
}

#final output
my $medianRatio = max($ratio,1-$ratio); #1 position only
$medianRatio = median(@totalRatio) if (($end-$start+1)>1); #more than 1 position
my $medianLogR = $logR;
$medianLogR = median(@totalLogR) if (($end-$start+1)>1);
my $output = $nameOut . "\t" . $chr . "\t" . $start . "\t" . $end . "\t" . ($end-$start+1). "\t" . sprintf("%.4f",$medianRatio) . "\t" . sprintf("%.2f",$medianLogR);	
my @majmin;
if ($symmetric == 0){
		@majmin = &getMinorMajorCN($state);
}elsif ($symmetric == 1){
		@majmin = &getMinorMajorCNSymmetric($state);
}
$output = $output . "\t" .  $state . "\t" . $call if ($calls);
$output = $output . "\t" . $cn . "\t" . $majmin[0] . "\t" . $majmin[1] . "\t" . $clust . "\t" . $clustFreq;
print OUTFILE $output . "\n";
my $output2 = $nameOut . "\t" . $chr . "\t" . $start . "\t" . $end . "\t" . scalar(@totalRatio). "\t" . sprintf("%.2f",$medianLogR);
print IGV $output2 . "\n";
close OUTFILE;
close IGV;
#my $returnCode = `sort -k2,2n -k3,3n $outfile\.tmp > $outfile; rm $outfile\.tmp;`;
close SEGFILE;


sub median { 
	my (@array_ref) = @_; 
	my $count = scalar @array_ref; 
	# Sort a COPY of the array, leaving the original untouched 
	my @array = sort { $a <=> $b } @array_ref; 
	if ($count % 2) { 
		return $array[int($count/2)]; 
	} else { 
		return ($array[$count/2] + $array[$count/2 - 1]) / 2; 
	} 
} 


sub getMinorMajorCN {
	my ($state) = @_;
	my @majmin; $majmin[0]="NA"; $majmin[1]="NA";
	if ($state==0){
		$majmin[0] = 0; $majmin[1] = 0;
	}elsif ($state==1 || $state==2){
		$majmin[0] = 0; $majmin[1] = 1;
	}elsif($state==4){
		$majmin[0] = 1; $majmin[1] = 1;
	}elsif ($state==3 || $state==5){
		$majmin[0] = 0; $majmin[1] = 2;
	}elsif ($state==6 || $state==9){
		$majmin[0] = 0; $majmin[1] = 3;
	}elsif ($state==10 || $state==14){
		$majmin[0] = 0; $majmin[1] = 4;
	}elsif ($state==15 || $state==20){
		$majmin[0] = 0; $majmin[1] = 5;
	}elsif ($state==7 || $state==8){
		$majmin[0] = 1; $majmin[1] = 2;
	}elsif ($state==11 || $state==13){
		$majmin[0] = 1; $majmin[1] = 3;
	}elsif ($state==16 || $state==19){
		$majmin[0] = 1; $majmin[1] = 4;
	}elsif ($state==17 || $state==18){
		$majmin[0] = 2; $majmin[1] = 3;
	}elsif ($state==12){
		$majmin[0] = 2; $majmin[1] = 2;
	}
	#my $retVal = ($majmin[0],$majmin[1]);	
	return @majmin;
}

sub getMinorMajorCNSymmetric {
	my ($state) = @_;
	my @majmin; $majmin[0]="NA"; $majmin[1]="NA";
	if ($state==0){
		$majmin[0] = 0; $majmin[1] = 0;
	}elsif ($state==1){
		$majmin[0] = 0; $majmin[1] = 1;
	}elsif($state==2){
		$majmin[0] = 0; $majmin[1] = 2;
	}elsif ($state==3){
		$majmin[0] = 1; $majmin[1] = 1;
	}elsif ($state==4){
		$majmin[0] = 0; $majmin[1] = 3;
	}elsif ($state==5){
		$majmin[0] = 1; $majmin[1] = 2;
	}elsif ($state==6){
		$majmin[0] = 0; $majmin[1] = 4;
	}elsif ($state==7){
		$majmin[0] = 1; $majmin[1] = 3;
	}elsif ($state==8){
		$majmin[0] = 2; $majmin[1] = 2;
	}elsif ($state==9){
		$majmin[0] = 0; $majmin[1] = 5;
	}elsif ($state==10){
		$majmin[0] = 1; $majmin[1] = 4;
	}elsif ($state==11){
		$majmin[0] = 2; $majmin[1] = 3;
	}elsif ($state==12){
		$majmin[0] = 0; $majmin[1] = 6;
	}elsif ($state==13){
		$majmin[0] = 1; $majmin[1] = 5;
	}elsif ($state==14){
		$majmin[0] = 2; $majmin[1] = 4;
	}elsif ($state==15){
		$majmin[0] = 3; $majmin[1] = 3;
	}elsif ($state==16){
		$majmin[0] = 0; $majmin[1] = 7;
	}elsif ($state==17){
		$majmin[0] = 1; $majmin[1] = 6;
	}elsif ($state==18){
		$majmin[0] = 2; $majmin[1] = 5;
	}elsif ($state==19){
		$majmin[0] = 3; $majmin[1] = 4;
	}elsif ($state==20){
		$majmin[0] = 0; $majmin[1] = 8;
	}elsif ($state==21){
		$majmin[0] = 1; $majmin[1] = 7;
	}elsif ($state==22){
		$majmin[0] = 2; $majmin[1] = 6;
	}elsif ($state==23){
		$majmin[0] = 3; $majmin[1] = 5;
	}elsif ($state==24){
		$majmin[0] = 4; $majmin[1] = 4;
	}
	#my $retVal = ($majmin[0],$majmin[1]);	
	return @majmin;
}


#Read more: http://wiki.answers.com/Q/How_can_you_calculate_the_average_and_median_in_perl_by_subroutine#ixzz1LcMWdXEE
#sub median{
#	my @a = sort @_;
#	return ($a[$#a/2] + $a[@a/2]) / 2;
#}