view Calculate_TestVariants_Variant_Frequencies_0_1_0.pl @ 2:3a4894be7df2 draft

Uploaded
author bcrain-completegenomics
date Thu, 24 May 2012 15:16:40 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
use strict;
#use feature "say";
#use File::Basename;
$| = 1;

# Get_TestVariants_Variant_Frequencies
# Calculate the frequencies of variants in a testvariants output file
# Two values calculated:
#	Frequency vs all alleles
# 	Frequency vs called alleles

# Input is a testvariants file
# Outputs:
# All data to *-Freq.tsv, including scores and quals
# vars and freqs to *-Freq_Short.tsv
# Exceptions to *-Freq_Log
# Stats to *-Freq_Stats

# Format:
# perl prog file dir
# ie
# perl Get_TestVariants_Variant_Frequencies \
# --Input input_file \
# --First_Genome_Field_Nr col_nr1 \
# --Last_Genome_Field_Nr col_nr2
# --Output1 output1 \
# --Output2 output2 

# eg
# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \
# --Input /data/Family_Quartet_testvariants.tsv \
# --First_Genome_Field_Nr 9 \
# --Last_Genome_Field_Nr 11
# --Output1 output1 \
# --Output2 output2 


# Rick Tearle 2010-11

my $Time -= time; # start time
my $Debug = 0;

# Parsing and storing input parameters
# Only childfields can be repeated
print "$0 @ARGV\nProcessing input parameters\n";
my %ExpectedParams =  GetExpectedParams ();
my %EnteredParams = GetEnteredParams ();

# Setting up prog paras from input paras
my $FileIn = $EnteredParams{input};
unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file
#my $FileOut = $EnteredParams{output}; #
#$DirectoryOut =~ s/\/$//; # remove trailing slash if present
#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file
#print "$FileIn\n$DirectoryOut\n";
#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions

# my $FileOut1 = $FileOut."-Freq.tsv";
# my $FileOut2 = $FileOut."-Freq_Short.tsv";
# my $FileOut3 = $FileOut."-Freq_Stats.tsv";
# my $FileOut4 = $FileOut."-Freq_Log.tsv";

print "\nOpening Input File:\n\t$FileIn\n";
my $IN = OpenFile ($FileIn); # open the file with correct file format

#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit;
open my $OUT1, ">", $EnteredParams{output1};
open my $OUT2, ">", $EnteredParams{output2};
#open my $OUT3, ">", $EnteredParams{output3};
#open my $OUT4, ">", $EnteredParams{output4};

# Get col header and genomes fields
my $ColHeader = <$IN>; # get col header
chomp $ColHeader;
my @ColHeader = split /\t/, $ColHeader;
my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";}
if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";}
if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";}
if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";}
if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";}
my $NrGenomes = $StopGenomes - $StartGenomes + 1;
#print "$StartGenomes\t$StopGenomes\n"; #exit;
#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n";
#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n";

# print column headers
print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n";
print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n";
print join("\t",@ColHeader),"\n";
print "First Genome Field: $ColHeader[$StartGenomes]\n";
print "Last Genome Field: $ColHeader[$StopGenomes]\n";
print "Nr Genomes: $NrGenomes\n\n";

print "\nProcessing Variants....\n";
my $VariantCount = 0; # variant locus counter, not used
my %AllFreqCounts; # storing histogram of all freq counts
my %CalledFreqCounts; # storing histogram of called freq counts
my $Warnings;
while (<$IN>)
{
	# testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM]
	my $Line = $_; # save line for output below
    chomp $Line;
    my @F = split /\t/, $Line; # split in to fields
    $VariantCount++; # increment variant counter
	my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s
	my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s
	my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s
	my $CountN = () = $UseFields =~ /N/g; # count the number of Ns
	my $NrAlleles = $Count1 + $Count0 + $CountN; # total count
	unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus
	{
		print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning
		#print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n";
		#if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings
	}
	my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles
	my $CalledFreq = sprintf("%0.3f",0);
	if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any
	$AllFreqCounts{$AllFreq}++; # increment all freq histogram
	$CalledFreqCounts{$CalledFreq}++; # increment called freq histogram
	#print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit;
	print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var
	print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var
	#exit if $VariantCount > 20;
}
close $OUT1;
close $OUT2;

# Print frequency histograms
print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header
foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";}

print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header
foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";}

$Time += time;
print "\ntime $Time\n";

###########################################################################
#                                   SUBS                                  #
###########################################################################

sub GetExpectedParams
{
	my %Hash =
	(
	"input" => -1,
	"output_dir" => -1,
	); # store parameters and values
	return %Hash;
}

sub GetEnteredParams
{
	# Processing @ARGV
	my %Hash;

	my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array
	#print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug;
	#print "Key\tVal\n" if $Debug; #exit;
	for my $n (1..$#ARGVs) # parse each
	{
		$ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces
		my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val
		$Key = lc $Key;
		$Hash{$Key} = $Val; # make a hash entry out of key and val
		#print "$Key\t$EnteredParams{$Key}\n" if $Debug;
	}
		#print int(keys %Hash),"\n" if $Debug;
		#foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";}
		#print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug;
		#exit if $Debug;
	return %Hash; # hash now has each -- entry param, with associated values
}

sub SaveArrayAsString
{
	my $FH = shift;
	my $Fields = shift;
	#print "$Fields\n";
	print $FH join("\t",@$Fields),"\n";
}

sub ConcatenateVariants
{
    my $ArrayIn = shift; # ptr to array
    my $StateFieldNr = shift; # field to process
    #print int(@$ArrayIn),"\n";
    my @ArrayOut; # array to store records out
    my $Nr = -1;
    foreach my $Entry (@$ArrayIn)
    {
	}
    return \@ArrayOut; # return ptr to array
}

sub LoadStateRecord
{
	my $Out = shift;
	my $In = shift;
	my $StateFieldNr = shift;

				$Out->{State} = $$In[$StateFieldNr]; # get state for new record
				$Out->{Chr} = $$In[1]; # get chr
				$Out->{Begin} = $$In[2]; # get begin of state range
				$Out->{End} = $$In[3]; # get current end of state range
				$Out->{Records}++; # record added to new count
}

sub OpenFile
{
    my $File = shift;
    my $FH;
		open ($FH, "$File") or die ("$!: can't open file $File");
    return $FH;
}

sub OpenFileold
{
    my $File = shift;
    my $FH;

    if ($File =~ /.bz2$/)
    {
		open ($FH, "bzcat $File |") or die ("$!: can't open file $File");
    }
    elsif ($File =~ /.gz$/)
    {
		open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File");
    }
    elsif ($File =~ /.tsv$/)
    {
		open ($FH, "cat $File |") or die ("$!: can't open file $File");
    }
    else
    {
		die ("$!: do not recognise file type $File");
    }
    return $FH;
}

sub LoadNewRecord
{
    my $In = shift;
    my $Out = shift;
    $Out->{Chr} = $In->{Chr};
    $Out->{State} = $In->{State};
    $Out->{Begin} = $In->{Begin};
    $Out->{End} = $In->{End};
    $Out->{Records} = $In->{Records};
}

sub NewStateRecord
{
    my $Record =
    {
	Chr => "",
        Begin => -1,
        End => -1,
		State => "",
        Records => 0,
		MIEs => 0,
		StateErrors => 0,
        Length => -1,
    };
    return $Record;
}