changeset 2:3a4894be7df2 draft

Uploaded
author bcrain-completegenomics
date Thu, 24 May 2012 15:16:40 -0400
parents fec197eb2f00
children a2b8590be75e
files Calculate_TestVariants_Variant_Frequencies_0_1_0.pl
diffstat 1 files changed, 271 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl	Thu May 24 15:16:40 2012 -0400
@@ -0,0 +1,271 @@
+#!/usr/bin/perl
+use strict;
+#use feature "say";
+#use File::Basename;
+$| = 1;
+
+# Get_TestVariants_Variant_Frequencies
+# Calculate the frequencies of variants in a testvariants output file
+# Two values calculated:
+#	Frequency vs all alleles
+# 	Frequency vs called alleles
+
+# Input is a testvariants file
+# Outputs:
+# All data to *-Freq.tsv, including scores and quals
+# vars and freqs to *-Freq_Short.tsv
+# Exceptions to *-Freq_Log
+# Stats to *-Freq_Stats
+
+# Format:
+# perl prog file dir
+# ie
+# perl Get_TestVariants_Variant_Frequencies \
+# --Input input_file \
+# --First_Genome_Field_Nr col_nr1 \
+# --Last_Genome_Field_Nr col_nr2
+# --Output1 output1 \
+# --Output2 output2 
+
+# eg
+# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \
+# --Input /data/Family_Quartet_testvariants.tsv \
+# --First_Genome_Field_Nr 9 \
+# --Last_Genome_Field_Nr 11
+# --Output1 output1 \
+# --Output2 output2 
+
+
+# Rick Tearle 2010-11
+
+my $Time -= time; # start time
+my $Debug = 0;
+
+# Parsing and storing input parameters
+# Only childfields can be repeated
+print "$0 @ARGV\nProcessing input parameters\n";
+my %ExpectedParams =  GetExpectedParams ();
+my %EnteredParams = GetEnteredParams ();
+
+# Setting up prog paras from input paras
+my $FileIn = $EnteredParams{input};
+unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file
+#my $FileOut = $EnteredParams{output}; #
+#$DirectoryOut =~ s/\/$//; # remove trailing slash if present
+#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file
+#print "$FileIn\n$DirectoryOut\n";
+#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions
+
+# my $FileOut1 = $FileOut."-Freq.tsv";
+# my $FileOut2 = $FileOut."-Freq_Short.tsv";
+# my $FileOut3 = $FileOut."-Freq_Stats.tsv";
+# my $FileOut4 = $FileOut."-Freq_Log.tsv";
+
+print "\nOpening Input File:\n\t$FileIn\n";
+my $IN = OpenFile ($FileIn); # open the file with correct file format
+
+#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit;
+open my $OUT1, ">", $EnteredParams{output1};
+open my $OUT2, ">", $EnteredParams{output2};
+#open my $OUT3, ">", $EnteredParams{output3};
+#open my $OUT4, ">", $EnteredParams{output4};
+
+# Get col header and genomes fields
+my $ColHeader = <$IN>; # get col header
+chomp $ColHeader;
+my @ColHeader = split /\t/, $ColHeader;
+my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
+my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
+if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";}
+if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";}
+if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";}
+if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";}
+if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";}
+my $NrGenomes = $StopGenomes - $StartGenomes + 1;
+#print "$StartGenomes\t$StopGenomes\n"; #exit;
+#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n";
+#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n";
+
+# print column headers
+print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n";
+print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n";
+print join("\t",@ColHeader),"\n";
+print "First Genome Field: $ColHeader[$StartGenomes]\n";
+print "Last Genome Field: $ColHeader[$StopGenomes]\n";
+print "Nr Genomes: $NrGenomes\n\n";
+
+print "\nProcessing Variants....\n";
+my $VariantCount = 0; # variant locus counter, not used
+my %AllFreqCounts; # storing histogram of all freq counts
+my %CalledFreqCounts; # storing histogram of called freq counts
+my $Warnings;
+while (<$IN>)
+{
+	# testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM]
+	my $Line = $_; # save line for output below
+    chomp $Line;
+    my @F = split /\t/, $Line; # split in to fields
+    $VariantCount++; # increment variant counter
+	my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s
+	my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s
+	my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s
+	my $CountN = () = $UseFields =~ /N/g; # count the number of Ns
+	my $NrAlleles = $Count1 + $Count0 + $CountN; # total count
+	unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus
+	{
+		print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning
+		#print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n";
+		#if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings
+	}
+	my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles
+	my $CalledFreq = sprintf("%0.3f",0);
+	if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any
+	$AllFreqCounts{$AllFreq}++; # increment all freq histogram
+	$CalledFreqCounts{$CalledFreq}++; # increment called freq histogram
+	#print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit;
+	print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var
+	print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var
+	#exit if $VariantCount > 20;
+}
+close $OUT1;
+close $OUT2;
+
+# Print frequency histograms
+print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header
+foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";}
+
+print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header
+foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";}
+
+$Time += time;
+print "\ntime $Time\n";
+
+###########################################################################
+#                                   SUBS                                  #
+###########################################################################
+
+sub GetExpectedParams
+{
+	my %Hash =
+	(
+	"input" => -1,
+	"output_dir" => -1,
+	); # store parameters and values
+	return %Hash;
+}
+
+sub GetEnteredParams
+{
+	# Processing @ARGV
+	my %Hash;
+
+	my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array
+	#print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug;
+	#print "Key\tVal\n" if $Debug; #exit;
+	for my $n (1..$#ARGVs) # parse each
+	{
+		$ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces
+		my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val
+		$Key = lc $Key;
+		$Hash{$Key} = $Val; # make a hash entry out of key and val
+		#print "$Key\t$EnteredParams{$Key}\n" if $Debug;
+	}
+		#print int(keys %Hash),"\n" if $Debug;
+		#foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";}
+		#print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug;
+		#exit if $Debug;
+	return %Hash; # hash now has each -- entry param, with associated values
+}
+
+sub SaveArrayAsString
+{
+	my $FH = shift;
+	my $Fields = shift;
+	#print "$Fields\n";
+	print $FH join("\t",@$Fields),"\n";
+}
+
+sub ConcatenateVariants
+{
+    my $ArrayIn = shift; # ptr to array
+    my $StateFieldNr = shift; # field to process
+    #print int(@$ArrayIn),"\n";
+    my @ArrayOut; # array to store records out
+    my $Nr = -1;
+    foreach my $Entry (@$ArrayIn)
+    {
+	}
+    return \@ArrayOut; # return ptr to array
+}
+
+sub LoadStateRecord
+{
+	my $Out = shift;
+	my $In = shift;
+	my $StateFieldNr = shift;
+
+				$Out->{State} = $$In[$StateFieldNr]; # get state for new record
+				$Out->{Chr} = $$In[1]; # get chr
+				$Out->{Begin} = $$In[2]; # get begin of state range
+				$Out->{End} = $$In[3]; # get current end of state range
+				$Out->{Records}++; # record added to new count
+}
+
+sub OpenFile
+{
+    my $File = shift;
+    my $FH;
+		open ($FH, "$File") or die ("$!: can't open file $File");
+    return $FH;
+}
+
+sub OpenFileold
+{
+    my $File = shift;
+    my $FH;
+
+    if ($File =~ /.bz2$/)
+    {
+		open ($FH, "bzcat $File |") or die ("$!: can't open file $File");
+    }
+    elsif ($File =~ /.gz$/)
+    {
+		open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File");
+    }
+    elsif ($File =~ /.tsv$/)
+    {
+		open ($FH, "cat $File |") or die ("$!: can't open file $File");
+    }
+    else
+    {
+		die ("$!: do not recognise file type $File");
+    }
+    return $FH;
+}
+
+sub LoadNewRecord
+{
+    my $In = shift;
+    my $Out = shift;
+    $Out->{Chr} = $In->{Chr};
+    $Out->{State} = $In->{State};
+    $Out->{Begin} = $In->{Begin};
+    $Out->{End} = $In->{End};
+    $Out->{Records} = $In->{Records};
+}
+
+sub NewStateRecord
+{
+    my $Record =
+    {
+	Chr => "",
+        Begin => -1,
+        End => -1,
+		State => "",
+        Records => 0,
+		MIEs => 0,
+		StateErrors => 0,
+        Length => -1,
+    };
+    return $Record;
+}