Mercurial > repos > bcrain-completegenomics > testing1
changeset 2:3a4894be7df2 draft
Uploaded
author | bcrain-completegenomics |
---|---|
date | Thu, 24 May 2012 15:16:40 -0400 |
parents | fec197eb2f00 |
children | a2b8590be75e |
files | Calculate_TestVariants_Variant_Frequencies_0_1_0.pl |
diffstat | 1 files changed, 271 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl Thu May 24 15:16:40 2012 -0400 @@ -0,0 +1,271 @@ +#!/usr/bin/perl +use strict; +#use feature "say"; +#use File::Basename; +$| = 1; + +# Get_TestVariants_Variant_Frequencies +# Calculate the frequencies of variants in a testvariants output file +# Two values calculated: +# Frequency vs all alleles +# Frequency vs called alleles + +# Input is a testvariants file +# Outputs: +# All data to *-Freq.tsv, including scores and quals +# vars and freqs to *-Freq_Short.tsv +# Exceptions to *-Freq_Log +# Stats to *-Freq_Stats + +# Format: +# perl prog file dir +# ie +# perl Get_TestVariants_Variant_Frequencies \ +# --Input input_file \ +# --First_Genome_Field_Nr col_nr1 \ +# --Last_Genome_Field_Nr col_nr2 +# --Output1 output1 \ +# --Output2 output2 + +# eg +# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \ +# --Input /data/Family_Quartet_testvariants.tsv \ +# --First_Genome_Field_Nr 9 \ +# --Last_Genome_Field_Nr 11 +# --Output1 output1 \ +# --Output2 output2 + + +# Rick Tearle 2010-11 + +my $Time -= time; # start time +my $Debug = 0; + +# Parsing and storing input parameters +# Only childfields can be repeated +print "$0 @ARGV\nProcessing input parameters\n"; +my %ExpectedParams = GetExpectedParams (); +my %EnteredParams = GetEnteredParams (); + +# Setting up prog paras from input paras +my $FileIn = $EnteredParams{input}; +unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file +#my $FileOut = $EnteredParams{output}; # +#$DirectoryOut =~ s/\/$//; # remove trailing slash if present +#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file +#print "$FileIn\n$DirectoryOut\n"; +#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions + +# my $FileOut1 = $FileOut."-Freq.tsv"; +# my $FileOut2 = $FileOut."-Freq_Short.tsv"; +# my $FileOut3 = $FileOut."-Freq_Stats.tsv"; +# my $FileOut4 = $FileOut."-Freq_Log.tsv"; + +print "\nOpening Input File:\n\t$FileIn\n"; +my $IN = OpenFile ($FileIn); # open the file with correct file format + +#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit; +open my $OUT1, ">", $EnteredParams{output1}; +open my $OUT2, ">", $EnteredParams{output2}; +#open my $OUT3, ">", $EnteredParams{output3}; +#open my $OUT4, ">", $EnteredParams{output4}; + +# Get col header and genomes fields +my $ColHeader = <$IN>; # get col header +chomp $ColHeader; +my @ColHeader = split /\t/, $ColHeader; +my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based +my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based +if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";} +if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";} +if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";} +if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";} +if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";} +my $NrGenomes = $StopGenomes - $StartGenomes + 1; +#print "$StartGenomes\t$StopGenomes\n"; #exit; +#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n"; +#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n"; + +# print column headers +print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n"; +print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n"; +print join("\t",@ColHeader),"\n"; +print "First Genome Field: $ColHeader[$StartGenomes]\n"; +print "Last Genome Field: $ColHeader[$StopGenomes]\n"; +print "Nr Genomes: $NrGenomes\n\n"; + +print "\nProcessing Variants....\n"; +my $VariantCount = 0; # variant locus counter, not used +my %AllFreqCounts; # storing histogram of all freq counts +my %CalledFreqCounts; # storing histogram of called freq counts +my $Warnings; +while (<$IN>) +{ + # testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM] + my $Line = $_; # save line for output below + chomp $Line; + my @F = split /\t/, $Line; # split in to fields + $VariantCount++; # increment variant counter + my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s + my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s + my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s + my $CountN = () = $UseFields =~ /N/g; # count the number of Ns + my $NrAlleles = $Count1 + $Count0 + $CountN; # total count + unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus + { + print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning + #print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n"; + #if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings + } + my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles + my $CalledFreq = sprintf("%0.3f",0); + if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any + $AllFreqCounts{$AllFreq}++; # increment all freq histogram + $CalledFreqCounts{$CalledFreq}++; # increment called freq histogram + #print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit; + print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var + print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var + #exit if $VariantCount > 20; +} +close $OUT1; +close $OUT2; + +# Print frequency histograms +print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header +foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";} + +print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header +foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";} + +$Time += time; +print "\ntime $Time\n"; + +########################################################################### +# SUBS # +########################################################################### + +sub GetExpectedParams +{ + my %Hash = + ( + "input" => -1, + "output_dir" => -1, + ); # store parameters and values + return %Hash; +} + +sub GetEnteredParams +{ + # Processing @ARGV + my %Hash; + + my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array + #print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug; + #print "Key\tVal\n" if $Debug; #exit; + for my $n (1..$#ARGVs) # parse each + { + $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces + my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val + $Key = lc $Key; + $Hash{$Key} = $Val; # make a hash entry out of key and val + #print "$Key\t$EnteredParams{$Key}\n" if $Debug; + } + #print int(keys %Hash),"\n" if $Debug; + #foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";} + #print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug; + #exit if $Debug; + return %Hash; # hash now has each -- entry param, with associated values +} + +sub SaveArrayAsString +{ + my $FH = shift; + my $Fields = shift; + #print "$Fields\n"; + print $FH join("\t",@$Fields),"\n"; +} + +sub ConcatenateVariants +{ + my $ArrayIn = shift; # ptr to array + my $StateFieldNr = shift; # field to process + #print int(@$ArrayIn),"\n"; + my @ArrayOut; # array to store records out + my $Nr = -1; + foreach my $Entry (@$ArrayIn) + { + } + return \@ArrayOut; # return ptr to array +} + +sub LoadStateRecord +{ + my $Out = shift; + my $In = shift; + my $StateFieldNr = shift; + + $Out->{State} = $$In[$StateFieldNr]; # get state for new record + $Out->{Chr} = $$In[1]; # get chr + $Out->{Begin} = $$In[2]; # get begin of state range + $Out->{End} = $$In[3]; # get current end of state range + $Out->{Records}++; # record added to new count +} + +sub OpenFile +{ + my $File = shift; + my $FH; + open ($FH, "$File") or die ("$!: can't open file $File"); + return $FH; +} + +sub OpenFileold +{ + my $File = shift; + my $FH; + + if ($File =~ /.bz2$/) + { + open ($FH, "bzcat $File |") or die ("$!: can't open file $File"); + } + elsif ($File =~ /.gz$/) + { + open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File"); + } + elsif ($File =~ /.tsv$/) + { + open ($FH, "cat $File |") or die ("$!: can't open file $File"); + } + else + { + die ("$!: do not recognise file type $File"); + } + return $FH; +} + +sub LoadNewRecord +{ + my $In = shift; + my $Out = shift; + $Out->{Chr} = $In->{Chr}; + $Out->{State} = $In->{State}; + $Out->{Begin} = $In->{Begin}; + $Out->{End} = $In->{End}; + $Out->{Records} = $In->{Records}; +} + +sub NewStateRecord +{ + my $Record = + { + Chr => "", + Begin => -1, + End => -1, + State => "", + Records => 0, + MIEs => 0, + StateErrors => 0, + Length => -1, + }; + return $Record; +}