Mercurial > repos > bcrain-completegenomics > testing1
changeset 4:cd57fc5f9ca0 draft
Deleted selected files
author | bcrain-completegenomics |
---|---|
date | Wed, 06 Jun 2012 14:28:03 -0400 |
parents | a2b8590be75e |
children | 58a08658f3e3 |
files | Calculate_TestVariants_Variant_Frequencies.xml Calculate_TestVariants_Variant_Frequencies_0_1_0.pl |
diffstat | 2 files changed, 0 insertions(+), 338 deletions(-) [+] |
line wrap: on
line diff
--- a/Calculate_TestVariants_Variant_Frequencies.xml Thu May 24 15:17:00 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ -<tool id="pl_calculatefreq" name="Calculate_Variant_Frequencies" version="0.0.1"> - - <description>in cgatools-testvariants file</description> <!--adds description in toolbar--> - - <command interpreter="perl"> - Calculate_TestVariants_Variant_Frequencies_0_1_0.pl - --Input $input - --First_Genome_Field_Nr $first_col - --Last_Genome_Field_Nr $last_col - --Output1 $output1 - --Output2 $output2 - </command> - - <outputs> - <data format="tabular" name="output1" label="TestVariant_Frequencies on "/> - <data format="tabular" name="output2" label="TestVariant_Frequencies_Short on "/> - </outputs> - - <inputs> - <param name="input" type="data" format="tabular" label="TestVariants input file"> - <validator type="unspecified_build" /> - </param> - <param name="first_col" type="text" label="What column number is the first genome"/> - <param name="last_col" type="text" label="What column number is the last genome"/> - </inputs> - - - <help> - -**What it does** - -This tool calculates the allele frequencies for all variants present in the testvariant file. - ------ - -**Instructions**:: - - Calculate the frequencies of variants in a testvariants output file - Two values calculated: - Frequency vs all alleles - Frequency vs called alleles - - Input: testvariants file - Outputs: - All data to *-Freq.tsv, including scores and quals - vars and freqs to *-Freq_Short.tsv - Exceptions to *-Freq_Log - Stats to *-Freq_Stats - - - perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \ - --Input input_file \ - --First_Genome_Field_Nr col_nr1 \ - --Last_Genome_Field_Nr col_nr2 - --Output1 output1 \ - --Output2 output_short \ - eg - perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \ - --Input /data/Family_Quartet_testvariants.tsv \ - --Output /data/Family_Quartet_testvariants - --First_Genome_Field_Nr 9 \ - --Last_Genome_Field_Nr 11 - --Output1 /data/Family_Quartet_testvariants - --Output2 /data/Family_Quartet_testvariants_short - - </help> -</tool>
--- a/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl Thu May 24 15:17:00 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,271 +0,0 @@ -#!/usr/bin/perl -use strict; -#use feature "say"; -#use File::Basename; -$| = 1; - -# Get_TestVariants_Variant_Frequencies -# Calculate the frequencies of variants in a testvariants output file -# Two values calculated: -# Frequency vs all alleles -# Frequency vs called alleles - -# Input is a testvariants file -# Outputs: -# All data to *-Freq.tsv, including scores and quals -# vars and freqs to *-Freq_Short.tsv -# Exceptions to *-Freq_Log -# Stats to *-Freq_Stats - -# Format: -# perl prog file dir -# ie -# perl Get_TestVariants_Variant_Frequencies \ -# --Input input_file \ -# --First_Genome_Field_Nr col_nr1 \ -# --Last_Genome_Field_Nr col_nr2 -# --Output1 output1 \ -# --Output2 output2 - -# eg -# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \ -# --Input /data/Family_Quartet_testvariants.tsv \ -# --First_Genome_Field_Nr 9 \ -# --Last_Genome_Field_Nr 11 -# --Output1 output1 \ -# --Output2 output2 - - -# Rick Tearle 2010-11 - -my $Time -= time; # start time -my $Debug = 0; - -# Parsing and storing input parameters -# Only childfields can be repeated -print "$0 @ARGV\nProcessing input parameters\n"; -my %ExpectedParams = GetExpectedParams (); -my %EnteredParams = GetEnteredParams (); - -# Setting up prog paras from input paras -my $FileIn = $EnteredParams{input}; -unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file -#my $FileOut = $EnteredParams{output}; # -#$DirectoryOut =~ s/\/$//; # remove trailing slash if present -#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file -#print "$FileIn\n$DirectoryOut\n"; -#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions - -# my $FileOut1 = $FileOut."-Freq.tsv"; -# my $FileOut2 = $FileOut."-Freq_Short.tsv"; -# my $FileOut3 = $FileOut."-Freq_Stats.tsv"; -# my $FileOut4 = $FileOut."-Freq_Log.tsv"; - -print "\nOpening Input File:\n\t$FileIn\n"; -my $IN = OpenFile ($FileIn); # open the file with correct file format - -#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit; -open my $OUT1, ">", $EnteredParams{output1}; -open my $OUT2, ">", $EnteredParams{output2}; -#open my $OUT3, ">", $EnteredParams{output3}; -#open my $OUT4, ">", $EnteredParams{output4}; - -# Get col header and genomes fields -my $ColHeader = <$IN>; # get col header -chomp $ColHeader; -my @ColHeader = split /\t/, $ColHeader; -my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based -my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based -if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";} -if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";} -if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";} -if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";} -if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";} -my $NrGenomes = $StopGenomes - $StartGenomes + 1; -#print "$StartGenomes\t$StopGenomes\n"; #exit; -#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n"; -#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n"; - -# print column headers -print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n"; -print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n"; -print join("\t",@ColHeader),"\n"; -print "First Genome Field: $ColHeader[$StartGenomes]\n"; -print "Last Genome Field: $ColHeader[$StopGenomes]\n"; -print "Nr Genomes: $NrGenomes\n\n"; - -print "\nProcessing Variants....\n"; -my $VariantCount = 0; # variant locus counter, not used -my %AllFreqCounts; # storing histogram of all freq counts -my %CalledFreqCounts; # storing histogram of called freq counts -my $Warnings; -while (<$IN>) -{ - # testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM] - my $Line = $_; # save line for output below - chomp $Line; - my @F = split /\t/, $Line; # split in to fields - $VariantCount++; # increment variant counter - my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s - my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s - my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s - my $CountN = () = $UseFields =~ /N/g; # count the number of Ns - my $NrAlleles = $Count1 + $Count0 + $CountN; # total count - unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus - { - print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning - #print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n"; - #if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings - } - my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles - my $CalledFreq = sprintf("%0.3f",0); - if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any - $AllFreqCounts{$AllFreq}++; # increment all freq histogram - $CalledFreqCounts{$CalledFreq}++; # increment called freq histogram - #print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit; - print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var - print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var - #exit if $VariantCount > 20; -} -close $OUT1; -close $OUT2; - -# Print frequency histograms -print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header -foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";} - -print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header -foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";} - -$Time += time; -print "\ntime $Time\n"; - -########################################################################### -# SUBS # -########################################################################### - -sub GetExpectedParams -{ - my %Hash = - ( - "input" => -1, - "output_dir" => -1, - ); # store parameters and values - return %Hash; -} - -sub GetEnteredParams -{ - # Processing @ARGV - my %Hash; - - my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array - #print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug; - #print "Key\tVal\n" if $Debug; #exit; - for my $n (1..$#ARGVs) # parse each - { - $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces - my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val - $Key = lc $Key; - $Hash{$Key} = $Val; # make a hash entry out of key and val - #print "$Key\t$EnteredParams{$Key}\n" if $Debug; - } - #print int(keys %Hash),"\n" if $Debug; - #foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";} - #print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug; - #exit if $Debug; - return %Hash; # hash now has each -- entry param, with associated values -} - -sub SaveArrayAsString -{ - my $FH = shift; - my $Fields = shift; - #print "$Fields\n"; - print $FH join("\t",@$Fields),"\n"; -} - -sub ConcatenateVariants -{ - my $ArrayIn = shift; # ptr to array - my $StateFieldNr = shift; # field to process - #print int(@$ArrayIn),"\n"; - my @ArrayOut; # array to store records out - my $Nr = -1; - foreach my $Entry (@$ArrayIn) - { - } - return \@ArrayOut; # return ptr to array -} - -sub LoadStateRecord -{ - my $Out = shift; - my $In = shift; - my $StateFieldNr = shift; - - $Out->{State} = $$In[$StateFieldNr]; # get state for new record - $Out->{Chr} = $$In[1]; # get chr - $Out->{Begin} = $$In[2]; # get begin of state range - $Out->{End} = $$In[3]; # get current end of state range - $Out->{Records}++; # record added to new count -} - -sub OpenFile -{ - my $File = shift; - my $FH; - open ($FH, "$File") or die ("$!: can't open file $File"); - return $FH; -} - -sub OpenFileold -{ - my $File = shift; - my $FH; - - if ($File =~ /.bz2$/) - { - open ($FH, "bzcat $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.gz$/) - { - open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.tsv$/) - { - open ($FH, "cat $File |") or die ("$!: can't open file $File"); - } - else - { - die ("$!: do not recognise file type $File"); - } - return $FH; -} - -sub LoadNewRecord -{ - my $In = shift; - my $Out = shift; - $Out->{Chr} = $In->{Chr}; - $Out->{State} = $In->{State}; - $Out->{Begin} = $In->{Begin}; - $Out->{End} = $In->{End}; - $Out->{Records} = $In->{Records}; -} - -sub NewStateRecord -{ - my $Record = - { - Chr => "", - Begin => -1, - End => -1, - State => "", - Records => 0, - MIEs => 0, - StateErrors => 0, - Length => -1, - }; - return $Record; -}