changeset 4:cd57fc5f9ca0 draft

Deleted selected files
author bcrain-completegenomics
date Wed, 06 Jun 2012 14:28:03 -0400
parents a2b8590be75e
children 58a08658f3e3
files Calculate_TestVariants_Variant_Frequencies.xml Calculate_TestVariants_Variant_Frequencies_0_1_0.pl
diffstat 2 files changed, 0 insertions(+), 338 deletions(-) [+]
line wrap: on
line diff
--- a/Calculate_TestVariants_Variant_Frequencies.xml	Thu May 24 15:17:00 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,67 +0,0 @@
-<tool id="pl_calculatefreq" name="Calculate_Variant_Frequencies" version="0.0.1">
-
-  <description>in cgatools-testvariants file</description> <!--adds description in toolbar-->
-  
-  <command interpreter="perl">
-  Calculate_TestVariants_Variant_Frequencies_0_1_0.pl
-  --Input $input
-  --First_Genome_Field_Nr $first_col
-  --Last_Genome_Field_Nr $last_col
-  --Output1 $output1
-	--Output2 $output2
-  </command>
-
-  <outputs>
-  	<data format="tabular" name="output1" label="TestVariant_Frequencies on "/>
-  	<data format="tabular" name="output2" label="TestVariant_Frequencies_Short on "/>
-  </outputs>
-
-  <inputs>
-    <param name="input" type="data" format="tabular" label="TestVariants input file">
-      <validator type="unspecified_build" />
-    </param>
-    <param name="first_col" type="text" label="What column number is the first genome"/>
-    <param name="last_col" type="text" label="What column number is the last genome"/>
-  </inputs>
-
-
-  <help>
-
-**What it does**
-
-This tool calculates the allele frequencies for all variants present in the testvariant file.
-
------
-
-**Instructions**::
-	
-	Calculate the frequencies of variants in a testvariants output file
-	Two values calculated:
-		Frequency vs all alleles
-		Frequency vs called alleles
-
-	Input: testvariants file
-	Outputs:
-		All data to *-Freq.tsv, including scores and quals
-		vars and freqs to *-Freq_Short.tsv
-		Exceptions to *-Freq_Log
-		Stats to *-Freq_Stats
-
-
-	perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \
-	--Input input_file \
-	--First_Genome_Field_Nr col_nr1 \
-	--Last_Genome_Field_Nr col_nr2
-	--Output1 output1 \
-	--Output2 output_short \
-	eg
-	perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \
-	--Input /data/Family_Quartet_testvariants.tsv \
-	--Output /data/Family_Quartet_testvariants
-	--First_Genome_Field_Nr 9 \
-	--Last_Genome_Field_Nr 11
-	--Output1 /data/Family_Quartet_testvariants
-	--Output2 /data/Family_Quartet_testvariants_short
-	
-  </help>
-</tool>
--- a/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl	Thu May 24 15:17:00 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,271 +0,0 @@
-#!/usr/bin/perl
-use strict;
-#use feature "say";
-#use File::Basename;
-$| = 1;
-
-# Get_TestVariants_Variant_Frequencies
-# Calculate the frequencies of variants in a testvariants output file
-# Two values calculated:
-#	Frequency vs all alleles
-# 	Frequency vs called alleles
-
-# Input is a testvariants file
-# Outputs:
-# All data to *-Freq.tsv, including scores and quals
-# vars and freqs to *-Freq_Short.tsv
-# Exceptions to *-Freq_Log
-# Stats to *-Freq_Stats
-
-# Format:
-# perl prog file dir
-# ie
-# perl Get_TestVariants_Variant_Frequencies \
-# --Input input_file \
-# --First_Genome_Field_Nr col_nr1 \
-# --Last_Genome_Field_Nr col_nr2
-# --Output1 output1 \
-# --Output2 output2 
-
-# eg
-# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \
-# --Input /data/Family_Quartet_testvariants.tsv \
-# --First_Genome_Field_Nr 9 \
-# --Last_Genome_Field_Nr 11
-# --Output1 output1 \
-# --Output2 output2 
-
-
-# Rick Tearle 2010-11
-
-my $Time -= time; # start time
-my $Debug = 0;
-
-# Parsing and storing input parameters
-# Only childfields can be repeated
-print "$0 @ARGV\nProcessing input parameters\n";
-my %ExpectedParams =  GetExpectedParams ();
-my %EnteredParams = GetEnteredParams ();
-
-# Setting up prog paras from input paras
-my $FileIn = $EnteredParams{input};
-unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file
-#my $FileOut = $EnteredParams{output}; #
-#$DirectoryOut =~ s/\/$//; # remove trailing slash if present
-#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file
-#print "$FileIn\n$DirectoryOut\n";
-#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions
-
-# my $FileOut1 = $FileOut."-Freq.tsv";
-# my $FileOut2 = $FileOut."-Freq_Short.tsv";
-# my $FileOut3 = $FileOut."-Freq_Stats.tsv";
-# my $FileOut4 = $FileOut."-Freq_Log.tsv";
-
-print "\nOpening Input File:\n\t$FileIn\n";
-my $IN = OpenFile ($FileIn); # open the file with correct file format
-
-#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit;
-open my $OUT1, ">", $EnteredParams{output1};
-open my $OUT2, ">", $EnteredParams{output2};
-#open my $OUT3, ">", $EnteredParams{output3};
-#open my $OUT4, ">", $EnteredParams{output4};
-
-# Get col header and genomes fields
-my $ColHeader = <$IN>; # get col header
-chomp $ColHeader;
-my @ColHeader = split /\t/, $ColHeader;
-my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
-my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
-if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";}
-if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";}
-if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";}
-if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";}
-if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";}
-my $NrGenomes = $StopGenomes - $StartGenomes + 1;
-#print "$StartGenomes\t$StopGenomes\n"; #exit;
-#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n";
-#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n";
-
-# print column headers
-print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n";
-print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n";
-print join("\t",@ColHeader),"\n";
-print "First Genome Field: $ColHeader[$StartGenomes]\n";
-print "Last Genome Field: $ColHeader[$StopGenomes]\n";
-print "Nr Genomes: $NrGenomes\n\n";
-
-print "\nProcessing Variants....\n";
-my $VariantCount = 0; # variant locus counter, not used
-my %AllFreqCounts; # storing histogram of all freq counts
-my %CalledFreqCounts; # storing histogram of called freq counts
-my $Warnings;
-while (<$IN>)
-{
-	# testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM]
-	my $Line = $_; # save line for output below
-    chomp $Line;
-    my @F = split /\t/, $Line; # split in to fields
-    $VariantCount++; # increment variant counter
-	my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s
-	my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s
-	my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s
-	my $CountN = () = $UseFields =~ /N/g; # count the number of Ns
-	my $NrAlleles = $Count1 + $Count0 + $CountN; # total count
-	unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus
-	{
-		print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning
-		#print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n";
-		#if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings
-	}
-	my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles
-	my $CalledFreq = sprintf("%0.3f",0);
-	if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any
-	$AllFreqCounts{$AllFreq}++; # increment all freq histogram
-	$CalledFreqCounts{$CalledFreq}++; # increment called freq histogram
-	#print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit;
-	print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var
-	print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var
-	#exit if $VariantCount > 20;
-}
-close $OUT1;
-close $OUT2;
-
-# Print frequency histograms
-print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header
-foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";}
-
-print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header
-foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";}
-
-$Time += time;
-print "\ntime $Time\n";
-
-###########################################################################
-#                                   SUBS                                  #
-###########################################################################
-
-sub GetExpectedParams
-{
-	my %Hash =
-	(
-	"input" => -1,
-	"output_dir" => -1,
-	); # store parameters and values
-	return %Hash;
-}
-
-sub GetEnteredParams
-{
-	# Processing @ARGV
-	my %Hash;
-
-	my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array
-	#print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug;
-	#print "Key\tVal\n" if $Debug; #exit;
-	for my $n (1..$#ARGVs) # parse each
-	{
-		$ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces
-		my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val
-		$Key = lc $Key;
-		$Hash{$Key} = $Val; # make a hash entry out of key and val
-		#print "$Key\t$EnteredParams{$Key}\n" if $Debug;
-	}
-		#print int(keys %Hash),"\n" if $Debug;
-		#foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";}
-		#print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug;
-		#exit if $Debug;
-	return %Hash; # hash now has each -- entry param, with associated values
-}
-
-sub SaveArrayAsString
-{
-	my $FH = shift;
-	my $Fields = shift;
-	#print "$Fields\n";
-	print $FH join("\t",@$Fields),"\n";
-}
-
-sub ConcatenateVariants
-{
-    my $ArrayIn = shift; # ptr to array
-    my $StateFieldNr = shift; # field to process
-    #print int(@$ArrayIn),"\n";
-    my @ArrayOut; # array to store records out
-    my $Nr = -1;
-    foreach my $Entry (@$ArrayIn)
-    {
-	}
-    return \@ArrayOut; # return ptr to array
-}
-
-sub LoadStateRecord
-{
-	my $Out = shift;
-	my $In = shift;
-	my $StateFieldNr = shift;
-
-				$Out->{State} = $$In[$StateFieldNr]; # get state for new record
-				$Out->{Chr} = $$In[1]; # get chr
-				$Out->{Begin} = $$In[2]; # get begin of state range
-				$Out->{End} = $$In[3]; # get current end of state range
-				$Out->{Records}++; # record added to new count
-}
-
-sub OpenFile
-{
-    my $File = shift;
-    my $FH;
-		open ($FH, "$File") or die ("$!: can't open file $File");
-    return $FH;
-}
-
-sub OpenFileold
-{
-    my $File = shift;
-    my $FH;
-
-    if ($File =~ /.bz2$/)
-    {
-		open ($FH, "bzcat $File |") or die ("$!: can't open file $File");
-    }
-    elsif ($File =~ /.gz$/)
-    {
-		open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File");
-    }
-    elsif ($File =~ /.tsv$/)
-    {
-		open ($FH, "cat $File |") or die ("$!: can't open file $File");
-    }
-    else
-    {
-		die ("$!: do not recognise file type $File");
-    }
-    return $FH;
-}
-
-sub LoadNewRecord
-{
-    my $In = shift;
-    my $Out = shift;
-    $Out->{Chr} = $In->{Chr};
-    $Out->{State} = $In->{State};
-    $Out->{Begin} = $In->{Begin};
-    $Out->{End} = $In->{End};
-    $Out->{Records} = $In->{Records};
-}
-
-sub NewStateRecord
-{
-    my $Record =
-    {
-	Chr => "",
-        Begin => -1,
-        End => -1,
-		State => "",
-        Records => 0,
-		MIEs => 0,
-		StateErrors => 0,
-        Length => -1,
-    };
-    return $Record;
-}