# HG changeset patch
# User bcrain-completegenomics
# Date 1339689845 14400
# Node ID 745e2083374e10a6197b824159e7a635c6aee4d4
# Parent 58e466b93553d76abbdcd718c7d7d7cf7956c175
Deleted selected files
diff -r 58e466b93553 -r 745e2083374e scripts/datatypes_conf.xml
--- a/scripts/datatypes_conf.xml Wed Jun 13 14:06:45 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 58e466b93553 -r 745e2083374e scripts/tool-data/cg_crr_files.loc.sample
--- a/scripts/tool-data/cg_crr_files.loc.sample Wed Jun 13 14:06:45 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,11 +0,0 @@
-#This is a sample file distributed with Galaxy that enables tools
-#to use .crr reference files. You will need to download or create
-#the .crr reference files and then create a cg_crr_files.loc file
-#similar to this one (store it in this directory) that points to
-#the location of the files. The cg_crr_files.loc
-#file has this format (white space characters are TAB characters):
-#
-#
-#
-#hg19 hg19 hg19.crr /Users/bcrain/Documents/hg19.crr
-
diff -r 58e466b93553 -r 745e2083374e scripts/tool_data_table_conf.xml.sample
--- a/scripts/tool_data_table_conf.xml.sample Wed Jun 13 14:06:45 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,11 +0,0 @@
-
-
-
-
- value, dbkey, name, path
-
-
-
-
diff -r 58e466b93553 -r 745e2083374e scripts/tools/cg_scripts/Calculate_TestVariants_Variant_Frequencies.xml
--- a/scripts/tools/cg_scripts/Calculate_TestVariants_Variant_Frequencies.xml Wed Jun 13 14:06:45 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,67 +0,0 @@
-
-
- in cgatools-testvariants file
-
-
- Calculate_TestVariants_Variant_Frequencies_0_1_0.pl
- --Input $input
- --First_Genome_Field_Nr $first_col
- --Last_Genome_Field_Nr $last_col
- --Output1 $output1
- --Output2 $output2
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-This tool calculates the allele frequencies for all variants present in the testvariant file.
-
------
-
-**Instructions**::
-
- Calculate the frequencies of variants in a testvariants output file
- Two values calculated:
- Frequency vs all alleles
- Frequency vs called alleles
-
- Input: testvariants file
- Outputs:
- All data to *-Freq.tsv, including scores and quals
- vars and freqs to *-Freq_Short.tsv
- Exceptions to *-Freq_Log
- Stats to *-Freq_Stats
-
-
- perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \
- --Input input_file \
- --First_Genome_Field_Nr col_nr1 \
- --Last_Genome_Field_Nr col_nr2
- --Output1 output1 \
- --Output2 output_short \
- eg
- perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \
- --Input /data/Family_Quartet_testvariants.tsv \
- --Output /data/Family_Quartet_testvariants
- --First_Genome_Field_Nr 9 \
- --Last_Genome_Field_Nr 11
- --Output1 /data/Family_Quartet_testvariants
- --Output2 /data/Family_Quartet_testvariants_short
-
-
-
diff -r 58e466b93553 -r 745e2083374e scripts/tools/cg_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl
--- a/scripts/tools/cg_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl Wed Jun 13 14:06:45 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,271 +0,0 @@
-#!/usr/bin/perl
-use strict;
-#use feature "say";
-#use File::Basename;
-$| = 1;
-
-# Get_TestVariants_Variant_Frequencies
-# Calculate the frequencies of variants in a testvariants output file
-# Two values calculated:
-# Frequency vs all alleles
-# Frequency vs called alleles
-
-# Input is a testvariants file
-# Outputs:
-# All data to *-Freq.tsv, including scores and quals
-# vars and freqs to *-Freq_Short.tsv
-# Exceptions to *-Freq_Log
-# Stats to *-Freq_Stats
-
-# Format:
-# perl prog file dir
-# ie
-# perl Get_TestVariants_Variant_Frequencies \
-# --Input input_file \
-# --First_Genome_Field_Nr col_nr1 \
-# --Last_Genome_Field_Nr col_nr2
-# --Output1 output1 \
-# --Output2 output2
-
-# eg
-# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \
-# --Input /data/Family_Quartet_testvariants.tsv \
-# --First_Genome_Field_Nr 9 \
-# --Last_Genome_Field_Nr 11
-# --Output1 output1 \
-# --Output2 output2
-
-
-# Rick Tearle 2010-11
-
-my $Time -= time; # start time
-my $Debug = 0;
-
-# Parsing and storing input parameters
-# Only childfields can be repeated
-print "$0 @ARGV\nProcessing input parameters\n";
-my %ExpectedParams = GetExpectedParams ();
-my %EnteredParams = GetEnteredParams ();
-
-# Setting up prog paras from input paras
-my $FileIn = $EnteredParams{input};
-unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file
-#my $FileOut = $EnteredParams{output}; #
-#$DirectoryOut =~ s/\/$//; # remove trailing slash if present
-#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file
-#print "$FileIn\n$DirectoryOut\n";
-#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions
-
-# my $FileOut1 = $FileOut."-Freq.tsv";
-# my $FileOut2 = $FileOut."-Freq_Short.tsv";
-# my $FileOut3 = $FileOut."-Freq_Stats.tsv";
-# my $FileOut4 = $FileOut."-Freq_Log.tsv";
-
-print "\nOpening Input File:\n\t$FileIn\n";
-my $IN = OpenFile ($FileIn); # open the file with correct file format
-
-#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit;
-open my $OUT1, ">", $EnteredParams{output1};
-open my $OUT2, ">", $EnteredParams{output2};
-#open my $OUT3, ">", $EnteredParams{output3};
-#open my $OUT4, ">", $EnteredParams{output4};
-
-# Get col header and genomes fields
-my $ColHeader = <$IN>; # get col header
-chomp $ColHeader;
-my @ColHeader = split /\t/, $ColHeader;
-my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
-my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
-if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";}
-if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";}
-if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";}
-if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";}
-if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";}
-my $NrGenomes = $StopGenomes - $StartGenomes + 1;
-#print "$StartGenomes\t$StopGenomes\n"; #exit;
-#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n";
-#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n";
-
-# print column headers
-print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n";
-print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n";
-print join("\t",@ColHeader),"\n";
-print "First Genome Field: $ColHeader[$StartGenomes]\n";
-print "Last Genome Field: $ColHeader[$StopGenomes]\n";
-print "Nr Genomes: $NrGenomes\n\n";
-
-print "\nProcessing Variants....\n";
-my $VariantCount = 0; # variant locus counter, not used
-my %AllFreqCounts; # storing histogram of all freq counts
-my %CalledFreqCounts; # storing histogram of called freq counts
-my $Warnings;
-while (<$IN>)
-{
- # testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM]
- my $Line = $_; # save line for output below
- chomp $Line;
- my @F = split /\t/, $Line; # split in to fields
- $VariantCount++; # increment variant counter
- my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s
- my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s
- my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s
- my $CountN = () = $UseFields =~ /N/g; # count the number of Ns
- my $NrAlleles = $Count1 + $Count0 + $CountN; # total count
- unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus
- {
- print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning
- #print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n";
- #if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings
- }
- my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles
- my $CalledFreq = sprintf("%0.3f",0);
- if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any
- $AllFreqCounts{$AllFreq}++; # increment all freq histogram
- $CalledFreqCounts{$CalledFreq}++; # increment called freq histogram
- #print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit;
- print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var
- print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var
- #exit if $VariantCount > 20;
-}
-close $OUT1;
-close $OUT2;
-
-# Print frequency histograms
-print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header
-foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";}
-
-print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header
-foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";}
-
-$Time += time;
-print "\ntime $Time\n";
-
-###########################################################################
-# SUBS #
-###########################################################################
-
-sub GetExpectedParams
-{
- my %Hash =
- (
- "input" => -1,
- "output_dir" => -1,
- ); # store parameters and values
- return %Hash;
-}
-
-sub GetEnteredParams
-{
- # Processing @ARGV
- my %Hash;
-
- my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array
- #print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug;
- #print "Key\tVal\n" if $Debug; #exit;
- for my $n (1..$#ARGVs) # parse each
- {
- $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces
- my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val
- $Key = lc $Key;
- $Hash{$Key} = $Val; # make a hash entry out of key and val
- #print "$Key\t$EnteredParams{$Key}\n" if $Debug;
- }
- #print int(keys %Hash),"\n" if $Debug;
- #foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";}
- #print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug;
- #exit if $Debug;
- return %Hash; # hash now has each -- entry param, with associated values
-}
-
-sub SaveArrayAsString
-{
- my $FH = shift;
- my $Fields = shift;
- #print "$Fields\n";
- print $FH join("\t",@$Fields),"\n";
-}
-
-sub ConcatenateVariants
-{
- my $ArrayIn = shift; # ptr to array
- my $StateFieldNr = shift; # field to process
- #print int(@$ArrayIn),"\n";
- my @ArrayOut; # array to store records out
- my $Nr = -1;
- foreach my $Entry (@$ArrayIn)
- {
- }
- return \@ArrayOut; # return ptr to array
-}
-
-sub LoadStateRecord
-{
- my $Out = shift;
- my $In = shift;
- my $StateFieldNr = shift;
-
- $Out->{State} = $$In[$StateFieldNr]; # get state for new record
- $Out->{Chr} = $$In[1]; # get chr
- $Out->{Begin} = $$In[2]; # get begin of state range
- $Out->{End} = $$In[3]; # get current end of state range
- $Out->{Records}++; # record added to new count
-}
-
-sub OpenFile
-{
- my $File = shift;
- my $FH;
- open ($FH, "$File") or die ("$!: can't open file $File");
- return $FH;
-}
-
-sub OpenFileold
-{
- my $File = shift;
- my $FH;
-
- if ($File =~ /.bz2$/)
- {
- open ($FH, "bzcat $File |") or die ("$!: can't open file $File");
- }
- elsif ($File =~ /.gz$/)
- {
- open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File");
- }
- elsif ($File =~ /.tsv$/)
- {
- open ($FH, "cat $File |") or die ("$!: can't open file $File");
- }
- else
- {
- die ("$!: do not recognise file type $File");
- }
- return $FH;
-}
-
-sub LoadNewRecord
-{
- my $In = shift;
- my $Out = shift;
- $Out->{Chr} = $In->{Chr};
- $Out->{State} = $In->{State};
- $Out->{Begin} = $In->{Begin};
- $Out->{End} = $In->{End};
- $Out->{Records} = $In->{Records};
-}
-
-sub NewStateRecord
-{
- my $Record =
- {
- Chr => "",
- Begin => -1,
- End => -1,
- State => "",
- Records => 0,
- MIEs => 0,
- StateErrors => 0,
- Length => -1,
- };
- return $Record;
-}
diff -r 58e466b93553 -r 745e2083374e scripts/tools/cg_scripts/List_Unique_Variants.xml
--- a/scripts/tools/cg_scripts/List_Unique_Variants.xml Wed Jun 13 14:06:45 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,323 +0,0 @@
-
-
- with annotations from gene or var files
-
-
- #if $file_types.file_type =="var2"
- List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output
- --Var_Type $file_types.variants
- $file_types.scoresVAF
- $file_types.scoresEAF
- $file_types.varQuality
- #if $file_types.data_sources.data_source == "in"
- #for $v in $file_types.data_sources.varfiles
- --Input_File ${v.input}
- #end for
- #else
- `cat $file_types.data_sources.varlist`
- #end if
-
- #else if $file_types.file_type =="var1"
- List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output
- --Var_Type $file_types.variants
- $file_types.scores
- #if $file_types.data_sources.data_source == "in"
- #for $v in $file_types.data_sources.varfiles
- --Input_File ${v.input}
- #end for
- #else
- `cat $file_types.data_sources.varlist`
- #end if
-
- #else if $file_types.file_type =="gene"
- List_Unique_Variants_2_1_0.pl --File_Type G --Output_File $output
- --Var_Type $file_types.variants
- --Component $file_types.component
- --Impact $file_types.impact
- #if $file_types.data_sources.data_source == "in"
- #for $g in $file_types.data_sources.genefiles
- --Input_File ${g.input}
- #end for
- #else
- `cat $file_types.data_sources.genelist`
- #end if
- #end if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-This tool identifies all called variants present in the var or gene files and generates annotated variant list.
-
------
-
-**Instructions**::
-
- List Unique Variants for Pipeline 1.x and 2.x
- [Uses header if available, checks for position of xref field if not]
- Take one or more var or gene files
- Extract a non-redundant set of variants
-
- For var files:
- The fields used to define non-redundant variants are are:
- chromosome begin end varType reference alleleSeq xRef
- User can nominate class(es) of varType to filter on
- Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn
- them off (separately)
- Scores and qualities stored in separate fields, all values for a variant across
- a set of genomes.
- Values for different genomes separated by ':', for two hom entries for the same
- genome by '|'
- Output is accepted by testvariants to generate a variant table, all fields kept
- in testvariants output
-
- For gene files:
- The fields used to define non-redundant gene variants are:
- chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol
- orientation component componentIndex codingRegionKnown impact nucleotidePos
- proteinPos annotationRefSequence sampleSequence genomeRefSequence
- User can nominate class(es) of varType, component or impact to filter on
- All gene entries kept ie multiple entries if multiple transcripts
-
- NB Now treating xref as a separate component in var recs, as it is not consistent
- between X and Y vars
- Not fixed for gene recs yet
-
- perl List_Unique_Variants_2_0_11.pl
- --File_Type [V|G]
- --Input_File input_file_1 [set of var or gene files]
- --Input_File input_file_2
- ...
- --Input_File input_file_n
- --Output_File filename
- --Var_Type [For both file types, 'All' or any value from the varType field,
- multiple values allowed, separated by comma]
- --Component [Gene file specific,'All' or any value from component field of gene
- file, multiple allowed; 'All" is default]
- --Impact All [Gene file specific,'All' or any value from impact field of gene
- file, multiple allowed; 'All" is default]
- --Scores [1.x var file specific, yes|no, yes is default]
- --Scores_VAF [2.0 var file specific, yes|no, yes is default]
- --Scores_EAF [2.0 var file specific, yes|no, yes is default]
- --Score_Qualities [yes|no, yes is default]
- eg
- perl List_Unique_Variants_2_0_11.pl \
- --File_Type V \
- --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \
- --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \
- --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \
- --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \
- --Var_Type All
- --Component All
- --Impact All
- --Scores_VAF yes \
- --Scores_EAF yes \
- --Score_Qualities yes
-
- var fields
- 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq
- totalScore hapLink xRef
- 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq
- varScoreVAF varScoreEAF varQuality hapLink xRef
-
- gene fields
- 1.x index locus allele chromosome begin end varType reference call xRef geneId
- mrnaAcc proteinAcc symbol orientation component componentIndex
- codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence
- sampleSequence genomeRefSequence
- 2.0 index locus allele chromosome begin end varType reference call xRef geneId
- mrnaAcc proteinAcc symbol orientation component componentIndex hasCodingRegion
- impact nucleotidePos proteinPos annotationRefSequence sampleSequence
- genomeRefSequence pfam
-
- Parsing and storing input parameters
- Only input_file fields can be repeated
- input paramaters are case insensitive
-
-
-
-
diff -r 58e466b93553 -r 745e2083374e scripts/tools/cg_scripts/List_Unique_Variants_2_1_0.pl
--- a/scripts/tools/cg_scripts/List_Unique_Variants_2_1_0.pl Wed Jun 13 14:06:45 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,583 +0,0 @@
-#!/usr/bin/perl
-use strict;
-use File::Basename;
-$| = 1;
-
-# List Unique Variants for Pipeline 1.x and 2.x
-# [Uses header if available, checks for position of xref field if not]
-# Take one or more var or gene files
-# Extract a non-redundant set of variants
-
-# For var files:
-# The fields used to define non-redundant variants are are:
-# chromosome begin end varType reference alleleSeq xRef
-# User can nominate class(es) of varType to filter on
-# Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn them off (separately)
-# Scores and qualities stored in separate fields, all values for a variant across a set of genomes.
-# Values for different genomes separated by ':', for two hom entries for the same genome by '|'
-# Output is accepted by testvariants to generate a variant table, all fields kept in testvariants output
-
-# For gene files:
-# The fields used to define non-redundant gene variants are:
-# chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence
-# User can nominate class(es) of varType, component or impact to filter on
-# All gene entries kept ie multiple entries if multiple transcripts
-
-# NB Now treating xref as a separate component in var recs, as it is not consistent between X and Y vars
-# Not fixed for gene recs yet
-
-# perl List_Unique_Variants_2_0_9.pl
-# --File_Type [V|G]
-# --Input_File input_file_1 [set of var or gene files]
-# --Input_File input_file_2
-# ...
-# --Input_File input_file_n
-# --Output_File filename
-# --Var_Type [For both file types, 'All' or any value from the varType field, multiple values allowed, separated by comma]
-# --Component [Gene file specific,'All' or any value from component field of gene file, multiple allowed; 'All" is default]
-# --Impact All [Gene file specific,'All' or any value from impact field of gene file, multiple allowed; 'All" is default]
-# --Scores [1.x var file specific, yes|no, yes is default]
-# --Scores_VAF [2.0 var file specific, yes|no, yes is default]
-# --Scores_EAF [2.0 var file specific, yes|no, yes is default]
-# --Score_Qualities [yes|no, yes is default]
-# eg
-# perl /Users/rtearle/Documents/Programming/Perl/Scripts/Dev/List_Unique_Variants_2_0_4 \
-# --File_Type V \
-# --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \
-# --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \
-# --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \
-# --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \
-# --Var_Type All
-# --Component All
-# --Impact All
-# --Scores_VAF yes \
-# --Scores_EAF yes \
-# --Score_Qualities yes
-
-# var fields
-# 1.x
-# locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
-# 2.0
-# locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
-
-# gene fields
-# 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
-# component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence
-# 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
-# component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam
-
-# Parsing and storing input parameters
-# Only input_file fields can be repeated
-# input paramaters are case insensitive
-
-print "$0 @ARGV\nProcessing input parameters\n";
-my $NrParams;
-my %ExpectedParams = GetExpectedParams (); # list of expected parms
-my %EnteredParams = GetEnteredParams (); # list of entered params
-
-# Input Files
-my $FileType = $EnteredParams{file_type};
-if ($FileType ne "V" and $FileType ne "G") {die "File Type must be 'V' or 'G', not '$FileType'\n";}
-my $FilesIn = $EnteredParams{input_file}; # ptr to list of input files
-print "File Type: $FileType\nInput Files:\n";
-my $NrInputFiles = int(@$FilesIn);
-foreach my $File (@$FilesIn) {print "$File\n";} # requires existing files
-foreach my $File (@$FilesIn) {unless (-f $File) {die "Input file $File not found\n";}} # requires existing files
-for my $n (0.. $NrInputFiles-2) # look for duplicates in list
- {for my $m ($n+1.. $NrInputFiles-1) {if ($$FilesIn[$n] eq $$FilesIn[$m] ) {die "File $$FilesIn[$n] is repeated in input file list\n";}}}
-
-# Output Dir
-#my $DirectoryOut = $EnteredParams{output_dir}; # output dir
-#$DirectoryOut =~ s/\/$//; # remove trailing slash if present
-#unless (-d $DirectoryOut) {mkdir $DirectoryOut or die "Cannot find/create output directory $DirectoryOut\n";} # uses existing dir or makes a new dir if it can
-
-# Ouput File
-my $FileOut = $EnteredParams{output_file}; # output file
-print "FileOut: $FileOut\n";
-$FileOut =~ /(^.+\/)?(.+$)/; # split in to path and filename
-my $DirectoryOut = $1; # assign path # NEED MORE TESTING EG EMPTY PATH #
-$FileOut = $2; # assign file prefix
-print "File: $FileOut\nDir: $DirectoryOut\n"; #exit;
-# if (-f $DirectoryOut.$FileOut) # ouput file exists, create a new one based on the name
-# {
-# print "Output file $FileOut exists, modifying to unique file name ";
-# $FileOut =~ /^(.+?)\./; # find name without extensions
-# my $Stub = $1; # set stub to name without extensions
-# $FileOut =~ /(\..+)?$/; # get extension(s)
-# my $Ext = $1; # set ext to extensions
-# my $n = 1; # n will increment to find a unique name
-# my $Suff = ""; # suff tracks n
-# while (-f $DirectoryOut.$Stub.$Suff.$Ext) {$Suff = "-$n"; $n++;} # loop till we have a new unique filename
-# $FileOut = $Stub.$Suff.$Ext; # file out now has same name, same extensions, but also -n at the end of the name, making it unique
-# print "$FileOut\n";
-# }
-
-#print "Files\n",join("\n",@$FilesIn),"\n\n";
-#print "Ouput Dir\n$DirectoryOut\n";
-#print "Ouput File\n$FileOut\n";
-
-# Extract Header & Column Header
-my $IN = OpenFile ($$FilesIn[0]); # open the first file with correct file format
-my $Header = GetHeaderAsString ($IN); # get header
-unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if there is no header, close and reopen file, ie start file again
-my $ColHeader = <$IN>; # get col header, first remaining line
-chomp $ColHeader;
-
-# Get version if filetype is var - needed because there are new fields in 2.0 and posn of xRef changed
-my ($Version, $XrefField);
-if ($FileType eq "V")
-{
- ($Version, $XrefField) = GetVersion ($Header, $ColHeader);
- #print "$Version $XrefField\n"; exit;
- unless ($Version) {die "Cannot determine format version of first file in list\nNeed either a native Complete header or a native Complete Column Header with an xRef field\n";}
-}
-
-# Shared input params
-my $OutputVarTypes = lc $EnteredParams{var_type} || $ExpectedParams{var_type}; # var types listed in file in lc
-$OutputVarTypes =~ s/\,/\|/g; # create regex string
-$OutputVarTypes =~ s/\,| //; # remove extraneous commas, spaces
-
-# Input Params for var file
-my ($KeepScoresVAF, $KeepScoresEAF, $KeepQuals, $KeepScores, $VarExtras);
-if ($FileType eq "V")
-{
- if ($Version == 2)
- {
- $KeepQuals = lc $EnteredParams{score_quality} || $ExpectedParams{score_quality}; # keep scoresQuality for 2.0
- $KeepQuals = 1 if $KeepQuals eq "yes"; # converting to boolean
- $KeepScoresVAF = lc $EnteredParams{scores_vaf} || $ExpectedParams{scores_vaf}; # keep scoresVAF for 2.0
- $KeepScoresVAF = 1 if $KeepScoresVAF eq "yes"; # converting to boolean
- $KeepScoresEAF = lc $EnteredParams{scores_eaf} || $ExpectedParams{scores_eaf}; # keep scoresEAF for 2.0 $KeepQuals = lc $EnteredParams{score_qualities} || $ExpectedParams{score_qualities}; # keep scoresQuality for 2.0
- $KeepScoresEAF = 1 if $KeepScoresEAF eq "yes"; # converting to boolean
- }
- else # Version 1
- {
- $KeepScores = lc $EnteredParams{scores} || $ExpectedParams{scores}; # keep scores for 1.x
- $KeepScores = 1 if $KeepScores eq "yes"; # converting to boolean
- }
- $VarExtras = 1 if $KeepScoresVAF or $KeepScoresEAF or $KeepQuals or $KeepScores; # flag to process var file for scores info
-}
-
-# Input Params for gene file
-my $OutputComponents = uc $EnteredParams{component} || $ExpectedParams{component}; # components listed in file in uc
-my $OutputImpacts = uc $EnteredParams{impact} || $ExpectedParams{impact}; # impacts listed in file in uc
-
-# Loading chr nrs, setting up var hash
-my @ChrNames = ('chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
- 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
- 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM'); # using this array forces the output order of chrs into the correct order
-my %Vars; # hash to store var records in an array for each chr
-foreach my $Chr (@ChrNames) {$Vars{$Chr} = {};} # print "$Chr\t";} # set up hash of hashes, one for each chr
-#print "\n"; #exit;
-
-# Create ouput col header
-if ($FileType eq "V")
-{
- # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
- # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
- my @Fields = split "\t", $ColHeader;
- $ColHeader = join("\t",@Fields[3..8])."\t".$Fields[$XrefField];
- if ($Version == 2) # 2.0
- {
- if ($KeepScoresVAF) {$ColHeader .= "\tvarScoreVAF";}
- if ($KeepScoresEAF) {$ColHeader .= "\tvarScoreEAF";}
- if ($KeepQuals) {$ColHeader .= "\tvarQuality";}
- }
- else # 1.x
- {
- if ($KeepScores) {$ColHeader .= "\ttotalScore";}
- }
-}
-elsif ($FileType eq "G")
-{
- # 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
- # component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence
- # 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
- # component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam
- my @Fields = split "\t", $ColHeader;
- $ColHeader = join("\t",@Fields[3..30]); # 30 much more than needed
- $OutputComponents =~ s/\,/\|/g; # create regex string
- $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces
- $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces
- $OutputImpacts =~ s/\,/\|/g; # create regex string
- $OutputImpacts =~ s/\,| //; # remove extraneous commas, spaces
- #print "OutputVarTypes: $OutputVarTypes\nOutputComponents: $OutputComponents\nOutputImpacts: $OutputImpacts\n";
-}
-else
-{
- die "FileType $FileType not understood\n"; # redundant
-}
-
-# Set up Processing Subs
-if ($FileType eq "G") # use gene subs for 'G'
-{
- *ExtractSub2Use = \&ExtractGeneFields;
- *AddRecSub2Use = \&AddGeneRec;
-}
-else # use var subs for 'V'
-{
- *ExtractSub2Use = \&ExtractVarFields;
- *AddRecSub2Use = \&AddVarRec;
-}
-#print "Gene: ",\&ExtractGeneFields," ",\&AddGeneRec," Var: ",\&ExtractVarFields," ",\&AddVarRec," Using: ",\&ExtractSub2Use," ",\&AddRecSub2Use,"\n"; exit;
-
-# Process Files
-my $RecCount = 0; # total nr recs that are coding/splicing/spanning
-my $FileCount = 0; # keeps track of nr files to use to format eg scores field
-my $XRef;
-foreach my $File (@$FilesIn)
-{
- print "Processing file $File\n";
- $FileCount++; # counting nr files
-
- # Open file, process header, col header
- my $IN = OpenFile ($File); # open the file with correct file format
- my $Header = GetHeaderAsString ($IN); # get header
- unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if no header, close and reopen file, ie start file again
- my $ColHeader = <$IN>; # get col header, first remaining line
- unless ($ColHeader =~ /^>/) {print "Suspect column header for file $File:\n$ColHeader\n;"}
-
- my $Count = 0; # cnr recs for this file that are coding/splicing/spanning
- while (<$IN>) # loop through remainder of file ie data
- {
- my ($Rec, $Chr, $ScoreVAF, $ScoreEAF, $ScoreQual, $XRef) = ExtractSub2Use ($_, $XrefField); # sub extracts wanted fields in rec as string, chr, other fields optionally
- next unless $Rec;
- AddRecSub2Use ($Rec, $Vars{$Chr}, $ScoreVAF, $ScoreEAF, $ScoreQual, $FileCount, $XRef) if $Rec; # only process if rec is not empty
- $Count++; # increment count of coding/splicing/spanning vars
- }
- print "Nr matched records for this file: $Count\n";
- $RecCount += $Count; # add this file's count to total count
- close $IN;
-}
-print "Nr matched records across all files:\t $RecCount\n"; # total count
-
-# Write the merged, de-duplicated records to the output file.
-# NOTE(review): $RecCount, $FileCount, $ColHeader, $DirectoryOut, $FileOut, $FileType,
-# $Version, $VarExtras, %Vars, @ChrNames and the Keep* flags are all assigned earlier
-# in the script, outside this section - confirm their state before changing this code.
-print "Sorting and Saving to file $DirectoryOut$FileOut ...\n";
-# report the path that is really opened, together with the OS error
-open my $OUT, ">", $DirectoryOut . $FileOut or die "could not write to $DirectoryOut$FileOut: $!\n";
-if ($FileType eq "V")
-{
-    print $OUT "variantId\t"; # first col header for var file
-}
-else
-{
-    $ColHeader =~ s/\tcall\t/\talleleSeq\t/; # rename the "call" column to "alleleSeq"
-    print $OUT "index\t"; # first col header for gene file
-}
-print $OUT "$ColHeader\n"; # remainder of col header
-
-$RecCount = 0; # reuse total count for nr of non-redundant vars
-$FileCount--; # reduce by one, used below to add missing delimiters
-foreach my $Chr (@ChrNames) # sort records in each chr hash and print with count
-{
-    foreach my $Rec (sort {SortStringsasArrays ($a, $b)} keys %{$Vars{$Chr}}) # using sub to sort on begin, end fields
-    {
-        next unless $Rec;
-        $RecCount++; # increment count of saved vars
-        print $OUT "$RecCount\t$Rec"; # printing count and rec
-        print $OUT "\t",$Vars{$Chr}->{$Rec}->[4] if $FileType eq "V"; # print xref if var files
-
-        if ($VarExtras)
-        {
-            # Pad the score strings with ":" for trailing files that did not have this var.
-            # The delimiters are counted on field [1]: AddVarRec fills it for both format
-            # versions, whereas field [3] (quality) only exists for version 2.
-            my $Scores = $Vars{$Chr}->{$Rec}->[1];
-            my $FieldDelimiterCount = defined $Scores ? ($Scores =~ tr/://) : 0;
-            my $Addition = ":" x ($FileCount - $FieldDelimiterCount);
-            if ($Version == 2)
-            {
-                print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScoresVAF;
-                print $OUT "\t",$Vars{$Chr}->{$Rec}->[2],$Addition if $KeepScoresEAF;
-                print $OUT "\t",$Vars{$Chr}->{$Rec}->[3],$Addition if $KeepQuals;
-            }
-            else
-            {
-                print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScores;
-            }
-        }
-        print $OUT "\n";
-    }
-}
-# buffered write errors only surface when the handle is closed
-close $OUT or die "could not finish writing $DirectoryOut$FileOut: $!\n";
-print "Nr saved records:\t $RecCount\n"; # count of non-redundant vars, cf. all vars above
-
-
-###########################################################################
-# SUBS #
-###########################################################################
-
-# Returns the table of recognised command line parameters with their default
-# values, as a flat key/value list. As a side effect the number of parameters
-# is stored in $NrParams.
-sub GetExpectedParams
-{
-    my @Defaults = # parameter name => default value
-    (
-        "file_type"     => -1,
-        "input_file"    => [],
-        "output_file"   => -1,
-        "var_type"      => "all",
-        "component"     => "ALL",
-        "impact"        => "ALL",
-        "scores"        => "yes",
-        "scores_eaf"    => "yes",
-        "scores_vaf"    => "yes",
-        "score_quality" => "yes",
-    );
-    my %Expected = @Defaults;
-    $NrParams = scalar keys %Expected; # how many parameters are known
-    return %Expected;
-}
-
-# Parses @ARGV into a hash of "--name value" pairs and returns it as a flat list.
-# Names are lower-cased; "input_file" may be given several times and is collected
-# into an array ref, every other name keeps the last value seen.
-sub GetEnteredParams
-{
-    # Rebuild the command line and cut it at every "--"; the piece in front of
-    # the first "--" is dropped.
-    my ($Ignored, @Chunks) = split /--/, join (" ", @ARGV);
-
-    my %Entered;
-    foreach my $Chunk (@Chunks)
-    {
-        $Chunk =~ s/\s+$//; # drop trailing white space
-        my ($Name, $Value) = split / /, $Chunk, 2; # first word is the name, the rest is the value
-        $Name = lc $Name; # names are case insensitive
-        if ($Name ne "input_file")
-        {
-            $Entered{$Name} = $Value; # ordinary parameter, single value
-            next;
-        }
-        push @{$Entered{$Name}}, $Value; # input files accumulate in an array
-    }
-    return %Entered;
-}
-
-# Opens $File for reading and returns the filehandle.
-# Dies with the OS error when the file cannot be opened, instead of handing back
-# an unopened handle that would silently read as an empty file.
-sub OpenFile
-{
-    my $File = shift;
-    # three-arg open: the file name is never interpreted as a mode or a command
-    open my $FH, "<", $File or die "$!: can't open file $File";
-    return $FH;
-}
-
-# Opens $File for reading, choosing the reader by file extension, and returns
-# the filehandle:
-#   .bz2         read through bzcat
-#   .gz          read through gunzip -c
-#   .tsv / .txt  opened directly
-#   other        a notice is printed and the file is opened directly as text
-# Dies with the OS error if the file or the decompressor cannot be opened.
-sub OpenFileold
-{
-    my $File = shift;
-    my $FH;
-
-    if ($File =~ /\.bz2$/)
-    {
-        # list form of pipe open: the file name is passed as-is, no shell involved
-        open ($FH, "-|", "bzcat", $File) or die ("$!: can't open file $File");
-    }
-    elsif ($File =~ /\.gz$/)
-    {
-        open ($FH, "-|", "gunzip", "-c", $File) or die ("$!: can't open file $File");
-    }
-    else
-    {
-        unless ($File =~ /\.(?:tsv|txt)$/)
-        {
-            print ("Do not recognise file type for file $File.\nOpening as text file\n");
-        }
-        # plain text needs no helper process
-        open ($FH, "<", $File) or die ("$!: can't open file $File");
-    }
-    return $FH;
-}
-
-# Reads the header block from filehandle $FH and returns it as one string: all
-# lines up to (not including) the first empty line, concatenated without line
-# ends. The handle is left positioned just after that empty line.
-# Returns "" when no header is found, ie when more than 51 lines are read without
-# meeting an empty line, or when the file ends first. In those cases the lines
-# read are consumed, so the caller has to reopen the file.
-sub GetHeaderAsString
-{
-    my $FH = shift;
-    my $Header = "";
-    my $Count = 0;
-    while (my $Line = <$FH>) # lexical line var, caller's $_ is left alone
-    {
-        chomp $Line;
-        $Line =~ s/\r$//; # cope with CRLF line ends, otherwise the empty line is never seen
-        return $Header if $Line eq ""; # empty line marks end of header
-        $Header .= $Line;
-        return "" if $Count++ > 50; # too many lines for a header, must be no header
-    }
-    return ""; # end of file before an empty line: no header (explicit, defined result)
-}
-
-# Works out the var file format version and the index of the xRef column.
-#   $Header     header block as a single string (may be empty)
-#   $ColHeader  tab separated column header line
-# Returns ($Version, $XrefField): (1, 11) or (2, 13) when the version is given
-# in the header. Otherwise the column header is searched for "xRef": its index
-# is returned and the version is set when that index is 11 or 13.
-# Returns (0, -1) when nothing is found, or when the global $FileType is not "V".
-sub GetVersion
-{
-    my $Header = shift;
-    my $ColHeader = shift;
-
-    my $Version = 0; # need to know if it is 1.x or 2.x
-    my $XrefField = -1;
-
-    if ($FileType eq "V")
-    {
-        if ($Header)
-        {
-            # only trust the capture when the match succeeded, $1 is stale otherwise
-            my $Major = $Header =~ /#FORMAT_VERSION\t(\d)/ ? $1 : 0;
-            if ($Major == 1) {$Version = 1; $XrefField = 11;}
-            elsif ($Major == 2) {$Version = 2; $XrefField = 13;}
-            else {print "Warning: Format Version not found in Header\n";} # not in header
-        }
-        unless ($Version)
-        {
-            my @ColNames = split /\t/, (defined $ColHeader ? $ColHeader : "");
-            for my $n (0..$#ColNames)
-            {
-                (my $Name = $ColNames[$n]) =~ s/\s+$//; # last column may still carry the line end
-                next unless $Name eq "xRef"; # compare each column name, not the whole line
-                $XrefField = $n;
-                if ($n == 11) {$Version = 1;}
-                elsif ($n == 13) {$Version = 2;}
-                last;
-            }
-        }
-    }
-    return ($Version, $XrefField);
-}
-
-# Takes one gene file record, checks it against the globals $OutputVarTypes,
-# $OutputComponents and $OutputImpacts (each used as a regex unless set to
-# "all" / "ALL" / "ALL"), and returns (record, chr) where record is fields 3..24
-# joined with tabs. Returns ("", "") when the record does not pass a filter.
-sub ExtractGeneFields
-{
-    my ($Line) = @_;
-    # gene file columns:
-    # >index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc
-    # proteinAcc symbol orientation component componentIndex hasCodingRegion impact
-    # nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam
-
-    chomp $Line; # drop the line end
-    my @Cols = split "\t", $Line;
-
-    # leave as soon as one of the requested filters does not match
-    if ($OutputVarTypes ne "all" and $Cols[6] !~ /$OutputVarTypes/) # varType
-    {
-        return ("","");
-    }
-    if ($OutputComponents ne "ALL" and $Cols[15] !~ /$OutputComponents/) # component
-    {
-        return ("","");
-    }
-    if ($OutputImpacts ne "ALL" and $Cols[18] !~ /$OutputImpacts/) # impact
-    {
-        return ("","");
-    }
-
-    # chromosome onwards, ie without the file specific index, locus and allele
-    return (join("\t",@Cols[3..24]), $Cols[3]);
-}
-
-# Takes one var file record and the index of its xRef column, strips the file
-# specific fields and returns a list that always has the same slot layout:
-#   (record, chr)                                    when $VarExtras is off
-#   (record, chr, VAF, EAF, quality, xRef)           version 2 with extras
-#   (record, chr, totalScore, undef, undef, xRef)    version 1 with extras
-# record is fields 3..8 (chromosome .. alleleSeq) joined with tabs.
-# Returns ("", "") when varType does not match the global $OutputVarTypes.
-# The version 1 list is padded with undef so that the xRef sits in the sixth
-# slot for both versions, which is where the caller picks it up.
-sub ExtractVarFields
-{
-    my $Rec = shift;
-    my $XrefField = shift;
-    # var fields
-    # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
-    # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
-
-    chomp $Rec; # remove return
-    my @Fields = split "\t", $Rec;
-    unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave
-    my $Chr = $Fields[3]; # assign chr
-    $Rec = join("\t",@Fields[3..8]);
-
-    return ($Rec, $Chr) unless $VarExtras; # no scores or xref wanted
-    return ($Rec, $Chr, $Fields[9], $Fields[10], $Fields[11], $Fields[$XrefField]) if $Version == 2;
-    return ($Rec, $Chr, $Fields[9], undef, undef, $Fields[$XrefField]); # $Version == 1, only totalScore
-}
-
-# Adds one var record to the per chromosome hash $RecHash (keyed on the record
-# string). Every entry is an array ref:
-#   [0] with extras: nr of the file that last contributed to the entry
-#       without extras: how often the record was seen
-#   [1] score string (varScoreVAF for version 2, totalScore for version 1)
-#   [2] varScoreEAF string (version 2 only)
-#   [3] quality string, "H" for VQHIGH, otherwise "L" (version 2 only)
-#   [4] longest xRef seen for the record
-# In the score strings the values of different files are separated by ":" (one
-# slot per file, empty when the file did not have the var), values from the
-# same file are separated by "|".
-# Params: $Rec, $RecHash, $ScoreVAF, $ScoreEAF, $ScoreQual, $FileCount (1-based
-# nr of the file being processed), $XRef. Reads the globals $VarExtras, $Version.
-sub AddVarRec
-{
-    my ($Rec, $RecHash, $ScoreVAF, $ScoreEAF, $ScoreQual, $FileCount, $XRef) = @_;
-
-    # Always keep an array ref, also without extras: the output code reads
-    # field [4] of the entry for every var file.
-    my $IsNew = ref $RecHash->{$Rec} eq "ARRAY" ? 0 : 1;
-    $RecHash->{$Rec} = [] if $IsNew;
-    my $Entry = $RecHash->{$Rec};
-
-    # keep the longest xref, treating a missing xref as empty
-    my $NewXRef = defined $XRef ? $XRef : "";
-    my $OldXRef = defined $Entry->[4] ? $Entry->[4] : "";
-    $Entry->[4] = $NewXRef if $IsNew or length($NewXRef) > length($OldXRef);
-
-    unless ($VarExtras) # just the rec, no var extras being extracted
-    {
-        $Entry->[0]++; # count occurrences of this rec
-        return;
-    }
-
-    # Set delimiter
-    my $Delimiter;
-    if ($IsNew)
-    {
-        $Delimiter = ":" x ($FileCount - 1); # delimiters for prev processed files, that didnt have this var
-    }
-    elsif ($Entry->[0] == $FileCount)
-    {
-        $Delimiter = "|"; # var already seen in this file, use |
-    }
-    else
-    {
-        # Count the file delimiters on field [1]: it is filled for both format
-        # versions, whereas field [3] only exists for version 2.
-        my $FieldDelimiterCount = defined $Entry->[1] ? ($Entry->[1] =~ tr/://) : 0;
-        $Delimiter = ":" x ($FileCount - $FieldDelimiterCount - 1); # delimiters for any processed files, that didnt have this var
-    }
-
-    # Process var
-    $Entry->[0] = $FileCount;
-    $Entry->[1] .= $Delimiter.(defined $ScoreVAF ? $ScoreVAF : ""); # varScoreVAF or totalScore
-    if ($Version == 2)
-    {
-        $Entry->[2] .= $Delimiter.(defined $ScoreEAF ? $ScoreEAF : ""); # varScoreEAF
-        my $Qual = (defined $ScoreQual and $ScoreQual eq "VQHIGH") ? "H" : "L";
-        $Entry->[3] .= $Delimiter.$Qual; # qual
-    }
-    return;
-}
-
-# Registers one gene record in the hash ref passed as second argument: the
-# record string is the key, the value counts how often it was seen.
-sub AddGeneRec
-{
-    my ($Line, $SeenCount) = @_;
-
-    $SeenCount->{$Line}++; # one more occurrence of this record
-}
-
-# Sort comparator for two tab separated records: orders numerically on the
-# second field (begin) and, when those are equal, on the third field (end).
-# Returns -1, 0 or 1.
-sub SortStringsasArrays
-{
-    my ($Left, $Right) = @_;
-
-    # only begin (field 1) and end (field 2) take part in the comparison
-    my ($BeginLeft, $EndLeft) = (split "\t", $Left)[1, 2];
-    my ($BeginRight, $EndRight) = (split "\t", $Right)[1, 2];
-
-    return -1 if $BeginLeft < $BeginRight; # left starts first
-    return 1 if $BeginLeft != $BeginRight; # left starts later
-
-    # same begin, decide on end
-    return -1 if $EndLeft < $EndRight;
-    return $EndLeft == $EndRight ? 0 : 1;
-}
diff -r 58e466b93553 -r 745e2083374e tool_data_table_conf.xml.sample
--- a/tool_data_table_conf.xml.sample Wed Jun 13 14:06:45 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,11 +0,0 @@
-
-
-
-