# HG changeset patch # User bcrain-completegenomics # Date 1339623029 14400 # Node ID 51fea6716ea54b32c27c2ec653a9b9c1e2f2ac31 # Parent 1fdc01496e711d1f0196c316efc46517b09ea50e Deleted selected files diff -r 1fdc01496e71 -r 51fea6716ea5 cgatools/tools/cgatools/join.xml --- a/cgatools/tools/cgatools/join.xml Wed Jun 13 17:29:06 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,206 +0,0 @@ - - - two tsv files based on equal fields or overlapping regions. - - - cgatools - - - - cgatools join --beta - --input $inputA - --input $inputB - --output $output - --output-mode $outmode - $dump - --select $col - #for $m in $matches - --match ${m.match} - #end for - #if $range_overlap.range == 'yes' - #for $o in $range_overlap.overlaps - --overlap ${o.overlap} - #end for - --overlap-mode $range_overlap.overlapmode - --overlap-fraction-A $range_overlap.fractionA - --boundary-uncertainty-A $range_overlap.boundaryA - --overlap-fraction-B $range_overlap.fractionB - --boundary-uncertainty-B $range_overlap.boundaryB - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool joins two tab-delimited files based on equal fields or overlapping regions. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - join - Joins two tab-delimited files based on equal fields or overlapping regions. - - DESCRIPTION - Joins two tab-delimited files based on equal fields or overlapping regions. - By default, an output record is produced for each match found between file - A and file B, but output format can be controlled by the --output-mode - parameter. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. 
- - --input arg - File name to use as input (may be passed in as arguments at the end of - the command, or omitted for stdin). There must be exactly two input - files to join. If only one file is specified by name, file A is taken - to be stdin and file B is the named file. File B is read fully into - memory, and file A is streamed. File A's columns appear first in the - output. - - --output arg (=STDOUT) - The output file name (may be omitted for stdout). - - --match arg - A match specification, which is a column from A and a column from B - separated by a colon. - - --overlap arg - Overlap specification. An overlap specification consists of a range - definition for files A and B, separated by a colon. A range definition - may be two columns, in which case they are interpreted as the beginning - and end of the range. Or it may be one column, in which case the range - is defined as the 1-base range starting at the given value. The records - from the two files must overlap in order to be considered for output. - Two ranges are considered to overlap if the overlap is at least one - base long, or if one of the ranges is length 0 and the ranges overlap - or abut. For example, "begin,end:offset" will match wherever end-begin - > 0, begin<offset+1, and end>offset, or wherever end-begin = 0, - begin<=offset+1, and end>=offset. - - - -m [ --output-mode ] arg (=full) - Output mode, one of the following: - full Print an output record for each match found between - file A and file B. - compact Print at most one record for each record of file A, - joining the file B values by a semicolon and - suppressing repeated B values and empty B values. - compact-pct Same as compact, but for each distinct B value, - annotate with the percentage of the A record that is - overlapped by B records with that B value. Percentage - is rounded up to nearest integer. 
- - --overlap-mode arg (=strict) - Overlap mode, one of the following: - strict Range A and B overlap if A.begin < B.end and - B.begin < A.end. - allow-abutting-points Range A and B overlap if they meet the strict - requirements, or if A.begin <= B.end and - B.begin <= A.end and either A or B has zero - length. - - --select arg (=A.*,B.*) - Set of fields to select for output. - - -a [ --always-dump ] - Dump every record of A, even if there are no matches with file B. - - --overlap-fraction-A arg (=0) - Minimum fraction of A region overlap for filtering output. - - --boundary-uncertainty-A arg (=0) - Boundary uncertainty for overlap filtering. Specifically, records - failing the following predicate are filtered away: overlap >= - overlap-fraction-A * ( A-range-length - boundary-uncertainty-A ) - - --overlap-fraction-B arg (=0) - Minimum fraction of B region overlap for filtering output. - - --boundary-uncertainty-B arg (=0) - Boundary uncertainty for overlap filtering. Specifically, records - failing the following predicate are filtered away: overlap >= - overlap-fraction-B * ( B-range-length - boundary-uncertainty-B ) - - SUPPORTED FORMAT_VERSION - Any - - diff -r 1fdc01496e71 -r 51fea6716ea5 cgatools/tools/cgatools/junctiondiff.xml --- a/cgatools/tools/cgatools/junctiondiff.xml Wed Jun 13 17:29:06 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,146 +0,0 @@ - - - reports difference between junction calls - - - cgatools - - - - cgatools junctiondiff --beta - --reference $crr.fields.path - --junctionsA $data_sources.inputA - --junctionsB $data_sources.inputB - --scoreThresholdA $scoreA - --scoreThresholdB $scoreB - --distance $distance - --minlength $minlength - $stat - --output-prefix cg_ - ; - mv cg_diff-*tsv cg_diff.tsv - - - - - - (stat == '--statout') - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool reports difference between junction calls of Complete Genomics junctions 
files - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - junctiondiff - Reports difference between junction calls of Complete Genomics junctions files. - - DESCRIPTION - junctiondiff takes two junction files A and B as input and produces the - following output: - - "diff-inputFileName" - the junctions from an input file A that are not - present in input file B. - - "report.txt" - a brief summary report (if --statout is used) - - Two junctions are considered equivalent if: - - they come from different files - - left and right positions of one junction are not more than "--distance" - bases apart from the corresponding positions of another junction - - the junction scores are equal or above the scoreThreshold - - they are on the same strands - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - -s [ --reference ] arg - Reference file. - - -a [ --junctionsA ] arg - input junction file A. - - -b [ --junctionsB ] arg - input junction file B. - - -A [ --scoreThresholdA ] arg (=10) - score threshold value for the input file A. - - -B [ --scoreThresholdB ] arg (=0) - score threshold value for the input file B. - - -d [ --distance ] arg (=200) - Max distance between coordinates of potentially compatible junctions. - - -l [ --minlength ] arg (=500) - Minimum deletion junction length to be included into the difference - file. - - -o [ --output-prefix ] arg - The path prefix for all the output reports. - - -S [ --statout ] - (Debug) Report various input file statistics. Experimental feature. 
- - SUPPORTED FORMAT_VERSION - 1.5 or later - - diff -r 1fdc01496e71 -r 51fea6716ea5 cgatools/tools/cgatools/listtestvariants.xml --- a/cgatools/tools/cgatools/listtestvariants.xml Wed Jun 13 17:29:06 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,239 +0,0 @@ - - - - - - - cgatools - - - - cgatools listvariants - --beta - --reference ${crr.fields.path} - --output $output1 - #if $include_list.listing == "yes" - --variant-listing $include_list.list - #end if - $longvar - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - ; - - cgatools testvariants - --beta - --reference ${crr.fields.path} - --output $output2 - --input $output1 - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - listvariants - Lists the variants present in a variant file. - - DESCRIPTION - Lists all called variants present in the specified variant files, in a - format suitable for processing by the testvariants command. The output is a - tab-delimited file consisting of the following columns: - - variantId Sequential id assigned to each variant. - chromosome The chromosome of the variant. - begin 0-based reference offset of the beginning of the variant. - end 0-based reference offset of the end of the variant. - varType The varType as extracted from the variant file. 
- reference The reference sequence. - alleleSeq The variant allele sequence as extracted from the variant - file. - xRef The xRef as extracted from the variant file. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be positional args). - - --variant-listing arg - The output of another listvariants run, to be merged in to produce the - output of this run. - - --list-long-variants - In addition to listing short variants, list longer variants as well - (10's of bases) by concatenating nearby calls. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - - - COMMAND NAME - testvariants - Tests variant files for presence of variants. - - DESCRIPTION - Tests variant files for presence of variants. The output is a tab-delimited - file consisting of the columns of the input variants file, plus a column - for each assembly results file that contains a character code for each - allele. The character codes have meaning as follows: - - 0 This allele of this genome is consistent with the reference at this - locus but inconsistent with the variant. - 1 This allele of this genome has the input variant at this locus. - N This allele of this genome has no-calls but is consistent with the - input variant. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --input arg (=STDIN) - The input variants to test for. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be passed in as arguments at the end of - the command). 
- - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r 1fdc01496e71 -r 51fea6716ea5 cgatools/tools/cgatools/listvariants.xml --- a/cgatools/tools/cgatools/listvariants.xml Wed Jun 13 17:29:06 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,188 +0,0 @@ - - - - lists all called variants - - - cgatools - - - - cgatools listvariants - --beta - --reference ${crr.fields.path} - --output $output - #if $include_list.listing == "yes" - --variant-listing $include_list.list - #end if - $longvar - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses the cgatools listvariants to list all called variants present in the var or mastervar files. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - listvariants - Lists the variants present in a variant file. - - DESCRIPTION - Lists all called variants present in the specified variant files, in a - format suitable for processing by the testvariants command. The output is a - tab-delimited file consisting of the following columns: - - variantId Sequential id assigned to each variant. - chromosome The chromosome of the variant. - begin 0-based reference offset of the beginning of the variant. - end 0-based reference offset of the end of the variant. - varType The varType as extracted from the variant file. - reference The reference sequence. - alleleSeq The variant allele sequence as extracted from the variant - file. - xRef The xRef as extracted from the variant file. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. 
- - --reference arg - The reference crr file. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be positional args). - - --variant-listing arg - The output of another listvariants run, to be merged in to produce the - output of this run. - - --list-long-variants - In addition to listing short variants, list longer variants as well - (10's of bases) by concatenating nearby calls. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r 1fdc01496e71 -r 51fea6716ea5 cgatools/tools/cgatools/snpdiff.xml --- a/cgatools/tools/cgatools/snpdiff.xml Wed Jun 13 17:29:06 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,177 +0,0 @@ - - - compares snp calls to a Complete Genomics variant file. - - - cgatools - - - - cgatools snpdiff - --reference $crr.fields.path - --variants $varfile - --genotypes $genotype - --output-prefix cg_ - --reports `echo ${report1} ${report2} ${report3} | sed 's/ */,/g'` - - - - - (report1 == 'Output') - - - (report2 == 'Verbose') - - - (report3 == 'Stats') - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool compares snp calls to a Complete Genomics variant file. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - snpdiff - Compares snp calls to a Complete Genomics variant file. - - DESCRIPTION - Compares the snp calls in the "genotypes" file to the calls in a Complete - Genomics variant file. The genotypes file is a tab-delimited file with at - least the following columns (additional columns may be given): - - Chromosome (Required) The name of the chromosome. - Offset0Based (Required) The 0-based offset in the chromosome. - GenotypesStrand (Optional) The strand of the calls in the Genotypes - column (+ or -, defaults to +). - Genotypes (Optional) The calls, one per allele. 
The following - calls are recognized: - A,C,G,T A called base. - N A no-call. - - A deleted base. - . A non-snp variation. - - The output is a tab-delimited file consisting of the columns of the - original genotypes file, plus the following additional columns: - - Reference The reference base at the given position. - VariantFile The calls made by the variant file, one per allele. - The character codes are the same as is described for - the Genotypes column. - DiscordantAlleles (Only if Genotypes is present) The number of - Genotypes alleles that are discordant with calls in - the VariantFile. If the VariantFile is described as - haploid at the given position but the Genotypes is - diploid, then each genotype allele is compared - against the haploid call of the VariantFile. - NoCallAlleles (Only if Genotypes is present) The number of - Genotypes alleles that were no-called by the - VariantFile. If the VariantFile is described as - haploid at the given position but the Genotypes is - diploid, then a VariantFile no-call is counted twice. - - The verbose output is a tab-delimited file consisting of the columns of the - original genotypes file, plus the following additional columns: - - Reference The reference base at the given position. - VariantFile The call made by the variant file for one allele (there is - a line in this file for each allele). The character codes - are the same as is described for the Genotypes column. - [CALLS] The rest of the columns are pasted in from the VariantFile, - describing the variant file line used to make the call. - - The stats output is a comma-separated file with several tables describing - the results of the snp comparison, for each diploid genotype. The tables - all describe the comparison result (column headers) versus the genotype - classification (row labels) in different ways. 
The "Locus classification" - tables have the most detailed match classifications, while the "Locus - concordance" tables roll these match classifications up into "discordance" - and "no-call". A locus is considered discordant if it is discordant for - either allele. A locus is considered no-call if it is concordant for both - alleles but has a no-call on either allele. The "Allele concordance" - describes the comparison result on a per-allele basis. - - OPTIONS - -h [ --help ] - Print this help message. - - --reference arg - The input crr file. - - --variants arg - The input variant file. - - --genotypes arg - The input genotypes file. - - --output-prefix arg - The path prefix for all output reports. - - --reports arg (=Output,Verbose,Stats) - Comma-separated list of reports to generate. A report is one of: - Output The output genotypes file. - Verbose The verbose output file. - Stats The stats output file. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r 1fdc01496e71 -r 51fea6716ea5 cgatools/tools/cgatools/testvariants.xml --- a/cgatools/tools/cgatools/testvariants.xml Wed Jun 13 17:29:06 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,157 +0,0 @@ - - - - test for the presence of variants - - - cgatools - - - - cgatools testvariants - --beta - --reference ${crr.fields.path} - --output $output - --input $listing - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. 
- -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - testvariants - Tests variant files for presence of variants. - - DESCRIPTION - Tests variant files for presence of variants. The output is a tab-delimited - file consisting of the columns of the input variants file, plus a column - for each assembly results file that contains a character code for each - allele. The character codes have meaning as follows: - - 0 This allele of this genome is consistent with the reference at this - locus but inconsistent with the variant. - 1 This allele of this genome has the input variant at this locus. - N This allele of this genome has no-calls but is consistent with the - input variant. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --input arg (=STDIN) - The input variants to test for. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be passed in as arguments at the end of - the command). - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r 1fdc01496e71 -r 51fea6716ea5 cgatools/tools/cgatools/varfilter.xml --- a/cgatools/tools/cgatools/varfilter.xml Wed Jun 13 17:29:06 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ - - - - copies input file, applying filters. 
- - - cgatools - - - - varfilter_wrapper.pl - --reference $crr.fields.path - --output $output - --input $file_types.data_sources.input - #for $f in $filters - --zygosity $f.zygosity - --vartype $f.vartype - --varscorevaf x$f.varscorevaf - --varscoreeaf x$f.varscoreeaf - --varquality $f.varquality - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool copies input var file or masterVar file to output, applying specified filters. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - varfilter - Copies input var file or masterVar file to output, applying - specified filters. - - DESCRIPTION - Copies input var file or masterVar file to output, applying specified - filters (which are available to all cgatools commands that read a var file - or masterVar file as input). Filters are specified by appending the filter - specification to the var file name on the command line. For example: - - /path/to/var.tsv.bz2#varQuality!=VQHIGH - - The preceding example filters out any calls marked as VQLOW. The filter - specification follows the "#" sign, and consists of a list of filters to - apply, separated by a comma. Each filter is a colon-separated list of call - selectors. Any scored call that passes all the colon-separated call - selectors for one or more of the comma-separated filters is turned into a - no-call. The following call selectors are available: - - hom Selects only calls in homozygous loci. - het Selects any scored call not selected by the hom selector. - varType=XX Selects calls whose varType is XX. - varScoreVAF<XX Selects calls whose varScoreVAF<XX. - varScoreEAF<XX Selects calls whose varScoreEAF<XX. - varQuality!=XX Selects calls whose varQuality is not XX. 
- - Here is an example that filters homozygous SNPs with varScoreVAF < 25 and - heterozygous insertions with varScoreEAF < 50: - - - '/path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins:varScoreEAF<50' - - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta flag. - - --reference arg - The reference crr file. - - --input arg - The input var file or masterVar file (typically with filters specified). - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r 1fdc01496e71 -r 51fea6716ea5 cgatools/tools/cgatools/varfilter_wrapper.pl --- a/cgatools/tools/cgatools/varfilter_wrapper.pl Wed Jun 13 17:29:06 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use List::Util qw(max);

# Galaxy wrapper around "cgatools varfilter".
#
# Turns the per-filter options sent by the Galaxy tool form into the
# "#selector:selector,selector:..." suffix that is appended to the input file
# name, echoes the resulting command, then runs cgatools varfilter.
#
# Options:
#   --reference FILE    reference crr file (required)
#   --input FILE        var or masterVar file to filter (required)
#   --output FILE       output file (required)
#   --zygosity VALUE    repeatable; 'NA' means "no zygosity selector"
#   --vartype VALUE     repeatable; 'NA' means "no varType selector"
#   --varscorevaf xNUM  repeatable; a bare 'x' means "no varScoreVAF selector"
#   --varscoreeaf xNUM  repeatable; a bare 'x' means "no varScoreEAF selector"
#   --varquality VALUE  repeatable; 'NA' means "no varQuality selector"
#
# The i-th occurrence of each repeatable option describes the i-th filter.
#
# Exit status: the exit status of cgatools; the wrapper dies if the options are
# invalid or cgatools cannot be started.
#
# written 6-1-2012 by bcrain@completegenomics.com

$| = 1;    # autoflush STDOUT so the echoed command precedes any cgatools output

# Sentinel sent for an unselected zygosity / vartype / varquality choice.
my $UNSET_CHOICE = 'NA';

# Score thresholds arrive with this prefix; a value consisting of the prefix
# alone means the threshold was left blank.
# NOTE(review): presumably the prefix exists so that a blank threshold still
# yields a non-empty option argument -- confirm against the tool XML.
my $SCORE_PREFIX = 'x';

# Build the "#filter,filter,..." suffix from five parallel array refs.
# Each filter is the colon-joined list of the selectors that are set for that
# index; filters with no selectors are skipped.  Returns '' when nothing is set.
sub build_filter_suffix {
    my ($zygosity, $vartype, $varscorevaf, $varscoreeaf, $varquality) = @_;

    # Walk the longest list so that no supplied value is silently ignored;
    # entries missing from the shorter lists are treated as unset.
    my $filter_count = max(
        map { scalar @{$_} }
            $zygosity, $vartype, $varscorevaf, $varscoreeaf, $varquality
    );

    my @filters;
    for my $i (0 .. $filter_count - 1) {
        my @selectors;

        my $zyg = $zygosity->[$i];
        push @selectors, $zyg
            if defined $zyg && $zyg ne '' && $zyg ne $UNSET_CHOICE;

        my $type = $vartype->[$i];
        push @selectors, "varType=$type"
            if defined $type && $type ne $UNSET_CHOICE;

        my @scores = (
            [ 'varScoreVAF', $varscorevaf->[$i] ],
            [ 'varScoreEAF', $varscoreeaf->[$i] ],
        );
        for my $score (@scores) {
            my ($field, $threshold) = @{$score};
            next if !defined $threshold || $threshold eq $SCORE_PREFIX;
            $threshold =~ s/\Ax//;    # works on a copy; caller's data untouched
            push @selectors, "$field<$threshold";
        }

        my $quality = $varquality->[$i];
        push @selectors, "varQuality!=$quality"
            if defined $quality && $quality ne $UNSET_CHOICE;

        push @filters, join(':', @selectors) if @selectors;
    }

    return @filters ? '#' . join(',', @filters) : '';
}

my ($reference, $input, $output);
my (@zygosity, @vartype, @varscorevaf, @varscoreeaf, @varquality);

GetOptions(
    'reference=s'   => \$reference,
    'input=s'       => \$input,
    'output=s'      => \$output,
    'zygosity=s'    => \@zygosity,
    'vartype=s'     => \@vartype,
    'varscorevaf=s' => \@varscorevaf,
    'varscoreeaf=s' => \@varscoreeaf,
    'varquality=s'  => \@varquality,
) or die "varfilter_wrapper.pl: invalid command line options\n";

for my $required ([ reference => $reference ], [ input => $input ], [ output => $output ]) {
    my ($name, $value) = @{$required};
    die "varfilter_wrapper.pl: missing required option --$name\n"
        if !defined $value || $value eq '';
}

my $filtered_input = $input
    . build_filter_suffix(\@zygosity, \@vartype, \@varscorevaf, \@varscoreeaf, \@varquality);

print "cgatools varfilter
--beta
--reference $reference
--output $output
--input '$filtered_input'\n";

# List-form system() bypasses the shell, so quotes, spaces or shell
# metacharacters in paths and filter values cannot alter the command.
# Unlike the former backticks, cgatools' stdout is passed through, not discarded.
my $status = system(
    'cgatools', 'varfilter', '--beta',
    '--reference', $reference,
    '--output',    $output,
    '--input',     $filtered_input,
);

die "varfilter_wrapper.pl: could not run cgatools: $!\n" if $status == -1;
die sprintf("varfilter_wrapper.pl: cgatools died with signal %d\n", $status & 127)
    if $status & 127;

# Propagate the cgatools exit code so that a failed run is not reported as success.
exit($status >> 8);