# HG changeset patch # User bcrain-completegenomics # Date 1346798800 14400 # Node ID 382c50ce05194c247c86bbe427bf4f659b5aea16 # Parent df82283e402d285aae35c11e38d65012cf8cb8c9 Uploaded diff -r df82283e402d -r 382c50ce0519 cgatools/README.txt --- a/cgatools/README.txt Fri Jun 22 15:49:43 2012 -0400 +++ b/cgatools/README.txt Tue Sep 04 18:46:40 2012 -0400 @@ -1,11 +1,15 @@ Provides galaxy tools for Complete Genomics' cgatools package - http://www.completegenomics.com This repository provides tools to execute functions of cgatools from Complete Genomics, Inc. -and includes the cgatools 1.5 executable. +and includes the cgatools 1.6 executable. -Reference genomes files for cgatools can be downloaded from Complete Genomics ftp site: +Reference genome files for cgatools can be downloaded from Complete Genomics' ftp site: ftp://ftp.completegenomics.com/ReferenceFiles/build37.crr ftp://ftp.completegenomics.com/ReferenceFiles/build36.crr + +Calibration files for cgatools can be downloaded from Complete Genomics' ftp site: +ftp://ftp.completegenomics.com/ScoreCalibrationFiles/var-calibration-v2.tgz + After copying the files in the desired locations follow the instructions below to register the reference files with galaxy. @@ -15,7 +19,7 @@ AUTOMATIC INSTALL When prompted for a tool panel section to contain the installed tools create a new section -called 'Complete Genomics - cgatools 1.5'. +called 'Complete Genomics - cgatools 1.6'. After install create a cg_ccr_files.loc file in the tool-data directory of your Galaxy instance by copying the cg_ccr_files.loc.sample file. 
In cg_ccr_files.loc edit the path @@ -29,7 +33,7 @@ MANUAL INSTALL For manual install from compressed files move/copy the following files into your Galaxy instance: -directory tools/cgatools_v1.5 to tools/ +directory tools/cgatools_1.6 to tools/ file lib/galaxy/datatypes/completegenomics.py to lib/galaxy/datatypes/ file tool-data/cg_crr_files.loc.sample to tool-data/cg_crr_files.loc @@ -38,17 +42,20 @@ Paste from tool_config.xml.sample into the tool_config.xml of your Galaxy instance: -
- - - - - - - - +
+ + + + + + + + + +
diff -r df82283e402d -r 382c50ce0519 cgatools/tool_config.xml.sample --- a/cgatools/tool_config.xml.sample Fri Jun 22 15:49:43 2012 -0400 +++ b/cgatools/tool_config.xml.sample Tue Sep 04 18:46:40 2012 -0400 @@ -3,15 +3,17 @@ -
- - - - - - - - +
+ + + + + + + + + +
\ No newline at end of file diff -r df82283e402d -r 382c50ce0519 cgatools/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tool_dependencies.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,20 @@ + + + + + + http://sourceforge.net/projects/cgatools/files/1.6.0/cgatools-1.6.0.43-MacOSX_binary-x86_64.tar.gz + + cgatools-1.6.0.43-MacOSX_binary-x86_64/bin/cgatools + $INSTALL_DIR + + + $INSTALL_DIR + + + + +some text + + + \ No newline at end of file diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/calldiff.xml --- a/cgatools/tools/cgatools_1.5/calldiff.xml Fri Jun 22 15:49:43 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,350 +0,0 @@ - - - - compares two Complete Genomics variant files. - - - cgatools - - - - cgatools | head -1; - cgatools calldiff --beta - --reference ${crr.fields.path} - --variantsA $data_sources.inputA - --variantsB $data_sources.inputB - $validation - $diploid - --locus-stats-column-count $column - --max-hypothesis-count $hypothesis - --output-prefix cg_ - --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'` - #if $somatic.report6 == "SomaticOutput" - --genome-rootA $somatic.genomeA - --genome-rootB $somatic.genomeB - --calibration-root $somatic.calibration - #end if - - - - - (report1 == 'SuperlocusOutput') - - - (report2 == 'SuperlocusStats') - - - (report3 == 'LocusOutput') - - - (report4 == 'LocusStats') - - - (report5 == 'VariantOutput') - - - (report5 == 'VariantOutput') - - - (somatic['report6'] == 'SomaticOutput') - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool compares two Complete Genomics variant files. 
- -**cgatools 1.5.0 Documentation** - -Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf - -Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf - -**Command line reference**:: - - COMMAND NAME - calldiff - Compares two Complete Genomics variant files. - - DESCRIPTION - Compares two Complete Genomics variant files. Divides the genome up into - superloci of nearby variants, then compares the superloci. Also refines the - comparison to determine per-call or per-locus comparison results. - - Comparison results are usually described by a semi-colon separated string, - one per allele. Each allele's comparison result is one of the following - classifications: - - ref-identical The alleles of the two variant files are identical, and - they are consistent with the reference. - alt-identical The alleles of the two variant files are identical, and - they are inconsistent with the reference. - ref-consistent The alleles of the two variant files are consistent, - and they are consistent with the reference. - alt-consistent The alleles of the two variant files are consistent, - and they are inconsistent with the reference. - onlyA The alleles of the two variant files are inconsistent, - and only file A is inconsistent with the reference. - onlyB The alleles of the two variant files are inconsistent, - and only file B is inconsistent with the reference. - mismatch The alleles of the two variant files are inconsistent, - and they are both inconsistent with the reference. - phase-mismatch The two variant files would be consistent if the - hapLink field had been empty, but they are - inconsistent. - ploidy-mismatch The superlocus did not have uniform ploidy. - - In some contexts, this classification is rolled up into a simplified - classification, which is one of "identical", "consistent", "onlyA", - "onlyB", or "mismatch". - - A good place to start looking at the results is the superlocus-output file. 
- It has columns defined as follows: - - SuperlocusId An identifier given to the superlocus. - Chromosome The name of the chromosome. - Begin The 0-based offset of the start of the superlocus. - End The 0-based offset of the base one past the end of the - superlocus. - Classification The match classification of the superlocus. - Reference The reference sequence. - AllelesA A semicolon-separated list of the alleles (one per - haplotype) for variant file A, for the phasing with the - best comparison result. - AllelesB A semicolon-separated list of the alleles (one per - haplotype) for variant file B, for the phasing with the - best comparison result. - - The locus-output file contains, for each locus in file A and file B that is - not consistent with the reference, an annotated set of calls for the locus. - The calls are annotated with the following columns: - - SuperlocusId The id of the superlocus containing the locus. - File The variant file (A or B). - LocusClassification The locus classification is determined by the - varType column of the call that is inconsistent - with the reference, concatenated with a - modifier that describes whether the locus is - heterozygous, homozygous, or contains no-calls. - If there is no one variant in the locus (i.e., - it is heterozygous alt-alt), the locus - classification begins with "other". - LocusDiffClassification The match classification for the locus. This is - defined to be the best of the comparison of the - locus to the same region in the other file, or - the comparison of the superlocus. - - The somatic output file contains a list of putative somatic variations of - genome A. The output includes only those loci that can be classified as - snp, del, ins or sub in file A, and are called reference in the file B. - Every locus is annotated with the following columns: - - VarCvgA The totalReadCount from file A for this locus - (computed on the fly if file A is not a - masterVar file). 
- VarScoreA The varScoreVAF from file A, or varScoreEAF if - the "--diploid" option is used. - RefCvgB The maximum of the uniqueSequenceCoverage - values for the locus in genome B. - RefScoreB Minimum of the reference scores of the locus in - genome B. - SomaticCategory The category used for determining the - calibrated scores and the SomaticRank. - VarScoreACalib The calibrated variant score of file A, under - the model selected by using or not using the - "--diploid" option, and corrected for the count - of heterozygous variants observed in this - genome. See user guide for more information. - VarScoreBCalib The calibrated reference score of file B, under - the model selected by using or not using the - "--diploid" option, and corrected for the count - of heterozygous variants observed in this - genome. See user guide for more information. - SomaticRank The estimated rank of this somatic mutation, - amongst all true somatic mutations within this - SomaticCategory. The value is a number between - 0 and 1; a value of 0.012 means, for example, - that an estimated 1.2% of the true somatic - mutations in this somaticCategory have a - somaticScore less than the somaticScore for - this mutation. See user guide for more - information. - SomaticScore An integer that provides a total order on - quality for all somatic mutations. It is equal - to -10*log10( P(false)/P(true) ), under the - assumption that this genome has a rate of - somatic mutation equal to 1/Mb for - SomaticCategory snp, 1/10Mb for SomaticCategory - ins, 1/10Mb for SomaticCategory del, and 1/20Mb - for SomaticCategory sub. The computation is - based on the assumptions described in the user - guide, and is affected by choice of variant - model selected by using or not using the - "--diploid" option. - SomaticQuality Equal to VQHIGH for all somatic mutations where - SomaticScore >= -10. Otherwise, this column is - empty. - - OPTIONS - -h [ --help ] - Print this help message. 
- - --reference arg - The input crr file. - - --variantsA arg - The "A" input variant file. - - --variantsB arg - The "B" input variant file. - - --output-prefix arg - The path prefix for all output reports. - - --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats) - Comma-separated list of reports to generate. (Beware any reports whose - name begins with "Debug".) A report is one of: - SuperlocusOutput Report for superlocus classification. - SuperlocusStats Report for superlocus classification stats. - LocusOutput Report for locus classification. - LocusStats Report for locus stats. - VariantOutput Both variant files annotated by comparison - results.If the somatic output report is - requested, file A is also annotated with the - same score ranks as produced in that report. - SomaticOutput Report for the list of simple variations that - are present only in file "A", annotated with - the score that indicates the probability of - the variation being truly somatic. Requires - beta, genome-rootA, and genome-rootB options - to be provided as well. Note: generating this - report slows calldiff by 10x-20x. - DebugCallOutput Report for call classification. - DebugSuperlocusOutput Report for debug superlocus information. - DebugSomaticOutput Report for distribution estimates used for - somatic rescoring. Only produced if - SomaticOutput is also turned on. - - --diploid - Uses varScoreEAF instead of varScoreVAF in somatic score computations. - Also, uses diploid variant model instead of variable allele mixture - model. - - --locus-stats-column-count arg (=15) - The number of columns for locus compare classification in the locus - stats file. - - --max-hypothesis-count arg (=32) - The maximum number of possible phasings to consider for a superlocus. - - --no-reference-cover-validation - Turns off validation that all bases of a chromosome are covered by - calls of the variant file. 
- - --genome-rootA arg - The "A" genome directory, for example /data/GS00118-DNA_A01; this - directory is expected to contain ASM/REF and ASM/EVIDENCE - subdirectories. - - --genome-rootB arg - The "B" genome directory. - - --calibration-root arg - The directory containing calibration data. For example, there should - exist a file calibration-root/0.0.0/metrics.tsv. - - --beta - This flag enables the SomaticOutput report, which is beta - functionality. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/cgatools Binary file cgatools/tools/cgatools_1.5/cgatools has changed diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/join.xml --- a/cgatools/tools/cgatools_1.5/join.xml Fri Jun 22 15:49:43 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,213 +0,0 @@ - - - - two tsv files based on equal fields or overlapping regions. - - - cgatools - - - - cgatools | head -1; - cgatools join --beta - --input $inputA - --input $inputB - --output $output - --output-mode $outmode - $dump - --select $col - #for $m in $matches - --match ${m.match} - #end for - #if $range_overlap.range == 'yes' - #for $o in $range_overlap.overlaps - --overlap ${o.overlap} - #end for - --overlap-mode $range_overlap.overlapmode - --overlap-fraction-A $range_overlap.fractionA - --boundary-uncertainty-A $range_overlap.boundaryA - --overlap-fraction-B $range_overlap.fractionB - --boundary-uncertainty-B $range_overlap.boundaryB - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool joins two tab-delimited files based on equal fields or overlapping regions. 
- -**cgatools 1.5.0 Documentation** - -Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf - -Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf - -**Command line reference**:: - - COMMAND NAME - join - Joins two tab-delimited files based on equal fields or overlapping regions. - - DESCRIPTION - Joins two tab-delimited files based on equal fields or overlapping regions. - By default, an output record is produced for each match found between file - A and file B, but output format can be controlled by the --output-mode - parameter. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --input arg - File name to use as input (may be passed in as arguments at the end of - the command), or omitted for stdin). There must be exactly two input - files to join. If only one file is specified by name, file A is taken - to be stdin and file B is the named file. File B is read fully into - memory, and file A is streamed. File A's columns appear first in the - output. - - --output arg (=STDOUT) - The output file name (may be omitted for stdout). - - --match arg - A match specification, which is a column from A and a column from B - separated by a colon. - - --overlap arg - Overlap specification. An overlap specification consists of a range - definition for files A and B, separated by a colon. A range definition - may be two columns, in which case they are interpreted as the beginning - and end of the range. Or it may be one column, in which case the range - is defined as the 1-base range starting at the given value. The records - from the two files must overlap in order to be considered for output. - Two ranges are considered to overlap if the overlap is at least one - base long, or if one of the ranges is length 0 and the ranges overlap - or abut. 
For example, "begin,end:offset" will match wherever end-begin - > 0, begin<offset+1, and end>offset, or wherever end-begin = 0, - begin<=offset+1, and end>=offset. - - - -m [ --output-mode ] arg (=full) - Output mode, one of the following: - full Print an output record for each match found between - file A and file B. - compact Print at most one record for each record of file A, - joining the file B values by a semicolon and - suppressing repeated B values and empty B values. - compact-pct Same as compact, but for each distinct B value, - annotate with the percentage of the A record that is - overlapped by B records with that B value. Percentage - is rounded up to nearest integer. - - --overlap-mode arg (=strict) - Overlap mode, one of the following: - strict Range A and B overlap if A.begin < B.end and - B.begin < A.end. - allow-abutting-points Range A and B overlap they meet the strict - requirements, or if A.begin <= B.end and - B.begin <= A.end and either A or B has zero - length. - - --select arg (=A.*,B.*) - Set of fields to select for output. - - -a [ --always-dump ] - Dump every record of A, even if there are no matches with file B. - - --overlap-fraction-A arg (=0) - Minimum fraction of A region overlap for filtering output. - - --boundary-uncertainty-A arg (=0) - Boundary uncertainty for overlap filtering. Specifically, records - failing the following predicate are filtered away: overlap >= - overlap-fraction-A * ( A-range-length - boundary-uncertainty-A ) - - --overlap-fraction-B arg (=0) - Minimum fraction of B region overlap for filtering output. - - --boundary-uncertainty-B arg (=0) - Boundary uncertainty for overlap filtering. 
Specifically, records - failing the following predicate are filtered away: overlap >= - overlap-fraction-B * ( B-range-length - boundary-uncertainty-B ) - - SUPPORTED FORMAT_VERSION - Any - - diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/junctiondiff.xml --- a/cgatools/tools/cgatools_1.5/junctiondiff.xml Fri Jun 22 15:49:43 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,156 +0,0 @@ - - - - reports difference between junction calls - - - - - - cgatools - - - - cgatools | head -1; - cgatools junctiondiff --beta - --reference $crr.fields.path - --junctionsA $data_sources.inputA - --junctionsB $data_sources.inputB - --scoreThresholdA $scoreA - --scoreThresholdB $scoreB - --distance $distance - --minlength $minlength - $stat - --output-prefix cg_ - ; - mv cg_diff-*tsv cg_diff.tsv - - - - - - (stat == '--statout') - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool reports difference between junction calls of Complete Genomics junctions files - -**cgatools 1.5.0 Documentation** - -Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf - -Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf - -**Command line reference**:: - - COMMAND NAME - junctiondiff - Reports difference between junction calls of Complete Genomics junctions files. - - DESCRIPTION - junctiondiff takes two junction files A and B as input and produces the - following output: - - "diff-inputFileName" - the junctions from an input file A that are not - present in input file B. 
- - "report.txt" - a brief summary report (if --statout is used) - - Two junctions are considered equivalent if: - - they come from different files - - left and right positions of one junction are not more than "--distance" - bases apart from the corresponding positions of another junction - - the junction scores are equal or above the scoreThreshold - - they are on the same strands - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - -s [ --reference ] arg - Reference file. - - -a [ --junctionsA ] arg - input junction file A. - - -b [ --junctionsB ] arg - input junction file B. - - -A [ --scoreThresholdA ] arg (=10) - score threshold value for the input file A. - - -B [ --scoreThresholdB ] arg (=0) - score threshold value for the input file B. - - -d [ --distance ] arg (=200) - Max distance between coordinates of potentially compatible junctions. - - -l [ --minlength ] arg (=500) - Minimum deletion junction length to be included into the difference - file. - - -o [ --output-prefix ] arg - The path prefix for all the output reports. - - -S [ --statout ] - (Debug) Report various input file statistics. Experimental feature. 
- - SUPPORTED FORMAT_VERSION - 1.5 or later - - diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/listtestvariants.xml --- a/cgatools/tools/cgatools_1.5/listtestvariants.xml Fri Jun 22 15:49:43 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,242 +0,0 @@ - - - - performs listsvariants and testvariants consecutively - - - cgatools - - - - cgatools | head -1; - cgatools listvariants - --beta - --reference ${crr.fields.path} - --output $output1 - #if $include_list.listing == "yes" - --variant-listing $include_list.list - #end if - $longvar - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - ; - - cgatools testvariants - --beta - --reference ${crr.fields.path} - --output $output2 - --input $output1 - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. - -**cgatools 1.5.0 Documentation** - -Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf - -Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf - -**Command line reference**:: - - COMMAND NAME - listvariants - Lists the variants present in a variant file. - - DESCRIPTION - Lists all called variants present in the specified variant files, in a - format suitable for processing by the testvariants command. The output is a - tab-delimited file consisting of the following columns: - - variantId Sequential id assigned to each variant. 
- chromosome The chromosome of the variant. - begin 0-based reference offset of the beginning of the variant. - end 0-based reference offset of the end of the variant. - varType The varType as extracted from the variant file. - reference The reference sequence. - alleleSeq The variant allele sequence as extracted from the variant - file. - xRef The xRef as extrated from the variant file. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be positional args). - - --variant-listing arg - The output of another listvariants run, to be merged in to produce the - output of this run. - - --list-long-variants - In addition to listing short variants, list longer variants as well - (10's of bases) by concatenating nearby calls. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - - - COMMAND NAME - testvariants - Tests variant files for presence of variants. - - DESCRIPTION - Tests variant files for presence of variants. The output is a tab-delimited - file consisting of the columns of the input variants file, plus a column - for each assembly results file that contains a character code for each - allele. The character codes have meaning as follows: - - 0 This allele of this genome is consistent with the reference at this - locus but inconsistent with the variant. - 1 This allele of this genome has the input variant at this locus. - N This allele of this genome has no-calls but is consistent with the - input variant. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --input arg (=STDIN) - The input variants to test for. 
- - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be passed in as arguments at the end of - the command). - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/listvariants.xml --- a/cgatools/tools/cgatools_1.5/listvariants.xml Fri Jun 22 15:49:43 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,191 +0,0 @@ - - - - lists all called variants - - - cgatools - - - - cgatools | head -1; - cgatools listvariants - --beta - --reference ${crr.fields.path} - --output $output - #if $include_list.listing == "yes" - --variant-listing $include_list.list - #end if - $longvar - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses the cgatools listvariants to list all called variants present in the var or mastervar files. - -**cgatools 1.5.0 Documentation** - -Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf - -Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf - -**Command line reference**:: - - COMMAND NAME - listvariants - Lists the variants present in a variant file. - - DESCRIPTION - Lists all called variants present in the specified variant files, in a - format suitable for processing by the testvariants command. The output is a - tab-delimited file consisting of the following columns: - - variantId Sequential id assigned to each variant. - chromosome The chromosome of the variant. - begin 0-based reference offset of the beginning of the variant. - end 0-based reference offset of the end of the variant. 
- varType The varType as extracted from the variant file. - reference The reference sequence. - alleleSeq The variant allele sequence as extracted from the variant - file. - xRef The xRef as extrated from the variant file. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be positional args). - - --variant-listing arg - The output of another listvariants run, to be merged in to produce the - output of this run. - - --list-long-variants - In addition to listing short variants, list longer variants as well - (10's of bases) by concatenating nearby calls. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/snpdiff.xml --- a/cgatools/tools/cgatools_1.5/snpdiff.xml Fri Jun 22 15:49:43 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ - - - - compares snp calls to a Complete Genomics variant file. - - - cgatools - - - - cgatools | head -1; - cgatools snpdiff - --reference $crr.fields.path - --variants $varfile - --genotypes $genotype - --output-prefix cg_ - --reports `echo ${report1} ${report2} ${report3} | sed 's/ */,/g'` - - - - - (report1 == 'Output') - - - (report2 == 'Verbose') - - - (report3 == 'Stats') - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool ompares snp calls to a Complete Genomics variant file. - -**cgatools 1.5.0 Documentation** - -Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf - -Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf - -**Command line reference**:: - - COMMAND NAME - snpdiff - Compares snp calls to a Complete Genomics variant file. 
- - DESCRIPTION - Compares the snp calls in the "genotypes" file to the calls in a Complete - Genomics variant file. The genotypes file is a tab-delimited file with at - least the following columns (additional columns may be given): - - Chromosome (Required) The name of the chromosome. - Offset0Based (Required) The 0-based offset in the chromosome. - GenotypesStrand (Optional) The strand of the calls in the Genotypes - column (+ or -, defaults to +). - Genotypes (Optional) The calls, one per allele. The following - calls are recognized: - A,C,G,T A called base. - N A no-call. - - A deleted base. - . A non-snp variation. - - The output is a tab-delimited file consisting of the columns of the - original genotypes file, plus the following additional columns: - - Reference The reference base at the given position. - VariantFile The calls made by the variant file, one per allele. - The character codes are the same as is described for - the Genotypes column. - DiscordantAlleles (Only if Genotypes is present) The number of - Genotypes alleles that are discordant with calls in - the VariantFile. If the VariantFile is described as - haploid at the given position but the Genotypes is - diploid, then each genotype allele is compared - against the haploid call of the VariantFile. - NoCallAlleles (Only if Genotypes is present) The number of - Genotypes alleles that were no-called by the - VariantFile. If the VariantFile is described as - haploid at the given position but the Genotypes is - diploid, then a VariantFile no-call is counted twice. - - The verbose output is a tab-delimited file consisting of the columns of the - original genotypes file, plus the following additional columns: - - Reference The reference base at the given position. - VariantFile The call made by the variant file for one allele (there is - a line in this file for each allele). The character codes - are the same as is described for the Genotypes column. 
- [CALLS] The rest of the columns are pasted in from the VariantFile, - describing the variant file line used to make the call. - - The stats output is a comma-separated file with several tables describing - the results of the snp comparison, for each diploid genotype. The tables - all describe the comparison result (column headers) versus the genotype - classification (row labels) in different ways. The "Locus classification" - tables have the most detailed match classifications, while the "Locus - concordance" tables roll these match classifications up into "discordance" - and "no-call". A locus is considered discordant if it is discordant for - either allele. A locus is considered no-call if it is concordant for both - alleles but has a no-call on either allele. The "Allele concordance" - describes the comparison result on a per-allele basis. - - OPTIONS - -h [ --help ] - Print this help message. - - --reference arg - The input crr file. - - --variants arg - The input variant file. - - --genotypes arg - The input genotypes file. - - --output-prefix arg - The path prefix for all output reports. - - --reports arg (=Output,Verbose,Stats) - Comma-separated list of reports to generate. A report is one of: - Output The output genotypes file. - Verbose The verbose output file. - Stats The stats output file. 
- - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/testvariants.xml --- a/cgatools/tools/cgatools_1.5/testvariants.xml Fri Jun 22 15:49:43 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,160 +0,0 @@ - - - - test for the presence of variants - - - cgatools - - - - cgatools | head -1; - cgatools testvariants - --beta - --reference ${crr.fields.path} - --output $output - --input $listing - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. - -**cgatools 1.5.0 Documentation** - -Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf - -Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf - -**Command line reference**:: - - COMMAND NAME - testvariants - Tests variant files for presence of variants. - - DESCRIPTION - Tests variant files for presence of variants. The output is a tab-delimited - file consisting of the columns of the input variants file, plus a column - for each assembly results file that contains a character code for each - allele. The character codes have meaning as follows: - - 0 This allele of this genome is consistent with the reference at this - locus but inconsistent with the variant. - 1 This allele of this genome has the input variant at this locus. - N This allele of this genome has no-calls but is consistent with the - input variant. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. 
- - --reference arg - The reference crr file. - - --input arg (=STDIN) - The input variants to test for. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be passed in as arguments at the end of - the command). - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/varfilter.xml --- a/cgatools/tools/cgatools_1.5/varfilter.xml Fri Jun 22 15:49:43 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,187 +0,0 @@ - - - - copies input file, applying filters. - - - cgatools - - - - cgatools | head -1; - varfilter_wrapper.pl - --reference $crr.fields.path - --output $output - --input $file_types.data_sources.input - #for $f in $filters - --zygosity $f.zygosity - --vartype $f.vartype - --varscorevaf x$f.varscorevaf - --varscoreeaf x$f.varscoreeaf - --varquality $f.varquality - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool copies input var file or masterVar file to output, applying specified filters. - -**cgatools 1.5.0 Documentation** - -Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf - -Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf - -**Command line reference**:: - - COMMAND NAME - varfilter - Copies input var file or masterVar file to output, applying - specified filters. - - DESCRIPTION - Copies input var file or masterVar file to output, applying specified - filters (which are available to all cgatools commands that read a var file - or masterVar file as input). Filters are specified by appending the filter - specification to the var file name on the command line. For example: - - /path/to/var.tsv.bz2#varQuality!=VQHIGH - - The preceding example filters out any calls marked as VQLOW. 
The filter - specification follows the "#" sign, and consists of a list of filters to - apply, separated by a comma. Each filter is a colon-separated list of call - selectors. Any scored call that passes all the colon-separated call - selectors for one or more of the comma-separated filters is turned into a - no-call. The following call selectors are available: - - hom Selects only calls in homozygous loci. - het Selects any scored call not selected by the hom selector. - varType=XX Selects calls whose varType is XX. - varScoreVAF<XX Selects calls whose varScoreVAF<XX. - varScoreEAF<XX Selects calls whose varScoreEAF<XX. - varQuality!=XX Selects calls whose varQuality is not XX. - - Here is an example that filters homozygous SNPs with varScoreVAF < 25 and - heterozygous insertions with varScoreEAF < 50: - - - '/path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins:varScoreEAF<50' - - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta flag. - - --reference arg - The reference crr file. - - --input arg - The input var file or masterVar file (typically with filters specified). - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.5/varfilter_wrapper.pl --- a/cgatools/tools/cgatools_1.5/varfilter_wrapper.pl Fri Jun 22 15:49:43 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -#!/usr/bin/perl -use strict; -use Getopt::Long; -use vars qw($opt_reference $opt_input $opt_output @opt_zygosity @opt_vartype @opt_varscorevaf @opt_varscoreeaf @opt_varquality); -$| = 1; # set autoflush to screen - -# This is a wrapper for the cgatools varfilter function to run cgatools varfilter in Galaxy. -# The wrapper generates the filter(s) in the correct format to be used with the input file. 
-# written 6-1-2012 by bcrain@completegenomics.com - - -#print join("\n", @ARGV), "\n"; -&GetOptions("reference=s", "input=s", "output=s", "zygosity=s@", "vartype=s@", "varscorevaf=s@", "varscoreeaf=s@", "varquality=s@"); - -my $append = ''; - -for (my $i = 0; $i <= $#opt_zygosity; $i ++) -{ - my $filter = ''; - unless ($opt_zygosity[$i] eq 'NA') {$filter = $opt_zygosity[$i];} - unless ($opt_vartype[$i] eq 'NA') - { - $filter ne '' and $filter .= ':'; - $filter .= 'varType=' . $opt_vartype[$i]; - } - unless ($opt_varscorevaf[$i] eq 'x') - { - $filter ne '' and $filter .= ':'; - $opt_varscorevaf[$i] =~ s/^x//; - $filter .= 'varScoreVAF<' . $opt_varscorevaf[$i]; - } - unless ($opt_varscoreeaf[$i] eq 'x') - { - $filter ne '' and $filter .= ':'; - $opt_varscoreeaf[$i] =~ s/^x//; - $filter .= 'varScoreEAF<' . $opt_varscoreeaf[$i]; - } - unless ($opt_varquality[$i] eq 'NA') - { - $filter ne '' and $filter .= ':'; - $filter .= 'varQuality!=' . $opt_varquality[$i]; - } - - if ($filter ne '') - { - if ($append eq '') {$append = '#' . $filter;} - else {$append .= ',' . $filter;} - } -} -print "cgatools varfilter ---beta ---reference $opt_reference ---output $opt_output ---input '${opt_input}${append}'\n"; - -`cgatools varfilter --beta --reference $opt_reference --output $opt_output --input '${opt_input}${append}'`; \ No newline at end of file diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/.DS_Store Binary file cgatools/tools/cgatools_1.6/.DS_Store has changed diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/calldiff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/calldiff.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,388 @@ + + + + compares two Complete Genomics variant files. 
+ + + +cgatools | head -1; + + +echo "cgatools calldiff --beta +--reference ${crr.fields.path} +--variantsA $data_sources.inputA +--variantsB $data_sources.inputB +$validation +$diploid +--locus-stats-column-count $column +--max-hypothesis-count $hypothesis +--output-prefix cg_ +--reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'` +#if $somatic.report6 == "SomaticOutput" +--genome-rootA $somatic.genomeA +--genome-rootB $somatic.genomeB +--calibration-root $somatic.calibration +#end if +"; + + +cgatools calldiff --beta +--reference ${crr.fields.path} +--variantsA $data_sources.inputA +--variantsB $data_sources.inputB +$validation +$diploid +--locus-stats-column-count $column +--max-hypothesis-count $hypothesis +--output-prefix cg_ +--reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'` +#if $somatic.report6 == "SomaticOutput" + --genome-rootA $somatic.genomeA + --genome-rootB $somatic.genomeB + --calibration-root $somatic.calibration +#end if + + + + + (report1 == 'SuperlocusOutput') + + + (report2 == 'SuperlocusStats') + + + (report3 == 'LocusOutput') + + + (report4 == 'LocusStats') + + + (report5 == 'VariantOutput') + + + (report5 == 'VariantOutput') + + + (somatic['report6'] == 'SomaticOutput') + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses cgatools calldiff to compare two Complete Genomics variant files. + +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + calldiff - Compares two Complete Genomics variant files. 
+ + DESCRIPTION + Compares two Complete Genomics variant files. Divides the genome up into + superloci of nearby variants, then compares the superloci. Also refines the + comparison to determine per-call or per-locus comparison results. + + Comparison results are usually described by a semi-colon separated string, + one per allele. Each allele's comparison result is one of the following + classifications: + + ref-identical The alleles of the two variant files are identical, and + they are consistent with the reference. + alt-identical The alleles of the two variant files are identical, and + they are inconsistent with the reference. + ref-consistent The alleles of the two variant files are consistent, + and they are consistent with the reference. + alt-consistent The alleles of the two variant files are consistent, + and they are inconsistent with the reference. + onlyA The alleles of the two variant files are inconsistent, + and only file A is inconsistent with the reference. + onlyB The alleles of the two variant files are inconsistent, + and only file B is inconsistent with the reference. + mismatch The alleles of the two variant files are inconsistent, + and they are both inconsistent with the reference. + phase-mismatch The two variant files would be consistent if the + hapLink field had been empty, but they are + inconsistent. + ploidy-mismatch The superlocus did not have uniform ploidy. + + In some contexts, this classification is rolled up into a simplified + classification, which is one of "identical", "consistent", "onlyA", + "onlyB", or "mismatch". + + A good place to start looking at the results is the superlocus-output file. + It has columns defined as follows: + + SuperlocusId An identifier given to the superlocus. + Chromosome The name of the chromosome. + Begin The 0-based offset of the start of the superlocus. + End The 0-based offset of the base one past the end of the + superlocus. + Classification The match classification of the superlocus. 
+ Reference The reference sequence. + AllelesA A semicolon-separated list of the alleles (one per + haplotype) for variant file A, for the phasing with the + best comparison result. + AllelesB A semicolon-separated list of the alleles (one per + haplotype) for variant file B, for the phasing with the + best comparison result. + + The locus-output file contains, for each locus in file A and file B that is + not consistent with the reference, an annotated set of calls for the locus. + The calls are annotated with the following columns: + + SuperlocusId The id of the superlocus containing the locus. + File The variant file (A or B). + LocusClassification The locus classification is determined by the + varType column of the call that is inconsistent + with the reference, concatenated with a + modifier that describes whether the locus is + heterozygous, homozygous, or contains no-calls. + If there is no one variant in the locus (i.e., + it is heterozygous alt-alt), the locus + classification begins with "other". + LocusDiffClassification The match classification for the locus. This is + defined to be the best of the comparison of the + locus to the same region in the other file, or + the comparison of the superlocus. + + The somatic output file contains a list of putative somatic variations of + genome A. The output includes only those loci that can be classified as + snp, del, ins or sub in file A, and are called reference in the file B. + Every locus is annotated with the following columns: + + VarCvgA The totalReadCount from file A for this locus + (computed on the fly if file A is not a + masterVar file). + VarScoreA The varScoreVAF from file A, or varScoreEAF if + the "--diploid" option is used. + RefCvgB The maximum of the uniqueSequenceCoverage + values for the locus in genome B. + RefScoreB Minimum of the reference scores of the locus in + genome B. + SomaticCategory The category used for determining the + calibrated scores and the SomaticRank. 
+ VarScoreACalib The calibrated variant score of file A, under + the model selected by using or not using the + "--diploid" option, and corrected for the count + of heterozygous variants observed in this + genome. See user guide for more information. + VarScoreBCalib The calibrated reference score of file B, under + the model selected by using or not using the + "--diploid" option, and corrected for the count + of heterozygous variants observed in this + genome. See user guide for more information. + SomaticRank The estimated rank of this somatic mutation, + amongst all true somatic mutations within this + SomaticCategory. The value is a number between + 0 and 1; a value of 0.012 means, for example, + that an estimated 1.2% of the true somatic + mutations in this somaticCategory have a + somaticScore less than the somaticScore for + this mutation. See user guide for more + information. + SomaticScore An integer that provides a total order on + quality for all somatic mutations. It is equal + to -10*log10( P(false)/P(true) ), under the + assumption that this genome has a rate of + somatic mutation equal to 1/Mb for + SomaticCategory snp, 1/10Mb for SomaticCategory + ins, 1/10Mb for SomaticCategory del, and 1/20Mb + for SomaticCategory sub. The computation is + based on the assumptions described in the user + guide, and is affected by choice of variant + model selected by using or not using the + "--diploid" option. + SomaticQuality Equal to VQHIGH for all somatic mutations where + SomaticScore >= -10. Otherwise, this column is + empty. + + OPTIONS + -h [ --help ] + Print this help message. + + --reference arg + The input crr file. + + --variantsA arg + The "A" input variant file. + + --variantsB arg + The "B" input variant file. + + --output-prefix arg + The path prefix for all output reports. + + --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats) + Comma-separated list of reports to generate. 
(Beware any reports whose + name begins with "Debug".) A report is one of: + SuperlocusOutput Report for superlocus classification. + SuperlocusStats Report for superlocus classification stats. + LocusOutput Report for locus classification. + LocusStats Report for locus stats. + VariantOutput Both variant files annotated by comparison + results. If the somatic output report is + requested, file A is also annotated with the + same score ranks as produced in that report. + SomaticOutput Report for the list of simple variations that + are present only in file "A", annotated with + the score that indicates the probability of + the variation being truly somatic. Requires + beta, genome-rootA, and genome-rootB options + to be provided as well. Note: generating this + report slows calldiff by 10x-20x. + DebugCallOutput Report for call classification. + DebugSuperlocusOutput Report for debug superlocus information. + DebugSomaticOutput Report for distribution estimates used for + somatic rescoring. Only produced if + SomaticOutput is also turned on. + + --diploid + Uses varScoreEAF instead of varScoreVAF in somatic score computations. + Also, uses diploid variant model instead of variable allele mixture + model. + + --locus-stats-column-count arg (=15) + The number of columns for locus compare classification in the locus + stats file. + + --max-hypothesis-count arg (=32) + The maximum number of possible phasings to consider for a superlocus. + + --no-reference-cover-validation + Turns off validation that all bases of a chromosome are covered by + calls of the variant file. + + --genome-rootA arg + The "A" genome directory, for example /data/GS00118-DNA_A01; this + directory is expected to contain ASM/REF and ASM/EVIDENCE + subdirectories. + + --genome-rootB arg + The "B" genome directory. + + --calibration-root arg + The directory containing calibration data. For example, there should + exist a file calibration-root/0.0.0/metrics.tsv.
+ + --beta + This flag enables the SomaticOutput report, which is beta + functionality. + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/evidence2sam.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/evidence2sam.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,229 @@ + + + + converts evidence mappings to SAM format + + + +cgatools | head -1; + + +echo "cgatools evidence2sam --beta +--reference $crr.fields.path +--output $output +--evidence-dnbs $data_sources.input +--consistent-mapping-range $range +#if $region.selectregion == "yes" +--extract-genomic-region $region.coordinates +#end if +$duplicates +$mates +$intervals +$skip +$svcandidates +$unmapped +$primary +"; + + +cgatools evidence2sam --beta +--reference $crr.fields.path +--evidence-dnbs $data_sources.input +#if $region.selectregion == "yes" + --extract-genomic-region $region.coordinates +#end if +$duplicates +$mates +$intervals +$skip +$svcandidates +$unmapped +$primary +--consistent-mapping-range $range +--output $output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses cgatools evidence2sam to convert Complete Genomics evidence mappings to SAM format + +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + evidence2sam - Converts CGI variant evidence data into SAM format. + + DESCRIPTION + The evidence2sam converter takes as input evidence mapping files + (evidenceDnbs-*) and generates one SAM file as an output. The output is + sent into stdout by default. 
By default, all the evidence mapping records + from the input are converted into a pair of corresponding SAM records - one + record for each HalfDNB. The negative gaps in CGI mappings are represented + using GS/GQ/GC tags. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + -e [ --evidence-dnbs ] arg + Input evidence dnbs file. + + -s [ --reference ] arg + Reference file. + + -o [ --output ] arg (=STDOUT) + The output SAM file (may be omitted for stdout). + + -r [ --extract-genomic-region ] arg + defines a region as a half-open interval 'chr,from,to'. + + --keep-duplicates + Keep local duplicates of DNB mappings. All the output SAM records will + be marked as not primary if this option is used. + + --add-allele-id + Generate interval id and allele id tags. + + --skip-not-mapped + Skip not mapped records + + --add-mate-sequence + Generate mate sequence and score tags. + + --mate-sv-candidates + Inconsistent mappings are normally converted as single arm mappings + with no mate information provided. If the option is used map2sam will + mate unique single arm mappings in SAM including those on different + strands and chromosomes. To distinguish these "artificially" mated + records a tag "XS:i:1" is used. The MAPQ provided for these records is + a single arm mapping weight. + + --add-unmapped-mate-info + works like add-mate-sequence, but is applied to inconsistent mappings + only + + --primary-mappings-only + report only the best mappings + + --consistent-mapping-range arg (=1300) + limit the maximum distance between consistent mates + + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/join.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/join.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,241 @@ + + + + two tsv files based on equal fields or overlapping regions.
+ + + +cgatools | head -1; + + +echo "cgatools join --beta +--input $inputA +--input $inputB +--output $output +--output-mode $outmode +$dump +--select $col +#for $m in $matches +--match ${m.match} +#end for +#if $range_overlap.range == 'yes' +#for $o in $range_overlap.overlaps +--overlap ${o.overlap} +#end for +--overlap-mode $range_overlap.overlapmode +--overlap-fraction-A $range_overlap.fractionA +--boundary-uncertainty-A $range_overlap.boundaryA +--overlap-fraction-B $range_overlap.fractionB +--boundary-uncertainty-B $range_overlap.boundaryB +#end if +"; + + +cgatools join --beta +--input $inputA +--input $inputB +--output $output +--output-mode $outmode +$dump +--select $col +#for $m in $matches + --match ${m.match} +#end for +#if $range_overlap.range == 'yes' + #for $o in $range_overlap.overlaps + --overlap ${o.overlap} + #end for + --overlap-mode $range_overlap.overlapmode + --overlap-fraction-A $range_overlap.fractionA + --boundary-uncertainty-A $range_overlap.boundaryA + --overlap-fraction-B $range_overlap.fractionB + --boundary-uncertainty-B $range_overlap.boundaryB +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool joins two tab-delimited files based on equal fields or overlapping regions. + +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + join - Joins two tab-delimited files based on equal fields or overlapping regions. + + DESCRIPTION + Joins two tab-delimited files based on equal fields or overlapping regions. + By default, an output record is produced for each match found between file + A and file B, but output format can be controlled by the --output-mode + parameter. 
+ + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --input arg + File name to use as input (may be passed in as arguments at the end of + the command, or omitted for stdin). There must be exactly two input + files to join. If only one file is specified by name, file A is taken + to be stdin and file B is the named file. File B is read fully into + memory, and file A is streamed. File A's columns appear first in the + output. + + --output arg (=STDOUT) + The output file name (may be omitted for stdout). + + --match arg + A match specification, which is a column from A and a column from B + separated by a colon. + + --overlap arg + Overlap specification. An overlap specification consists of a range + definition for files A and B, separated by a colon. A range definition + may be two columns, in which case they are interpreted as the beginning + and end of the range. Or it may be one column, in which case the range + is defined as the 1-base range starting at the given value. The records + from the two files must overlap in order to be considered for output. + Two ranges are considered to overlap if the overlap is at least one + base long, or if one of the ranges is length 0 and the ranges overlap + or abut. For example, "begin,end:offset" will match wherever end-begin + > 0, begin<offset+1, and end>offset, or wherever end-begin = 0, + begin<=offset+1, and end>=offset. + + + -m [ --output-mode ] arg (=full) + Output mode, one of the following: + full Print an output record for each match found between + file A and file B. + compact Print at most one record for each record of file A, + joining the file B values by a semicolon and + suppressing repeated B values and empty B values. + compact-pct Same as compact, but for each distinct B value, + annotate with the percentage of the A record that is + overlapped by B records with that B value.
Percentage + is rounded up to nearest integer. + + --overlap-mode arg (=strict) + Overlap mode, one of the following: + strict Range A and B overlap if A.begin < B.end and + B.begin < A.end. + allow-abutting-points Range A and B overlap if they meet the strict + requirements, or if A.begin <= B.end and + B.begin <= A.end and either A or B has zero + length. + + --select arg (=A.*,B.*) + Set of fields to select for output. + + -a [ --always-dump ] + Dump every record of A, even if there are no matches with file B. + + --overlap-fraction-A arg (=0) + Minimum fraction of A region overlap for filtering output. + + --boundary-uncertainty-A arg (=0) + Boundary uncertainty for overlap filtering. Specifically, records + failing the following predicate are filtered away: overlap >= + overlap-fraction-A * ( A-range-length - boundary-uncertainty-A ) + + --overlap-fraction-B arg (=0) + Minimum fraction of B region overlap for filtering output. + + --boundary-uncertainty-B arg (=0) + Boundary uncertainty for overlap filtering.
Specifically, records + failing the following predicate are filtered away: overlap >= + overlap-fraction-B * ( B-range-length - boundary-uncertainty-B ) + + SUPPORTED FORMAT_VERSION + Any + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/junctiondiff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/junctiondiff.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,181 @@ + + + + reports difference between junction calls + + + +cgatools | head -1; + + +echo "cgatools junctiondiff --beta +--reference $crr.fields.path +--junctionsA $data_sources.inputA +--junctionsB $data_sources.inputB +--scoreThresholdA $scoreA +--scoreThresholdB $scoreB +--distance $distance +--minlength $minlength +--output-prefix cg_ +$stat +"; + + +cgatools junctiondiff --beta +--reference $crr.fields.path +--junctionsA $data_sources.inputA +--junctionsB $data_sources.inputB +--scoreThresholdA $scoreA +--scoreThresholdB $scoreB +--distance $distance +--minlength $minlength +--output-prefix cg_ +$stat +; +mv cg_diff-*tsv cg_diff.tsv + + + + + + (stat == '--statout') + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses cgatools junctiondiff to report difference between junction calls of two Complete Genomics junctions files + +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + junctiondiff - Reports difference between junction calls of Complete Genomics junctions files. + + DESCRIPTION + junctiondiff takes two junction files A and B as input and produces the + following output: + - "diff-inputFileName" - the junctions from an input file A that are not + present in input file B. 
+ - "report.txt" - a brief summary report (if --statout is used) + + Two junctions are considered equivalent if: + - they come from different files + - left and right positions of one junction are not more than "--distance" + bases apart from the corresponding positions of another junction + - the junction scores are equal or above the scoreThreshold + - they are on the same strands + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + -s [ --reference ] arg + Reference file. + + -a [ --junctionsA ] arg + input junction file A. + + -b [ --junctionsB ] arg + input junction file B. + + -A [ --scoreThresholdA ] arg (=10) + score threshold value for the input file A. + + -B [ --scoreThresholdB ] arg (=0) + score threshold value for the input file B. + + -d [ --distance ] arg (=200) + Max distance between coordinates of potentially compatible junctions. + + -l [ --minlength ] arg (=500) + Minimum deletion junction length to be included into the difference + file. + + -o [ --output-prefix ] arg + The path prefix for all the output reports. + + -S [ --statout ] + (Debug) Report various input file statistics. Experimental feature. 
+ + SUPPORTED FORMAT_VERSION + 1.5 or later + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/listtestvariants.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/listtestvariants.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,264 @@ + + + + performs listsvariants and testvariants consecutively + + + +cgatools | head -1; + + +echo "cgatools listvariants --beta +--reference ${crr.fields.path} +--output $output1 +#if $include_list.listing == "yes" +--variant-listing $include_list.list +#end if +$longvar +--variants +#if $data_sources.data_source == "in" +#for $v in $data_sources.file_types.files +${v.input} +#end for +#else +`cat $data_sources.file_types.list` +#end if +"; +echo "cgatools testvariants --beta +--reference ${crr.fields.path} +--output $output2 +--input $output1 +--variants +#if $data_sources.data_source == "in" +#for $v in $data_sources.file_types.files +${v.input} +#end for +#else +`cat $data_sources.file_types.list` +#end if +"; + + +cgatools listvariants +--beta +--reference ${crr.fields.path} +--output $output1 +#if $include_list.listing == "yes" + --variant-listing $include_list.list +#end if +$longvar +--variants +#if $data_sources.data_source == "in" + #for $v in $data_sources.file_types.files + ${v.input} + #end for +#else + `cat $data_sources.file_types.list` +#end if +; + +cgatools testvariants +--beta +--reference ${crr.fields.path} +--output $output2 +--input $output1 +--variants +#if $data_sources.data_source == "in" + #for $v in $data_sources.file_types.files + ${v.input} + #end for +#else + `cat $data_sources.file_types.list` +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses the cgatools listvariants and testvariants to test variant or mastervar files for the presence of variants. 
+ +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + listvariants - Lists the variants present in a variant file. + + DESCRIPTION + Lists all called variants present in the specified variant files, in a + format suitable for processing by the testvariants command. The output is a + tab-delimited file consisting of the following columns: + + variantId Sequential id assigned to each variant. + chromosome The chromosome of the variant. + begin 0-based reference offset of the beginning of the variant. + end 0-based reference offset of the end of the variant. + varType The varType as extracted from the variant file. + reference The reference sequence. + alleleSeq The variant allele sequence as extracted from the variant + file. + xRef The xRef as extracted from the variant file. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be positional args). + + --variant-listing arg + The output of another listvariants run, to be merged in to produce the + output of this run. + + --list-long-variants + In addition to listing short variants, list longer variants as well + (10's of bases) by concatenating nearby calls. + + SUPPORTED FORMAT_VERSION + 0.3 or later + + + + COMMAND NAME + testvariants - Tests variant files for presence of variants. + + DESCRIPTION + Tests variant files for presence of variants. The output is a tab-delimited + file consisting of the columns of the input variants file, plus a column + for each assembly results file that contains a character code for each + allele.
The character codes have meaning as follows: + + 0 This allele of this genome is consistent with the reference at this + locus but inconsistent with the variant. + 1 This allele of this genome has the input variant at this locus. + N This allele of this genome has no-calls but is consistent with the + input variant. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --input arg (=STDIN) + The input variants to test for. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be passed in as arguments at the end of + the command). + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/listvariants.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/listvariants.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,192 @@ + + + + lists all called variants + + + +cgatools | head -1; + + +echo "cgatools listvariants --beta +--reference ${crr.fields.path} +--output $output +#if $include_list.listing == "yes" +--variant-listing $include_list.list +#end if +$longvar +--variants +#if $data_sources.data_source == "in" +#for $v in $data_sources.file_types.files +${v.input} +#end for +#else +`cat $data_sources.list` +#end if +"; + + +cgatools listvariants --beta +--reference ${crr.fields.path} +--output $output +#if $include_list.listing == "yes" + --variant-listing $include_list.list +#end if +$longvar +--variants +#if $data_sources.data_source == "in" + #for $v in $data_sources.file_types.files + ${v.input} + #end for +#else + `cat $data_sources.list` +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses the cgatools listvariants to list 
all called variants present in the var or masterVar files. + +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + listvariants - Lists the variants present in a variant file. + + DESCRIPTION + Lists all called variants present in the specified variant files, in a + format suitable for processing by the testvariants command. The output is a + tab-delimited file consisting of the following columns: + + variantId Sequential id assigned to each variant. + chromosome The chromosome of the variant. + begin 0-based reference offset of the beginning of the variant. + end 0-based reference offset of the end of the variant. + varType The varType as extracted from the variant file. + reference The reference sequence. + alleleSeq The variant allele sequence as extracted from the variant + file. + xRef The xRef as extracted from the variant file. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be positional args). + + --variant-listing arg + The output of another listvariants run, to be merged in to produce the + output of this run. + + --list-long-variants + In addition to listing short variants, list longer variants as well + (10's of bases) by concatenating nearby calls. 
+ + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/mkvcf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/mkvcf.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,1001 @@ + + + + converts to vcf + + + + mkvcf_wrapper.pl + --reference $crr.fields.path + --output $output + --genomes $count.genomes + --source $count.sources.source + --datasource $count.sources.data_sources.data_source + #if $count.sources.data_sources.data_source=="in" + #for $m in $count.sources.data_sources.files + --input $m.input + #end for + #else + --input $count.sources.data_sources.input + #end if + #if $count.sources.source=="masterVar" or $count.sources.source=="masterVar,CNV" + $count.sources.nocalls + --calibration $count.sources.calibration + #else if $count.sources.source=="SV" + --jctscore $count.sources.jctscore + --jctside $count.sources.jctside + --jctdistance $count.sources.jctdistance + --jctlength $count.sources.jctlength + $count.sources.jctpriority + $count.sources.jcttumor + #else if $count.sources.source=="masterVar,CNV,SV" or $count.sources.source=="masterVar,CNV,SV,MEI" + $count.sources.nocalls + --calibration $count.sources.calibration + --jctscore $count.sources.jctscore + --jctside $count.sources.jctside + --jctdistance $count.sources.jctdistance + --jctlength $count.sources.jctlength + $count.sources.jctpriority + $count.sources.jcttumor + #end if + --fields $count.sources.fields + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses cgatools mkvcf to convert Complete Genomics masterVar files, including CNV, SV and/or MEI data, to vcf format version. + +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + mkvcf - Converts var file(s) or masterVar file(s) to VCF. + + DESCRIPTION + Converts var file(s) or masterVar file(s) to VCF. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). 
+ + --field-names arg (=GT,PS,NS,AN,AC,SS,FT,CGA_XR,CGA_FI,GQ,HQ,EHQ,CGA_CEHQ,GL, + CGA_CEGL,DP,AD,CGA_RDP,CGA_ODP,CGA_OAD,CGA_ORDP,CGA_PFAM,CGA_MIRB,CGA_RPT, + CGA_SDO,CGA_SOMC,CGA_SOMR,CGA_SOMS,CGA_GP,CGA_NP,CGA_CP,CGA_PS,CGA_CT, + CGA_TS,CGA_CL,CGA_LS,CGA_SCL,CGA_SLS,CGA_LAF,CGA_LLAF,CGA_ULAF,CGA_IS, + CGA_IDC,CGA_IDCL,CGA_IDCR,CGA_RDC,CGA_NBET,CGA_ETS,CGA_KES,CGA_BF, + CGA_MEDEL,MATEID,SVTYPE,CGA_BNDG,CGA_BNDGO,CGA_BNDMPC,CGA_BNDPOS,CGA_BNDDEF, + CGA_BNDP) + Comma-separated list of field names. By default, all fields are + included, but you may override this option to ensure only a subset of + the fields is included in the VCF output. For a description of each + field, see the cgatools user guide. + + --source-names arg (=masterVar,CNV,SV,MEI) + Comma-separated list of source names. The following source names are + available: + masterVar - Includes records extracted from the masterVar file. + CNV - Includes CNV-related records. + SV - Includes records derived from junctions files. + MEI - Includes records describing mobile element insertions. + Some of these source types are only available for more recent pipeline + versions, and some of these source types do not support multi-genome + VCFs. For more information about which source types are available for + which versions of the Complete Genomics pipeline software, see the + cgatools user guide. + + --genome-root arg + For each genome to include in the VCF, the genome root directory, for + example /data/GS00118-DNA_A01; this directory is expected to contain + the ASM and LIB subdirectories, for example. You must supply this + option for each genome in the VCF, unless you are using + --source-names=masterVar and you have specified the --master-var option + for each genome in the VCF. + + --master-var arg + For each genome to include in the VCF, the masterVar file. If + genome-roots parameter is given, this parameter defaults to the + masterVar in the given genome-root. 
+ + --include-no-calls + Small variants VCF records include loci that have no + reference-inconsistent calls. + + --calibration-root arg + The directory containing calibration data. For example, there should + exist a file calibration-root/version0.0.0/metrics.tsv. This option is only + required if CGA_CEHQ or CGA_CEGL are included in the --field-names + parameter. + + --junction-file arg + For each genome to include in the VCF, the junctions file. If + genome-roots parameter is given, this parameter defaults to the + respective junctions file in the export directory. + + --junction-score-threshold arg (=10) + Junction score thresholds (discordant mate pair count). + + --junction-side-length-threshold arg (=70) + Junction side length threshold. + + --junction-distance-tolerance arg (=200) + Distance tolerance for junction compatibility. + + --junction-length-threshold arg (=500) + Length threshold for compatible junctions. + + --junction-normal-priority + Normal junction priority for vcf output. + + --junction-tumor-hc + use high confidence junctions for tumors. + + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/mkvcf_wrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/mkvcf_wrapper.pl Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,95 @@ +#!/usr/bin/perl +use strict; +use Getopt::Long; +use vars qw($opt_reference $opt_output @opt_input $opt_genomes $opt_source $opt_datasource $opt_fields $opt_nocalls $opt_calibration $opt_jctscore $opt_jctside $opt_jctdistance $opt_jctlength $opt_jctpriority $opt_jcttumor); +$| = 1; # set autoflush to screen + +# This is a wrapper for the cgatools mkvcf function to run cgatools mkvcf in Galaxy. 
+# written 8-10-2012 by bcrain@completegenomics.com + +#print join("\n", @ARGV), "\n"; +&GetOptions("reference=s", "output=s", "input=s@", "genomes=i", "source=s", "datasource=s", "fields=s", "nocalls", "calibration:s", "jctscore=i", "jctside=i", "jctdistance=i", "jctlength=i", "jctpriority", "jcttumor"); + +my $command = "cgatools mkvcf --beta --reference $opt_reference --output $opt_output --source-names $opt_source"; + +if ($opt_datasource eq 'in') +{ + foreach my $file (@opt_input) + { + if ($opt_source eq 'masterVar') {$command .= " --master-var ";} + elsif ($opt_source eq 'SV') {$command .= " --junction-file ";} + else {die "there is an error in the logic: wrong source $opt_source for datasource $opt_datasource.\n";} + $command .= $file + } +} +elsif ($opt_datasource eq 'out') +{ + if ($opt_genomes == 1) + { + if ($opt_input[0] =~ m/masterVar/ and $opt_source eq 'masterVar') + { + -f $opt_input[0] or die "$opt_input[0] is not a valid file.\n"; + $command .= " --master-var $opt_input[0]"; + } + elsif ($opt_input[0] =~ m/Junctions/ and $opt_source eq 'SV') + { + -f $opt_input[0] or die "$opt_input[0] is not a valid file.\n"; + $command .= " --junction-file $opt_input[0]"; + } + else + { + $opt_input[0] =~ s/\/$//; + -d $opt_input[0] or die "$opt_input[0] is not a valid directory.\n"; + $command .= " --genome-root $opt_input[0]"; + } + } + else + { + -T $opt_input[0] or die "$opt_input[0] is not a valid file.\n"; + my $count = 0; + foreach my $file (split /\s+/, `cat $opt_input[0]`) + { + $count ++; + ($opt_genomes == 2 and $count > 2) and die "The number of inputs in your list file cannot be greater than the number of genomes selected.\n"; + if ($file =~ m/masterVar/ and $opt_source eq 'masterVar') + { + -f $file or die "$file is not a valid file.\n"; + $command .= " --master-var "; + } + elsif ($file =~ m/Junctions/ and $opt_source eq 'SV') + { + -f $file or die "$file is not a valid file.\n"; + $command .= " --junction-file "; + } + else + { + -d $file or die 
"$file is not a valid directory.\n"; + $command .= " --genome-root "; + } + $command .= $file + } + } +} +else +{die "there is an error in the logic: wrong datasource $opt_datasource.\n";} + +if ($opt_calibration) +{ + (-r "$opt_calibration/0.0.0/metrics.tsv" or -r "$opt_calibration/version0.0.0/metrics.tsv") or die "This folder does not contain the calibration data\n"; + $command .= " --calibration-root $opt_calibration"; +} + +$opt_fields eq 'all' or $command .= " --field-names $opt_fields"; +$opt_nocalls and $command .= " --include-no-calls"; +$opt_jctscore and $command .= " --junction-score-threshold $opt_jctscore"; +$opt_jctside and $command .= " --junction-side-length-threshold $opt_jctside"; +$opt_jctdistance and $command .= " --junction-distance-tolerance $opt_jctdistance"; +$opt_jctlength and $command .= " --junction-length-threshold $opt_jctlength"; +$opt_jctpriority and $command .= " --junction-normal-priority"; +$opt_jcttumor and $command .= " --junction-tumor-hc"; + +my $version = `cgatools | head -1`; +print "$version\n"; +print "$command \n"; + +`$command`; \ No newline at end of file diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/snpdiff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/snpdiff.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,198 @@ + + + + compares snp calls to var or masterVar file. 
+ + + +cgatools | head -1; + + +echo "cgatools snpdiff +--reference $crr.fields.path +--variants $data_sources.varfile +--genotypes $genotype +--output-prefix cg_ +--reports `echo ${report1} ${report2} ${report3} | sed 's/ */,/g'` +"; + + +cgatools snpdiff +--reference $crr.fields.path +--variants $data_sources.varfile +--genotypes $genotype +--output-prefix cg_ +--reports `echo ${report1} ${report2} ${report3} | sed 's/ */,/g'` + + + + + + (report1 == 'Output') + + + (report2 == 'Verbose') + + + (report3 == 'Stats') + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool ompares snp calls to a Complete Genomics variant file. + +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + snpdiff - Compares snp calls to a Complete Genomics variant file. + + DESCRIPTION + Compares the snp calls in the "genotypes" file to the calls in a Complete + Genomics variant file. The genotypes file is a tab-delimited file with at + least the following columns (additional columns may be given): + + Chromosome (Required) The name of the chromosome. + Offset0Based (Required) The 0-based offset in the chromosome. + GenotypesStrand (Optional) The strand of the calls in the Genotypes + column (+ or -, defaults to +). + Genotypes (Optional) The calls, one per allele. The following + calls are recognized: + A,C,G,T A called base. + N A no-call. + - A deleted base. + . A non-snp variation. + + The output is a tab-delimited file consisting of the columns of the + original genotypes file, plus the following additional columns: + + Reference The reference base at the given position. + VariantFile The calls made by the variant file, one per allele. 
+ The character codes are the same as is described for + the Genotypes column. + DiscordantAlleles (Only if Genotypes is present) The number of + Genotypes alleles that are discordant with calls in + the VariantFile. If the VariantFile is described as + haploid at the given position but the Genotypes is + diploid, then each genotype allele is compared + against the haploid call of the VariantFile. + NoCallAlleles (Only if Genotypes is present) The number of + Genotypes alleles that were no-called by the + VariantFile. If the VariantFile is described as + haploid at the given position but the Genotypes is + diploid, then a VariantFile no-call is counted twice. + + The verbose output is a tab-delimited file consisting of the columns of the + original genotypes file, plus the following additional columns: + + Reference The reference base at the given position. + VariantFile The call made by the variant file for one allele (there is + a line in this file for each allele). The character codes + are the same as is described for the Genotypes column. + [CALLS] The rest of the columns are pasted in from the VariantFile, + describing the variant file line used to make the call. + + The stats output is a comma-separated file with several tables describing + the results of the snp comparison, for each diploid genotype. The tables + all describe the comparison result (column headers) versus the genotype + classification (row labels) in different ways. The "Locus classification" + tables have the most detailed match classifications, while the "Locus + concordance" tables roll these match classifications up into "discordance" + and "no-call". A locus is considered discordant if it is discordant for + either allele. A locus is considered no-call if it is concordant for both + alleles but has a no-call on either allele. The "Allele concordance" + describes the comparison result on a per-allele basis. + + OPTIONS + -h [ --help ] + Print this help message. 
+ + --reference arg + The input crr file. + + --variants arg + The input variant file. + + --genotypes arg + The input genotypes file. + + --output-prefix arg + The path prefix for all output reports. + + --reports arg (=Output,Verbose,Stats) + Comma-separated list of reports to generate. A report is one of: + Output The output genotypes file. + Verbose The verbose output file. + Stats The stats output file. + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/testvariants.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/testvariants.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,166 @@ + + + + test for the presence of variants + + + +cgatools | head -1; + + +echo "cgatools testvariants --beta +--reference ${crr.fields.path} +--output $output +--input $listing +--variants +#if $data_sources.data_source == "in" +#for $v in $data_sources.file_types.files +${v.input} +#end for +#else +`cat $data_sources.list` +#end if +"; + + +cgatools testvariants +--beta +--reference ${crr.fields.path} +--output $output +--input $listing +--variants +#if $data_sources.data_source == "in" + #for $v in $data_sources.file_types.files + ${v.input} + #end for +#else + `cat $data_sources.list` +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses the cgatools testvariants to test var or masterVar files for the presence of variants. + +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + testvariants - Tests variant files for presence of variants. + + DESCRIPTION + Tests variant files for presence of variants. 
The output is a tab-delimited + file consisting of the columns of the input variants file, plus a column + for each assembly results file that contains a character code for each + allele. The character codes have meaning as follows: + + 0 This allele of this genome is consistent with the reference at this + locus but inconsistent with the variant. + 1 This allele of this genome has the input variant at this locus. + N This allele of this genome has no-calls but is consistent with the + input variant. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --input arg (=STDIN) + The input variants to test for. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be passed in as arguments at the end of + the command). + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/varfilter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/varfilter.xml Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,180 @@ + + + + copies input file, applying filters. + + + + varfilter_wrapper.pl + --reference $crr.fields.path + --output $output + #if $data_sources.data_source == "in" + --input $data_sources.file_types.input + #else + --input $data_sources.input + #end if + #for $f in $filters + --zygosity $f.zygosity + --vartype $f.vartype + --varscorevaf $f.varscorevaf + --varscoreeaf $f.varscoreeaf + --varquality $f.varquality + #end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses cgatools varfilter to copy input var file or masterVar file to output, applying specified filters. Loci that are filtered out are set to no-call. 
+ +**cgatools 1.6.0 Documentation** + +Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf + +Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf + +**Command line reference**:: + + COMMAND NAME + varfilter - Copies input var file or masterVar file to output, applying + specified filters. + + DESCRIPTION + Copies input var file or masterVar file to output, applying specified + filters (which are available to all cgatools commands that read a var file + or masterVar file as input). Filters are specified by appending the filter + specification to the var file name on the command line. For example: + + /path/to/var.tsv.bz2#varQuality!=VQHIGH + + The preceding example filters out any calls marked as VQLOW. The filter + specification follows the "#" sign, and consists of a list of filters to + apply, separated by a comma. Each filter is a colon-separated list of call + selectors. Any scored call that passes all the colon-separated call + selectors for one or more of the comma-separated filters is turned into a + no-call. The following call selectors are available: + + hom Selects only calls in homozygous loci. + het Selects any scored call not selected by the hom selector. + varType=XX Selects calls whose varType is XX. + varScoreVAF<XX Selects calls whose varScoreVAF < XX. + varScoreEAF<XX Selects calls whose varScoreEAF < XX. + varQuality!=XX Selects calls whose varQuality is not XX. + + Here is an example that filters homozygous SNPs with varScoreVAF < 25 and + heterozygous insertions with varScoreEAF < 50: + + + '/path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins:varScoreEAF<50' + + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta flag. + + --reference arg + The reference crr file. + + --input arg + The input var file or masterVar file (typically with filters specified). 
+ + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r df82283e402d -r 382c50ce0519 cgatools/tools/cgatools_1.6/varfilter_wrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools_1.6/varfilter_wrapper.pl Tue Sep 04 18:46:40 2012 -0400 @@ -0,0 +1,57 @@ +#!/usr/bin/perl +use strict; +use Getopt::Long; +use vars qw($opt_reference $opt_input $opt_output @opt_zygosity @opt_vartype @opt_varscorevaf @opt_varscoreeaf @opt_varquality); +$| = 1; # set autoflush to screen + +# This is a wrapper for the cgatools varfilter function to run cgatools varfilter in Galaxy. +# The wrapper generates the filter(s) in the correct format to be used with the input file. +# written 6-1-2012 by bcrain@completegenomics.com + + +#print join("\n", @ARGV), "\n"; +&GetOptions("reference=s", "input=s", "output=s", "zygosity=s@", "vartype=s@", "varscorevaf:i@", "varscoreeaf:i@", "varquality=s@"); + +my $append = ''; + +for (my $i = 0; $i <= $#opt_zygosity; $i ++) +{ + my $filter = ''; + unless ($opt_zygosity[$i] eq 'NA') {$filter = $opt_zygosity[$i];} + unless ($opt_vartype[$i] eq 'NA') + { + $filter ne '' and $filter .= ':'; + $filter .= 'varType=' . $opt_vartype[$i]; + } + unless ($opt_varscorevaf[$i] == 0) + { + $filter ne '' and $filter .= ':'; + $filter .= 'varScoreVAF<' . $opt_varscorevaf[$i]; + } + unless ($opt_varscoreeaf[$i] == 0) + { + $filter ne '' and $filter .= ':'; + $filter .= 'varScoreEAF<' . $opt_varscoreeaf[$i]; + } + unless ($opt_varquality[$i] eq 'NA') + { + $filter ne '' and $filter .= ':'; + $filter .= 'varQuality!=' . $opt_varquality[$i]; + } + + if ($filter ne '') + { + if ($append eq '') {$append = '#' . $filter;} + else {$append .= ',' . 
$filter;} + } +} + +my $version = `cgatools | head -1`; +print "cgatools varfilter +--beta +--reference $opt_reference +--output $opt_output +--input '${opt_input}${append}'\n"; +print "$version\n"; + +`cgatools varfilter --beta --reference $opt_reference --output $opt_output --input '${opt_input}${append}'`; \ No newline at end of file