# HG changeset patch
# User devteam
# Date 1348767479 14400
# Node ID ef23f9cd599b2d6511cc467be9d20db027be2326
Uploaded
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/README.txt Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,81 @@
+Provides galaxy tools for Complete Genomics' cgatools package - http://www.completegenomics.com
+
+This repository provides tools to execute functions of cgatools from Complete Genomics, Inc.
+and includes the cgatools 1.6 executable.
+
+Reference genome files for cgatools can be downloaded from Complete Genomics' ftp site:
+ftp://ftp.completegenomics.com/ReferenceFiles/build37.crr
+ftp://ftp.completegenomics.com/ReferenceFiles/build36.crr
+
+Calibration files for cgatools can be downloaded from Complete Genomics' ftp site:
+ftp://ftp.completegenomics.com/ScoreCalibrationFiles/var-calibration-v2.tgz
+
+After copying the files to the desired locations, follow the instructions below to register
+the reference files with Galaxy.
+
+
+
+
+AUTOMATIC INSTALL
+
+When prompted for a tool panel section to contain the installed tools create a new section
+called 'Complete Genomics - cgatools 1.6'.
+
+After install create a cg_crr_files.loc file in the tool-data directory of your Galaxy
+instance by copying the cg_crr_files.loc.sample file. In cg_crr_files.loc edit the path
+for the reference genome files (.crr files) downloaded from Complete Genomics' ftp site.
+
+Restart Galaxy instance after editing cg_crr_files.loc.
+
+
+
+
+MANUAL INSTALL
+
+For manual install from compressed files move/copy the following files into your Galaxy instance:
+directory tools/cgatools_1.6 to tools/
+file lib/galaxy/datatypes/completegenomics.py to lib/galaxy/datatypes/
+file tool-data/cg_crr_files.loc.sample to tool-data/cg_crr_files.loc
+
+In cg_crr_files.loc edit the path for the reference genome files (.crr files) downloaded
+from Complete Genomics' ftp site.
+
+Paste from tool_config.xml.sample into the tool_config.xml of your Galaxy instance:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Paste from tool_data_table_config.xml.sample into the tool_data_table_config.xml of your Galaxy instance:
+
+
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tool_dependencies.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,20 @@
+
+
+
+
+
+ http://sourceforge.net/projects/cgatools/files/1.6.0/cgatools-1.6.0.43-MacOSX_binary-x86_64.tar.gz
+
+ cgatools-1.6.0.43-MacOSX_binary-x86_64/bin
+ $INSTALL_DIR/bin
+
+
+ $INSTALL_DIR/bin
+
+
+
+
+some text
+
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/calldiff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/calldiff.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,388 @@
+
+
+
+ compares two Complete Genomics variant files.
+
+
+
+cgatools | head -1;
+
+
+echo "cgatools calldiff --beta
+--reference ${crr.fields.path}
+--variantsA $data_sources.inputA
+--variantsB $data_sources.inputB
+$validation
+$diploid
+--locus-stats-column-count $column
+--max-hypothesis-count $hypothesis
+--output-prefix cg_
+--reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'`
+#if $somatic.report6 == "SomaticOutput"
+--genome-rootA $somatic.genomeA
+--genome-rootB $somatic.genomeB
+--calibration-root $somatic.calibration
+#end if
+";
+
+
+cgatools calldiff --beta
+--reference ${crr.fields.path}
+--variantsA $data_sources.inputA
+--variantsB $data_sources.inputB
+$validation
+$diploid
+--locus-stats-column-count $column
+--max-hypothesis-count $hypothesis
+--output-prefix cg_
+--reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'`
+#if $somatic.report6 == "SomaticOutput"
+ --genome-rootA $somatic.genomeA
+ --genome-rootB $somatic.genomeB
+ --calibration-root $somatic.calibration
+#end if
+
+
+
+
+ (report1 == 'SuperlocusOutput')
+
+
+ (report2 == 'SuperlocusStats')
+
+
+ (report3 == 'LocusOutput')
+
+
+ (report4 == 'LocusStats')
+
+
+ (report5 == 'VariantOutput')
+
+
+ (report5 == 'VariantOutput')
+
+
+ (somatic['report6'] == 'SomaticOutput')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses cgatools calldiff to compare two Complete Genomics variant files.
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ calldiff - Compares two Complete Genomics variant files.
+
+ DESCRIPTION
+ Compares two Complete Genomics variant files. Divides the genome up into
+ superloci of nearby variants, then compares the superloci. Also refines the
+ comparison to determine per-call or per-locus comparison results.
+
+ Comparison results are usually described by a semi-colon separated string,
+ one per allele. Each allele's comparison result is one of the following
+ classifications:
+
+ ref-identical The alleles of the two variant files are identical, and
+ they are consistent with the reference.
+ alt-identical The alleles of the two variant files are identical, and
+ they are inconsistent with the reference.
+ ref-consistent The alleles of the two variant files are consistent,
+ and they are consistent with the reference.
+ alt-consistent The alleles of the two variant files are consistent,
+ and they are inconsistent with the reference.
+ onlyA The alleles of the two variant files are inconsistent,
+ and only file A is inconsistent with the reference.
+ onlyB The alleles of the two variant files are inconsistent,
+ and only file B is inconsistent with the reference.
+ mismatch The alleles of the two variant files are inconsistent,
+ and they are both inconsistent with the reference.
+ phase-mismatch The two variant files would be consistent if the
+ hapLink field had been empty, but they are
+ inconsistent.
+ ploidy-mismatch The superlocus did not have uniform ploidy.
+
+ In some contexts, this classification is rolled up into a simplified
+ classification, which is one of "identical", "consistent", "onlyA",
+ "onlyB", or "mismatch".
+
+ A good place to start looking at the results is the superlocus-output file.
+ It has columns defined as follows:
+
+ SuperlocusId An identifier given to the superlocus.
+ Chromosome The name of the chromosome.
+ Begin The 0-based offset of the start of the superlocus.
+ End The 0-based offset of the base one past the end of the
+ superlocus.
+ Classification The match classification of the superlocus.
+ Reference The reference sequence.
+ AllelesA A semicolon-separated list of the alleles (one per
+ haplotype) for variant file A, for the phasing with the
+ best comparison result.
+ AllelesB A semicolon-separated list of the alleles (one per
+ haplotype) for variant file B, for the phasing with the
+ best comparison result.
+
+ The locus-output file contains, for each locus in file A and file B that is
+ not consistent with the reference, an annotated set of calls for the locus.
+ The calls are annotated with the following columns:
+
+ SuperlocusId The id of the superlocus containing the locus.
+ File The variant file (A or B).
+ LocusClassification The locus classification is determined by the
+ varType column of the call that is inconsistent
+ with the reference, concatenated with a
+ modifier that describes whether the locus is
+ heterozygous, homozygous, or contains no-calls.
+ If there is no one variant in the locus (i.e.,
+ it is heterozygous alt-alt), the locus
+ classification begins with "other".
+ LocusDiffClassification The match classification for the locus. This is
+ defined to be the best of the comparison of the
+ locus to the same region in the other file, or
+ the comparison of the superlocus.
+
+ The somatic output file contains a list of putative somatic variations of
+ genome A. The output includes only those loci that can be classified as
+ snp, del, ins or sub in file A, and are called reference in the file B.
+ Every locus is annotated with the following columns:
+
+ VarCvgA The totalReadCount from file A for this locus
+ (computed on the fly if file A is not a
+ masterVar file).
+ VarScoreA The varScoreVAF from file A, or varScoreEAF if
+ the "--diploid" option is used.
+ RefCvgB The maximum of the uniqueSequenceCoverage
+ values for the locus in genome B.
+ RefScoreB Minimum of the reference scores of the locus in
+ genome B.
+ SomaticCategory The category used for determining the
+ calibrated scores and the SomaticRank.
+ VarScoreACalib The calibrated variant score of file A, under
+ the model selected by using or not using the
+ "--diploid" option, and corrected for the count
+ of heterozygous variants observed in this
+ genome. See user guide for more information.
+ VarScoreBCalib The calibrated reference score of file B, under
+ the model selected by using or not using the
+ "--diploid" option, and corrected for the count
+ of heterozygous variants observed in this
+ genome. See user guide for more information.
+ SomaticRank The estimated rank of this somatic mutation,
+ amongst all true somatic mutations within this
+ SomaticCategory. The value is a number between
+ 0 and 1; a value of 0.012 means, for example,
+ that an estimated 1.2% of the true somatic
+ mutations in this somaticCategory have a
+ somaticScore less than the somaticScore for
+ this mutation. See user guide for more
+ information.
+ SomaticScore An integer that provides a total order on
+ quality for all somatic mutations. It is equal
+ to -10*log10( P(false)/P(true) ), under the
+ assumption that this genome has a rate of
+ somatic mutation equal to 1/Mb for
+ SomaticCategory snp, 1/10Mb for SomaticCategory
+ ins, 1/10Mb for SomaticCategory del, and 1/20Mb
+ for SomaticCategory sub. The computation is
+ based on the assumptions described in the user
+ guide, and is affected by choice of variant
+ model selected by using or not using the
+ "--diploid" option.
+ SomaticQuality Equal to VQHIGH for all somatic mutations where
+ SomaticScore >= -10. Otherwise, this column is
+ empty.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --reference arg
+ The input crr file.
+
+ --variantsA arg
+ The "A" input variant file.
+
+ --variantsB arg
+ The "B" input variant file.
+
+ --output-prefix arg
+ The path prefix for all output reports.
+
+ --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats)
+ Comma-separated list of reports to generate. (Beware any reports whose
+ name begins with "Debug".) A report is one of:
+ SuperlocusOutput Report for superlocus classification.
+ SuperlocusStats Report for superlocus classification stats.
+ LocusOutput Report for locus classification.
+ LocusStats Report for locus stats.
+ VariantOutput Both variant files annotated by comparison
+ results. If the somatic output report is
+ requested, file A is also annotated with the
+ same score ranks as produced in that report.
+ SomaticOutput Report for the list of simple variations that
+ are present only in file "A", annotated with
+ the score that indicates the probability of
+ the variation being truly somatic. Requires
+ beta, genome-rootA, and genome-rootB options
+ to be provided as well. Note: generating this
+ report slows calldiff by 10x-20x.
+ DebugCallOutput Report for call classification.
+ DebugSuperlocusOutput Report for debug superlocus information.
+ DebugSomaticOutput Report for distribution estimates used for
+ somatic rescoring. Only produced if
+ SomaticOutput is also turned on.
+
+ --diploid
+ Uses varScoreEAF instead of varScoreVAF in somatic score computations.
+ Also, uses diploid variant model instead of variable allele mixture
+ model.
+
+ --locus-stats-column-count arg (=15)
+ The number of columns for locus compare classification in the locus
+ stats file.
+
+ --max-hypothesis-count arg (=32)
+ The maximum number of possible phasings to consider for a superlocus.
+
+ --no-reference-cover-validation
+ Turns off validation that all bases of a chromosome are covered by
+ calls of the variant file.
+
+ --genome-rootA arg
+ The "A" genome directory, for example /data/GS00118-DNA_A01; this
+ directory is expected to contain ASM/REF and ASM/EVIDENCE
+ subdirectories.
+
+ --genome-rootB arg
+ The "B" genome directory.
+
+ --calibration-root arg
+ The directory containing calibration data. For example, there should
+ exist a file calibration-root/0.0.0/metrics.tsv.
+
+ --beta
+ This flag enables the SomaticOutput report, which is beta
+ functionality.
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/evidence2sam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/evidence2sam.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,229 @@
+
+
+
+ converts evidence mappings to SAM format
+
+
+
+cgatools | head -1;
+
+
+echo "cgatools evidence2sam --beta
+--reference $crr.fields.path
+--output $output
+--evidence-dnbs $data_sources.input
+--consistent-mapping-range $range
+#if $region.selectregion == "yes"
+--extract-genomic-region $region.coordinates
+#end if
+$duplicates
+$mates
+$intervals
+$skip
+$svcandidates
+$unmapped
+$primary
+";
+
+
+cgatools evidence2sam --beta
+--reference $crr.fields.path
+--evidence-dnbs $data_sources.input
+#if $region.selectregion == "yes"
+ --extract-genomic-region $region.coordinates
+#end if
+$duplicates
+$mates
+$intervals
+$skip
+$svcandidates
+$unmapped
+$primary
+--consistent-mapping-range $range
+--output $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses cgatools evidence2sam to convert Complete Genomics evidence mappings to SAM format
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ evidence2sam - Converts CGI variant evidence data into SAM format.
+
+ DESCRIPTION
+ The evidence2sam converter takes as input evidence mapping files
+ (evidenceDnbs-*) and generates one SAM file as an output. The output is
+ sent into stdout by default. By default, all the evidence mapping records
+ from the input are converted into a pair of corresponding SAM records - one
+ record for each HalfDNB. The negative gaps in CGI mappings are represented
+ using GS/GQ/GC tags.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ -e [ --evidence-dnbs ] arg
+ Input evidence dnbs file.
+
+ -s [ --reference ] arg
+ Reference file.
+
+ -o [ --output ] arg (=STDOUT)
+ The output SAM file (may be omitted for stdout).
+
+ -r [ --extract-genomic-region ] arg
+ defines a region as a half-open interval 'chr,from,to'.
+
+ --keep-duplicates
+ Keep local duplicates of DNB mappings. All the output SAM records will
+ be marked as not primary if this option is used.
+
+ --add-allele-id
+ Generate interval id and allele id tags.
+
+ --skip-not-mapped
+ Skip not mapped records
+
+ --add-mate-sequence
+ Generate mate sequence and score tags.
+
+ --mate-sv-candidates
+ Inconsistent mappings are normally converted as single arm mappings
+ with no mate information provided. If the option is used map2sam will
+ mate unique single arm mappings in SAM including those on different
+ strands and chromosomes. To distinguish these "artificially" mated
+ records a tag "XS:i:1" is used. The MAPQ provided for these records is
+ a single arm mapping weight.
+
+ --add-unmapped-mate-info
+ works like add-mate-sequence, but is applied to inconsistent mappings
+ only
+
+ --primary-mappings-only
+ report only the best mappings
+
+ --consistent-mapping-range arg (=1300)
+ limit the maximum distance between consistent mates
+
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/join.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/join.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,241 @@
+
+
+
+ two tsv files based on equal fields or overlapping regions.
+
+
+
+cgatools | head -1;
+
+
+echo "cgatools join --beta
+--input $inputA
+--input $inputB
+--output $output
+--output-mode $outmode
+$dump
+--select $col
+#for $m in $matches
+--match ${m.match}
+#end for
+#if $range_overlap.range == 'yes'
+#for $o in $range_overlap.overlaps
+--overlap ${o.overlap}
+#end for
+--overlap-mode $range_overlap.overlapmode
+--overlap-fraction-A $range_overlap.fractionA
+--boundary-uncertainty-A $range_overlap.boundaryA
+--overlap-fraction-B $range_overlap.fractionB
+--boundary-uncertainty-B $range_overlap.boundaryB
+#end if
+";
+
+
+cgatools join --beta
+--input $inputA
+--input $inputB
+--output $output
+--output-mode $outmode
+$dump
+--select $col
+#for $m in $matches
+ --match ${m.match}
+#end for
+#if $range_overlap.range == 'yes'
+ #for $o in $range_overlap.overlaps
+ --overlap ${o.overlap}
+ #end for
+ --overlap-mode $range_overlap.overlapmode
+ --overlap-fraction-A $range_overlap.fractionA
+ --boundary-uncertainty-A $range_overlap.boundaryA
+ --overlap-fraction-B $range_overlap.fractionB
+ --boundary-uncertainty-B $range_overlap.boundaryB
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool joins two tab-delimited files based on equal fields or overlapping regions.
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ join - Joins two tab-delimited files based on equal fields or overlapping regions.
+
+ DESCRIPTION
+ Joins two tab-delimited files based on equal fields or overlapping regions.
+ By default, an output record is produced for each match found between file
+ A and file B, but output format can be controlled by the --output-mode
+ parameter.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --input arg
+ File name to use as input (may be passed in as arguments at the end of
+ the command, or omitted for stdin). There must be exactly two input
+ files to join. If only one file is specified by name, file A is taken
+ to be stdin and file B is the named file. File B is read fully into
+ memory, and file A is streamed. File A's columns appear first in the
+ output.
+
+ --output arg (=STDOUT)
+ The output file name (may be omitted for stdout).
+
+ --match arg
+ A match specification, which is a column from A and a column from B
+ separated by a colon.
+
+ --overlap arg
+ Overlap specification. An overlap specification consists of a range
+ definition for files A and B, separated by a colon. A range definition
+ may be two columns, in which case they are interpreted as the beginning
+ and end of the range. Or it may be one column, in which case the range
+ is defined as the 1-base range starting at the given value. The records
+ from the two files must overlap in order to be considered for output.
+ Two ranges are considered to overlap if the overlap is at least one
+ base long, or if one of the ranges is length 0 and the ranges overlap
+ or abut. For example, "begin,end:offset" will match wherever end-begin
+ > 0, begin<offset+1, and end>offset, or wherever end-begin = 0,
+ begin<=offset+1, and end>=offset.
+
+
+ -m [ --output-mode ] arg (=full)
+ Output mode, one of the following:
+ full Print an output record for each match found between
+ file A and file B.
+ compact Print at most one record for each record of file A,
+ joining the file B values by a semicolon and
+ suppressing repeated B values and empty B values.
+ compact-pct Same as compact, but for each distinct B value,
+ annotate with the percentage of the A record that is
+ overlapped by B records with that B value. Percentage
+ is rounded up to nearest integer.
+
+ --overlap-mode arg (=strict)
+ Overlap mode, one of the following:
+ strict Range A and B overlap if A.begin < B.end and
+ B.begin < A.end.
+ allow-abutting-points Range A and B overlap if they meet the strict
+ requirements, or if A.begin <= B.end and
+ B.begin <= A.end and either A or B has zero
+ length.
+
+ --select arg (=A.*,B.*)
+ Set of fields to select for output.
+
+ -a [ --always-dump ]
+ Dump every record of A, even if there are no matches with file B.
+
+ --overlap-fraction-A arg (=0)
+ Minimum fraction of A region overlap for filtering output.
+
+ --boundary-uncertainty-A arg (=0)
+ Boundary uncertainty for overlap filtering. Specifically, records
+ failing the following predicate are filtered away: overlap >=
+ overlap-fraction-A * ( A-range-length - boundary-uncertainty-A )
+
+ --overlap-fraction-B arg (=0)
+ Minimum fraction of B region overlap for filtering output.
+
+ --boundary-uncertainty-B arg (=0)
+ Boundary uncertainty for overlap filtering. Specifically, records
+ failing the following predicate are filtered away: overlap >=
+ overlap-fraction-B * ( B-range-length - boundary-uncertainty-B )
+
+ SUPPORTED FORMAT_VERSION
+ Any
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/junctiondiff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/junctiondiff.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,181 @@
+
+
+
+ reports difference between junction calls
+
+
+
+cgatools | head -1;
+
+
+echo "cgatools junctiondiff --beta
+--reference $crr.fields.path
+--junctionsA $data_sources.inputA
+--junctionsB $data_sources.inputB
+--scoreThresholdA $scoreA
+--scoreThresholdB $scoreB
+--distance $distance
+--minlength $minlength
+--output-prefix cg_
+$stat
+";
+
+
+cgatools junctiondiff --beta
+--reference $crr.fields.path
+--junctionsA $data_sources.inputA
+--junctionsB $data_sources.inputB
+--scoreThresholdA $scoreA
+--scoreThresholdB $scoreB
+--distance $distance
+--minlength $minlength
+--output-prefix cg_
+$stat
+;
+mv cg_diff-*tsv cg_diff.tsv
+
+
+
+
+
+ (stat == '--statout')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses cgatools junctiondiff to report difference between junction calls of two Complete Genomics junctions files
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ junctiondiff - Reports difference between junction calls of Complete Genomics junctions files.
+
+ DESCRIPTION
+ junctiondiff takes two junction files A and B as input and produces the
+ following output:
+ - "diff-inputFileName" - the junctions from an input file A that are not
+ present in input file B.
+ - "report.txt" - a brief summary report (if --statout is used)
+
+ Two junctions are considered equivalent if:
+ - they come from different files
+ - left and right positions of one junction are not more than "--distance"
+ bases apart from the corresponding positions of another junction
+ - the junction scores are equal or above the scoreThreshold
+ - they are on the same strands
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ -s [ --reference ] arg
+ Reference file.
+
+ -a [ --junctionsA ] arg
+ input junction file A.
+
+ -b [ --junctionsB ] arg
+ input junction file B.
+
+ -A [ --scoreThresholdA ] arg (=10)
+ score threshold value for the input file A.
+
+ -B [ --scoreThresholdB ] arg (=0)
+ score threshold value for the input file B.
+
+ -d [ --distance ] arg (=200)
+ Max distance between coordinates of potentially compatible junctions.
+
+ -l [ --minlength ] arg (=500)
+ Minimum deletion junction length to be included into the difference
+ file.
+
+ -o [ --output-prefix ] arg
+ The path prefix for all the output reports.
+
+ -S [ --statout ]
+ (Debug) Report various input file statistics. Experimental feature.
+
+ SUPPORTED FORMAT_VERSION
+ 1.5 or later
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/listtestvariants.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/listtestvariants.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,264 @@
+
+
+
+ performs listvariants and testvariants consecutively
+
+
+
+cgatools | head -1;
+
+
+echo "cgatools listvariants --beta
+--reference ${crr.fields.path}
+--output $output1
+#if $include_list.listing == "yes"
+--variant-listing $include_list.list
+#end if
+$longvar
+--variants
+#if $data_sources.data_source == "in"
+#for $v in $data_sources.file_types.files
+${v.input}
+#end for
+#else
+`cat $data_sources.file_types.list`
+#end if
+";
+echo "cgatools testvariants --beta
+--reference ${crr.fields.path}
+--output $output2
+--input $output1
+--variants
+#if $data_sources.data_source == "in"
+#for $v in $data_sources.file_types.files
+${v.input}
+#end for
+#else
+`cat $data_sources.file_types.list`
+#end if
+";
+
+
+cgatools listvariants
+--beta
+--reference ${crr.fields.path}
+--output $output1
+#if $include_list.listing == "yes"
+ --variant-listing $include_list.list
+#end if
+$longvar
+--variants
+#if $data_sources.data_source == "in"
+ #for $v in $data_sources.file_types.files
+ ${v.input}
+ #end for
+#else
+ `cat $data_sources.file_types.list`
+#end if
+;
+
+cgatools testvariants
+--beta
+--reference ${crr.fields.path}
+--output $output2
+--input $output1
+--variants
+#if $data_sources.data_source == "in"
+ #for $v in $data_sources.file_types.files
+ ${v.input}
+ #end for
+#else
+ `cat $data_sources.file_types.list`
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses the cgatools listvariants and testvariants to test variant or mastervar files for the presence of variants.
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ listvariants - Lists the variants present in a variant file.
+
+ DESCRIPTION
+ Lists all called variants present in the specified variant files, in a
+ format suitable for processing by the testvariants command. The output is a
+ tab-delimited file consisting of the following columns:
+
+ variantId Sequential id assigned to each variant.
+ chromosome The chromosome of the variant.
+ begin 0-based reference offset of the beginning of the variant.
+ end 0-based reference offset of the end of the variant.
+ varType The varType as extracted from the variant file.
+ reference The reference sequence.
+ alleleSeq The variant allele sequence as extracted from the variant
+ file.
+ xRef The xRef as extracted from the variant file.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --reference arg
+ The reference crr file.
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ --variants arg
+ The input variant files (may be positional args).
+
+ --variant-listing arg
+ The output of another listvariants run, to be merged in to produce the
+ output of this run.
+
+ --list-long-variants
+ In addition to listing short variants, list longer variants as well
+ (10's of bases) by concatenating nearby calls.
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
+
+ COMMAND NAME
+ testvariants - Tests variant files for presence of variants.
+
+ DESCRIPTION
+ Tests variant files for presence of variants. The output is a tab-delimited
+ file consisting of the columns of the input variants file, plus a column
+ for each assembly results file that contains a character code for each
+ allele. The character codes have meaning as follows:
+
+ 0 This allele of this genome is consistent with the reference at this
+ locus but inconsistent with the variant.
+ 1 This allele of this genome has the input variant at this locus.
+ N This allele of this genome has no-calls but is consistent with the
+ input variant.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --reference arg
+ The reference crr file.
+
+ --input arg (=STDIN)
+ The input variants to test for.
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ --variants arg
+ The input variant files (may be passed in as arguments at the end of
+ the command).
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/listvariants.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/listvariants.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,192 @@
+
+
+
+ lists all called variants
+
+
+
+cgatools | head -1;
+
+
+echo "cgatools listvariants --beta
+--reference ${crr.fields.path}
+--output $output
+#if $include_list.listing == "yes"
+--variant-listing $include_list.list
+#end if
+$longvar
+--variants
+#if $data_sources.data_source == "in"
+#for $v in $data_sources.file_types.files
+${v.input}
+#end for
+#else
+`cat $data_sources.list`
+#end if
+";
+
+
+cgatools listvariants --beta
+--reference ${crr.fields.path}
+--output $output
+#if $include_list.listing == "yes"
+ --variant-listing $include_list.list
+#end if
+$longvar
+--variants
+#if $data_sources.data_source == "in"
+ #for $v in $data_sources.file_types.files
+ ${v.input}
+ #end for
+#else
+ `cat $data_sources.list`
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses the cgatools listvariants to list all called variants present in the var or mastervar files.
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ listvariants - Lists the variants present in a variant file.
+
+ DESCRIPTION
+ Lists all called variants present in the specified variant files, in a
+ format suitable for processing by the testvariants command. The output is a
+ tab-delimited file consisting of the following columns:
+
+ variantId Sequential id assigned to each variant.
+ chromosome The chromosome of the variant.
+ begin 0-based reference offset of the beginning of the variant.
+ end 0-based reference offset of the end of the variant.
+ varType The varType as extracted from the variant file.
+ reference The reference sequence.
+ alleleSeq The variant allele sequence as extracted from the variant
+ file.
+ xRef The xRef as extracted from the variant file.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --reference arg
+ The reference crr file.
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ --variants arg
+ The input variant files (may be positional args).
+
+ --variant-listing arg
+ The output of another listvariants run, to be merged in to produce the
+ output of this run.
+
+ --list-long-variants
+ In addition to listing short variants, list longer variants as well
+ (10's of bases) by concatenating nearby calls.
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/mkvcf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/mkvcf.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,1001 @@
+
+
+
+ converts to vcf
+
+
+
+ mkvcf_wrapper.pl
+ --reference $crr.fields.path
+ --output $output
+ --genomes $count.genomes
+ --source $count.sources.source
+ --datasource $count.sources.data_sources.data_source
+ #if $count.sources.data_sources.data_source=="in"
+ #for $m in $count.sources.data_sources.files
+ --input $m.input
+ #end for
+ #else
+ --input $count.sources.data_sources.input
+ #end if
+ #if $count.sources.source=="masterVar" or $count.sources.source=="masterVar,CNV"
+ $count.sources.nocalls
+ --calibration $count.sources.calibration
+ #else if $count.sources.source=="SV"
+ --jctscore $count.sources.jctscore
+ --jctside $count.sources.jctside
+ --jctdistance $count.sources.jctdistance
+ --jctlength $count.sources.jctlength
+ $count.sources.jctpriority
+ $count.sources.jcttumor
+ #else if $count.sources.source=="masterVar,CNV,SV" or $count.sources.source=="masterVar,CNV,SV,MEI"
+ $count.sources.nocalls
+ --calibration $count.sources.calibration
+ --jctscore $count.sources.jctscore
+ --jctside $count.sources.jctside
+ --jctdistance $count.sources.jctdistance
+ --jctlength $count.sources.jctlength
+ $count.sources.jctpriority
+ $count.sources.jcttumor
+ #end if
+ --fields $count.sources.fields
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses cgatools mkvcf to convert Complete Genomics masterVar files, including CNV, SV and/or MEI data, to VCF format.
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ mkvcf - Converts var file(s) or masterVar file(s) to VCF.
+
+ DESCRIPTION
+ Converts var file(s) or masterVar file(s) to VCF.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --reference arg
+ The reference crr file.
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ --field-names arg (=GT,PS,NS,AN,AC,SS,FT,CGA_XR,CGA_FI,GQ,HQ,EHQ,CGA_CEHQ,GL,
+ CGA_CEGL,DP,AD,CGA_RDP,CGA_ODP,CGA_OAD,CGA_ORDP,CGA_PFAM,CGA_MIRB,CGA_RPT,
+ CGA_SDO,CGA_SOMC,CGA_SOMR,CGA_SOMS,CGA_GP,CGA_NP,CGA_CP,CGA_PS,CGA_CT,
+ CGA_TS,CGA_CL,CGA_LS,CGA_SCL,CGA_SLS,CGA_LAF,CGA_LLAF,CGA_ULAF,CGA_IS,
+ CGA_IDC,CGA_IDCL,CGA_IDCR,CGA_RDC,CGA_NBET,CGA_ETS,CGA_KES,CGA_BF,
+ CGA_MEDEL,MATEID,SVTYPE,CGA_BNDG,CGA_BNDGO,CGA_BNDMPC,CGA_BNDPOS,CGA_BNDDEF,
+ CGA_BNDP)
+ Comma-separated list of field names. By default, all fields are
+ included, but you may override this option to ensure only a subset of
+ the fields is included in the VCF output. For a description of each
+ field, see the cgatools user guide.
+
+ --source-names arg (=masterVar,CNV,SV,MEI)
+ Comma-separated list of source names. The following source names are
+ available:
+ masterVar - Includes records extracted from the masterVar file.
+ CNV - Includes CNV-related records.
+ SV - Includes records derived from junctions files.
+ MEI - Includes records describing mobile element insertions.
+ Some of these source types are only available for more recent pipeline
+ versions, and some of these source types do not support multi-genome
+ VCFs. For more information about which source types are available for
+ which versions of the Complete Genomics pipeline software, see the
+ cgatools user guide.
+
+ --genome-root arg
+ For each genome to include in the VCF, the genome root directory, for
+ example /data/GS00118-DNA_A01; this directory is expected to contain
+ the ASM and LIB subdirectories, for example. You must supply this
+ option for each genome in the VCF, unless you are using
+ --source-names=masterVar and you have specified the --master-var option
+ for each genome in the VCF.
+
+ --master-var arg
+ For each genome to include in the VCF, the masterVar file. If
+ genome-roots parameter is given, this parameter defaults to the
+ masterVar in the given genome-root.
+
+ --include-no-calls
+ Small variants VCF records include loci that have no
+ reference-inconsistent calls.
+
+ --calibration-root arg
+ The directory containing calibration data. For example, there should
+ exist a file calibration-root/version0.0.0/metrics.tsv. This option is only
+ required if CGA_CEHQ or CGA_CEGL are included in the --field-names
+ parameter.
+
+ --junction-file arg
+ For each genome to include in the VCF, the junctions file. If
+ genome-roots parameter is given, this parameter defaults to the
+ respective junctions file in the export directory.
+
+ --junction-score-threshold arg (=10)
+ Junction score thresholds (discordant mate pair count).
+
+ --junction-side-length-threshold arg (=70)
+ Junction side length threshold.
+
+ --junction-distance-tolerance arg (=200)
+ Distance tolerance for junction compatibility.
+
+ --junction-length-threshold arg (=500)
+ Length threshold for compatible junctions.
+
+ --junction-normal-priority
+ Normal junction priority for vcf output.
+
+ --junction-tumor-hc
+ use high confidence junctions for tumors.
+
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/mkvcf_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/mkvcf_wrapper.pl Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,95 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Getopt::Long;
+$| = 1; # set autoflush to screen
+
+# This is a wrapper for the cgatools mkvcf function to run cgatools mkvcf in Galaxy.
+# written 8-10-2012 by bcrain@completegenomics.com
+#
+# It turns the wrapper options into a cgatools mkvcf command, checks the input
+# paths, prints the cgatools version and the command, and then runs the command.
+#   --datasource in  : every --input is a file of the one type named by --source
+#   --datasource out : --input is a single path when --genomes is 1, otherwise a
+#                      text file listing whitespace-separated paths
+# The command is built as an argument list and run without a shell, so paths
+# with spaces or shell metacharacters are passed to cgatools unchanged.
+
+#print join("\n", @ARGV), "\n";
+my %opt;
+GetOptions(\%opt, "reference=s", "output=s", "input=s@", "genomes=i", "source=s", "datasource=s", "fields=s",
+    "nocalls", "calibration:s", "jctscore=i", "jctside=i", "jctdistance=i", "jctlength=i", "jctpriority", "jcttumor")
+    or die "invalid command line options.\n";
+defined $opt{$_} or die "missing required option --$_.\n" foreach qw(reference output source datasource);
+my @inputs = @{$opt{input} || []};
+@inputs or die "no input was specified.\n";
+my $source = $opt{source};
+my $genomes = defined $opt{genomes} ? $opt{genomes} : 0;
+my @command = ('cgatools', 'mkvcf', '--beta', '--reference', $opt{reference}, '--output', $opt{output}, '--source-names', $source);
+
+if ($opt{datasource} eq 'in')
+{
+    # the option is chosen from --source alone; only these two sources are accepted here
+    my %flag_for = (masterVar => '--master-var', SV => '--junction-file');
+    exists $flag_for{$source} or die "there is an error in the logic: wrong source $source for datasource $opt{datasource}.\n";
+    push @command, $flag_for{$source}, $_ foreach @inputs;
+}
+elsif ($opt{datasource} eq 'out')
+{
+    my @paths = ($inputs[0]);
+    if ($genomes != 1)
+    {
+        # several genomes: the input is a text file listing the path for each genome
+        -T $inputs[0] or die "$inputs[0] is not a valid file.\n";
+        open(my $list, '<', $inputs[0]) or die "$inputs[0] cannot be read: $!\n";
+        @paths = split ' ', join(' ', <$list>);
+        close $list;
+        ($genomes == 2 and @paths > 2) and die "The number of inputs in your list file cannot be greater than the number of genomes selected.\n";
+        @paths or die "$inputs[0] does not list any inputs.\n";
+    }
+    push @command, input_option($_, $source) foreach @paths;
+}
+else
+{die "there is an error in the logic: wrong datasource $opt{datasource}.\n";}
+
+if ($opt{calibration})
+{
+    (-r "$opt{calibration}/0.0.0/metrics.tsv" or -r "$opt{calibration}/version0.0.0/metrics.tsv") or die "This folder does not contain the calibration data\n";
+    push @command, '--calibration-root', $opt{calibration};
+}
+
+(defined $opt{fields} and length($opt{fields}) and $opt{fields} ne 'all') and push @command, '--field-names', $opt{fields};
+$opt{nocalls} and push @command, '--include-no-calls';
+# thresholds are tested with defined() so that an explicit 0 is passed on to
+# cgatools instead of being dropped in favour of the cgatools default
+defined $opt{jctscore} and push @command, '--junction-score-threshold', $opt{jctscore};
+defined $opt{jctside} and push @command, '--junction-side-length-threshold', $opt{jctside};
+defined $opt{jctdistance} and push @command, '--junction-distance-tolerance', $opt{jctdistance};
+defined $opt{jctlength} and push @command, '--junction-length-threshold', $opt{jctlength};
+$opt{jctpriority} and push @command, '--junction-normal-priority';
+$opt{jcttumor} and push @command, '--junction-tumor-hc';
+
+my $version = `cgatools | head -1`;
+print "$version\n";
+print join(' ', @command), " \n";
+
+# list-form system() runs cgatools directly (no shell); unlike the former
+# backticks call, a failing cgatools run now makes the wrapper fail as well
+my $status = system(@command);
+$status == -1 and die "cgatools could not be started: $!\n";
+$status == 0 or die "cgatools mkvcf failed (wait status $status).\n";
+
+# Returns the mkvcf option and the path to pass for one "out" input: masterVar
+# and junction files are recognized by name (for the matching --source), any
+# other path must be a genome root directory. Dies when the path is invalid.
+sub input_option
+{
+    my ($path, $source_name) = @_;
+    $path =~ s/\/$//; # a trailing slash is not passed on to cgatools
+    my $flag = ($path =~ m/masterVar/ and $source_name eq 'masterVar') ? '--master-var'
+             : ($path =~ m/Junctions/ and $source_name eq 'SV') ? '--junction-file'
+             : '--genome-root';
+    if ($flag eq '--genome-root') {-d $path or die "$path is not a valid directory.\n";}
+    else {-f $path or die "$path is not a valid file.\n";}
+    return ($flag, $path);
+}
\ No newline at end of file
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/snpdiff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/snpdiff.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,198 @@
+
+
+
+ compares snp calls to var or masterVar file.
+
+
+
+cgatools | head -1;
+
+
+echo "cgatools snpdiff
+--reference $crr.fields.path
+--variants $data_sources.varfile
+--genotypes $genotype
+--output-prefix cg_
+--reports `echo ${report1} ${report2} ${report3} | sed 's/ */,/g'`
+";
+
+
+cgatools snpdiff
+--reference $crr.fields.path
+--variants $data_sources.varfile
+--genotypes $genotype
+--output-prefix cg_
+--reports `echo ${report1} ${report2} ${report3} | sed 's/ */,/g'`
+
+
+
+
+
+ (report1 == 'Output')
+
+
+ (report2 == 'Verbose')
+
+
+ (report3 == 'Stats')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool compares snp calls to a Complete Genomics variant file.
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ snpdiff - Compares snp calls to a Complete Genomics variant file.
+
+ DESCRIPTION
+ Compares the snp calls in the "genotypes" file to the calls in a Complete
+ Genomics variant file. The genotypes file is a tab-delimited file with at
+ least the following columns (additional columns may be given):
+
+ Chromosome (Required) The name of the chromosome.
+ Offset0Based (Required) The 0-based offset in the chromosome.
+ GenotypesStrand (Optional) The strand of the calls in the Genotypes
+ column (+ or -, defaults to +).
+ Genotypes (Optional) The calls, one per allele. The following
+ calls are recognized:
+ A,C,G,T A called base.
+ N A no-call.
+ - A deleted base.
+ . A non-snp variation.
+
+ The output is a tab-delimited file consisting of the columns of the
+ original genotypes file, plus the following additional columns:
+
+ Reference The reference base at the given position.
+ VariantFile The calls made by the variant file, one per allele.
+ The character codes are the same as is described for
+ the Genotypes column.
+ DiscordantAlleles (Only if Genotypes is present) The number of
+ Genotypes alleles that are discordant with calls in
+ the VariantFile. If the VariantFile is described as
+ haploid at the given position but the Genotypes is
+ diploid, then each genotype allele is compared
+ against the haploid call of the VariantFile.
+ NoCallAlleles (Only if Genotypes is present) The number of
+ Genotypes alleles that were no-called by the
+ VariantFile. If the VariantFile is described as
+ haploid at the given position but the Genotypes is
+ diploid, then a VariantFile no-call is counted twice.
+
+ The verbose output is a tab-delimited file consisting of the columns of the
+ original genotypes file, plus the following additional columns:
+
+ Reference The reference base at the given position.
+ VariantFile The call made by the variant file for one allele (there is
+ a line in this file for each allele). The character codes
+ are the same as is described for the Genotypes column.
+ [CALLS] The rest of the columns are pasted in from the VariantFile,
+ describing the variant file line used to make the call.
+
+ The stats output is a comma-separated file with several tables describing
+ the results of the snp comparison, for each diploid genotype. The tables
+ all describe the comparison result (column headers) versus the genotype
+ classification (row labels) in different ways. The "Locus classification"
+ tables have the most detailed match classifications, while the "Locus
+ concordance" tables roll these match classifications up into "discordance"
+ and "no-call". A locus is considered discordant if it is discordant for
+ either allele. A locus is considered no-call if it is concordant for both
+ alleles but has a no-call on either allele. The "Allele concordance"
+ describes the comparison result on a per-allele basis.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --reference arg
+ The input crr file.
+
+ --variants arg
+ The input variant file.
+
+ --genotypes arg
+ The input genotypes file.
+
+ --output-prefix arg
+ The path prefix for all output reports.
+
+ --reports arg (=Output,Verbose,Stats)
+ Comma-separated list of reports to generate. A report is one of:
+ Output The output genotypes file.
+ Verbose The verbose output file.
+ Stats The stats output file.
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/testvariants.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/testvariants.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,166 @@
+
+
+
+ test for the presence of variants
+
+
+
+cgatools | head -1;
+
+
+echo "cgatools testvariants --beta
+--reference ${crr.fields.path}
+--output $output
+--input $listing
+--variants
+#if $data_sources.data_source == "in"
+#for $v in $data_sources.file_types.files
+${v.input}
+#end for
+#else
+`cat $data_sources.list`
+#end if
+";
+
+
+cgatools testvariants
+--beta
+--reference ${crr.fields.path}
+--output $output
+--input $listing
+--variants
+#if $data_sources.data_source == "in"
+ #for $v in $data_sources.file_types.files
+ ${v.input}
+ #end for
+#else
+ `cat $data_sources.list`
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses the cgatools testvariants to test var or masterVar files for the presence of variants.
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ testvariants - Tests variant files for presence of variants.
+
+ DESCRIPTION
+ Tests variant files for presence of variants. The output is a tab-delimited
+ file consisting of the columns of the input variants file, plus a column
+ for each assembly results file that contains a character code for each
+ allele. The character codes have meaning as follows:
+
+ 0 This allele of this genome is consistent with the reference at this
+ locus but inconsistent with the variant.
+ 1 This allele of this genome has the input variant at this locus.
+ N This allele of this genome has no-calls but is consistent with the
+ input variant.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --reference arg
+ The reference crr file.
+
+ --input arg (=STDIN)
+ The input variants to test for.
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ --variants arg
+ The input variant files (may be passed in as arguments at the end of
+ the command).
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/varfilter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/varfilter.xml Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,180 @@
+
+
+
+ copies input file, applying filters.
+
+
+
+ varfilter_wrapper.pl
+ --reference $crr.fields.path
+ --output $output
+ #if $data_sources.data_source == "in"
+ --input $data_sources.file_types.input
+ #else
+ --input $data_sources.input
+ #end if
+ #for $f in $filters
+ --zygosity $f.zygosity
+ --vartype $f.vartype
+ --varscorevaf $f.varscorevaf
+ --varscoreeaf $f.varscoreeaf
+ --varquality $f.varquality
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses cgatools varfilter to copy input var file or masterVar file to output, applying specified filters. Loci that are filtered out are set to no-call.
+
+**cgatools 1.6.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+ COMMAND NAME
+ varfilter - Copies input var file or masterVar file to output, applying
+ specified filters.
+
+ DESCRIPTION
+ Copies input var file or masterVar file to output, applying specified
+ filters (which are available to all cgatools commands that read a var file
+ or masterVar file as input). Filters are specified by appending the filter
+ specification to the var file name on the command line. For example:
+
+ /path/to/var.tsv.bz2#varQuality!=VQHIGH
+
+ The preceding example filters out any calls marked as VQLOW. The filter
+ specification follows the "#" sign, and consists of a list of filters to
+ apply, separated by a comma. Each filter is a colon-separated list of call
+ selectors. Any scored call that passes all the colon-separated call
+ selectors for one or more of the comma-separated filters is turned into a
+ no-call. The following call selectors are available:
+
+ hom Selects only calls in homozygous loci.
+ het Selects any scored call not selected by the hom selector.
+ varType=XX Selects calls whose varType is XX.
+ varScoreVAF<XX Selects calls whose varScoreVAF < XX.
+ varScoreEAF<XX Selects calls whose varScoreEAF < XX.
+ varQuality!=XX Selects calls whose varQuality is not XX.
+
+ Here is an example that filters homozygous SNPs with varScoreVAF < 25 and
+ heterozygous insertions with varScoreEAF < 50:
+
+
+ '/path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins:varScoreEAF<50'
+
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta flag.
+
+ --reference arg
+ The reference crr file.
+
+ --input arg
+ The input var file or masterVar file (typically with filters specified).
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r 000000000000 -r ef23f9cd599b testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/varfilter_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/testing_cgatools-982e19c29ec0/cgatools/tools/cgatools_1.6/varfilter_wrapper.pl Thu Sep 27 13:37:59 2012 -0400
@@ -0,0 +1,57 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Getopt::Long;
+$| = 1; # set autoflush to screen
+
+# This is a wrapper for the cgatools varfilter function to run cgatools varfilter in Galaxy.
+# The wrapper generates the filter(s) in the correct format to be used with the input file.
+# written 6-1-2012 by bcrain@completegenomics.com
+#
+# The n-th values of the repeatable options --zygosity, --vartype, --varscorevaf,
+# --varscoreeaf and --varquality together describe the n-th filter. Each value
+# becomes one call selector unless it is 'NA' (0 for the two scores); selectors
+# are joined with ':', filters with ',', and the resulting specification is
+# appended to the input file name after a '#'.
+
+#print join("\n", @ARGV), "\n";
+my %opt;
+GetOptions(\%opt, "reference=s", "input=s", "output=s", "zygosity=s@", "vartype=s@", "varscorevaf:i@", "varscoreeaf:i@", "varquality=s@")
+    or die "invalid command line options.\n";
+defined $opt{$_} or die "missing required option --$_.\n" foreach qw(reference input output);
+
+# [option name, selector prefix, true for a numeric score] in the order the
+# selectors appear in a filter; the zygosity value is used as the selector itself
+my @selectors = (['zygosity', '', 0], ['vartype', 'varType=', 0], ['varscorevaf', 'varScoreVAF<', 1],
+    ['varscoreeaf', 'varScoreEAF<', 1], ['varquality', 'varQuality!=', 0]);
+
+my @filters;
+foreach my $i (0 .. $#{$opt{zygosity} || []})
+{
+    my @parts;
+    foreach my $selector (@selectors)
+    {
+        my ($name, $prefix, $is_score) = @$selector;
+        my $value = ($opt{$name} || [])->[$i];
+        # a value that is missing, empty, NA or (for scores) 0 adds no selector
+        next unless defined $value and length $value;
+        next if $is_score ? $value == 0 : $value eq 'NA';
+        push @parts, $prefix . $value;
+    }
+    @parts and push @filters, join(':', @parts);
+}
+my $append = @filters ? '#' . join(',', @filters) : '';
+
+# report the command (the quotes around the input are for display only) and the cgatools version
+my $version = `cgatools | head -1`;
+print "cgatools varfilter\n--beta\n--reference $opt{reference}\n--output $opt{output}\n--input '$opt{input}${append}'\n";
+print "$version\n";
+
+# The command is run as an argument list, without a shell: the filter
+# characters (#, <, !) and the file names need no quoting, and a failing
+# cgatools run makes the wrapper fail instead of going unnoticed.
+my @command = ('cgatools', 'varfilter', '--beta', '--reference', $opt{reference},
+    '--output', $opt{output}, '--input', $opt{input} . $append);
+my $status = system(@command);
+$status == -1 and die "cgatools could not be started: $!\n";
+$status == 0 or die "cgatools varfilter failed (wait status $status).\n";
\ No newline at end of file