Mercurial > repos > bcrain-completegenomics > testing2
changeset 10:133c2a76561b draft
Uploaded
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/README.txt Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,3 @@ +Provides galaxy tools for Complete Genomics' cgatools package - http://www.completegenomics.com + +Create new section 'Complete Genomics' if it doesn't already exist in your tool panel. \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/datatypes_conf.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<datatypes> + <datatype_files> + <datatype_file name="completegenomics.py"/> + </datatype_files> + + <registration> + <!-- + Add the following section to datatypes_conf.xml file in your Galaxy distribution if you are adding Complete Genomics tools manually to your Galaxy instance + --> + <!-- Start Complete Genomics Datatypes --> + <datatype extension="cg_var" type="galaxy.datatypes.tabular:CG_Var" display_in_upload="true" /> + <datatype extension="cg_mastervar" type="galaxy.datatypes.tabular:CG_MasterVar" display_in_upload="true" /> + <datatype extension="cg_gene" type="galaxy.datatypes.tabular:CG_Gene" display_in_upload="true" /> + <!-- End Complete Genomics Datatypes --> + </registration> + <sniffers> + </sniffers> +</datatypes>
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/lib/galaxy/datatypes/completegenomics.py Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,71 @@
"""
Complete Genomics datatypes
Birgit Crain - Complete Genomics, Inc

Defines three Tabular subclasses (cg_var, cg_mastervar, cg_gene). Each one
only attaches a fixed list of column labels that is passed to
Tabular.make_html_table when the dataset peek is rendered.
"""

import pkg_resources
pkg_resources.require( "bx-python" )

import logging
from galaxy.datatypes import data
from galaxy import util
from cgi import escape
from galaxy.datatypes import metadata
from galaxy.datatypes import tabular
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.tabular import Tabular
import galaxy_utils.sequence.vcf
from galaxy.datatypes.sniff import *

log = logging.getLogger(__name__)


class CG_Var( Tabular ):
    """Tabular datatype registered under the 'cg_var' extension."""
    file_ext = 'cg_var'

    def __init__(self, **kwd):
        """Initialize CG_Var datatype and set its peek column labels."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['locus', 'ploidy', 'allele', 'chromosome', 'begin', 'end',
                             'varType', 'reference', 'alleleSeq', 'varScoreVAF',
                             'varScoreEAF', 'varQuality', 'hapLink', 'xRef'
                             ]

    def display_peek( self, dataset ):
        """Returns formatted html of peek, labelled with self.column_names."""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )


class CG_MasterVar( Tabular ):
    """Tabular datatype registered under the 'cg_mastervar' extension."""
    file_ext = 'cg_mastervar'

    def __init__(self, **kwd):
        """Initialize CG_MasterVar datatype and set its peek column labels."""
        Tabular.__init__( self, **kwd )
        # 'allele2Gene' and 'pfam' are two separate labels. They were
        # previously fused into the single string 'allele2Gene pfam', which
        # shifted every following label one column to the left in the peek.
        # NOTE(review): 'referenceAlleleRead' may be a truncation of
        # 'referenceAlleleReadCount' - confirm against the masterVar format
        # specification before changing it.
        self.column_names = ['locus', 'ploidy', 'chromosome', 'begin', 'end', 'zygosity',
                             'varType', 'reference', 'allele1Seq', 'allele2Seq',
                             'allele1VarScoreVAF', 'allele2VarScoreVAF', 'allele1VarScoreEAF',
                             'allele2VarScoreEAF', 'allele1VarQuality', 'allele2VarQuality',
                             'allele1HapLink', 'allele2HapLink', 'allele1XRef', 'allele2XRef',
                             'evidenceIntervalId', 'allele1ReadCount', 'allele2ReadCount',
                             'referenceAlleleRead', 'totalReadCount', 'allele1Gene',
                             'allele2Gene', 'pfam', 'miRBaseId', 'repeatMasker', 'segDupOverlap',
                             'relativeCoverageDiploid', 'calledPloidy',
                             'relativeCoverageNondiploid', 'calledLevel'
                             ]

    def display_peek( self, dataset ):
        """Returns formatted html of peek, labelled with self.column_names."""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )


class CG_Gene( Tabular ):
    """Tabular datatype registered under the 'cg_gene' extension."""
    file_ext = 'cg_gene'

    def __init__(self, **kwd):
        """Initialize CG_Gene datatype and set its peek column labels."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['index', 'locus', 'allele', 'chromosome', 'begin', 'end',
                             'varType', 'reference', 'call', 'xRef', 'geneId',
                             'mrnaAcc', 'proteinAcc', 'symbol', 'orientation', 'component',
                             'componentIndex', 'hasCodingRegion', 'impact', 'nucleotidePos',
                             'proteinPos', 'annotationRefSequence', 'sampleSequence',
                             'genomeRefSequence', 'pfam'
                             ]

    def display_peek( self, dataset ):
        """Returns formatted html of peek, labelled with self.column_names."""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tool-data/cg_crr_files.loc.sample Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,11 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use .crr reference files. You will need to download or create +#the .crr reference files and then create a cg_crr_files.loc file +#similar to this one (store it in this directory) that points to +#the location of the files. The cg_crr_files.loc +#file has this format (white space characters are TAB characters): +# +#<value> <dbkey> <name> <path> +# +#hg19 hg19 hg19.crr /Users/bcrain/Documents/hg19.crr +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tool_config.xml.sample.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,20 @@ +<?xml version="1.0"?> +<toolbox> + <!-- + Add the following section to tool_conf.xml file in your Galaxy distribution if you are adding Complete Genomics tools manually to your Galaxy instance + --> + <section name="Complete Genomics" id="cgi"> + <label text="cgatools" id="cgi_cgatools" /> + <tool file="cgatools/listvariants.xml" /> + <tool file="cgatools/testvariants.xml" /> + <tool file="cgatools/listtestvariants.xml" /> + <tool file="cgatools/join.xml" /> + <tool file="cgatools/calldiff.xml" /> + <tool file="cgatools/snpdiff.xml" /> + <tool file="cgatools/junctiondiff.xml" /> + <tool file="cgatools/varfilter.xml" /> + <label text="Perl scripts" id="cgi_perl" /> + <tool file="cgi_scripts/List_Unique_Variants.xml" /> + <tool file="cgi_scripts/Calculate_TestVariants_Variant_Frequencies.xml" /> + </section> +</toolbox> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tool_data_table_conf.xml.sample.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,12 @@ +<tables> + <!-- + Add the following section to tool_data_table_conf.xml file in your Galaxy distribution if you are adding Complete Genomics tools manually to your Galaxy instance + --> + <!-- Start Location of cgatools crr files --> + <table name="cg_crr_files" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/cg_crr_files.loc" /> + </table> + <!-- End Location of cgatools crr files --> +</tables> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/calldiff.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,343 @@ +<tool id="cga_calldiff" name="calldiff(beta)" version="0.0.1"> + + <description>compares two Complete Genomics variant files.</description> <!--adds description in toolbar--> + + <requirements> + <requirement type="binary">cgatools</requirement> + </requirements> + + <command> <!--run executable--> + cgatools calldiff --beta + --reference ${crr.fields.path} + --variantsA $data_sources.inputA + --variantsB $data_sources.inputB + $validation + $diploid + --locus-stats-column-count $column + --max-hypothesis-count $hypothesis + --output-prefix cg_ + --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'` + #if $somatic.report6 == "SomaticOutput" + --genome-rootA $somatic.genomeA + --genome-rootB $somatic.genomeB + --calibration-root $somatic.calibration + #end if + </command> + + <outputs> + <data format="tabular" name="output1" from_work_dir="cg_SuperlocusOutput.tsv" label="${tool.name} on ${on_string}: SuperlocusOutput"> + <filter>(report1 == 'SuperlocusOutput')</filter> + </data> + <data format="tabular" name="output2" from_work_dir="cg_SuperlocusStats.tsv" label="${tool.name} on ${on_string}: SuperlocusStats"> + <filter>(report2 == 'SuperlocusStats')</filter> + </data> + <data format="tabular" name="output3" from_work_dir="cg_LocusOutput.tsv" label="${tool.name} on ${on_string}: LocusOutput"> + <filter>(report3 == 'LocusOutput')</filter> + </data> + <data format="tabular" name="output4" from_work_dir="cg_LocusStats.tsv" label="${tool.name} on ${on_string}: LocusStats"> + <filter>(report4 == 'LocusStats')</filter> + </data> + <data format="tabular" name="output5a" from_work_dir="cg_VariantsA.tsv" label="${tool.name} on ${on_string}: VariantsA"> + <filter>(report5 == 'VariantOutput')</filter> + </data> + <data format="tabular" name="output5b" from_work_dir="cg_VariantsB.tsv" 
label="${tool.name} on ${on_string}: VariantsB"> + <filter>(report5 == 'VariantOutput')</filter> + </data> + <data format="tabular" name="output6" from_work_dir="cg_SomaticOutput.tsv" label="${tool.name} on ${on_string}: SomaticOutput"> + <filter>(somatic['report6'] == 'SomaticOutput')</filter> + </data> + </outputs> + + <inputs> + <!--form field to select crr file--> + <param name="crr" type="select" label="Genome build"> + <options from_data_table="cg_crr_files" /> + </param> + + <!--conditional to select variant file input--> + <conditional name="data_sources"> + <param name="data_source" type="select" label="Where are the input varfiles?"> + <option value="in" selected="true">imported into Galaxy</option> + <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> + </param> + <when value="in"> + <!--form field to select variant files--> + <param name="inputA" type="data" format="cg_var" label="Dataset A"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + <param name="inputB" type="data" format="cg_var" label="Dataset B"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + </when> + <when value="out"> + <!--form field to select crr file--> + <param name="inputA" type="text" label="Variant file A (path/file_name)" size="300" help="Variant files can be compressed (gz, bz2)."/> + <param name="inputB" type="text" label="Variant file B (path/file_name)" size="300" help="Variant files can be compressed (gz, bz2)."/> + </when> + </conditional> + + <param name="diploid" type="select" label="Use diploid variant model" help="Uses varScoreEAF instead of varScoreVAF in 
somatic score computations. Also, uses diploid variant model instead of variable allele mixture model."> + <option value="">no</option> + <option value="--diploid">yes</option> + </param> + + <param name="column" type="integer" label="Number of columns for locus compare classification in the locus stats file (default 15)" value="15"/> + + <param name="hypothesis" type="integer" label="Maximum number of possible phasings to consider for a superlocus (default 32)" value="32"/> + + <param name="validation" type="select" label="Reference cover validation" help="Turns on/off validation that all bases of a chromosome are covered by calls of the variant file."> + <option value="">on</option> + <option value="--no-reference-cover-validation">off</option> + </param> + + <param name="report1" type="select" label="Report SuperlocusOutput"> + <option value="">no</option> + <option value="SuperlocusOutput">yes</option> + </param> + <param name="report2" type="select" label="Report SuperlocusStats"> + <option value="">no</option> + <option value="SuperlocusStats">yes</option> + </param> + <param name="report3" type="select" label="Report LocusOutput"> + <option value="">no</option> + <option value="LocusOutput">yes</option> + </param> + <param name="report4" type="select" label="Report LocusStats"> + <option value="">no</option> + <option value="LocusStats">yes</option> + </param> + <param name="report5" type="select" label="Report VariantOutput" help="Both variant files annotated by comparison results. If the somatic output report is requested, file A is also annotated with the same score ranks as produced in that report."> + <option value="">no</option> + <option value="VariantOutput">yes</option> + </param> + + <conditional name="somatic"> + <param name="report6" type="select" label="Report SomaticOutput" help="This report can only be generated on local Galaxy instances. 
Report for the list of simple variations that are present only in file 'A', annotated with the score that indicates the probability of the variation being truly somatic. Note: generating this report slows calldiff by 10x-20x."> + <option value="">no</option> + <option value="SomaticOutput">yes</option> + </param> + <when value="SomaticOutput"> + <param name="genomeA" type="text" size="300" label="Directory for genome A (path/dir)" help="The 'A' genome directory, for example /data/GS00118-DNA_A01; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories."/> + <param name="genomeB" type="text" size="300" label="Directory for genome B (path/dir)" help="The 'B' genome directory"/> + <param name="calibration" type="text" size="300" label="Directory calibration data (path/dir)" help="The directory containing calibration data. For example, there should exist a file calibration-root/0.0.0/metrics.tsv. Calibration data can be downloaded from ftp://ftp.completegenomics.com/ScoreCalibrationFiles/var-calibration-v1.tgz"/> + </when> + </conditional> + + </inputs> + + <help> + +**What it does** + +This tool compares two Complete Genomics variant files. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + calldiff - Compares two Complete Genomics variant files. + + DESCRIPTION + Compares two Complete Genomics variant files. Divides the genome up into + superloci of nearby variants, then compares the superloci. Also refines the + comparison to determine per-call or per-locus comparison results. + + Comparison results are usually described by a semi-colon separated string, + one per allele. Each allele's comparison result is one of the following + classifications: + + ref-identical The alleles of the two variant files are identical, and + they are consistent with the reference. + alt-identical The alleles of the two variant files are identical, and + they are inconsistent with the reference. 
+ ref-consistent The alleles of the two variant files are consistent, + and they are consistent with the reference. + alt-consistent The alleles of the two variant files are consistent, + and they are inconsistent with the reference. + onlyA The alleles of the two variant files are inconsistent, + and only file A is inconsistent with the reference. + onlyB The alleles of the two variant files are inconsistent, + and only file B is inconsistent with the reference. + mismatch The alleles of the two variant files are inconsistent, + and they are both inconsistent with the reference. + phase-mismatch The two variant files would be consistent if the + hapLink field had been empty, but they are + inconsistent. + ploidy-mismatch The superlocus did not have uniform ploidy. + + In some contexts, this classification is rolled up into a simplified + classification, which is one of "identical", "consistent", "onlyA", + "onlyB", or "mismatch". + + A good place to start looking at the results is the superlocus-output file. + It has columns defined as follows: + + SuperlocusId An identifier given to the superlocus. + Chromosome The name of the chromosome. + Begin The 0-based offset of the start of the superlocus. + End The 0-based offset of the base one past the end of the + superlocus. + Classification The match classification of the superlocus. + Reference The reference sequence. + AllelesA A semicolon-separated list of the alleles (one per + haplotype) for variant file A, for the phasing with the + best comparison result. + AllelesB A semicolon-separated list of the alleles (one per + haplotype) for variant file B, for the phasing with the + best comparison result. + + The locus-output file contains, for each locus in file A and file B that is + not consistent with the reference, an annotated set of calls for the locus. + The calls are annotated with the following columns: + + SuperlocusId The id of the superlocus containing the locus. + File The variant file (A or B). 
+ LocusClassification The locus classification is determined by the + varType column of the call that is inconsistent + with the reference, concatenated with a + modifier that describes whether the locus is + heterozygous, homozygous, or contains no-calls. + If there is no one variant in the locus (i.e., + it is heterozygous alt-alt), the locus + classification begins with "other". + LocusDiffClassification The match classification for the locus. This is + defined to be the best of the comparison of the + locus to the same region in the other file, or + the comparison of the superlocus. + + The somatic output file contains a list of putative somatic variations of + genome A. The output includes only those loci that can be classified as + snp, del, ins or sub in file A, and are called reference in the file B. + Every locus is annotated with the following columns: + + VarCvgA The totalReadCount from file A for this locus + (computed on the fly if file A is not a + masterVar file). + VarScoreA The varScoreVAF from file A, or varScoreEAF if + the "--diploid" option is used. + RefCvgB The maximum of the uniqueSequenceCoverage + values for the locus in genome B. + RefScoreB Minimum of the reference scores of the locus in + genome B. + SomaticCategory The category used for determining the + calibrated scores and the SomaticRank. + VarScoreACalib The calibrated variant score of file A, under + the model selected by using or not using the + "--diploid" option, and corrected for the count + of heterozygous variants observed in this + genome. See user guide for more information. + VarScoreBCalib The calibrated reference score of file B, under + the model selected by using or not using the + "--diploid" option, and corrected for the count + of heterozygous variants observed in this + genome. See user guide for more information. + SomaticRank The estimated rank of this somatic mutation, + amongst all true somatic mutations within this + SomaticCategory. 
The value is a number between + 0 and 1; a value of 0.012 means, for example, + that an estimated 1.2% of the true somatic + mutations in this somaticCategory have a + somaticScore less than the somaticScore for + this mutation. See user guide for more + information. + SomaticScore An integer that provides a total order on + quality for all somatic mutations. It is equal + to -10*log10( P(false)/P(true) ), under the + assumption that this genome has a rate of + somatic mutation equal to 1/Mb for + SomaticCategory snp, 1/10Mb for SomaticCategory + ins, 1/10Mb for SomaticCategory del, and 1/20Mb + for SomaticCategory sub. The computation is + based on the assumptions described in the user + guide, and is affected by choice of variant + model selected by using or not using the + "--diploid" option. + SomaticQuality Equal to VQHIGH for all somatic mutations where + SomaticScore >= -10. Otherwise, this column is + empty. + + OPTIONS + -h [ --help ] + Print this help message. + + --reference arg + The input crr file. + + --variantsA arg + The "A" input variant file. + + --variantsB arg + The "B" input variant file. + + --output-prefix arg + The path prefix for all output reports. + + --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats) + Comma-separated list of reports to generate. (Beware any reports whose + name begins with "Debug".) A report is one of: + SuperlocusOutput Report for superlocus classification. + SuperlocusStats Report for superlocus classification stats. + LocusOutput Report for locus classification. + LocusStats Report for locus stats. + VariantOutput Both variant files annotated by comparison + results. If the somatic output report is + requested, file A is also annotated with the + same score ranks as produced in that report. + SomaticOutput Report for the list of simple variations that + are present only in file "A", annotated with + the score that indicates the probability of + the variation being truly somatic. 
Requires + beta, genome-rootA, and genome-rootB options + to be provided as well. Note: generating this + report slows calldiff by 10x-20x. + DebugCallOutput Report for call classification. + DebugSuperlocusOutput Report for debug superlocus information. + DebugSomaticOutput Report for distribution estimates used for + somatic rescoring. Only produced if + SomaticOutput is also turned on. + + --diploid + Uses varScoreEAF instead of varScoreVAF in somatic score computations. + Also, uses diploid variant model instead of variable allele mixture + model. + + --locus-stats-column-count arg (=15) + The number of columns for locus compare classification in the locus + stats file. + + --max-hypothesis-count arg (=32) + The maximum number of possible phasings to consider for a superlocus. + + --no-reference-cover-validation + Turns off validation that all bases of a chromosome are covered by + calls of the variant file. + + --genome-rootA arg + The "A" genome directory, for example /data/GS00118-DNA_A01; this + directory is expected to contain ASM/REF and ASM/EVIDENCE + subdirectories. + + --genome-rootB arg + The "B" genome directory. + + --calibration-root arg + The directory containing calibration data. For example, there should + exist a file calibration-root/0.0.0/metrics.tsv. + + --beta + This flag enables the SomaticOutput report, which is beta + functionality. + + SUPPORTED FORMAT_VERSION + 0.3 or later + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/join.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,157 @@ +<tool id="cga_join" name="join(beta)" version="0.0.1"> + + <description>two tsv files based on equal fields or overlapping regions.</description> <!--adds description in toolbar--> + + <requirements> + <requirement type="binary">cgatools</requirement> + </requirements> + + <command> <!--run executable--> + cgatools join --beta + --input $input1 + --input $input2 + --output $output + --output-mode $outmode + $dump + --select $col + #for $m in $matched <!--get all matched columns--> + --match ${m.match} + #end for + </command> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + + <inputs> + <!--form field to select input file A--> + <param name="input1" type="data" format="tabular" label="Select first input file (A)"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="0" + message="cgatools is not currently available for this build."/> + </param> + + <!--form field to select input file B--> + <param name="input2" type="data" format="tabular" label="Select second input file (B)"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="0" + message="cgatools is not currently available for this build."/> + </param> + + <!--form field to specify columns to match--> + <repeat name="matched" title="Matched column"> + <param name="match" type="text" label="Enter column A:column B"/> + </repeat> + + <!--form field to specify columns to print--> + <param name="col" type="text" value="A.*,B.*" label="Specify columns to print from file A and B in format A.col_name1,A.col_name2,B.col_name1" /> + + <!--form field to select output-mode--> + <param name="outmode" type="select" label="Select output mode"> + <option value="full" 
selected="true">full (1 line for each match of records in A and B)</option> + <option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option> + <option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option> + </param> + + <!--form field to select which records of A to print--> + <param name="dump" type="select" label="Select records to print"> + <option value="--always-dump" selected="true">print all records of A even if not matched in B</option> + <option value="">print only records of A that are matched in B</option> + </param> + </inputs> + + <help> + +**What it does** + +This tool joins two tab-delimited files based on equal fields or overlapping regions. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + join - Joins two tab-delimited files based on equal fields or overlapping regions. + + DESCRIPTION + Joins two tab-delimited files based on equal fields or overlapping regions. + By default, an output record is produced for each match found between file + A and file B, but output format can be controlled by the --output-mode + parameter. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --input arg + File name to use as input (may be passed in as arguments at the end of + the command, or omitted for stdin). There must be exactly two input + files to join. If only one file is specified by name, file A is taken + to be stdin and file B is the named file. File B is read fully into + memory, and file A is streamed. File A's columns appear first in the + output. + + --output arg (=STDOUT) + The output file name (may be omitted for stdout). + + --match arg + A match specification, which is a column from A and a column from B + separated by a colon. 
+ + --overlap arg + + -m [ --output-mode ] arg (=full) + Output mode, one of the following: + full Print an output record for each match found between + file A and file B. + compact Print at most one record for each record of file A, + joining the file B values by a semicolon and + suppressing repeated B values and empty B values. + compact-pct Same as compact, but for each distinct B value, + annotate with the percentage of the A record that is + overlapped by B records with that B value. Percentage + is rounded up to nearest integer. + + --overlap-mode arg (=strict) + Overlap mode, one of the following: + strict Range A and B overlap if A.begin < B.end and + B.begin < A.end. + allow-abutting-points Range A and B overlap if they meet the strict + requirements, or if A.begin <= B.end and + B.begin <= A.end and either A or B has zero + length. + + --select arg (=A.*,B.*) + Set of fields to select for output. + + -a [ --always-dump ] + Dump every record of A, even if there are no matches with file B. + + --overlap-fraction-A arg (=0) + Minimum fraction of A region overlap for filtering output. + + --boundary-uncertainty-A arg (=0) + Boundary uncertainty for overlap filtering. Specifically, records + failing the following predicate are filtered away: overlap >= + overlap-fraction-A * ( A-range-length - boundary-uncertainty-A ) + + --overlap-fraction-B arg (=0) + Minimum fraction of B region overlap for filtering output. + + --boundary-uncertainty-B arg (=0) + Boundary uncertainty for overlap filtering. Specifically, records + failing the following predicate are filtered away: overlap >= + overlap-fraction-B * ( B-range-length - boundary-uncertainty-B ) + + SUPPORTED FORMAT_VERSION + Any + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/junctiondiff.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,88 @@ +<tool id="cga_junctiondiff" name="junctiondiff(beta)" version="0.0.1"> + + <description>reports difference between junction calls</description> <!--adds description in toolbar--> + + <requirements> + <requirement type="binary">cgatools</requirement> + </requirements> + + <command> <!--run executable--> + cgatools junctiondiff --beta -h + </command> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + + <inputs> + </inputs> + + <help> + +**What it does** + +This tool reports difference between junction calls of Complete Genomics junctions files + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + junctiondiff - Reports difference between junction calls of Complete Genomics junctions files. + + DESCRIPTION + junctiondiff takes two junction files A and B as input and produces the + following output: + - "diff-inputFileName" - the junctions from an input file A that are not + present in input file B. + - "report.txt" - a brief summary report (if --statout is used) + + Two junctions are considered equivalent if: + - they come from different files + - left and right positions of one junction are not more than "--distance" + bases apart from the corresponding positions of another junction + - the junction scores are equal or above the scoreThreshold + - they are on the same strands + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + -s [ --reference ] arg + Reference file. + + -a [ --junctionsA ] arg + input junction file A. + + -b [ --junctionsB ] arg + input junction file B. + + -A [ --scoreThresholdA ] arg (=10) + score threshold value for the input file A. + + -B [ --scoreThresholdB ] arg (=0) + score threshold value for the input file B. 
+ + -d [ --distance ] arg (=200) + Max distance between coordinates of potentially compatible junctions. + + -l [ --minlength ] arg (=500) + Minimum deletion junction length to be included into the difference + file. + + -o [ --output-prefix ] arg + The path prefix for all the output reports. + + -S [ --statout ] + (Debug) Report various input file statistics. Experimental feature. + + SUPPORTED FORMAT_VERSION + 1.5 or later + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/listtestvariants.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,239 @@ +<tool id="cga_listtestvariants" name="listvariants(beta)-testvariants(beta)" version="1.0.1"> +<!-- +This tool creates a GUI for cgatools listvariants and testvariants from Complete Genomics, Inc. +to be run consecutively with the same input files. +written 5-29-2012 by bcrain@completegenomics.com +--> + + <description></description> <!--adds description in toolbar--> + + <requirements> + <requirement type="binary">cgatools</requirement> + </requirements> + + <command> <!--run executable--> + cgatools listvariants + --beta + --reference ${crr.fields.path} + --output $output1 + #if $include_list.listing == "yes" <!--only added when yes--> + --variant-listing $include_list.list + #end if + $longvar + --variants + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles <!--get each var file--> + ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + ; + + cgatools testvariants + --beta + --reference ${crr.fields.path} + --output $output2 + --input $output1 + --variants + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles <!--get each var/mastervar file--> + ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + </command> + + <outputs> + <data format="tabular" name="output1" label="listvariants output"/> + <data format="tabular" name="output2" label="testvariants output"/> + </outputs> + + <inputs> + <!--form field to select crr file--> + <param name="crr" type="select" label="Genome build"> + <options from_data_table="cg_crr_files" /> + </param> + + <!--form field to select long variants option--> + <param name="longvar" type="select" label="List long variants?"> + <option value="" selected="true">no</option> + <option value="--list-long-variants">yes</option> + </param> + + <!--form 
fields to include existing variant list--> + <conditional name="include_list"> + <param name="listing" type="select" label="Include variant listing?"> + <option value="no" selected="true">no</option> + <option value="yes">yes</option> + </param> + <when value="yes"> + <param name="list" type="data" format="tabular" label="Variant listing"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + </when> + </conditional> + + <!--conditional to select input file type--> + <conditional name="file_types"> + <param name="file_type" type="select" label="Select the input file type"> + <option value="var" selected="true">var files</option> + <option value="mastervar">mastervar files</option> + </param> + + <when value="var"> + <!--conditional to select variant file input--> + <conditional name="data_sources"> + <param name="data_source" type="select" label="Where are the input var files?"> + <option value="in" selected="true">imported into Galaxy</option> + <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> + </param> + <when value="in"> + <!--form field to select variant files--> + <repeat name="varfiles" title="Variant files"> + <param name="input" type="data" format="cg_var" label="Dataset"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + </repeat> + </when> + <when value="out"> + <!--form field to select crr file--> + <param name="varlist" type="text" label="List of variant files (/path/file)" size="200" help="file with list of var files (/path/varfile), var files can be compressed (gz, bz2)."/> + </when> + </conditional> + </when> + + <when value="mastervar"> + 
<!--conditional to select variant file input--> + <conditional name="data_sources"> + <param name="data_source" type="select" label="Where are the input mastervar files?"> + <option value="in" selected="true">imported into Galaxy</option> + <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> + </param> + <when value="in"> + <!--form field to select variant files--> + <repeat name="varfiles" title="Variant files"> + <param name="input" type="data" format="cg_mastervar" label="Dataset"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + </repeat> + </when> + <when value="out"> + <!--form field to select crr file--> + <param name="varlist" type="text" label="List of mastervar files (/path/file)" size="200" help="file with list of mastervar files (/path/varfile), mastervar files can be compressed (gz, bz2)."/> + </when> + </conditional> + </when> + </conditional> + </inputs> + + <help> + +**What it does** + +This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + listvariants - Lists the variants present in a variant file. + + DESCRIPTION + Lists all called variants present in the specified variant files, in a + format suitable for processing by the testvariants command. The output is a + tab-delimited file consisting of the following columns: + + variantId Sequential id assigned to each variant. + chromosome The chromosome of the variant. + begin 0-based reference offset of the beginning of the variant. + end 0-based reference offset of the end of the variant. + varType The varType as extracted from the variant file. + reference The reference sequence. 
+ alleleSeq The variant allele sequence as extracted from the variant + file. + xRef The xRef as extrated from the variant file. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be positional args). + + --variant-listing arg + The output of another listvariants run, to be merged in to produce the + output of this run. + + --list-long-variants + In addition to listing short variants, list longer variants as well + (10's of bases) by concatenating nearby calls. + + SUPPORTED FORMAT_VERSION + 0.3 or later + + + + COMMAND NAME + testvariants - Tests variant files for presence of variants. + + DESCRIPTION + Tests variant files for presence of variants. The output is a tab-delimited + file consisting of the columns of the input variants file, plus a column + for each assembly results file that contains a character code for each + allele. The character codes have meaning as follows: + + 0 This allele of this genome is consistent with the reference at this + locus but inconsistent with the variant. + 1 This allele of this genome has the input variant at this locus. + N This allele of this genome has no-calls but is consistent with the + input variant. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --input arg (=STDIN) + The input variants to test for. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be passed in as arguments at the end of + the command). + + SUPPORTED FORMAT_VERSION + 0.3 or later + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/listvariants.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,177 @@ +<tool id="cga_listvariant" name="listvariants(beta)" version="0.0.1"> +<!-- +This tool creates a GUI for cgatools listvariants from Complete Genomics, Inc. +written 5-29-2012 by bcrain@completegenomics.com +--> + + <description>lists all called variants</description> <!--adds description in toolbar--> + + <requirements> + <requirement type="binary">cgatools</requirement> + </requirements> + + <command> <!--run executable--> + cgatools listvariants + --beta + --reference ${crr.fields.path} + --output $output + #if $include_list.listing == "yes" <!--only added when yes--> + --variant-listing $include_list.list + #end if + $longvar + --variants + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles <!--get each var/mastervar file--> + ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + </command> + + <inputs> + <!--form field to select crr file--> + <param name="crr" type="select" label="Genome build"> + <options from_data_table="cg_crr_files" /> + </param> + + <!--form field to select long variants option--> + <param name="longvar" type="select" label="List long variants?"> + <option value="" selected="true">no</option> + <option value="--list-long-variants">yes</option> + </param> + + <!--form fields to include existing variant list--> + <conditional name="include_list"> + <param name="listing" type="select" label="Include variant listing?"> + <option value="no" selected="true">no</option> + <option value="yes">yes</option> + </param> + <when value="yes"> + <param name="list" type="data" format="tabular" label="Variant listing"/> + </when> + </conditional> + + <!--conditional to select input file type--> + <conditional name="file_types"> + <param name="file_type" type="select" label="Select the input file type"> + <option value="var" selected="true">var 
files</option> + <option value="mastervar">mastervar files</option> + </param> + + <when value="var"> + <!--conditional to select variant file input--> + <conditional name="data_sources"> + <param name="data_source" type="select" label="Where are the input var files?"> + <option value="in" selected="true">imported into Galaxy</option> + <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> + </param> + <when value="in"> + <!--form field to select variant files--> + <repeat name="varfiles" title="Variant files"> + <param name="input" type="data" format="cg_var" label="Dataset"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + <!--<validator type="expression" message="Dataset does not match selected build.">$dbkey == $crr.fields.dbkey</validator>--> + </param> + </repeat> + </when> + <when value="out"> + <!--form field to select crr file--> + <param name="varlist" type="text" label="List of variant files (/path/file)" size="200" help="file with list of var files (/path/varfile), var files can be compressed (gz, bz2)."/> + </when> + </conditional> + </when> + + <when value="mastervar"> + <!--conditional to select variant file input--> + <conditional name="data_sources"> + <param name="data_source" type="select" label="Where are the input mastervar files?"> + <option value="in" selected="true">imported into Galaxy</option> + <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> + </param> + <when value="in"> + <!--form field to select variant files--> + <repeat name="varfiles" title="Variant files"> + <param name="input" type="data" format="cg_mastervar" label="Dataset"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" 
metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + </repeat> + </when> + <when value="out"> + <!--form field to select crr file--> + <param name="varlist" type="text" label="List of mastervar files (/path/file)" size="200" help="file with list of mastervar files (/path/varfile), mastervar files can be compressed (gz, bz2)."/> + </when> + </conditional> + </when> + </conditional> + </inputs> + + <outputs> + <data format="tabular" name="output"/> + </outputs> + + <help> + +**What it does** + +This tool uses the cgatools listvariants to list all called variants present in the var or mastervar files. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + listvariants - Lists the variants present in a variant file. + + DESCRIPTION + Lists all called variants present in the specified variant files, in a + format suitable for processing by the testvariants command. The output is a + tab-delimited file consisting of the following columns: + + variantId Sequential id assigned to each variant. + chromosome The chromosome of the variant. + begin 0-based reference offset of the beginning of the variant. + end 0-based reference offset of the end of the variant. + varType The varType as extracted from the variant file. + reference The reference sequence. + alleleSeq The variant allele sequence as extracted from the variant + file. + xRef The xRef as extrated from the variant file. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be positional args). + + --variant-listing arg + The output of another listvariants run, to be merged in to produce the + output of this run. 
+ + --list-long-variants + In addition to listing short variants, list longer variants as well + (10's of bases) by concatenating nearby calls. + + SUPPORTED FORMAT_VERSION + 0.3 or later + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/snpdiff.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,116 @@ +<tool id="cga_snpdiff" name="snpdiff" version="0.0.1"> + + <description>compares snp calls to a Complete Genomics variant file.</description> <!--adds description in toolbar--> + + <requirements> + <requirement type="binary">cgatools</requirement> + </requirements> + + <command> <!--run executable--> + cgatools snpdiff --beta -h + </command> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + + <inputs> + </inputs> + + <help> + +**What it does** + +This tool compares snp calls to a Complete Genomics variant file. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + snpdiff - Compares snp calls to a Complete Genomics variant file. + + DESCRIPTION + Compares the snp calls in the "genotypes" file to the calls in a Complete + Genomics variant file. The genotypes file is a tab-delimited file with at + least the following columns (additional columns may be given): + + Chromosome (Required) The name of the chromosome. + Offset0Based (Required) The 0-based offset in the chromosome. + GenotypesStrand (Optional) The strand of the calls in the Genotypes + column (+ or -, defaults to +). + Genotypes (Optional) The calls, one per allele. The following + calls are recognized: + A,C,G,T A called base. + N A no-call. + - A deleted base. + . A non-snp variation. + + The output is a tab-delimited file consisting of the columns of the + original genotypes file, plus the following additional columns: + + Reference The reference base at the given position. + VariantFile The calls made by the variant file, one per allele. + The character codes are the same as is described for + the Genotypes column. + DiscordantAlleles (Only if Genotypes is present) The number of + Genotypes alleles that are discordant with calls in + the VariantFile. 
If the VariantFile is described as + haploid at the given position but the Genotypes is + diploid, then each genotype allele is compared + against the haploid call of the VariantFile. + NoCallAlleles (Only if Genotypes is present) The number of + Genotypes alleles that were no-called by the + VariantFile. If the VariantFile is described as + haploid at the given position but the Genotypes is + diploid, then a VariantFile no-call is counted twice. + + The verbose output is a tab-delimited file consisting of the columns of the + original genotypes file, plus the following additional columns: + + Reference The reference base at the given position. + VariantFile The call made by the variant file for one allele (there is + a line in this file for each allele). The character codes + are the same as is described for the Genotypes column. + [CALLS] The rest of the columns are pasted in from the VariantFile, + describing the variant file line used to make the call. + + The stats output is a comma-separated file with several tables describing + the results of the snp comparison, for each diploid genotype. The tables + all describe the comparison result (column headers) versus the genotype + classification (row labels) in different ways. The "Locus classification" + tables have the most detailed match classifications, while the "Locus + concordance" tables roll these match classifications up into "discordance" + and "no-call". A locus is considered discordant if it is discordant for + either allele. A locus is considered no-call if it is concordant for both + alleles but has a no-call on either allele. The "Allele concordance" + describes the comparison result on a per-allele basis. + + OPTIONS + -h [ --help ] + Print this help message. + + --reference arg + The input crr file. + + --variants arg + The input variant file. + + --genotypes arg + The input genotypes file. + + --output-prefix arg + The path prefix for all output reports. 
+ + --reports arg (=Output,Verbose,Stats) + Comma-separated list of reports to generate. A report is one of: + Output The output genotypes file. + Verbose The verbose output file. + Stats The stats output file. + + SUPPORTED FORMAT_VERSION + 0.3 or later + </help> +</tool>
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/testing.pl Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,10 @@
#!/usr/bin/perl
use strict;
use warnings;

# Smoke-test script.
#
# Usage: testing.pl OUTPUT_FILE
#
# Echoes its own invocation, prints a fixed two-row table to STDOUT, writes a
# fixed two-row table to OUTPUT_FILE, and writes a second fixed table to a
# file named "somefile" in the current working directory.

print "$0 @ARGV\n";

# The first argument names the primary output file; without it there is
# nothing sensible to open, so fail loudly instead of silently writing nowhere.
my $out_path = $ARGV[0];
defined $out_path && length $out_path
    or die "usage: $0 OUTPUT_FILE\n";

# Three-argument open with a lexical handle: the path is never interpreted as
# an open mode, and a failure is reported instead of being ignored.
open my $out, '>', $out_path
    or die "$0: cannot open '$out_path' for writing: $!\n";

print "test1 ok\ttest1 ok\ntest1 ok\ttest1 ok\n";
print {$out} "test ok\ttest ok\ntest ok\ttest ok\n";

# Buffered write errors only surface at close, so check it.
close $out
    or die "$0: cannot close '$out_path': $!\n";

# Secondary output, written relative to the current working directory.
my $extra_path = 'somefile';
open my $extra, '>', $extra_path
    or die "$0: cannot open '$extra_path' for writing: $!\n";
print {$extra} "test2 ok\ttest2 ok\ntest2 ok\ttest2 ok\n";
close $extra
    or die "$0: cannot close '$extra_path': $!\n";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/testvariants.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,157 @@ +<tool id="cga_testvariants" name="testvariants(beta)" version="0.0.1"> +<!-- +This tool creates a GUI for cgatools testvariants from Complete Genomics, Inc. +written 5-29-2012 by bcrain@completegenomics.com +--> + + <description>test for the presence of variants</description> <!--adds description in toolbar--> + + <requirements> + <requirement type="binary">cgatools</requirement> + </requirements> + + <command> <!--run executable--> + cgatools testvariants + --beta + --reference ${crr.fields.path} + --output $output + --input $listing + --variants + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles <!--get each var/mastervar file--> + ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + </command> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + + <inputs> + <!--form field to select crr file--> + <param name="crr" type="select" label="Genome build"> + <options from_data_table="cg_crr_files" /> + </param> + + <!--form fields to select variant list--> + <param name="listing" type="data" format="tabular" label="Select variant list"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + + <!--conditional to select input file type--> + <conditional name="file_types"> + <param name="file_type" type="select" label="Select the input file type"> + <option value="var" selected="true">var files</option> + <option value="mastervar">mastervar files</option> + </param> + + <when value="var"> + <!--conditional to select variant file input--> + <conditional name="data_sources"> + <param name="data_source" type="select" label="Where are the input var files?"> + 
<option value="in" selected="true">imported into Galaxy</option> + <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> + </param> + <when value="in"> + <!--form field to select variant files--> + <repeat name="varfiles" title="Variant files"> + <param name="input" type="data" format="cg_var" label="Dataset"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + </repeat> + </when> + <when value="out"> + <!--form field to select crr file--> + <param name="varlist" type="text" label="List of variant files (/path/file)" size="200" help="file with list of var files (/path/varfile), var files can be compressed (gz, bz2)."/> + </when> + </conditional> + </when> + + <when value="mastervar"> + <!--conditional to select variant file input--> + <conditional name="data_sources"> + <param name="data_source" type="select" label="Where are the input mastervar files?"> + <option value="in" selected="true">imported into Galaxy</option> + <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> + </param> + <when value="in"> + <!--form field to select variant files--> + <repeat name="varfiles" title="Variant files"> + <param name="input" type="data" format="cg_mastervar" label="Dataset"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + </repeat> + </when> + <when value="out"> + <!--form field to select crr file--> + <param name="varlist" type="text" label="List of mastervar files (/path/file)" size="200" help="file with list of mastervar files (/path/varfile), mastervar files can be compressed (gz, bz2)."/> + </when> + </conditional> + 
</when> + </conditional> + </inputs> + + <help> + +**What it does** + +This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + testvariants - Tests variant files for presence of variants. + + DESCRIPTION + Tests variant files for presence of variants. The output is a tab-delimited + file consisting of the columns of the input variants file, plus a column + for each assembly results file that contains a character code for each + allele. The character codes have meaning as follows: + + 0 This allele of this genome is consistent with the reference at this + locus but inconsistent with the variant. + 1 This allele of this genome has the input variant at this locus. + N This allele of this genome has no-calls but is consistent with the + input variant. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --input arg (=STDIN) + The input variants to test for. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be passed in as arguments at the end of + the command). + + SUPPORTED FORMAT_VERSION + 0.3 or later + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/varfilter.xml Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,184 @@ +<tool id="cga_varfilter" name="varfilter(beta)" version="0.0.1"> +<!-- +This tool creates a GUI for cgatools varfilter from Complete Genomics, Inc. +The function is called via a Perl script varfilter_wrapper.pl, designed to generate the correctly formatted filters to append to the input file on the command line. +written 6-1-2012 by bcrain@completegenomics.com +--> + + <description>copies input file, applying filters.</description> <!--adds description in toolbar--> + + <requirements> + <requirement type="binary">cgatools</requirement> + </requirements> + + <command interpreter="perl"> + varfilter_wrapper.pl + --reference $crr.fields.path + --output $output + --input $file_types.data_sources.input + #for $f in $filters + --zygosity $f.zygosity + --vartype $f.vartype + --varscorevaf x$f.varscorevaf + --varscoreeaf x$f.varscoreeaf + --varquality $f.varquality + #end for + </command> + + <outputs> + <data format="cg_var" name="output" /> + </outputs> + + <inputs> + <!--form field to select crr file--> + <param name="crr" type="select" label="Genome build"> + <options from_data_table="cg_crr_files" /> + </param> + + <!--conditional to select input file type--> + <conditional name="file_types"> + <param name="file_type" type="select" label="Select the input file type"> + <option value="var" selected="true">var files</option> + <option value="mastervar">mastervar files</option> + </param> + + <when value="var"> + <!--conditional to select variant file input--> + <conditional name="data_sources"> + <param name="data_source" type="select" label="Where is the input var file?"> + <option value="in" selected="true">imported into Galaxy</option> + <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> + </param> + <when value="in"> + <!--form field to select variant files--> + <param name="input" 
type="data" format="cg_var" label="Var file"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + </when> + <when value="out"> + <!--form field to select crr file--> + <param name="input" type="text" label="Var file (/path/file)" size="200" help="var file can be compressed (gz, bz2)."/> + </when> + </conditional> + </when> + + <when value="mastervar"> + <!--conditional to select variant file input--> + <conditional name="data_sources"> + <param name="data_source" type="select" label="Where is the input mastervar file?"> + <option value="in" selected="true">imported into Galaxy</option> + <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> + </param> + <when value="in"> + <!--form field to select variant files--> + <param name="input" type="data" format="cg_mastervar" label="Mastervar file"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" + metadata_name="dbkey" metadata_column="1" + message="cgatools is not currently available for this build."/> + </param> + </when> + <when value="out"> + <!--form field to select crr file--> + <param name="input" type="text" label="Mastervar file (/path/file)" size="200" help="mastervar file can be compressed (gz, bz2)."/> + </when> + </conditional> + </when> + </conditional> + + <!-- formfields to add filters --> + <repeat name="filters" title="Filter"> + <param name="zygosity" type="select" label="Filter out call (set to no-call) IF locus IS"> + <option value="NA">- all loci -</option> + <option value="hom">homozygous</option> + <option value="het">heterozygous</option> + </param> + + <param name="vartype" type="select" label="AND varType IS"> + <option value="NA">- any varType -</option> + <option value="snp">snp</option> + <option 
value="ins">ins</option> + <option value="del">del</option> + <option value="sub">sub</option> + <option value="ref">ref</option> + </param> + + <param name="varscorevaf" type="text" label="AND varScoreVAF IS LESS THAN"/> + <param name="varscoreeaf" type="text" label="AND varScoreEAF IS LESS THAN"/> + + <param name="varquality" type="select" label="AND varQuality IS NOT"> + <option value="NA"> </option> + <option value="VQHigh">VQHigh</option> + <option value="VQLOW">VQLOW</option> + </param> + </repeat> + </inputs> + + <help> + +**What it does** + +This tool copies input var file or masterVar file to output, applying specified filters. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + varfilter - Copies input var file or masterVar file to output, applying + specified filters. + + DESCRIPTION + Copies input var file or masterVar file to output, applying specified + filters (which are available to all cgatools commands that read a var file + or masterVar file as input). Filters are specified by appending the filter + specification to the var file name on the command line. For example: + + /path/to/var.tsv.bz2#varQuality!=VQHIGH + + The preceding example filters out any calls marked as VQLOW. The filter + specification follows the "#" sign, and consists of a list of filters to + apply, separated by a comma. Each filter is a colon-separated list of call + selectors. Any scored call that passes all the colon-separated call + selectors for one or more of the comma-separated filters is turned into a + no-call. The following call selectors are available: + + hom Selects only calls in homozygous loci. + het Selects any scored call not selected by the hom selector. + varType=XX Selects calls whose varType is XX. + varScoreVAF<XX Selects calls whose varScoreVAF<XX. + varScoreEAF<XX Selects calls whose varScoreEAF<XX. + varQuality!=XX Selects calls whose varQuality is not XX. 
+ + Here is an example that filters homozygous SNPs with varScoreVAF < 25 and + heterozygous insertions with varScoreEAF < 50: + + + '/path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins:varScoreEAF<50' + + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta flag. + + --reference arg + The reference crr file. + + --input arg + The input var file or masterVar file (typically with filters specified). + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + SUPPORTED FORMAT_VERSION + 0.3 or later + </help> +</tool>
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools/tools/cgatools/varfilter_wrapper.pl Thu Jun 07 17:32:39 2012 -0400 @@ -0,0 +1,56 @@
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use List::Util qw(max);

$| = 1;    # autoflush STDOUT so the echoed command precedes any cgatools output

# Wrapper that runs "cgatools varfilter" from Galaxy.
# written 6-1-2012 by bcrain@completegenomics.com
#
# cgatools expects filters to be appended to the input file name, e.g.
#   /path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins
# This wrapper builds that suffix from repeated per-filter options.
#
# Options:
#   --reference FILE   reference crr file (required)
#   --input FILE       var or masterVar file (required)
#   --output FILE      output file (required)
#   --zygosity V       repeatable; "hom", "het", or "NA" for no selector
#   --vartype V        repeatable; a varType, or "NA" for no selector
#   --varscorevaf V    repeatable; threshold, optionally prefixed with "x";
#                      a bare "x" (or empty value) means no selector
#   --varscoreeaf V    repeatable; same convention as --varscorevaf
#   --varquality V     repeatable; a varQuality, or "NA" for no selector
#
# The i-th occurrence of each repeatable option describes the i-th filter.
# Exits non-zero when the options are invalid or when cgatools fails.

# Return the numeric threshold carried by a --varscorevaf/--varscoreeaf value,
# or nothing when no threshold was supplied. Dies on a non-numeric value.
sub score_threshold {
    my ($option, $raw) = @_;
    return if !defined $raw;

    my $score = $raw;
    $score =~ s/\Ax//;              # drop the "x" marker that keeps the argument non-empty
    $score =~ s/\A\s+|\s+\z//g;     # tolerate stray whitespace from the form field
    return if $score eq '';

    $score =~ /\A[+-]?\d+(?:\.\d+)?\z/
        or die "$0: --$option expects a number, got '$score'\n";
    return $score;
}

# Build one colon-separated filter from its call selectors. Undefined values
# and the placeholder "NA" contribute nothing. Returns '' when no selector is
# active.
sub build_filter {
    my ($zygosity, $vartype, $vaf, $eaf, $quality) = @_;

    my @selectors;
    push @selectors, $zygosity
        if defined $zygosity && $zygosity ne '' && $zygosity ne 'NA';
    push @selectors, 'varType=' . $vartype
        if defined $vartype && $vartype ne '' && $vartype ne 'NA';
    push @selectors, 'varScoreVAF<' . $vaf if defined $vaf;
    push @selectors, 'varScoreEAF<' . $eaf if defined $eaf;

    # The cgatools manual spells quality values in upper case (VQHIGH, VQLOW),
    # so normalise whatever spelling the caller sent (e.g. "VQHigh").
    push @selectors, 'varQuality!=' . uc $quality
        if defined $quality && $quality ne '' && $quality ne 'NA';

    return join ':', @selectors;
}

my ($reference, $input, $output);
my (@zygosity, @vartype, @varscorevaf, @varscoreeaf, @varquality);

GetOptions(
    'reference=s'    => \$reference,
    'input=s'        => \$input,
    'output=s'       => \$output,
    'zygosity=s@'    => \@zygosity,
    'vartype=s@'     => \@vartype,
    'varscorevaf=s@' => \@varscorevaf,
    'varscoreeaf=s@' => \@varscoreeaf,
    'varquality=s@'  => \@varquality,
) or die "$0: invalid command line options\n";

for my $required ([reference => $reference], [input => $input], [output => $output]) {
    my ($name, $value) = @{$required};
    defined $value && $value ne ''
        or die "$0: missing required option --$name\n";
}

# One filter per repeat block; tolerate option lists of unequal length by
# treating the missing entries as "no selector".
my $filter_count = max(
    0,
    scalar @zygosity,
    scalar @vartype,
    scalar @varscorevaf,
    scalar @varscoreeaf,
    scalar @varquality,
);

my @filters;
for my $i (0 .. $filter_count - 1) {
    my $vaf = score_threshold('varscorevaf', $varscorevaf[$i]);
    my $eaf = score_threshold('varscoreeaf', $varscoreeaf[$i]);
    my $filter = build_filter($zygosity[$i], $vartype[$i], $vaf, $eaf, $varquality[$i]);
    push @filters, $filter if $filter ne '';
}

# Filters are comma-separated and attached to the input path after a "#".
my $filtered_input = $input;
$filtered_input .= '#' . join(',', @filters) if @filters;

print "cgatools varfilter
--beta
--reference $reference
--output $output
--input '$filtered_input'\n";

# List-form system() bypasses the shell, so "#", "<", quotes and spaces in the
# path or filter values reach cgatools verbatim and cannot inject commands.
my @command = (
    'cgatools', 'varfilter',
    '--beta',
    '--reference', $reference,
    '--output',    $output,
    '--input',     $filtered_input,
);

my $status = system @command;
if ($status == -1) {
    die "$0: failed to execute cgatools: $!\n";
}
elsif ($status & 127) {
    die sprintf "%s: cgatools died with signal %d\n", $0, $status & 127;
}
elsif ($status != 0) {
    # Propagate the tool's own exit code so the caller sees the failure.
    my $exit_code = $status >> 8;
    print STDERR "$0: cgatools varfilter exited with status $exit_code\n";
    exit $exit_code;
}
