Mercurial > repos > jjohnson > gmap
changeset 3:488e9d642566 draft
GMAP wrappers v3.0.1 after linting and cleanup, still untested work-in-progress
author | peterjc |
---|---|
date | Wed, 28 Sep 2016 10:47:28 -0400 |
parents | f6ba0f12cca2 |
children | a88571642c6e |
files | gmap.xml gmap_build.xml gmap_v3.0.0_from_JJ.tar.gz gsnap.xml iit_store.xml lib/galaxy/datatypes/gmap.py snpindex.xml tool-data/datatypes_conf.xml |
diffstat | 8 files changed, 627 insertions(+), 557 deletions(-) [+] |
line wrap: on
line diff
--- a/gmap.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/gmap.xml Wed Sep 28 10:47:28 2016 -0400 @@ -1,9 +1,9 @@ -<tool id="gmap" name="GMAP" version="3.0.0"> +<tool id="gmap" name="GMAP" version="3.0.1"> <description>Genomic Mapping and Alignment Program for mRNA and EST sequences</description> <requirements> <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> - <version_string>gmap --version</version_string> + <version_command>gmap --version</version_command> <command> #import os,os.path gmap @@ -41,7 +41,7 @@ --protein_gen #elif $result.format == "sam": --format=$result.sam_paired_read - $result.no_sam_headers + $result.no_sam_headers $result.sam_use_0M $result.force_xs_dir $result.md_lowercase_snp @@ -127,7 +127,7 @@ ${i.added_input} #end for #if $split_output == True - 2> $gmap_stderr + 2> $gmap_stderr #else 2> $gmap_stderr > $output #end if @@ -194,7 +194,7 @@ </param> </when> <when value="gmapdb"> - <param name="gmapdb" type="data" format="gmapdb" metadata_name="dbkey" label="Select a gmapdb" + <param name="gmapdb" type="data" format="gmapdb" label="Select a gmapdb" help="A GMAP database built with GMAP Build"/> <param name="kmer" type="select" data_ref="gmapdb" label="kmer size" help="Defaults to highest available kmer size"> <options> @@ -208,12 +208,12 @@ </param> </when> <when value="history"> - <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" + <param name="ownFile" type="data" format="fasta" label="Select the reference genome" help="Fasta containing genomic DNA sequence"/> </when> </conditional> - + <!-- Computation options --> <conditional name="computation"> <param name="options" type="select" label="<HR>Computational Settings" help=""> @@ -223,56 +223,56 @@ <when value="default"/> <when value="advanced"> <param name="nosplicing" type="boolean" truevalue="--nosplicing" falsevalue="" checked="false" label="Turn off splicing" help="(useful for aligning genomic 
sequences onto a genome)"/> - <param name="min_intronlength" type="integer" value="" optional="true" label="Min length for one internal intron (default 9)." help="Below this size, a genomic gap will be considered a deletion rather than an intron." > + <param name="min_intronlength" type="integer" value="" optional="true" label="Min length for one internal intron (default 9)." help="Below this size, a genomic gap will be considered a deletion rather than an intron." > <validator type="in_range" message="min_intronlength must be positive" min="0" /> </param> - <param name="intronlength" type="integer" value="" optional="true" label="Max length for one intron (default 1000000)" > + <param name="intronlength" type="integer" value="" optional="true" label="Max length for one intron (default 1000000)" > <validator type="in_range" message="intronlength must be positive" min="0" /> </param> - <param name="localsplicedist" type="integer" value="" optional="true" label="Max length for known splice sites at ends of sequence (default 200000)" > + <param name="localsplicedist" type="integer" value="" optional="true" label="Max length for known splice sites at ends of sequence (default 200000)" > <validator type="in_range" message="localsplicedist must be positive" min="0" /> </param> - <param name="totallength" type="integer" value="" optional="true" label="Max total intron length (default 2400000)" > + <param name="totallength" type="integer" value="" optional="true" label="Max total intron length (default 2400000)" > <validator type="in_range" message="totallength must be positive" min="0" /> </param> - <param name="chimera_margin" type="integer" value="" optional="true" label="Amount of unaligned sequence that triggers search for a chimera" - help=" default is 40, To turn off, set to 0" > + <param name="chimera_margin" type="integer" value="" optional="true" label="Amount of unaligned sequence that triggers search for a chimera" + help=" default is 40, To turn off, set to 0" 
> <validator type="in_range" message="chimera_margin must be positive" min="0" /> </param> - <param name="direction" type="select" label="cDNA direction"> + <param name="direction" type="select" label="cDNA direction"> <option value="auto">auto</option> <option value="sense_force">sense_force</option> <option value="antisense_force">antisense_force</option> <option value="sense_filter">sense_filter</option> <option value="antisense_filter">antisense_filter</option> </param> - <param name="trimendexons" type="integer" value="" optional="true" label="Trim end exons with fewer than given number of matches (in nt, default 12)" > + <param name="trimendexons" type="integer" value="" optional="true" label="Trim end exons with fewer than given number of matches (in nt, default 12)" > <validator type="in_range" message="trimendexons must be positive" min="1" /> </param> <param name="find_shifted_canonical" type="boolean" truevalue="--find-shifted-canonical-species" falsevalue="" checked="false" label="find-shifted-canonical Use a more sensitive search for canonical splicing" help=""/> <param name="cross_species" type="boolean" truevalue="--cross-species" falsevalue="" checked="false" label="Cross-species alignment" help="For cross-species alignments, use a more sensitive search for canonical splicing"/> - - <param name="canonical" type="select" label="Reward for canonical and semi-canonical introns"> + + <param name="canonical" type="select" label="Reward for canonical and semi-canonical introns"> <option value="1">high reward (default)</option> <option value="0">low reward</option> <option value="2">low reward for high-identity sequences</option> </param> - <param name="allow_close_indels" type="select" label="Allow an insertion and deletion close to each other"> + <param name="allow_close_indels" type="select" label="Allow an insertion and deletion close to each other"> <option value="1" selected="true">yes (default)</option> <option value="0">no</option> <option 
value="2">only for high-quality alignments</option> </param> - <param name="microexon_spliceprob" type="float" value="" optional="true" label="Micro Exon splice probablility threshold" - help="Allow microexons only if one of the splice site probabilities is greater than this value (default 0.90)" > - <validator type="in_range" message="slice probability between 0.00 and 1.00" min="0" max="1"/> + <param name="microexon_spliceprob" type="float" value="" optional="true" label="Micro Exon splice probablility threshold" + help="Allow microexons only if one of the splice site probabilities is greater than this value (default 0.90)" > + <validator type="in_range" message="slice probability between 0.00 and 1.00" min="0" max="1"/> </param> - <param name="prunelevel" type="select" label="Pruning level"> + <param name="prunelevel" type="select" label="Pruning level"> <option value="0">no pruning (default)</option> <option value="1">poor sequences</option> <option value="2">repetitive sequences</option> <option value="3">poor and repetitive sequences</option> </param> - <!-- could do this as a config file + <!-- could do this as a config file <param name="chrsubsetfile" type="data" format="fasta" label="User-supplied chromosome subset file" /> <param name="chrsubset" type="text" label="Chromosome subset to search" /> --> @@ -293,25 +293,25 @@ <option value="--invertmode=1">Invert cDNA and print genomic (-) strand</option> <option value="--invertmode=2">Invert cDNA and print genomic (+) strand</option> </param> - <param name="introngap" type="integer" value="" optional="true" label="Nucleotides to show on each end of intron (default=3)"> + <param name="introngap" type="integer" value="" optional="true" label="Nucleotides to show on each end of intron (default=3)"> <validator type="in_range" message="introngap must be positive" min="0" /> </param> - <param name="wraplength" type="integer" value="" optional="true" label="Line Wrap length for alignment (default=50)"> + <param 
name="wraplength" type="integer" value="" optional="true" label="Line Wrap length for alignment (default=50)"> <validator type="in_range" message="wraplength must be positive" min="1" /> </param> <param name="npaths" type="integer" value="" optional="true" - label="Maximum number of paths to show. Ignored if negative. If 0, prints two paths if chimera detected, else one." > + label="Maximum number of paths to show. Ignored if negative. If 0, prints two paths if chimera detected, else one." > <validator type="in_range" message="npaths must be positive" min="0" /> </param> <param name="suboptimal_score" type="integer" value="" optional="true" label="Report only paths whose score is within this value of the best path" - help="By default the program prints all paths found." > + help="By default the program prints all paths found." > <validator type="in_range" message="suboptimal_score must be positive" min="0" /> </param> - <param name="chimera_overlap" type="integer" value="" optional="true" label="Overlap to show, if any, at chimera breakpoint (default 0)" > + <param name="chimera_overlap" type="integer" value="" optional="true" label="Overlap to show, if any, at chimera breakpoint (default 0)" > <validator type="in_range" message="chimera_overlap must be positive" min="0" /> </param> - <param name="tolerant" type="boolean" checked="false" truevalue="--tolerant=true" falsevalue="" + <param name="tolerant" type="boolean" checked="false" truevalue="--tolerant=true" falsevalue="" label="Translates cDNA with corrections for frameshifts"/> <param name="protein" type="select" label="Protein alignment" help=""> <option value="">default</option> @@ -383,9 +383,9 @@ <param name="read_group_library" type="text" value="" label="Value to put into read-group library (RG-LB) field"/> <param name="read_group_platform" type="text" value="" label="Value to put into read-group library platform (RG-PL) field"/> <param name="sam_use_0M" type="boolean" truevalue="--sam-use-0M" 
falsevalue="" checked="false" label="Insert 0M in CIGAR between adjacent insertions and deletions" help="Required by Picard, but can cause errors in other tools"/> - <param name="force_xs_dir" type="boolean" truevalue="--force-xs-dir" falsevalue="" checked="false" label="Force direction (disallow XS:A:?)" + <param name="force_xs_dir" type="boolean" truevalue="--force-xs-dir" falsevalue="" checked="false" label="Force direction (disallow XS:A:?)" help="For RNA-Seq alignments, disallows XS:A:? when the sense direction is unclear, and replaces this value arbitrarily with XS:A:+. May be useful for some programs, such as Cufflinks, that cannot handle XS:A:?. However, if you use this flag, the reported value of XS:A:+ in these cases will not be meaningful."/> - <param name="md_lowercase_snp" type="boolean" truevalue="--md-lowercase-snp" falsevalue="" checked="false" label="MD lowercase SNP" + <param name="md_lowercase_snp" type="boolean" truevalue="--md-lowercase-snp" falsevalue="" checked="false" label="MD lowercase SNP" help="In MD string, when known SNPs are given by the -v flag, prints difference nucleotides as lower-case when they, differ from reference but match a known alternate allele"/> </when> </conditional> <!-- name="result" --> @@ -393,7 +393,7 @@ <param name="split_output" type="boolean" truevalue="--split-output=gmap_out" falsevalue="" checked="false" label="Separate outputs for nomapping, uniq, mult, and chimera" help="(chimera only when chimera-margin is selected)"/> - <!-- + <!-- map=iitfile Map file. If argument is '?' (with the quotes), this lists available map files. 
mapexons Map each exon separately mapboth Report hits from both strands of genome @@ -401,7 +401,7 @@ print-comment Show comment line for each hit --> - <!-- + <!-- min-trimmed-coverage=FLOAT Do not print alignments with trimmed coverage less this value (default=0.0, which means no filtering) Note that chimeric alignments will be output regardless @@ -484,13 +484,13 @@ </data> </outputs> <tests> - </tests> + </tests> <help> **What it does** -GMAP_ (Genomic Mapping and Alignment Program) The functionality provided by gmap allows a user to: (1) map and align a single cDNA interactively against a large genome in about a second, without the startup time of several minutes typically needed by existing mapping programs; (2) switch arbitrarily among different genomes, without the need for a preloaded server dedicated to each genome; (3) run the program on computers with as little as 128 MB of RAM (random access memory); (4) perform high-throughput batch processing of cDNAs by using memory mapping and multithreading when appropriate memory and hardware are available; (5) generate accurate gene models, even in the presence of substantial polymorphisms and sequence errors; (6) locate splice sites accurately without the use of probabilistic splice site models, allowing generalized use of the program across species; (7) detect statistically significant microexons and incorporate them into the alignment; and (8) handle mapping and alignment tasks on genomes having alternate assemblies, linkage groups or strains. It is developed by Thomas D. Wu of Genentech, Inc. 
+GMAP_ (Genomic Mapping and Alignment Program) The functionality provided by gmap allows a user to: (1) map and align a single cDNA interactively against a large genome in about a second, without the startup time of several minutes typically needed by existing mapping programs; (2) switch arbitrarily among different genomes, without the need for a preloaded server dedicated to each genome; (3) run the program on computers with as little as 128 MB of RAM (random access memory); (4) perform high-throughput batch processing of cDNAs by using memory mapping and multithreading when appropriate memory and hardware are available; (5) generate accurate gene models, even in the presence of substantial polymorphisms and sequence errors; (6) locate splice sites accurately without the use of probabilistic splice site models, allowing generalized use of the program across species; (7) detect statistically significant microexons and incorporate them into the alignment; and (8) handle mapping and alignment tasks on genomes having alternate assemblies, linkage groups or strains. It is developed by Thomas D. Wu of Genentech, Inc. Publication_ citation: Thomas D. Wu, Colin K. Watanabe Bioinformatics 2005 21(9):1859-1875; doi:10.1093/bioinformatics/bti310 @@ -506,7 +506,9 @@ You will want to read the README_ .. _README: http://research-pub.gene.com/gmap/src/README - </help> + <citations> + <citation type="doi">10.1093/bioinformatics/bti310</citation> + </citations> </tool>
--- a/gmap_build.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/gmap_build.xml Wed Sep 28 10:47:28 2016 -0400 @@ -1,121 +1,10 @@ -<tool id="gmap_build" name="GMAP Build" version="3.0.0"> +<tool id="gmap_build" name="GMAP Build" version="3.0.1"> <description>a database genome index for GMAP and GSNAP</description> <requirements> <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> - <version_string>gmap --version</version_string> + <version_command>gmap --version</version_command> <command interpreter="command"> /bin/bash $shscript > $output </command> - <inputs> - <!-- Name for this gmapdb --> - <param name="refname" type="text" label="Name you want to give this gmap database" help=""> - <validator type="empty_field" message="A database name is required."/> - </param> - <!-- Input data --> - <repeat name="inputs" title="Reference Sequence" min="1"> - <param name="input" type="data" format="fasta" label="reference sequence fasta" /> - </repeat> - - <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes" - help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome"> - </param> - - <param name="sort" type="select" label="Sort chromosomes" help=""> - <option value="none">none - use chromosomes as found in FASTA file(s)</option> - <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option> - <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option> - <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option> - </param> - - <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/> - <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/> - 
<conditional name="splicesite"> - <param name="splice_source" type="select" label="Add splice and intron info from" > - <option value="none"></option> - <option value="refGeneTable">refGenes table from UCSC table browser</option> - <option value="gtf">GTF</option> - <option value="gff3">GFF3</option> - </param> - <when value="none"/> - <when value="refGeneTable"> - <param name="refGenes" type="data" format="tabular" optional="true" label="UCSC refGenes table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz" /> - <param name="col_skip" type="integer" value="1" label="Columns to skip before the id/name column (default 1)" - help="Note that alignment tracks in UCSC sometimes have an extra column on the left."> - <validator type="in_range" message="The number of colmumns to skip must >= 0." min="0."/> - </param> - - </when> - <when value="gtf"> - <param name="gtfGenes" type="data" format="gtf" optional="true" label="Genes as GTF" help="" /> - </when> - <when value="gff3"> - <param name="gff3Genes" type="data" format="gff3" optional="true" label="Genes in GFF3 format" help="" /> - </when> - </conditional> - <conditional name="dbsnp"> - <param name="snp_source" type="select" label="Add SNP info from" > - <option value="none"></option> - <option value="snpTable">UCSC SNP Table</option> - <option value="snpFile">GMAP SNP File</option> - <option value="vcfFile">VCF File</option> - </param> - <when value="none"/> - <when value="snpTable"> - <param name="snps" type="data" format="tabular" optional="true" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" /> - <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" /> - <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help=""> - <option value="1" 
selected="true">1 (High)</option> - <option value="2">2 (Medium)</option> - <option value="3">3 (All)</option> - </param> - </when> - <when value="snpFile"> - <param name="snps" type="data" format="gmap_snps" optional="true" label="GMAP SNPs file" - help="Format (3 columns): - <br>>rs62211261 21:14379270 CG - <br>>rs62211262 21:14379281 CG - <br>Each line must start with a > character, then be followed by an - identifier (which may have duplicates). Then there should be the - chromosomal coordinate of the SNP. (Coordinates are all 1-based, so - the first character of a chromosome is number 1.) Finally, there - should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN) - <br>These alleles must correspond to the possible nucleotides on the plus strand of the genome. - If the one of these two letters does not match the allele in the reference - sequence, that SNP will be ignored in subsequent processing as a probable error. - The N stands for any other allele." /> - </when> - <when value="vcfFile"> - <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file" - help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz - The VCF file contains multiple versions of dbSNP, so if you want a - particular version, such as 135. The vcf_iit program tries to pick - a subset of SNPs that somewhat parallel - the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs - that have been validated (marked in the VCF file as "VLD") or have a - submitter link-out ("SLO"). Otherwise, it excludes SNPs that are - individual genotypes ("GNO"). If none of these conditions hold, then - the SNP is allowed. 
"/> - <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version" - help="The VCF file contains multiple versions of dbSNP, so if you want a particular version, such as 135"/> - </when> - </conditional> - - <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM"> - <option value="12">12 (64MB RAM)</option> - <option value="13">13 (256MB RAM)</option> - <option value="14">14 (1GB RAM)</option> - <option value="15" selected="true">15 (4GB RAM)</option> - </param> - - </inputs> - <stdio> - <exit_code range="1" level="fatal" description="Error running gmap_build" /> - </stdio> - <outputs> - <!-- - <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/> - --> - <data format="gmapdb" name="output" label="${tool.name} on ${on_string} gmapdb ${refname}" /> - </outputs> <configfiles> <configfile name="shscript"> #!/bin/bash @@ -145,8 +34,8 @@ #else: gmap_build -D $gmapdb -d $refname -s $sort $circular #for i in $inputs# ${i.input}#end for# #end if -get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /' -echo "kmers: " $kmer +get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /' +echo "kmers: " $kmer #if $splicesite.splice_source == 'refGeneTable': #if $splicesite.refGenes.__str__ != 'None': cat $splicesite.refGenes | psl_splicesites -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'splicesites') @@ -190,15 +79,122 @@ atoiindex -d $refname echo "atoiindex" -d $refname #end if -get-genome -D $gmapdb -d $refname -m '?' | sed 's/^Available maps .*/maps: /' +get-genome -D $gmapdb -d $refname -m '?' 
| sed 's/^Available maps .*/maps: /' </configfile> </configfiles> + <inputs> + <!-- Name for this gmapdb --> + <param name="refname" type="text" label="Name you want to give this gmap database" help=""> + <validator type="empty_field" message="A database name is required."/> + </param> + <!-- Input data --> + <repeat name="inputs" title="Reference Sequence" min="1"> + <param name="input" type="data" format="fasta" label="reference sequence fasta" /> + </repeat> + <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes" + help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome"> + </param> + + <param name="sort" type="select" label="Sort chromosomes" help=""> + <option value="none">none - use chromosomes as found in FASTA file(s)</option> + <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option> + <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option> + <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option> + </param> + + <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/> + <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/> + <conditional name="splicesite"> + <param name="splice_source" type="select" label="Add splice and intron info from" > + <option value="none"></option> + <option value="refGeneTable">refGenes table from UCSC table browser</option> + <option value="gtf">GTF</option> + <option value="gff3">GFF3</option> + </param> + <when value="none"/> + <when value="refGeneTable"> + <param name="refGenes" type="data" format="tabular" optional="true" label="UCSC refGenes table" help="Example: 
ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz" /> + <param name="col_skip" type="integer" value="1" label="Columns to skip before the id/name column (default 1)" + help="Note that alignment tracks in UCSC sometimes have an extra column on the left."> + <validator type="in_range" message="The number of colmumns to skip must >= 0." min="0."/> + </param> + + </when> + <when value="gtf"> + <param name="gtfGenes" type="data" format="gtf" optional="true" label="Genes as GTF" help="" /> + </when> + <when value="gff3"> + <param name="gff3Genes" type="data" format="gff3" optional="true" label="Genes in GFF3 format" help="" /> + </when> + </conditional> + <conditional name="dbsnp"> + <param name="snp_source" type="select" label="Add SNP info from" > + <option value="none"></option> + <option value="snpTable">UCSC SNP Table</option> + <option value="snpFile">GMAP SNP File</option> + <option value="vcfFile">VCF File</option> + </param> + <when value="none"/> + <when value="snpTable"> + <param name="snps" type="data" format="tabular" optional="true" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" /> + <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" /> + <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help=""> + <option value="1" selected="true">1 (High)</option> + <option value="2">2 (Medium)</option> + <option value="3">3 (All)</option> + </param> + </when> + <when value="snpFile"> + <param name="snps" type="data" format="gmap_snps" optional="true" label="GMAP SNPs file" + help="Format (3 columns): + <br>>rs62211261 21:14379270 CG + <br>>rs62211262 21:14379281 CG + <br>Each line must start with a > character, then be followed by an + identifier (which may have duplicates). 
Then there should be the + chromosomal coordinate of the SNP. (Coordinates are all 1-based, so + the first character of a chromosome is number 1.) Finally, there + should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN) + <br>These alleles must correspond to the possible nucleotides on the plus strand of the genome. + If the one of these two letters does not match the allele in the reference + sequence, that SNP will be ignored in subsequent processing as a probable error. + The N stands for any other allele." /> + </when> + <when value="vcfFile"> + <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file" + help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz + The VCF file contains multiple versions of dbSNP, so if you want a + particular version, such as 135. The vcf_iit program tries to pick + a subset of SNPs that somewhat parallel + the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs + that have been validated (marked in the VCF file as "VLD") or have a + submitter link-out ("SLO"). Otherwise, it excludes SNPs that are + individual genotypes ("GNO"). If none of these conditions hold, then + the SNP is allowed. 
"/> + <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version" + help="The VCF file contains multiple versions of dbSNP, so if you want a particular version, such as 135"/> + </when> + </conditional> + + <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM"> + <option value="12">12 (64MB RAM)</option> + <option value="13">13 (256MB RAM)</option> + <option value="14">14 (1GB RAM)</option> + <option value="15" selected="true">15 (4GB RAM)</option> + </param> + </inputs> + <stdio> + <exit_code range="1" level="fatal" description="Error running gmap_build" /> + </stdio> + <outputs> + <!-- + <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/> + --> + <data format="gmapdb" name="output" label="${tool.name} on ${on_string} gmapdb ${refname}" /> + </outputs> <tests> - </tests> - + </tests> <help> - **GMAP Build** GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.) @@ -225,7 +221,7 @@ **Detecting known and novel splice sites in GSNAP** -GSNAP can detect splice junctions in individual reads. +GSNAP can detect splice junctions in individual reads. GSNAP allows for known splicing at two levels: at the level of known splice sites and at the level of known introns. At the site level, GSNAP finds splicing between arbitrary combinations of donor and @@ -237,8 +233,8 @@ than known introns, unless you are certain that all alternative splicing events are known are represented in your file. 
-Splice site files can be generated from a GTF file -or from refGenes table from UCSC. +Splice site files can be generated from a GTF file +or from refGenes table from UCSC. **SNP-tolerant alignment in GSNAP** @@ -285,7 +281,7 @@ GSNAP has the ability to align reads from bisulfite-treated DNA, which converts unmethylated cytosines to uracils that appear as thymines in -reads. GSNAP is able to identify genomic-T to read-C mismatches, +reads. GSNAP is able to identify genomic-T to read-C mismatches, if a cmetindex is generated. **RNA-editing tolerance in GSNAP** @@ -335,7 +331,8 @@ will overwrite only the identical files from the previous runs. You can then choose the k-mer size at run-time by using the -k flag for either GMAP or GSNAP. - </help> + <citations> + <citation type="doi">10.1093/bioinformatics/bti310</citation> + </citations> </tool> -
--- a/gsnap.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/gsnap.xml Wed Sep 28 10:47:28 2016 -0400 @@ -1,9 +1,9 @@ -<tool id="gsnap" name="GSNAP" version="3.0.0"> +<tool id="gsnap" name="GSNAP" version="3.0.1"> <description>Genomic Short-read Nucleotide Alignment Program</description> <requirements> <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> - <version_string>gsnap --version</version_string> + <version_command>gsnap --version</version_command> <command> #import os.path, re gsnap @@ -140,7 +140,7 @@ --npath=$output.npath #end if #if $output.maxsearch.__str__ != '': - --maxsearch=$output.maxsearch + --maxsearch=$output.maxsearch #end if $output.quiet_if_excessive $output.show_refdiff @@ -266,15 +266,15 @@ </param> <param name="pairmax_dna" type="integer" value="" optional="true" label="Max total genomic length for DNA-Seq paired reads, or other reads without splicing (default 1000)." help="Used if no splice file is provided and novelsplicing is off."/> <param name="pairmax_rna" type="integer" value="" optional="true" label="Max total genomic length for RNA-Seq paired reads, or other reads that could have a splice (default 200000)." help="Used when novel splicing is specified or a splice file is provided. 
Should probably match the value for localsplicedist."/> - <param name="pairexpect" type="integer" value="" optional="true" label="Expected paired-end length" + <param name="pairexpect" type="integer" value="" optional="true" label="Expected paired-end length" help="Used for calling splices in medial part of paired-end reads (default 200)"/> - <param name="pairdev" type="integer" value="" optional="true" label="Allowable deviation from expected paired-end length" + <param name="pairdev" type="integer" value="" optional="true" label="Allowable deviation from expected paired-end length" help="Used for calling splices in medial part of paired-end reads (default 25)"/> </when> </conditional> <param name="barcode_length" type="integer" value="" optional="true" label="Amount of barcode to remove from start of read (default 0)" /> <param name="fastq_id_start" type="integer" value="" optional="true" label="Starting field of identifier in FASTQ header, whitespace-delimited, starting from 1" /> - <param name="fastq_id_end" type="integer" value="" optional="true" label="Ending field of identifier in FASTQ header, whitespace-delimited, starting from 1" + <param name="fastq_id_end" type="integer" value="" optional="true" label="Ending field of identifier in FASTQ header, whitespace-delimited, starting from 1" help="Examples: <br>@HWUSI-EAS100R:6:73:941:1973#0/1 <br> . start=1, end=1 (default) => identifier is HWUSI-EAS100R:6:73:941:1973#0/1 @@ -283,8 +283,8 @@ <br> . start=2, end=2 => identifier is 071112_SLXA-EAS1_s_7:5:1:817:345 <br> . 
start=1, end=2 => identifier is SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345" /> - <param name="filter_chastity" type="select" label="Skip reads marked by the Illumina chastity program" - help="String after the accession having a 'Y' after the first colon, like this: + <param name="filter_chastity" type="select" label="Skip reads marked by the Illumina chastity program" + help="String after the accession having a 'Y' after the first colon, like this: <br>@accession 1:Y:0:CTTGTA <br>where the 'Y' signifies filtering by chastity. <br> For 'either', a 'Y' on either end of a paired-end read will be filtered. @@ -303,10 +303,10 @@ <param name="gsnap" type="data" format="fasta" label="Select a single-end dataset" help="GSNAP fasta must have the sequence entirely on one line, a second line is interpreted as the paired-end sequence"/> <param name="circularinput" type="boolean" checked="false" truevalue="--circular-input=true" falsevalue="" label="Circular-end data (paired reads are on same strand)"/> </when> - + </conditional> <!-- No longer in options as of version 2011-11-30 - <param name="mapq_unique_score" type="integer" value="" optional="true" label="MAPQ score threshold" + <param name="mapq_unique_score" type="integer" value="" optional="true" label="MAPQ score threshold" help="For multiple results, consider as a unique result if only one of the results has a MAPQ score equal or greater than this (if not selected, then reports all multiple results, up to npaths)" /> --> @@ -350,8 +350,8 @@ </param> <conditional name="use_splicing"> - <param name="src" type="select" label="<HR>Known Splicesite and Introns" - help="Look for splicing involving known sites or known introns at short or long distances + <param name="src" type="select" label="<HR>Known Splicesite and Introns" + help="Look for splicing involving known sites or known introns at short or long distances See README instructions for the distinction between known sites and known introns"> <option value="none" 
selected="true">None</option> <option value="gmapdb">From the GMAP Database</option> @@ -359,7 +359,7 @@ </param> <when value="none"/> <when value="history"> - <param name="splicemap" type="data" format="splicesites.iit,introns.iit" metadata_name="dbkey" label="Select a splicesite map" + <param name="splicemap" type="data" format="splicesites.iit,introns.iit" label="Select a splicesite map" help="built with GMAP IIT"/> </when> <when value="gmapdb"> @@ -384,7 +384,7 @@ </param> <when value="none"/> <when value="history"> - <param name="snpindex" type="data" format="gmapsnpindex" metadata_name="dbkey" label="Select a snpindex" + <param name="snpindex" type="data" format="gmapsnpindex" label="Select a snpindex" help="built with GMAP SNP Index"/> </when> <when value="gmapdb"> @@ -403,7 +403,7 @@ </when> <when value="gmapdb"> - <param name="gmapdb" type="data" format="gmapdb" metadata_name="dbkey" label="Select a gmapdb" + <param name="gmapdb" type="data" format="gmapdb" label="Select a gmapdb" help="A GMAP database built with GMAP Build"/> <param name="kmer" type="select" data_ref="gmapdb" label="kmer size" help="Defaults to highest available kmer size"> <options> @@ -420,8 +420,8 @@ </param> <conditional name="use_splicing"> - <param name="src" type="select" label="<HR>Known Splicesite and Introns" - help="Look for splicing involving known sites or known introns at short or long distances + <param name="src" type="select" label="<HR>Known Splicesite and Introns" + help="Look for splicing involving known sites or known introns at short or long distances See README instructions for the distinction between known sites and known introns"> <option value="none" selected="true">None</option> <option value="gmapdb">From the GMAP Database</option> @@ -429,10 +429,10 @@ </param> <when value="none"/> <when value="history"> - <param name="splicemap" type="data" format="splicesites.iit,introns.iit" metadata_name="dbkey" label="Select a splicesite map" + <param name="splicemap" 
type="data" format="splicesites.iit,introns.iit" label="Select a splicesite map" help="built with GMAP IIT"/> <param name="ambig_splice_noclip" type="boolean" checked="false" truevalue="--ambig-splice-noclip" falsevalue="" label="Do not clip at ambiguous splice sites" - help="For ambiguous known splicing at ends of the read, do not clip at the splice site, but extend instead into the intron. + help="For ambiguous known splicing at ends of the read, do not clip at the splice site, but extend instead into the intron. This flag makes sense only if you are trying to eliminate all soft clipping with --trim-mismatch-score=0"/> </when> <when value="gmapdb"> @@ -442,7 +442,7 @@ </options> </param> <param name="ambig_splice_noclip" type="boolean" checked="false" truevalue="--ambig-splice-noclip" falsevalue="" label="Do not clip at ambiguous splice sites" - help="For ambiguous known splicing at ends of the read, do not clip at the splice site, but extend instead into the intron. + help="For ambiguous known splicing at ends of the read, do not clip at the splice site, but extend instead into the intron. 
This flag makes sense only if you are trying to eliminate all soft clipping with --trim-mismatch-score=0"/> </when> </conditional> @@ -455,7 +455,7 @@ </param> <when value="none"/> <when value="history"> - <param name="snpindex" type="data" format="gmapsnpindex" metadata_name="dbkey" label="Select a snpindex" + <param name="snpindex" type="data" format="gmapsnpindex" label="Select a snpindex" help="built with GMAP SNP Index"/> </when> <when value="gmapdb"> @@ -478,7 +478,7 @@ </param> <when value="default"/> <when value="advanced"> - <param name="max_mismatches" type="float" value="" optional="true" label="Maximum number of mismatches allowed (uses default when negative)" + <param name="max_mismatches" type="float" value="" optional="true" label="Maximum number of mismatches allowed (uses default when negative)" help="Maximum number of mismatches allowed (if not specified, then defaults to the ultrafast level of ((readlength+index_interval-1)/kmer - 2)) (By default, the genome index interval is 3, but this can be changed @@ -497,7 +497,7 @@ Keeping this number large will allow for random selection among multiple alignments. Reducing this number can speed up the program. "/> - <param name="terminal_threshold" type="integer" value="" optional="true" label="Threshold for searching for a terminal alignment" + <param name="terminal_threshold" type="integer" value="" optional="true" label="Threshold for searching for a terminal alignment" help="Threshold for searching for a terminal alignment (from one end of the read to the best possible position at the other end) (default 2 for standard, atoi-stranded, and atoi-nonstranded mode; default 100 @@ -510,7 +510,7 @@ To turn off terminal alignments, set this to a high value, greater than the value for max-mismatches. 
"/> - <param name="indel_penalty" type="integer" value="" optional="true" label="Penalty for an indel (default 2)" + <param name="indel_penalty" type="integer" value="" optional="true" label="Penalty for an indel (default 2)" help="Counts against mismatches allowed. To find indels, make indel-penalty less than or equal to max-mismatches. A value < 2 can lead to false positives at read ends" /> <param name="indel_endlength" type="integer" value="" optional="true" label="Minimum length at end required for indel alignments (default 4)" /> <param name="max_middle_insertions" type="integer" value="" optional="true" label="Maximum number of middle insertions allowed (default 9)" /> @@ -519,17 +519,17 @@ <param name="max_end_deletions" type="integer" value="" optional="true" label="Maximum number of end deletions allowed (default 6)" /> <param name="suboptimal_levels" type="integer" value="" optional="true" label="Report suboptimal hits beyond best hit (default 0)" help="All hits with best score plus suboptimal-levels are reported" /> - <param name="adapter_strip" type="select" label="Method for removing adapters from reads" + <param name="adapter_strip" type="select" label="Method for removing adapters from reads" help="Default is 'off'. 
To turn on, specify 'paired', which removes adapters from paired-end reads if they appear to be present."> <option value="paired">paired</option> <option value="off" selected="true">off</option> </param> - <param name="trim_mismatch_score" type="integer" value="" optional="true" label="Score to use for mismatches when trimming at ends (default is -3)" + <param name="trim_mismatch_score" type="integer" value="" optional="true" label="Score to use for mismatches when trimming at ends (default is -3)" help="to turn off trimming, specify 0 (Warning: turning trimming off will give false positive mismatches at the ends of reads)"/> - <param name="trim_indel_score" type="integer" value="" optional="true" label="Score to use for indels when trimming at ends (default is -4)" + <param name="trim_indel_score" type="integer" value="" optional="true" label="Score to use for indels when trimming at ends (default is -4)" help="to turn off trimming, specify 0 (Warning: turning trimming off will give false positive indels at the ends of reads)"/> - <param name="use_tally" type="data" format="tally.iit" optional="true" metadata_name="dbkey" label="Select a tally IIT file to resolve concordant multiple results" + <param name="use_tally" type="data" format="tally.iit" optional="true" label="Select a tally IIT file to resolve concordant multiple results" help="generated by gsnap_tally and iit_store"/> <!-- @@ -542,24 +542,24 @@ just give full path name to use-runlength instead. 
use-runlength=STRING Use this runlength IIT file to resolve concordant multiple results --> - + <!-- Options for GMAP alignment within GSNAP --> - <param name="gmap_mode" type="select" multiple="true" optional="true" display="checkboxes" label="Cases to use GMAP for complex alignments containing multiple splices or indels" + <param name="gmap_mode" type="select" multiple="true" optional="true" display="checkboxes" label="Cases to use GMAP for complex alignments containing multiple splices or indels" help="Default: pairsearch,terminal,improve"> <option value="pairsearch" selected="true">pairsearch</option> <option value="indel_knownsplice" selected="true">indel_knownsplice</option> <option value="terminal" selected="true">terminal</option> <option value="improve" selected="true">improve</option> </param> - <param name="trigger_score_for_gmap" type="integer" value="" optional="true" label="GMAP pairsearch threshold (default 5)" + <param name="trigger_score_for_gmap" type="integer" value="" optional="true" label="GMAP pairsearch threshold (default 5)" help="Try GMAP pairsearch on nearby genomic regions if best score (the total of both ends if paired-end) exceeds this value (default 5)" /> - <param name="max_gmap_pairsearch" type="integer" value="" optional="true" label="GMAP pairsearch threshold (default 3)" + <param name="max_gmap_pairsearch" type="integer" value="" optional="true" label="GMAP pairsearch threshold (default 3)" help="Perform GMAP pairsearch on nearby genomic regions up to this many candidate ends (default 3)." /> - <param name="max_gmap_terminal" type="integer" value="" optional="true" label="GMAP terminal threshold (default 3)" + <param name="max_gmap_terminal" type="integer" value="" optional="true" label="GMAP terminal threshold (default 3)" help="Perform GMAP terminal on nearby genomic regions up to this many candidate ends (default 3)." 
/> - <param name="max_gmap_improvement" type="integer" value="" optional="true" label="GMAP improvement threshold (default 3)" + <param name="max_gmap_improvement" type="integer" value="" optional="true" label="GMAP improvement threshold (default 3)" help="Perform GMAP improvement on nearby genomic regions up to this many candidate ends (default 3)." /> - <param name="microexon_spliceprob" type="float" value="" optional="true" label="GMAP microexons threshold (default .90)" + <param name="microexon_spliceprob" type="float" value="" optional="true" label="GMAP microexons threshold (default .90)" help="Allow microexons only if one of the splice site probabilities is greater than this value." > <validator type="in_range" message="The microexons probability must be between 0. and 1." min="0." max="1."/> </param> @@ -574,7 +574,7 @@ <when value="default"/> <when value="advanced"> <!-- Splicing options for RNA-Seq --> - <!-- use-splicing This should be either a select list from the gmapdb maps or a data type using splicesdir and use-splicing --> + <!-- use-splicing This should be either a select list from the gmapdb maps or a data type using splicesdir and use-splicing --> <!-- Neither novel splicing (-N) nor known splicing (-s) turned on => assume reads are DNA-Seq (genomic) --> <param name="novelsplicing" type="boolean" checked="false" truevalue="--novelsplicing=1" falsevalue="" label="Look for novel splicing "/> <param name="localsplicedist" type="integer" value="" optional="true" label="Definition of local novel splicing event (default 200000)"/> @@ -587,8 +587,8 @@ <param name="shortend_splice_endlength" type="integer" value="" optional="true" label="Minimum length at end required for short-end spliced alignments" help="(default 2, but unless known splice sites are provided, GSNAP may still need the end length to be the value of kmer size to find a given splice"/> <param name="distant_splice_identity" type="float" value="" optional="true" label="Minimum identity at 
end required for distant spliced alignments (default 0.95)"/> - <param name="antistranded_penalty" type="integer" value="" optional="true" label="Penalty for antistranded splicing when using stranded RNA-Seq protocols" - help="A positive value, such as 1, expects antisense on the first read and sense on the second read. + <param name="antistranded_penalty" type="integer" value="" optional="true" label="Penalty for antistranded splicing when using stranded RNA-Seq protocols" + help="A positive value, such as 1, expects antisense on the first read and sense on the second read. Default is 0, which treats sense and antisense equally well"/> </when> </conditional> @@ -602,11 +602,11 @@ <when value="default"/> <when value="advanced"> <param name="npath" type="integer" value="" optional="true" label="Maximum number of paths to print (default 100)"/> - <param name="quiet_if_excessive" type="boolean" checked="false" truevalue="--quiet-if-excessive" falsevalue="" label="Quiet if Excessive" + <param name="quiet_if_excessive" type="boolean" checked="false" truevalue="--quiet-if-excessive" falsevalue="" label="Quiet if Excessive" help="If more than maximum number of paths are found, then nothing is printed."/> - <param name="show_refdiff" type="boolean" checked="false" truevalue="--show-refdiff" falsevalue="" label="Show SNP-tolerant alignment" + <param name="show_refdiff" type="boolean" checked="false" truevalue="--show-refdiff" falsevalue="" label="Show SNP-tolerant alignment" help="For GSNAP output in SNP-tolerant alignment, shows all differences relative to the reference genome as lower case (otherwise, it shows all differences relative to both the reference and alternate genome)"/> - <param name="clip_overlap" type="boolean" checked="false" truevalue="--clip-overlap" falsevalue="" label="Clip Overlap" + <param name="clip_overlap" type="boolean" checked="false" truevalue="--clip-overlap" falsevalue="" label="Clip Overlap" help="For paired-end reads whose alignments overlap, 
clip the overlapping region."/> </when> </conditional> @@ -640,8 +640,8 @@ <!-- TODO combine fails and split_output --> <conditional name="results"> - <param name="split_output" type="select" label="<HR>Split outputs" - help="Separate outputs for: nomapping, halfmapping_uniq, halfmapping_mult, unpaired_uniq, unpaired_mult, paired_uniq, paired_mult, concordant_uniq, and concordant_mult results"> + <param name="split_output" type="select" label="<HR>Split outputs" + help="Separate outputs for: nomapping, halfmapping_uniq, halfmapping_mult, unpaired_uniq, unpaired_mult, paired_uniq, paired_mult, concordant_uniq, and concordant_mult results"> <option value="no">no</option> <option value="yes">yes</option> </param> @@ -655,8 +655,8 @@ <when value="default"/> <when value="nofails"/> <when value="failsonly"> - <param name="fails_as_input" type="boolean" truevalue="--fails-as-input" falsevalue="" checked="false" label="Print completely failed alignments as input FASTA or FASTQ format" - help=""/> + <param name="fails_as_input" type="boolean" truevalue="--fails-as-input" falsevalue="" checked="false" label="Print completely failed alignments as input FASTA or FASTQ format" + help=""/> </when> </conditional> </when> @@ -671,8 +671,8 @@ <when value="nofails"/> <when value="failsonly"/> </conditional> - <param name="fails_as_input" type="boolean" truevalue="--fails-as-input" falsevalue="" checked="false" label="Print completely failed alignments as input FASTA or FASTQ format" - help=""/> + <param name="fails_as_input" type="boolean" truevalue="--fails-as-input" falsevalue="" checked="false" label="Print completely failed alignments as input FASTA or FASTQ format" + help=""/> </when> </conditional> @@ -808,18 +808,18 @@ </outputs> <tests> - </tests> + </tests> <help> **What it does** -GSNAP_ (Genomic Short-read Nucleotide Alignment Program) is a short read aligner which can align both single- and paired-end reads as short as 14nt and of arbitrarily long length. 
It can detect short- and long-distance splicing, including interchromosomal splicing, in individual reads, using probabilistic models or a database of known splice sites. Our program also permits SNP-tolerant alignment to a reference space of all possible combinations of major and minor alleles, and can align reads from bisulfite-treated DNA for the study of methylation state. It is developed by Thomas D. Wu of Genentech, Inc. +GSNAP_ (Genomic Short-read Nucleotide Alignment Program) is a short read aligner which can align both single- and paired-end reads as short as 14nt and of arbitrarily long length. It can detect short- and long-distance splicing, including interchromosomal splicing, in individual reads, using probabilistic models or a database of known splice sites. Our program also permits SNP-tolerant alignment to a reference space of all possible combinations of major and minor alleles, and can align reads from bisulfite-treated DNA for the study of methylation state. It is developed by Thomas D. Wu of Genentech, Inc. Publication_ citation: Thomas D. Wu, Serban Nacu "Fast and SNP-tolerant detection of complex variants and splicing in short reads." Bioinformatics. 2010 Apr 1;26(7):873-81. Epub 2010 Feb 10. .. _GSNAP: http://research-pub.gene.com/gmap/ .. _Publication: http://bioinformatics.oupjournals.org/cgi/content/full/26/7/873 -http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2844994/?tool=pubmed +https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2844994/?tool=pubmed ------ @@ -835,10 +835,10 @@ **Input formats** -Input to GSNAP should be either in FASTQ or FASTA format. +Input to GSNAP should be either in FASTQ or FASTA format. The FASTQ input may include quality scores, which will then be included in SAM -output, if that output format is selected. +output, if that output format is selected. For FASTA format, you should include one line per read (or end of a paired-end read). 
The same FASTA file can have a mixture of @@ -880,10 +880,9 @@ Default GSNAP format See the README_ - - - - </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btq057</citation> + </citations> </tool>
--- a/iit_store.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/iit_store.xml Wed Sep 28 10:47:28 2016 -0400 @@ -1,109 +1,10 @@ -<tool id="gmap_iit_store" name="GMAP IIT" version="3.0.0"> +<tool id="gmap_iit_store" name="GMAP IIT" version="3.0.1"> <description>Create a map store for known genes or SNPs</description> <requirements> <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> - <version_string>iit_store --version</version_string> + <version_command>iit_store --version</version_command> <command interpreter="command"> /bin/bash $shscript 2> $log </command> - <inputs> - <!-- Input data --> - <conditional name="map"> - <param name="type" type="select" label="Make map for" > - <option value="genes">Introns and Splice sites</option> - <option value="snps">SNPs</option> - <option value="gmap">GMAP Annotation</option> - </param> - <when value="genes"> - <conditional name="src"> - <param name="src_format" type="select" label="Add splice and intron info from" > - <option value="refGeneTable">refGenes table from UCSC table browser</option> - <option value="gtf">GTF</option> - <option value="gff3">GFF3</option> - </param> - <when value="refGeneTable"> - <param name="genes" type="data" format="tabular" label="UCSC refGenes table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz" /> - <param name="col_skip" type="integer" value="1" label="Columns to skip before the id/name column (default 1)" - help="Note that alignment tracks in UCSC sometimes have an extra column on the left."> - <validator type="in_range" message="The number of colmumns to skip must >= 0." 
min="0."/> - </param> - </when> - <when value="gtf"> - <param name="genes" type="data" format="gtf" label="Genes as GTF" help="" /> - </when> - <when value="gff3"> - <param name="genes" type="data" format="gff3" label="Genes in GFF3 format" help="" /> - </when> - </conditional> - <param name="maps" type="select" display="checkboxes" multiple="true" force_select="true" label="Add splice and intron info from" > - <option value="splicesites" selected="true">splicesites.iit</option> - <option value="introns" selected="false">introns.iit</option> - </param> - </when> - <when value="snps"> - <conditional name="src"> - <param name="src_format" type="select" label="Add SNP info from" > - <option value="snpTable">UCSC SNP Table</option> - <option value="snpFile">GMAP SNP File</option> - </param> - <when value="snpTable"> - <param name="snps" type="data" format="tabular" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" /> - <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" /> - <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help=""> - <option value="1" selected="true">1 (High)</option> - <option value="2">2 (Medium)</option> - <option value="3">3 (All)</option> - </param> - </when> - <when value="snpFile"> - <param name="snps" type="data" format="gmap_snps" optional="true" label="GMAP SNPs file" - help="Format (3 columns):<B> - <br>>rs62211261 21:14379270 CG - <br>>rs62211262 21:14379281 CG - </B> - <br>Each line must start with a > character, then be followed by an - identifier (which may have duplicates). Then there should be the - chromosomal coordinate of the SNP. (Coordinates are all 1-based, so - the first character of a chromosome is number 1.) 
Finally, there - should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN) - <br>These alleles must correspond to the possible nucleotides on the plus strand of the genome. - If the one of these two letters does not match the allele in the reference - sequence, that SNP will be ignored in subsequent processing as a probable error. - The N stands for any other allele." /> - </when> - </conditional> - </when> - <when value="gmap"> - <param name="annotation" type="data" format="gmap_annotation" label="GMAP mapfile" - help="Format (2 or columns): <B> - <br>>label coords optional_tag - <br>optional_annotation (which may be zero, one, or multiple lines) - </B> - <br>Each line must start with a > character, then be followed by an identifier (which may have duplicates). - <br>Then there should be the chromosomal coordinate range. (Coordinates are all 1-based, so the first character of a chromosome is number 1.) - <br>The coords should be of the form - <br> chr:position - <br> chr:startposition..endposition - <br>The term chr:position is equivalent to chr:position..position. - <br>If you want to indicate that the interval is on the minus strand or reverse direction, then endposition may be less than startposition. 
- " /> - </when> - </conditional> - </inputs> - <outputs> - <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/> - <data format="splicesites.iit" name="splicesites_iit" label="${tool.name} on ${on_string} splicesites.iit"> - <filter>(map['type'] == 'genes' and 'splicesites' in map['maps'])</filter> - </data> - <data format="introns.iit" name="introns_iit" label="${tool.name} on ${on_string} introns.iit"> - <filter>(map['type'] == 'genes' and 'introns' in map['maps'])</filter> - </data> - <data format="snps.iit" name="snps_iit" label="${tool.name} on ${on_string} snps.iit"> - <filter>(map['type'] == 'snps')</filter> - </data> - <data format="iit" name="map_iit" label="${tool.name} on ${on_string} map.iit"> - <filter>(map['type'] == 'gmap')</filter> - </data> - </outputs> <configfiles> <configfile name="shscript"> #!/bin/bash @@ -146,23 +47,121 @@ #if $map.src.snpsex.__str__ != 'None': $catcmd $map.src.snps | dbsnp_iit -w $map.src.weight -e $map.src.snpsex | iit_store -o $snps_iit #else: - $catcmd $map.src.snps | dbsnp_iit -w $map.src.weight | iit_store -o $snps_iit + $catcmd $map.src.snps | dbsnp_iit -w $map.src.weight | iit_store -o $snps_iit #end if #else: - $catcmd $map.src.snps | iit_store -o $map_iit + $catcmd $map.src.snps | iit_store -o $map_iit #end if </configfile> </configfiles> - + <inputs> + <!-- Input data --> + <conditional name="map"> + <param name="type" type="select" label="Make map for" > + <option value="genes">Introns and Splice sites</option> + <option value="snps">SNPs</option> + <option value="gmap">GMAP Annotation</option> + </param> + <when value="genes"> + <conditional name="src"> + <param name="src_format" type="select" label="Add splice and intron info from" > + <option value="refGeneTable">refGenes table from UCSC table browser</option> + <option value="gtf">GTF</option> + <option value="gff3">GFF3</option> + </param> + <when value="refGeneTable"> + <param name="genes" type="data" format="tabular" label="UCSC 
refGenes table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz" /> + <param name="col_skip" type="integer" value="1" label="Columns to skip before the id/name column (default 1)" + help="Note that alignment tracks in UCSC sometimes have an extra column on the left."> + <validator type="in_range" message="The number of colmumns to skip must >= 0." min="0."/> + </param> + </when> + <when value="gtf"> + <param name="genes" type="data" format="gtf" label="Genes as GTF" help="" /> + </when> + <when value="gff3"> + <param name="genes" type="data" format="gff3" label="Genes in GFF3 format" help="" /> + </when> + </conditional> + <param name="maps" type="select" display="checkboxes" multiple="true" force_select="true" label="Add splice and intron info from" > + <option value="splicesites" selected="true">splicesites.iit</option> + <option value="introns" selected="false">introns.iit</option> + </param> + </when> + <when value="snps"> + <conditional name="src"> + <param name="src_format" type="select" label="Add SNP info from" > + <option value="snpTable">UCSC SNP Table</option> + <option value="snpFile">GMAP SNP File</option> + </param> + <when value="snpTable"> + <param name="snps" type="data" format="tabular" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" /> + <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" /> + <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help=""> + <option value="1" selected="true">1 (High)</option> + <option value="2">2 (Medium)</option> + <option value="3">3 (All)</option> + </param> + </when> + <when value="snpFile"> + <param name="snps" type="data" format="gmap_snps" optional="true" label="GMAP SNPs file" + help="Format (3 columns):<B> + <br>>rs62211261 21:14379270 
CG + <br>>rs62211262 21:14379281 CG + </B> + <br>Each line must start with a > character, then be followed by an + identifier (which may have duplicates). Then there should be the + chromosomal coordinate of the SNP. (Coordinates are all 1-based, so + the first character of a chromosome is number 1.) Finally, there + should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN) + <br>These alleles must correspond to the possible nucleotides on the plus strand of the genome. + If the one of these two letters does not match the allele in the reference + sequence, that SNP will be ignored in subsequent processing as a probable error. + The N stands for any other allele." /> + </when> + </conditional> + </when> + <when value="gmap"> + <param name="annotation" type="data" format="gmap_annotation" label="GMAP mapfile" + help="Format (2 or columns): <B> + <br>>label coords optional_tag + <br>optional_annotation (which may be zero, one, or multiple lines) + </B> + <br>Each line must start with a > character, then be followed by an identifier (which may have duplicates). + <br>Then there should be the chromosomal coordinate range. (Coordinates are all 1-based, so the first character of a chromosome is number 1.) + <br>The coords should be of the form + <br> chr:position + <br> chr:startposition..endposition + <br>The term chr:position is equivalent to chr:position..position. + <br>If you want to indicate that the interval is on the minus strand or reverse direction, then endposition may be less than startposition. 
+ " /> + </when> + </conditional> + </inputs> + <outputs> + <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/> + <data format="splicesites.iit" name="splicesites_iit" label="${tool.name} on ${on_string} splicesites.iit"> + <filter>(map['type'] == 'genes' and 'splicesites' in map['maps'])</filter> + </data> + <data format="introns.iit" name="introns_iit" label="${tool.name} on ${on_string} introns.iit"> + <filter>(map['type'] == 'genes' and 'introns' in map['maps'])</filter> + </data> + <data format="snps.iit" name="snps_iit" label="${tool.name} on ${on_string} snps.iit"> + <filter>(map['type'] == 'snps')</filter> + </data> + <data format="iit" name="map_iit" label="${tool.name} on ${on_string} map.iit"> + <filter>(map['type'] == 'gmap')</filter> + </data> + </outputs> <tests> - </tests> + </tests> <help> **iit_store** -GMAP IIT creates an Interval Index Tree map of known splice sites, introns, or SNPs (it uses iit_store described in the GMAP documentation). The maps can be used in GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). Maps are typically used for known splice sites, introns, or SNPs. +GMAP IIT creates an Interval Index Tree map of known splice sites, introns, or SNPs (it uses iit_store described in the GMAP documentation). The maps can be used in GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). Maps are typically used for known splice sites, introns, or SNPs. You will want to read the README_ @@ -177,5 +176,8 @@ **inputs** </help> + <citations> + <citation type="doi">10.1093/bioinformatics/bti310</citation> + </citations> </tool>
--- a/lib/galaxy/datatypes/gmap.py Wed Sep 28 10:43:44 2016 -0400 +++ b/lib/galaxy/datatypes/gmap.py Wed Sep 28 10:47:28 2016 -0400 @@ -2,88 +2,109 @@ GMAP indexes """ import logging -import os,os.path,re,sys -import galaxy.datatypes.data +import os +import os.path +import re +import sys +from galaxy.datatypes import data from galaxy.datatypes.data import Text from galaxy import util from galaxy.datatypes.metadata import MetadataElement log = logging.getLogger(__name__) -class GmapDB( Text ): + +class GmapDB(Text): + """ A GMAP DB for indexes """ - MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True ) - MetadataElement( name="chromosomes", desc="The chromosomes or contigs", no_value=[], readonly=False ) - MetadataElement( name="circular", desc="cirular chromosomes", no_value=[], readonly=False ) - MetadataElement( name="chromlength", desc="Chromosome lengths", no_value=[], readonly=False ) - MetadataElement( name="basesize", default="12", desc="The basesize for offsetscomp", visible=True, readonly=True ) - MetadataElement( name="kmers", desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True ) - MetadataElement( name="map_dir", desc="The maps directory", default='unknown', set_in_upload=True, readonly=True ) - MetadataElement( name="maps", desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True ) - MetadataElement( name="snps", desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True ) - MetadataElement( name="cmet", default=False, desc="Has a cmet index", visible=True, readonly=True ) - MetadataElement( name="atoi", default=False, desc="Has a atoi index", visible=True, readonly=True ) - + MetadataElement(name="db_name", desc="The db name for this index set", + default='unknown', set_in_upload=True, readonly=True) + MetadataElement( + name="chromosomes", desc="The chromosomes or contigs", 
no_value=[], readonly=False) + MetadataElement( + name="circular", desc="cirular chromosomes", no_value=[], readonly=False) + MetadataElement( + name="chromlength", desc="Chromosome lengths", no_value=[], readonly=False) + MetadataElement(name="basesize", default="12", + desc="The basesize for offsetscomp", visible=True, readonly=True) + MetadataElement(name="kmers", desc="The kmer sizes for indexes", + visible=True, no_value=[''], readonly=True) + MetadataElement(name="map_dir", desc="The maps directory", + default='unknown', set_in_upload=True, readonly=True) + MetadataElement(name="maps", desc="The names of maps stored for this gmap gmapdb", + visible=True, no_value=[''], readonly=True) + MetadataElement(name="snps", desc="The names of SNP indexes stored for this gmapdb", + visible=True, no_value=[''], readonly=True) + MetadataElement(name="cmet", default=False, + desc="Has a cmet index", visible=True, readonly=True) + MetadataElement(name="atoi", default=False, + desc="Has a atoi index", visible=True, readonly=True) + file_ext = 'gmapdb' is_binary = True composite_type = 'auto_primary_file' allow_datatype_change = False - def generate_primary_file( self, dataset = None ): - """ + def generate_primary_file(self, dataset=None): + """ This is called only at upload to write the html file cannot rename the datasets here - they come with the default unfortunately """ return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>' - - def regenerate_primary_file(self,dataset): + + def regenerate_primary_file(self, dataset): """ - cannot do this until we are setting metadata + cannot do this until we are setting metadata """ bn = dataset.metadata.db_name - log.info( "GmapDB regenerate_primary_file %s" % (bn)) + log.info("GmapDB regenerate_primary_file %s" % (bn)) rval = [] rval.append("GMAPDB: %s" % dataset.metadata.db_name) if dataset.metadata.chromosomes: rval.append("chromosomes: %s" % dataset.metadata.chromosomes) if 
dataset.metadata.chromlength and len(dataset.metadata.chromlength) == len(dataset.metadata.chromosomes): - rval.append( 'chrom\tlength' ) - for i,name in enumerate(dataset.metadata.chromosomes): - rval.append( '%s\t%d' % (dataset.metadata.chromosomes[i],dataset.metadata.chromlength[i])) + rval.append('chrom\tlength') + for i, name in enumerate(dataset.metadata.chromosomes): + rval.append( + '%s\t%d' % (dataset.metadata.chromosomes[i], dataset.metadata.chromlength[i])) if dataset.metadata.circular: rval.append("circular: %s" % dataset.metadata.circular) if dataset.metadata.kmers: rval.append("kmers: %s" % dataset.metadata.kmers) - rval.append("cmetindex: %s atoiindex: %s" % (dataset.metadata.cmet,dataset.metadata.atoi)) + rval.append("cmetindex: %s atoiindex: %s" % + (dataset.metadata.cmet, dataset.metadata.atoi)) if dataset.metadata.maps and len(dataset.metadata.maps) > 0: - rval.append( 'Maps:') - for i,name in enumerate(dataset.metadata.maps): + rval.append('Maps:') + for i, name in enumerate(dataset.metadata.maps): if name.strip() != '': - rval.append( ' %s' % name) - f = file(dataset.file_name,'w') - f.write("\n".join( rval )) + rval.append(' %s' % name) + f = open(dataset.file_name, 'w') + f.write("\n".join(rval)) f.write('\n') f.close() - def set_peek( self, dataset, is_multi_byte=False ): - log.info( "GmapDB set_peek %s" % (dataset)) + def set_peek(self, dataset, is_multi_byte=False): + log.info("GmapDB set_peek %s" % (dataset)) if not dataset.dataset.purged: - dataset.peek = "GMAPDB index %s\n chroms %s\n kmers %s cmet %s atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.chromosomes,dataset.metadata.kmers,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps ) - dataset.blurb = "GMAPDB %s" % ( dataset.metadata.db_name ) + dataset.peek = "GMAPDB index %s\n chroms %s\n kmers %s cmet %s atoi %s\n maps %s" % ( + dataset.metadata.db_name, dataset.metadata.chromosomes, dataset.metadata.kmers, dataset.metadata.cmet, 
dataset.metadata.atoi, dataset.metadata.maps) + dataset.blurb = "GMAPDB %s" % (dataset.metadata.db_name) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' - def display_peek( self, dataset ): + + def display_peek(self, dataset): try: return dataset.peek except: return "GMAP index file" - def sniff( self, filename ): + def sniff(self, filename): return False - def set_meta( self, dataset, overwrite = True, **kwd ): + + def set_meta(self, dataset, overwrite=True, **kwd): """ extra_files_path/<db_name>/GRCh37_19 extra_files_path/<db_name>/GRCh37_19/GRCh37_19.a2iag12123offsetscomp @@ -110,81 +131,96 @@ extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs - index maps: + index maps: extra_files_path/db_name/db_name.maps/*.iit """ - log.info( "GmapDB set_meta %s %s" % (dataset,dataset.extra_files_path)) + log.info("GmapDB set_meta %s %s" % (dataset, dataset.extra_files_path)) chrom_pat = '^(.+).chromosome$' - #pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?' + # pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?' 
pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?(\d)(offsetscomp)' efp = dataset.extra_files_path flist = os.listdir(efp) - for i,fname in enumerate(flist): - log.info( "GmapDB set_meta %s %s" % (i,fname)) - fpath = os.path.join(efp,fname) + for i, fname in enumerate(flist): + log.info("GmapDB set_meta %s %s" % (i, fname)) + fpath = os.path.join(efp, fname) if os.path.isdir(fpath): ilist = os.listdir(fpath) - # kmers = {'':'default'} # HACK '' empty key added so user has default choice when selecting kmer from metadata + # kmers = {'':'default'} # HACK '' empty key added so user + # has default choice when selecting kmer from metadata kmers = dict() - for j,iname in enumerate(ilist): - log.info( "GmapDB set_meta file %s %s" % (j,iname)) - ipath = os.path.join(fpath,iname) - print >> sys.stderr, "GmapDB set_meta file %s %s %s" % (j,iname,ipath) + for j, iname in enumerate(ilist): + log.info("GmapDB set_meta file %s %s" % (j, iname)) + ipath = os.path.join(fpath, iname) + print >> sys.stderr, "GmapDB set_meta file %s %s %s" % ( + j, iname, ipath) if os.path.isdir(ipath): # find maps dataset.metadata.map_dir = iname maps = [] snps = [] for mapfile in os.listdir(ipath): - mapname = mapfile.replace('.iit','') - log.info( "GmapDB set_meta map %s %s" % (mapname,mapfile)) - print >> sys.stderr, "GmapDB set_meta map %s %s " % (mapname,mapfile) + mapname = mapfile.replace('.iit', '') + log.info("GmapDB set_meta map %s %s" % + (mapname, mapfile)) + print >> sys.stderr, "GmapDB set_meta map %s %s " % ( + mapname, mapfile) maps.append(mapname) - if mapname.find('snp') >= 0: + if mapname.find('snp') >= 0: snps.append(mapname) if len(maps) > 0: dataset.metadata.maps = maps if len(snps) > 0: dataset.metadata.snps = snps - else: - m = re.match(chrom_pat,iname) + else: + m = re.match(chrom_pat, iname) if m and len(m.groups()) == 1: dataset.metadata.db_name = m.groups()[0] - print >> sys.stderr, "GmapDB set_meta file %s %s %s" % (j,iname,ipath) + print >> 
sys.stderr, "GmapDB set_meta file %s %s %s" % ( + j, iname, ipath) try: fh = open(ipath) dataset.metadata.chromosomes = [] dataset.metadata.circular = [] dataset.metadata.chromlength = [] - for k,line in enumerate(fh): - fields = line.strip().split('\t') - print >> sys.stderr, "GmapDB set_meta chrom %s fields %s" % (line,fields) - if len(fields) > 2: - dataset.metadata.chromosomes.append(str(fields[0])) - dataset.metadata.chromlength.append(int(fields[2])) - if len(fields) > 3 and fields[3] == 'circular': - dataset.metadata.circular.append(str(fields[0])) - print >> sys.stderr, "GmapDB set_meta db_name %s chromosomes %s circular %s" % (dataset.metadata.db_name,dataset.metadata.chromosomes,dataset.metadata.circular) - except Exception, e: - log.info( "GmapDB set_meta error %s %s " % (iname, e)) - print >> sys.stderr, "GmapDB set_meta file %s Error %s" % (ipath,e) + for k, line in enumerate(fh): + fields = line.strip().split('\t') + print >> sys.stderr, "GmapDB set_meta chrom %s fields %s" % ( + line, fields) + if len(fields) > 2: + dataset.metadata.chromosomes.append( + str(fields[0])) + dataset.metadata.chromlength.append( + int(fields[2])) + if len(fields) > 3 and fields[3] == 'circular': + dataset.metadata.circular.append( + str(fields[0])) + print >> sys.stderr, "GmapDB set_meta db_name %s chromosomes %s circular %s" % ( + dataset.metadata.db_name, dataset.metadata.chromosomes, dataset.metadata.circular) + except Exception as e: + log.info( + "GmapDB set_meta error %s %s " % (iname, e)) + print >> sys.stderr, "GmapDB set_meta file %s Error %s" % ( + ipath, e) finally: if fh: fh.close() continue - m = re.match(pat,iname) + m = re.match(pat, iname) if m: - log.info( "GmapDB set_meta m %s %s " % (iname, m)) - print >> sys.stderr, "GmapDB set_meta iname %s %s" % (iname,m.groups()) + log.info("GmapDB set_meta m %s %s " % (iname, m)) + print >> sys.stderr, "GmapDB set_meta iname %s %s" % ( + iname, m.groups()) assert len(m.groups()) == 10 if m.groups()[2] == 'ref': - 
if m.groups()[-1] != None and m.groups()[-1] != 'offsetscomp': - dataset.metadata.snps.append(m.groups()[-1]) + if m.groups()[-1] is not None and m.groups()[-1] != 'offsetscomp': + dataset.metadata.snps.append( + m.groups()[-1]) else: - if m.groups()[-3] != None: + if m.groups()[-3] is not None: k = int(m.groups()[-3]) kmers[k] = k - if m.groups()[-4] != None: - dataset.metadata.basesize = int( m.groups()[-4]) + if m.groups()[-4] is not None: + dataset.metadata.basesize = int( + m.groups()[-4]) elif m.groups()[3] == 'met': dataset.metadata.cmet = True elif m.groups()[4] == 'a2i': @@ -192,56 +228,66 @@ dataset.metadata.kmers = kmers.keys() self.regenerate_primary_file(dataset) -class GmapSnpIndex( Text ): + +class GmapSnpIndex(Text): + """ A GMAP SNP index created by snpindex """ - MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True ) - MetadataElement( name="snps_name", default='snps', desc="The name of SNP index", visible=True, no_value='', readonly=True ) - + MetadataElement(name="db_name", desc="The db name for this index set", + default='unknown', set_in_upload=True, readonly=True) + MetadataElement(name="snps_name", default='snps', + desc="The name of SNP index", visible=True, no_value='', readonly=True) + file_ext = 'gmapsnpindex' is_binary = True composite_type = 'auto_primary_file' allow_datatype_change = False - def generate_primary_file( self, dataset = None ): - """ + def generate_primary_file(self, dataset=None): + """ This is called only at upload to write the html file cannot rename the datasets here - they come with the default unfortunately """ return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>' - - def regenerate_primary_file(self,dataset): + + def regenerate_primary_file(self, dataset): """ - cannot do this until we are setting metadata + cannot do this until we are setting metadata """ bn = dataset.metadata.db_name - log.info( 
"GmapDB regenerate_primary_file %s" % (bn)) - rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)] - for i,name in enumerate(dataset.metadata.maps): - rval.append( '<li>%s' % name) - rval.append( '</ul></html>' ) - f = file(dataset.file_name,'w') - f.write("\n".join( rval )) + log.info("GmapDB regenerate_primary_file %s" % (bn)) + rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % + (bn, bn, dataset.metadata.cmet, dataset.metadata.atoi)] + for i, name in enumerate(dataset.metadata.maps): + rval.append('<li>%s' % name) + rval.append('</ul></html>') + f = open(dataset.file_name, 'w') + f.write("\n".join(rval)) f.write('\n') f.close() - def set_peek( self, dataset, is_multi_byte=False ): - log.info( "GmapSnpIndex set_peek %s" % (dataset)) + + def set_peek(self, dataset, is_multi_byte=False): + log.info("GmapSnpIndex set_peek %s" % (dataset)) if not dataset.dataset.purged: - dataset.peek = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name) - dataset.blurb = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name) + dataset.peek = "GMAP SNPindex %s on %s\n" % ( + dataset.metadata.snps_name, dataset.metadata.db_name) + dataset.blurb = "GMAP SNPindex %s on %s\n" % ( + dataset.metadata.snps_name, dataset.metadata.db_name) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' - def display_peek( self, dataset ): + + def display_peek(self, dataset): try: return dataset.peek except: return "GMAP SNP index" - def sniff( self, filename ): + def sniff(self, filename): return False - def set_meta( self, dataset, overwrite = True, **kwd ): + + def set_meta(self, dataset, overwrite=True, **kwd): """ Expecting: extra_files_path/snp_name.iit @@ -249,21 +295,21 @@ 
extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions.snp_name extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs.snp_name """ - log.info( "GmapSnpIndex set_meta %s %s" % (dataset,dataset.extra_files_path)) + log.info("GmapSnpIndex set_meta %s %s" % + (dataset, dataset.extra_files_path)) pat = '(.*)\.(ref((\d\d)(\d\d))?3positions)\.(.+)?' efp = dataset.extra_files_path flist = os.listdir(efp) - for i,fname in enumerate(flist): - m = re.match(pat,fname) + for i, fname in enumerate(flist): + m = re.match(pat, fname) if m: assert len(m.groups()) == 6 dataset.metadata.db_name = m.groups()[0] dataset.metadata.snps_name = m.groups()[-1] - +class IntervalIndexTree(Text): -class IntervalIndexTree( Text ): """ A GMAP Interval Index Tree Map created by iit_store @@ -272,35 +318,45 @@ file_ext = 'iit' is_binary = True -class SpliceSitesIntervalIndexTree( IntervalIndexTree ): + +class SpliceSitesIntervalIndexTree(IntervalIndexTree): + """ - A GMAP Interval Index Tree Map + A GMAP Interval Index Tree Map created by iit_store """ file_ext = 'splicesites.iit' -class IntronsIntervalIndexTree( IntervalIndexTree ): + +class IntronsIntervalIndexTree(IntervalIndexTree): + """ A GMAP Interval Index Tree Map created by iit_store """ file_ext = 'introns.iit' -class SNPsIntervalIndexTree( IntervalIndexTree ): + +class SNPsIntervalIndexTree(IntervalIndexTree): + """ A GMAP Interval Index Tree Map created by iit_store """ file_ext = 'snps.iit' -class TallyIntervalIndexTree( IntervalIndexTree ): + +class TallyIntervalIndexTree(IntervalIndexTree): + """ A GMAP Interval Index Tree Map created by iit_store """ file_ext = 'tally.iit' -class IntervalAnnotation( Text ): + +class IntervalAnnotation(Text): + """ Class describing a GMAP Interval format: >label coords optional_tag @@ -311,38 +367,42 @@ """ file_ext = 'gmap_annotation' """Add metadata elements""" - MetadataElement( name="annotations", default=0, desc="Number of interval annotations", readonly=True, optional=True, 
visible=False, no_value=0 ) + MetadataElement(name="annotations", default=0, desc="Number of interval annotations", + readonly=True, optional=True, visible=False, no_value=0) - def set_meta( self, dataset, **kwd ): + def set_meta(self, dataset, **kwd): """ Set the number of annotations and the number of data lines in dataset. """ data_lines = 0 annotations = 0 - for line in file( dataset.file_name ): + for line in open(dataset.file_name): line = line.strip() - if line and line.startswith( '>' ): + if line and line.startswith('>'): annotations += 1 - data_lines +=1 + data_lines += 1 else: data_lines += 1 dataset.metadata.data_lines = data_lines dataset.metadata.annotations = annotations - def set_peek( self, dataset, is_multi_byte=False ): + + def set_peek(self, dataset, is_multi_byte=False): if not dataset.dataset.purged: - dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) + dataset.peek = data.get_file_peek( + dataset.file_name, is_multi_byte=is_multi_byte) if dataset.metadata.annotations: - dataset.blurb = "%s annotations" % util.commaify( str( dataset.metadata.annotations ) ) + dataset.blurb = "%s annotations" % util.commaify( + str(dataset.metadata.annotations)) else: - dataset.blurb = data.nice_size( dataset.get_size() ) + dataset.blurb = util.nice_size(dataset.get_size()) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' - def sniff( self, filename ): + def sniff(self, filename): """ Determines whether the file is a gmap annotation file - Format: + Format: >label coords optional_tag optional_annotation (which may be zero, one, or multiple lines) For example, the label may be an EST accession, with the coords @@ -356,23 +416,25 @@ reverse direction, then <endposition> may be less than <startposition>. 
""" try: - pat = '>(\S+)\s((\S+):(\d+)(\.\.(\d+))?(\s.(.+))?$' #>label chr:position[..endposition][ optional_tag] - fh = open( filename ) + # >label chr:position[..endposition][ optional_tag] + pat = '>(\S+)\s((\S+):(\d+)(\.\.(\d+))?(\s.(.+))?$' + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line - if line.startswith( '>' ): + if line: # first non-empty line + if line.startswith('>'): count += 1 - if re.match(pat,line) == None: # Failed to match + if re.match(pat, line) is None: # Failed to match return False finally: fh.close() return False + class SpliceSiteAnnotation(IntervalAnnotation): file_ext = 'gmap_splicesites' """ @@ -399,27 +461,30 @@ in the database; GSNAP will use the longest intron distance reported in searching for long introns. """ - def sniff( self, filename ): # TODO + + def sniff(self, filename): # TODO """ Determines whether the file is a gmap splice site annotation file """ try: - pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+))\s(donor|acceptor)(\s(\d+))?$' #>label chr:position..position donor|acceptor[ intron_dist] - fh = open( filename ) + # >label chr:position..position donor|acceptor[ intron_dist] + pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+))\s(donor|acceptor)(\s(\d+))?$' + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line + if line: # first non-empty line count += 1 - if re.match(pat,line) == None: # Failed to match + if re.match(pat, line) is None: # Failed to match return False finally: fh.close() return False + class IntronAnnotation(IntervalAnnotation): file_ext = 'gmap_introns' """ @@ -432,27 +497,30 @@ surrounding the intron, with the first coordinate being from the donor exon and the second one being from the acceptor exon. 
""" - def sniff( self, filename ): # TODO + + def sniff(self, filename): # TODO """ Determines whether the file is a gmap Intron annotation file """ try: - pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+)(\s(.)+)?$' #>label chr:position - fh = open( filename ) + # >label chr:position + pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+)(\s(.)+)?$' + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line + if line: # first non-empty line count += 1 - if re.match(pat,line) == None: # Failed to match + if re.match(pat, line) is None: # Failed to match return False finally: fh.close() return False + class SNPAnnotation(IntervalAnnotation): file_ext = 'gmap_snps' """ @@ -471,7 +539,7 @@ one of these two letters does not match the allele in the reference sequence, that SNP will be ignored in subsequent processing as a probable error. - + GSNAP also supports the idea of a wildcard SNP. A wildcard SNP allows all nucleotides to match at that position, not just a given reference and alternate allele. It is essentially as if an "N" were recorded at @@ -483,22 +551,24 @@ where "W" is the reference allele and "X" and "Y" are two different alternate alleles. 
""" - def sniff( self, filename ): + + def sniff(self, filename): """ Determines whether the file is a gmap SNP annotation file """ try: - pat = '>(\S+)\s((\S+):(\d+)\s([TACGW][TACGN])$' #>label chr:position ATCG - fh = open( filename ) + # >label chr:position ATCG + pat = '>(\S+)\s((\S+):(\d+)\s([TACGW][TACGN])$' + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line + if line: # first non-empty line count += 1 - if re.match(pat,line) == None: # Failed to match + if re.match(pat, line) is None: # Failed to match return False finally: fh.close() @@ -517,32 +587,35 @@ C2 0.889,0.912,0.889,0.889,0.933,0.912,0.912,0.889,0.889,0.889 -2.66,-2.89,-2.66,-2.66,-3.16,-2.89,-2.89,-2.66,-2.66,-2.66 C1 T1 0.888,0.9,0.888,0.9,0.913,0.9,0.911,0.888,0.9,0.913 -2.66,-2.78,-2.66,-2.78,-2.91,-2.78,-2.89,-2.66,-2.78,-2.91 """ - def sniff( self, filename ): # TODO + + def sniff(self, filename): # TODO """ Determines whether the file is a gmap splice site annotation file """ try: - pat = '^>(\d+)\s((\S+):(\d+)\.\.(\d+))$' #>total chr:position..position - pat2 = '^[GATCN]\d.*$' #BaseCountDeatails - fh = open( filename ) + # >total chr:position..position + pat = '^>(\d+)\s((\S+):(\d+)\.\.(\d+))$' + pat2 = '^[GATCN]\d.*$' # BaseCountDeatails + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line + if line: # first non-empty line count += 1 - if re.match(pat,line) == None and re.match(pat2,line) == None: # Failed to match + # Failed to match + if re.match(pat, line) is None and re.match(pat2, line) is None: return False finally: fh.close() return False -class GsnapResult( Text ): + +class GsnapResult(Text): + """ The default output format for gsnap. Can be used as input for gsnap_tally. """ file_ext = 'gsnap' - -
--- a/snpindex.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/snpindex.xml Wed Sep 28 10:47:28 2016 -0400 @@ -1,77 +1,10 @@ -<tool id="gmap_snpindex" name="GMAP SNP Index" version="3.0.0"> +<tool id="gmap_snpindex" name="GMAP SNP Index" version="3.0.1"> <description>build index files for known SNPs</description> <requirements> <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> - <version_string>snpindex --version</version_string> + <version_command>snpindex --version</version_command> <command interpreter="command"> /bin/bash $shscript 2>1 1> $output </command> - <inputs> - <conditional name="refGenomeSource"> - <param name="genomeSource" type="select" label="Will you map to a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> - <option value="indexed">Use a built-in index</option> - <option value="gmapdb">Use gmapdb from the history</option> - </param> - <when value="indexed"> - <param name="gmapindex" type="select" label="Select a reference genome" help="if your genome of interest is not listed - contact Galaxy team"> - <options from_file="gmap_indices.loc"> - <column name="uid" index="0" /> - <column name="dbkey" index="1" /> - <column name="name" index="2" /> - <column name="kmers" index="3" /> - <column name="maps" index="4" /> - <column name="snps" index="5" /> - <column name="value" index="6" /> - </options> - </param> - </when> - <when value="gmapdb"> - <param name="gmapdb" type="data" format="gmapdb" metadata_name="dbkey" label="Select a gmapdb" - help="A GMAP database built with GMAP Build"/> - </when> - </conditional> - <conditional name="dbsnp"> - <param name="snp_source" type="select" label="Add SNP info from" > - <option value="snpTable">UCSC SNP Table</option> - <option value="snpFile">GMAP SNP File</option> - <option value="snpIIT">"GMAP SNPs map from GMAP iit store</option> - </param> - <when value="snpTable"> - <param name="snps" type="data" format="tabular" 
label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" /> - <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" /> - <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help=""> - <option value="1" selected="true">1 (High)</option> - <option value="2">2 (Medium)</option> - <option value="3">3 (All)</option> - </param> - </when> - <when value="snpFile"> - <param name="snps" type="data" format="gmap_snps" label="GMAP SNPs file" - help="Format (3 columns): - <br>>rs62211261 21:14379270 CG - <br>>rs62211262 21:14379281 CG - <br>Each line must start with a > character, then be followed by an - identifier (which may have duplicates). Then there should be the - chromosomal coordinate of the SNP. (Coordinates are all 1-based, so - the first character of a chromosome is number 1.) Finally, there - should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN) - <br>These alleles must correspond to the possible nucleotides on the plus strand of the genome. - If the one of these two letters does not match the allele in the reference - sequence, that SNP will be ignored in subsequent processing as a probable error. - The N stands for any other allele." 
/> - </when> - <when value="snpIIT"> - <param name="snpIIT" type="data" format="snps.iit" label="GMAP SNPs map" help="Created by: GMAP iit store" /> - </when> - </conditional> - <param name="snps_name" type="text" value="snps" label="Name for this SNP index" help="no white space characters"> - </param> - </inputs> - <outputs> - <!-- - <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/> - --> - <data format="gmapsnpindex" name="output" label="${tool.name} on ${on_string} snpindex" /> - </outputs> <configfiles> <configfile name="shscript"> #!/bin/bash @@ -84,7 +17,7 @@ #set $gmapdb = $refGenomeSource.gmapdb.extra_files_path #set $refname = $refGenomeSource.gmapdb.metadata.db_name #else: -#set $gmapdb = $os.path.dirname($refGenomeSource.gmapindex.value) +#set $gmapdb = $os.path.dirname($refGenomeSource.gmapindex.value) $refname = $os.path.basename($refGenomeSource.gmapindex.value) #end if #set $gmapsnpdir = $output.extra_files_path @@ -110,16 +43,79 @@ #end if </configfile> </configfiles> - + <inputs> + <conditional name="refGenomeSource"> + <param name="genomeSource" type="select" label="Will you map to a reference genome from your history or use a built-in index?" 
help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="gmapdb">Use gmapdb from the history</option> + </param> + <when value="indexed"> + <param name="gmapindex" type="select" label="Select a reference genome" help="if your genome of interest is not listed - contact Galaxy team"> + <options from_file="gmap_indices.loc"> + <column name="uid" index="0" /> + <column name="dbkey" index="1" /> + <column name="name" index="2" /> + <column name="kmers" index="3" /> + <column name="maps" index="4" /> + <column name="snps" index="5" /> + <column name="value" index="6" /> + </options> + </param> + </when> + <when value="gmapdb"> + <param name="gmapdb" type="data" format="gmapdb" label="Select a gmapdb" + help="A GMAP database built with GMAP Build"/> + </when> + </conditional> + <conditional name="dbsnp"> + <param name="snp_source" type="select" label="Add SNP info from" > + <option value="snpTable">UCSC SNP Table</option> + <option value="snpFile">GMAP SNP File</option> + <option value="snpIIT">"GMAP SNPs map from GMAP iit store</option> + </param> + <when value="snpTable"> + <param name="snps" type="data" format="tabular" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" /> + <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" /> + <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help=""> + <option value="1" selected="true">1 (High)</option> + <option value="2">2 (Medium)</option> + <option value="3">3 (All)</option> + </param> + </when> + <when value="snpFile"> + <param name="snps" type="data" format="gmap_snps" label="GMAP SNPs file" + help="Format (3 columns): + <br>>rs62211261 21:14379270 CG + <br>>rs62211262 21:14379281 CG + <br>Each line must start with a > 
character, then be followed by an + identifier (which may have duplicates). Then there should be the + chromosomal coordinate of the SNP. (Coordinates are all 1-based, so + the first character of a chromosome is number 1.) Finally, there + should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN) + <br>These alleles must correspond to the possible nucleotides on the plus strand of the genome. + If the one of these two letters does not match the allele in the reference + sequence, that SNP will be ignored in subsequent processing as a probable error. + The N stands for any other allele." /> + </when> + <when value="snpIIT"> + <param name="snpIIT" type="data" format="snps.iit" label="GMAP SNPs map" help="Created by: GMAP iit store" /> + </when> + </conditional> + <param name="snps_name" type="text" value="snps" label="Name for this SNP index" help="no white space characters"> + </param> + </inputs> + <outputs> + <!-- + <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/> + --> + <data format="gmapsnpindex" name="output" label="${tool.name} on ${on_string} snpindex" /> + </outputs> <tests> - </tests> - + </tests> <help> - - **GMAP SNP Index** -GMAP SNP Index (snpindex in the GMAP documentaion) creates an index for known SNPs allowing for SNP tolerant mapping and alignment when using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). +GMAP SNP Index (snpindex in the GMAP documentaion) creates an index for known SNPs allowing for SNP tolerant mapping and alignment when using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). You will want to read the README_ @@ -129,8 +125,8 @@ .. _GSNAP: http://research-pub.gene.com/gmap/ .. _README: http://research-pub.gene.com/gmap/src/README .. 
_Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859 - - </help> + <citations> + <citation type="doi">10.1093/bioinformatics/bti310</citation> + </citations> </tool> -
--- a/tool-data/datatypes_conf.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/tool-data/datatypes_conf.xml Wed Sep 28 10:47:28 2016 -0400 @@ -6,6 +6,7 @@ <registration> <datatype extension="gmapdb" type="galaxy.datatypes.gmap:GmapDB" display_in_upload="False"/> <datatype extension="gmapsnpindex" type="galaxy.datatypes.gmap:GmapSnpIndex" display_in_upload="False"/> + <datatype extension="tally.iit" type="galaxy.datatypes.gmap:TallyIntervalIndexTree" display_in_upload="True"/> <datatype extension="iit" type="galaxy.datatypes.gmap:IntervalIndexTree" display_in_upload="True"/> <datatype extension="splicesites.iit" type="galaxy.datatypes.gmap:SpliceSitesIntervalIndexTree" display_in_upload="True"/> <datatype extension="introns.iit" type="galaxy.datatypes.gmap:IntronsIntervalIndexTree" display_in_upload="True"/>