diff gmap_build.xml @ 3:488e9d642566 draft

GMAP wrappers v3.0.1 after linting and cleanup, still untested work-in-progress
author peterjc
date Wed, 28 Sep 2016 10:47:28 -0400
parents f6ba0f12cca2
children 14561eb803a5
line wrap: on
line diff
--- a/gmap_build.xml	Wed Sep 28 10:43:44 2016 -0400
+++ b/gmap_build.xml	Wed Sep 28 10:47:28 2016 -0400
@@ -1,121 +1,10 @@
-<tool id="gmap_build" name="GMAP Build" version="3.0.0">
+<tool id="gmap_build" name="GMAP Build" version="3.0.1">
   <description>a database genome index for GMAP and GSNAP</description>
   <requirements>
       <requirement type="package" version="2013-05-09">gmap</requirement>
   </requirements>
-  <version_string>gmap --version</version_string>
+  <version_command>gmap --version</version_command>
   <command interpreter="command"> /bin/bash $shscript > $output </command>
-  <inputs>
-    <!-- Name for this gmapdb -->
-    <param name="refname" type="text" label="Name you want to give this gmap database" help="">
-      <validator type="empty_field" message="A database name is required."/>
-    </param>
-    <!-- Input data -->
-    <repeat name="inputs" title="Reference Sequence" min="1">
-      <param name="input" type="data" format="fasta" label="reference sequence fasta" />
-    </repeat>
-
-    <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes" 
-           help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome">
-    </param>
-
-    <param name="sort" type="select" label="Sort chromosomes" help="">
-      <option value="none">none - use chromosomes as found in FASTA file(s)</option>
-      <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option>
-      <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option>
-      <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option>
-    </param>  
-
-    <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/>
-    <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/>
-    <conditional name="splicesite">
-      <param name="splice_source" type="select" label="Add splice and intron info from" >
-        <option value="none"></option>
-        <option value="refGeneTable">refGenes table from UCSC table browser</option>
-        <option value="gtf">GTF</option>
-        <option value="gff3">GFF3</option>
-      </param>
-      <when value="none"/>
-      <when value="refGeneTable">
-        <param name="refGenes" type="data" format="tabular" optional="true" label="UCSC refGenes table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz" />
-        <param name="col_skip" type="integer" value="1" label="Columns to skip before the id/name column (default 1)" 
-               help="Note that alignment tracks in UCSC sometimes have an extra column on the left.">
-          <validator type="in_range" message="The number of colmumns to skip must >= 0." min="0."/>
-        </param>
- 
-      </when>
-      <when value="gtf">
-        <param name="gtfGenes" type="data" format="gtf" optional="true" label="Genes as GTF" help="" />
-      </when>
-      <when value="gff3">
-        <param name="gff3Genes" type="data" format="gff3" optional="true" label="Genes in GFF3 format" help="" />
-      </when>
-    </conditional> 
-    <conditional name="dbsnp">
-      <param name="snp_source" type="select" label="Add SNP info from" >
-        <option value="none"></option>
-        <option value="snpTable">UCSC SNP Table</option>
-        <option value="snpFile">GMAP SNP File</option>
-        <option value="vcfFile">VCF File</option>
-      </param>
-      <when value="none"/>
-      <when value="snpTable">
-        <param name="snps" type="data" format="tabular" optional="true" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" />
-        <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" />
-        <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help="">
-          <option value="1" selected="true">1 (High)</option>
-          <option value="2">2 (Medium)</option>
-          <option value="3">3 (All)</option>
-        </param>
-      </when>
-      <when value="snpFile">
-        <param name="snps" type="data" format="gmap_snps" optional="true" label="GMAP SNPs file" 
-           help="Format (3 columns):
-                &lt;br&gt;>rs62211261 21:14379270 CG
-                &lt;br&gt;>rs62211262 21:14379281 CG
-                &lt;br&gt;Each line must start with a &gt; character, then be followed by an
-                identifier (which may have duplicates).  Then there should be the
-                chromosomal coordinate of the SNP.  (Coordinates are all 1-based, so
-                the first character of a chromosome is number 1.)  Finally, there
-                should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN)
-                &lt;br&gt;These alleles must correspond to the possible nucleotides on the plus strand of the genome.  
-                If the one of these two letters does not match the allele in the reference
-                sequence, that SNP will be ignored in subsequent processing as a probable error.
-                The N stands for any other allele." />
-      </when>
-      <when value="vcfFile">
-        <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file" 
-               help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz
-                     The VCF file contains multiple versions of dbSNP, so if you want a
-                     particular version, such as 135.  The vcf_iit program tries to pick 
-                     a subset of SNPs that somewhat parallel
-                     the ones without exceptions in the UCSC dbSNP file.  It keeps all SNPs
-                     that have been validated (marked in the VCF file as &#34;VLD&#34;) or have a
-                     submitter link-out (&#34;SLO&#34;).  Otherwise, it excludes SNPs that are
-                     individual genotypes (&#34;GNO&#34;).  If none of these conditions hold, then
-                     the SNP is allowed.  "/>
-        <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version" 
-           help="The VCF file contains multiple versions of dbSNP, so if you want a particular version, such as 135"/>
-      </when>
-    </conditional> 
-
-    <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM">
-      <option value="12">12 (64MB RAM)</option>
-      <option value="13">13 (256MB RAM)</option>
-      <option value="14">14 (1GB RAM)</option>
-      <option value="15" selected="true">15 (4GB RAM)</option>
-    </param>
-
-  </inputs>
-  <stdio>
-    <exit_code range="1"  level="fatal"   description="Error running gmap_build" />
-  </stdio>
-  <outputs>
-    <!--
-    <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/>
-    -->
-    <data format="gmapdb" name="output" label="${tool.name} on ${on_string} gmapdb ${refname}" />
-  </outputs>
   <configfiles>
     <configfile name="shscript">
 #!/bin/bash
@@ -145,8 +34,8 @@
 #else:
 gmap_build -D $gmapdb -d $refname -s $sort $circular #for i in $inputs# ${i.input}#end for#
 #end if
-get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /' 
-echo "kmers: " $kmer 
+get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /'
+echo "kmers: " $kmer
 #if $splicesite.splice_source == 'refGeneTable':
 #if $splicesite.refGenes.__str__ != 'None':
 cat $splicesite.refGenes | psl_splicesites -s $splicesite.col_skip | iit_store -o  $os.path.join($mapsdir,'splicesites')
@@ -190,15 +79,122 @@
 atoiindex -d $refname
 echo "atoiindex" -d $refname
 #end if
-get-genome -D $gmapdb -d $refname -m '?' | sed 's/^Available maps .*/maps: /' 
+get-genome -D $gmapdb -d $refname -m '?' | sed 's/^Available maps .*/maps: /'
     </configfile>
   </configfiles>
+  <inputs>
+    <!-- Name for this gmapdb -->
+    <param name="refname" type="text" label="Name you want to give this gmap database" help="">
+      <validator type="empty_field" message="A database name is required."/>
+    </param>
+    <!-- Input data -->
+    <repeat name="inputs" title="Reference Sequence" min="1">
+      <param name="input" type="data" format="fasta" label="reference sequence fasta" />
+    </repeat>
 
+    <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes"
+           help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome">
+    </param>
+
+    <param name="sort" type="select" label="Sort chromosomes" help="">
+      <option value="none">none - use chromosomes as found in FASTA file(s)</option>
+      <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option>
+      <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option>
+      <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option>
+    </param>
+
+    <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/>
+    <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/>
+    <conditional name="splicesite">
+      <param name="splice_source" type="select" label="Add splice and intron info from" >
+        <option value="none"></option>
+        <option value="refGeneTable">refGenes table from UCSC table browser</option>
+        <option value="gtf">GTF</option>
+        <option value="gff3">GFF3</option>
+      </param>
+      <when value="none"/>
+      <when value="refGeneTable">
+        <param name="refGenes" type="data" format="tabular" optional="true" label="UCSC refGenes table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz" />
+        <!-- Leading columns for psl_splicesites to skip (passed as -s); negative values are rejected. -->
+        <param name="col_skip" type="integer" value="1" label="Columns to skip before the id/name column (default 1)"
+               help="Note that alignment tracks in UCSC sometimes have an extra column on the left.">
+          <validator type="in_range" message="The number of columns to skip must be >= 0." min="0"/>
+        </param>
+      </when>
+      <when value="gtf">
+        <param name="gtfGenes" type="data" format="gtf" optional="true" label="Genes as GTF" help="" />
+      </when>
+      <when value="gff3">
+        <param name="gff3Genes" type="data" format="gff3" optional="true" label="Genes in GFF3 format" help="" />
+      </when>
+    </conditional>
+    <conditional name="dbsnp">
+      <param name="snp_source" type="select" label="Add SNP info from" >
+        <option value="none"></option>
+        <option value="snpTable">UCSC SNP Table</option>
+        <option value="snpFile">GMAP SNP File</option>
+        <option value="vcfFile">VCF File</option>
+      </param>
+      <when value="none"/>
+      <when value="snpTable">
+        <param name="snps" type="data" format="tabular" optional="true" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" />
+        <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" />
+        <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help="">
+          <option value="1" selected="true">1 (High)</option>
+          <option value="2">2 (Medium)</option>
+          <option value="3">3 (All)</option>
+        </param>
+      </when>
+      <when value="snpFile">
+        <param name="snps" type="data" format="gmap_snps" optional="true" label="GMAP SNPs file"
+           help="Format (3 columns):
+                &lt;br&gt;>rs62211261 21:14379270 CG
+                &lt;br&gt;>rs62211262 21:14379281 CG
+                &lt;br&gt;Each line must start with a &gt; character, then be followed by an
+                identifier (which may have duplicates).  Then there should be the
+                chromosomal coordinate of the SNP.  (Coordinates are all 1-based, so
+                the first character of a chromosome is number 1.)  Finally, there
+                should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN)
+                &lt;br&gt;These alleles must correspond to the possible nucleotides on the plus strand of the genome.
+                If the one of these two letters does not match the allele in the reference
+                sequence, that SNP will be ignored in subsequent processing as a probable error.
+                The N stands for any other allele." />
+      </when>
+      <when value="vcfFile">
+        <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file"
+               help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz
+                     The VCF file contains multiple versions of dbSNP, so if you want a
+                     particular version, such as 135, enter it as the dbSNP version below.  The vcf_iit program tries to pick
+                     a subset of SNPs that somewhat parallel
+                     the ones without exceptions in the UCSC dbSNP file.  It keeps all SNPs
+                     that have been validated (marked in the VCF file as &#34;VLD&#34;) or have a
+                     submitter link-out (&#34;SLO&#34;).  Otherwise, it excludes SNPs that are
+                     individual genotypes (&#34;GNO&#34;).  If none of these conditions hold, then
+                     the SNP is allowed.  "/>
+        <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version"
+           help="The VCF file contains multiple versions of dbSNP; if you want a particular version, such as 135, enter it here"/>
+      </when>
+    </conditional>
+
+    <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM">
+      <option value="12">12 (64MB RAM)</option>
+      <option value="13">13 (256MB RAM)</option>
+      <option value="14">14 (1GB RAM)</option>
+      <option value="15" selected="true">15 (4GB RAM)</option>
+    </param>
+  </inputs>
+  <stdio><!-- Any positive exit status from the wrapper script is fatal, not only status 1. -->
+    <exit_code range="1:" level="fatal" description="Error running gmap_build" />
+  </stdio>
+  <outputs>
+    <!--
+    <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/>
+    -->
+    <data format="gmapdb" name="output" label="${tool.name} on ${on_string} gmapdb ${refname}" />
+  </outputs>
   <tests>
-  </tests> 
-
+  </tests>
   <help>
-
 **GMAP Build**
 
 GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program).  (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.)
@@ -225,7 +221,7 @@
 
 **Detecting known and novel splice sites in GSNAP**
 
-GSNAP can detect splice junctions in individual reads.  
+GSNAP can detect splice junctions in individual reads.
 GSNAP allows for known splicing at two levels: at the level of known
 splice sites and at the level of known introns.  At the site level,
 GSNAP finds splicing between arbitrary combinations of donor and
@@ -237,8 +233,8 @@
 than known introns, unless you are certain that all alternative
 splicing events are known are represented in your file.
 
-Splice site files can be generated from a GTF file 
-or from  refGenes table from UCSC.  
+Splice site files can be generated from a GTF file
+or from a refGenes table from UCSC.
 
 
 **SNP-tolerant alignment in GSNAP**
@@ -285,7 +281,7 @@
 
 GSNAP has the ability to align reads from bisulfite-treated DNA, which
 converts unmethylated cytosines to uracils that appear as thymines in
-reads.  GSNAP is able to identify genomic-T to read-C mismatches, 
+reads.  GSNAP is able to identify genomic-T to read-C mismatches,
 if a cmetindex is generated.
 
 **RNA-editing tolerance in GSNAP**
@@ -335,7 +331,8 @@
 will overwrite only the identical files from the previous runs.  You
 can then choose the k-mer size at run-time by using the -k flag for
 either GMAP or GSNAP.
-
   </help>
+  <citations>
+    <citation type="doi">10.1093/bioinformatics/bti310</citation>
+  </citations>
 </tool>
-