comparison gmap_build.xml @ 3:488e9d642566 draft

GMAP wrappers v3.0.1 after linting and cleanup, still untested work-in-progress
author peterjc
date Wed, 28 Sep 2016 10:47:28 -0400
parents f6ba0f12cca2
children 14561eb803a5
comparison
equal deleted inserted replaced
2:f6ba0f12cca2 3:488e9d642566
1 <tool id="gmap_build" name="GMAP Build" version="3.0.0"> 1 <tool id="gmap_build" name="GMAP Build" version="3.0.1">
2 <description>a database genome index for GMAP and GSNAP</description> 2 <description>a database genome index for GMAP and GSNAP</description>
3 <requirements> 3 <requirements>
4 <requirement type="package" version="2013-05-09">gmap</requirement> 4 <requirement type="package" version="2013-05-09">gmap</requirement>
5 </requirements> 5 </requirements>
6 <version_string>gmap --version</version_string> 6 <version_command>gmap --version</version_command>
7 <command interpreter="command"> /bin/bash $shscript > $output </command> 7 <command interpreter="command"> /bin/bash $shscript > $output </command>
8 <inputs>
9 <!-- Name for this gmapdb -->
10 <param name="refname" type="text" label="Name you want to give this gmap database" help="">
11 <validator type="empty_field" message="A database name is required."/>
12 </param>
13 <!-- Input data -->
14 <repeat name="inputs" title="Reference Sequence" min="1">
15 <param name="input" type="data" format="fasta" label="reference sequence fasta" />
16 </repeat>
17
18 <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes"
19 help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome">
20 </param>
21
22 <param name="sort" type="select" label="Sort chromosomes" help="">
23 <option value="none">none - use chromosomes as found in FASTA file(s)</option>
24 <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option>
25 <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option>
26 <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option>
27 </param>
28
29 <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/>
30 <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/>
31 <conditional name="splicesite">
32 <param name="splice_source" type="select" label="Add splice and intron info from" >
33 <option value="none"></option>
34 <option value="refGeneTable">refGenes table from UCSC table browser</option>
35 <option value="gtf">GTF</option>
36 <option value="gff3">GFF3</option>
37 </param>
38 <when value="none"/>
39 <when value="refGeneTable">
40 <param name="refGenes" type="data" format="tabular" optional="true" label="UCSC refGenes table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz" />
41 <param name="col_skip" type="integer" value="1" label="Columns to skip before the id/name column (default 1)"
42 help="Note that alignment tracks in UCSC sometimes have an extra column on the left.">
43 <validator type="in_range" message="The number of colmumns to skip must >= 0." min="0."/>
44 </param>
45
46 </when>
47 <when value="gtf">
48 <param name="gtfGenes" type="data" format="gtf" optional="true" label="Genes as GTF" help="" />
49 </when>
50 <when value="gff3">
51 <param name="gff3Genes" type="data" format="gff3" optional="true" label="Genes in GFF3 format" help="" />
52 </when>
53 </conditional>
54 <conditional name="dbsnp">
55 <param name="snp_source" type="select" label="Add SNP info from" >
56 <option value="none"></option>
57 <option value="snpTable">UCSC SNP Table</option>
58 <option value="snpFile">GMAP SNP File</option>
59 <option value="vcfFile">VCF File</option>
60 </param>
61 <when value="none"/>
62 <when value="snpTable">
63 <param name="snps" type="data" format="tabular" optional="true" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" />
64 <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" />
65 <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help="">
66 <option value="1" selected="true">1 (High)</option>
67 <option value="2">2 (Medium)</option>
68 <option value="3">3 (All)</option>
69 </param>
70 </when>
71 <when value="snpFile">
72 <param name="snps" type="data" format="gmap_snps" optional="true" label="GMAP SNPs file"
73 help="Format (3 columns):
74 &lt;br&gt;>rs62211261 21:14379270 CG
75 &lt;br&gt;>rs62211262 21:14379281 CG
76 &lt;br&gt;Each line must start with a &gt; character, then be followed by an
77 identifier (which may have duplicates). Then there should be the
78 chromosomal coordinate of the SNP. (Coordinates are all 1-based, so
79 the first character of a chromosome is number 1.) Finally, there
80 should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN)
81 &lt;br&gt;These alleles must correspond to the possible nucleotides on the plus strand of the genome.
82 If the one of these two letters does not match the allele in the reference
83 sequence, that SNP will be ignored in subsequent processing as a probable error.
84 The N stands for any other allele." />
85 </when>
86 <when value="vcfFile">
87 <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file"
88 help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz
89 The VCF file contains multiple versions of dbSNP, so if you want a
90 particular version, such as 135. The vcf_iit program tries to pick
91 a subset of SNPs that somewhat parallel
92 the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs
93 that have been validated (marked in the VCF file as &#34;VLD&#34;) or have a
94 submitter link-out (&#34;SLO&#34;). Otherwise, it excludes SNPs that are
95 individual genotypes (&#34;GNO&#34;). If none of these conditions hold, then
96 the SNP is allowed. "/>
97 <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version"
98 help="The VCF file contains multiple versions of dbSNP, so if you want a particular version, such as 135"/>
99 </when>
100 </conditional>
101
102 <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM">
103 <option value="12">12 (64MB RAM)</option>
104 <option value="13">13 (256MB RAM)</option>
105 <option value="14">14 (1GB RAM)</option>
106 <option value="15" selected="true">15 (4GB RAM)</option>
107 </param>
108
109 </inputs>
110 <stdio>
111 <exit_code range="1" level="fatal" description="Error running gmap_build" />
112 </stdio>
113 <outputs>
114 <!--
115 <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/>
116 -->
117 <data format="gmapdb" name="output" label="${tool.name} on ${on_string} gmapdb ${refname}" />
118 </outputs>
119 <configfiles> 8 <configfiles>
120 <configfile name="shscript"> 9 <configfile name="shscript">
121 #!/bin/bash 10 #!/bin/bash
122 #set $ds = chr(36) 11 #set $ds = chr(36)
123 #set $gt = chr(62) 12 #set $gt = chr(62)
143 gmap_build -D $gmapdb -d $refname -s $sort $circular -k $k #for i in $inputs# ${i.input}#end for# 32 gmap_build -D $gmapdb -d $refname -s $sort $circular -k $k #for i in $inputs# ${i.input}#end for#
144 #end for 33 #end for
145 #else: 34 #else:
146 gmap_build -D $gmapdb -d $refname -s $sort $circular #for i in $inputs# ${i.input}#end for# 35 gmap_build -D $gmapdb -d $refname -s $sort $circular #for i in $inputs# ${i.input}#end for#
147 #end if 36 #end if
148 get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /' 37 get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /'
149 echo "kmers: " $kmer 38 echo "kmers: " $kmer
150 #if $splicesite.splice_source == 'refGeneTable': 39 #if $splicesite.splice_source == 'refGeneTable':
151 #if $splicesite.refGenes.__str__ != 'None': 40 #if $splicesite.refGenes.__str__ != 'None':
152 cat $splicesite.refGenes | psl_splicesites -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'splicesites') 41 cat $splicesite.refGenes | psl_splicesites -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'splicesites')
153 cat $splicesite.refGenes | psl_introns -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'introns') 42 cat $splicesite.refGenes | psl_introns -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'introns')
154 #end if 43 #end if
188 #end if 77 #end if
189 #if $atoiindex.__str__ == 'yes': 78 #if $atoiindex.__str__ == 'yes':
190 atoiindex -d $refname 79 atoiindex -d $refname
191 echo "atoiindex" -d $refname 80 echo "atoiindex" -d $refname
192 #end if 81 #end if
193 get-genome -D $gmapdb -d $refname -m '?' | sed 's/^Available maps .*/maps: /' 82 get-genome -D $gmapdb -d $refname -m '?' | sed 's/^Available maps .*/maps: /'
194 </configfile> 83 </configfile>
195 </configfiles> 84 </configfiles>
196 85 <inputs>
86 <!-- Name for this gmapdb -->
87 <param name="refname" type="text" label="Name you want to give this gmap database" help="">
88 <validator type="empty_field" message="A database name is required."/>
89 </param>
90 <!-- Input data -->
91 <repeat name="inputs" title="Reference Sequence" min="1">
92 <param name="input" type="data" format="fasta" label="reference sequence fasta" />
93 </repeat>
94
95 <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes"
96 help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome">
97 </param>
98
99 <param name="sort" type="select" label="Sort chromosomes" help="">
100 <option value="none">none - use chromosomes as found in FASTA file(s)</option>
101 <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option>
102 <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option>
103 <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option>
104 </param>
105
106 <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/>
107 <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/>
108 <conditional name="splicesite">
109 <param name="splice_source" type="select" label="Add splice and intron info from" >
110 <option value="none"></option>
111 <option value="refGeneTable">refGenes table from UCSC table browser</option>
112 <option value="gtf">GTF</option>
113 <option value="gff3">GFF3</option>
114 </param>
115 <when value="none"/>
116 <when value="refGeneTable">
117 <param name="refGenes" type="data" format="tabular" optional="true" label="UCSC refGenes table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz" />
118 <param name="col_skip" type="integer" value="1" label="Columns to skip before the id/name column (default 1)"
119 help="Note that alignment tracks in UCSC sometimes have an extra column on the left.">
120 <validator type="in_range" message="The number of colmumns to skip must >= 0." min="0."/>
121 </param>
122
123 </when>
124 <when value="gtf">
125 <param name="gtfGenes" type="data" format="gtf" optional="true" label="Genes as GTF" help="" />
126 </when>
127 <when value="gff3">
128 <param name="gff3Genes" type="data" format="gff3" optional="true" label="Genes in GFF3 format" help="" />
129 </when>
130 </conditional>
131 <conditional name="dbsnp">
132 <param name="snp_source" type="select" label="Add SNP info from" >
133 <option value="none"></option>
134 <option value="snpTable">UCSC SNP Table</option>
135 <option value="snpFile">GMAP SNP File</option>
136 <option value="vcfFile">VCF File</option>
137 </param>
138 <when value="none"/>
139 <when value="snpTable">
140 <param name="snps" type="data" format="tabular" optional="true" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" />
141 <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" />
142 <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help="">
143 <option value="1" selected="true">1 (High)</option>
144 <option value="2">2 (Medium)</option>
145 <option value="3">3 (All)</option>
146 </param>
147 </when>
148 <when value="snpFile">
149 <param name="snps" type="data" format="gmap_snps" optional="true" label="GMAP SNPs file"
150 help="Format (3 columns):
151 &lt;br&gt;>rs62211261 21:14379270 CG
152 &lt;br&gt;>rs62211262 21:14379281 CG
153 &lt;br&gt;Each line must start with a &gt; character, then be followed by an
154 identifier (which may have duplicates). Then there should be the
155 chromosomal coordinate of the SNP. (Coordinates are all 1-based, so
156 the first character of a chromosome is number 1.) Finally, there
157 should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN)
158 &lt;br&gt;These alleles must correspond to the possible nucleotides on the plus strand of the genome.
159 If the one of these two letters does not match the allele in the reference
160 sequence, that SNP will be ignored in subsequent processing as a probable error.
161 The N stands for any other allele." />
162 </when>
163 <when value="vcfFile">
164 <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file"
165 help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz
166 The VCF file contains multiple versions of dbSNP, so if you want a
167 particular version, such as 135. The vcf_iit program tries to pick
168 a subset of SNPs that somewhat parallel
169 the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs
170 that have been validated (marked in the VCF file as &#34;VLD&#34;) or have a
171 submitter link-out (&#34;SLO&#34;). Otherwise, it excludes SNPs that are
172 individual genotypes (&#34;GNO&#34;). If none of these conditions hold, then
173 the SNP is allowed. "/>
174 <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version"
175 help="The VCF file contains multiple versions of dbSNP, so if you want a particular version, such as 135"/>
176 </when>
177 </conditional>
178
179 <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM">
180 <option value="12">12 (64MB RAM)</option>
181 <option value="13">13 (256MB RAM)</option>
182 <option value="14">14 (1GB RAM)</option>
183 <option value="15" selected="true">15 (4GB RAM)</option>
184 </param>
185 </inputs>
186 <stdio>
187 <exit_code range="1" level="fatal" description="Error running gmap_build" />
188 </stdio>
189 <outputs>
190 <!--
191 <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/>
192 -->
193 <data format="gmapdb" name="output" label="${tool.name} on ${on_string} gmapdb ${refname}" />
194 </outputs>
197 <tests> 195 <tests>
198 </tests> 196 </tests>
199
200 <help> 197 <help>
201
202 **GMAP Build** 198 **GMAP Build**
203 199
204 GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.) 200 GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.)
205 201
206 You will want to read the README_ 202 You will want to read the README_
223 genome in human beings is circular. 219 genome in human beings is circular.
224 220
225 221
226 **Detecting known and novel splice sites in GSNAP** 222 **Detecting known and novel splice sites in GSNAP**
227 223
228 GSNAP can detect splice junctions in individual reads. 224 GSNAP can detect splice junctions in individual reads.
229 GSNAP allows for known splicing at two levels: at the level of known 225 GSNAP allows for known splicing at two levels: at the level of known
230 splice sites and at the level of known introns. At the site level, 226 splice sites and at the level of known introns. At the site level,
231 GSNAP finds splicing between arbitrary combinations of donor and 227 GSNAP finds splicing between arbitrary combinations of donor and
232 acceptor splice sites, meaning that it can find alternative splicing 228 acceptor splice sites, meaning that it can find alternative splicing
233 events. At the intron level, GSNAP finds splicing only between the 229 events. At the intron level, GSNAP finds splicing only between the
235 alternative splicing events, only introns included in the given list. 231 alternative splicing events, only introns included in the given list.
236 For most purposes, I would recommend using known splice sites, rather 232 For most purposes, I would recommend using known splice sites, rather
237 than known introns, unless you are certain that all alternative 233 than known introns, unless you are certain that all alternative
238 splicing events are known and represented in your file. 234 splicing events are known and represented in your file.
239 235
240 Splice site files can be generated from a GTF file 236 Splice site files can be generated from a GTF file
241 or from a refGenes table from UCSC. 237 or from a refGenes table from UCSC.
242 238
243 239
244 **SNP-tolerant alignment in GSNAP** 240 **SNP-tolerant alignment in GSNAP**
245 241
246 GSNAP has the ability to align to a reference space of all possible 242 GSNAP has the ability to align to a reference space of all possible
283 279
284 **Alignment of reads from bisulfite-treated DNA in GSNAP** 280 **Alignment of reads from bisulfite-treated DNA in GSNAP**
285 281
286 GSNAP has the ability to align reads from bisulfite-treated DNA, which 282 GSNAP has the ability to align reads from bisulfite-treated DNA, which
287 converts unmethylated cytosines to uracils that appear as thymines in 283 converts unmethylated cytosines to uracils that appear as thymines in
288 reads. GSNAP is able to identify genomic-T to read-C mismatches, 284 reads. GSNAP is able to identify genomic-T to read-C mismatches,
289 if a cmetindex is generated. 285 if a cmetindex is generated.
290 286
291 **RNA-editing tolerance in GSNAP** 287 **RNA-editing tolerance in GSNAP**
292 288
293 Just as GSNAP has a program cmetindex and a mode called "cmet" for 289 Just as GSNAP has a program cmetindex and a mode called "cmet" for
333 If you want to build your genomic databases with more than one k-mer 329 If you want to build your genomic databases with more than one k-mer
334 size, you can re-run gmap_build with different values of -k. This 330 size, you can re-run gmap_build with different values of -k. This
335 will overwrite only the identical files from the previous runs. You 331 will overwrite only the identical files from the previous runs. You
336 can then choose the k-mer size at run-time by using the -k flag for 332 can then choose the k-mer size at run-time by using the -k flag for
337 either GMAP or GSNAP. 333 either GMAP or GSNAP.
338
339 </help> 334 </help>
335 <citations>
336 <citation type="doi">10.1093/bioinformatics/bti310</citation>
337 </citations>
340 </tool> 338 </tool>
341