comparison gmap_build.xml @ 2:f6ba0f12cca2 draft

Untested work-in-progress GMAP wrappers v3.0.0, from JJ back in June 2013
author peterjc
date Wed, 28 Sep 2016 10:43:44 -0400
parents 74391fc6e3f2
children 488e9d642566
comparison
equal deleted inserted replaced
1:74391fc6e3f2 2:f6ba0f12cca2
1 <tool id="gmap_build" name="GMAP Build" version="2.0.0"> 1 <tool id="gmap_build" name="GMAP Build" version="3.0.0">
2 <description>a database genome index for GMAP and GSNAP</description> 2 <description>a database genome index for GMAP and GSNAP</description>
3 <requirements> 3 <requirements>
4 <requirement type="package" version="2011-11-30">gmap</requirement> 4 <requirement type="package" version="2013-05-09">gmap</requirement>
5 </requirements> 5 </requirements>
6 <version_string>gmap --version</version_string> 6 <version_string>gmap --version</version_string>
7 <command interpreter="command"> /bin/bash $shscript 2>1 1> $output </command> 7 <command interpreter="command"> /bin/bash $shscript > $output </command>
8 <inputs> 8 <inputs>
9 <!-- Name for this gmapdb --> 9 <!-- Name for this gmapdb -->
10 <param name="refname" type="text" label="Name you want to give this gmap database" help=""> 10 <param name="refname" type="text" label="Name you want to give this gmap database" help="">
11 <validator type="empty_field" message="A database name is required."/> 11 <validator type="empty_field" message="A database name is required."/>
12 </param> 12 </param>
13 <!-- Input data --> 13 <!-- Input data -->
14 <repeat name="inputs" title="Reference Sequence" min="1"> 14 <repeat name="inputs" title="Reference Sequence" min="1">
15 <param name="input" type="data" format="fasta" label="reference sequence fasta" /> 15 <param name="input" type="data" format="fasta" label="reference sequence fasta" />
16 </repeat> 16 </repeat>
17 17
18 <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help=""> 18 <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes"
19 <option value="12">12</option> 19 help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome">
20 <option value="13">13</option> 20 </param>
21 <option value="14">14</option> 21
22 <option value="15" selected="true">15</option> 22 <param name="sort" type="select" label="Sort chromosomes" help="">
23 <option value="none">none - use chromosomes as found in FASTA file(s)</option>
24 <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option>
25 <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option>
26 <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option>
23 </param> 27 </param>
28
24 <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/> 29 <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/>
25 <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/> 30 <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/>
26 <conditional name="splicesite"> 31 <conditional name="splicesite">
27 <param name="splice_source" type="select" label="Add splice and intron info from" > 32 <param name="splice_source" type="select" label="Add splice and intron info from" >
28 <option value="none"></option> 33 <option value="none"></option>
49 <conditional name="dbsnp"> 54 <conditional name="dbsnp">
50 <param name="snp_source" type="select" label="Add SNP info from" > 55 <param name="snp_source" type="select" label="Add SNP info from" >
51 <option value="none"></option> 56 <option value="none"></option>
52 <option value="snpTable">UCSC SNP Table</option> 57 <option value="snpTable">UCSC SNP Table</option>
53 <option value="snpFile">GMAP SNP File</option> 58 <option value="snpFile">GMAP SNP File</option>
59 <option value="vcfFile">VCF File</option>
54 </param> 60 </param>
55 <when value="none"/> 61 <when value="none"/>
56 <when value="snpTable"> 62 <when value="snpTable">
57 <param name="snps" type="data" format="tabular" optional="true" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" /> 63 <param name="snps" type="data" format="tabular" optional="true" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" />
58 <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" /> 64 <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" />
75 &lt;br&gt;These alleles must correspond to the possible nucleotides on the plus strand of the genome. 81 &lt;br&gt;These alleles must correspond to the possible nucleotides on the plus strand of the genome.
76 If one of these two letters does not match the allele in the reference 82 If one of these two letters does not match the allele in the reference
77 sequence, that SNP will be ignored in subsequent processing as a probable error. 83 sequence, that SNP will be ignored in subsequent processing as a probable error.
78 The N stands for any other allele." /> 84 The N stands for any other allele." />
79 </when> 85 </when>
86 <when value="vcfFile">
87 <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file"
88 help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz
89 The VCF file contains multiple versions of dbSNP, so if you want a
90 particular version, such as 135, enter it in the dbSNP version field. The vcf_iit program tries to pick
91 a subset of SNPs that somewhat parallel
92 the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs
93 that have been validated (marked in the VCF file as &#34;VLD&#34;) or have a
94 submitter link-out (&#34;SLO&#34;). Otherwise, it excludes SNPs that are
95 individual genotypes (&#34;GNO&#34;). If none of these conditions hold, then
96 the SNP is allowed. "/>
97 <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version"
98 help="The VCF file contains multiple versions of dbSNP; to use a particular version, such as 135, enter its number here"/>
99 </when>
80 </conditional> 100 </conditional>
101
102 <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM">
103 <option value="12">12 (64MB RAM)</option>
104 <option value="13">13 (256MB RAM)</option>
105 <option value="14">14 (1GB RAM)</option>
106 <option value="15" selected="true">15 (4GB RAM)</option>
107 </param>
108
81 </inputs> 109 </inputs>
110 <stdio>
111 <exit_code range="1" level="fatal" description="Error running gmap_build" />
112 </stdio>
82 <outputs> 113 <outputs>
83 <!-- 114 <!--
84 <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/> 115 <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/>
85 --> 116 -->
86 <data format="gmapdb" name="output" label="${tool.name} on ${on_string} gmapdb ${refname}" /> 117 <data format="gmapdb" name="output" label="${tool.name} on ${on_string} gmapdb ${refname}" />
95 ## #set $ref_files = '' 126 ## #set $ref_files = ''
96 ## #for $i in $inputs: 127 ## #for $i in $inputs:
97 ## #set $ref_files = $ref_files $i.input 128 ## #set $ref_files = $ref_files $i.input
98 ## #end for 129 ## #end for
99 ## echo $ref_files 130 ## echo $ref_files
131 #set circular = ""
132 #if $circular_chroms.__str__.strip() != '':
133 #set circular = ('').join([' -c ','"', $circular_chroms.__str__,'"'])
134 #end if
100 #import os.path 135 #import os.path
101 #set $gmapdb = $output.extra_files_path 136 #set $gmapdb = $output.extra_files_path
102 #set $mapsdir = $os.path.join($os.path.join($gmapdb,str($refname)), str($refname) + '.maps') 137 #set $mapsdir = $os.path.join($os.path.join($gmapdb,str($refname)), str($refname) + '.maps')
103 mkdir -p $gmapdb 138 mkdir -p $gmapdb
104 ## export GMAPDB required for cmetindex and atoiindex 139 ## export GMAPDB required for cmetindex and atoiindex
105 export GMAPDB=$gmapdb 140 export GMAPDB=$gmapdb
141 #if $kmer:
106 #for $k in $kmer.__str__.split(','): 142 #for $k in $kmer.__str__.split(','):
107 gmap_build -D $gmapdb -d $refname -s numeric-alpha -k $k #for i in $inputs# ${i.input}#end for# 143 gmap_build -D $gmapdb -d $refname -s $sort $circular -k $k #for i in $inputs# ${i.input}#end for#
108 #end for 144 #end for
145 #else:
146 gmap_build -D $gmapdb -d $refname -s $sort $circular #for i in $inputs# ${i.input}#end for#
147 #end if
109 get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /' 148 get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /'
110 echo "kmers: " $kmer 149 echo "kmers: " $kmer
111 #if $splicesite.splice_source == 'refGeneTable': 150 #if $splicesite.splice_source == 'refGeneTable':
112 #if $splicesite.refGenes.__str__ != 'None': 151 #if $splicesite.refGenes.__str__ != 'None':
113 cat $splicesite.refGenes | psl_splicesites -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'splicesites') 152 cat $splicesite.refGenes | psl_splicesites -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'splicesites')
123 cat $splicesite.gff3Genes | gff3_splicesites | iit_store -o $os.path.join($mapsdir,'splicesites') 162 cat $splicesite.gff3Genes | gff3_splicesites | iit_store -o $os.path.join($mapsdir,'splicesites')
124 cat $splicesite.gff3Genes | gff3_introns | iit_store -o $os.path.join($mapsdir,'introns') 163 cat $splicesite.gff3Genes | gff3_introns | iit_store -o $os.path.join($mapsdir,'introns')
125 #end if 164 #end if
126 #end if 165 #end if
127 #if $dbsnp.snp_source != 'none' and $dbsnp.snps.__str__ != 'None': 166 #if $dbsnp.snp_source != 'none' and $dbsnp.snps.__str__ != 'None':
128 #if $dbsnp.snp_source == 'snpTable': 167 #if $dbsnp.snp_source == 'snpTable':
129 #if $dbsnp.snpsex.__str__ != 'None': 168 #if $dbsnp.snpsex.__str__ != 'None':
130 cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o $os.path.join($mapsdir,'snps') 169 cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o $os.path.join($mapsdir,'snps')
131 #else: 170 #else:
132 cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o $os.path.join($mapsdir,'snps') 171 cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o $os.path.join($mapsdir,'snps')
133 #end if 172 #end if
134 #else: 173 #elif $dbsnp.snp_source == 'vcfFile':
135 cat $dbsnp.snps | iit_store -o $os.path.join($mapsdir,'snps') 174 #if $dbsnp.vcf_version and len($dbsnp.vcf_version.__str__.strip()) > 0:
136 #end if 175 cat $dbsnp.snps | vcf_iit -v $dbsnp.vcf_version.__str__.strip() | iit_store -o $os.path.join($mapsdir,'snps')
137 snpindex -d $refname -v snps 176 #else:
138 echo "snpindex" -d $refname -v snps 177 cat $dbsnp.snps | vcf_iit | iit_store -o $os.path.join($mapsdir,'snps')
178 #end if
179 #else:
180 cat $dbsnp.snps | iit_store -o $os.path.join($mapsdir,'snps')
181 #end if
182 snpindex -d $refname -v snps
183 echo "snpindex" -d $refname -v snps
139 #end if 184 #end if
140 #if $cmetindex.__str__ == 'yes': 185 #if $cmetindex.__str__ == 'yes':
141 cmetindex -d $refname 186 cmetindex -d $refname
142 echo "cmetindex" -d $refname 187 echo "cmetindex" -d $refname
143 #end if 188 #end if
152 <tests> 197 <tests>
153 </tests> 198 </tests>
154 199
155 <help> 200 <help>
156 201
157
158 **GMAP Build** 202 **GMAP Build**
159 203
160 GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMSP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.) 204 GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.)
161 205
162 You will want to read the README_ 206 You will want to read the README_
163 207
164 Publication_ citation: Thomas D. Wu, Colin K. Watanabe Bioinformatics 2005 21(9):1859-1875; doi:10.1093/bioinformatics/bti310 208 Publication_ citation: Thomas D. Wu, Colin K. Watanabe Bioinformatics 2005 21(9):1859-1875; doi:10.1093/bioinformatics/bti310
165 209
167 .. _GSNAP: http://research-pub.gene.com/gmap/ 211 .. _GSNAP: http://research-pub.gene.com/gmap/
168 .. _README: http://research-pub.gene.com/gmap/src/README 212 .. _README: http://research-pub.gene.com/gmap/src/README
169 .. _Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859 213 .. _Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859
170 214
171 215
216 **circular chromosomes**
217
218 You can provide information to gmap_build that certain
219 chromosomes are circular, with the -c or -\-circular flag. The value
220 for these flags is a list of chromosomes, separated by commas. The
221 gmap_build program will then allow GSNAP and GMAP to align reads
222 across the ends of the chromosome. For example, the mitochondrial
223 genome in human beings is circular.
224
225
226 **Detecting known and novel splice sites in GSNAP**
227
228 GSNAP can detect splice junctions in individual reads.
229 GSNAP allows for known splicing at two levels: at the level of known
230 splice sites and at the level of known introns. At the site level,
231 GSNAP finds splicing between arbitrary combinations of donor and
232 acceptor splice sites, meaning that it can find alternative splicing
233 events. At the intron level, GSNAP finds splicing only between the
234 set of given donor-acceptor pairs, so it is constrained not to find
235 alternative splicing events, only introns included in the given list.
236 For most purposes, I would recommend using known splice sites, rather
237 than known introns, unless you are certain that all alternative
238 splicing events are known and represented in your file.
239
240 Splice site files can be generated from a GTF file
241 or from refGenes table from UCSC.
242
243
244 **SNP-tolerant alignment in GSNAP**
245
246 GSNAP has the ability to align to a reference space of all possible
247 major and minor alleles in a set of known SNPs provided by the user.
248
249
250 Process known SNP data, either from older dbSNP files or from newer
251 files in VCF format. The older dbSNP files can be obtained from UCSC,
252 either from the Galaxy UCSC table browser or downloaded:
253
254 ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz
255
256 For versions before snp132, you may also want to exclude exceptions,
257 which will require this file:
258
259 ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz
260
261 The option "-w weight" makes use of the dbSNP item weight, a value
262 from 1 to 3, where lower weight means higher confidence. Items will
263 be included if the item weight is the given value weight or less.
264 The default value of -w is 1, which is the criterion UCSC uses to
265 build its ambiguous version of the genome. To allow all item weights,
266 specify "-w 3".
267
268 The more recent SNP data are provided in VCF format, and can be
269 retrieved like this:
270
271 ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz
272
273 The VCF file contains multiple versions of dbSNP, so if you want a
274 particular version, such as 135, you would use the flag "-v 135". The
275 vcf_iit program tries to pick a subset of SNPs that somewhat parallel
276 the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs
277 that have been validated (marked in the VCF file as "VLD") or have a
278 submitter link-out ("SLO"). Otherwise, it excludes SNPs that are
279 individual genotypes ("GNO"). If none of these conditions hold, then
280 the SNP is allowed. These rules might not be the best ones; I made
281 them up by trying to compare version 135 of the VCF data with
282 version 135 of the UCSC dbSNP data.
283
284 **Alignment of reads from bisulfite-treated DNA in GSNAP**
285
286 GSNAP has the ability to align reads from bisulfite-treated DNA, which
287 converts unmethylated cytosines to uracils that appear as thymines in
288 reads. GSNAP is able to identify genomic-T to read-C mismatches,
289 if a cmetindex is generated.
290
291 **RNA-editing tolerance in GSNAP**
292
293 Just as GSNAP has a program cmetindex and a mode called "cmet" for
294 tolerance to C-to-T changes, it can be tolerant to A-to-G changes
295 using the program atoiindex and a mode called "atoi". This mode is
296 designed to facilitate alignments that are tolerant to RNA editing
297 where A's are converted to I's, which appear as G's to a sequencer.
298
299 To process reads under RNA-editing tolerance, you will first need to
300 create the atoi index.
301
302
303
304 **K-mer size**
305
306 You can control the k-mer size
307 for the genomic index with the -k flag, which can range from 12 to 15.
308 The default value for -k is 15, but this requires your machine to have
309 4 GB of RAM to build the indices. If you do not have 4 GB of RAM,
310 then you will need to reduce the value of -k or find another machine.
311 Here are the RAM requirements for building various indices::
312
313 k-mer of 12: 64 MB
314 k-mer of 13: 256 MB
315 k-mer of 14: 1 GB
316 k-mer of 15: 4 GB
317
318 These are the RAM requirements for building indices, but not to run
319 the GMAP/GSNAP programs once the indices are built, because the
320 genomic indices are compressed. For example, the genomic index for a
321 k-mer of 15 gives a gammaptrs file of 64 MB and an offsetscomp file of
322 about 350 MB, much smaller than the 4 GB that would otherwise be
323 required. Therefore, you may want to build your genomic index on a
324 computer with sufficient RAM, and distribute that index to be used by
325 computers with less RAM.
326
327 The amount of compression can be controlled using the -b or -\-basesize
328 parameter to gmap_build. By default, the value for k-mer size is 15,
329 and the value for basesize is 12. If you select a different value for
330 k-mer size, then basesize is made by default to be equal to that k-mer
331 size.
332
333 If you want to build your genomic databases with more than one k-mer
334 size, you can re-run gmap_build with different values of -k. This
335 will overwrite only the identical files from the previous runs. You
336 can then choose the k-mer size at run-time by using the -k flag for
337 either GMAP or GSNAP.
338
172 </help> 339 </help>
173 </tool> 340 </tool>
174 341