annotate data_manager/rna_star_index_builder.xml @ 1:67c59c6576db draft

Uploaded
author ieguinoa
date Thu, 26 Aug 2021 20:04:16 +0000
parents e23440b3332a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
1 <tool id="rna_star_index_builder_data_manager_custom" name="rnastar index versioned" tool_type="manage_data" version="@IDX_VERSION@" profile="19.05">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
2 <description>builder</description>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
3
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
4 <macros>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
5 <import>macros.xml</import>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
6 </macros>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
7
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
8 <expand macro="requirements">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
9 <requirement type="package" version="3.7">python</requirement>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
10 </expand>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
11
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
12 <command><![CDATA[
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
13 if [ -z "\$GALAXY_MEMORY_MB" ] ; then
1
67c59c6576db Uploaded
ieguinoa
parents: 0
diff changeset
14 GALAXY_MEMORY_BYTES=310000000000 ;
0
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
15 else
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
16 GALAXY_MEMORY_BYTES=\$((GALAXY_MEMORY_MB * 1000000)) ;
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
17 fi ;
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
18
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
19 #import os
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
20 #set $target_directory = str($out_file.extra_files_path)
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
21 #set $subdir = os.path.basename($target_directory)
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
22
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
23 mkdir '${target_directory}' &&
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
24
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
25 STAR
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
26 --runMode genomeGenerate
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
27 --genomeFastaFiles '${all_fasta_source.fields.path}'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
28 --genomeDir '${target_directory}'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
29 --limitGenomeGenerateRAM \${GALAXY_MEMORY_BYTES}
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
30 #if $GTFconditional.GTFselect == "withGTF":
1
67c59c6576db Uploaded
ieguinoa
parents: 0
diff changeset
31 #if $GTFconditional.GTF_source_conditional.GTF_source_select == "builtin_gff":
67c59c6576db Uploaded
ieguinoa
parents: 0
diff changeset
32 --sjdbGTFfile '${GTFconditional.GTF_source_conditional.all_gff_source.fields.path}'
0
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
33 #else
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
34 --sjdbGTFfile '${GTFconditional.GTF_source_conditional.sjdbGTFfile}'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
35 #end if
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
36 --sjdbOverhang ${GTFconditional.sjdbOverhang}
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
37 #end if
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
38 #if $advanced_options.advanced_options_selector == "advanced":
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
39 --genomeSAindexNbases ${advanced_options.genomeSAindexNbases}
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
40 --genomeChrBinNbits ${advanced_options.genomeChrBinNbits}
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
41 --genomeSAsparseD ${advanced_options.genomeSAsparseD}
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
42 #end if
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
43 --runThreadN \${GALAXY_SLOTS:-2} &&
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
44
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
45 python '${__tool_directory__}/rna_star_index_builder.py'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
46 --config-file '${out_file}'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
47 --value '${all_fasta_source.fields.value}'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
48 --dbkey '${all_fasta_source.fields.dbkey}'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
49 --index-version '@IDX_VERSION@'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
50 #if $name:
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
51 --name '$name'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
52 #else
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
53 --name '${all_fasta_source.fields.name}'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
54 #end if
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
55 #if str($GTFconditional.GTFselect) == "withGTF":
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
56 --with-gene-model
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
57 #end if
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
58 --data-table @IDX_DATA_TABLE@
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
59 --subdir '${subdir}'
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
60 ]]></command>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
61 <inputs>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
62 <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
63 <options from_data_table="all_fasta"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
64 </param>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
65 <param name="name" type="text" value="" label="Informative name for sequence index"
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
66 help="By using different settings, you may have several indices per reference genome. Give an appropriate description to the index to distinguish between indices"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
67 <conditional name="GTFconditional">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
68 <param name="GTFselect" type="select" label="Reference genome with or without an annotation" help="Must the index have been created WITH a GTF file (if not you can specify one afterward).">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
69 <option value="withoutGTF">use genome reference without builtin gene-model</option>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
70 <option value="withGTF">use genome reference with builtin gene-model</option>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
71 </param>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
72 <when value="withGTF">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
73 <conditional name="GTF_source_conditional">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
74 <param name="GTF_source_select" type="select" label="Select source of annotation GTF/GFF" >
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
75 <option value="builtin_gff">Use a builtin GTF/GFF</option>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
76 <option value="external_gff">Use an external GTF/GFF</option>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
77 </param>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
78 <when value="builtin_gff">
1
67c59c6576db Uploaded
ieguinoa
parents: 0
diff changeset
79 <param name="all_gff_source" type="select" label="Select source GFF/GTF File">
0
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
80 <options from_data_table="all_gff"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
81 </param>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
82 </when>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
83 <when value="external_gff">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
84 <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
85 </when>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
86 </conditional>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
87 <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
88 </when>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
89 <when value="withoutGTF" />
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
90 </conditional>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
91 <conditional name="advanced_options">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
92 <param name="advanced_options_selector" type="select" label="Advanced options">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
93 <option value="default" selected="true">Use default options</option>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
94 <option value="advanced">Set advanced options</option>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
95 </param>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
96 <when value="default" />
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
97 <when value="advanced">
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
98 <param argument="--genomeSAindexNbases" type="integer" min="1" value="14"
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
99 label="Length (bases) of the SA pre-indexing string"
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
100 help="Typically between 10 and 15. Longer strings will use much more memory, but allow
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
101 faster searches. For small genomes, the parameter –genomeSAindexNbases must be scaled
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
102 down to min(14, log2(GenomeLength)/2 - 1). For example, for 1 megaBase genome, this is
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
103 equal to 9, for 100 kiloBase genome, this is equal to 7."/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
104 <param argument="--genomeChrBinNbits" type="integer" min="1" value="18"
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
105 label="Log2(chrBin), where chrBin is the size of the bins for genome storage"
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
106 help="Each chromosome will occupy an integer number of bins. For a genome with large number
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
107 of contigs, it is recommended to scale this parameter as min(18,
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
108 log2[max(GenomeLength/NumberOfReferences,ReadLength)]). For example, for 3 gigaBase
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
109 genome with 100,000 chromosomes/scaffolds, this is equal to 15."/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
110 <param argument="--genomeSAsparseD" type="integer" min="1" value="1" label="Suffix array sparsity"
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
111 help="The distance between indices: use bigger numbers to decrease needed RAM at the cost of
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
112 mapping speed reduction"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
113 </when>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
114 </conditional>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
115 </inputs>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
116
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
117 <outputs>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
118 <data name="out_file" format="data_manager_json"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
119 </outputs>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
120
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
121 <tests>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
122 <test>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
123 <param name="all_fasta_source" value="phiX174"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
124 <param name="sequence_name" value="phiX"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
125 <param name="sequence_id" value="minimal-settings"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
126 <param name="GTFselect" value="withoutGTF"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
127
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
128 <output name="out_file" file="test_star_01.data_manager_json" compare="re_match"/>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
129 </test>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
130 </tests>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
131
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
132 <help><![CDATA[
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
133 .. class:: infomark
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
134
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
135 *What it does*
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
136
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
137 This is a Galaxy data manager tool for the gap-aware RNA aligner STAR.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
138
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
139 This version of the tool builds STAR indices of the format first introduced
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
140 with STAR version @IDX_VERSION@.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
141
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
142 Please read the fine manual - that and the google group are the places to learn about the options above.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
143
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
144 *Memory requirements*
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
145
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
146 To run efficiently, RNA-STAR requires enough free memory to
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
147 hold the SA-indexed reference genome in RAM. For Human Genome hg19 this
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
148 index is about 27GB and running RNA-STAR requires approximately 30GB of RAM.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
149 For custom genomes, the rule of thumb is to multiply the size of the
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
150 reference FASTA file by 9 to estimate the required amount of RAM.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
151
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
152 *Note on sjdbOverhang*
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
153
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
154 From https://groups.google.com/forum/#!topic/rna-star/h9oh10UlvhI::
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
155
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
156 James is right, using large enough --sjdbOverhang is safer and should not generally cause any problems with reads of varying length. If your reads are very short, <50b, then I would strongly recommend using optimum --sjdbOverhang=mateLength-1
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
157 By mate length I mean the length of one of the ends of the read, i.e. it's 100 for 2x100b PE or 1x100b SE. For longer reads you can simply use generic --sjdbOverhang 100.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
158 It is a bit confusing because of the way I named this parameter. --sjdbOverhang Noverhang is only used at the genome generation step for constructing the reference sequence out of the annotations.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
159 Basically, the Noverhang exonic bases from the donor site and Noverhang exonic bases from the acceptor site are spliced together for each of the junctions, and these spliced sequences are added to the genome sequence.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
160
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
161 At the mapping stage, the reads are aligned to both genomic and splice sequences simultaneously. If a read maps to one of spliced sequences and crosses the "junction" in the middle of it, the coordinates of two spliced pieces are translated back to genomic space and added to the collection of mapped pieces, which are then all "stitched" together to form the final alignment. Since in the process of "maximal mapped length" search the read is split into pieces of no longer than --seedSearchStartLmax (=50 by default) bases, even if the read (mate) is longer than --sjdbOverhang, it can still be mapped to the spliced reference, as long as --sjdbOverhang > --seedSearchStartLmax.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
162
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
163 Cheers
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
164 Alex
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
165
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
166 *Note on gene model requirements for splice junctions*
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
167
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
168 From https://groups.google.com/forum/#!msg/rna-star/3Y_aaTuzBrE/lUylTB8h5vMJ::
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
169
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
170 When you generate a genome with annotations, you need to specify --sjdbOverhang value, which ideally should be equal to (oneMateLength-1), or you could use a generic value of ~100.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
171
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
172 Your gtf lines look fine to me. STAR needs 3 features from a GTF file:
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
173 1. Chromosome names in col.1 that agree with chromosome names in genome .fasta files. If you have "chr2L" names in the genome .fasta files, and "2L" in the .gtf file, then you need to use --sjdbGTFchrPrefix chr option.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
174 2. 'exon' in col.3 for the exons of all transcripts (this name can be changed with --sjdbGTFfeatureExon)
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
175 3. 'transcript_id' attribute that assigns each exon to a transcript (this name can be changed with --sjdbGTFtagExonParentTranscript)
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
176
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
177 Cheers
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
178 Alex
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
179
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
180 **Notice:** If you leave the name blank, the name of the selected source FASTA entry will be used automatically.
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
181 ]]></help>
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
182 <expand macro="citations" />
e23440b3332a Uploaded
ieguinoa
parents:
diff changeset
183 </tool>