0
|
1 <tool id="rna_star_index_builder_data_manager_custom" name="rnastar index versioned" tool_type="manage_data" version="@IDX_VERSION@" profile="19.05">
|
|
2 <description>builder</description>
|
|
3
|
|
<!-- macros.xml is the only import visible in this file, so the "requirements" and
     "citations" macros and the @IDX_VERSION@ / @IDX_DATA_TABLE@ tokens used by this
     tool are expected to be defined there (confirm against macros.xml). -->
<macros>
    <import>macros.xml</import>
</macros>
|
|
7
|
|
<!-- Expands the shared "requirements" macro and passes one extra requirement
     (Python 3.7, needed for the rna_star_index_builder.py step of the command).
     NOTE(review): how the nested requirement is merged depends on the macro body
     in macros.xml; confirm it yields its content. -->
<expand macro="requirements">
    <requirement type="package" version="3.7">python</requirement>
</expand>
|
|
11
|
|
<command><![CDATA[
    ## Step 1: work out the RAM ceiling (in bytes) handed to STAR.
    ## Uses GALAXY_MEMORY_MB when the job environment provides it, otherwise a
    ## fixed fallback.
    ## NOTE(review): the fallback below is 310 GB, ten times the default that the
    ## STAR manual documents for limitGenomeGenerateRAM (31000000000). Confirm
    ## this is intentional before changing it.
    if [ -z "\$GALAXY_MEMORY_MB" ] ; then
        GALAXY_MEMORY_BYTES=310000000000 ;
    else
        GALAXY_MEMORY_BYTES=\$((GALAXY_MEMORY_MB * 1000000)) ;
    fi ;

    ## Step 2: the index is written into the extra files directory of the
    ## output dataset; its basename is later passed on as the subdir value.
    #import os
    #set $target_directory = str($out_file.extra_files_path)
    #set $subdir = os.path.basename($target_directory)

    ## -p keeps this step from failing when the directory already exists.
    mkdir -p '${target_directory}' &&

    ## Step 3: build the STAR genome index.
    STAR
    --runMode genomeGenerate
    --genomeFastaFiles '${all_fasta_source.fields.path}'
    --genomeDir '${target_directory}'
    --limitGenomeGenerateRAM \${GALAXY_MEMORY_BYTES}
    ## Select values are compared via str() throughout, matching the check
    ## used further down for the gene model flag.
    #if str($GTFconditional.GTFselect) == "withGTF":
        #if str($GTFconditional.GTF_source_conditional.GTF_source_select) == "builtin_gff":
            --sjdbGTFfile '${GTFconditional.GTF_source_conditional.all_gff_source.fields.path}'
        #else
            --sjdbGTFfile '${GTFconditional.GTF_source_conditional.sjdbGTFfile}'
        #end if
        --sjdbOverhang ${GTFconditional.sjdbOverhang}
    #end if
    #if str($advanced_options.advanced_options_selector) == "advanced":
        --genomeSAindexNbases ${advanced_options.genomeSAindexNbases}
        --genomeChrBinNbits ${advanced_options.genomeChrBinNbits}
        --genomeSAsparseD ${advanced_options.genomeSAsparseD}
    #end if
    --runThreadN \${GALAXY_SLOTS:-2} &&

    ## Step 4: hand the index metadata to the helper script, which receives the
    ## output dataset as its config file.
    python '${__tool_directory__}/rna_star_index_builder.py'
    --config-file '${out_file}'
    --value '${all_fasta_source.fields.value}'
    --dbkey '${all_fasta_source.fields.dbkey}'
    --index-version '@IDX_VERSION@'
    ## A user supplied name wins; otherwise reuse the name of the FASTA entry.
    #if $name:
        --name '$name'
    #else
        --name '${all_fasta_source.fields.name}'
    #end if
    #if str($GTFconditional.GTFselect) == "withGTF":
        --with-gene-model
    #end if
    --data-table @IDX_DATA_TABLE@
    --subdir '${subdir}'
]]></command>
|
|
<inputs>
    <!-- Reference sequence to index, chosen from the all_fasta data table. -->
    <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
        <options from_data_table="all_fasta"/>
    </param>
    <!-- Optional display name; when left empty the command falls back to the
         name of the selected FASTA entry. -->
    <param name="name" type="text" value="" label="Informative name for sequence index"
        help="By using different settings, you may have several indices per reference genome. Give an appropriate description to the index to distinguish between indices"/>
    <!-- Whether a gene model is built into the index, and where it comes from. -->
    <conditional name="GTFconditional">
        <param name="GTFselect" type="select" label="Reference genome with or without an annotation" help="Choose whether the index is built with a gene model (GTF/GFF). If it is built without one, an annotation can still be specified afterwards.">
            <option value="withoutGTF">use genome reference without builtin gene-model</option>
            <option value="withGTF">use genome reference with builtin gene-model</option>
        </param>
        <when value="withGTF">
            <conditional name="GTF_source_conditional">
                <param name="GTF_source_select" type="select" label="Select source of annotation GTF/GFF">
                    <option value="builtin_gff">Use a builtin GTF/GFF</option>
                    <option value="external_gff">Use an external GTF/GFF</option>
                </param>
                <when value="builtin_gff">
                    <!-- Annotation taken from the all_gff data table. -->
                    <param name="all_gff_source" type="select" label="Select source GFF/GTF File">
                        <options from_data_table="all_gff"/>
                    </param>
                </when>
                <when value="external_gff">
                    <!-- Annotation supplied as a dataset. -->
                    <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/>
                </when>
            </conditional>
            <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/>
        </when>
        <when value="withoutGTF" />
    </conditional>
    <!-- Index tuning parameters; only passed to STAR when "advanced" is selected. -->
    <conditional name="advanced_options">
        <param name="advanced_options_selector" type="select" label="Advanced options">
            <option value="default" selected="true">Use default options</option>
            <option value="advanced">Set advanced options</option>
        </param>
        <when value="default" />
        <when value="advanced">
            <param argument="--genomeSAindexNbases" type="integer" min="1" value="14"
                label="Length (bases) of the SA pre-indexing string"
                help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1). For example, for 1 megaBase genome, this is equal to 9, for 100 kiloBase genome, this is equal to 7."/>
            <param argument="--genomeChrBinNbits" type="integer" min="1" value="18"
                label="Log2(chrBin), where chrBin is the size of the bins for genome storage"
                help="Each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]). For example, for 3 gigaBase genome with 100,000 chromosomes/scaffolds, this is equal to 15."/>
            <param argument="--genomeSAsparseD" type="integer" min="1" value="1" label="Suffix array sparsity"
                help="The distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction"/>
        </when>
    </conditional>
</inputs>
|
|
116
|
|
<!-- Single output dataset. The command passes it to rna_star_index_builder.py as
     the config file, and uses its extra files directory as the STAR genome
     directory. -->
<outputs>
    <data name="out_file" format="data_manager_json"/>
</outputs>
|
|
120
|
|
<tests>
    <!-- Minimal run: phiX174 FASTA entry, no gene model, default advanced options.
         Only parameters that exist in the inputs section are set. -->
    <test>
        <param name="all_fasta_source" value="phiX174"/>
        <conditional name="GTFconditional">
            <param name="GTFselect" value="withoutGTF"/>
        </conditional>
        <output name="out_file" file="test_star_01.data_manager_json" compare="re_match"/>
    </test>
</tests>
|
|
131
|
|
132 <help><![CDATA[
|
|
133 .. class:: infomark
|
|
134
|
|
135 *What it does*
|
|
136
|
|
137 This is a Galaxy data manager tool for the gap-aware RNA aligner STAR.
|
|
138
|
|
139 This version of the tool builds STAR indices of the format first introduced
|
|
140 with STAR version @IDX_VERSION@.
|
|
141
|
|
142 Please read the fine manual - that and the google group are the places to learn about the options above.
|
|
143
|
|
144 *Memory requirements*
|
|
145
|
|
146 To run efficiently, RNA-STAR requires enough free memory to
|
|
147 hold the SA-indexed reference genome in RAM. For Human Genome hg19 this
|
|
148 index is about 27GB and running RNA-STAR requires approximately 30GB of RAM.
|
|
149 For custom genomes, the rule of thumb is to multiply the size of the
|
|
150 reference FASTA file by 9 to estimate the required amount of RAM.
|
|
151
|
|
152 *Note on sjdbOverhang*
|
|
153
|
|
154 From https://groups.google.com/forum/#!topic/rna-star/h9oh10UlvhI::
|
|
155
|
|
156 James is right, using large enough --sjdbOverhang is safer and should not generally cause any problems with reads of varying length. If your reads are very short, <50b, then I would strongly recommend using optimum --sjdbOverhang=mateLength-1
|
|
157 By mate length I mean the length of one of the ends of the read, i.e. it's 100 for 2x100b PE or 1x100b SE. For longer reads you can simply use generic --sjdbOverhang 100.
|
|
158 It is a bit confusing because of the way I named this parameter. --sjdbOverhang Noverhang is only used at the genome generation step for constructing the reference sequence out of the annotations.
|
|
159 Basically, the Noverhang exonic bases from the donor site and Noverhang exonic bases from the acceptor site are spliced together for each of the junctions, and these spliced sequences are added to the genome sequence.
|
|
160
|
|
161 At the mapping stage, the reads are aligned to both genomic and splice sequences simultaneously. If a read maps to one of spliced sequences and crosses the "junction" in the middle of it, the coordinates of two spliced pieces are translated back to genomic space and added to the collection of mapped pieces, which are then all "stitched" together to form the final alignment. Since in the process of "maximal mapped length" search the read is split into pieces of no longer than --seedSearchStartLmax (=50 by default) bases, even if the read (mate) is longer than --sjdbOverhang, it can still be mapped to the spliced reference, as long as --sjdbOverhang > --seedSearchStartLmax.
|
|
162
|
|
163 Cheers
|
|
164 Alex
|
|
165
|
|
166 *Note on gene model requirements for splice junctions*
|
|
167
|
|
168 From https://groups.google.com/forum/#!msg/rna-star/3Y_aaTuzBrE/lUylTB8h5vMJ::
|
|
169
|
|
170 When you generate a genome with annotations, you need to specify --sjdbOverhang value, which ideally should be equal to (oneMateLength-1), or you could use a generic value of ~100.
|
|
171
|
|
172 Your gtf lines look fine to me. STAR needs 3 features from a GTF file:
|
|
173 1. Chromosome names in col.1 that agree with chromosome names in genome .fasta files. If you have "chr2L" names in the genome .fasta files, and "2L" in the .gtf file, then you need to use --sjdbGTFchrPrefix chr option.
|
|
174 2. 'exon' in col.3 for the exons of all transcripts (this name can be changed with --sjdbGTFfeatureExon)
|
|
175 3. 'transcript_id' attribute that assigns each exon to a transcript (this name can be changed with --sjdbGTFtagExonParentTranscript)
|
|
176
|
|
177 Cheers
|
|
178 Alex
|
|
179
|
|
180 **Notice:** If you leave the name blank, the name of the selected source FASTA entry is used instead.
|
|
181 ]]></help>
|
|
182 <expand macro="citations" />
|
|
183 </tool>
|