comparison ncPRO-QC.xml @ 1:0c34e0bef7d3 draft

Uploaded
author jbrayet
date Thu, 29 Oct 2015 10:24:53 -0400
parents
children
comparison
equal deleted inserted replaced
0:becdc75514a2 1:0c34e0bef7d3
1 <tool id="ncPRO-QC" name="Alignment and QC">
2 <description>of sRNA-seq data</description>
<!-- Tool dependency declaration: one package requirement, docker_nc_pro_seq at version 0.1. -->
<requirements>
  <requirement type="package" version="0.1">docker_nc_pro_seq</requirement>
</requirements>
<!--
  Command line handed to ncPRO-QC.sh (Cheetah template).

  Options, in the order they are emitted:
    -i  one per sample: input dataset
    -s  one per sample: sample name
    -t  input type ("fastq" or "bam")
    -n  project name
    -g  reference genome
    -f  Rfam / RepeatMasker overview switch
    -l  log output
    -r  report format
    -h  html report output
    -p  pdf report output
    -a  alignment switch (fastq input only)
    -o  one per sample: BAM output (fastq input only)

  The "samples" repeat lives under the "sampleNumber" conditional only when the
  input type is fastq; for bam input it is a direct child of "input_conditional".
  The template therefore picks the repeat according to the input type before
  looping, instead of always dereferencing "sampleNumber", which is not
  declared in the bam branch.
-->
<command interpreter="bash">ncPRO-QC.sh
## Locate the per-sample repeat for the selected input type.
#if $input_conditional.input_type == "fastq"
#set $samples = $input_conditional.sampleNumber.samples
#else
#set $samples = $input_conditional.samples
#end if
## All input files first, then all sample names, in the same sample order.
#for $sample in $samples
-i ${sample.input}
#end for
#for $sample in $samples
-s ${sample.sampleName}
#end for
-t $input_conditional.input_type
-n $projectName
-g $genome
-f $Rfam
-l $outlog
-r $report
-h $outhtml
-p $outpdf
#if $input_conditional.input_type == "fastq"
-a $input_conditional.mapping
## Each sample count has its own set of BAM outputs (see the outputs section).
#if $input_conditional.sampleNumber.numberOfSample == "1"
-o $outbam_0
#end if
#if $input_conditional.sampleNumber.numberOfSample == "2"
-o $outbam_1 -o $outbam_2
#end if
#if $input_conditional.sampleNumber.numberOfSample == "3"
-o $outbam_3 -o $outbam_4 -o $outbam_5
#end if
#if $input_conditional.sampleNumber.numberOfSample == "4"
-o $outbam_6 -o $outbam_7 -o $outbam_8 -o $outbam_9
#end if
#end if
</command>
<!--
  Tool form.

  Layout of the sample parameters:
    input_conditional (input_type = fastq | bam)
      fastq: sampleNumber (numberOfSample = 1..4) holds a "samples" repeat fixed
             to exactly that many entries, followed by the "mapping" switch.
      bam:   a "samples" repeat of 1 to 4 entries, directly under input_conditional.

  Every free-text field uses the same sanitizer: the valid set is letters,
  digits and "_", and invalid_char is the empty string.
-->
<inputs>
  <param name="projectName" type="text" value="Project_1" size="20" label="Give a project name">
    <sanitizer invalid_char="">
      <valid initial="string.letters,string.digits">
        <add value="_"/>
      </valid>
    </sanitizer>
  </param>

  <conditional name="input_conditional">
    <param name="input_type" type="select" label="Select your input file format" help="Raw datafile (fastq) or aligned file (BAM) are allowed. Different treatment will be performed according to the data type.">
      <option value="fastq" selected="true">fastq</option>
      <option value="bam">bam</option>
    </param>

    <!-- Raw reads: the user first picks how many samples to provide. -->
    <when value="fastq">
      <conditional name="sampleNumber">
        <param name="numberOfSample" type="select" label="Number of sample(s)">
          <option value="1" selected="true">1</option>
          <option value="2">2</option>
          <option value="3">3</option>
          <option value="4">4</option>
        </param>

        <!-- One sample. -->
        <when value="1">
          <repeat name="samples" title="Sample Name" min="1" max="1" default="1">
            <param name="sampleName" type="text" value="input" size="30" label="Name">
              <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits">
                  <add value="_"/>
                </valid>
              </sanitizer>
            </param>
            <param name="input" type="data" format="fastq" label="Raw Input file"/>
          </repeat>
        </when>

        <!-- Two samples. -->
        <when value="2">
          <repeat name="samples" title="Sample Name" min="2" max="2" default="2">
            <param name="sampleName" type="text" value="input" size="30" label="Name">
              <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits">
                  <add value="_"/>
                </valid>
              </sanitizer>
            </param>
            <param name="input" type="data" format="fastq" label="Raw Input file"/>
          </repeat>
        </when>

        <!-- Three samples. -->
        <when value="3">
          <repeat name="samples" title="Sample Name" min="3" max="3" default="3">
            <param name="sampleName" type="text" value="input" size="30" label="Name">
              <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits">
                  <add value="_"/>
                </valid>
              </sanitizer>
            </param>
            <param name="input" type="data" format="fastq" label="Raw Input file"/>
          </repeat>
        </when>

        <!-- Four samples. -->
        <when value="4">
          <repeat name="samples" title="Sample Name" min="4" max="4" default="4">
            <param name="sampleName" type="text" value="input" size="30" label="Name">
              <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits">
                  <add value="_"/>
                </valid>
              </sanitizer>
            </param>
            <param name="input" type="data" format="fastq" label="Raw Input file"/>
          </repeat>
        </when>
      </conditional>

      <!-- Alignment switch; only offered for raw (fastq) input. -->
      <param name="mapping" type="boolean" value="False" truevalue="True" falsevalue="False" label="Run Alignment" help="ncPRO-seq proposes to align the reads on a reference genome using the Bowtie aligner"/>
    </when>

    <!-- Already aligned reads: between one and four BAM samples, no sample-count selector. -->
    <when value="bam">
      <repeat name="samples" title="Sample Name" min="1" max="4" default="1">
        <param name="sampleName" type="text" value="input" size="30" label="Name">
          <sanitizer invalid_char="">
            <valid initial="string.letters,string.digits">
              <add value="_"/>
            </valid>
          </sanitizer>
        </param>
        <param name="input" type="data" format="bam" label="Input file"/>
      </repeat>
    </when>
  </conditional>

  <param name="genome" type="select" label="Select a reference genome">
    <option value="mm9">mm9</option>
    <option value="hg19">hg19</option>
  </param>

  <param name="Rfam" type="boolean" value="False" truevalue="True" falsevalue="False" label="Generate the annotation overview using the RFAM and RepeatMasker database (only for aligned data)"/>
  <!--<param name="Rmsk" type="boolean" value="False" truevalue="True" falsevalue="False" label="Rmsk overview" />-->

  <param name="report" type="select" label="Select your report format">
    <option value="all" selected="True">html and pdf</option>
    <option value="html">html</option>
    <option value="pdf">pdf</option>
  </param>
</inputs>
<!--
  Tool outputs.

  BAM outputs are declared once per (sample count, sample index) pair; each one
  is kept only when the input type is fastq, alignment is enabled and the
  selected sample count matches:
    1 sample : outbam_0
    2 samples: outbam_1, outbam_2
    3 samples: outbam_3, outbam_4, outbam_5
    4 samples: outbam_6, outbam_7, outbam_8, outbam_9
  The html and pdf reports follow the "report" selection; the log has no filter.
-->
<outputs>
  <!-- One sample. -->
  <data name="outbam_0" format="bam" label="ncPRO mapped file">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '1'))</filter>
  </data>

  <!-- Two samples. -->
  <data name="outbam_1" format="bam" label="ncPRO mapped file 1">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '2'))</filter>
  </data>
  <data name="outbam_2" format="bam" label="ncPRO mapped file 2">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '2'))</filter>
  </data>

  <!-- Three samples. -->
  <data name="outbam_3" format="bam" label="ncPRO mapped file 1">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '3'))</filter>
  </data>
  <data name="outbam_4" format="bam" label="ncPRO mapped file 2">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '3'))</filter>
  </data>
  <data name="outbam_5" format="bam" label="ncPRO mapped file 3">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '3'))</filter>
  </data>

  <!-- Four samples. -->
  <data name="outbam_6" format="bam" label="ncPRO mapped file 1">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '4'))</filter>
  </data>
  <data name="outbam_7" format="bam" label="ncPRO mapped file 2">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '4'))</filter>
  </data>
  <data name="outbam_8" format="bam" label="ncPRO mapped file 3">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '4'))</filter>
  </data>
  <data name="outbam_9" format="bam" label="ncPRO mapped file 4">
    <filter>((input_conditional['input_type'] == 'fastq') and (input_conditional['mapping'] == True) and (input_conditional['sampleNumber']['numberOfSample'] == '4'))</filter>
  </data>

  <!-- Reports: kept according to the "report" selection ("all" keeps both). -->
  <data name="outhtml" format="html" label="ncPRO html report">
    <filter>((report == 'all') or (report == 'html'))</filter>
  </data>
  <data name="outpdf" format="pdf" label="ncPRO pdf report">
    <filter>((report == 'all') or (report == 'pdf'))</filter>
  </data>

  <!-- Log: no filter, so it is always declared. -->
  <data name="outlog" format="txt" label="ncPRO log">
  </data>
</outputs>
162 <help>
163
164 **What does ncPRO-seq do?**
165
166 ------
167
168 ncPRO-seq is a tool for annotation and profiling of ncRNAs from smallRNA sequencing data. It aims to interrogate and perform detailed analysis on small RNAs derived from annotated non-coding regions in miRBase, Rfam and repeatMasker, and regions defined by users. A command line version and an online version are available at http://ncpro.curie.fr.
169 If you use the ncPRO-seq tool for your analysis, please cite the following paper :
170 Chen C., Servant N., Toedling J., Sarazin A., Marchais A., Duvernois-Berthet E., Cognat V., Colot V., Voinnet O., Heard E., Ciaudo C. and Barillot E. (2012) ncPRO-seq: a tool for annotation and profiling analysis of ncRNAs from small RNA-seq. Bioinformatics. 28(23):3147-9.
171
172 ------
173
174 **Input Formats**
175
176 Raw data files (fastq) or aligned files (BAM) are allowed. In all cases, ncPRO-seq will perform a quality control of your data.
177
178 ------
179
180 **Quality Control of raw data**
181
182 -Base Composition Information
183
184 Display the proportion of each base position for which each of the four normal DNA bases has been called (or GC content). If you see strong biases which change in different bases then this usually indicates an overrepresented sequence which is contaminating your library. A bias which is consistent across all bases either indicates that the original library was sequence biased, or that there was a systematic problem during the sequencing of the library.
185
186 -Quality Score
187
188 This view presents the quality values across all bases at each position in the FastQ file.
189 The y-axis on the graph shows the mean quality scores. The higher the score the better the base call. The quality of calls on most platforms will degrade as the run progresses, so it is common to see base calls falling into the orange area towards the end of a read.
190 We usually consider as good quality, the data with a mean quality higher than 20.
191
192 -Reads Length Distribution
193
194 The insert size distribution is the most important quality control in sRNA-seq data. ncPRO-seq provides two types of information, i.e. the abundant versus the distinct reads length distribution. The abundant distribution considers all reads as they are described in the fastq file. The distinct distribution merges all duplicated sequences into one. This view usually decreases the importance of miRNAs to highlight other population-based ncRNAs.
195
196 ------
197
198 **Reads Alignment**
199
200 In case of raw data, ncPRO-seq proposes to align them on a reference genome using the Bowtie aligner. A default alignment is performed to return the best read alignment with a few mismatches allowed (--best --strata -e 50 --nomaqround). Up to 20 locations for a given read are allowed (-a -m 20) in order to deal with ncRNAs repeated on the genome.
201
202 ------
203
204 **Quality Control of aligned data**
205
206 -Mapping statistics
207
208 The proportions of reads with unique or multiple mapping sites in the genome, and of unmapped reads, are plotted. For sRNA-seq data, we usually expect to have a large proportion of unique hits.
209
210 -Annotation overview
211
212 The reads annotation family is the most general overview, and counts the reads based on the following annotations: coding genes, ncRNAs from Rfam, smallRNAs from repeated regions, rRNAs, piRNAs from piRBase and precursor miRNAs from miRBase.
213
214 -miRNA reads proportion (miRBase)
215
216 A dedicated plot is available for pre-miRNAs. In this step, abundant reads mapped in mature miRNA regions are counted, and plotted as the proportion of all mapped reads in the genome. The annotation file of mature miRNA is generated using files from miRBase. Each miRNA count is calculated using the intersection of the reads alignment with the precursor position.
217 In a classical sRNA-seq experiment, we usually expect to have a high level of miRNAs (around 70%). This information can be used as a quality control for mammals. If a small proportion of miRNAs is observed, it means that another population of ncRNA predominates. This can be real biological information, or a contamination (tRNA, rRNA, etc.)
218
219 ------
220
221 **RFAM and RepeatMasker annotation overview**
222
223 After alignment, ncPRO-seq can give a first overview of your data annotation, by overlapping the aligned reads with the known annotations from the RFAM or RepeatMasker database.
224
225 -ncRNA annotation (RFAM)
226
227 To compare the read expression in different repeat/Rfam families, we count the number of abundant reads in each family and plot the relative proportion.
228 We catalogue non-coding RNA genes in Rfam annotation into five big classes: tRNA, rRNA, snRNA, snoRNA and others. Note that miRNA annotations are excluded in the Rfam noncoding RNA analyses to be replaced by the miRBase annotation.
229
230 -Repeats annotation (RepeatMasker)
231
232 ncPRO-seq uses repeat annotations from the RepeatMasker database. We classify different repeats based on the name of the repeat family.
233
234 </help>
235 </tool>