annotate preprocess.xml @ 1:4d25dec9707e

correction
author blanck
date Tue, 28 Apr 2015 11:23:47 +0200
parents a89bae08bf2d
children 54d549210759
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
1 <tool id="preprocess2" name="Data Normalization" force_history_refresh="True" version="0.1.0">
4d25dec9707e correction
blanck
parents: 0
diff changeset
2
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
3 <command interpreter="python">
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
4 preprocess2.py
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
5 -s '$summary'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
6 -p '$__new_file_path__'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
7 -c '$inputcdffull.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
8 -f '$inputufl.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
9 -g '$inputugp.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
10 -a '$inputacs.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
11 -d '$inputcdffull'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
12 -v '$inputufl'
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
13 -w '$inputugp'
4d25dec9707e correction
blanck
parents: 0
diff changeset
14 -b '$inputacs'
4d25dec9707e correction
blanck
parents: 0
diff changeset
15 -e 'datasetName'
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
16 #if $settings.settingsType == "tumor":
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
17 -t '$tumorcsv'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
18 #end if
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
19 #if $settings.settingsType == "standard":
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
20 -t 'none'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
21 #end if
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
22 -y '$settings.settingsType'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
23 -o '$outputgraph'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
24 -z '$zipfigures'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
25 -k '$outputlog'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
26 -l '$log'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
27 -u '$__user_id__'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
28 #for $input in $inputs
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
29 -i "${input}"
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
30 -n "${input.name}"
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
31 #end for
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
32 </command>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
33 <inputs>
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
34 <param name="datasetName" type="text" label="Dataset Name"/>
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
35 <param name="inputs" type="data" format="cel" multiple="True" label="Cel files dataset" help="Cel files dataset previously uploaded with the Multiple File Datasets tool."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
36 <param name="inputcdffull" type="data" format="cdf" label="cdf file" help=".cdf file name must comply with the following format : &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf)." />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
37 <param name="inputufl" type="data" format="ufl" label="ufl file" help=".ufl file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl)."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
38 <param name="inputugp" type="data" format="ugp" label="ugp file" help=".ugp file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp)."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
39 <param name="inputacs" type="data" format="acs" label="acs file" help=".acs file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs)."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
40 <conditional name="settings">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
41 <param name="settingsType" type="select" label="Reference">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
42 <option value="standard">Study without reference</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
43 <option value="tumor">Normal-tumor study with TumorBoost</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
44 </param>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
45 <when value="standard" />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
46 <when value="tumor">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
47 <param name="tumorcsv" type="data" format="csv" label="TumorBoost csv file" help="Normal-tumor csv file. See below for more information."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
48 </when>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
49 </conditional>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
50 <!--param name="outputgraph" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output figures" /-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
51 <!--param name="outputlog" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output log" /-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
52 <param name="outputgraph" type="select" label="Output figures">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
53 <option value="TRUE">Yes</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
54 <option value="FALSE">No</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
55 </param>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
56 <param name="outputlog" type="select" label="Output log">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
57 <option value="TRUE">Yes</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
58 <option value="FALSE">No</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
59 </param>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
60 <!--param name="chipType" type="text" label="chipType" /-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
61 <!--param name="workspace" type="text" label="Workspace"/-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
62 </inputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
63
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
64 <outputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
65 <!-- Would like to make this hidden or not appear altogether, but
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
66 variable outputs require a primary dataset. If hidden refresh
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
67 doesn't occur.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
68 -->
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
69 <data format="dsf" name="summary" label="Dataset summary file of ${datasetName}" />
4d25dec9707e correction
blanck
parents: 0
diff changeset
70 <data format="zip" name="zipfigures" label="figures of normalization of ${datasetName}">
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
71 <filter>outputgraph == "TRUE"</filter>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
72 </data>
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
73 <data format="log" name="log" label="log of normalization ${datasetName}">
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
74 <filter>outputlog == "TRUE"</filter>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
75 </data>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
76 </outputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
77
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
78 <stdio>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
79 <exit_code range="1:" level="fatal" description="See logs for more details" />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
80 </stdio>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
81
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
82 <help>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
83
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
84 **What it does**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
85
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
86 This preprocessing step corrects biological and technical biases introduced by the experiment. Raw data from Affymetrix arrays are provided in different CEL files. These data must be normalized before statistical analysis.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
87 The pre-processing is provided as a wrapper around the aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this implies that the pre-processing step is only available for Affymetrix arrays.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
88
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
89 -----
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
90
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
91 **Chip file naming conventions**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
92
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
93 Chip filenames must strictly follow these rules:
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
94
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
95 - *.cdf* filename must comply with the following format: &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g., for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a period) between &lt;chiptype&gt; and the tag "Full".
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
96
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
97 - *.ufl* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g., for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl).
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
98
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
99 - *.ugp* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g., for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp).
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
100
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
101 - *.acs* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g., for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs).
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
102
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
103 -----
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
104
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
105 **Normal-tumor study with TumorBoost**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
106
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
107 In cases where normal (control) samples are matched to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided:
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
108
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
109 - The first column contains the names of the files corresponding to normal samples of the dataset.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
110
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
111 - The second column contains the names of the tumor samples files.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
112
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
113 - The names of these two columns are normal and tumor, respectively.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
114
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
115 - Columns are separated by a comma.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
116
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
117 - *Extensions of the files (.CEL for example) should be removed*
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
118
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
119
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
120
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
121 **Example**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
122
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
123 Suppose the dataset under study contains 6 .cel files (3 patients, each represented by a pair of normal and tumor .cel files) ::
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
124
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
125 patient1_normal.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
126 patient1_tumor.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
127 patient2_normal.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
128 patient2_tumor.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
129 patient3_normal.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
130 patient3_tumor.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
131
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
132
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
133 The csv file should look like this ::
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
134
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
135 normal,tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
136 patient1_normal,patient1_tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
137 patient2_normal,patient2_tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
138 patient3_normal,patient3_tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
139
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
140
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
141 -----
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
142
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
143 **Citation**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
144
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
145 When using this tool, please cite :
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
146
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
147 `Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint &lt;http://fr.arxiv.org/abs/1401.5035&gt;`_
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
148
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
149 As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 &amp; 6. Bioinformatics, 25(17):2149–2156, 2009. &lt;http://bioinformatics.oxfordjournals.org/content/25/17/2149.short&gt;`_
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
150
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
151 When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 &lt;http://www.biomedcentral.com/1471-2105/11/245&gt;`_
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
152
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
153 </help>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
154 </tool>