annotate preprocess.xml @ 6:7dc6ce39fb89 default tip

add selection tool
author blanck
date Wed, 29 Apr 2015 10:08:52 +0200
parents 2b882515e1a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
c6ab9e172cc7 correction
blanck
parents: 2
diff changeset
1 <tool id="preprocess" name="Data Normalization" force_history_refresh="True" version="0.1.0">
c6ab9e172cc7 correction
blanck
parents: 2
diff changeset
2 <requirements>
c6ab9e172cc7 correction
blanck
parents: 2
diff changeset
3 <!--requirement type="set_environment">R_SCRIPT_PATH</requirement-->
c6ab9e172cc7 correction
blanck
parents: 2
diff changeset
4 <requirement type="package" version="1.1.2">mpagenomics</requirement>
c6ab9e172cc7 correction
blanck
parents: 2
diff changeset
5 </requirements>
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
6 <command interpreter="python">
2
54d549210759 correction
blanck
parents: 1
diff changeset
7 preprocess.py
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
8 -s '$summary'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
9 -p '$__new_file_path__'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
10 -c '$inputcdffull.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
11 -f '$inputufl.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
12 -g '$inputugp.name'
4
2b882515e1a3 correction
blanck
parents: 3
diff changeset
13 -a '$inputacs.name'
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
14 -d '$inputcdffull'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
15 -v '$inputufl'
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
16 -w '$inputugp'
4d25dec9707e correction
blanck
parents: 0
diff changeset
17 -b '$inputacs'
4
2b882515e1a3 correction
blanck
parents: 3
diff changeset
18 -e '$datasetName'
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
19 #if $settings.settingsType == "tumor":
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
20 -t '$tumorcsv'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
21 #end if
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
22 #if $settings.settingsType == "standard":
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
23 -t 'none'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
24 #end if
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
25 -y '$settings.settingsType'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
26 -o '$outputgraph'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
27 -z '$zipfigures'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
28 -k '$outputlog'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
29 -l '$log'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
30 -u '$__user_id__'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
31 #for $input in $inputs
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
32 -i "${input}"
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
33 -n "${input.name}"
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
34 #end for
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
35 </command>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
36 <inputs>
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
37 <param name="datasetName" type="text" label="Dataset Name"/>
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
38 <param name="inputs" type="data" format="cel" multiple="True" label="Cel files dataset" help="Cel files dataset previously uploaded with the Multiple File Datasets tool."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
39 <param name="inputcdffull" type="data" format="cdf" label="cdf file" help=".cdf file name must comply with the following format : &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf)." />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
40 <param name="inputufl" type="data" format="ufl" label="ufl file" help=".ufl file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl)."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
41 <param name="inputugp" type="data" format="ugp" label="ugp file" help=".ugp file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp)."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
42 <param name="inputacs" type="data" format="acs" label="acs file" help=".acs file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs)."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
43 <conditional name="settings">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
44 <param name="settingsType" type="select" label="Reference">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
45 <option value="standard">Study without reference</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
46 <option value="tumor">Normal-tumor study with TumorBoost</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
47 </param>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
48 <when value="standard" />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
49 <when value="tumor">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
50 <param name="tumorcsv" type="data" format="csv" label="TumorBoost csv file" help="Normal-tumor csv file. See below for more information."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
51 </when>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
52 </conditional>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
53 <!--param name="outputgraph" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output figures" /-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
54 <!--param name="outputlog" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output log" /-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
55 <param name="outputgraph" type="select" label="Output figures">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
56 <option value="TRUE">Yes</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
57 <option value="FALSE">No</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
58 </param>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
59 <param name="outputlog" type="select" label="Output log">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
60 <option value="TRUE">Yes</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
61 <option value="FALSE">No</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
62 </param>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
63 <!--param name="chipType" type="text" label="chipType" /-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
64 <!--param name="workspace" type="text" label="Workspace"/-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
65 </inputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
66
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
67 <outputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
68 <!-- Would like to make this hidden or not appear altogether, but
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
69 variable outputs require a primary dataset. If hidden refresh
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
70 doesn't occur.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
71 -->
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
72 <data format="dsf" name="summary" label="Dataset summary file of ${datasetName}" />
4d25dec9707e correction
blanck
parents: 0
diff changeset
73 <data format="zip" name="zipfigures" label="figures of normalization of ${datasetName}">
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
74 <filter>outputgraph == "TRUE"</filter>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
75 </data>
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
76 <data format="log" name="log" label="log of normalization ${datasetName}">
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
77 <filter>outputlog == "TRUE"</filter>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
78 </data>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
79 </outputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
80
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
81 <stdio>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
82 <exit_code range="1:" level="fatal" description="See logs for more details" />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
83 </stdio>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
84
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
85 <help>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
86
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
87 **What it does**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
88
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
89 This preprocessing step consists of correcting biological and technical biases due to the experiment. Raw data from Affymetrix arrays are provided in different CEL files. These data must be normalized before statistical analysis.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
90 The preprocessing is provided as a wrapper around the aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this means the preprocessing step is only available for Affymetrix arrays.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
91
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
92 -----
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
93
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
94 **Chip file naming conventions**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
95
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
96 Chip filenames must strictly follow these rules:
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
97
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
98 - *.cdf* filename must comply with the following format: &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a period) between &lt;chiptype&gt; and the tag "Full".
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
99
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
100 - *.ufl* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl).
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
101
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
102 - *.ugp* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp).
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
103
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
104 - *.acs* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs).
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
105
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
106 -----
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
107
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
108 **Normal-tumor study with TumorBoost**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
109
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
110 In cases where normal (control) samples are matched to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided:
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
111
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
112 - The first column contains the names of the files corresponding to normal samples of the dataset.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
113
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
114 - The second column contains the names of the files corresponding to tumor samples of the dataset.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
115
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
116 - The names of these two columns are normal and tumor, respectively.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
117
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
118 - Columns are separated by a comma.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
119
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
120 - *Extensions of the files (.CEL for example) should be removed*
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
121
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
122
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
123
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
124 **Example**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
125
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
126 Consider a dataset of 6 .cel files (3 patients, each represented by a pair of normal and tumor .cel files) ::
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
127
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
128 patient1_normal.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
129 patient1_tumor.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
130 patient2_normal.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
131 patient2_tumor.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
132 patient3_normal.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
133 patient3_tumor.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
134
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
135
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
136 The csv file should look like this ::
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
137
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
138 normal,tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
139 patient1_normal,patient1_tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
140 patient2_normal,patient2_tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
141 patient3_normal,patient3_tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
142
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
143
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
144 -----
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
145
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
146 **Citation**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
147
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
148 When using this tool, please cite :
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
149
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
150 `Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint &lt;http://fr.arxiv.org/abs/1401.5035&gt;`_
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
151
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
152 As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 &amp; 6. Bioinformatics, 25(17):2149–2156, 2009. &lt;http://bioinformatics.oxfordjournals.org/content/25/17/2149.short&gt;`_
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
153
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
154 When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 &lt;http://www.biomedcentral.com/1471-2105/11/245&gt;`_
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
155
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
156 </help>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
157 </tool>