<?xml version="1.0" encoding="UTF-8"?>
<!--
  Galaxy tool wrapper "Data Normalization" (id: preprocess).

  Runs preprocess.py on a set of Affymetrix CEL files together with the chip
  annotation files (cdf, ufl, ugp, acs) and produces a dataset summary file,
  plus an optional zip of figures and an optional log.
-->
<tool id="preprocess" name="Data Normalization" force_history_refresh="True" version="0.1.0">
  <requirements>
    <!--requirement type="set_environment">R_SCRIPT_PATH</requirement-->
    <requirement type="package" version="1.1.2">mpagenomics</requirement>
  </requirements>

  <!--
    Command line handed to preprocess.py.
      * Each chip annotation file is passed twice: once as the dataset name
        (-c, -f, -g, -a) and once as the dataset itself (-d, -v, -w, -b).
      * -t carries the TumorBoost csv only for a "tumor" study and the literal
        'none' otherwise. The csv parameter is declared inside the "settings"
        conditional, so it is addressed as $settings.tumorcsv.
      * Every selected CEL dataset contributes one -i (dataset) / -n (name) pair.
    NOTE(review): the "interpreter" attribute is kept as in the original wrapper;
    confirm that the targeted Galaxy release still accepts it.
  -->
  <command interpreter="python">
    preprocess.py
      -s '$summary'
      -p '$__new_file_path__'
      -c '$inputcdffull.name'
      -f '$inputufl.name'
      -g '$inputugp.name'
      -a '$inputacs.name'
      -d '$inputcdffull'
      -v '$inputufl'
      -w '$inputugp'
      -b '$inputacs'
      -e '$datasetName'
      #if $settings.settingsType == "tumor":
        -t '$settings.tumorcsv'
      #end if
      #if $settings.settingsType == "standard":
        -t 'none'
      #end if
      -y '$settings.settingsType'
      -o '$outputgraph'
      -z '$zipfigures'
      -k '$outputlog'
      -l '$log'
      -u '$__user_id__'
      #for $input in $inputs
        -i "${input}"
        -n "${input.name}"
      #end for
  </command>

  <inputs>
    <param name="datasetName" type="text" label="Dataset Name"/>
    <param name="inputs"
           type="data"
           format="cel"
           multiple="True"
           label="Cel files dataset"
           help="Cel files dataset previously uploaded with the Multiple File Datasets tool."/>
    <!-- "<" and ">" in the help attributes below are escaped: a literal "<" is not allowed in an attribute value. -->
    <param name="inputcdffull"
           type="data"
           format="cdf"
           label="cdf file"
           help=".cdf file name must comply with the following format : &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf)."/>
    <param name="inputufl"
           type="data"
           format="ufl"
           label="ufl file"
           help=".ufl file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl)."/>
    <param name="inputugp"
           type="data"
           format="ugp"
           label="ugp file"
           help=".ugp file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp)."/>
    <param name="inputacs"
           type="data"
           format="acs"
           label="acs file"
           help=".acs file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs)."/>
    <!-- Study design: the TumorBoost csv is only requested for a normal-tumor study. -->
    <conditional name="settings">
      <param name="settingsType" type="select" label="Reference">
        <option value="standard">Study without reference</option>
        <option value="tumor">Normal-tumor study with TumorBoost</option>
      </param>
      <when value="standard"/>
      <when value="tumor">
        <param name="tumorcsv"
               type="data"
               format="csv"
               label="TumorBoost csv file"
               help="Normal-tumor csv file. See below for more information."/>
      </when>
    </conditional>
    <!--param name="outputgraph" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output figures" /-->
    <!--param name="outputlog" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output log" /-->
    <!-- The two selects below also drive the output filters on "zipfigures" and "log". -->
    <param name="outputgraph" type="select" label="Output figures">
      <option value="TRUE">Yes</option>
      <option value="FALSE">No</option>
    </param>
    <param name="outputlog" type="select" label="Output log">
      <option value="TRUE">Yes</option>
      <option value="FALSE">No</option>
    </param>
    <!--param name="chipType" type="text" label="chipType" /-->
    <!--param name="workspace" type="text" label="Workspace"/-->
  </inputs>

  <outputs>
    <!-- Would like to make this hidden or not appear all together, but
         variable outputs require a primary dataset. If hidden refresh
         doesn't occur.
    -->
    <data format="dsf" name="summary" label="Dataset summary file of ${datasetName}"/>
    <!-- Only created when "Output figures" is set to Yes. -->
    <data format="zip" name="zipfigures" label="figures of normalization of ${datasetName}">
      <filter>outputgraph == "TRUE"</filter>
    </data>
    <!-- Only created when "Output log" is set to Yes. -->
    <data format="log" name="log" label="log of normalization ${datasetName}">
      <filter>outputlog == "TRUE"</filter>
    </data>
  </outputs>

  <!-- Any exit code of 1 or above is reported as a fatal error. -->
  <stdio>
    <exit_code range="1:" level="fatal" description="See logs for more details"/>
  </stdio>

  <!-- The help text is reStructuredText; CDATA keeps its "<", ">" and "&" characters literal. -->
  <help><![CDATA[

**What it does**

This preprocessing step consists of a correction of biological and technical biases due to the experiment. Raw data from Affymetrix arrays are provided in different CEL files. These data must be normalized before statistical analysis.
The pre-processing is proposed as a wrapper of aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this implies that the pre-processing step is only available for Affymetrix arrays.

-----

**Chip file naming conventions**

Chip filenames must strictly follow the following rules :

- *.cdf* filename must comply with the following format : < chiptype >,< tag >.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a point) between <chiptype> and the tag "Full".

- *.ufl* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl).

- *.ugp* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp).

- *.acs* file name must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs).

-----

**Normal-tumor study with TumorBoost**

In cases where normal (control) samples match to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided :

- The first column contains the names of the files corresponding to normal samples of the dataset.

- The second column contains the names of the tumor samples files.

- Column names of these two columns are respectively normal and tumor.

- Columns are separated by a comma.

- *Extensions of the files (.CEL for example) should be removed*



**Example**

Consider a dataset of 6 .cel files (3 patients, each of them being represented by a pair of normal and tumor cel files) ::

    patient1_normal.cel
    patient1_tumor.cel
    patient2_normal.cel
    patient2_tumor.cel
    patient3_normal.cel
    patient3_tumor.cel


The csv file should look like this ::

    normal,tumor
    patient1_normal,patient1_tumor
    patient2_normal,patient2_tumor
    patient3_normal,patient3_tumor


-----

**Citation**

When using this tool, please cite :

`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_

As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 & 6. Bioinformatics, 25(17):2149–2156, 2009. <http://bioinformatics.oxfordjournals.org/content/25/17/2149.short>`_

When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 <http://www.biomedcentral.com/1471-2105/11/245>`_

  ]]></help>
</tool>