annotate preprocess.xml @ 0:a89bae08bf2d

Uploaded
author sblanck
date Mon, 27 Apr 2015 05:48:52 -0400
parents
children 4d25dec9707e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
1 <tool id="preprocess" name="Data Normalization" force_history_refresh="True" version="0.1.0">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
2 <requirements>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
3 <requirement type="set_environment">R_SCRIPT_PATH</requirement>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
4 <requirement type="package" version="1.1.2">mpagenomics</requirement>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
5 </requirements>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
6 <command interpreter="python">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
7 preprocess.py
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
8 -s '$summary'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
9 -p '$__new_file_path__'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
10 -c '$inputcdffull.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
11 -f '$inputufl.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
12 -g '$inputugp.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
13 -a '$inputacs.name'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
14 -d '$inputcdffull'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
15 -v '$inputufl'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
16 -h '$inputugp'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
17 -b '$inputacs'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
18 #if $settings.settingsType == "tumor":
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
19 -t '$settings.tumorcsv'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
20 #end if
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
21 #if $settings.settingsType == "standard":
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
22 -t 'none'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
23 #end if
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
24 -y '$settings.settingsType'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
25 -o '$outputgraph'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
26 -z '$zipfigures'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
27 -k '$outputlog'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
28 -l '$log'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
29 -u '$__user_id__'
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
30 #for $input in $inputs
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
31 -i "${input}"
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
32 -n "${input.name}"
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
33 #end for
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
34 </command>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
35 <inputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
36 <param name="inputs" type="data" format="cel" multiple="True" label="Cel files dataset" help="Cel files dataset previously uploaded with the Multiple File Datasets tool."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
37 <param name="inputcdffull" type="data" format="cdf" label="cdf file" help=".cdf file name must comply with the following format : &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf)." />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
38 <param name="inputufl" type="data" format="ufl" label="ufl file" help=".ufl file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl)."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
39 <param name="inputugp" type="data" format="ugp" label="ugp file" help=".ugp file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp)."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
40 <param name="inputacs" type="data" format="acs" label="acs file" help=".acs file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs)."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
41 <conditional name="settings">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
42 <param name="settingsType" type="select" label="Reference">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
43 <option value="standard">Study without reference</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
44 <option value="tumor">Normal-tumor study with TumorBoost</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
45 </param>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
46 <when value="standard" />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
47 <when value="tumor">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
48 <param name="tumorcsv" type="data" format="csv" label="TumorBoost csv file" help="Normal-tumor csv file. See below for more information."/>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
49 </when>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
50 </conditional>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
51 <!--param name="outputgraph" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output figures" /-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
52 <!--param name="outputlog" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output log" /-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
53 <param name="outputgraph" type="select" label="Output figures">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
54 <option value="TRUE">Yes</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
55 <option value="FALSE">No</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
56 </param>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
57 <param name="outputlog" type="select" label="Output log">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
58 <option value="TRUE">Yes</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
59 <option value="FALSE">No</option>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
60 </param>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
61 <!--param name="chipType" type="text" label="chipType" /-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
62 <!--param name="workspace" type="text" label="Workspace"/-->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
63 </inputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
64
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
65 <outputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
66 <!-- Would like to make this hidden or not appear all together, but
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
67 variable outputs require a primary dataset. If hidden refresh
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
68 doesn't occur.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
69 -->
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
70 <data format="dsf" name="summary" label="Dataset summary file of ${input.name} " />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
71 <data format="zip" name="zipfigures" label="figures of normalization of ${input.name}">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
72 <filter>outputgraph == "TRUE"</filter>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
73 </data>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
74 <data format="log" name="log" label="log of normalization of ${input.name}">
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
75 <filter>outputlog == "TRUE"</filter>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
76 </data>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
77 </outputs>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
78
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
79 <stdio>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
80 <exit_code range="1:" level="fatal" description="See logs for more details" />
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
81 </stdio>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
82
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
83 <help>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
84
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
85 **What it does**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
86
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
87 This preprocessing step corrects biological and technical biases introduced by the experiment. Raw data from Affymetrix arrays are provided in different CEL files. These data must be normalized before statistical analysis.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
88 The pre-processing is proposed as a wrapper of aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this implies that the pre-processing step is only available for Affymetrix arrays.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
89
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
90 -----
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
91
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
92 **Chip file naming conventions**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
93
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
94 Chip filenames must strictly follow these rules :
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
95
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
96 - *.cdf* filename must comply with the following format : &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a point) between &lt;chiptype&gt; and the tag "Full".
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
97
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
98 - *.ufl* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl).
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
99
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
100 - *.ugp* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp).
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
101
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
102 - *.acs* file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs).
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
103
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
104 -----
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
105
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
106 **Normal-tumor study with TumorBoost**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
107
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
108 In cases where normal (control) samples are matched to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided :
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
109
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
110 - The first column contains the names of the files corresponding to normal samples of the dataset.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
111
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
112 - The second column contains the names of the tumor samples files.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
113
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
114 - Column names of these two columns are respectively normal and tumor.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
115
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
116 - Columns are separated by a comma.
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
117
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
118 - *Extensions of the files (.CEL for example) should be removed*
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
119
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
120
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
121
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
122 **Example**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
123
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
124 Suppose the dataset contains 6 .cel files (3 patients, each represented by a pair of normal and tumor .cel files) ::
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
125
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
126 patient1_normal.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
127 patient1_tumor.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
128 patient2_normal.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
129 patient2_tumor.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
130 patient3_normal.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
131 patient3_tumor.cel
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
132
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
133
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
134 The csv file should look like this ::
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
135
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
136 normal,tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
137 patient1_normal,patient1_tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
138 patient2_normal,patient2_tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
139 patient3_normal,patient3_tumor
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
140
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
141
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
142 -----
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
143
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
144 **Citation**
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
145
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
146 When using this tool, please cite :
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
147
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
148 `Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint &lt;http://fr.arxiv.org/abs/1401.5035&gt;`_
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
149
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
150 As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 &amp; 6. Bioinformatics, 25(17):2149–2156, 2009. &lt;http://bioinformatics.oxfordjournals.org/content/25/17/2149.short&gt;`_
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
151
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
152 When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 &lt;http://www.biomedcentral.com/1471-2105/11/245&gt;`_
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
153
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
154 </help>
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
155 </tool>