annotate mpagenomics_normalize-7dc6ce39fb89/preprocess.xml @ 3:52452ea9e4a7

correction repository dependencies
author samuel blanck
date Wed, 13 May 2015 11:55:11 +0200
parents 84b13b0e2b85
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
1 <tool id="preprocess" name="Data Normalization" force_history_refresh="True" version="0.1.0">
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
2 <requirements>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
3 <!--requirement type="set_environment">R_SCRIPT_PATH</requirement-->
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
4 <requirement type="package" version="1.1.2">mpagenomics</requirement>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
5 </requirements>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
6 <command interpreter="python">
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
7 preprocess.py
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
8 -s '$summary'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
9 -p '$__new_file_path__'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
10 -c '$inputcdffull.name'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
11 -f '$inputufl.name'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
12 -g '$inputugp.name'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
13 -a '$inputacs.name'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
14 -d '$inputcdffull'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
15 -v '$inputufl'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
16 -w '$inputugp'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
17 -b '$inputacs'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
18 -e '$datasetName'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
19 #if $settings.settingsType == "tumor":
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
20 -t '$tumorcsv'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
21 #end if
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
22 #if $settings.settingsType == "standard":
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
23 -t 'none'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
24 #end if
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
25 -y '$settings.settingsType'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
26 -o '$outputgraph'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
27 -z '$zipfigures'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
28 -k '$outputlog'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
29 -l '$log'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
30 -u '$__user_id__'
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
31 #for $input in $inputs
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
32 -i "${input}"
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
33 -n "${input.name}"
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
34 #end for
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
35 </command>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
36 <inputs>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
37 <param name="datasetName" type="text" label="Dataset Name"/>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
38 <param name="inputs" type="data" format="cel" multiple="True" label="Cel files dataset" help="Cel files dataset previously uploaded with the Multiple File Datasets tool."/>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
39 <param name="inputcdffull" type="data" format="cdf" label="cdf file" help=".cdf file name must comply with the following format : &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf)." />
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
40 <param name="inputufl" type="data" format="ufl" label="ufl file" help=".ufl file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl)."/>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
41 <param name="inputugp" type="data" format="ugp" label="ugp file" help=".ugp file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp)."/>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
42 <param name="inputacs" type="data" format="acs" label="acs file" help=".acs file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs)."/>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
43 <conditional name="settings">
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
44 <param name="settingsType" type="select" label="Reference">
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
45 <option value="standard">Study without reference</option>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
46 <option value="tumor">Normal-tumor study with TumorBoost</option>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
47 </param>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
48 <when value="standard" />
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
49 <when value="tumor">
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
50 <param name="tumorcsv" type="data" format="csv" label="TumorBoost csv file" help="Normal-tumor csv file. See below for more information."/>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
51 </when>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
52 </conditional>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
53 <!--param name="outputgraph" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output figures" /-->
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
54 <!--param name="outputlog" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output log" /-->
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
55 <param name="outputgraph" type="select" label="Output figures">
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
56 <option value="TRUE">Yes</option>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
57 <option value="FALSE">No</option>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
58 </param>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
59 <param name="outputlog" type="select" label="Output log">
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
60 <option value="TRUE">Yes</option>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
61 <option value="FALSE">No</option>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
62 </param>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
63 <!--param name="chipType" type="text" label="chipType" /-->
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
64 <!--param name="workspace" type="text" label="Workspace"/-->
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
65 </inputs>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
66
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
67 <outputs>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
68 <!-- Would like to make this hidden or not appear all together, but
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
69 variable outputs require a primary dataset. If hidden refresh
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
70 doesn't occur.
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
71 -->
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
72 <data format="dsf" name="summary" label="Dataset summary file of ${datasetName}" />
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
73 <data format="zip" name="zipfigures" label="figures of normalization of ${datasetName}">
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
74 <filter>outputgraph == "TRUE"</filter>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
75 </data>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
76 <data format="log" name="log" label="log of normalization ${datasetName}">
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
77 <filter>outputlog == "TRUE"</filter>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
78 </data>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
79 </outputs>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
80
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
81 <stdio>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
82 <exit_code range="1:" level="fatal" description="See logs for more details" />
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
83 </stdio>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
84
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
85 <help>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
86
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
87 **What it does**
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
88
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
89 This preprocessing step corrects the biological and technical biases introduced by the experiment. Raw data from Affymetrix arrays are provided in different CEL files. These data must be normalized before statistical analysis.
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
90 The pre-processing is proposed as a wrapper of aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this implies that the pre-processing step is only available for Affymetrix arrays.
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
91
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
92 -----
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
93
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
94 **Chip file naming conventions**
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
95
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
96 Chip filenames must strictly follow these rules:
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
97
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
98 - *.cdf* filename must comply with the following format : &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g., for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a period) between &lt; chiptype &gt; and the tag "Full".
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
99
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
100 - *.ufl* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl).
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
101
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
102 - *.ugp* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp).
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
103
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
104 - *.acs* filename must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g., for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs).
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
105
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
106 -----
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
107
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
108 **Normal-tumor study with TumorBoost**
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
109
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
110 In cases where normal (control) samples are matched to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided :
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
111
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
112 - The first column contains the names of the files corresponding to normal samples of the dataset.
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
113
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
114 - The second column contains the names of the tumor samples files.
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
115
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
116 - Column names of these two columns are respectively normal and tumor.
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
117
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
118 - Columns are separated by a comma.
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
119
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
120 - *Extensions of the files (.CEL for example) should be removed*
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
121
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
122
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
123
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
124 **Example**
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
125
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
126 Suppose the dataset under study contains 6 .cel files (3 patients, each represented by a pair of normal and tumor .cel files) ::
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
127
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
128 patient1_normal.cel
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
129 patient1_tumor.cel
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
130 patient2_normal.cel
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
131 patient2_tumor.cel
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
132 patient3_normal.cel
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
133 patient3_tumor.cel
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
134
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
135
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
136 The csv file should look like this ::
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
137
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
138 normal,tumor
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
139 patient1_normal,patient1_tumor
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
140 patient2_normal,patient2_tumor
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
141 patient3_normal,patient3_tumor
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
142
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
143
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
144 -----
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
145
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
146 **Citation**
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
147
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
148 When using this tool, please cite :
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
149
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
150 `Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint &lt;http://fr.arxiv.org/abs/1401.5035&gt;`_
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
151
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
152 As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 &amp; 6. Bioinformatics, 25(17):2149–2156, 2009. &lt;http://bioinformatics.oxfordjournals.org/content/25/17/2149.short&gt;`_
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
153
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
154 When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 &lt;http://www.biomedcentral.com/1471-2105/11/245&gt;`_
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
155
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
156 </help>
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
157 </tool>