Mercurial > repos > sblanck > mpagenomics_normalize
changeset 0:a89bae08bf2d
Uploaded
author | sblanck |
---|---|
date | Mon, 27 Apr 2015 05:48:52 -0400 |
parents | |
children | 4d25dec9707e |
files | preprocess.R preprocess.py preprocess.xml tool_dependencies.xml |
diffstat | 4 files changed, 312 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# preprocess.R -- run the MPAgenomics signal pre-processing on a staged data set.
#
# Positional command-line arguments:
#   1  chip type
#   2  data set name
#   3  working directory (also becomes the current directory)
#   4  directory used both as the CEL data set path and as the chip files path
#   5  normal-tumor csv file (only used when the setting type is not "standard")
#   6  setting type; "standard" runs without a normal-tumor array
#   7  "TRUE"/"FALSE" flag telling whether plots are saved
#   8  tag; empty or absent means "no tag"
args <- commandArgs(TRUE)

chip        <- args[1]
dataset     <- args[2]
workdir     <- args[3]
celPath     <- args[4]
chipPath    <- args[4]  # chip files are staged next to the CEL files
tumor       <- args[5]
settingType <- args[6]
# as.is=TRUE keeps "TRUE"/"FALSE" -> logical conversion and avoids the
# "'as.is' should be specified by the caller" warning of recent R versions.
outputgraph <- type.convert(args[7], as.is = TRUE)
tag         <- args[8]

# args[8] is NA when the tag argument is not supplied at all; the original
# test `tag == ""` would then abort with "missing value where TRUE/FALSE needed".
if (is.na(tag) || tag == "")
{
  tag <- NULL
}

library(MPAgenomics)
setwd(workdir)

# Arguments shared by both kinds of study.
preprocessArgs <- list(dataSetName = dataset,
                       chipType = chip,
                       dataSetPath = celPath,
                       chipFilesPath = chipPath,
                       path = workdir,
                       createArchitecture = TRUE,
                       savePlot = outputgraph,
                       tags = tag)

# Any setting type other than "standard" additionally passes the normal-tumor csv file.
if (settingType != "standard")
{
  preprocessArgs$normalTumorArray <- tumor
}

do.call(signalPreProcess, preprocessArgs)
# preprocess.py -- Galaxy wrapper that stages the CEL and chip annotation
# files in a scratch directory, runs preprocess.R through Rscript, then
# writes the figures archive and the dataset summary file.
import os
import re
import shutil
import sys
import subprocess
import zipfile
import optparse

# Data set name passed to preprocess.R; also a component of the scratch tree.
_DATASET_NAME = "dataset"

# (flag, dest) pairs that must always be given on the command line.
_REQUIRED_OPTIONS = (
    ('-s', 'summary'),
    ('-p', 'new_file_path'),
    ('-c', 'inputcdffull_name'),
    ('-f', 'inputufl_name'),
    ('-g', 'inputugp_name'),
    ('-a', 'inputacs_name'),
    ('-d', 'inputcdffull'),
    ('-v', 'inputufl'),
    ('-h', 'inputugp'),
    ('-b', 'inputacs'),
    ('-y', 'settingsType'),
    ('-o', 'outputgraph'),
    ('-k', 'outputlog'),
    ('-u', 'user_id'),
)

# (flag, dest) pairs that are only needed for some settings.
_OPTIONAL_OPTIONS = (
    ('-t', 'tumorcsv'),
    ('-z', 'zipfigures'),
    ('-l', 'log'),
)

# File names of the form "<digits>_task_<sample>.dat" are reported as <sample>.
_TASK_FILE_RE = re.compile(r"^\d+_task_(.*)\.dat$")


def main():
    """Parse the command line, run the normalization and collect its outputs.

    The process always terminates through ``sys.exit`` with the return code
    of the Rscript call (or 2 when the command line is invalid).
    """
    parser = _build_parser()
    options, _ = parser.parse_args()
    _validate(parser, options)

    print(options.inputFile)
    print(options.inputFileName)

    user = options.user_id
    dataset = _DATASET_NAME
    data_dir = os.path.join(options.new_file_path, user)
    destination_path = os.path.join(data_dir, dataset)
    mpagenomics_dir = os.path.join(destination_path, "mpagenomics", user)

    # Always start from an empty staging directory.
    if os.path.isdir(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)
    if not os.path.isdir(mpagenomics_dir):
        os.makedirs(mpagenomics_dir)

    # CEL files are made visible under their dataset names.
    for source, name in zip(options.inputFile, options.inputFileName):
        os.symlink(source, _staged_path(data_dir, name))

    cdffull_name = os.path.basename(options.inputcdffull_name)
    chip_type, tag = _split_chip_name(cdffull_name)

    _copy(options.inputcdffull, _staged_path(data_dir, cdffull_name))
    _copy(options.inputugp, _staged_path(data_dir, options.inputugp_name))
    _copy(options.inputufl, _staged_path(data_dir, options.inputufl_name))
    _copy(options.inputacs, _staged_path(data_dir, options.inputacs_name))

    # NOTE(review): the original built this path from an undefined
    # "new_files_directory"; destination_path is the only base for which the
    # figures land inside the R working directory (mpagenomics_dir) -- confirm
    # against where MPAgenomics actually saves its plots.
    abs_fig_dir = os.path.join(destination_path, "mpagenomics", user,
                               "figures", dataset, "signal")

    # Rscript arguments must be strings; "none" is what the tool passes for
    # studies without a normal-tumor csv file.
    tumorcsv = options.tumorcsv if options.tumorcsv is not None else "none"

    retcode = _preprocess(chip_type, dataset, mpagenomics_dir, data_dir,
                          options.new_file_path, tumorcsv,
                          options.settingsType, options.outputgraph,
                          options.outputlog, options.log, tag)

    if retcode == 0:
        try:
            if options.outputgraph == "TRUE" and os.path.isdir(abs_fig_dir):
                # An absolute zipfigures path wins over abs_fig_dir in join().
                _zip_directory(abs_fig_dir,
                               os.path.join(abs_fig_dir, options.zipfigures))
            _write_summary(options.summary, options.inputFileName,
                           dataset, chip_type)
        finally:
            # The staging tree is only removed after a successful run.
            shutil.rmtree(data_dir)

    sys.exit(retcode)


def _build_parser():
    """Return the option parser for the wrapper's command line."""
    # "-h" carries the ugp file, which collides with optparse's built-in
    # "-h/--help"; "resolve" hands "-h" to our option and keeps "--help".
    parser = optparse.OptionParser(conflict_handler="resolve")
    for flag, dest in _REQUIRED_OPTIONS + _OPTIONAL_OPTIONS:
        parser.add_option(flag, action="store", dest=dest)
    parser.add_option('-i', action="append", dest='inputFile', default=[])
    parser.add_option('-n', action='append', dest='inputFileName', default=[])
    return parser


def _validate(parser, options):
    """Abort through ``parser.error`` (exit code 2) on an unusable command line."""
    missing = [flag for flag, dest in _REQUIRED_OPTIONS
               if getattr(options, dest) is None]
    if options.outputlog == "TRUE" and options.log is None:
        missing.append('-l')
    if options.outputgraph == "TRUE" and options.zipfigures is None:
        missing.append('-z')
    if missing:
        parser.error("missing required option(s): %s" % ", ".join(missing))
    if len(options.inputFile) != len(options.inputFileName):
        parser.error("every -i input file needs a matching -n file name")


def _staged_path(data_dir, name):
    """Return the path of *name* inside *data_dir*, ignoring any directory part."""
    # Names come from dataset names; basename keeps them inside data_dir.
    return os.path.join(data_dir, os.path.basename(name))


def _split_chip_name(cdf_name):
    """Split "<chiptype>,<tag>.cdf" into (chiptype, tag); tag is "" without a comma."""
    if "," in cdf_name:
        chip_type, tag_ext = cdf_name.split(",", 1)
        return chip_type, tag_ext.split(".", 1)[0]
    return cdf_name.split(".", 1)[0], ""


def _zip_directory(source_dir, archive_path):
    """Write the direct entries of *source_dir* into the zip file *archive_path*."""
    # List first so the archive never includes itself when it lives in source_dir.
    entries = sorted(os.listdir(source_dir))
    archive = zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED)
    try:
        for entry in entries:
            archive.write(os.path.join(source_dir, entry), entry)
    finally:
        archive.close()


def _write_summary(report, names, dataset, chip_type):
    """Write one "<sample>\\t<dataset>\\t<chip type>" line per input name to *report*."""
    # NOTE(review): the original iterated an undefined "extra_file_names";
    # the -n input names are assumed to be the intended list -- confirm.
    with open(report, "w") as handle:
        for name in names:
            match = _TASK_FILE_RE.match(name)
            sample = match.group(1) if match else name
            handle.write("%s\t%s\t%s\n" % (sample, dataset, chip_type))


def _copy(source, destination):
    """Hard-link *source* to *destination*, falling back to a plain copy."""
    try:
        os.link(source, destination)
    except (OSError, AttributeError):
        # Linking fails across file systems or where os.link is unavailable.
        shutil.copy(source, destination)


def _preprocess(chipType, dataset, mpagenomics_dir, data_dir, tmp_dir, tumor,
                settingType, outputgraph, outputlog, log, tag):
    """Run preprocess.R (located next to this script) and return its exit code.

    stdout and stderr of Rscript go to *log* when *outputlog* is "TRUE",
    otherwise to "errfile.log" inside *tmp_dir*.  Returns 127 when Rscript
    itself cannot be started.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    if outputlog == "TRUE":
        log_path = log
    else:
        log_path = os.path.join(tmp_dir, "errfile.log")

    command = ["Rscript", os.path.join(script_dir, "preprocess.R"),
               chipType, dataset, mpagenomics_dir, data_dir, tumor,
               settingType, outputgraph, tag]

    with open(log_path, 'w') as errfile:
        try:
            return subprocess.call(command, stdout=errfile, stderr=errfile)
        except OSError as err:
            # Typically "Rscript" is not on PATH; record it instead of crashing.
            errfile.write("Unable to run Rscript: %s\n" % err)
            return 127


if __name__ == "__main__":
    main()
<!-- preprocess.xml: Galaxy tool definition for the MPAgenomics data
     normalization step. The command block drives preprocess.py. -->
<tool id="preprocess" name="Data Normalization" force_history_refresh="True" version="0.1.0">
  <requirements>
    <requirement type="set_environment">R_SCRIPT_PATH</requirement>
    <requirement type="package" version="1.1.2">mpagenomics</requirement>
  </requirements>
  <!-- Every -i (dataset path) is paired with a -n (dataset name).
       The csv parameter lives inside the "settings" conditional, so it is
       referenced with its qualified name. -->
  <command interpreter="python">
    preprocess.py
      -s '$summary'
      -p '$__new_file_path__'
      -c '$inputcdffull.name'
      -f '$inputufl.name'
      -g '$inputugp.name'
      -a '$inputacs.name'
      -d '$inputcdffull'
      -v '$inputufl'
      -h '$inputugp'
      -b '$inputacs'
      #if $settings.settingsType == "tumor":
        -t '$settings.tumorcsv'
      #end if
      #if $settings.settingsType == "standard":
        -t 'none'
      #end if
      -y '$settings.settingsType'
      -o '$outputgraph'
      -z '$zipfigures'
      -k '$outputlog'
      -l '$log'
      -u '$__user_id__'
      #for $input in $inputs
        -i "${input}"
        -n "${input.name}"
      #end for
  </command>
  <inputs>
    <param name="inputs" type="data" format="cel" multiple="True" label="Cel files dataset" help="Cel files dataset previously uploaded with the Multiple File Datasets tool."/>
    <param name="inputcdffull" type="data" format="cdf" label="cdf file" help=".cdf file name must comply with the following format : &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf)." />
    <param name="inputufl" type="data" format="ufl" label="ufl file" help=".ufl file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl)."/>
    <param name="inputugp" type="data" format="ugp" label="ugp file" help=".ugp file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp)."/>
    <param name="inputacs" type="data" format="acs" label="acs file" help=".acs file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs)."/>
    <conditional name="settings">
      <param name="settingsType" type="select" label="Reference">
        <option value="standard">Study without reference</option>
        <option value="tumor">Normal-tumor study with TumorBoost</option>
      </param>
      <when value="standard" />
      <when value="tumor">
        <param name="tumorcsv" type="data" format="csv" label="TumorBoost csv file" help="Normal-tumor csv file. See below for more information."/>
      </when>
    </conditional>
    <!--param name="outputgraph" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output figures" /-->
    <!--param name="outputlog" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output log" /-->
    <param name="outputgraph" type="select" label="Output figures">
      <option value="TRUE">Yes</option>
      <option value="FALSE">No</option>
    </param>
    <param name="outputlog" type="select" label="Output log">
      <option value="TRUE">Yes</option>
      <option value="FALSE">No</option>
    </param>
    <!--param name="chipType" type="text" label="chipType" /-->
    <!--param name="workspace" type="text" label="Workspace"/-->
  </inputs>

  <outputs>
    <!-- Would like to make this hidden or not appear altogether, but
         variable outputs require a primary dataset. If hidden, refresh
         doesn't occur.
    -->
    <!-- NOTE(review): the labels below reference ${input.name}, but no
         parameter named "input" is declared (the CEL parameter is
         "inputs"); confirm that these labels resolve as intended. -->
    <data format="dsf" name="summary" label="Dataset summary file of ${input.name} " />
    <data format="zip" name="zipfigures" label="figures of normalization of ${input.name}">
      <filter>outputgraph == "TRUE"</filter>
    </data>
    <data format="log" name="log" label="log of normalization of ${input.name}">
      <filter>outputlog == "TRUE"</filter>
    </data>
  </outputs>

  <stdio>
    <exit_code range="1:" level="fatal" description="See logs for more details" />
  </stdio>

  <help><![CDATA[

**What it does**

This preprocessing step consists in a correction of biological and technical biases due to the experiment. Raw data from Affymetrix arrays are provided in different CEL files. These data must be normalized before statistical analysis.
The pre-processing is proposed as a wrapper of aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this implies that the pre-processing step is only available for Affymetrix arrays.

-----

**Chip file naming conventions**

Chip filenames must strictly follow the following rules :

- *.cdf* filename must comply with the following format : < chiptype >,< tag >.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a point) between <chiptype> and the tag "Full".

- *.ufl* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl).

- *.ugp* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp).

- *.acs* file name must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs).

-----

**Normal-tumor study with TumorBoost**

In cases where normal (control) samples match to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided :

 - The first column contains the names of the files corresponding to normal samples of the dataset.

 - The second column contains the names of the tumor samples files.

 - Column names of these two columns are respectively normal and tumor.

 - Columns are separated by a comma.

 - *Extensions of the files (.CEL for example) should be removed*



**Example**

Consider a dataset of 6 .cel files (3 patients, each of them being represented by a couple of normal and tumor cel files.) ::

    patient1_normal.cel
    patient1_tumor.cel
    patient2_normal.cel
    patient2_tumor.cel
    patient3_normal.cel
    patient3_tumor.cel


The csv file should look like this ::

    normal,tumor
    patient1_normal,patient1_tumor
    patient2_normal,patient2_tumor
    patient3_normal,patient3_tumor


-----

**Citation**

When using this tool, please cite :

`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_

As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 & 6. Bioinformatics, 25(17):2149–2156, 2009. <http://bioinformatics.oxfordjournals.org/content/25/17/2149.short>`_

When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 <http://www.biomedcentral.com/1471-2105/11/245>`_

  ]]></help>
</tool>
<?xml version="1.0"?>
<!-- tool_dependencies.xml: declares the mpagenomics 1.1.2 package, provided
     by the package_mpagenomics_1_1_2 repository, which must be installed
     before this tool. -->
<tool_dependency>
    <package name="mpagenomics" version="1.1.2">
        <repository
            changeset_revision="5a3b73fecec5"
            name="package_mpagenomics_1_1_2"
            owner="sblanck"
            prior_installation_required="True"
            toolshed="https://testtoolshed.g2.bx.psu.edu" />
    </package>
</tool_dependency>