# HG changeset patch
# User sblanck
# Date 1430128132 14400
# Node ID a89bae08bf2db4ff55bb0002344c657220fae1b2
Uploaded
diff -r 000000000000 -r a89bae08bf2d preprocess.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocess.R Mon Apr 27 05:48:52 2015 -0400
@@ -0,0 +1,25 @@
+# Positional arguments, in the order preprocess.py passes them to Rscript.
+args<-commandArgs(TRUE)
+
+chip=args[1]
+dataset=args[2]
+workdir=args[3]
+# CEL files and chip files are both looked up in the directory given as args[4].
+celPath=args[4]
+chipPath=args[4]
+tumor=args[5]
+settingType=args[6]
+outputgraph=type.convert(args[7])
+tag=args[8]
+
+# An absent (NA) or empty tag means "no tag"; is.na() keeps the test from failing on NA.
+if (is.na(tag) || tag=="") tag=NULL
+
+library(MPAgenomics)
+setwd(workdir)
+if (settingType=="standard")
+{
+    signalPreProcess(dataSetName=dataset, chipType=chip, dataSetPath=celPath,chipFilesPath=chipPath, path=workdir,createArchitecture=TRUE, savePlot=outputgraph, tags=tag)
+} else {
+    signalPreProcess(dataSetName=dataset, chipType=chip, dataSetPath=celPath,chipFilesPath=chipPath, normalTumorArray=tumor, path=workdir,createArchitecture=TRUE, savePlot=outputgraph, tags=tag)
+}
\ No newline at end of file
diff -r 000000000000 -r a89bae08bf2d preprocess.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocess.py Mon Apr 27 05:48:52 2015 -0400
@@ -0,0 +1,122 @@
+import os
+import re
+import shutil
+import sys
+import subprocess
+import zipfile
+import optparse
+
+def main():
+
+ parser = optparse.OptionParser()
+ parser.add_option('-s', action="store", dest='summary')
+ parser.add_option('-p', action="store", dest='new_file_path')
+ parser.add_option('-c', action="store", dest='inputcdffull_name')
+ parser.add_option('-f', action="store", dest='inputufl_name')
+ parser.add_option('-g', action="store", dest='inputugp_name')
+ parser.add_option('-a', action="store", dest='inputacs_name')
+ parser.add_option('-d', action="store", dest='inputcdffull')
+ parser.add_option('-v', action="store", dest='inputufl')
+ parser.add_option('-h', action="store", dest='inputugp')
+ parser.add_option('-b', action="store", dest='inputacs')
+ parser.add_option('-t', action="store", dest='tumorcsv')
+ parser.add_option('-y', action="store", dest='settingsType')
+ parser.add_option('-o', action="store", dest='outputgraph')
+ parser.add_option('-z', action="store", dest='zipfigures')
+ parser.add_option('-k', action="store", dest='outputlog')
+ parser.add_option('-l', action="store", dest='log')
+ parser.add_option('-u', action="store", dest='user_id')
+
+ parser.add_option('-i', action="append", dest='inputFile', default=[])
+ parser.add_option('-n', action='append', dest='inputFileName', default=[])
+
+ options, args = parser.parse_args()
+ outputFileName=options.outputFile
+
+ print options.inputFile
+ print options.inputFileName
+
+ dataSetName="dataset"
+ destinationPath=os.path.join(options.new_file_path, user, dataset)
+
+ mpagenomics_dir = os.path.join(destinationPath,"mpagenomics",user)
+ data_dir = os.path.join(options.new_file_path, user)
+
+ try:
+ os.makedirs(data_dir)
+ except:
+ shutil.rmtree(data_dir)
+ os.makedirs(data_dir)
+
+ if (not os.path.isdir(mpagenomics_dir)):
+ os.makedirs(mpagenomics_dir)
+
+ for inputFile, inputFileName in zip(options.inputFile,options.inputFileName):
+ source = inputFile
+ destination=os.path.join(data_dir,inputFileName)
+ os.symlink(source,destination)
+
+ if (cdffull_name.count(",") != 0):
+ chipType=cdffull_name.split(",",1)[0]
+ tagExt=cdffull_name.split(",",1)[1]
+ tag=tagExt.split(".",1)[0]
+ else:
+ chipType=cdffull_name.split(".",1)[0]
+ tag=""
+
+ _copy(cdffull,os.path.join(data_dir, cdffull_name))
+ _copy(ugp,os.path.join(data_dir, ugp_name))
+ _copy(ufl,os.path.join(data_dir, ufl_name))
+ _copy(acs,os.path.join(data_dir, acs_name))
+
+
+ fig_dir = os.path.join("mpagenomics", user, "figures", dataset, "signal")
+ abs_fig_dir = os.path.join(new_files_directory, fig_dir)
+
+
+ retcode = _preprocess(chipType, dataSetName, mpagenomics_dir, data_dir, options.new_file_path, options.tumorcsv, options.settingType, options.outputgraph, options.outputlog, options.log, tag)
+
+ if (retcode == 0):
+ if (os.path.isdir(abs_fig_dir)) and (outputgraph == "TRUE"):
+
+ new_files = os.listdir(abs_fig_dir)
+ zipbuf = zipfile.ZipFile(os.path.join(abs_fig_dir, zipfigures), 'w', zipfile.ZIP_DEFLATED)
+ for current_file in new_files:
+ fn = os.path.join(abs_fig_dir, current_file)
+ relfn = fn[len(abs_fig_dir) + len(os.sep):]
+ zipbuf.write(fn, relfn)
+
+ f = open(report, "w")
+ # Create report
+ try:
+ for name in extra_file_names:
+ f.write("%s\t%s\t%s\n" %(re.match(r"^\d+_task_(.*).dat$", name).group(1),dataset,chipType))
+ finally:
+ shutil.rmtree(data_dir)
+ f.close()
+
+ sys.exit(retcode)
+
+ sys.exit(retcode)
+
+
+def _copy(source, destination):
+ try:
+ os.link(source, destination)
+ except:
+ shutil.copy(source, destination)
+
+def _preprocess (chipType,dataset,mpagenomics_dir,data_dir,tmp_dir,tumor,settingType,outputgraph,outputlog,log,tag):
+ script_dir=os.path.dirname(os.path.abspath(__file__))
+
+ if (outputlog=="TRUE"):
+ errfile=open(log,'w')
+ else:
+ errfile=open(os.path.join(tmp_dir,"errfile.log"),'w')
+
+ retcode = subprocess.call(["Rscript", os.path.join(script_dir,"preprocess.R"), chipType, dataset, mpagenomics_dir, data_dir, tumor, settingType, outputgraph, tag], stdout = errfile, stderr = errfile)
+ return(retcode)
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+ main()
diff -r 000000000000 -r a89bae08bf2d preprocess.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocess.xml Mon Apr 27 05:48:52 2015 -0400
@@ -0,0 +1,155 @@
+
+
+ R_SCRIPT_PATH
+ mpagenomics
+
+
+ preprocess.py
+ -s '$summary'
+ -p '$__new_file_path__'
+ -c '$inputcdffull.name'
+ -f '$inputufl.name'
+ -g '$inputugp.name'
+ -a '$inputacs.name'
+ -d '$inputcdffull'
+ -v '$inputufl'
+ -h '$inputugp'
+ -b '$inputacs'
+ #if $settings.settingsType == "tumor":
+ -t '$tumorcsv'
+ #end if
+ #if $settings.settingsType == "standard":
+ -t 'none'
+ #end if
+ -y '$settings.settingsType'
+ -o '$outputgraph'
+ -z '$zipfigures'
+ -k '$outputlog'
+ -l '$log'
+ -u '$__user_id__'
+ #for $input in $inputs
+ -i "${input}"
+ -n "${input.name}"
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ outputgraph == "TRUE"
+
+
+ outputlog == "TRUE"
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This preprocessing step corrects biological and technical biases introduced by the experiment. Raw data from Affymetrix arrays are provided as separate CEL files. These data must be normalized before statistical analysis.
+The pre-processing is proposed as a wrapper of aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this implies that the pre-processing step is only available for Affymetrix arrays.
+
+-----
+
+**Chip file naming conventions**
+
+Chip filenames must strictly follow the following rules :
+
+- *.cdf* filename must comply with the following format : < chiptype >,< tag >.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a period) between <chiptype> and the tag "Full".
+
+- *.ufl* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl).
+
+- *.ugp* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp).
+
+- *.acs* file name must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs).
+
+-----
+
+**Normal-tumor study with TumorBoost**
+
+In cases where normal (control) samples are matched to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided :
+
+ - The first column contains the names of the files corresponding to normal samples of the dataset.
+
+ - The second column contains the names of the tumor samples files.
+
+ - Column names of these two columns are respectively normal and tumor.
+
+ - Columns are separated by a comma.
+
+ - *Extensions of the files (.CEL for example) should be removed*
+
+
+
+**Example**
+
+Consider a dataset of 6 .cel files (3 patients, each represented by a pair of normal and tumor .cel files) ::
+
+ patient1_normal.cel
+ patient1_tumor.cel
+ patient2_normal.cel
+ patient2_tumor.cel
+ patient3_normal.cel
+ patient3_tumor.cel
+
+
+The csv file should look like this ::
+
+ normal,tumor
+ patient1_normal,patient1_tumor
+ patient2_normal,patient2_tumor
+ patient3_normal,patient3_tumor
+
+
+-----
+
+**Citation**
+
+When using this tool, please cite :
+
+`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_
+
+As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 & 6. Bioinformatics, 25(17):2149–2156, 2009. <http://bioinformatics.oxfordjournals.org/content/25/17/2149.short>`_
+
+When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 <http://www.biomedcentral.com/1471-2105/11/245>`_
+
+
+
diff -r 000000000000 -r a89bae08bf2d tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Mon Apr 27 05:48:52 2015 -0400
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+
+