# HG changeset patch # User sblanck # Date 1586360057 0 # Node ID af4f63f27c77a9348087ac99ff1587dc25b82535 # Parent bbf427bd6967e59799ad785c24a179a26d243cb1 planemo upload for repository https://github.com/sblanck/MPAgenomics4Galaxy/tree/master/mpagenomics_wrappers commit 11d660a2de749dae548b2fae0dd81f9f2b2c4b4f diff -r bbf427bd6967 -r af4f63f27c77 extractCN.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extractCN.R Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,170 @@ +#!/usr/bin/env Rscript +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +library("optparse") + +##### Read options +option_list=list( + make_option("--chrom",type="character",default=NULL, dest="chrom"), + make_option("--input",type="character",default=NULL, dest="input"), + make_option("--output",type="character",default=NULL, dest="output"), + make_option("--new_file_path",type="character",default=NULL, dest="new_file_path"), + make_option("--settings_type",type="character",default=NULL, dest="settings_type"), + make_option("--settings_tumor",type="character",default=NULL, dest="settings_tumor"), + make_option("--symmetrize",type="character",default=NULL, dest="symmetrize"), + make_option("--settings_signal",type="character",default=NULL, dest="settings_signal"), + make_option("--settings_snp",type="character",default=NULL, dest="settings_snp"), + make_option("--outputlog",type="character",default=NULL, dest="outputlog"), + make_option("--log",type="character",default=NULL, dest="log"), + make_option("--userid",type="character",default=NULL, dest="userid") +); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +if(is.null(opt$input)){ + print_help(opt_parser) + stop("input required.", call.=FALSE) +} + +#loading libraries + 
+chrom=opt$chrom +input=opt$input +tmp_dir=opt$new_file_path +output=opt$output +settingsType=opt$settings_type +tumorcsv=opt$settings_tumor +symmetrize=opt$symmetrize +signal=opt$settings_signal +snp=type.convert(opt$settings_snp) +outputlog=opt$outputlog +log=opt$log +user=opt$userid + +library(MPAgenomics) +workdir=file.path(tmp_dir, "mpagenomics",user) +setwd(workdir) + +inputDataset=read.table(file=input,stringsAsFactors=FALSE) +dataset=inputDataset[1,2] + +if (outputlog){ + sinklog <- file(log, open = "wt") + sink(sinklog ,type = "output") + sink(sinklog, type = "message") +} + + +if (grepl("all",tolower(chrom)) | chrom=="None") { + chrom_vec=c(1:25) + } else { + chrom_tmp <- strsplit(chrom,",") + chrom_vecstring <-unlist(chrom_tmp) + chrom_vec <- as.numeric(chrom_vecstring) + } +if (signal == "CN") +{ + if (settingsType == "dataset") { + if (tumorcsv== "None") + { + CN=getCopyNumberSignal(dataset,chromosome=chrom_vec, onlySNP=snp) + + } else { + CN=getCopyNumberSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, onlySNP=snp) + } + } else { + input_tmp <- strsplit(settingsType,",") + input_tmp_vecstring <-unlist(input_tmp) + input_vecstring = sub("^([^.]*).*", "\\1", input_tmp_vecstring) + if (tumorcsv== "None") + { + CN=getCopyNumberSignal(dataset,chromosome=chrom_vec, listOfFiles=input_vecstring, onlySNP=snp) + } else { + CN=getCopyNumberSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, listOfFiles=input_vecstring, onlySNP=snp ) + } + } + + list_chr=names(CN) + CN_global=data.frame(check.names = FALSE) + for (i in list_chr) { + chr_data=data.frame(CN[[i]],check.names = FALSE) + CN_global=rbind(CN_global,chr_data) + } + names(CN_global)[names(CN_global)=="featureNames"] <- "probeName" + write.table(format(CN_global), output, row.names = FALSE, quote = FALSE, sep = "\t") + +} else { + if (symmetrize=="TRUE") { + if (settingsType == "dataset") { + input_vecstring = getListOfFiles(dataset) + } else { + input_tmp <- 
strsplit(settingsType,",") + input_tmp_vecstring <-unlist(input_tmp) + input_vecstring = sub("^([^.]*).*", "\\1", input_tmp_vecstring) + } + + symFracB_global=data.frame(check.names = FALSE) + + for (currentFile in input_vecstring) { + cat(paste0("extracting signal from ",currentFile,".\n")) + currentSymFracB=data.frame() + symFracB=getSymFracBSignal(dataset,chromosome=chrom_vec,file=currentFile,normalTumorArray=tumorcsv) + list_chr=names(symFracB) + for (i in list_chr) { + cat(paste0(" extracting ",i,".\n")) + chr_data=data.frame(symFracB[[i]]$tumor,check.names = FALSE) + currentSymFracB=rbind(currentSymFracB,chr_data) + + } + if (is.null(symFracB_global) || nrow(symFracB_global)==0) { + symFracB_global=currentSymFracB + } else { + symFracB_global=cbind(symFracB_global,currentFile=currentSymFracB[[3]]) + } + } + names(symFracB_global)[names(symFracB_global)=="featureNames"] <- "probeName" + + write.table(format(symFracB_global), output, row.names = FALSE, quote = FALSE, sep = "\t") + } else { + if (settingsType == "dataset") { + if (tumorcsv== "None") + { + fracB=getFracBSignal(dataset,chromosome=chrom_vec) + + } else { + fracB=getFracBSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv) + } + } else { + input_tmp <- strsplit(settingsType,",") + input_tmp_vecstring <-unlist(input_tmp) + input_vecstring = sub("^([^.]*).*", "\\1", input_tmp_vecstring) + if (tumorcsv== "None") + { + fracB=getFracBSignal(dataset,chromosome=chrom_vec, listOfFiles=input_vecstring) + } else { + fracB=getFracBSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, listOfFiles=input_vecstring) + } + } + #formatage des données + list_chr=names(fracB) + fracB_global=data.frame(check.names = FALSE) + for (i in list_chr) { + chr_data=data.frame(fracB[[i]]$tumor,check.names = FALSE) + fracB_global=rbind(fracB_global,chr_data) + } + names(fracB_global)[names(fracB_global)=="featureNames"] <- "probeName" + write.table(format(fracB_global), output, row.names = FALSE, quote = 
FALSE, sep = "\t") + } + +} + +if (outputlog){ + sink(type="output") + sink(type="message") + close(sinklog) +} \ No newline at end of file diff -r bbf427bd6967 -r af4f63f27c77 extractCN.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extractCN.xml Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,222 @@ + + copy number or allele B fraction signal + mpagenomics + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + outputlog == "TRUE" + + + + + + +.. class:: warningmark + +Data normalization must be run (with the data normalization tool) prior to signal extraction. + +----- + +**What it does** +This tool extracts the copy number profile from the normalized data. + +Outputs: + +*A tabular text file containing 3 fixed columns and 1 column per sample:* + + - chr: Chromosome. + - position: Genomic position (in bp). + - probeNames: Name of the probes of the microarray. + - One column per sample which contains the copy number profile for each sample. + +----- + +**Normal-tumor study** + +In cases where normal (control) samples match to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided : + + - The first column contains the names of the files corresponding to normal samples of the dataset. + + - The second column contains the names of the tumor samples files. + + - Column names of these two columns are respectively normal and tumor. + + - Columns are separated by a comma. + + - *Extensions of the files (.CEL for example) should be removed* + + + +**Example** + +Let 6 .cel files in the studied dataset (3 patients, each of them being represented by a couple of normal and tumor cel files.) 
:: + + patient1_normal.cel + patient1_tumor.cel + patient2_normal.cel + patient2_tumor.cel + patient3_normal.cel + patient3_tumor.cel + + +The csv file should look like this :: + + normal,tumor + patient1_normal,patient1_tumor + patient2_normal,patient2_tumor + patient3_normal,patient3_tumor + +----- + + +**Citation** + +If you use this tool please cite : + +`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. Mpagenomics : An r package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ + + + + diff -r bbf427bd6967 -r af4f63f27c77 filter.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter.R Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,67 @@ +#!/usr/bin/env Rscript +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +library("optparse") + +##### Read options +option_list=list( + make_option("--input",type="character",default=NULL, dest="input"), + make_option("--output",type="character",default=NULL, dest="output"), + make_option("--new_file_path",type="character",default=NULL, dest="new_file_path"), + make_option("--nbcall",type="character",default=NULL, dest="nbcall"), + make_option("--length",type="character",default=NULL, dest="length"), + make_option("--probes",type="character",default=NULL, dest="probes"), + make_option("--outputlog",type="character",default=NULL, dest="outputlog"), + make_option("--log",type="character",default=NULL, dest="log") + ); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +if(is.null(opt$input)){ + print_help(opt_parser) + stop("input required.", call.=FALSE) +} + +#loading libraries + +input=opt$input +output=opt$output +tmp_dir=opt$new_file_path +nbcall=opt$nbcall +length=as.numeric(opt$length) 
+probes=as.numeric(opt$probes) +log=opt$log +outputlog=opt$outputlog + +if (outputlog){ + sinklog <- file(log, open = "wt") + sink(sinklog ,type = "output") + sink(sinklog, type = "message") +} + +nbcall_tmp <- strsplit(nbcall,",") +nbcall_vecstring <-unlist(nbcall_tmp) + +nbcall_vecstring + +library(MPAgenomics) +workdir=file.path(tmp_dir, "mpagenomics") +setwd(workdir) + +segcall = read.table(input, header = TRUE) +filtercall=filterSeg(segcall,length,probes,nbcall_vecstring) +#sink(output) +#print(format(filtercall),row.names=FALSE) +#sink() +if (outputlog){ + sink(type="output") + sink(type="message") + close(sinklog) +} +write.table(filtercall,output,row.names = FALSE, quote = FALSE, sep = "\t") + diff -r bbf427bd6967 -r af4f63f27c77 filter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter.xml Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,74 @@ + + mpagenomics + + + + + + + + + + + + + + + + + + + + + + + + + outputlog == "TRUE" + + + + + + + + +**What it does** + +This tool filters results obtained by the segmentation and calling tool. + +----- + +Input/Output file: + +*A tabular text file containing 7 columns:* + + - sampleNames: Name of the file. + - chrom: Chromosome of the segment. + - chromStart: Starting position (in bp) of the segment. This position is not included in the segment. + - chromEnd: Ending position (in bp) of the segment. This position is included in the segment. + - probes: Number of probes in the segment. + - means: Mean of the segment. + - calls: Calling of the segment (”double loss”, ”loss”, ”normal”, ”gain” or ”amplification”). + +----- + +**Citation** + +If you use this tool please cite : + +`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. 
Preprint <http://fr.arxiv.org/abs/1401.5035>`_ + + + diff -r bbf427bd6967 -r af4f63f27c77 markersSelection.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/markersSelection.R Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,35 @@ +args<-commandArgs(TRUE) + +input=args[1] +response=args[2] +tmp_dir=args[3] +nbFolds=as.numeric(args[4]) +loss=args[5] +output=args[6] + +library(MPAgenomics) +workdir=file.path(tmp_dir, "mpagenomics") +setwd(workdir) + +CN=read.table(input,header=TRUE,check.names=FALSE) +drops=c("chromosome","position","probeName") +CNsignal=CN[,!(names(CN)%in% drops)] +samples=names(CNsignal) +CNsignalMatrix=t(data.matrix(CNsignal)) +resp=read.table(response,header=TRUE,sep=",") +listOfFile=resp[[1]] +responseValue=resp[[2]] +index = match(listOfFile,rownames(CNsignalMatrix)) +responseValueOrder=responseValue[index] + +result=variableSelection(CNsignalMatrix,responseValueOrder,nbFolds=nbFolds,loss=loss,plot=TRUE) + +CNsignalResult=CN[result$markers.index,(names(CN)%in% drops)] + +CNsignalResult["coefficient"]=result$coefficient +CNsignalResult["index"]=result$markers.index + +sink(output) +print(format(CNsignalResult),row.names=FALSE) +sink() +#write.table(CNsignalResult,output,row.names = FALSE, quote=FALSE, sep = "\t") diff -r bbf427bd6967 -r af4f63f27c77 markersSelection.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/markersSelection.py Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,34 @@ +import os +import sys +import subprocess + +def main(): + + inputdata=sys.argv[1] + response=sys.argv[2] + tmp_dir=sys.argv[3] + nbfold=sys.argv[4] + loss=sys.argv[5] + outputlog=sys.argv[6] + output=sys.argv[7] + log=sys.argv[8] + + script_dir=os.path.dirname(os.path.abspath(__file__)) + + if (outputlog=="TRUE"): + errfile=open(log,'w') + else: + errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') + + + retcode=subprocess.call(["Rscript", os.path.join(script_dir,"markersSelection.R"), inputdata, response, tmp_dir, nbfold, loss, output], stdout = errfile, stderr = 
errfile) + +# if (plot=="TRUE"): +# shutil.copy(os.path.join(tmp_dir,"mpagenomics","Rplots.pdf"), pdffigures) + + errfile.close() + + sys.exit(retcode) + +if __name__ == "__main__": + main() \ No newline at end of file diff -r bbf427bd6967 -r af4f63f27c77 markersSelection.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/markersSelection.xml Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,100 @@ + + mpagenomics + + markersSelection.py '$input' '$response' '$__new_file_path__' '$folds' '$loss' '$outputlog' '$output' '$log' + + + + + + + + + + + + + + + + + + + + + + + outputlog == "TRUE" + + + + + + + **What it does** + +This tool selects some relevant markers according to a response using penalized regressions. + +Input: + +*A tabular text file containing 3 fixed columns and 1 column per sample:* + + - chr: Chromosome. + - position: Genomic position (in bp). + - probeNames: Names of the probes. + - One column per sample which contain the copy number signal for each sample. + +Output: + +*A tabular text file containing 5 columns which describe all the selected SNPs (1 line per SNP):* + + - chr: Chromosome containing the selected SNP. + - position: Position of the selected SNP. + - index: Index of the selected SNP. + - names: Name of the selected SNP. + - coefficient: Regression coefficient of the selected SNP. + +----- + +**Data Response csv file** + +Data response csv file format: + + - The first column contains the names of the different files of the dataset. + + - The second column is the response associated with each file. + + - Column names of these two columns are respectively files and response. 
+ + - Columns are separated by a comma + + - *Extensions of the files (.CEL for example) should be removed* + + + +**Example** + +Let 3 .cel files in the studied dataset :: + + patient1.cel + patient2.cel + patient3.cel + +The csv file should look like this :: + + files,response + patient1,1.92145 + patient2,2.12481 + patient3,1.23545 + + +----- + +**Citation** + +If you use this tool please cite : + +`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ + + + diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/extractCN.R --- a/mpagenomics_normalize-7dc6ce39fb89/extractCN.R Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ -args<-commandArgs(TRUE) - -chrom=args[1] -dataset=args[2] -output=args[3] -tmp_dir=args[4] -input=args[5] -tumorcsv=args[6] -signal=args[7] -snp=type.convert(args[8]) -user=args[9] -symmetrize=args[10] - -library(MPAgenomics) -workdir=file.path(tmp_dir, "mpagenomics",user) -setwd(workdir) - - -if (grepl("all",tolower(chrom)) | chrom=="None") { - chrom_vec=c(1:25) - } else { - chrom_tmp <- strsplit(chrom,",") - chrom_vecstring <-unlist(chrom_tmp) - chrom_vec <- as.numeric(chrom_vecstring) - } -if (signal == "CN") -{ - if (input == "dataset") { - if (tumorcsv== "None") - { - CN=getCopyNumberSignal(dataset,chromosome=chrom_vec, onlySNP=snp) - - } else { - CN=getCopyNumberSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, onlySNP=snp) - } - } else { - input_tmp <- strsplit(input,",") - input_tmp_vecstring <-unlist(input_tmp) - input_vecstring = sub("^([^.]*).*", "\\1", input_tmp_vecstring) - if (tumorcsv== "None") - { - CN=getCopyNumberSignal(dataset,chromosome=chrom_vec, listOfFiles=input_vecstring, onlySNP=snp) - } else { - CN=getCopyNumberSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, listOfFiles=input_vecstring, 
onlySNP=snp ) - } - } - - list_chr=names(CN) - CN_global=data.frame(check.names = FALSE) - for (i in list_chr) { - chr_data=data.frame(CN[[i]],check.names = FALSE) - CN_global=rbind(CN_global,chr_data) - } - names(CN_global)[names(CN_global)=="featureNames"] <- "probeName" - write.table(format(CN_global), output, row.names = FALSE, quote = FALSE, sep = "\t") - -} else { - if (symmetrize=="TRUE") { - if (input == "dataset") { - input_vecstring = getListOfFiles(dataset) - } else { - input_tmp <- strsplit(input,",") - input_tmp_vecstring <-unlist(input_tmp) - input_vecstring = sub("^([^.]*).*", "\\1", input_tmp_vecstring) - } - - symFracB_global=data.frame(check.names = FALSE) - - for (currentFile in input_vecstring) { - cat(paste0("extracting signal from ",currentFile,".\n")) - currentSymFracB=data.frame() - symFracB=getSymFracBSignal(dataset,chromosome=chrom_vec,file=currentFile,normalTumorArray=tumorcsv) - list_chr=names(symFracB) - for (i in list_chr) { - cat(paste0(" extracting ",i,".\n")) - chr_data=data.frame(symFracB[[i]]$tumor,check.names = FALSE) - currentSymFracB=rbind(currentSymFracB,chr_data) - - } - if (is.null(symFracB_global) || nrow(symFracB_global)==0) { - symFracB_global=currentSymFracB - } else { - symFracB_global=cbind(symFracB_global,currentFile=currentSymFracB[[3]]) - } - } - names(symFracB_global)[names(symFracB_global)=="featureNames"] <- "probeName" - - write.table(format(symFracB_global), output, row.names = FALSE, quote = FALSE, sep = "\t") - } else { - if (input == "dataset") { - if (tumorcsv== "None") - { - fracB=getFracBSignal(dataset,chromosome=chrom_vec) - - } else { - fracB=getFracBSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv) - } - } else { - input_tmp <- strsplit(input,",") - input_tmp_vecstring <-unlist(input_tmp) - input_vecstring = sub("^([^.]*).*", "\\1", input_tmp_vecstring) - if (tumorcsv== "None") - { - fracB=getFracBSignal(dataset,chromosome=chrom_vec, listOfFiles=input_vecstring) - } else { - 
fracB=getFracBSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, listOfFiles=input_vecstring) - } - } - #formatage des données - list_chr=names(fracB) - fracB_global=data.frame(check.names = FALSE) - for (i in list_chr) { - chr_data=data.frame(fracB[[i]]$tumor,check.names = FALSE) - fracB_global=rbind(fracB_global,chr_data) - } - names(fracB_global)[names(fracB_global)=="featureNames"] <- "probeName" - write.table(format(fracB_global), output, row.names = FALSE, quote = FALSE, sep = "\t") - } - -} \ No newline at end of file diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/extractCN.py --- a/mpagenomics_normalize-7dc6ce39fb89/extractCN.py Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -import os -import sys -import subprocess -import getopt - -def main(argv): - - symmetrize="False" - - try: - opts, args = getopt.getopt(argv,"hc:i:o:f:s:y:t:p:l:g:n:u:",["chrom=","input=","output=","new_file_path=","settings_type=","settings_tumor=","symmetrize=","outputlog=","log=","settings_signal=","settings_snp=","userid="]) - except getopt.GetoptError as err: - print str(err) - sys.exit(2) - for opt, arg in opts: - if opt == '-h': - print 'extractCN.py' - sys.exit() - elif opt in ("-c", "--chrom"): - chromosome = arg - elif opt in ("-i", "--input"): - input_file = arg - elif opt in ("-o", "--output"): - output_file = arg - elif opt in ("-f", "--new_file_path"): - tmp_dir = arg - elif opt in ("-s", "--settings_type"): - input_type = arg - elif opt in ("-t", "--settings_tumor"): - settings_tumor = arg - elif opt in ("-y", "--symmetrize"): - symmetrize = arg - elif opt in ("-p", "--outputlog"): - outputlog = arg - elif opt in ("-l", "--log"): - log = arg - elif opt in ("-g", "--settings_signal"): - signal = arg - elif opt in ("-n", "--settings_snp"): - snp = arg - elif opt in ("-u", "--userid"): - user_id = arg - - - - #=========================================================================== - 
#chromosome=sys.argv[1] - #input_file=sys.argv[2] - # output_file=sys.argv[3] - # tmp_dir=sys.argv[4] - # input_type=sys.argv[5] - # settings_tumor=sys.argv[6] - # outputlog=sys.argv[7] - # log=sys.argv[8] - # signal=sys.argv[9] - # snp=sys.argv[10] - # user_id=sys.argv[11] - #=========================================================================== - script_dir=os.path.dirname(os.path.abspath(__file__)) - - iFile=open(input_file,'r') - dataSetLine=iFile.readline() - dataset=dataSetLine.split("\t")[1] - iFile.close() - - if (outputlog=="TRUE"): - errfile=open(log,'w') - else: - errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') - - retcode=subprocess.call(["Rscript", os.path.join(script_dir,"extractCN.R"), chromosome, dataset, output_file, tmp_dir, input_type, settings_tumor, signal,snp,user_id, symmetrize], stdout = errfile, stderr = errfile) - - errfile.close() - - sys.exit(retcode) - -if __name__ == "__main__": - main(main(sys.argv[1:])) diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/extractCN.xml --- a/mpagenomics_normalize-7dc6ce39fb89/extractCN.xml Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,219 +0,0 @@ - - copy number or allele B fraction signal - mpagenomics - - extractCN.py - --chrom '$chrom' - --input '$input' - --output '$output' - --new_file_path '$__new_file_path__' - #if $settings.settingsType == "file": - --settings_type '$settings.inputs' - #end if - #if $settings.settingsType == "dataset": - --settings_type 'dataset' - #end if - #if $settingsSNP.signal == "fracB": - --settings_snp 'TRUE' - - #if $settingsSNP.sym.symmetrize=="TRUE" - --settings_tumor '$tumorcsvFracBsym' - #elif $settingsSNP.sym.symmetrize=="FALSE" - #if $settingsSNP.sym.settingsTumorFracB.settingsTypeTumorFracB == "standard": - --settings_tumor 'None' - #elif $settingsSNP.sym.settingsTumorFracB.settingsTypeTumorFracB == "tumor": - --settings_tumor '$tumorcsvFracB' - #end if - #end if - --symmetrize 
'$settingsSNP.sym.symmetrize' - #else - --settings_snp '$settingsSNP.snp' - #if $settingsSNP.settingsTumor.settingsTypeTumor == "standard": - --settings_tumor 'None' - #elif $settingsSNP.settingsTumor.settingsTypeTumor == "tumor": - --settings_tumor '$tumorcsvCN' - #end if - #end if - --outputlog '$outputlog' - --log '$log' - --settings_signal '$settingsSNP.signal' - --userid '$__user_id__' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - outputlog == "TRUE" - - - - - - -.. class:: warningmark - -Data normalization must be run (with the data normalization tool) prior to signal extraction. - ------ - -**What it does** -This tool extracts the copy number profile from the normalized data. - -Outputs: - -*A tabular text file containing 3 fixed columns and 1 column per sample:* - - - chr: Chromosome. - - position: Genomic position (in bp). - - probeNames: Name of the probes of the microarray. - - One column per sample which contains the copy number profile for each sample. - ------ - -**Normal-tumor study** - -In cases where normal (control) samples match to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided : - - - The first column contains the names of the files corresponding to normal samples of the dataset. - - - The second column contains the names of the tumor samples files. - - - Column names of these two columns are respectively normal and tumor. - - - Columns are separated by a comma. - - - *Extensions of the files (.CEL for example) should be removed* - - - -**Example** - -Let 6 .cel files in the studied dataset (3 patients, each of them being represented by a couple of normal and tumor cel files.) 
:: - - patient1_normal.cel - patient1_tumor.cel - patient2_normal.cel - patient2_tumor.cel - patient3_normal.cel - patient3_tumor.cel - - -The csv file should look like this :: - - normal,tumor - patient1_normal,patient1_tumor - patient2_normal,patient2_tumor - patient3_normal,patient3_tumor - ------ - - -**Citation** - -If you use this tool please cite : - -`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. Mpagenomics : An r package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ - - - - diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/filter.R --- a/mpagenomics_normalize-7dc6ce39fb89/filter.R Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -args<-commandArgs(TRUE) - -input=args[1] -length=as.numeric(args[2]) -probes=as.numeric(args[3]) -tmp_dir=args[4] -nbcall=as.vector(args[5]) -output=args[6] - -nbcall_tmp <- strsplit(nbcall,",") -nbcall_vecstring <-unlist(nbcall_tmp) - -nbcall_vecstring - -library(MPAgenomics) -workdir=file.path(tmp_dir, "mpagenomics") -setwd(workdir) - -segcall = read.table(input, header = TRUE) -filtercall=filterSeg(segcall,length,probes,nbcall_vecstring) -sink(output) -print(format(filtercall),row.names=FALSE) -sink() -#write.table(filtercall,output,row.names = FALSE, quote = FALSE, sep = "\t") - diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/filter.py --- a/mpagenomics_normalize-7dc6ce39fb89/filter.py Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -import os -import sys -import subprocess - -def main(): - - tmp_dir=sys.argv[4] - outputlog=sys.argv[7] - log=sys.argv[8] - script_dir=os.path.dirname(os.path.abspath(__file__)) - - if (outputlog=="TRUE"): - errfile=open(log,'w') - else: - errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') - - - retcode=(subprocess.call(["Rscript", os.path.join(script_dir,"filter.R"), sys.argv[1], 
sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6]], stdout = errfile, stderr = errfile)) - errfile.close(); - sys.exit(retcode) - -if __name__ == "__main__": - main() \ No newline at end of file diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/filter.xml --- a/mpagenomics_normalize-7dc6ce39fb89/filter.xml Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - mpagenomics - - filter.py '$input' '$length' '$probes' '$__new_file_path__' '$nbcall' '$output' '$outputlog' '$log' - - - - - - - - - - - - - - - - - - - - - - - outputlog == "TRUE" - - - - - - - - -**What it does** - -This tool filters results obtained by the segmentation and calling tool. - ------ - -Input/Output file: - -*A tabular text file containing 7 columns:* - - - sampleNames: Name of the file. - - chrom: Chromosome of the segment. - - chromStart: Starting position (in bp) of the segment. This position is not included in the segment. - - chromEnd: Ending position (in bp) of the segment. This position is included in the segment. - - probes: Number of probes in the segment. - - means: Mean of the segment. - - calls: Calling of the segment (”double loss”, ”loss”, ”normal”, ”gain” or ”amplification”). - ------ - -**Citation** - -If you use this tool please cite : - -`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. 
Preprint <http://fr.arxiv.org/abs/1401.5035>`_ - - - diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/markersSelection.R --- a/mpagenomics_normalize-7dc6ce39fb89/markersSelection.R Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -args<-commandArgs(TRUE) - -input=args[1] -response=args[2] -tmp_dir=args[3] -nbFolds=as.numeric(args[4]) -loss=args[5] -output=args[6] - -library(MPAgenomics) -workdir=file.path(tmp_dir, "mpagenomics") -setwd(workdir) - -CN=read.table(input,header=TRUE,check.names=FALSE) -drops=c("chromosome","position","probeName") -CNsignal=CN[,!(names(CN)%in% drops)] -samples=names(CNsignal) -CNsignalMatrix=t(data.matrix(CNsignal)) -resp=read.table(response,header=TRUE,sep=",") -listOfFile=resp[[1]] -responseValue=resp[[2]] -index = match(listOfFile,rownames(CNsignalMatrix)) -responseValueOrder=responseValue[index] - -result=variableSelection(CNsignalMatrix,responseValueOrder,nbFolds=nbFolds,loss=loss,plot=TRUE) - -CNsignalResult=CN[result$markers.index,(names(CN)%in% drops)] - -CNsignalResult["coefficient"]=result$coefficient -CNsignalResult["index"]=result$markers.index - -sink(output) -print(format(CNsignalResult),row.names=FALSE) -sink() -#write.table(CNsignalResult,output,row.names = FALSE, quote=FALSE, sep = "\t") diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/markersSelection.py --- a/mpagenomics_normalize-7dc6ce39fb89/markersSelection.py Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -import os -import sys -import subprocess - -def main(): - - inputdata=sys.argv[1] - response=sys.argv[2] - tmp_dir=sys.argv[3] - nbfold=sys.argv[4] - loss=sys.argv[5] - outputlog=sys.argv[6] - output=sys.argv[7] - log=sys.argv[8] - - script_dir=os.path.dirname(os.path.abspath(__file__)) - - if (outputlog=="TRUE"): - errfile=open(log,'w') - else: - errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') - - - 
retcode=subprocess.call(["Rscript", os.path.join(script_dir,"markersSelection.R"), inputdata, response, tmp_dir, nbfold, loss, output], stdout = errfile, stderr = errfile) - -# if (plot=="TRUE"): -# shutil.copy(os.path.join(tmp_dir,"mpagenomics","Rplots.pdf"), pdffigures) - - errfile.close() - - sys.exit(retcode) - -if __name__ == "__main__": - main() \ No newline at end of file diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/markersSelection.xml --- a/mpagenomics_normalize-7dc6ce39fb89/markersSelection.xml Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ - - mpagenomics - - markersSelection.py '$input' '$response' '$__new_file_path__' '$folds' '$loss' '$outputlog' '$output' '$log' - - - - - - - - - - - - - - - - - - - - - - - outputlog == "TRUE" - - - - - - - **What it does** - -This tool selects some relevant markers according to a response using penalized regressions. - -Input: - -*A tabular text file containing 3 fixed columns and 1 column per sample:* - - - chr: Chromosome. - - position: Genomic position (in bp). - - probeNames: Names of the probes. - - One column per sample which contain the copy number signal for each sample. - -Output: - -*A tabular text file containing 5 columns which describe all the selected SNPs (1 line per SNP):* - - - chr: Chromosome containing the selected SNP. - - position: Position of the selected SNP. - - index: Index of the selected SNP. - - names: Name of the selected SNP. - - coefficient: Regression coefficient of the selected SNP. - ------ - -**Data Response csv file** - -Data response csv file format: - - - The first column contains the names of the different files of the dataset. - - - The second column is the response associated with each file. - - - Column names of these two columns are respectively files and response. 
- - - Columns are separated by a comma - - - *Extensions of the files (.CEL for example) should be removed* - - - -**Example** - -Let 3 .cel files in the studied dataset :: - - patient1.cel - patient2.cel - patient3.cel - -The csv file should look like this :: - - files,response - patient1,1.92145 - patient2,2.12481 - patient3,1.23545 - - ------ - -**Citation** - -If you use this tool please cite : - -`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ - - - diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/preprocess.R --- a/mpagenomics_normalize-7dc6ce39fb89/preprocess.R Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -args<-commandArgs(TRUE) - -chip=args[1] -dataset=args[2] -workdir=args[3] -celPath=args[4] -chipPath=args[4] -tumor=args[5] -settingType=args[6] -outputgraph=type.convert(args[7]) -tag=args[8] - -if (tag=="") -{ - tag=NULL -} - -library(MPAgenomics) -setwd(workdir) -if (settingType=="standard") -{ - signalPreProcess(dataSetName=dataset, chipType=chip, dataSetPath=celPath,chipFilesPath=chipPath, path=workdir,createArchitecture=TRUE, savePlot=outputgraph, tags=tag) -} else { - signalPreProcess(dataSetName=dataset, chipType=chip, dataSetPath=celPath,chipFilesPath=chipPath, normalTumorArray=tumor, path=workdir,createArchitecture=TRUE, savePlot=outputgraph, tags=tag) -} \ No newline at end of file diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/preprocess.py --- a/mpagenomics_normalize-7dc6ce39fb89/preprocess.py Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,121 +0,0 @@ -import os -import re -import shutil -import sys -import subprocess -import zipfile -import optparse - -def main(): - - parser = optparse.OptionParser() - parser.add_option('-s', action="store", dest='summary') - 
parser.add_option('-e', action="store", dest='dataSetName') - parser.add_option('-p', action="store", dest='new_file_path') - parser.add_option('-c', action="store", dest='inputcdffull_name') - parser.add_option('-f', action="store", dest='inputufl_name') - parser.add_option('-g', action="store", dest='inputugp_name') - parser.add_option('-a', action="store", dest='inputacs_name') - parser.add_option('-d', action="store", dest='inputcdffull') - parser.add_option('-v', action="store", dest='inputufl') - parser.add_option('-w', action="store", dest='inputugp') - parser.add_option('-b', action="store", dest='inputacs') - parser.add_option('-t', action="store", dest='tumorcsv') - parser.add_option('-y', action="store", dest='settingsType') - parser.add_option('-o', action="store", dest='outputgraph') - parser.add_option('-z', action="store", dest='zipfigures') - parser.add_option('-k', action="store", dest='outputlog') - parser.add_option('-l', action="store", dest='log') - parser.add_option('-u', action="store", dest='user_id') - - parser.add_option('-i', action="append", dest='inputFile', default=[]) - parser.add_option('-n', action='append', dest='inputFileName', default=[]) - - options, args = parser.parse_args() - - dataSetName=options.dataSetName - destinationPath=os.path.join(options.new_file_path, options.user_id, dataSetName) - - mpagenomics_dir = os.path.join(options.new_file_path,"mpagenomics",options.user_id) - data_dir = os.path.join(options.new_file_path, options.user_id) - - try: - os.makedirs(data_dir) - except: - shutil.rmtree(data_dir) - os.makedirs(data_dir) - - if (not os.path.isdir(mpagenomics_dir)): - os.makedirs(mpagenomics_dir) - - for inputFile, inputFileName in zip(options.inputFile,options.inputFileName): - source = inputFile - destination=os.path.join(data_dir,inputFileName) - _copy(source,destination) - - - cdffull_name=options.inputcdffull_name - if (cdffull_name.count(",") != 0): - chipType=cdffull_name.split(",",1)[0] - 
tagExt=cdffull_name.split(",",1)[1] - tag=tagExt.split(".",1)[0] - else: - chipType=cdffull_name.split(".",1)[0] - tag="" - - _copy(options.inputcdffull,os.path.join(data_dir, options.inputcdffull_name)) - _copy(options.inputugp,os.path.join(data_dir, options.inputugp_name)) - _copy(options.inputufl,os.path.join(data_dir, options.inputufl_name)) - _copy(options.inputacs,os.path.join(data_dir, options.inputacs_name)) - - - fig_dir = os.path.join("mpagenomics", options.user_id, "figures", dataSetName, "signal") - abs_fig_dir = os.path.join(options.new_file_path, fig_dir) - - - retcode = _preprocess(chipType, dataSetName, mpagenomics_dir, data_dir, options.new_file_path, options.tumorcsv, options.settingsType, options.outputgraph, options.outputlog, options.log, tag) - - if (retcode == 0): - if (os.path.isdir(abs_fig_dir)) and (options.outputgraph == "TRUE"): - - new_files = os.listdir(abs_fig_dir) - zipbuf = zipfile.ZipFile(os.path.join(abs_fig_dir, options.zipfigures), 'w', zipfile.ZIP_DEFLATED) - for current_file in new_files: - fn = os.path.join(abs_fig_dir, current_file) - relfn = fn[len(abs_fig_dir) + len(os.sep):] - zipbuf.write(fn, relfn) - - f = open(options.summary, "w") - # Create report - try: - for inputFileName in options.inputFileName: - f.write("%s\t%s\t%s\n" %(inputFileName,dataSetName,chipType)) - finally: - shutil.rmtree(data_dir) - f.close() - - sys.exit(retcode) - - sys.exit(retcode) - - -def _copy(source, destination): - try: - os.symlink(source, destination) - except: - shutil.copy(source, destination) - -def _preprocess (chipType,dataset,mpagenomics_dir,data_dir,tmp_dir,tumor,settingType,outputgraph,outputlog,log,tag): - script_dir=os.path.dirname(os.path.abspath(__file__)) - - if (outputlog=="TRUE"): - errfile=open(log,'w') - else: - errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') - - retcode = subprocess.call(["Rscript", os.path.join(script_dir,"preprocess.R"), chipType, dataset, mpagenomics_dir, data_dir, tumor, settingType, 
outputgraph, tag], stdout = errfile, stderr = errfile) - return(retcode) - - -if __name__ == "__main__": - main() diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/preprocess.xml --- a/mpagenomics_normalize-7dc6ce39fb89/preprocess.xml Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,157 +0,0 @@ - - - - mpagenomics - - - preprocess.py - -s '$summary' - -p '$__new_file_path__' - -c '$inputcdffull.name' - -f '$inputufl.name' - -g '$inputugp.name' - -a '$inputacs.name' - -d '$inputcdffull' - -v '$inputufl' - -w '$inputugp' - -b '$inputacs' - -e '$datasetName' - #if $settings.settingsType == "tumor": - -t '$tumorcsv' - #end if - #if $settings.settingsType == "standard": - -t 'none' - #end if - -y '$settings.settingsType' - -o '$outputgraph' - -z '$zipfigures' - -k '$outputlog' - -l '$log' - -u '$__user_id__' - #for $input in $inputs - -i "${input}" - -n "${input.name}" - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - outputgraph == "TRUE" - - - outputlog == "TRUE" - - - - - - - - - -**What it does** - -This preprocessing step consists in a correction of biological and technical biaises due to the experiment. Raw data from Affymetrix arrays are provided in different CEL files. These data must be normalized before statistical analysis. -The pre-processing is proposed as a wrapper of aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this implies that the pre-processing step is only available for Affymetrix arrays. - ------ - -**Chip file naming conventions** - -Chip filenames must strictly follow the following rules : - -- *.cdf* filename must comply with the following format : < chiptype >,< tag >.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a point) between <chiptype> and the tag "Full". 
- -- *.ufl* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl). - -- *.ugp* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp). - -- *.acs* file name must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs). - ------ - -**Normal-tumor study with TumorBoost** - -In cases where normal (control) samples match to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided : - - - The first column contains the names of the files corresponding to normal samples of the dataset. - - - The second column contains the names of the tumor samples files. - - - Column names of these two columns are respectively normal and tumor. - - - Columns are separated by a comma. - - - *Extensions of the files (.CEL for example) should be removed* - - - -**Example** - -Let 6 .cel files in the dataset studied (3 patients, each of them being represented by a couple of normal and tumor cel files.) :: - - patient1_normal.cel - patient1_tumor.cel - patient2_normal.cel - patient2_tumor.cel - patient3_normal.cel - patient3_tumor.cel - - -The csv file should look like this :: - - normal,tumor - patient1_normal,patient1_tumor - patient2_normal,patient2_tumor - patient3_normal,patient3_tumor - - ------ - -**Citation** - -When using this tool, please cite : - -`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ - -As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 & 6. Bioinformatics, 5(17):2149–2156, 2009. 
<http://bioinformatics.oxfordjournals.org/content/25/17/2149.short>`_ - -When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 <http://www.biomedcentral.com/1471-2105/11/245>`_ - - - diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/segcall.R --- a/mpagenomics_normalize-7dc6ce39fb89/segcall.R Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -args<-commandArgs(TRUE) - -chrom=args[1] -dataset=args[2] -output=args[3] -tmp_dir=args[4] -nbcall=as.numeric(args[5]) -input=args[6] -outputfigures=type.convert(args[7]) -snp=type.convert(args[8]) -tumorcsv=args[9] -cellularity=as.numeric(args[10]) -user=args[11] -method=args[12] - -library(MPAgenomics) -workdir=file.path(tmp_dir, "mpagenomics",user) -setwd(workdir) - -if (grepl("all",tolower(chrom)) | chrom=="None") { - chrom_vec=c(1:25) - } else { - chrom_tmp <- strsplit(chrom,",") - chrom_vecstring <-unlist(chrom_tmp) - chrom_vec <- as.numeric(chrom_vecstring) - } - -input_tmp <- strsplit(input,",") -input_tmp_vecstring <-unlist(input_tmp) - - -input_vecstring = sub("^([^.]*).*", "\\1", input_tmp_vecstring) - -if (dataset == input) { - if (tumorcsv== "none") - { - segcall=cnSegCallingProcess(dataset,chromosome=chrom_vec, nclass=nbcall, savePlot=outputfigures,onlySNP=snp, cellularity=cellularity, method=method) - } else { - segcall=cnSegCallingProcess(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, nclass=nbcall, savePlot=outputfigures,onlySNP=snp, cellularity=cellularity, method=method) - } -} else { - if (tumorcsv== "none") - { - segcall=cnSegCallingProcess(dataset,chromosome=chrom_vec, listOfFiles=input_vecstring, nclass=nbcall, savePlot=outputfigures, onlySNP=snp, cellularity=cellularity, method=method) - } else { - 
segcall=cnSegCallingProcess(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, listOfFiles=input_vecstring, nclass=nbcall, savePlot=outputfigures, onlySNP=snp, cellularity=cellularity, method=method) - } -} - -sink(output) -print(format(segcall)) -sink() -#write.table(format(segcall),output,row.names = FALSE, quote=FALSE, sep = "\t") -#write.fwf(segcall,output,rownames = FALSE, quote=FALSE, sep = "\t") -quit() diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/segcall.py --- a/mpagenomics_normalize-7dc6ce39fb89/segcall.py Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -import os -import sys -import subprocess -import zipfile - - -def main(): - - input_file=sys.argv[2] - tmp_dir=sys.argv[4] - settingsType=sys.argv[6] - zip_file=sys.argv[9] - tumorcsv=sys.argv[10] - cellularity=sys.argv[11] - outputlog=sys.argv[12] - log=sys.argv[13] - user=sys.argv[14] - method=sys.argv[15] - script_dir=os.path.dirname(os.path.abspath(__file__)) - - iFile=open(input_file,'r') - dataSetLine=iFile.readline() - dataset=dataSetLine.split("\t")[1] - iFile.close() - - - if settingsType=="dataset": - settingsType=dataset - - if (outputlog=="TRUE"): - errfile=open(log,'w') - else: - errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') - - fig_dir=os.path.join("mpagenomics",user,"figures",dataset,"segmentation/CN") - - abs_fig_dir=os.path.join(tmp_dir,fig_dir) - if (os.path.isdir(abs_fig_dir)) and (sys.argv[7]=="TRUE"): - old_files=os.listdir(abs_fig_dir) - for ifile in old_files: - os.remove(os.path.join(abs_fig_dir,ifile)) - - - retcode=subprocess.call(["Rscript", os.path.join(script_dir,"segcall.R"), sys.argv[1], dataset, sys.argv[3], sys.argv[4], sys.argv[5], settingsType, sys.argv[7], sys.argv[8], tumorcsv, cellularity, user, method], stdout = errfile, stderr = errfile) - - errfile.close() - - if (retcode == 0): - if (os.path.isdir(abs_fig_dir)) and (sys.argv[7]=="TRUE"): - - new_files=os.listdir(abs_fig_dir) - 
zipbuf = zipfile.ZipFile(os.path.join(abs_fig_dir,zip_file), 'w', zipfile.ZIP_DEFLATED) - for current_file in new_files: - fn = os.path.join(abs_fig_dir,current_file) - relfn=fn[len(abs_fig_dir)+len(os.sep):] - zipbuf.write(fn,relfn) - sys.exit(retcode) - else: - sys.exit(retcode) - -if __name__ == "__main__": - main() diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/segcall.xml --- a/mpagenomics_normalize-7dc6ce39fb89/segcall.xml Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,198 +0,0 @@ - - of the normalized data - mpagenomics - - segcall.py '$chrom' '$input' '$output' '$__new_file_path__' '$nbcall' - #if $settings.settingsType == "file": - '$settings.inputs' - #end if - #if $settings.settingsType == "dataset": - '$settings.settingsType' - #end if - '$outputgraph' '$snp' '$zipfigures' - #if $settingsTumor.settingsTypeTumor == "standard": - 'none' - #end if - #if $settingsTumor.settingsTypeTumor == "tumor": - '$tumorcsv' - #end if - '$cellularity' '$outputlog' '$log' '$__user_id__' '$method' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - outputgraph == "TRUE" - - - outputlog == "TRUE" - - - - - - -.. class:: warningmark - -Data normalization must be run with the Data Normalization tool prior to segmentation. Otherwise, the standalone version can be used to perform marker selection from matrices containing data normalized with tools different from the one proposed in this instance. - - ------ - -**What it does** -This tool segments the previously normalized profiles and labels segments found in the copy-number profiles. Otherwise, the standalone version can be used to perform segmentation from matrices containing data normalized with tools different from the one proposed in this instance. 
- -Outputs: - -*A tabular text file containing 7 columns which describe all the segments (1 line per segment):* - - - sampleNames: Names of the original .CEL files. - - chrom: Chromosome of the segment. - - chromStart: Starting position (in bp) of the segment. This position is not included in the segment. - - chromEnd: Ending position (in bp) of the segment. This position is included in the segment. - - probes: Number of probes in the segment. - - means: Mean of the segment. - - calls: Calling of the segment (”double loss”, ”loss”, ”normal”, ”gain” or ”amplification”). - -*A .zip file containing all the figures (optionnal)* - ------ - -**Normal-tumor study** - -In cases where normal (control) samples match to tumor samples, they are taken as references to extract copy number profile. In this case, a normal-tumor csv file must be provided : - - - The first column contains the names of the files corresponding to normal samples of the dataset. - - - The second column contains the names of the tumor samples files. - - - Column names of these two columns are respectively normal and tumor. - - - Columns are separated by a comma. - - - *Extensions of the files (.CEL for example) should be removed* - - - -**Example** - -Let 6 .cel files in the studied dataset (3 patients, each of them being represented by a couple of normal and tumor cel files.) :: - - patient1_normal.cel - patient1_tumor.cel - patient2_normal.cel - patient2_tumor.cel - patient3_normal.cel - patient3_tumor.cel - - -The csv file should look like this :: - - normal,tumor - patient1_normal,patient1_tumor - patient2_normal,patient2_tumor - patient3_normal,patient3_tumor - ------ - - -**Citation** - -If you use this tool please cite : - -`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ - -As segmentation is performed with PELT, please also cite `R. Killick, P. 
Fearnhead, and I. A. Eckley. Optimal detection of changepoints with a linear computational cost. Journal of the American Statistical Association, 107(500):1590–1598, 2012. <http://arxiv.org/abs/1101.1438>`_ - -As segmentation is performed by cghseg, please cite `Picard, F., Robin, S., Lavielle, M., Vaisse, C., and Daudin, J.-J. (2005). A statistical approach for array CGH data analysis. BMC Bioinformatics, 6(1):27. <http://www.ncbi.nlm.nih.gov/pubmed/15705208>`_ , -and also cite Rigaill, G. (2010). `Pruned dynamic programming for optimal multiple change-point detection. <http://arxiv.org/abs/1004.0887>`_ - -When using the labels of the segments, please cite CGHCall `M. A. van de Wiel, K. I. Kim, S. J. Vosse, W. N. van Wieringen, S. M. Wilting, and B. Ylstra. CGHcall: calling aberrations for array CGH tumor profiles. Bioinformatics, 23(7):892–894, 2007. <http://bioinformatics.oxfordjournals.org/content/23/7/892.abstract>`_ - - - diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/segmentation.R --- a/mpagenomics_normalize-7dc6ce39fb89/segmentation.R Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ -args<-commandArgs(TRUE) - -input=args[1] -tmp_dir=args[2] -nbcall=as.numeric(args[3]) -cellularity=as.numeric(args[4]) -output=args[5] -method=args[6] -userId=args[7] -signalType=args[8] - -library(MPAgenomics) -workdir=file.path(tmp_dir, "mpagenomics",userId) -setwd(workdir) - -CN=read.table(input,header=TRUE) -uniqueChr=unique(CN$chromosome) -drops=c("chromosome","position","probeName") -CNsignal=CN[,!(names(CN)%in% drops),drop=FALSE] - -samples=names(CNsignal) - -if (signalType=="CN") -{ - -result=data.frame(sampleNames=character(0),chrom=character(0),chromStart=numeric(0),chromEnd=numeric(0),probes=numeric(0),means=numeric(0),calls=character(0),stringsAsFactors=FALSE) - -for (chr in uniqueChr) -{ -currentSubset=subset(CN, chromosome==chr) -currentPositions=currentSubset["position"] -for (sample in 
samples) - { - currentSignal=currentSubset[sample] - if (length(which(!is.na(unlist(currentSignal))))>1) - { - currentSeg=segmentation(signal=unlist(currentSignal),position=unlist(currentPositions),method=method) - callobj= callingObject(copynumber=currentSeg$signal, segmented=currentSeg$segmented,chromosome=rep(chr,length(currentSeg$signal)), position=currentSeg$startPos,sampleNames=sample) - currentCall=callingProcess(callobj,nclass=nbcall,cellularity=cellularity,verbose=TRUE) - currentResult=currentCall$segment - currentResult["sampleNames"]=c(rep(sample,length(currentCall$segment$chrom))) - result=rbind(result,currentResult) - } - } -} -finalResult=data.frame(sampleNames=result["sampleNames"],chrom=result["chrom"],chromStart=result["chromStart"],chromEnd=result["chromEnd"],probes=result["probes"],means=result["means"],calls=result["calls"],stringsAsFactors=FALSE) -sink(output) -print(format(finalResult)) -sink() -#write.table(finalResult,output,row.names = FALSE, quote=FALSE, sep = "\t") -} else { - result=data.frame(sampleNames=character(0),chrom=character(0),start=numeric(0),end=numeric(0),points=numeric(0),means=numeric(0),stringsAsFactors=FALSE) - - for (chr in uniqueChr) - { - cat(paste0("chromosome ",chr,"\n")) - currentSubset=subset(CN, chromosome==chr) - currentPositions=currentSubset["position"] - for (sample in samples) - { - cat(paste0(" sample ",sample,"...")) - currentSignal=currentSubset[sample] - if (length(which(!is.na(unlist(currentSignal))))>1) - { - currentSeg=segmentation(signal=unlist(currentSignal),position=unlist(currentPositions),method=method) - currentResult=currentSeg$segment - currentResult["chrom"]=c(rep(chr,length(currentSeg$segment$means))) - currentResult["sampleNames"]=c(rep(sample,length(currentSeg$segment$means))) - result=rbind(result,currentResult) - - } - cat(paste0("OK\n")) - } - } - 
finalResult=data.frame(sampleNames=result["sampleNames"],chrom=result["chrom"],chromStart=result["start"],chromEnd=result["end"],probes=result["points"],means=result["means"],stringsAsFactors=FALSE) - sink(output) - print(format(finalResult)) - sink() - #write.table(finalResult,output,row.names = FALSE, quote=FALSE, sep = "\t") -} - diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/segmentation.py --- a/mpagenomics_normalize-7dc6ce39fb89/segmentation.py Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -import os -import sys -import subprocess -import shutil -import getopt - - -def main(argv): - - #default values - cellularity="1" - nbcall="3" - - try: - opts, args = getopt.getopt(argv,"h:i:f:p:o:l:og:g:m:st:u:",["input=","new_file_path=","outputlog=","output=","log=","outputgraph=", "graph=", "method=", "signalType=", "user_id=", "nbcall=", "cellularity="]) - except getopt.GetoptError as err: - print str(err) - sys.exit(2) - for opt, arg in opts: - if opt == '-h': - print 'segmentation.py' - sys.exit() - elif opt in ("-i", "--input"): - inputdata = arg - elif opt in ("-f", "--new_file_path"): - tmp_dir = arg - elif opt in ("-p", "--outputlog"): - outputlog = arg - elif opt in ("-o", "--output"): - output = arg - elif opt in ("-l", "--log"): - log = arg - elif opt in ("-og", "--outputgraph"): - plot = arg - elif opt in ("-g", "--graph"): - pdffigures = arg - elif opt in ("-m", "--method"): - method = arg - elif opt in ("-st", "--signalType"): - signalType = arg - elif opt in ("-u", "--user_id"): - userId = arg - elif opt in ("-c", "--nbcall"): - nbcall = arg - elif opt in ("-e", "--cellularity"): - cellularity = arg - - #=========================================================================== - # inputdata=sys.argv[1] - # tmp_dir=sys.argv[2] - # nbcall=sys.argv[3] - # cellularity=sys.argv[4] - # outputlog=sys.argv[5] - # output=sys.argv[6] - # log=sys.argv[7] - # plot=sys.argv[8] - # 
pdffigures=sys.argv[9] - # method=sys.argv[10] - #=========================================================================== - - script_dir=os.path.dirname(os.path.abspath(__file__)) - - if (outputlog=="TRUE"): - errfile=open(log,'w') - else: - errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') - - retcode=subprocess.call(["Rscript", os.path.join(script_dir,"segmentation.R"), inputdata, tmp_dir, nbcall, cellularity, output, method, userId, signalType], stdout = errfile, stderr = errfile) - - if (plot=="TRUE"): - shutil.copy(os.path.join(tmp_dir,"mpagenomics",userId,"Rplots.pdf"), pdffigures) - - errfile.close() - - sys.exit(retcode) - -if __name__ == "__main__": - main(main(sys.argv[1:])) \ No newline at end of file diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/segmentation.xml --- a/mpagenomics_normalize-7dc6ce39fb89/segmentation.xml Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ - - of a previously normalized signal - mpagenomics - - segmentation.py - #if $signalType.signal == "CN": - --nbcall '$signalType.nbcall' - --cellularity '$signalType.cellularity' - #else - --nbcall '3' - --cellularity '1.0' - #end if - --input '$input' - --new_file_path '$__new_file_path__' - --outputlog '$outputlog' - --output '$output' - --log '$log' - --outputgraph '$outputgraph' - --graph '$graph' - --method '$method' - --signalType '$signalType.signal' - --user_id '$__user_id__' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - outputlog == "TRUE" - - - outputgraph == "TRUE" - - - - - - - -**What it does** -This tool segments normalized profiles provided by the user and labels segments found in the copy-number profiles. - -Input format: - -*A tabular text file containing 3 fixed columns and 1 column per sample:* - - - chr: Chromosome. - - position: Genomic position (in bp) - - probeName: Probes names. 
- - One column per sample which contains the copy number profile for each sample - -Output format: - -*A tabular text file containing 7 columns which describe all the segments (1 line per segment):* - - - sampleNames: Column names corresponding to samples in the input file. - - chrom: Chromosome of the segment. - - chromStart: Starting position (in bp) of the segment. This position is not included in the segment. - - chromEnd: Ending position (in bp) of the segment. This position is included in the segment. - - probes: Number of probes in the segment. - - means: Mean of the segment. - - calls: Calling of the segment (”double loss”, ”loss”, ”normal”, ”gain” or ”amplification”). - ------ - -**Citation** -If you use this tool please cite : - -`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ - -If segmentation is performed with PELT, please also cite `R. Killick, P. Fearnhead, and I. A. Eckley. Optimal detection of changepoints with a linear computational cost. Journal of the American Statistical Association, 107(500):1590–1598, 2012. <http://arxiv.org/abs/1101.1438>`_ - -If segmentation is performed by cghseg, please cite `Picard, F., Robin, S., Lavielle, M., Vaisse, C., and Daudin, J.-J. (2005). A statistical approach for array CGH data analysis. BMC Bioinformatics, 6(1):27. <http://www.ncbi.nlm.nih.gov/pubmed/15705208>`_ , -and also cite Rigaill, G. (2010). `Pruned dynamic programming for optimal multiple change-point detection. <http://arxiv.org/abs/1004.0887>`_ - -When using the labels of the segments, please cite CGHCall `M. A. van de Wiel, K. I. Kim, S. J. Vosse, W. N. van Wieringen, S. M. Wilting, and B. Ylstra. CGHcall: calling aberrations for array CGH tumor profiles. Bioinformatics, 23(7):892–894, 2007. 
<http://bioinformatics.oxfordjournals.org/content/23/7/892.abstract>`_ - - - diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/selection.R --- a/mpagenomics_normalize-7dc6ce39fb89/selection.R Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -args<-commandArgs(TRUE) - -input=args[1] -dataResponse=args[2] -chrom=args[3] -tmp_dir=args[4] -signal=args[5] -snp=type.convert(args[6]) -settingsType=args[7] -tumor=args[8] -fold=as.integer(args[9]) -loss=args[10] -plot=type.convert(args[11]) -output=args[12] -user=args[13] -package=args[14] - - -library(MPAgenomics) -library(glmnet) -library(spikeslab) -library(lars) -workdir=file.path(tmp_dir, "mpagenomics",user) -setwd(workdir) - -if (grepl("all",tolower(chrom)) | chrom=="None") { - chrom_vec=c(1:25) - } else { - chrom_tmp <- strsplit(chrom,",") - chrom_vecstring <-unlist(chrom_tmp) - chrom_vec <- as.numeric(chrom_vecstring) - } - - -if (settingsType == "tumor") { - if (signal=="CN") { - res=markerSelection(input,dataResponse, chromosome=chrom_vec, signal=signal, normalTumorArray=tumor, onlySNP=snp, loss=loss, plot=plot, nbFolds=fold, pkg=package) - } else { - res=markerSelection(input,dataResponse, chromosome=chrom_vec,signal=signal,normalTumorArray=tumor, loss=loss, plot=plot, nbFolds=fold,pkg=package) - } -} else { - if (signal=="CN") { - res=markerSelection(input,dataResponse, chromosome=chrom_vec, signal=signal, onlySNP=snp, loss=loss, plot=plot, nbFolds=fold,pkg=package) - } else { - res=markerSelection(input,dataResponse, chromosome=chrom_vec, signal=signal, loss=loss, plot=plot, nbFolds=fold,pkg=package) - } -} - -res - -df=data.frame() -list_chr=names(res) -markerSelected=FALSE - -for (i in list_chr) { - chr_data=res[[i]] - len=length(chr_data$markers.index) - if (len != 0) - { - markerSelected=TRUE - chrdf=data.frame(rep(i,len),chr_data$markers.position,chr_data$markers.index,chr_data$markers.names,chr_data$coefficient) - df=rbind(df,chrdf) - } -} 
- -if (markerSelected) { - colnames(df) <- c("chr","position","index","names","coefficient") - sink(output) - print(format(df),row.names=FALSE) - sink() - #write.table(df,output,row.names = FALSE, quote = FALSE, sep = "\t") -} else - writeLines("no SNP selected", output) - - diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/selection.py --- a/mpagenomics_normalize-7dc6ce39fb89/selection.py Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -import os -import sys -import subprocess -import shutil - -def main(): - - input_file=sys.argv[1] - tmp_dir=sys.argv[4] - script_dir=os.path.dirname(os.path.abspath(__file__)) - plot=sys.argv[11] - pdffigures=sys.argv[13] - outputlog=sys.argv[14] - log=sys.argv[15] - user=sys.argv[16] - package=sys.argv[17] - - iFile=open(input_file,'r') - dataSetLine=iFile.readline() - dataset=dataSetLine.split("\t")[1] - iFile.close() - - if (outputlog=="TRUE"): - errfile=open(log,'w') - else: - errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') - - retcode=subprocess.call(["Rscript", os.path.join(script_dir,"selection.R"), dataset, sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], sys.argv[7], sys.argv[8], sys.argv[9], sys.argv[10], sys.argv[11], sys.argv[12],sys.argv[16],package], stdout = errfile, stderr = errfile) - - if (plot=="TRUE"): - shutil.copy(os.path.join(tmp_dir,"mpagenomics",user,"Rplots.pdf"), pdffigures) - - errfile.close() - - sys.exit(retcode) - -if __name__ == "__main__": - main() diff -r bbf427bd6967 -r af4f63f27c77 mpagenomics_normalize-7dc6ce39fb89/selection.xml --- a/mpagenomics_normalize-7dc6ce39fb89/selection.xml Wed May 13 14:21:27 2015 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,220 +0,0 @@ - - - selection.py '$input' '$response' '$chromosome' '$__new_file_path__' '$settingsSNP.signal' - #if $settingsSNP.signal == "CN": - '$settingsSNP.snp' - #end if - #if $settingsSNP.signal == "fracB": - 'none' - #end if - 
'$settings.settingsType' - #if $settings.settingsType == "tumor": - '$tumorcsv' - #end if - #if $settings.settingsType == "standard": - 'none' - #end if - '$folds' '$settingsLoss.loss' '$outputgraph' '$output' '$pdffigures' '$outputlog' '$log' '$__user_id__' - #if $settingsLoss.loss == "linear": - '$settingsLoss.package' - #end if - #if $settingsLoss.loss == "logistic": - 'HDPenReg' - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - outputgraph == "TRUE" - (settingsLoss['package'] != 'spikeslab') - - - outputlog == "TRUE" - - - - - - -.. class:: warningmark - -Data normalization must be run with the Data Normalization tool prior to SNPs selection. Otherwise, the standalone version can be used to perform marker selection from matrices containing data normalized with tools different from the one proposed in this instance. - ------ - -**What it does** - -This tool selects some relevant markers according to a response using penalized regressions. - -Output: - -A tabular text file containing 5 columns which describe all the selected SNPs (1 line per SNPs): - - - chr: Chromosome containing the selected SNP. - - position: Position of the selected SNP. - - index: Index of the selected SNP. - - names: Name of the selected SNP. - - coefficient: Regression coefficient of the selected SNP. - ------ - -**Data Response csv file** - -Data response csv file format: - - - The first column contains the names of the different files of the data-set. - - - The second column contains the response associated with each file. - - - Column names of these two columns are respectively files and response. 
- - - Columns are separated by a comma - - - *Extensions of the files (.CEL for example) should be removed* - - - -**Example** - -Let 3 .cel files in the studied dataset :: - - patient1.cel - patient2.cel - patient3.cel - -The csv file should look like this :: - - files,response - patient1,1.92145 - patient2,2.12481 - patient3,1.23545 - - ------ - -**Normal-tumor study** - -In cases where normal (control) samples match to tumor samples, they are taken as references to extract copy number profile. In this case, a normal-tumor csv file must be provided : - - - The first column contains the names of the files corresponding to normal samples of the dataset. - - - The second column contains the names of the tumor samples files. - - - Column names of these two columns are respectively normal and tumor. - - - Columns are separated by a comma. - - - *Extensions of the files (.CEL for example) should be removed* - - -**Example** - -Let 6 .cel files in the studied dataset (3 patients, each of them being represented by a couple of normal and tumor cel file.) :: - - patient1_normal.cel - patient1_tumor.cel - patient2_normal.cel - patient2_tumor.cel - patient3_normal.cel - patient3_tumor.cel - - -The csv file should look like this :: - - normal,tumor - patient1_normal,patient1_tumor - patient2_normal,patient2_tumor - patient3_normal,patient3_tumor - ------ - - - -**Citation** - -If you use this tool please cite : - -`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. 
Preprint <http://fr.arxiv.org/abs/1401.5035>`_ - - - diff -r bbf427bd6967 -r af4f63f27c77 preprocess.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preprocess.R Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,158 @@ +#!/usr/bin/env Rscript +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +library("optparse") + +##### Read options +option_list=list( + make_option("--summary",type="character",default=NULL, dest="summary"), + make_option("--dataSetName",type="character",default=NULL, dest="dataSetName"), + make_option("--new_file_path",type="character",default=NULL, dest="new_file_path"), + make_option("--inputcdffull_name",type="character",default=NULL, dest="inputcdffull_name"), + make_option("--inputufl_name",type="character",default=NULL, dest="inputufl_name"), + make_option("--inputugp_name",type="character",default=NULL, dest="inputugp_name"), + make_option("--inputacs_name",type="character",default=NULL, dest="inputacs_name"), + make_option("--inputcdffull",type="character",default=NULL, dest="inputcdffull"), + make_option("--inputufl",type="character",default=NULL, dest="inputufl"), + make_option("--inputugp",type="character",default=NULL, dest="inputugp"), + make_option("--inputacs",type="character",default=NULL, dest="inputacs"), + make_option("--tumorcsv",type="character",default=NULL, dest="tumorcsv"), + make_option("--settingsType",type="character",default=NULL, dest="settingsType"), + make_option("--outputgraph",type="character",default=NULL, dest="outputgraph"), + make_option("--zipfigures",type="character",default=NULL, dest="zipfigures"), + make_option("--outputlog",type="character",default=NULL, dest="outputlog"), + make_option("--log",type="character",default=NULL, dest="log"), + 
make_option("--user_id",type="character",default=NULL, dest="user_id"), + make_option("--input",type="character",default=NULL, dest="input") +); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +if(is.null(opt$input)){ + print_help(opt_parser) + stop("input required.", call.=FALSE) +} + +#loading libraries + +summary=opt$summary +dataSetName=opt$dataSetName +newFilePath=opt$new_file_path +inputCDFName=opt$inputcdffull_name +inputUFLName=opt$inputufl_name +inputUGPName=opt$inputugp_name +inputACSName=opt$inputacs_name +inputCDF=opt$inputcdffull +inputUFL=opt$inputufl +inputUGP=opt$inputugp +inputACS=opt$inputacs +tumorcsv=opt$tumorcsv +settingsType=opt$settingsType +outputGraph=opt$outputgraph +zipfigures=opt$zipfigures +outputlog=opt$outputlog +log=opt$log +userId=opt$user_id + +destinationPath=file.path(newFilePath, userId, dataSetName) +mpagenomicsDir = file.path(newFilePath,"mpagenomics",userId) +dataDir = file.path(newFilePath, userId) +chipDir = file.path(newFilePath,"mpagenomics",userId,"annotationData","chipTypes") +createArchitecture=TRUE + +if (dir.exists(chipDir)) + unlink(chipDir, recursive = TRUE) # drop stale chip annotations without going through a shell (safe for any path) + +if (!dir.exists(mpagenomicsDir)) + dir.create(mpagenomicsDir, showWarnings = TRUE, recursive = TRUE) + +if (!dir.exists(dataDir)) + dir.create(dataDir, showWarnings = TRUE, recursive = TRUE) + +listInput <- trimws( unlist( strsplit(trimws(opt$input), ",") ) ) # --input is a comma-separated list of path;name pairs +if(length(listInput)<2){ + stop("Too few .CEL files selected : At least 2 .CEL files are required", call.=FALSE) +} + + +celList=vector() +celFileNameList=vector() + +for (i in seq_along(listInput)) +{ + inputFileInfo <- unlist( strsplit( listInput[i], ';' ) ) + celList=c(celList,inputFileInfo[1]) + celFileNameList=c(celFileNameList,inputFileInfo[2]) +} + + +for (i in seq_along(celFileNameList)) + { + source = celList[i] + destination=file.path(dataDir,celFileNameList[i]) + file.copy(source, destination) +} +split=unlist(strsplit(inputCDFName,",",fixed=TRUE))
+tag=NULL +if (length(split) > 1) { # a tag exists only when the CDF name has a second comma-separated field + chipType=split[1] + tagExt=split[2] + tag=unlist(strsplit(tagExt,".",fixed=TRUE))[1] + } else { + chipType=sub("\\.cdf$","",split[1],ignore.case=TRUE) # untagged CDF name: keep tag NULL and drop the file extension +} + +if(!file.exists(file.path(dataDir,inputCDFName))) + file.symlink(inputCDF,file.path(dataDir,inputCDFName)) +if(!file.exists(file.path(dataDir,inputACSName))) + file.symlink(inputACS,file.path(dataDir,inputACSName)) +if(!file.exists(file.path(dataDir,inputUFLName))) + file.symlink(inputUFL,file.path(dataDir,inputUFLName)) +if(!file.exists(file.path(dataDir,inputUGPName))) + file.symlink(inputUGP,file.path(dataDir,inputUGPName)) + +fig_dir = file.path("mpagenomics", userId, "figures", dataSetName, "signal") +abs_fig_dir = file.path(newFilePath, fig_dir) + +chip=chipType +dataset=dataSetName +workdir=mpagenomicsDir +celPath=dataDir +chipPath=dataDir +tumor=tumorcsv +outputgraph=type.convert(outputGraph, as.is=TRUE) # as.is given explicitly: R >= 4.0 warns when it is omitted + + +library(MPAgenomics) +setwd(workdir) + +if (outputlog){ + sinklog <- file(log, open = "wt") + sink(sinklog ,type = "output") + sink(sinklog, type = "message") +} + +if (settingsType=="standard") +{ + signalPreProcess(dataSetName=dataset, chipType=chip, dataSetPath=celPath,chipFilesPath=chipPath, path=workdir,createArchitecture=createArchitecture, savePlot=outputgraph, tags=tag) +} else { + signalPreProcess(dataSetName=dataset, chipType=chip, dataSetPath=celPath,chipFilesPath=chipPath, normalTumorArray=tumor, path=workdir,createArchitecture=createArchitecture, savePlot=outputgraph, tags=tag) +} +setwd(abs_fig_dir) +files2zip <- dir(abs_fig_dir) +zip(zipfile = "figures.zip", files = files2zip) +file.rename("figures.zip",zipfigures) +summarydf=data.frame(celFileNameList,rep(dataSetName,length(celFileNameList)),rep(chipType,length(celFileNameList))) +write.table(summarydf,file=summary,quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") + +if (outputlog){ + sink(type="output") + sink(type="message") + close(sinklog) +} + diff -r bbf427bd6967 -r af4f63f27c77 preprocess.xml --- /dev/null Thu Jan 01 00:00:00 
1970 +0000 +++ b/preprocess.xml Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,159 @@ + + + + mpagenomics + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + outputgraph == "TRUE" + + + outputlog == "TRUE" + + + + + + + + + +**What it does** + +This preprocessing step consists in a correction of biological and technical biaises due to the experiment. Raw data from Affymetrix arrays are provided in different CEL files. These data must be normalized before statistical analysis. +The pre-processing is proposed as a wrapper of aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this implies that the pre-processing step is only available for Affymetrix arrays. + +----- + +**Chip file naming conventions** + +Chip filenames must strictly follow the following rules : + +- *.cdf* filename must comply with the following format : < chiptype >,< tag >.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a point) between <chiptype> and the tag "Full". + +- *.ufl* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl). + +- *.ugp* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp). + +- *.acs* file name must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs). + +----- + +**Normal-tumor study with TumorBoost** + +In cases where normal (control) samples match to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided : + + - The first column contains the names of the files corresponding to normal samples of the dataset. + + - The second column contains the names of the tumor samples files. + + - Column names of these two columns are respectively normal and tumor. + + - Columns are separated by a comma. 
+ + - *Extensions of the files (.CEL for example) should be removed* + + + +**Example** + +Let 6 .cel files in the dataset studied (3 patients, each of them being represented by a couple of normal and tumor cel files.) :: + + patient1_normal.cel + patient1_tumor.cel + patient2_normal.cel + patient2_tumor.cel + patient3_normal.cel + patient3_tumor.cel + + +The csv file should look like this :: + + normal,tumor + patient1_normal,patient1_tumor + patient2_normal,patient2_tumor + patient3_normal,patient3_tumor + + +----- + +**Citation** + +When using this tool, please cite : + +`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ + +As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 & 6. Bioinformatics, 5(17):2149–2156, 2009. <http://bioinformatics.oxfordjournals.org/content/25/17/2149.short>`_ + +When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 <http://www.biomedcentral.com/1471-2105/11/245>`_ + + + diff -r bbf427bd6967 -r af4f63f27c77 segcall.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segcall.R Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,124 @@ +#!/usr/bin/env Rscript +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. 
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +library("optparse") + +##### Read options +option_list=list( + make_option("--chrom",type="character",default=NULL, dest="chrom"), + make_option("--input",type="character",default=NULL, dest="input"), + make_option("--output",type="character",default=NULL, dest="output"), + make_option("--new_file_path",type="character",default=NULL, dest="new_file_path"), + make_option("--nbcall",type="character",default=NULL, dest="nbcall"), + make_option("--settingsType",type="character",default=NULL, dest="settingsType"), + make_option("--outputgraph",type="character",default=NULL, dest="outputgraph"), + make_option("--snp",type="character",default=NULL, dest="snp"), + make_option("--zipfigures",type="character",default=NULL, dest="zipfigures"), + make_option("--settingsTypeTumor",type="character",default=NULL, dest="settingsTypeTumor"), + make_option("--cellularity",type="character",default=NULL, dest="cellularity"), + make_option("--outputlog",type="character",default=NULL, dest="outputlog"), + make_option("--log",type="character",default=NULL, dest="log"), + make_option("--userid",type="character",default=NULL, dest="userid"), + make_option("--method",type="character",default=NULL, dest="method") +); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +if(is.null(opt$input)){ + print_help(opt_parser) + stop("input required.", call.=FALSE) +} + +#loading libraries + +chrom=opt$chrom +datasetFile=opt$input +output=opt$output +tmp_dir=opt$new_file_path +nbcall=as.numeric(opt$nbcall) +settingsType=opt$settingsType +outputfigures=type.convert(opt$outputgraph, as.is=TRUE) # as.is given explicitly: R >= 4.0 warns when it is omitted +snp=type.convert(opt$snp, as.is=TRUE) +tumorcsv=opt$settingsTypeTumor +cellularity=as.numeric(opt$cellularity) +user=opt$userid +method=opt$method +log=opt$log +outputlog=opt$outputlog +outputgraph=opt$outputgraph +zipfigures=opt$zipfigures + +library(MPAgenomics) +workdir=file.path(tmp_dir, "mpagenomics",user) +setwd(workdir) + +if
(grepl("all",tolower(chrom)) || chrom=="None") { # scalar test: "all" or Galaxy's "None" selects every chromosome + chrom_vec=c(1:25) + } else { + chrom_tmp <- strsplit(chrom,",") + chrom_vecstring <-unlist(chrom_tmp) + chrom_vec <- as.numeric(chrom_vecstring) + } + + +if (outputlog){ + sinklog <- file(log, open = "wt") + sink(sinklog ,type = "output") + sink(sinklog, type = "message") +} + + +inputDataset=read.table(file=datasetFile,stringsAsFactors=FALSE) +dataset=inputDataset[1,2] + +fig_dir = file.path("mpagenomics", user, "figures", dataset, "segmentation","CN") +abs_fig_dir = file.path(tmp_dir, fig_dir) + +if (outputgraph) { + if (dir.exists(abs_fig_dir)) { + unlink(abs_fig_dir, recursive = TRUE) # dataset name comes from an input file, so never splice this path into a shell command + } +} + +if (settingsType == 'dataset') { + if (tumorcsv== "none") + { + segcall=cnSegCallingProcess(dataset,chromosome=chrom_vec, nclass=nbcall, savePlot=outputfigures,onlySNP=snp, cellularity=cellularity, method=method) + } else { + segcall=cnSegCallingProcess(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, nclass=nbcall, savePlot=outputfigures,onlySNP=snp, cellularity=cellularity, method=method) + } +} else { + input_tmp <- strsplit(settingsType,",") + input_tmp_vecstring <-unlist(input_tmp) + input_vecstring = sub("^([^.]*).*", "\\1", input_tmp_vecstring) + if (tumorcsv== "none") + { + segcall=cnSegCallingProcess(dataset,chromosome=chrom_vec, listOfFiles=input_vecstring, nclass=nbcall, savePlot=outputfigures, onlySNP=snp, cellularity=cellularity, method=method) + } else { + segcall=cnSegCallingProcess(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, listOfFiles=input_vecstring, nclass=nbcall, savePlot=outputfigures, onlySNP=snp, cellularity=cellularity, method=method) + } +} + + +write.table(format(segcall),output,row.names = FALSE, quote=FALSE, sep = "\t") + + +if (outputgraph) { + setwd(abs_fig_dir) + files2zip <- dir(abs_fig_dir) + zip(zipfile = "figures.zip", files = files2zip) + file.rename("figures.zip",zipfigures) +} + +if (outputlog){ + sink(type="output") + sink(type="message") + close(sinklog) +} 
+#write.fwf(segcall,output,rownames = FALSE, quote=FALSE, sep = "\t") + diff -r bbf427bd6967 -r af4f63f27c77 segcall.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segcall.xml Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,212 @@ + + of the normalized data + mpagenomics + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + outputgraph == "TRUE" + + + outputlog == "TRUE" + + + + + + +.. class:: warningmark + +Data normalization must be run with the Data Normalization tool prior to segmentation. Otherwise, the standalone version can be used to perform marker selection from matrices containing data normalized with tools different from the one proposed in this instance. + + +----- + +**What it does** +This tool segments the previously normalized profiles and labels segments found in the copy-number profiles. Otherwise, the standalone version can be used to perform segmentation from matrices containing data normalized with tools different from the one proposed in this instance. + +Outputs: + +*A tabular text file containing 7 columns which describe all the segments (1 line per segment):* + + - sampleNames: Names of the original .CEL files. + - chrom: Chromosome of the segment. + - chromStart: Starting position (in bp) of the segment. This position is not included in the segment. + - chromEnd: Ending position (in bp) of the segment. This position is included in the segment. + - probes: Number of probes in the segment. + - means: Mean of the segment. + - calls: Calling of the segment (”double loss”, ”loss”, ”normal”, ”gain” or ”amplification”). + +*A .zip file containing all the figures (optionnal)* + +----- + +**Normal-tumor study** + +In cases where normal (control) samples match to tumor samples, they are taken as references to extract copy number profile. 
In this case, a normal-tumor csv file must be provided : + + - The first column contains the names of the files corresponding to normal samples of the dataset. + + - The second column contains the names of the tumor samples files. + + - Column names of these two columns are respectively normal and tumor. + + - Columns are separated by a comma. + + - *Extensions of the files (.CEL for example) should be removed* + + + +**Example** + +Let 6 .cel files in the studied dataset (3 patients, each of them being represented by a couple of normal and tumor cel files.) :: + + patient1_normal.cel + patient1_tumor.cel + patient2_normal.cel + patient2_tumor.cel + patient3_normal.cel + patient3_tumor.cel + + +The csv file should look like this :: + + normal,tumor + patient1_normal,patient1_tumor + patient2_normal,patient2_tumor + patient3_normal,patient3_tumor + +----- + + +**Citation** + +If you use this tool please cite : + +`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ + +As segmentation is performed with PELT, please also cite `R. Killick, P. Fearnhead, and I. A. Eckley. Optimal detection of changepoints with a linear computational cost. Journal of the American Statistical Association, 107(500):1590–1598, 2012. <http://arxiv.org/abs/1101.1438>`_ + +As segmentation is performed by cghseg, please cite `Picard, F., Robin, S., Lavielle, M., Vaisse, C., and Daudin, J.-J. (2005). A statistical approach for array CGH data analysis. BMC Bioinformatics, 6(1):27. <http://www.ncbi.nlm.nih.gov/pubmed/15705208>`_ , +and also cite Rigaill, G. (2010). `Pruned dynamic programming for optimal multiple change-point detection. <http://arxiv.org/abs/1004.0887>`_ + +When using the labels of the segments, please cite CGHCall `M. A. van de Wiel, K. I. Kim, S. J. Vosse, W. N. van Wieringen, S. M. Wilting, and B. Ylstra. 
CGHcall: calling aberrations for array CGH tumor profiles. Bioinformatics, 23(7):892–894, 2007. <http://bioinformatics.oxfordjournals.org/content/23/7/892.abstract>`_ + + + diff -r bbf427bd6967 -r af4f63f27c77 segmentFracB.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segmentFracB.R Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,43 @@ +args<-commandArgs(TRUE) + +chrom=args[1] +dataset=args[2] +output=args[3] +tmp_dir=args[4] +input=args[5] +outputfigures=type.convert(args[6], as.is=TRUE) # as.is given explicitly: R >= 4.0 warns when it is omitted +tumorcsv=args[7] +user=args[8] +method=args[9] + +library(MPAgenomics) +workdir=file.path(tmp_dir, "mpagenomics",user) +setwd(workdir) + +if (grepl("all",tolower(chrom)) || chrom=="None") { + chrom_vec=c(1:25) +} else { + chrom_tmp <- strsplit(chrom,",") + chrom_vecstring <-unlist(chrom_tmp) + chrom_vec <- as.numeric(chrom_vecstring) +} + +input_tmp <- strsplit(input,",") +input_tmp_vecstring <-unlist(input_tmp) + + +input_vecstring = sub("^([^.]*).*", "\\1", input_tmp_vecstring) + +if (dataset == input) { + segcall=segFracBSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, savePlot=outputfigures, method=method) + +} else { + segcall=segFracBSignal(dataset,chromosome=chrom_vec, normalTumorArray=tumorcsv, listOfFiles=input_vecstring, savePlot=outputfigures, method=method) + +} +sink(output) +print(format(segcall)) +sink() +#write.table(segcall,output,row.names = FALSE, quote=FALSE, sep = "\t") + +quit() diff -r bbf427bd6967 -r af4f63f27c77 segmentFracB.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segmentFracB.py Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,87 @@ +import os +import sys +import subprocess +import zipfile +import getopt + + +def main(argv): + + try: + opts, args = getopt.getopt(argv,"hc:i:o:f:s:og:fig:t:p:l:u:m:",["chrom=","input=","output=","new_file_path=","settings_type=","output_graph=","zip_figures=","settings_tumor=","outputlog=","log=","userid=","method="]) + except getopt.GetoptError as err: + sys.stderr.write("%s\n" % err) # report bad options on stderr; valid on Python 2 and 3 + sys.exit(2) + for opt, arg in opts: + if 
opt == '-h': + print('segmentFracB.py') # usage banner names this script; call form works on Python 2 and 3 + sys.exit() + elif opt in ("-c", "--chrom"): + chromosome = arg + elif opt in ("-i", "--input"): + input_file = arg + elif opt in ("-o", "--output"): + output_file = arg + elif opt in ("-f", "--new_file_path"): + tmp_dir = arg + elif opt in ("-s", "--settings_type"): + input_type = arg + elif opt in ("-og", "--output_graph"): + output_graph = arg + elif opt in ("-fig", "--zip_figures"): + zip_file = arg + elif opt in ("-t", "--settings_tumor"): + tumorcsv = arg + elif opt in ("-p", "--outputlog"): + outputlog = arg + elif opt in ("-l", "--log"): + log = arg + elif opt in ("-u", "--userid"): + user_id = arg + elif opt in ("-m", "--method"): + method = arg + + script_dir=os.path.dirname(os.path.abspath(__file__)) + + iFile=open(input_file,'r') + dataSetLine=iFile.readline() + dataset=dataSetLine.split("\t")[1] + iFile.close() + + + if input_type=="dataset": + input_type=dataset + + if (outputlog=="TRUE"): + errfile=open(log,'w') + else: + errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') + + fig_dir=os.path.join("mpagenomics",user_id,"figures",dataset,"segmentation/fracB") + + abs_fig_dir=os.path.join(tmp_dir,fig_dir) + if (os.path.isdir(abs_fig_dir)) and (output_graph=="TRUE"): + old_files=os.listdir(abs_fig_dir) + for ifile in old_files: + os.remove(os.path.join(abs_fig_dir,ifile)) + + + retcode=subprocess.call(["Rscript", os.path.join(script_dir,"segmentFracB.R"), chromosome, dataset, output_file, tmp_dir, input_type, output_graph, tumorcsv, user_id, method], stdout = errfile, stderr = errfile) + + errfile.close() + + if (retcode == 0): + if (os.path.isdir(abs_fig_dir)) and (output_graph=="TRUE"): + + new_files=os.listdir(abs_fig_dir) + with zipfile.ZipFile(os.path.join(abs_fig_dir,zip_file), 'w', zipfile.ZIP_DEFLATED) as zipbuf: # the for loop below nests inside this with block so the archive is closed before exit + for current_file in new_files: + fn = os.path.join(abs_fig_dir,current_file) + relfn=fn[len(abs_fig_dir)+len(os.sep):] + zipbuf.write(fn,relfn) + sys.exit(retcode) + else: + sys.exit(retcode) + +if 
__name__ == "__main__": + main(main(sys.argv[1:])) diff -r bbf427bd6967 -r af4f63f27c77 segmentFracB.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segmentFracB.xml Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,175 @@ + + + + segmentFracB.py + --chrom '$chrom' + --input '$input' + --output '$output' + --new_file_path '$__new_file_path__' + #if $settings.settingsType == "file": + --settings_type '$settings.inputs' + #end if + #if $settings.settingsType == "dataset": + --settings_type '$settings.settingsType' + #end if + --output_graph '$outputgraph' + --zip_figures '$zipfigures' + --settings_tumor '$tumorcsv' + --outputlog '$outputlog' + --log '$log' + --userid '$__user_id__' + --method '$method' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + outputgraph == "TRUE" + + + outputlog == "TRUE" + + + + + + +.. class:: warningmark + +Data normalization must be run (with the data normalization tool) prior to segmentation. + +----- + +**What it does** +This tool segments allele B fraction extracted from the previously normalized data. This tools works only on normal-tumor study. + +Outputs: + +*A tabular text file containing 6 columns which describe all the segment (1 line per segment):* + + - sampleNames: Name of the file. + - chrom: The chromosome of the segment. + - chromStart: The starting position (in bp) of the segment. This position is not included in the segment. + - chromEnd: The ending position (in bp) of the segment. This position is included in the segment. + - probes: Number of probes in the segment. + - means: Mean of the segment. + +*A .zip file containing all the figures (optionnal)* + +----- + +**Normal-tumor csv files** + +Normal-tumor csv file is required to segment Allele B fraction, because naive genotyping is based on normal samples : + + - The first column contains the names of the files corresponding to normal samples of the dataset. 
+ + - The second column contains the names of the tumor samples files. + + - Column names of these two columns are respectively normal and tumor. + + - Columns are separated by a comma. + + - *Extensions of the files (.CEL for example) should be removed* + + + +**Example** + +Let 6 .cel files in the studied dataset (3 patients, each of them being represented by a couple of normal and tumor cel files.) :: + + patient1_normal.cel + patient1_tumor.cel + patient2_normal.cel + patient2_tumor.cel + patient3_normal.cel + patient3_tumor.cel + + +The csv file should look like this :: + + normal,tumor + patient1_normal,patient1_tumor + patient2_normal,patient2_tumor + patient3_normal,patient3_tumor + +----- + + +**Citation** + +If you use this tool please cite : + +`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ + +If segmentation is performed with PELT, please cite `R. Killick, P. Fearnhead, and I. A. Eckley. Optimal detection of changepoints with a linear computational cost. Journal of the American Statistical Association, 107(500):1590–1598, 2012. <http://arxiv.org/abs/1101.1438>`_ + +If segmentation is performed by cghseg, please cite `Picard, F., Robin, S., Lavielle, M., Vaisse, C., and Daudin, J.-J. (2005). A statistical approach for array CGH data analysis. BMC Bioinformatics, 6(1):27. <http://www.ncbi.nlm.nih.gov/pubmed/15705208>`_ , +and also cite Rigaill, G. (2010). `Pruned dynamic programming for optimal multiple change-point detection. 
<http://arxiv.org/abs/1004.0887>`_ + + + \ No newline at end of file diff -r bbf427bd6967 -r af4f63f27c77 segmentation.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segmentation.R Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,139 @@ +#!/usr/bin/env Rscript +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +library("optparse") + +##### Read options +option_list=list( + make_option("--input",type="character",default=NULL, dest="input"), + make_option("--output",type="character",default=NULL, dest="output"), + make_option("--new_file_path",type="character",default=NULL, dest="new_file_path"), + make_option("--nbcall",type="character",default=NULL, dest="nbcall"), + make_option("--outputgraph",type="character",default=NULL, dest="outputgraph"), + make_option("--graph",type="character",default=NULL, dest="graph"), + make_option("--signalType",type="character",default=NULL, dest="signalType"), + make_option("--cellularity",type="character",default=NULL, dest="cellularity"), + make_option("--outputlog",type="character",default=NULL, dest="outputlog"), + make_option("--log",type="character",default=NULL, dest="log"), + make_option("--user_id",type="character",default=NULL, dest="userid"), + make_option("--method",type="character",default=NULL, dest="method") +); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +if(is.null(opt$input)){ + print_help(opt_parser) + stop("input required.", call.=FALSE) +} + +#loading libraries + +input=opt$input +output=opt$output +tmp_dir=opt$new_file_path +nbcall=as.numeric(opt$nbcall) +outputgraph=type.convert(opt$outputgraph) +cellularity=as.numeric(opt$cellularity) +userId=opt$userid +method=opt$method +log=opt$log +outputlog=opt$outputlog +graph=opt$graph 
+signalType=opt$signalType + +#args<-commandArgs(TRUE) +# +#input=args[1] +#tmp_dir=args[2] +#nbcall=as.numeric(args[3]) +#cellularity=as.numeric(args[4]) +#output=args[5] +#method=args[6] +#userId=args[7] +#signalType=args[8] + +library(MPAgenomics) +workdir=file.path(tmp_dir,"mpagenomics",userId) +setwd(workdir) + +if (outputlog){ + sinklog <- file(log, open = "wt") + sink(sinklog ,type = "output") + sink(sinklog, type = "message") +} + +CN=read.table(input,header=TRUE) +uniqueChr=unique(CN$chromosome) +drops=c("chromosome","position","probeName") +CNsignal=CN[,!(names(CN)%in% drops),drop=FALSE] + +samples=names(CNsignal) + +if (signalType=="CN") +{ + +result=data.frame(sampleNames=character(0),chrom=character(0),chromStart=numeric(0),chromEnd=numeric(0),probes=numeric(0),means=numeric(0),calls=character(0),stringsAsFactors=FALSE) + +for (chr in uniqueChr) +{ +currentSubset=subset(CN, chromosome==chr) +currentPositions=currentSubset["position"] +for (sample in samples) + { + currentSignal=currentSubset[sample] + if (length(which(!is.na(unlist(currentSignal))))>1) + { + currentSeg=segmentation(signal=unlist(currentSignal),position=unlist(currentPositions),method=method) + callobj= callingObject(copynumber=currentSeg$signal, segmented=currentSeg$segmented,chromosome=rep(chr,length(currentSeg$signal)), position=currentSeg$startPos,sampleNames=sample) + currentCall=callingProcess(callobj,nclass=nbcall,cellularity=cellularity,verbose=TRUE) + currentResult=currentCall$segment + currentResult["sampleNames"]=c(rep(sample,length(currentCall$segment$chrom))) + result=rbind(result,currentResult) + } + } +} +finalResult=data.frame(sampleNames=result["sampleNames"],chrom=result["chrom"],chromStart=result["chromStart"],chromEnd=result["chromEnd"],probes=result["probes"],means=result["means"],calls=result["calls"],stringsAsFactors=FALSE) + +write.table(finalResult,output,row.names = FALSE, quote=FALSE, sep = "\t") +} else { + 
result=data.frame(sampleNames=character(0),chrom=character(0),start=numeric(0),end=numeric(0),points=numeric(0),means=numeric(0),stringsAsFactors=FALSE) + + for (chr in uniqueChr) + { + cat(paste0("chromosome ",chr,"\n")) + currentSubset=subset(CN, chromosome==chr) + currentPositions=currentSubset["position"] + for (sample in samples) + { + cat(paste0(" sample ",sample,"...")) + currentSignal=currentSubset[sample] + if (length(which(!is.na(unlist(currentSignal))))>1) + { + currentSeg=segmentation(signal=unlist(currentSignal),position=unlist(currentPositions),method=method) + currentResult=currentSeg$segment + currentResult["chrom"]=c(rep(chr,length(currentSeg$segment$means))) + currentResult["sampleNames"]=c(rep(sample,length(currentSeg$segment$means))) + result=rbind(result,currentResult) + + } + cat(paste0("OK\n")) + } + } + finalResult=data.frame(sampleNames=result["sampleNames"],chrom=result["chrom"],chromStart=result["start"],chromEnd=result["end"],probes=result["points"],means=result["means"],stringsAsFactors=FALSE) + write.table(finalResult,output,row.names = FALSE, quote=FALSE, sep = "\t") +} + +if (outputgraph){ + file.rename(file.path(tmp_dir,"mpagenomics",userId,"Rplots.pdf"), graph) +} + +if (outputlog){ + sink(type="output") + sink(type="message") + close(sinklog) +} diff -r bbf427bd6967 -r af4f63f27c77 segmentation.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segmentation.xml Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,114 @@ + + of a previously normalized signal + mpagenomics + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + outputlog == "TRUE" + + + outputgraph == "TRUE" + + + + + + + +**What it does** +This tool segments normalized profiles provided by the user and labels segments found in the copy-number profiles. + +Input format: + +*A tabular text file containing 3 fixed columns and 1 column per sample:* + + - chr: Chromosome. + - position: Genomic position (in bp) + - probeName: Probes names. 
+ - One column per sample, containing the copy-number profile of that sample. + +Output format: + +*A tabular text file containing 7 columns which describe all the segments (1 line per segment):* + + - sampleNames: Column names corresponding to samples in the input file. + - chrom: Chromosome of the segment. + - chromStart: Starting position (in bp) of the segment. This position is not included in the segment. + - chromEnd: Ending position (in bp) of the segment. This position is included in the segment. + - probes: Number of probes in the segment. + - means: Mean of the segment. + - calls: Call (label) assigned to the segment ("double loss", "loss", "normal", "gain" or "amplification"). This column is only written when the signal type is CN. + +----- + +**Citation** +If you use this tool please cite: + +`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ + +If segmentation is performed with PELT, please also cite `R. Killick, P. Fearnhead, and I. A. Eckley. Optimal detection of changepoints with a linear computational cost. Journal of the American Statistical Association, 107(500):1590–1598, 2012. <http://arxiv.org/abs/1101.1438>`_ + +If segmentation is performed by cghseg, please cite `Picard, F., Robin, S., Lavielle, M., Vaisse, C., and Daudin, J.-J. (2005). A statistical approach for array CGH data analysis. BMC Bioinformatics, 6(1):27. <http://www.ncbi.nlm.nih.gov/pubmed/15705208>`_, +and also cite Rigaill, G. (2010). `Pruned dynamic programming for optimal multiple change-point detection. <http://arxiv.org/abs/1004.0887>`_ + +When using the labels of the segments, please cite CGHcall `M. A. van de Wiel, K. I. Kim, S. J. Vosse, W. N. van Wieringen, S. M. Wilting, and B. Ylstra. CGHcall: calling aberrations for array CGH tumor profiles. Bioinformatics, 23(7):892–894, 2007. 
+ <http://bioinformatics.oxfordjournals.org/content/23/7/892.abstract>`_ + + +
diff -r bbf427bd6967 -r af4f63f27c77 selection.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/selection.R Wed Apr 08 15:34:17 2020 +0000
@@ -0,0 +1,98 @@
+# Marker selection driver: calls markerSelection() from MPAgenomics with the
+# options received on the command line and writes the selected markers (or a
+# "no SNP selected" notice) to the output file.
+#
+# Positional arguments:
+#   1  input         first argument of markerSelection()
+#   2  dataResponse  second argument of markerSelection()
+#   3  chrom         comma-separated chromosome numbers, or "all" / "None"
+#   4  tmp_dir       directory containing the "mpagenomics/<user>" work dir
+#   5  signal        signal type; "CN" additionally passes onlySNP
+#   6  snp           onlySNP value (used only when signal is "CN")
+#   7  settingsType  "tumor" additionally passes normalTumorArray
+#   8  tumor         normalTumorArray value (used only for "tumor")
+#   9  fold          nbFolds value
+#  10  loss          loss value
+#  11  plot          plot value
+#  12  output        path of the result file
+#  13  user          user id, last component of the work dir
+#  14  package       pkg value
+args <- commandArgs(TRUE)
+
+input <- args[1]
+dataResponse <- args[2]
+chrom <- args[3]
+tmp_dir <- args[4]
+signal <- args[5]
+# as.is = TRUE keeps values that are neither logical nor numeric as plain
+# character strings (never factors) and avoids the warning that recent R
+# versions emit when as.is is left unspecified.
+snp <- type.convert(args[6], as.is = TRUE)
+settingsType <- args[7]
+tumor <- args[8]
+fold <- as.integer(args[9])
+loss <- args[10]
+plot <- type.convert(args[11], as.is = TRUE)
+output <- args[12]
+user <- args[13]
+package <- args[14]
+
+library(MPAgenomics)
+library(glmnet)
+library(spikeslab)
+library(lars)
+workdir <- file.path(tmp_dir, "mpagenomics", user)
+setwd(workdir)
+
+# "all" (any case, anywhere in the string) or "None" selects chromosomes 1 to
+# 25. The condition is scalar, so the short-circuit `||` is used.
+if (grepl("all", tolower(chrom)) || chrom == "None") {
+  chrom_vec <- 1:25
+} else {
+  chrom_vec <- as.numeric(unlist(strsplit(chrom, ",")))
+}
+
+# Build the argument list once and add the optional arguments only for the
+# settings that use them, instead of repeating the call four times.
+selection_args <- list(
+  input, dataResponse,
+  chromosome = chrom_vec, signal = signal,
+  loss = loss, plot = plot, nbFolds = fold, pkg = package
+)
+if (settingsType == "tumor") {
+  selection_args$normalTumorArray <- tumor
+}
+if (signal == "CN") {
+  selection_args$onlySNP <- snp
+}
+res <- do.call(markerSelection, selection_args)
+
+# Echo the raw result on standard output.
+print(res)
+
+# Gather the selected markers of every chromosome into one data frame.
+df <- data.frame()
+markerSelected <- FALSE
+
+for (i in names(res)) {
+  chr_data <- res[[i]]
+  len <- length(chr_data$markers.index)
+  if (len != 0) {
+    markerSelected <- TRUE
+    chrdf <- data.frame(
+      rep(i, len), chr_data$markers.position, chr_data$markers.index,
+      chr_data$markers.names, chr_data$coefficient
+    )
+    df <- rbind(df, chrdf)
+  }
+}
+
+if (markerSelected) {
+  colnames(df) <- c("chr", "position", "index", "names", "coefficient")
+  # The table is written as printed, column-aligned text.
+  sink(output)
+  print(format(df), row.names = FALSE)
+  sink()
+} else {
+  writeLines("no SNP selected", output)
+}
diff -r bbf427bd6967 -r af4f63f27c77 selection.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/selection.py Wed Apr 08 15:34:17 2020 +0000
@@ -0,0 +1,53 @@
+import os
+import sys
+import subprocess
+import shutil
+
+def main():
+    """Run selection.R on the data-set named in the input file.
+
+    The R script's standard output and error go to the requested log file
+    (or to errfile.log in the temporary directory). When a plot was
+    requested and R succeeded, Rplots.pdf is copied to the figures output.
+    The process exits with the return code of the R script.
+    """
+    input_file=sys.argv[1]
+    tmp_dir=sys.argv[4]
+    script_dir=os.path.dirname(os.path.abspath(__file__))
+    plot=sys.argv[11]
+    pdffigures=sys.argv[13]
+    outputlog=sys.argv[14]
+    log=sys.argv[15]
+    user=sys.argv[16]
+    package=sys.argv[17]
+
+    # The data-set name is the second tab-separated field of the first line;
+    # strip the line terminator in case that field is the last one.
+    with open(input_file,'r') as iFile:
+        dataset=iFile.readline().split("\t")[1].rstrip("\r\n")
+
+    if (outputlog=="TRUE"):
+        log_path=log
+    else:
+        log_path=os.path.join(tmp_dir,"errfile.log")
+
+    # selection.R receives the data-set name, arguments 2 to 12 unchanged,
+    # then the user id and the package name.
+    command=["Rscript", os.path.join(script_dir,"selection.R"), dataset]
+    command+=sys.argv[2:13]
+    command+=[user, package]
+
+    # The context manager closes the log even if Rscript cannot be started.
+    with open(log_path,'w') as errfile:
+        retcode=subprocess.call(command, stdout = errfile, stderr = errfile)
+
+    # Copy the figures only after a successful run, so that a failure of the
+    # R script is reported through its own return code instead of being
+    # masked by a missing Rplots.pdf.
+    if (retcode==0 and plot=="TRUE"):
+        shutil.copy(os.path.join(tmp_dir,"mpagenomics",user,"Rplots.pdf"), pdffigures)
+
+    sys.exit(retcode)
+
+if __name__ == "__main__":
+    main()
diff -r bbf427bd6967 -r af4f63f27c77 selection.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/selection.xml Wed Apr 08 15:34:17 2020 +0000 @@ -0,0 +1,220 @@ + + + selection.py '$input' '$response' '$chromosome' '$__new_file_path__' '$settingsSNP.signal' + #if $settingsSNP.signal == "CN": + '$settingsSNP.snp' + #end if + #if $settingsSNP.signal == "fracB": + 'none' + #end if + '$settings.settingsType' + #if $settings.settingsType == "tumor": + '$tumorcsv' + #end if + #if $settings.settingsType == "standard": + 'none' + #end if + '$folds' '$settingsLoss.loss' 
'$outputgraph' '$output' '$pdffigures' '$outputlog' '$log' '$__user_id__' + #if $settingsLoss.loss == "linear": + '$settingsLoss.package' + #end if + #if $settingsLoss.loss == "logistic": + 'HDPenReg' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + outputgraph == "TRUE" + (settingsLoss['package'] != 'spikeslab') + + + outputlog == "TRUE" + + + + + + +.. class:: warningmark + +Data normalization must be run with the Data Normalization tool prior to SNPs selection. Otherwise, the standalone version can be used to perform marker selection from matrices containing data normalized with tools different from the one proposed in this instance. + +----- + +**What it does** + +This tool selects some relevant markers according to a response using penalized regressions. + +Output: + +A tabular text file containing 5 columns which describe all the selected SNPs (1 line per SNPs): + + - chr: Chromosome containing the selected SNP. + - position: Position of the selected SNP. + - index: Index of the selected SNP. + - names: Name of the selected SNP. + - coefficient: Regression coefficient of the selected SNP. + +----- + +**Data Response csv file** + +Data response csv file format: + + - The first column contains the names of the different files of the data-set. + + - The second column contains the response associated with each file. + + - Column names of these two columns are respectively files and response. 
+ + - Columns are separated by a comma. + + - *Extensions of the files (.CEL for example) should be removed* + + + +**Example** + +Suppose the studied dataset contains 3 .cel files:: + + patient1.cel + patient2.cel + patient3.cel + +The csv file should look like this:: + + files,response + patient1,1.92145 + patient2,2.12481 + patient3,1.23545 + + +----- + +**Normal-tumor study** + +When normal (control) samples are matched to tumor samples, the normal samples are used as references to extract the copy-number profiles. In this case, a normal-tumor csv file must be provided: + + - The first column contains the names of the files corresponding to normal samples of the dataset. + + - The second column contains the names of the files corresponding to tumor samples. + + - The two columns must be named normal and tumor, respectively. + + - Columns are separated by a comma. + + - *Extensions of the files (.CEL for example) should be removed* + + +**Example** + +Suppose the studied dataset contains 6 .cel files (3 patients, each represented by a pair of normal and tumor .cel files):: + + patient1_normal.cel + patient1_tumor.cel + patient2_normal.cel + patient2_tumor.cel + patient3_normal.cel + patient3_tumor.cel + + +The csv file should look like this:: + + normal,tumor + patient1_normal,patient1_tumor + patient2_normal,patient2_tumor + patient3_normal,patient3_tumor + +----- + + + +**Citation** + +If you use this tool please cite: + +`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ + + + 