# HG changeset patch # User blanck # Date 1430294932 -7200 # Node ID 7dc6ce39fb896a52e847324b7c151a7703f8fdc7 # Parent b7f3854e08f8bcc642db02aac1985c676ffbf73a add selection tool diff -r b7f3854e08f8 -r 7dc6ce39fb89 selection.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/selection.R Wed Apr 29 10:08:52 2015 +0200 @@ -0,0 +1,75 @@ +args<-commandArgs(TRUE) + +input=args[1] +dataResponse=args[2] +chrom=args[3] +tmp_dir=args[4] +signal=args[5] +snp=type.convert(args[6]) +settingsType=args[7] +tumor=args[8] +fold=as.integer(args[9]) +loss=args[10] +plot=type.convert(args[11]) +output=args[12] +user=args[13] +package=args[14] + + +library(MPAgenomics) +library(glmnet) +library(spikeslab) +library(lars) +workdir=file.path(tmp_dir, "mpagenomics",user) +setwd(workdir) + +if (grepl("all",tolower(chrom)) | chrom=="None") { + chrom_vec=c(1:25) + } else { + chrom_tmp <- strsplit(chrom,",") + chrom_vecstring <-unlist(chrom_tmp) + chrom_vec <- as.numeric(chrom_vecstring) + } + + +if (settingsType == "tumor") { + if (signal=="CN") { + res=markerSelection(input,dataResponse, chromosome=chrom_vec, signal=signal, normalTumorArray=tumor, onlySNP=snp, loss=loss, plot=plot, nbFolds=fold, pkg=package) + } else { + res=markerSelection(input,dataResponse, chromosome=chrom_vec,signal=signal,normalTumorArray=tumor, loss=loss, plot=plot, nbFolds=fold,pkg=package) + } +} else { + if (signal=="CN") { + res=markerSelection(input,dataResponse, chromosome=chrom_vec, signal=signal, onlySNP=snp, loss=loss, plot=plot, nbFolds=fold,pkg=package) + } else { + res=markerSelection(input,dataResponse, chromosome=chrom_vec, signal=signal, loss=loss, plot=plot, nbFolds=fold,pkg=package) + } +} + +res + +df=data.frame() +list_chr=names(res) +markerSelected=FALSE + +for (i in list_chr) { + chr_data=res[[i]] + len=length(chr_data$markers.index) + if (len != 0) + { + markerSelected=TRUE + 
chrdf=data.frame(rep(i,len),chr_data$markers.position,chr_data$markers.index,chr_data$markers.names,chr_data$coefficient) + df=rbind(df,chrdf) + } +} + +if (markerSelected) { + colnames(df) <- c("chr","position","index","names","coefficient") + sink(output) + print(format(df),row.names=FALSE) + sink() + #write.table(df,output,row.names = FALSE, quote = FALSE, sep = "\t") +} else + writeLines("no SNP selected", output) + + diff -r b7f3854e08f8 -r 7dc6ce39fb89 selection.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/selection.py Wed Apr 29 10:08:52 2015 +0200 @@ -0,0 +1,38 @@ +import os +import sys +import subprocess +import shutil + +def main(): + + input_file=sys.argv[1] + tmp_dir=sys.argv[4] + script_dir=os.path.dirname(os.path.abspath(__file__)) + plot=sys.argv[11] + pdffigures=sys.argv[13] + outputlog=sys.argv[14] + log=sys.argv[15] + user=sys.argv[16] + package=sys.argv[17] + + iFile=open(input_file,'r') + dataSetLine=iFile.readline() + dataset=dataSetLine.split("\t")[1] + iFile.close() + + if (outputlog=="TRUE"): + errfile=open(log,'w') + else: + errfile=open(os.path.join(tmp_dir,"errfile.log"),'w') + + retcode=subprocess.call(["Rscript", os.path.join(script_dir,"selection.R"), dataset, sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], sys.argv[7], sys.argv[8], sys.argv[9], sys.argv[10], sys.argv[11], sys.argv[12],sys.argv[16],package], stdout = errfile, stderr = errfile) + + if (plot=="TRUE"): + shutil.copy(os.path.join(tmp_dir,"mpagenomics",user,"Rplots.pdf"), pdffigures) + + errfile.close() + + sys.exit(retcode) + +if __name__ == "__main__": + main() diff -r b7f3854e08f8 -r 7dc6ce39fb89 selection.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/selection.xml Wed Apr 29 10:08:52 2015 +0200 @@ -0,0 +1,220 @@ + + + selection.py '$input' '$response' '$chromosome' '$__new_file_path__' '$settingsSNP.signal' + #if $settingsSNP.signal == "CN": + '$settingsSNP.snp' + #end if + #if $settingsSNP.signal == "fracB": + 'none' + #end if + 
'$settings.settingsType' + #if $settings.settingsType == "tumor": + '$tumorcsv' + #end if + #if $settings.settingsType == "standard": + 'none' + #end if + '$folds' '$settingsLoss.loss' '$outputgraph' '$output' '$pdffigures' '$outputlog' '$log' '$__user_id__' + #if $settingsLoss.loss == "linear": + '$settingsLoss.package' + #end if + #if $settingsLoss.loss == "logistic": + 'HDPenReg' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + outputgraph == "TRUE" + (settingsLoss['package'] != 'spikeslab') + + + outputlog == "TRUE" + + + + + + +.. class:: warningmark + +Data normalization must be run with the Data Normalization tool prior to SNPs selection. Otherwise, the standalone version can be used to perform marker selection from matrices containing data normalized with tools different from the one proposed in this instance. + +----- + +**What it does** + +This tool selects some relevant markers according to a response using penalized regressions. + +Output: + +A tabular text file containing 5 columns which describe all the selected SNPs (1 line per SNPs): + + - chr: Chromosome containing the selected SNP. + - position: Position of the selected SNP. + - index: Index of the selected SNP. + - names: Name of the selected SNP. + - coefficient: Regression coefficient of the selected SNP. + +----- + +**Data Response csv file** + +Data response csv file format: + + - The first column contains the names of the different files of the data-set. + + - The second column contains the response associated with each file. + + - Column names of these two columns are respectively files and response. 
+ + - Columns are separated by a comma. + + - *Extensions of the files (.CEL for example) should be removed* + + + +**Example** + +Suppose the studied dataset contains 3 .cel files :: + + patient1.cel + patient2.cel + patient3.cel + +The csv file should look like this :: + + files,response + patient1,1.92145 + patient2,2.12481 + patient3,1.23545 + + +----- + +**Normal-tumor study** + +When normal (control) samples are matched to tumor samples, the normal samples are used as references to extract the copy number profile. In this case, a normal-tumor csv file must be provided: + + - The first column contains the names of the files corresponding to normal samples of the dataset. + + - The second column contains the names of the files corresponding to tumor samples. + + - Column names of these two columns are respectively normal and tumor. + + - Columns are separated by a comma. + + - *Extensions of the files (.CEL for example) should be removed* + + +**Example** + +Suppose the studied dataset contains 6 .cel files (3 patients, each represented by a pair of normal and tumor .cel files) :: + + patient1_normal.cel + patient1_tumor.cel + patient2_normal.cel + patient2_tumor.cel + patient3_normal.cel + patient3_tumor.cel + + +The csv file should look like this :: + + normal,tumor + patient1_normal,patient1_tumor + patient2_normal,patient2_tumor + patient3_normal,patient3_tumor + +----- + + + +**Citation** + +If you use this tool, please cite: + +`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_ + + +