Mercurial > repos > fubar > rglasso
diff rglasso_cox.xml @ 0:cf295f36d606 draft
Initial commit for iuc/test rglasso
author | fubar |
---|---|
date | Sat, 31 Oct 2015 01:07:28 -0400 |
parents | |
children | 31be675baa50 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rglasso_cox.xml Sat Oct 31 01:07:28 2015 -0400 @@ -0,0 +1,910 @@ +<tool id="rglasso_cox" name="Lasso" version="0.03"> + <description>and cox regression using elastic net</description> + <requirements> + <requirement type="package" version="3.2.2">R_3_2_2</requirement> + <requirement type="package" version="1.3.18">graphicsmagick</requirement> + <requirement type="package" version="9.10">ghostscript</requirement> + <requirement type="package" version="3.2">glmnet_lars_3_2</requirement> + </requirements> + <command interpreter="python"> + rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rglasso" + --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" + </command> +<configfiles> +<configfile name="runme"> +<![CDATA[ +library('glmnet') +library('lars') +library('survival') +library('pec') + + +message=function(x) {print.noquote(paste(x,sep=''))} + + +ross.cv.glmnet = function (x, y, weights, offset = NULL, lambda = NULL, type.measure = c("mse", + "deviance", "class", "auc", "mae"), nfolds = 10, foldid, + grouped = TRUE, keep = FALSE, parallel = FALSE, ...) +{ + if (missing(type.measure)) + type.measure = "default" + else type.measure = match.arg(type.measure) + if (!is.null(lambda) && length(lambda) < 2) + stop("Need more than one value of lambda for cv.glmnet") + N = nrow(x) + if (missing(weights)) + weights = rep(1, N) + else weights = as.double(weights) + y = drop(y) + glmnet.call = match.call(expand.dots = TRUE) + sel = match(c("type.measure", "nfolds", "foldid", "grouped", + "keep"), names(glmnet.call), F) + if (any(sel)) + glmnet.call = glmnet.call[-sel] + glmnet.call[[1]] = as.name("glmnet") + glmnet.object = glmnet(x, y, weights = weights, offset = offset, + lambda = lambda, ...) + glmnet.object\$call = glmnet.call + is.offset = glmnet.object\$offset + lambda = glmnet.object\$lambda + if (inherits(glmnet.object, "multnet")) { + nz = predict(glmnet.object, type = "nonzero") + nz = sapply(nz, function(x) sapply(x, length)) + nz = ceiling(apply(nz, 1, median)) + } + else nz = sapply(predict(glmnet.object, type = "nonzero"), + length) + if (missing(foldid)) + foldid = sample(rep(seq(nfolds), length = N)) + else nfolds = max(foldid) + if (nfolds < 3) + stop("nfolds must be bigger than 3; nfolds=10 recommended") + outlist = as.list(seq(nfolds)) + if (parallel && require(foreach)) { + outlist = foreach(i = seq(nfolds), .packages = c("glmnet")) %dopar% + { + sel = foldid == i + if (is.matrix(y)) + y_sub = y[!sel, ] + else y_sub = y[!sel] + if (is.offset) + offset_sub = as.matrix(offset)[!sel, ] + else offset_sub = NULL + glmnet(x[!sel, , drop = FALSE], y_sub, lambda = lambda, + offset = offset_sub, weights = weights[!sel], + ...) + } + } + else { + for (i in seq(nfolds)) { + sel = foldid == i + if (is.matrix(y)) + y_sub = y[!sel, ] + else y_sub = y[!sel] + if (is.offset) + offset_sub = as.matrix(offset)[!sel, ] + else offset_sub = NULL + outlist[[i]] = glmnet(x[!sel, , drop = FALSE], + y_sub, lambda = lambda, offset = offset_sub, + weights = weights[!sel], ...) + } + } + fun = paste("cv", class(glmnet.object)[[1]], sep = ".") + cvstuff = do.call(fun, list(outlist, lambda, x, y, weights, + offset, foldid, type.measure, grouped, keep)) + cvm = cvstuff\$cvm + cvsd = cvstuff\$cvsd + cvname = cvstuff\$name + + out = list(lambda = lambda, cvm = cvm, cvsd = cvsd, cvup = cvm + + cvsd, cvlo = cvm - cvsd, nzero = nz, name = cvname, glmnet.fit = glmnet.object) + if (keep) + out = c(out, list(fit.preval = cvstuff\$fit.preval, foldid = foldid)) + + lamin = if (type.measure == "auc") + getmin(lambda, -cvm, cvsd) + else getmin(lambda, cvm, cvsd) + out = c(out, as.list(lamin)) + hitsse = rep(0,ncol(x)) + hitsmin = rep(0,ncol(x)) + names(hitsse) = colnames(x) + names(hitsmin) = colnames(x) + olmin = lamin\$lambda.min + ol1sd = lamin\$lambda.1se + lambs = c(olmin,ol1sd) + names(lambs) = c('olmin','ol1sd') + for (cvfit in outlist) { + colmin = which(cvfit\$lambda == olmin) + col1se = which(cvfit\$lambda == ol1sd) + nzmin = which(cvfit\$beta[,colmin] != 0) + nz1se = which(cvfit\$beta[,col1se] != 0) + hitsse[nz1se] = hitsse[nz1se] + 1 + hitsmin[nzmin] = hitsmin[nzmin] + 1 + } + obj = c(out,list(cvhits.1se=hitsse,cvhits.min=hitsmin)) + class(obj) = "cv.glmnet" + obj +} + +mdsPlot = function(dm,myTitle,groups=NA,outpdfname,transpose=T) +{ + + samples = colnames(dm) + mt = myTitle + pcols=c('maroon') + if (! is.na(groups)) + { + gu = unique(groups) + colours = rainbow(length(gu),start=0.1,end=0.9) + pcols = colours[match(groups,gu)] + } + mydata = dm + if (transpose==T) + { + mydata = t(dm) + } + npred = ncol(mydata) + d = dist(mydata) + fit = cmdscale(d,eig=TRUE, k=min(10,npred-2)) + xmds = fit\$points[,1] + ymds = fit\$points[,2] + pdf(outpdfname) + plot(xmds, ymds, xlab="Dimension 1", ylab="Dimension 2", + main=paste(mt,"MDS Plot"),type="n", col=pcols, cex=0.35) + text(xmds, ymds, labels = row.names(mydata), cex=0.35, col=pcols) + grid(col="lightgray",lty="dotted") + dev.off() +} + + +getpredp_logistic = function(x,yvec,yvarname,id) +{ + yvals = unique(yvec) + if (length(yvals) != 2) { + message(c('ERROR: y does not have 2 values =',paste(yvals,collapse=','))) + return(NA) + } + cols = colnames(x) + if (length(cols) == 0) { + message('ERROR: No columns in input x? Cannot predict!') + return(NA) + } + cn = paste(cols, collapse = ' + ') + + formstring=paste("y ~",cn) + form = as.formula(formstring) + ok = complete.cases(x) + + if (sum(ok) < length(ok)) { + x = x[ok,] + yvec = yvec[ok] + id = id[ok] + } + nx = data.frame(id=id,x,y=yvec) + print('nx,yvec:') + print(head(nx,n=3)) + print(yvec) + mdl = glm(form, data=nx, family="binomial", na.action=na.omit) + message(c('Model format =',formstring)) + message(paste('Predictive model details used to generate logistic outcome probabilities for',yvarname,':')) + print(summary(md1)) + print(anova(md1)) + predp = predict(md1,nx,type="response") + p1 = data.frame(id=id,pred_response=predp,obs_response=yvec) + return(p1) +} + +getpredp_cox = function(x,time,status,id,predict_at) +{ + cols = colnames(x) + if (length(cols) == 0) { + message('ERROR: No columns in input x? Cannot predict!') + return(NA) + } + cn = paste(colnames(x), collapse = ' + ') + + formstring=paste("Surv(time, status) ~",cn) + + form = as.formula(formstring) + + ok = complete.cases(x) + + if (sum(ok) < length(ok)) { + x = x[ok,] + time = time[ok] + status = status[ok] + id = id[ok] + } + nx = data.frame(x,time=time,status=status) + m1 = coxph(form, data=nx,singular.ok=TRUE) + print.noquote('Predictive model details used to generate survival probabilities:') + print.noquote(m1) + predpq = predictSurvProb(object=m1, newdata=nx, times=predict_at) + predpq = 1-predpq + colnames(predpq) = paste('p_surv_to',predict_at,sep='_') + p1 = data.frame(id=id,predpq,time=time,status=status) + return(p1) +} + + +dolasso_cox = function(x,y,debugOn=F,maxsteps=10000,nfold=10,xcolnames,ycolnames,optLambda='lambda.1se',out_full=F,out_full_file=NA, + out_pred=F,out_pred_file=NA,cox_id=NA, descr='Cox test',do_standard=F,alpha=0.9,penalty,predict_at,mdsplots=F) +{ + logf = file("cox_rglasso.log", open = "a") + sink(logf,type = c("output", "message")) + res = NULL + if (mdsplots==T) { + outpdfname = 'cox_x_in_sample_space_MDS.pdf' + p = try({ mdsPlot(x,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T) + if (class(p) == "try-error") + { + print.noquote(paste('Unable to produce predictors in sample space mds plot',p)) + } + outpdfname = 'cox_samples_in_x_space_MDS.pdf' + p = try({mdsPlot(x,'samples in measurement space',groups=y,outpdfname=outpdfname,transpose=F) },T) + if (class(p) == "try-error") + { + print.noquote(paste('Unable to produce samples in measurement space mds plots',p)) + } + } + if (is.na(predict_at)) { predict_at = quantile(y) } + message(paste('@@@ Cox model will be predicted at times =',paste(predict_at,collapse=','))) + do_standard = do_standard + standardize = do_standard + normalize = do_standard + p = try({larsres = glmnet(x,y,family='cox',standardize=standardize,alpha=alpha,penalty.factor=penalty )},T) + if (class(p) == "try-error") + { + print.noquote('Unable to run cox glmnet on your data') + print.noquote(p) + sink() + return(NA) + } + if (out_full == T) + { + b = as.matrix(larsres\$beta) + nb = length(colnames(b)) + bcoef = b[,nb] + lastl = larsres\$lambda[length(larsres\$lambda)] + allres = data.frame(x=rownames(b),beta=bcoef,lambda=lastl) + write.table(format(allres,digits=5),out_full_file,quote=FALSE, sep="\t",row.names=F) + } + + outpdf = paste('cox',descr,'glmnetdev.pdf',sep='_') + try( + { + pdf(outpdf) + plot(larsres,main='cox glmnet',label=T) + grid() + dev.off() + },T) + + larscv = NA + + p = try({larscv=ross.cv.glmnet(x,y,family=fam,type.measure='deviance',penalty=penalty)},T) + if (class(p) == "try-error") { + print.noquote(paste('Unable to cross validate your data',p)) + sink() + return(NA) + } + lse = larscv\$cvhits.1se + lmin = larscv\$cvhits.min + tot = lse + lmin + allhits = data.frame(hits_lambda_1se = lse,hits_lambda_min = lmin) + nzhits = allhits[which(tot != 0),] + message('Times each predictor was selected in CV models (excluding zero count predictors)') + print.noquote(nzhits) + out_nz_file = 'cox_cross_validation_model_counts.xls' + write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F) + + outpdf = paste('cox',descr,'glmnet_cvdeviance.pdf',sep='_') + + p = try( + { + pdf(outpdf) + plot(larscv,main='Deviance',label=T) + grid() + dev.off() + },T) + if (optLambda == 'lambda.min') { + best_lambda = larscv\$lambda.min + bestcoef = as.matrix(coef(larscv, s = "lambda.min")) + } else { + best_lambda = larscv\$lambda.1se + bestcoef = as.matrix(coef(larscv, s = "lambda.1se")) + } + inmodel = which(bestcoef != 0) + coefs = bestcoef[inmodel] + preds = rownames(bestcoef)[inmodel] + + names(coefs) = preds + pen = as.logical( ! penalty[inmodel]) + if (out_pred==T) + { + if (length(inmodel) > 0 ) { + predcols = inmodel + xmat = as.matrix(x[,predcols]) + colnames(xmat) = preds + bestpred = getpredp_cox(x=xmat,time=y[,'time'],status=y[,'status'],id=cox_id, predict_at=predict_at) + pred = data.frame(responsep=bestpred, best_lambda=best_lambda,lamchoice=optLambda,alpha=alpha) + write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F) + } else { print.noquote('WARNING: No coefficients in selected model to predict with - no predictions made') } + } + if (debugOn) { + print.noquote(paste('best_lambda=',best_lambda,'saving cox respreds=',paste(names(coefs),collapse=','),'as predictors of survival. Coefs=',paste(coefs,collapse=','))) + } + p = try({res = data.frame(regulator=names(coefs),partial_likelihood=coefs,forced_in=pen,glmnet_model='cox',best_lambda=best_lambda, + lambdaChoice=optLambda,alpha=alpha)},T) + if (class(p) == "try-error") { + message(paste('@@@ unable to return a dataframe',p)) + sink() + return(NA) + } + print.noquote('@@@ Results preview:') + print.noquote(res,digits=5) + sink() + return(res) + +} + + +do_lasso = function(x=NA,y=NA,do_standard=T,debugOn=T,defaultFam="gaussian",optLambda='minLambda',descr='description', indx=1,target='target',sane=F, + alpha=0.9,nfold=10,penalty=c(),out_pred=F,out_pred_file='outpred',yvarname='yvar',id=c(),mdsplots=F) +{ + logf = file(paste(target,"rglasso.log",sep='_'), open = "a") + sink(logf,type = c("output", "message")) + res = NA + phe_is_bin = (length(unique(y)) == 2) + forcedin = paste(colnames(x)[which(penalty == 0)],collapse=',') + fam = "gaussian" + if (defaultFam %in% c("poisson","binomial","gaussian","multinomial")) fam=defaultFam + if (phe_is_bin == T) { + fam = "binomial" + } + print.noquote(paste('target=',target,'is binary=',phe_is_bin,'dim(x)=',paste(dim(x),collapse=','),'length(y)=',length(y),'force=',forcedin,'fam=',fam)) + standardize = do_standard + p = try({larsres = glmnet(x,y,family=fam,standardize=standardize,maxit=10000,alpha=alpha,penalty.factor=penalty) },T) + if (class(p) == "try-error") + { + print(paste('ERROR: unable to run glmnet for target',target,'error=',p)) + sink() + return(NA) + } + + mt = paste('Glmnet fraction deviance for',target) + outpdf = paste(target,'glmnetPath.pdf',sep='_') + pdf(outpdf) + plot(larsres,main=mt,label=T) + grid() + dev.off() + + outpdf = paste(target,'glmnetDeviance.pdf',sep='_') + + mt2 = paste('Glmnet lambda for',target) + + pdf(outpdf) + plot(larsres,xvar="lambda",main=mt2,label=T) + grid() + dev.off() + + larscv = NA + if (fam=="binomial") { + tmain = paste(target,'AUC') + outpdf = paste(target,'glmnetCV_AUC.pdf',sep='_') + p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='auc')},T) + } else { + tmain = paste(target,'CV MSE') + outpdf = paste(target,'glmnetCV_MSE.pdf',sep='_') + p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='mse')},T) + } + if (class(p) == "try-error") + { + print(paste('ERROR: unable to run cross validation for target',target,'error=',p)) + sink() + return(NA) + } + + pdf(outpdf) + plot(larscv,main=tmain) + grid() + dev.off() + + lse = larscv\$cvhits.1se + lmin = larscv\$cvhits.min + tot = lse + lmin + allhits = data.frame(pred=colnames(x),hits_lambda_1se = lse,hits_lambda_min = lmin) + nzhits = allhits[which(tot != 0),] + message('Total hit count for each predictor over all CV models (excluding zero count predictors)') + print.noquote(nzhits) + out_nz_file = paste(target,'cross_validation_model_counts.xls',sep='_') + write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F) + + ipenalty = c(0,penalty) + if (optLambda == 'lambda.min') { + best_lambda = larscv\$lambda.min + bestpred = as.matrix(coef(larscv, s = "lambda.min")) + } else { + best_lambda = larscv\$lambda.1se + bestpred = as.matrix(coef(larscv, s = "lambda.1se")) + } + inmodel = which(bestpred != 0) + coefs = bestpred[inmodel,1] + preds = rownames(bestpred)[inmodel] + iforced = ipenalty[inmodel] + forced = ! as.logical(iforced) + names(coefs) = preds + ncoef = length(coefs) - 1 + if (out_pred==T && fam=="binomial") + { + print.noquote(paste('Predicting',target,'probabilities from binomial glmnet at alpha',alpha,'and lambda',best_lambda)) + bestpred = predict.glmnet(larsres,s=best_lambda,newx=x,type="response") + bestpred = exp(bestpred)/(1+exp(bestpred)) + pred = data.frame(id=id,y=y,predp=as.vector(bestpred), best_lambda=best_lambda) + write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F) + } + if (debugOn) {cat(indx,'best_lambda=',best_lambda,'saving',fam,'respreds=',names(coefs),'as predictors of',target,'coefs=',coefs,'\n')} + res = try(data.frame(i=indx,pred=target,regulator=names(coefs),coef=coefs,forced_in=forced,glmnet_model=fam,ncoef=ncoef, + best_lambda=best_lambda,lambdaChoice=optLambda,alpha=alpha),T) + if (class(res) == "try-error") { + sink() + return(NA) } + print.noquote(res) + sink() + return(res) +} + + +dolasso_generic = function(predvars=NA,depvars=NA,debugOn=T,maxsteps=100, alpha=0.9,nfold=10,xcolnames=c(),ycolnames=c(),optLambda='minLambda', out_pred_file=NA, + descr="describe me",do_standard=F,defaultFam="gaussian",penalty=c(),out_pred=F,cox_id=c(),mdsplots=F,xfilt=0.95) +{ + logf = file("rglasso.log", open = "a") + sink(logf,type = c("output", "message")) + xdat = predvars + xm = data.matrix(xdat) + res = NULL + id = cox_id + depnames = ycolnames + ndep = length(depnames) + if (mdsplots==T) { + outpdfname = 'rglasso_x_in_sample_space_MDS.pdf' + p = try({ mdsPlot(xm,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T) + if (class(p) == "try-error") + { + print.noquote(paste('Unable to produce predictors in sample space mds plot. Error:',p)) + } + outpdfname = 'rglasso_samples_in_x_space_MDS.pdf' + p = try({mdsPlot(xm,'samples in measurement space',groups=NA,outpdfname=outpdfname,transpose=F) },T) + if (class(p) == "try-error") + { + print.noquote(paste('Unable to produce samples in measurement space mds plot. Error:',p)) + } + } + ndat = nrow(xm) + cfracs = colSums(! is.na(xm))/ndat + keepme = (cfracs >= xfilt) + print.noquote(paste('Removing', sum(! keepme), 'xvars with more than',xfilt,'fraction missing')) + vars = apply(xm,2,var,na.rm=T) + xm = xm[,keepme] + for (i in c(1:max(1,ndep))) { + target = depnames[i] + if (length(target) < 1) { target='y' } + if (i %% 100 == 0) { cat(i,target,'\n') } + if (ndep <= 1) { + y=depvars + } else { + y = depvars[,i] + } + if (fam == "binomial") {y = as.factor(y)} + x = xm + id = cox_id + if (fam != "cox") { + ok = complete.cases(x,y) + if (sum(! ok) > 0) { + message(paste('@@@ Removing',sum(! ok),'cases with missing y of',length(y),'@@@')) + y = y[(ok)] + x = x[(ok),] + id = id[(ok)] + } + } + ok = complete.cases(y) + if (sum(ok) == 0 ) { + print(paste("No complete cases found for",target,"in input x dim =",paste(dim(xm),collapse=','),"length y=",length(y))) + } else { + if (i == 1) { outpred = out_pred_file + } else { + outpred = paste(target,'predicted_output.xls') + } + regres = do_lasso(x=x,y=y,do_standard=do_standard,debugOn=debugOn,defaultFam=defaultFam,optLambda=optLambda,out_pred_file=outpred, + descr=descr,indx=i,target=target,alpha=alpha,nfold=nfold,penalty=penalty,out_pred=out_pred,yvarname=target,id=id,mdsplots=mdsplots) + if (! is.na(regres)) { res = rbind(res,regres) } + } + } + print.noquote('@@@ Results preview:') + print.noquote(res,digits=5) + sink() + return(res) +} + + +corPlot=function(xdat=c(),main='main title',is_raw=T) +{ + library(pheatmap) + library(gplots) + if (is_raw) { + cxdat = cor(xdat,method="spearman",use="pairwise.complete.obs") + } else { + cxdat=xdat + } + xro = nrow(cxdat) + if (xro > 1000) stop("Too many rows for heatmap, who can read?!") + fontsize_col = 5.0 + pheatmap(cxdat, main=main, show_colnames = F, width=30, height=30, + fontsize_row=fontsize_col, border_color=NA) +} + + +runTest = function(n=10) +{ + set.seed (NULL) + Y = data.frame(y1=runif (n),y2=runif(n)) + Xv <- runif(n*n) + X <- matrix(Xv, nrow = n, ncol = n) + + mydf <- data.frame(Y, X) + + regres_out = dolasso_generic(predvars=X,depvars=Y,debugOn=T,p.cutoff = 0.05,maxsteps=10000,nfold=10, + descr='randomdata',do_standard=do_standard,defaultFam="gaussian",alpha=0.05) + return(regres_out) +} +]]> +options(width=512) +options(digits=5) +alpha = $alpha +nfold = $nfold +optLambda = "$optLambda" +Out_Dir = "$html_file.files_path" +Input = "$input1" +indat = read.table(Input,head=T,sep='\t') +datcols = colnames(indat) +myTitle = "$title" +outtab = "$model_file" +do_standard = as.logical("$do_standard") +mdsplots = as.logical("$mdsplots") +fam = "$model.fam" +xvar_cols_in = "$xvar_cols" +force_xvar_cols_in = "$force_xvar_cols" +xvar_cols = as.numeric(strsplit(xvar_cols_in,",")[[1]]) +force_xvar_cols = c() +penalties = rep(1,length(datcols)) +forced_in = NA + +logxform = "$logxform_cols" +if (logxform != "None") { + logxform_cols = as.numeric(strsplit(logxform,",")[[1]]) + if (length(logxform_cols) > 0) { + small = 1e-10 + sset = indat[,logxform_cols] + zeros = which(sset==0,arr.ind=T) + nz = nrow(zeros) + if (nz > 0) { + message(paste('Log transforming encountered',nz,'zeros - added 1e-10')) + sset[zeros] = sset[zeros] + small + lset = log(sset) + indat[,logxform_cols] = lset + } + } +} +if (force_xvar_cols_in != "None") +{ + force_xvar_cols = as.numeric(strsplit(force_xvar_cols_in,",")[[1]]) + allx = c(xvar_cols,force_xvar_cols) + xvar_cols = unique(allx) + xvar_cols = xvar_cols[order(xvar_cols)] + penalties[force_xvar_cols] = 0 +} +penalty = penalties[xvar_cols] +forcedin = paste(datcols[which(penalties == 0)],collapse=',') +cox_id_col = NA +cox_id = NA + +message(paste('@@@ Using alpha =',alpha,'for all models')) +x = indat[,xvar_cols] +nx = nrow(x) +cx = ncol(x) +message(paste('@@@@ Input has',nx,'samples and',cx,'predictors')) +if (cx > nx) { +message('@@@ WARNING: Models will have more variables than cases so glmnet will likely return one of many possible solutions! Please DO NOT expect reliable results - glmnet is clever but not magical @@@') +} + +xcolnames = datcols[xvar_cols] + +if (file.exists(Out_Dir) == F) dir.create(Out_Dir) +out_full = F +out_full_file = NA +out_pred_file = "" +out_pred = as.logical("$model.output_pred") + +#if $model.fam == "binomial" or $model.fam == "cox": + cox_id_col = $model.cox_id + cox_id = indat[,cox_id_col] + if (out_pred == T) { + out_pred_file="$output_pred_file" + rownames(x) = cox_id + } +#end if +#if $model.fam == "cox": + cox_time = $model.cox_time + cox_status = $model.cox_status + out_full = as.logical("$model.output_full") + if (out_full == T) { out_full_file="$output_full_file" } + yvar_cols = c(cox_time,cox_status) + ycolnames = c('time','status') + istat = as.double(indat[,cox_status]) + itime = as.double(indat[,cox_time]) + predict_at = quantile(itime) + if ("$model.predict_at" > "") + { + pa = "$model.predict_at" + predict_at = as.numeric(strsplit(pa,",")[[1]]) + } + y = data.frame(time = itime, status = istat) + ustat = unique(istat) + if ((length(ustat) != 2) | (! 1 %in% ustat ) | (! 0 %in% ustat)) + { + print.noquote(paste('INPUT ERROR: status must have 0 (censored) or 1 (event) but found',paste(ustat,collapse=',') )) + quit(save='no',status=1) + } + y = as.matrix(y) + x = as.matrix(x) + print.noquote(paste('@@@ Cox model will predict yvar=',datcols[cox_status],'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames), + 'forced in=',forcedin)) + regres_out = dolasso_cox(x=x,y=y,debugOn=F,maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_full=out_full,out_full_file=out_full_file, + out_pred=out_pred,out_pred_file=out_pred_file,cox_id=cox_id,descr=myTitle,do_standard=do_standard,alpha=alpha,penalty=penalty,predict_at=predict_at,mdsplots=mdsplots) +#else: + yvar_cols = "$model.yvar_cols" + yvar_cols = as.numeric(strsplit(yvar_cols,",")[[1]]) + ycolnames = datcols[yvar_cols] + print.noquote(paste('@@@',fam,'model will predict yvar=',paste(ycolnames,collapse=','),'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames), + 'forced in=',forcedin)) + y = data.matrix(indat[,yvar_cols]) + print.noquote(paste('Model will use',fam,'link function to predict yvar=',paste(ycolnames,collapse=','),'n preds=',length(xcolnames),'forced in=',forcedin)) + regres_out = dolasso_generic(predvars=x,depvars=y,debugOn=F, maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_pred_file=out_pred_file, + descr=myTitle,do_standard=do_standard,defaultFam=fam,alpha=alpha,penalty=penalty,out_pred=out_pred,cox_id=cox_id,mdsplots=mdsplots) +#end if + +write.table(format(regres_out,digits=5),outtab,quote=FALSE, sep="\t",row.names=F) +print.noquote('@@@ SessionInfo for this R session:') +sessionInfo() +warnings() + +</configfile> +</configfiles> + <inputs> + <param name="title" type="text" value="lasso test" label="Title for job outputs" help="Typing a short, meaningful text here will help remind you (and explain to others) what the outputs represent"> + <sanitizer invalid_char=""> + <valid initial="string.letters,string.digits"><add value="_" /> </valid> + </sanitizer> + </param> + <param name="input1" type="data" format="tabular" label="Select an input tabular text file from your history. Rows represent samples; Columns are measured phenotypes" + multiple='False' optional="False" help="Tabular text data with samples as rows, phenotypes as columns with a header row of column identifiers" /> + <param name="xvar_cols" label="Select columns containing numeric variables to use as predictor (x) variables" type="data_column" data_ref="input1" numerical="False" + multiple="True" use_header_names="True" force_select="True" /> + <param name="force_xvar_cols" label="Select numeric columns containing variables ALWAYS included as predictors in cross validation" type="data_column" data_ref="input1" numerical="False" + multiple="True" use_header_names="True" force_select="False"/> + <conditional name="model"> + <param name="fam" type="select" label="GLM Link function for models" + help="Binary dependant variables will automatically be set to Binomial no matter what this is set to"> + <option value="gaussian" selected="true">Gaussian - continuous dependent (y)</option> + <option value="binomial">Binomial dependent variables</option> + <option value="poisson">Poisson (eg counts)</option> + <option value="cox">Cox models - require special setup for y variables - see below</option> + </param> + <when value="gaussian"> + <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False" + multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> + <param name="output_full" type="hidden" value='F' /> + <param name="output_pred" type="hidden" value='F' /> + <param name="cox_id" label="Select column containing a unique sample identifier" + help = "Only really needed for output sample specific predicted values downstream." + type="data_column" data_ref="input1" numerical="False" force_select="True" + multiple="False" use_header_names="True" /> + </when> + <when value="binomial"> + <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False" + multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> + <param name="output_full" type="hidden" value='F' /> + <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" > + <option value="F" selected="true">No predicted value output file</option> + <option value="T">Create a predicted value output file</option> + </param> + <param name="cox_id" label="Select column containing a unique sample identifier" + help = "Only really needed for output sample specific predicted values downstream." + type="data_column" data_ref="input1" numerical="False" force_select="True" + multiple="False" use_header_names="True" /> + <param name="predict_at" type="hidden" value='' /> + + </when> + <when value="poisson"> + <param name="yvar_cols" label="Select columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="True" + multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> + <param name="output_full" type="hidden" value='F' /> + <param name="output_pred" type="hidden" value='F' /> + <param name="predict_at" type="hidden" value='' /> + <param name="cox_id" label="Select column containing a unique sample identifier" + help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy" + type="data_column" data_ref="input1" numerical="True" force_select="False" + multiple="False" use_header_names="True" /> + </when> + <when value="cox"> + <param name="cox_time" label="Select column containing time under observation for Cox regression" + type="data_column" data_ref="input1" numerical="True" force_select="True" + multiple="False" use_header_names="True" help = "This MUST contain a time period - eg continuous years or days to failure or right censoring"/> + <param name="cox_status" label="Select column containing status = 1 for outcome of interest at the end of the time under observation or 0 for right censoring" + type="data_column" data_ref="input1" numerical="True" force_select="True" + multiple="False" use_header_names="True" help = "This MUST contain 1 for subjects who had an event at that time or 0 for a right censored observation"/> + <param name="cox_id" label="Select column containing a unique sample identifier" + help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy" + type="data_column" data_ref="input1" numerical="False" force_select="False" + multiple="False" use_header_names="True" /> + <param name="output_full" type="select" label="Create a tabular output with coefficients for all predictors" > + <option value="F" selected="true">No full model output file</option> + <option value="T">Create a full model output file</option> + </param> + <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" > + <option value="F" selected="true">No predicted value output file</option> + <option value="T">Create a predicted value output file</option> + </param> + <param name="predict_at" type="text" value='' label="Provide a comma separated list of times to make a prediction for each subject" + optional="True" help="Default (blank) will return predictions at 0%,25%,50%,75%,100% of the observed times which should be informative" /> + + </when> + </conditional> + <param name="optLambda" type="select" label="Value to use when reporting optimal model and coefficients" help="minLambda will have more predictors - 1SDLambda will be more parsimonious"> + <option value="lambda.1se" selected="true">Lambda + 1 SE of min MSE or AUC (fewer coefficients - more false negatives)</option> + <option value="lambda.min">Lambda at min MSE or max AUC (more coefficients - more false positives)</option> + </param> + <param name="logxform_cols" optional="True" label="Select numeric columns to be log transformed before use as predictors or dependent variables" type="data_column" + data_ref="input1" numerical="True" multiple="True" use_header_names="True" help = "The wisdom of doing this depends entirely on your predictors - eg can help diminish long-tailed outlier influence" + force_select="False"/> + <param name="do_standard" type="select" label="Standardise x vars" + help="If all measurements on same scale, may not be needed. Coefficients are always returned on the original scale."> + <option value="False" selected="true">No standardisation of predictors</option>l + <option value="True">Standardise predictors before model</option> + </param> + <param name="mdsplots" type="select" label="Generate MDS plots of samples in measurement space and measurements in sample space" > + <option value="False" selected="true">No MDS plots</option>l + <option value="True">Yes create MDS plots</option> + </param> + <param name="alpha" type="float" value="0.95" min="0.01" max="1.0" label="Alpha - see glmnet docs. 1 for pure lasso. 0.0 for pure ridge regression" + help="Default 0.95 allows lasso to cope better with expected predictor collinearity. Use (eg) 0.5 for hybrid regularised regression or (eg) 0.025 for ridge regression"/> + <param name="nfold" type="integer" value="10" label="Number of folds for internal cross validation" + help="Default of 10 is usually ok"/> + </inputs> + <outputs> + <data format="html" name="html_file" label="${title}.html"/> + <data format="tabular" name="model_file" label="${title}_modelres.xls"/> + <data format="tabular" name="output_full_file" label="${title}_full_cox_model.xls"> + <filter>model['output_full'] == 'T'</filter> + </data> + <data format="tabular" name="output_pred_file" label="${title}_predicted_from_model.xls"> + <filter>model['output_pred'] == 'T'</filter> + </data> + </outputs> + <tests> + <test> + <param name='input1' value='cox_test.xls' ftype='tabular' /> + <param name='treatment_name' value='case' /> + <param name='title' value='Cox glmnet test' /> + <param name='nfold' value='10' /> + <param name='logxform_cols' value='' /> + <param name='alpha' value='0.95' /> + <param name='do_standard' value="True" /> + <param name='cox_time' value='1' /> + <param name='cox_status' value='2' /> + <param name='cox_id' value='1' /> + <param name='predict_at' value='' /> + <param name='fam' value='cox' /> + <param name='yvar_cols' value='' /> + <param name='xvar_cols' value='3,4,5' /> + <param name='force_xvar_cols' value='3' /> + <param name='output_full' value='F' /> + <param name='output_pred' value='F' /> + <output name='model_file' file='coxlassotest_modelres.xls'> + <assert_contents> + <has_text text="rhubarb" /> + <has_text text="TRUE" /> + <!-- 	 is XML escape code for tab --> + <!-- has_line line="regulator	partial_likelihood	forced_in	glmnet_model	best_lambda" / --> + <has_line line="regulator	partial_likelihood	forced_in	glmnet_model	best_lambda	lambdaChoice	alpha" /> + <has_n_columns n="7" /> + </assert_contents> + </output> + <output name='html_file' file='coxlassotest.html' compare='diff' lines_diff='16' /> + </test> +</tests> +<help> + +**Before you start** + +Please read the glmnet documentation @ glmnet_ + +This Galaxy wrapper merely exposes that code and the glmnet_ documentation is essential reading +before getting useful results here. + +**What it does** + +From documentation at glmnet_ :: + + Glmnet is a package that fits a generalized linear model via penalized maximum likelihood. + The regularization path is computed for the lasso or elasticnet penalty at a grid of values for the regularization parameter lambda. + The algorithm is extremely fast, and can exploit sparsity in the input matrix x. + It fits linear, logistic and multinomial, poisson, and Cox regression models. + A variety of predictions can be made from the fitted models. + +Internal cross validation is used to optimise the choice of lambda based on CV AUC for logistic (binomial outcome) models, or CV mse for gaussian. + +**Warning about the tyrany of dimensionality** + +Yes, this package will select 'optimal' models even when you (optimistically) supply more predictors than you have cases. +The model returned is unlikely to represent the only informative regularisation path through your data - if you run repeatedly with +exactly the same settings, you will probably see many different models being selected. +This is not a software bug - the real problem is that you just don't have enough information in your data. + +Sufficiently big jobs will take a while (eg each lasso regression with 20k features on 1k samples takes about 2-3 minutes on our aged cluster) + +**Input** + +Assuming you have more measurements than samples, you supply data as a tabular text file where each row is a sample and columns +are variables. You specify which columns are dependent (predictors) and which are observations for each sample. Each of multiple +dependent variable columns will be run and reported independently. Predictors can be forced in to the model. + +**Output** + +For each selected dependent regression variable, a brief report of the model coefficients predicted at the +'optimal' nfold CV value of lambda. + +**Predicted event probabilities for Cox and Logistic models** + +If you want to compare (eg) two competing clinical predictions, there's a companion generic NRI tool +for predicted event probabilities. Estimates dozens of measures of improvement in prediction. Currently only works for identical id subjects +but can probably be extended to independent sample predictions. + +Given a model, we can generate a predicted p (for status 1) in binomial or cox frameworks so models can be evaluated in terms of NRI. +Of course, estimates are likely substantially inflated over 'real world' performance by being estimated from the same sample - but you probably +already knew that since you were smart enough to reach this far down into the on screen help. The author salutes you, intrepid reader! + +It may seem an odd thing to do, but we can predict p for an event for each subject from our original data, given a parsimonious model. Doing +this for two separate models (eg, forcing in an additional known explanatory measurement to the new model) allows comparison of the two models +predicted status for each subject, or the same model in independent populations to see how badly it does + +**Attributions** + +glmnet_ is the R package exposed by this Galaxy tool. + +Galaxy_ (that's what you are using right now!) for gluing everything together + +Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is +licensed to you under the LGPL_ like other rgenetics artefacts + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html +.. _glmnet: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html +.. _Galaxy: http://getgalaxy.org +</help> + +<citations> + <citation type="bibtex"> +@Article{Friedman2010, title = {Regularization Paths for Generalized Linear Models via Coordinate Descent}, + author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani}, + journal = {Journal of Statistical Software}, + year = {2010}, + volume = {33}, + number = {1}, + pages = {1--22}, + url = {http://www.jstatsoft.org/v33/i01/} + } + </citation> + <citation type="doi"> +10.1093/bioinformatics/bts573 + </citation> +</citations> +</tool>