Mercurial > repos > fubar > rglasso

diff rglasso_cox.xml @ 0:cf295f36d606 draft
Initial commit for iuc/test rglasso
author: fubar
date: Sat, 31 Oct 2015 01:07:28 -0400
children: 31be675baa50
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rglasso_cox.xml	Sat Oct 31 01:07:28 2015 -0400
@@ -0,0 +1,910 @@
+<tool id="rglasso_cox" name="Lasso" version="0.03">
+  <description>and cox regression using elastic net</description>
+  <requirements>
+      <requirement type="package" version="3.2.2">R_3_2_2</requirement>
+      <requirement type="package" version="1.3.18">graphicsmagick</requirement>
+      <requirement type="package" version="9.10">ghostscript</requirement>
+      <requirement type="package" version="3.2">glmnet_lars_3_2</requirement>
+  </requirements>
+  <command interpreter="python">
+     rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rglasso"
+    --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
+  </command>
+<configfiles>
+<configfile name="runme">
+<![CDATA[
+library('glmnet')
+library('lars')
+library('survival')
+library('pec')
+
+
+message=function(x) {print.noquote(paste(x,sep=''))}
+
+
+ross.cv.glmnet = function (x, y, weights, offset = NULL, lambda = NULL, type.measure = c("mse",
+    "deviance", "class", "auc", "mae"), nfolds = 10, foldid,
+    grouped = TRUE, keep = FALSE, parallel = FALSE, ...)
+{
+    if (missing(type.measure))
+        type.measure = "default"
+    else type.measure = match.arg(type.measure)
+    if (!is.null(lambda) && length(lambda) < 2)
+        stop("Need more than one value of lambda for cv.glmnet")
+    N = nrow(x)
+    if (missing(weights))
+        weights = rep(1, N)
+    else weights = as.double(weights)
+    y = drop(y)
+    glmnet.call = match.call(expand.dots = TRUE)
+    sel = match(c("type.measure", "nfolds", "foldid", "grouped",
+        "keep"), names(glmnet.call), F)
+    if (any(sel))
+        glmnet.call = glmnet.call[-sel]
+    glmnet.call[[1]] = as.name("glmnet")
+    glmnet.object = glmnet(x, y, weights = weights, offset = offset,
+        lambda = lambda, ...)
+    glmnet.object\$call = glmnet.call
+    is.offset = glmnet.object\$offset
+    lambda = glmnet.object\$lambda
+    if (inherits(glmnet.object, "multnet")) {
+        nz = predict(glmnet.object, type = "nonzero")
+        nz = sapply(nz, function(x) sapply(x, length))
+        nz = ceiling(apply(nz, 1, median))
+    }
+    else nz = sapply(predict(glmnet.object, type = "nonzero"),
+        length)
+    if (missing(foldid))
+        foldid = sample(rep(seq(nfolds), length = N))
+    else nfolds = max(foldid)
+    if (nfolds < 3)
+        stop("nfolds must be bigger than 3; nfolds=10 recommended")
+    outlist = as.list(seq(nfolds))
+    if (parallel && require(foreach)) {
+        outlist = foreach(i = seq(nfolds), .packages = c("glmnet")) %dopar%
+            {
+                sel = foldid == i
+                if (is.matrix(y))
+                  y_sub = y[!sel, ]
+                else y_sub = y[!sel]
+                if (is.offset)
+                  offset_sub = as.matrix(offset)[!sel, ]
+                else offset_sub = NULL
+                glmnet(x[!sel, , drop = FALSE], y_sub, lambda = lambda,
+                  offset = offset_sub, weights = weights[!sel],
+                  ...)
+            }
+    }
+    else {
+        for (i in seq(nfolds)) {
+            sel = foldid == i
+            if (is.matrix(y))
+                y_sub = y[!sel, ]
+            else y_sub = y[!sel]
+            if (is.offset)
+                offset_sub = as.matrix(offset)[!sel, ]
+            else offset_sub = NULL
+            outlist[[i]] = glmnet(x[!sel, , drop = FALSE],
+                y_sub, lambda = lambda, offset = offset_sub,
+                weights = weights[!sel], ...)
+        }
+    }
+    fun = paste("cv", class(glmnet.object)[[1]], sep = ".")
+    cvstuff = do.call(fun, list(outlist, lambda, x, y, weights,
+        offset, foldid, type.measure, grouped, keep))
+    cvm = cvstuff\$cvm
+    cvsd = cvstuff\$cvsd
+    cvname = cvstuff\$name
+
+    out = list(lambda = lambda, cvm = cvm, cvsd = cvsd, cvup = cvm +
+        cvsd, cvlo = cvm - cvsd, nzero = nz, name = cvname, glmnet.fit = glmnet.object)
+    if (keep)
+        out = c(out, list(fit.preval = cvstuff\$fit.preval, foldid = foldid))
+
+    lamin = if (type.measure == "auc")
+        getmin(lambda, -cvm, cvsd)
+    else getmin(lambda, cvm, cvsd)
+    out = c(out, as.list(lamin))
+    hitsse = rep(0,ncol(x))
+    hitsmin = rep(0,ncol(x))
+    names(hitsse) = colnames(x)
+    names(hitsmin) = colnames(x)
+    olmin = lamin\$lambda.min
+    ol1sd = lamin\$lambda.1se
+    lambs = c(olmin,ol1sd)
+    names(lambs) = c('olmin','ol1sd')
+    for (cvfit in outlist) {
+        colmin = which(cvfit\$lambda == olmin)
+        col1se = which(cvfit\$lambda == ol1sd)
+        nzmin = which(cvfit\$beta[,colmin] != 0)
+        nz1se = which(cvfit\$beta[,col1se] != 0)
+        hitsse[nz1se] = hitsse[nz1se] + 1
+        hitsmin[nzmin] = hitsmin[nzmin] + 1
+    }
+    obj = c(out,list(cvhits.1se=hitsse,cvhits.min=hitsmin))
+    class(obj) = "cv.glmnet"
+    obj
+}
+
+mdsPlot = function(dm,myTitle,groups=NA,outpdfname,transpose=T)
+{
+
+  samples = colnames(dm)
+  mt = myTitle
+  pcols=c('maroon')
+  if (! is.na(groups))
+  {
+  gu = unique(groups)
+  colours = rainbow(length(gu),start=0.1,end=0.9)
+  pcols = colours[match(groups,gu)]
+  }
+  mydata = dm
+  if (transpose==T)
+  {
+  mydata = t(dm)
+  }
+  npred = ncol(mydata)
+  d = dist(mydata)
+  fit = cmdscale(d,eig=TRUE, k=min(10,npred-2))
+  xmds = fit\$points[,1]
+  ymds = fit\$points[,2]
+  pdf(outpdfname)
+  plot(xmds, ymds, xlab="Dimension 1", ylab="Dimension 2",
+       main=paste(mt,"MDS Plot"),type="n", col=pcols, cex=0.35)
+  text(xmds, ymds, labels = row.names(mydata), cex=0.35, col=pcols)
+  grid(col="lightgray",lty="dotted")
+  dev.off()
+}
+
+
+getpredp_logistic = function(x,yvec,yvarname,id)
+{
+  yvals = unique(yvec)
+  if (length(yvals) != 2) {
+       message(c('ERROR: y does not have 2 values =',paste(yvals,collapse=',')))
+       return(NA)
+       }
+  cols = colnames(x)
+  if (length(cols) == 0) {
+       message('ERROR: No columns in input x? Cannot predict!')
+       return(NA)
+       }
+  cn = paste(cols, collapse = ' + ')
+
+  formstring=paste("y ~",cn)
+  form = as.formula(formstring)
+  ok = complete.cases(x)
+
+  if (sum(ok) < length(ok)) {
+    x = x[ok,]
+    yvec = yvec[ok]
+    id = id[ok]
+    }
+  nx = data.frame(id=id,x,y=yvec)
+  print('nx,yvec:')
+  print(head(nx,n=3))
+  print(yvec)
+  mdl = glm(form, data=nx, family="binomial", na.action=na.omit)
+  message(c('Model format =',formstring))
+  message(paste('Predictive model details used to generate logistic outcome probabilities for',yvarname,':'))
+  print(summary(md1))
+  print(anova(md1))
+  predp = predict(md1,nx,type="response")
+  p1 = data.frame(id=id,pred_response=predp,obs_response=yvec)
+  return(p1)
+}
+
+getpredp_cox = function(x,time,status,id,predict_at)
+{
+  cols = colnames(x)
+  if (length(cols) == 0) {
+       message('ERROR: No columns in input x? Cannot predict!')
+       return(NA)
+       }
+  cn = paste(colnames(x), collapse = ' + ')
+
+  formstring=paste("Surv(time, status) ~",cn)
+
+  form = as.formula(formstring)
+
+  ok = complete.cases(x)
+
+  if (sum(ok) < length(ok)) {
+    x = x[ok,]
+    time = time[ok]
+    status = status[ok]
+    id = id[ok]
+    }
+  nx = data.frame(x,time=time,status=status)
+  m1 = coxph(form, data=nx,singular.ok=TRUE)
+  print.noquote('Predictive model details used to generate survival probabilities:')
+  print.noquote(m1)
+  predpq = predictSurvProb(object=m1, newdata=nx, times=predict_at)
+  predpq = 1-predpq
+  colnames(predpq) = paste('p_surv_to',predict_at,sep='_')
+  p1 = data.frame(id=id,predpq,time=time,status=status)
+  return(p1)
+}
+
+
+dolasso_cox = function(x,y,debugOn=F,maxsteps=10000,nfold=10,xcolnames,ycolnames,optLambda='lambda.1se',out_full=F,out_full_file=NA,
+                             out_pred=F,out_pred_file=NA,cox_id=NA, descr='Cox test',do_standard=F,alpha=0.9,penalty,predict_at,mdsplots=F)
+{
+  logf = file("cox_rglasso.log", open = "a")
+  sink(logf,type = c("output", "message"))
+  res = NULL
+  if (mdsplots==T) {
+      outpdfname = 'cox_x_in_sample_space_MDS.pdf'
+      p = try({ mdsPlot(x,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T)
+      if (class(p) == "try-error")
+      {
+        print.noquote(paste('Unable to produce predictors in sample space mds plot',p))
+      }
+      outpdfname = 'cox_samples_in_x_space_MDS.pdf'
+      p = try({mdsPlot(x,'samples in measurement space',groups=y,outpdfname=outpdfname,transpose=F) },T)
+      if (class(p) == "try-error")
+      {
+        print.noquote(paste('Unable to produce samples in measurement space mds plots',p))
+      }
+  }
+  if (is.na(predict_at)) { predict_at = quantile(y) }
+  message(paste('@@@ Cox model will be predicted at times =',paste(predict_at,collapse=',')))
+  do_standard = do_standard
+  standardize = do_standard
+  normalize = do_standard
+  p = try({larsres = glmnet(x,y,family='cox',standardize=standardize,alpha=alpha,penalty.factor=penalty )},T)
+  if (class(p) == "try-error")
+  {
+    print.noquote('Unable to run cox glmnet on your data')
+    print.noquote(p)
+    sink()
+    return(NA)
+  }
+  if (out_full == T)
+  {
+  b = as.matrix(larsres\$beta)
+  nb = length(colnames(b))
+  bcoef = b[,nb]
+  lastl = larsres\$lambda[length(larsres\$lambda)]
+  allres = data.frame(x=rownames(b),beta=bcoef,lambda=lastl)
+  write.table(format(allres,digits=5),out_full_file,quote=FALSE, sep="\t",row.names=F)
+  }
+
+  outpdf = paste('cox',descr,'glmnetdev.pdf',sep='_')
+  try(
+      {
+      pdf(outpdf)
+      plot(larsres,main='cox glmnet',label=T)
+      grid()
+      dev.off()
+      },T)
+
+  larscv = NA
+
+  p = try({larscv=ross.cv.glmnet(x,y,family=fam,type.measure='deviance',penalty=penalty)},T)
+  if (class(p) == "try-error") {
+     print.noquote(paste('Unable to cross validate your data',p))
+     sink()
+     return(NA)
+     }
+  lse = larscv\$cvhits.1se
+  lmin = larscv\$cvhits.min
+  tot = lse + lmin
+  allhits = data.frame(hits_lambda_1se = lse,hits_lambda_min = lmin)
+  nzhits = allhits[which(tot != 0),]
+  message('Times each predictor was selected in CV models (excluding zero count predictors)')
+  print.noquote(nzhits)
+  out_nz_file = 'cox_cross_validation_model_counts.xls'
+  write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F)
+
+  outpdf = paste('cox',descr,'glmnet_cvdeviance.pdf',sep='_')
+
+  p = try(
+     {
+     pdf(outpdf)
+     plot(larscv,main='Deviance',label=T)
+     grid()
+     dev.off()
+     },T)
+  if (optLambda == 'lambda.min') {
+      best_lambda = larscv\$lambda.min
+      bestcoef = as.matrix(coef(larscv, s = "lambda.min"))
+  } else {
+      best_lambda = larscv\$lambda.1se
+      bestcoef = as.matrix(coef(larscv, s = "lambda.1se"))
+  }
+  inmodel = which(bestcoef != 0)
+  coefs = bestcoef[inmodel]
+  preds = rownames(bestcoef)[inmodel]
+
+  names(coefs) = preds
+  pen = as.logical( ! penalty[inmodel])
+  if (out_pred==T)
+  {
+      if (length(inmodel) > 0 ) {
+          predcols = inmodel
+          xmat = as.matrix(x[,predcols])
+          colnames(xmat) = preds
+          bestpred = getpredp_cox(x=xmat,time=y[,'time'],status=y[,'status'],id=cox_id, predict_at=predict_at)
+          pred = data.frame(responsep=bestpred, best_lambda=best_lambda,lamchoice=optLambda,alpha=alpha)
+          write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F)
+        } else { print.noquote('WARNING: No coefficients in selected model to predict with - no predictions made') }
+    }
+  if (debugOn) {
+      print.noquote(paste('best_lambda=',best_lambda,'saving cox respreds=',paste(names(coefs),collapse=','),'as predictors of survival. Coefs=',paste(coefs,collapse=',')))
+      }
+  p = try({res = data.frame(regulator=names(coefs),partial_likelihood=coefs,forced_in=pen,glmnet_model='cox',best_lambda=best_lambda,
+     lambdaChoice=optLambda,alpha=alpha)},T)
+  if (class(p) == "try-error") {
+    message(paste('@@@ unable to return a dataframe',p))
+    sink()
+    return(NA)
+    }
+  print.noquote('@@@ Results preview:')
+  print.noquote(res,digits=5)
+  sink()
+  return(res)
+
+}
+
+
+do_lasso = function(x=NA,y=NA,do_standard=T,debugOn=T,defaultFam="gaussian",optLambda='minLambda',descr='description', indx=1,target='target',sane=F,
+                    alpha=0.9,nfold=10,penalty=c(),out_pred=F,out_pred_file='outpred',yvarname='yvar',id=c(),mdsplots=F)
+{
+  logf = file(paste(target,"rglasso.log",sep='_'), open = "a")
+  sink(logf,type = c("output", "message"))
+  res = NA
+  phe_is_bin = (length(unique(y)) == 2)
+  forcedin = paste(colnames(x)[which(penalty == 0)],collapse=',')
+  fam = "gaussian"
+  if (defaultFam %in% c("poisson","binomial","gaussian","multinomial")) fam=defaultFam
+  if (phe_is_bin == T) {
+    fam = "binomial"
+  }
+  print.noquote(paste('target=',target,'is binary=',phe_is_bin,'dim(x)=',paste(dim(x),collapse=','),'length(y)=',length(y),'force=',forcedin,'fam=',fam))
+  standardize = do_standard
+  p = try({larsres = glmnet(x,y,family=fam,standardize=standardize,maxit=10000,alpha=alpha,penalty.factor=penalty) },T)
+  if (class(p) == "try-error")
+  {
+    print(paste('ERROR: unable to run glmnet for target',target,'error=',p))
+    sink()
+    return(NA)
+  }
+
+  mt = paste('Glmnet fraction deviance for',target)
+  outpdf = paste(target,'glmnetPath.pdf',sep='_')
+  pdf(outpdf)
+  plot(larsres,main=mt,label=T)
+  grid()
+  dev.off()
+
+  outpdf = paste(target,'glmnetDeviance.pdf',sep='_')
+
+  mt2 = paste('Glmnet lambda for',target)
+
+  pdf(outpdf)
+  plot(larsres,xvar="lambda",main=mt2,label=T)
+  grid()
+  dev.off()
+
+  larscv = NA
+  if (fam=="binomial") {
+    tmain = paste(target,'AUC')
+    outpdf = paste(target,'glmnetCV_AUC.pdf',sep='_')
+    p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='auc')},T)
+  } else {
+    tmain = paste(target,'CV MSE')
+    outpdf = paste(target,'glmnetCV_MSE.pdf',sep='_')
+    p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='mse')},T)
+  }
+  if (class(p) == "try-error")
+  {
+    print(paste('ERROR: unable to run cross validation for target',target,'error=',p))
+    sink()
+    return(NA)
+  }
+
+  pdf(outpdf)
+  plot(larscv,main=tmain)
+  grid()
+  dev.off()
+
+  lse = larscv\$cvhits.1se
+  lmin = larscv\$cvhits.min
+  tot = lse + lmin
+  allhits = data.frame(pred=colnames(x),hits_lambda_1se = lse,hits_lambda_min = lmin)
+  nzhits = allhits[which(tot != 0),]
+  message('Total hit count for each predictor over all CV models (excluding zero count predictors)')
+  print.noquote(nzhits)
+  out_nz_file = paste(target,'cross_validation_model_counts.xls',sep='_')
+  write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F)
+
+  ipenalty = c(0,penalty)
+  if (optLambda == 'lambda.min') {
+    best_lambda = larscv\$lambda.min
+    bestpred = as.matrix(coef(larscv, s = "lambda.min"))
+  } else {
+    best_lambda = larscv\$lambda.1se
+    bestpred = as.matrix(coef(larscv, s = "lambda.1se"))
+  }
+  inmodel = which(bestpred != 0)
+  coefs = bestpred[inmodel,1]
+  preds = rownames(bestpred)[inmodel]
+  iforced = ipenalty[inmodel]
+  forced = ! as.logical(iforced)
+  names(coefs) = preds
+  ncoef = length(coefs) - 1
+  if (out_pred==T && fam=="binomial")
+  {
+    print.noquote(paste('Predicting',target,'probabilities from binomial glmnet at alpha',alpha,'and lambda',best_lambda))
+    bestpred = predict.glmnet(larsres,s=best_lambda,newx=x,type="response")
+    bestpred = exp(bestpred)/(1+exp(bestpred))
+    pred = data.frame(id=id,y=y,predp=as.vector(bestpred), best_lambda=best_lambda)
+    write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F)
+  }
+  if (debugOn) {cat(indx,'best_lambda=',best_lambda,'saving',fam,'respreds=',names(coefs),'as predictors of',target,'coefs=',coefs,'\n')}
+  res = try(data.frame(i=indx,pred=target,regulator=names(coefs),coef=coefs,forced_in=forced,glmnet_model=fam,ncoef=ncoef,
+     best_lambda=best_lambda,lambdaChoice=optLambda,alpha=alpha),T)
+  if (class(res) == "try-error") {
+    sink()
+    return(NA) }
+  print.noquote(res)
+  sink()
+  return(res)
+}
+
+
+dolasso_generic = function(predvars=NA,depvars=NA,debugOn=T,maxsteps=100, alpha=0.9,nfold=10,xcolnames=c(),ycolnames=c(),optLambda='minLambda', out_pred_file=NA,
+                           descr="describe me",do_standard=F,defaultFam="gaussian",penalty=c(),out_pred=F,cox_id=c(),mdsplots=F,xfilt=0.95)
+{
+  logf = file("rglasso.log", open = "a")
+  sink(logf,type = c("output", "message"))
+  xdat = predvars
+  xm = data.matrix(xdat)
+  res = NULL
+  id = cox_id
+  depnames = ycolnames
+  ndep = length(depnames)
+  if (mdsplots==T) {
+    outpdfname = 'rglasso_x_in_sample_space_MDS.pdf'
+    p = try({ mdsPlot(xm,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T)
+    if (class(p) == "try-error")
+    {
+      print.noquote(paste('Unable to produce predictors in sample space mds plot. Error:',p))
+    }
+    outpdfname = 'rglasso_samples_in_x_space_MDS.pdf'
+    p = try({mdsPlot(xm,'samples in measurement space',groups=NA,outpdfname=outpdfname,transpose=F) },T)
+    if (class(p) == "try-error")
+    {
+      print.noquote(paste('Unable to produce samples in measurement space mds plot. Error:',p))
+    }
+  }
+  ndat = nrow(xm)
+  cfracs = colSums(! is.na(xm))/ndat
+  keepme = (cfracs >= xfilt)
+  print.noquote(paste('Removing', sum(! keepme), 'xvars with more than',xfilt,'fraction missing'))
+  vars = apply(xm,2,var,na.rm=T)
+  xm = xm[,keepme]
+  for (i in c(1:max(1,ndep)))   {
+    target = depnames[i]
+    if (length(target) < 1) { target='y' }
+    if (i %% 100 == 0) { cat(i,target,'\n') }
+    if (ndep <= 1) {
+      y=depvars
+    } else {
+      y = depvars[,i]
+    }
+    if (fam == "binomial") {y = as.factor(y)}
+    x = xm
+    id = cox_id
+    if (fam != "cox") {
+         ok = complete.cases(x,y)
+         if (sum(! ok) > 0) {
+            message(paste('@@@ Removing',sum(! ok),'cases with missing y of',length(y),'@@@'))
+            y = y[(ok)]
+            x = x[(ok),]
+            id = id[(ok)]
+           }
+    }
+    ok = complete.cases(y)
+    if (sum(ok) == 0 ) {
+      print(paste("No complete cases found for",target,"in input x dim =",paste(dim(xm),collapse=','),"length y=",length(y)))
+    } else {
+      if (i == 1) { outpred = out_pred_file
+      } else {
+        outpred = paste(target,'predicted_output.xls')
+      }
+      regres = do_lasso(x=x,y=y,do_standard=do_standard,debugOn=debugOn,defaultFam=defaultFam,optLambda=optLambda,out_pred_file=outpred,
+                        descr=descr,indx=i,target=target,alpha=alpha,nfold=nfold,penalty=penalty,out_pred=out_pred,yvarname=target,id=id,mdsplots=mdsplots)
+      if (! is.na(regres)) { res = rbind(res,regres) }
+    }
+  }
+  print.noquote('@@@ Results preview:')
+  print.noquote(res,digits=5)
+  sink()
+  return(res)
+}
+
+
+corPlot=function(xdat=c(),main='main title',is_raw=T)
+{
+  library(pheatmap)
+  library(gplots)
+  if (is_raw) {
+    cxdat = cor(xdat,method="spearman",use="pairwise.complete.obs")
+  } else {
+    cxdat=xdat
+  }
+  xro = nrow(cxdat)
+  if (xro > 1000) stop("Too many rows for heatmap, who can read?!")
+  fontsize_col = 5.0
+  pheatmap(cxdat, main=main, show_colnames = F, width=30, height=30,
+           fontsize_row=fontsize_col, border_color=NA)
+}
+
+
+runTest = function(n=10)
+{
+  set.seed (NULL)
+  Y = data.frame(y1=runif (n),y2=runif(n))
+  Xv <- runif(n*n)
+  X <- matrix(Xv, nrow = n, ncol = n)
+
+  mydf <- data.frame(Y, X)
+
+  regres_out = dolasso_generic(predvars=X,depvars=Y,debugOn=T,p.cutoff = 0.05,maxsteps=10000,nfold=10,
+                               descr='randomdata',do_standard=do_standard,defaultFam="gaussian",alpha=0.05)
+  return(regres_out)
+}
+]]>
+options(width=512)
+options(digits=5)
+alpha = $alpha
+nfold = $nfold
+optLambda = "$optLambda"
+Out_Dir = "$html_file.files_path"
+Input =  "$input1"
+indat = read.table(Input,head=T,sep='\t')
+datcols = colnames(indat)
+myTitle = "$title"
+outtab = "$model_file"
+do_standard = as.logical("$do_standard")
+mdsplots = as.logical("$mdsplots")
+fam = "$model.fam"
+xvar_cols_in = "$xvar_cols"
+force_xvar_cols_in = "$force_xvar_cols"
+xvar_cols = as.numeric(strsplit(xvar_cols_in,",")[[1]])
+force_xvar_cols = c()
+penalties = rep(1,length(datcols))
+forced_in = NA
+
+logxform = "$logxform_cols"
+if (logxform != "None") {
+    logxform_cols = as.numeric(strsplit(logxform,",")[[1]])
+    if (length(logxform_cols) > 0) {
+         small = 1e-10
+         sset = indat[,logxform_cols]
+         zeros = which(sset==0,arr.ind=T)
+         nz = nrow(zeros)
+         if (nz &gt; 0) {
+             message(paste('Log transforming encountered',nz,'zeros - added 1e-10'))
+             sset[zeros] = sset[zeros] + small
+             lset = log(sset)
+             indat[,logxform_cols] = lset
+             }
+         }
+}
+if (force_xvar_cols_in != "None")
+{
+  force_xvar_cols = as.numeric(strsplit(force_xvar_cols_in,",")[[1]])
+  allx = c(xvar_cols,force_xvar_cols)
+  xvar_cols = unique(allx)
+  xvar_cols = xvar_cols[order(xvar_cols)]
+  penalties[force_xvar_cols] = 0
+}
+penalty = penalties[xvar_cols]
+forcedin = paste(datcols[which(penalties == 0)],collapse=',')
+cox_id_col = NA
+cox_id = NA
+
+message(paste('@@@ Using alpha =',alpha,'for all models'))
+x = indat[,xvar_cols]
+nx = nrow(x)
+cx = ncol(x)
+message(paste('@@@@ Input has',nx,'samples and',cx,'predictors'))
+if (cx > nx) {
+message('@@@ WARNING: Models will have more variables than cases so glmnet will likely return one of many possible solutions! Please DO NOT expect reliable results - glmnet is clever but not magical @@@')
+}
+
+xcolnames = datcols[xvar_cols]
+
+if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
+out_full = F
+out_full_file = NA
+out_pred_file = ""
+out_pred = as.logical("$model.output_pred")
+
+#if $model.fam == "binomial" or $model.fam == "cox":
+   cox_id_col = $model.cox_id
+   cox_id = indat[,cox_id_col]
+   if (out_pred == T) {
+     out_pred_file="$output_pred_file"
+     rownames(x) = cox_id
+     }
+#end if
+#if $model.fam == "cox":
+  cox_time = $model.cox_time
+  cox_status = $model.cox_status
+  out_full = as.logical("$model.output_full")
+  if (out_full == T) { out_full_file="$output_full_file" }
+  yvar_cols = c(cox_time,cox_status)
+  ycolnames = c('time','status')
+  istat = as.double(indat[,cox_status])
+  itime = as.double(indat[,cox_time])
+  predict_at = quantile(itime)
+  if ("$model.predict_at" &gt; "")
+  {
+      pa = "$model.predict_at"
+      predict_at = as.numeric(strsplit(pa,",")[[1]])
+  }
+  y = data.frame(time = itime, status = istat)
+  ustat = unique(istat)
+  if ((length(ustat) != 2) | (! 1 %in% ustat ) | (! 0 %in% ustat))
+  {
+   print.noquote(paste('INPUT ERROR: status must have 0 (censored) or 1 (event) but found',paste(ustat,collapse=',') ))
+   quit(save='no',status=1)
+  }
+  y = as.matrix(y)
+  x = as.matrix(x)
+  print.noquote(paste('@@@ Cox model will predict yvar=',datcols[cox_status],'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames),
+    'forced in=',forcedin))
+  regres_out = dolasso_cox(x=x,y=y,debugOn=F,maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_full=out_full,out_full_file=out_full_file,
+       out_pred=out_pred,out_pred_file=out_pred_file,cox_id=cox_id,descr=myTitle,do_standard=do_standard,alpha=alpha,penalty=penalty,predict_at=predict_at,mdsplots=mdsplots)
+#else:
+    yvar_cols = "$model.yvar_cols"
+    yvar_cols = as.numeric(strsplit(yvar_cols,",")[[1]])
+    ycolnames = datcols[yvar_cols]
+    print.noquote(paste('@@@',fam,'model will predict yvar=',paste(ycolnames,collapse=','),'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames),
+    'forced in=',forcedin))
+    y = data.matrix(indat[,yvar_cols])
+    print.noquote(paste('Model will use',fam,'link function to predict yvar=',paste(ycolnames,collapse=','),'n preds=',length(xcolnames),'forced in=',forcedin))
+    regres_out = dolasso_generic(predvars=x,depvars=y,debugOn=F, maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_pred_file=out_pred_file,
+                             descr=myTitle,do_standard=do_standard,defaultFam=fam,alpha=alpha,penalty=penalty,out_pred=out_pred,cox_id=cox_id,mdsplots=mdsplots)
+#end if
+
+write.table(format(regres_out,digits=5),outtab,quote=FALSE, sep="\t",row.names=F)
+print.noquote('@@@ SessionInfo for this R session:')
+sessionInfo()
+warnings()
+
+</configfile>
+</configfiles>
+  <inputs>
+     <param name="title" type="text" value="lasso test" label="Title for job outputs" help="Typing a short, meaningful text here will help remind you (and explain to others) what the outputs represent">
+      <sanitizer invalid_char="">
+        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
+      </sanitizer>
+    </param>
+    <param name="input1"  type="data" format="tabular" label="Select an input tabular text file from your history. Rows represent samples; Columns are measured phenotypes"
+    multiple='False' optional="False" help="Tabular text data with samples as rows, phenotypes as columns with a header row of column identifiers" />
+    <param name="xvar_cols" label="Select columns containing numeric variables to use as predictor (x) variables" type="data_column" data_ref="input1" numerical="False"
+         multiple="True" use_header_names="True" force_select="True" />
+    <param name="force_xvar_cols" label="Select numeric columns containing variables ALWAYS included as predictors in cross validation" type="data_column" data_ref="input1" numerical="False"
+         multiple="True" use_header_names="True" force_select="False"/>
+    <conditional name="model">
+        <param name="fam" type="select" label="GLM Link function for models"
+             help="Binary dependant variables will automatically be set to Binomial no matter what this is set to">
+                <option value="gaussian" selected="true">Gaussian - continuous dependent (y)</option>
+                <option value="binomial">Binomial dependent variables</option>
+                <option value="poisson">Poisson (eg counts)</option>
+                <option value="cox">Cox models - require special setup for y variables - see below</option>
+        </param>
+        <when value="gaussian">
+            <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False"
+             multiple="True" use_header_names="True"  help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
+            <param name="output_full" type="hidden" value='F' />
+            <param name="output_pred" type="hidden" value='F' />
+              <param name="cox_id" label="Select column containing a unique sample identifier"
+                 help = "Only really needed for output sample specific predicted values downstream."
+                 type="data_column" data_ref="input1" numerical="False" force_select="True"
+                 multiple="False" use_header_names="True" />
+      </when>
+        <when value="binomial">
+            <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False"
+             multiple="True" use_header_names="True"  help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
+             <param name="output_full" type="hidden" value='F' />
+             <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" >
+                <option value="F" selected="true">No predicted value output file</option>
+                <option value="T">Create a predicted value output file</option>
+             </param>
+              <param name="cox_id" label="Select column containing a unique sample identifier"
+                 help = "Only really needed for output sample specific predicted values downstream."
+                 type="data_column" data_ref="input1" numerical="False" force_select="True"
+                 multiple="False" use_header_names="True" />
+             <param name="predict_at" type="hidden" value='' />
+
+        </when>
+        <when value="poisson">
+            <param name="yvar_cols" label="Select columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="True"
+             multiple="True" use_header_names="True"  help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
+             <param name="output_full" type="hidden" value='F' />
+             <param name="output_pred" type="hidden" value='F' />
+             <param name="predict_at" type="hidden" value='' />
+              <param name="cox_id" label="Select column containing a unique sample identifier"
+                 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy"
+                 type="data_column" data_ref="input1" numerical="True" force_select="False"
+                 multiple="False" use_header_names="True" />
+        </when>
+        <when value="cox">
+             <param name="cox_time" label="Select column containing time under observation for Cox regression"
+                 type="data_column" data_ref="input1" numerical="True" force_select="True"
+                 multiple="False" use_header_names="True"  help = "This MUST contain a time period - eg continuous years or days to failure or right censoring"/>
+             <param name="cox_status" label="Select column containing status = 1 for outcome of interest at the end of the time under observation or 0 for right censoring"
+                 type="data_column" data_ref="input1" numerical="True" force_select="True"
+                 multiple="False" use_header_names="True"  help = "This MUST contain 1 for subjects who had an event at that time or 0 for a right censored observation"/>
+              <param name="cox_id" label="Select column containing a unique sample identifier"
+                 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy"
+                 type="data_column" data_ref="input1" numerical="False" force_select="False"
+                 multiple="False" use_header_names="True" />
+             <param name="output_full" type="select" label="Create a tabular output with coefficients for all predictors" >
+                <option value="F" selected="true">No full model output file</option>
+                <option value="T">Create a full model output file</option>
+             </param>
+             <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" >
+                <option value="F" selected="true">No predicted value output file</option>
+                <option value="T">Create a predicted value output file</option>
+             </param>
+             <param name="predict_at"  type="text" value='' label="Provide a comma separated list of times to make a prediction for each subject"
+                 optional="True" help="Default (blank) will return predictions at 0%,25%,50%,75%,100% of the observed times which should be informative" />
+
+        </when>
+    </conditional>
+    <param name="optLambda" type="select" label="Value to use when reporting optimal model and coefficients" help="minLambda will have more predictors - 1SDLambda will be more parsimonious">
+            <option value="lambda.1se" selected="true">Lambda + 1 SE of min MSE or AUC (fewer coefficients - more false negatives)</option>
+            <option value="lambda.min">Lambda at min MSE or max AUC (more coefficients - more false positives)</option>
+    </param>
+    <param name="logxform_cols"  optional="True" label="Select numeric columns to be log transformed before use as predictors or dependent variables" type="data_column"
+        data_ref="input1" numerical="True" multiple="True" use_header_names="True" help = "The wisdom of doing this depends entirely on your predictors - eg can help diminish long-tailed outlier influence"
+        force_select="False"/>
+    <param name="do_standard" type="select" label="Standardise x vars"
+         help="If all measurements on same scale, may not be needed. Coefficients are always returned on the original scale.">
+            <option value="False" selected="true">No standardisation of predictors</option>l
+            <option value="True">Standardise predictors before model</option>
+    </param>
+    <param name="mdsplots" type="select" label="Generate MDS plots of samples in measurement space and measurements in sample space" >
+            <option value="False" selected="true">No MDS plots</option>l
+            <option value="True">Yes create MDS plots</option>
+    </param>
+    <param name="alpha" type="float" value="0.95" min="0.01" max="1.0" label="Alpha - see glmnet docs. 1 for pure lasso. 0.0 for pure ridge regression"
+     help="Default 0.95 allows lasso to cope better with expected predictor collinearity. Use (eg) 0.5 for hybrid regularised regression or (eg) 0.025 for ridge regression"/>
+    <param name="nfold" type="integer" value="10" label="Number of folds for internal cross validation"
+     help="Default of 10 is usually ok"/>
+  </inputs>
+  <outputs>
+    <data format="html" name="html_file" label="${title}.html"/>
+    <data format="tabular" name="model_file" label="${title}_modelres.xls"/>
+    <data format="tabular" name="output_full_file" label="${title}_full_cox_model.xls">
+        <filter>model['output_full'] == 'T'</filter>
+    </data>
+    <data format="tabular" name="output_pred_file" label="${title}_predicted_from_model.xls">
+        <filter>model['output_pred'] == 'T'</filter>
+    </data>
+  </outputs>
+ <tests>
+    <test>
+     <param name='input1' value='cox_test.xls' ftype='tabular' />
+     <param name='treatment_name' value='case' />
+     <param name='title' value='Cox glmnet test' />
+     <param name='nfold' value='10' />
+     <param name='logxform_cols' value='' />
+     <param name='alpha' value='0.95' />
+     <param name='do_standard' value="True" />
+     <param name='cox_time' value='1' />
+     <param name='cox_status' value='2' />
+     <param name='cox_id' value='1' />
+     <param name='predict_at' value='' />
+     <param name='fam' value='cox' />
+     <param name='yvar_cols' value='' />
+     <param name='xvar_cols' value='3,4,5' />
+     <param name='force_xvar_cols' value='3' />
+     <param name='output_full' value='F' />
+     <param name='output_pred' value='F' />
+     <output name='model_file' file='coxlassotest_modelres.xls'>
+          <assert_contents>
+                <has_text text="rhubarb" />
+                <has_text text="TRUE" />
+                <!-- &#009; is XML escape code for tab -->
+                <!-- has_line line="regulator&#009;partial_likelihood&#009;forced_in&#009;glmnet_model&#009;best_lambda" / -->
+                <has_line line="regulator&#009;partial_likelihood&#009;forced_in&#009;glmnet_model&#009;best_lambda&#009;lambdaChoice&#009;alpha" />
+                <has_n_columns n="7" />
+           </assert_contents>
+     </output>
+     <output name='html_file' file='coxlassotest.html'  compare='diff' lines_diff='16' />
+    </test>
+</tests>
+<help>
+
+**Before you start**
+
+Please read the glmnet documentation @ glmnet_
+
+This Galaxy wrapper merely exposes that code and the glmnet_ documentation is essential reading
+before getting useful results here.
+
+**What it does**
+
+From documentation at glmnet_ ::
+
+ Glmnet is a package that fits a generalized linear model via penalized maximum likelihood.
+ The regularization path is computed for the lasso or elasticnet penalty at a grid of values for the regularization parameter lambda.
+ The algorithm is extremely fast, and can exploit sparsity in the input matrix x.
+ It fits linear, logistic and multinomial, poisson, and Cox regression models.
+ A variety of predictions can be made from the fitted models.
+
+Internal cross validation is used to optimise the choice of lambda based on CV AUC for logistic (binomial outcome) models, or CV mse for gaussian.
+
+**Warning about the tyrany of dimensionality**
+
+Yes, this package will select 'optimal' models even when you (optimistically) supply more predictors than you have cases.
+The model returned is unlikely to represent the only informative regularisation path through your data - if you run repeatedly with
+exactly the same settings, you will probably see many different models being selected.
+This is not a software bug - the real problem is that you just don't have enough information in your data.
+
+Sufficiently big jobs will take a while (eg each lasso regression with 20k features on 1k samples takes about 2-3 minutes on our aged cluster)
+
+**Input**
+
+Assuming you have more measurements than samples, you supply data as a tabular text file where each row is a sample and columns
+are variables. You specify which columns are dependent (predictors) and which are observations for each sample. Each of multiple
+dependent variable columns will be run and reported independently. Predictors can be forced in to the model.
+
+**Output**
+
+For each selected dependent regression variable, a brief report of the model coefficients predicted at the
+'optimal' nfold CV value of lambda.
+
+**Predicted event probabilities for Cox and Logistic models**
+
+If you want to compare (eg) two competing clinical predictions, there's a companion generic NRI tool
+for predicted event probabilities. Estimates dozens of measures of improvement in prediction. Currently only works for identical id subjects
+but can probably be extended to independent sample predictions.
+
+Given a model, we can generate a predicted p (for status 1) in binomial or cox frameworks so models can be evaluated in terms of NRI.
+Of course, estimates are likely substantially inflated over 'real world' performance by being estimated from the same sample - but you probably
+already knew that since you were smart enough to reach this far down into the on screen help. The author salutes you, intrepid reader!
+
+It may seem an odd thing to do, but we can predict p for an event for each subject from our original data, given a parsimonious model. Doing
+this for two separate models (eg, forcing in an additional known explanatory measurement to the new model) allows comparison of the two models
+predicted status for each subject, or the same model in independent populations to see how badly it does
+
+**Attributions**
+
+glmnet_ is the R package exposed by this Galaxy tool.
+
+Galaxy_ (that's what you are using right now!) for gluing everything together
+
+Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
+licensed to you under the LGPL_ like other rgenetics artefacts
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+.. _glmnet: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
+.. _Galaxy: http://getgalaxy.org
+</help>
+
+<citations>
+    <citation type="bibtex">
+@Article{Friedman2010, title = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
+    author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani},
+    journal = {Journal of Statistical Software},
+    year = {2010},
+    volume = {33},
+    number = {1},
+    pages = {1--22},
+    url = {http://www.jstatsoft.org/v33/i01/}
+  }
+    </citation>
+    <citation type="doi">
+10.1093/bioinformatics/bts573
+    </citation>
+</citations>
+</tool>
author	fubar
date	Sat, 31 Oct 2015 01:07:28 -0400
parents
children	31be675baa50