view rglasso_cox.xml @ 11:d351a1ec1e18 draft

GRRR!!! as at nov 2015, glmnet fails to load methods. see https://github.com/dhimmel/elevcan/issues/1
author fubar
date Sun, 01 Nov 2015 18:26:40 -0500
parents 31be675baa50
children
line wrap: on
line source

<tool id="rglasso_cox" name="Lasso" version="0.03">
  <description>and cox regression using elastic net</description>
  <requirements>
      <requirement type="package" version="3.2.2">R</requirement>
      <requirement type="package" version="1.3.18">graphicsmagick</requirement>
      <requirement type="package" version="9.10">ghostscript</requirement>
      <requirement type="package" version="3.2">glmnet_lars_3_2</requirement>
  </requirements>
  <command interpreter="python">
     rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rglasso"
    --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
  </command>
<configfiles>
<configfile name="runme">
<![CDATA[
library('methods')
#### as at nov 2015, glmnet fails to load methods. see https://github.com/dhimmel/elevcan/issues/1
library('glmnet')
library('lars')
library('survival')
library('pec')


message=function(x) {print.noquote(paste(x,sep=''))}


ross.cv.glmnet = function (x, y, weights, offset = NULL, lambda = NULL, type.measure = c("mse",
    "deviance", "class", "auc", "mae"), nfolds = 10, foldid,
    grouped = TRUE, keep = FALSE, parallel = FALSE, ...)
{
    if (missing(type.measure))
        type.measure = "default"
    else type.measure = match.arg(type.measure)
    if (!is.null(lambda) && length(lambda) < 2)
        stop("Need more than one value of lambda for cv.glmnet")
    N = nrow(x)
    if (missing(weights))
        weights = rep(1, N)
    else weights = as.double(weights)
    y = drop(y)
    glmnet.call = match.call(expand.dots = TRUE)
    sel = match(c("type.measure", "nfolds", "foldid", "grouped",
        "keep"), names(glmnet.call), F)
    if (any(sel))
        glmnet.call = glmnet.call[-sel]
    glmnet.call[[1]] = as.name("glmnet")
    glmnet.object = glmnet(x, y, weights = weights, offset = offset,
        lambda = lambda, ...)
    glmnet.object\$call = glmnet.call
    is.offset = glmnet.object\$offset
    lambda = glmnet.object\$lambda
    if (inherits(glmnet.object, "multnet")) {
        nz = predict(glmnet.object, type = "nonzero")
        nz = sapply(nz, function(x) sapply(x, length))
        nz = ceiling(apply(nz, 1, median))
    }
    else nz = sapply(predict(glmnet.object, type = "nonzero"),
        length)
    if (missing(foldid))
        foldid = sample(rep(seq(nfolds), length = N))
    else nfolds = max(foldid)
    if (nfolds < 3)
        stop("nfolds must be bigger than 3; nfolds=10 recommended")
    outlist = as.list(seq(nfolds))
    if (parallel && require(foreach)) {
        outlist = foreach(i = seq(nfolds), .packages = c("glmnet")) %dopar%
            {
                sel = foldid == i
                if (is.matrix(y))
                  y_sub = y[!sel, ]
                else y_sub = y[!sel]
                if (is.offset)
                  offset_sub = as.matrix(offset)[!sel, ]
                else offset_sub = NULL
                glmnet(x[!sel, , drop = FALSE], y_sub, lambda = lambda,
                  offset = offset_sub, weights = weights[!sel],
                  ...)
            }
    }
    else {
        for (i in seq(nfolds)) {
            sel = foldid == i
            if (is.matrix(y))
                y_sub = y[!sel, ]
            else y_sub = y[!sel]
            if (is.offset)
                offset_sub = as.matrix(offset)[!sel, ]
            else offset_sub = NULL
            outlist[[i]] = glmnet(x[!sel, , drop = FALSE],
                y_sub, lambda = lambda, offset = offset_sub,
                weights = weights[!sel], ...)
        }
    }
    fun = paste("cv", class(glmnet.object)[[1]], sep = ".")
    cvstuff = do.call(fun, list(outlist, lambda, x, y, weights,
        offset, foldid, type.measure, grouped, keep))
    cvm = cvstuff\$cvm
    cvsd = cvstuff\$cvsd
    cvname = cvstuff\$name

    out = list(lambda = lambda, cvm = cvm, cvsd = cvsd, cvup = cvm +
        cvsd, cvlo = cvm - cvsd, nzero = nz, name = cvname, glmnet.fit = glmnet.object)
    if (keep)
        out = c(out, list(fit.preval = cvstuff\$fit.preval, foldid = foldid))

    lamin = if (type.measure == "auc")
        getmin(lambda, -cvm, cvsd)
    else getmin(lambda, cvm, cvsd)
    out = c(out, as.list(lamin))
    hitsse = rep(0,ncol(x))
    hitsmin = rep(0,ncol(x))
    names(hitsse) = colnames(x)
    names(hitsmin) = colnames(x)
    olmin = lamin\$lambda.min
    ol1sd = lamin\$lambda.1se
    lambs = c(olmin,ol1sd)
    names(lambs) = c('olmin','ol1sd')
    for (cvfit in outlist) {
        colmin = which(cvfit\$lambda == olmin)
        col1se = which(cvfit\$lambda == ol1sd)
        nzmin = which(cvfit\$beta[,colmin] != 0)
        nz1se = which(cvfit\$beta[,col1se] != 0)
        hitsse[nz1se] = hitsse[nz1se] + 1
        hitsmin[nzmin] = hitsmin[nzmin] + 1
    }
    obj = c(out,list(cvhits.1se=hitsse,cvhits.min=hitsmin))
    class(obj) = "cv.glmnet"
    obj
}

mdsPlot = function(dm,myTitle,groups=NA,outpdfname,transpose=T)
{

  samples = colnames(dm)
  mt = myTitle
  pcols=c('maroon')
  if (! is.na(groups))
  {
  gu = unique(groups)
  colours = rainbow(length(gu),start=0.1,end=0.9)
  pcols = colours[match(groups,gu)]
  }
  mydata = dm
  if (transpose==T)
  {
  mydata = t(dm)
  }
  npred = ncol(mydata)
  d = dist(mydata)
  fit = cmdscale(d,eig=TRUE, k=min(10,npred-2))
  xmds = fit\$points[,1]
  ymds = fit\$points[,2]
  pdf(outpdfname)
  plot(xmds, ymds, xlab="Dimension 1", ylab="Dimension 2",
       main=paste(mt,"MDS Plot"),type="n", col=pcols, cex=0.35)
  text(xmds, ymds, labels = row.names(mydata), cex=0.35, col=pcols)
  grid(col="lightgray",lty="dotted")
  dev.off()
}


getpredp_logistic = function(x,yvec,yvarname,id)
{
  yvals = unique(yvec)
  if (length(yvals) != 2) {
       message(c('ERROR: y does not have 2 values =',paste(yvals,collapse=',')))
       return(NA)
       }
  cols = colnames(x)
  if (length(cols) == 0) {
       message('ERROR: No columns in input x? Cannot predict!')
       return(NA)
       }
  cn = paste(cols, collapse = ' + ')

  formstring=paste("y ~",cn)
  form = as.formula(formstring)
  ok = complete.cases(x)

  if (sum(ok) < length(ok)) {
    x = x[ok,]
    yvec = yvec[ok]
    id = id[ok]
    }
  nx = data.frame(id=id,x,y=yvec)
  print('nx,yvec:')
  print(head(nx,n=3))
  print(yvec)
  mdl = glm(form, data=nx, family="binomial", na.action=na.omit)
  message(c('Model format =',formstring))
  message(paste('Predictive model details used to generate logistic outcome probabilities for',yvarname,':'))
  print(summary(md1))
  print(anova(md1))
  predp = predict(md1,nx,type="response")
  p1 = data.frame(id=id,pred_response=predp,obs_response=yvec)
  return(p1)
}

getpredp_cox = function(x,time,status,id,predict_at)
{
  cols = colnames(x)
  if (length(cols) == 0) {
       message('ERROR: No columns in input x? Cannot predict!')
       return(NA)
       }
  cn = paste(colnames(x), collapse = ' + ')

  formstring=paste("Surv(time, status) ~",cn)

  form = as.formula(formstring)

  ok = complete.cases(x)

  if (sum(ok) < length(ok)) {
    x = x[ok,]
    time = time[ok]
    status = status[ok]
    id = id[ok]
    }
  nx = data.frame(x,time=time,status=status)
  m1 = coxph(form, data=nx,singular.ok=TRUE)
  print.noquote('Predictive model details used to generate survival probabilities:')
  print.noquote(m1)
  predpq = predictSurvProb(object=m1, newdata=nx, times=predict_at)
  predpq = 1-predpq
  colnames(predpq) = paste('p_surv_to',predict_at,sep='_')
  p1 = data.frame(id=id,predpq,time=time,status=status)
  return(p1)
}


dolasso_cox = function(x,y,debugOn=F,maxsteps=10000,nfold=10,xcolnames,ycolnames,optLambda='lambda.1se',out_full=F,out_full_file=NA,
                             out_pred=F,out_pred_file=NA,cox_id=NA, descr='Cox test',do_standard=F,alpha=0.9,penalty,predict_at,mdsplots=F)
{
  logf = file("cox_rglasso.log", open = "a")
  sink(logf,type = c("output", "message"))
  res = NULL
  if (mdsplots==T) {
      outpdfname = 'cox_x_in_sample_space_MDS.pdf'
      p = try({ mdsPlot(x,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T)
      if (class(p) == "try-error")
      {
        print.noquote(paste('Unable to produce predictors in sample space mds plot',p))
      }
      outpdfname = 'cox_samples_in_x_space_MDS.pdf'
      p = try({mdsPlot(x,'samples in measurement space',groups=y,outpdfname=outpdfname,transpose=F) },T)
      if (class(p) == "try-error")
      {
        print.noquote(paste('Unable to produce samples in measurement space mds plots',p))
      }
  }
  if (is.na(predict_at)) { predict_at = quantile(y) }
  message(paste('@@@ Cox model will be predicted at times =',paste(predict_at,collapse=',')))
  do_standard = do_standard
  standardize = do_standard
  normalize = do_standard
  p = try({larsres = glmnet(x,y,family='cox',standardize=standardize,alpha=alpha,penalty.factor=penalty )},T)
  if (class(p) == "try-error")
  {
    print.noquote('Unable to run cox glmnet on your data')
    print.noquote(p)
    sink()
    return(NA)
  }
  if (out_full == T)
  {
  b = as.matrix(larsres\$beta)
  nb = length(colnames(b))
  bcoef = b[,nb]
  lastl = larsres\$lambda[length(larsres\$lambda)]
  allres = data.frame(x=rownames(b),beta=bcoef,lambda=lastl)
  write.table(format(allres,digits=5),out_full_file,quote=FALSE, sep="\t",row.names=F)
  }

  outpdf = paste('cox',descr,'glmnetdev.pdf',sep='_')
  try(
      {
      pdf(outpdf)
      plot(larsres,main='cox glmnet',label=T)
      grid()
      dev.off()
      },T)

  larscv = NA

  p = try({larscv=ross.cv.glmnet(x,y,family=fam,type.measure='deviance',penalty=penalty)},T)
  if (class(p) == "try-error") {
     print.noquote(paste('Unable to cross validate your data',p))
     sink()
     return(NA)
     }
  lse = larscv\$cvhits.1se
  lmin = larscv\$cvhits.min
  tot = lse + lmin
  allhits = data.frame(hits_lambda_1se = lse,hits_lambda_min = lmin)
  nzhits = allhits[which(tot != 0),]
  message('Times each predictor was selected in CV models (excluding zero count predictors)')
  print.noquote(nzhits)
  out_nz_file = 'cox_cross_validation_model_counts.xls'
  write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F)

  outpdf = paste('cox',descr,'glmnet_cvdeviance.pdf',sep='_')

  p = try(
     {
     pdf(outpdf)
     plot(larscv,main='Deviance',label=T)
     grid()
     dev.off()
     },T)
  if (optLambda == 'lambda.min') {
      best_lambda = larscv\$lambda.min
      bestcoef = as.matrix(coef(larscv, s = "lambda.min"))
  } else {
      best_lambda = larscv\$lambda.1se
      bestcoef = as.matrix(coef(larscv, s = "lambda.1se"))
  }
  inmodel = which(bestcoef != 0)
  coefs = bestcoef[inmodel]
  preds = rownames(bestcoef)[inmodel]

  names(coefs) = preds
  pen = as.logical( ! penalty[inmodel])
  if (out_pred==T)
  {
      if (length(inmodel) > 0 ) {
          predcols = inmodel
          xmat = as.matrix(x[,predcols])
          colnames(xmat) = preds
          bestpred = getpredp_cox(x=xmat,time=y[,'time'],status=y[,'status'],id=cox_id, predict_at=predict_at)
          pred = data.frame(responsep=bestpred, best_lambda=best_lambda,lamchoice=optLambda,alpha=alpha)
          write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F)
        } else { print.noquote('WARNING: No coefficients in selected model to predict with - no predictions made') }
    }
  if (debugOn) {
      print.noquote(paste('best_lambda=',best_lambda,'saving cox respreds=',paste(names(coefs),collapse=','),'as predictors of survival. Coefs=',paste(coefs,collapse=',')))
      }
  p = try({res = data.frame(regulator=names(coefs),partial_likelihood=coefs,forced_in=pen,glmnet_model='cox',best_lambda=best_lambda,
     lambdaChoice=optLambda,alpha=alpha)},T)
  if (class(p) == "try-error") {
    message(paste('@@@ unable to return a dataframe',p))
    sink()
    return(NA)
    }
  print.noquote('@@@ Results preview:')
  print.noquote(res,digits=5)
  sink()
  return(res)

}


do_lasso = function(x=NA,y=NA,do_standard=T,debugOn=T,defaultFam="gaussian",optLambda='minLambda',descr='description', indx=1,target='target',sane=F,
                    alpha=0.9,nfold=10,penalty=c(),out_pred=F,out_pred_file='outpred',yvarname='yvar',id=c(),mdsplots=F)
{
  logf = file(paste(target,"rglasso.log",sep='_'), open = "a")
  sink(logf,type = c("output", "message"))
  res = NA
  phe_is_bin = (length(unique(y)) == 2)
  forcedin = paste(colnames(x)[which(penalty == 0)],collapse=',')
  fam = "gaussian"
  if (defaultFam %in% c("poisson","binomial","gaussian","multinomial")) fam=defaultFam
  if (phe_is_bin == T) {
    fam = "binomial"
  }
  print.noquote(paste('target=',target,'is binary=',phe_is_bin,'dim(x)=',paste(dim(x),collapse=','),'length(y)=',length(y),'force=',forcedin,'fam=',fam))
  standardize = do_standard
  p = try({larsres = glmnet(x,y,family=fam,standardize=standardize,maxit=10000,alpha=alpha,penalty.factor=penalty) },T)
  if (class(p) == "try-error")
  {
    print(paste('ERROR: unable to run glmnet for target',target,'error=',p))
    sink()
    return(NA)
  }

  mt = paste('Glmnet fraction deviance for',target)
  outpdf = paste(target,'glmnetPath.pdf',sep='_')
  pdf(outpdf)
  plot(larsres,main=mt,label=T)
  grid()
  dev.off()

  outpdf = paste(target,'glmnetDeviance.pdf',sep='_')

  mt2 = paste('Glmnet lambda for',target)

  pdf(outpdf)
  plot(larsres,xvar="lambda",main=mt2,label=T)
  grid()
  dev.off()

  larscv = NA
  if (fam=="binomial") {
    tmain = paste(target,'AUC')
    outpdf = paste(target,'glmnetCV_AUC.pdf',sep='_')
    p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='auc')},T)
  } else {
    tmain = paste(target,'CV MSE')
    outpdf = paste(target,'glmnetCV_MSE.pdf',sep='_')
    p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='mse')},T)
  }
  if (class(p) == "try-error")
  {
    print(paste('ERROR: unable to run cross validation for target',target,'error=',p))
    sink()
    return(NA)
  }

  pdf(outpdf)
  plot(larscv,main=tmain)
  grid()
  dev.off()

  lse = larscv\$cvhits.1se
  lmin = larscv\$cvhits.min
  tot = lse + lmin
  allhits = data.frame(pred=colnames(x),hits_lambda_1se = lse,hits_lambda_min = lmin)
  nzhits = allhits[which(tot != 0),]
  message('Total hit count for each predictor over all CV models (excluding zero count predictors)')
  print.noquote(nzhits)
  out_nz_file = paste(target,'cross_validation_model_counts.xls',sep='_')
  write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F)

  ipenalty = c(0,penalty)
  if (optLambda == 'lambda.min') {
    best_lambda = larscv\$lambda.min
    bestpred = as.matrix(coef(larscv, s = "lambda.min"))
  } else {
    best_lambda = larscv\$lambda.1se
    bestpred = as.matrix(coef(larscv, s = "lambda.1se"))
  }
  inmodel = which(bestpred != 0)
  coefs = bestpred[inmodel,1]
  preds = rownames(bestpred)[inmodel]
  iforced = ipenalty[inmodel]
  forced = ! as.logical(iforced)
  names(coefs) = preds
  ncoef = length(coefs) - 1
  if (out_pred==T && fam=="binomial")
  {
    print.noquote(paste('Predicting',target,'probabilities from binomial glmnet at alpha',alpha,'and lambda',best_lambda))
    bestpred = predict.glmnet(larsres,s=best_lambda,newx=x,type="response")
    bestpred = exp(bestpred)/(1+exp(bestpred))
    pred = data.frame(id=id,y=y,predp=as.vector(bestpred), best_lambda=best_lambda)
    write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F)
  }
  if (debugOn) {cat(indx,'best_lambda=',best_lambda,'saving',fam,'respreds=',names(coefs),'as predictors of',target,'coefs=',coefs,'\n')}
  res = try(data.frame(i=indx,pred=target,regulator=names(coefs),coef=coefs,forced_in=forced,glmnet_model=fam,ncoef=ncoef,
     best_lambda=best_lambda,lambdaChoice=optLambda,alpha=alpha),T)
  if (class(res) == "try-error") {
    sink()
    return(NA) }
  print.noquote(res)
  sink()
  return(res)
}


dolasso_generic = function(predvars=NA,depvars=NA,debugOn=T,maxsteps=100, alpha=0.9,nfold=10,xcolnames=c(),ycolnames=c(),optLambda='minLambda', out_pred_file=NA,
                           descr="describe me",do_standard=F,defaultFam="gaussian",penalty=c(),out_pred=F,cox_id=c(),mdsplots=F,xfilt=0.95)
{
  logf = file("rglasso.log", open = "a")
  sink(logf,type = c("output", "message"))
  xdat = predvars
  xm = data.matrix(xdat)
  res = NULL
  id = cox_id
  depnames = ycolnames
  ndep = length(depnames)
  if (mdsplots==T) {
    outpdfname = 'rglasso_x_in_sample_space_MDS.pdf'
    p = try({ mdsPlot(xm,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T)
    if (class(p) == "try-error")
    {
      print.noquote(paste('Unable to produce predictors in sample space mds plot. Error:',p))
    }
    outpdfname = 'rglasso_samples_in_x_space_MDS.pdf'
    p = try({mdsPlot(xm,'samples in measurement space',groups=NA,outpdfname=outpdfname,transpose=F) },T)
    if (class(p) == "try-error")
    {
      print.noquote(paste('Unable to produce samples in measurement space mds plot. Error:',p))
    }
  }
  ndat = nrow(xm)
  cfracs = colSums(! is.na(xm))/ndat
  keepme = (cfracs >= xfilt)
  print.noquote(paste('Removing', sum(! keepme), 'xvars with more than',xfilt,'fraction missing'))
  vars = apply(xm,2,var,na.rm=T)
  xm = xm[,keepme]
  for (i in c(1:max(1,ndep)))   {
    target = depnames[i]
    if (length(target) < 1) { target='y' }
    if (i %% 100 == 0) { cat(i,target,'\n') }
    if (ndep <= 1) {
      y=depvars
    } else {
      y = depvars[,i]
    }
    if (fam == "binomial") {y = as.factor(y)}
    x = xm
    id = cox_id
    if (fam != "cox") {
         ok = complete.cases(x,y)
         if (sum(! ok) > 0) {
            message(paste('@@@ Removing',sum(! ok),'cases with missing y of',length(y),'@@@'))
            y = y[(ok)]
            x = x[(ok),]
            id = id[(ok)]
           }
    }
    ok = complete.cases(y)
    if (sum(ok) == 0 ) {
      print(paste("No complete cases found for",target,"in input x dim =",paste(dim(xm),collapse=','),"length y=",length(y)))
    } else {
      if (i == 1) { outpred = out_pred_file
      } else {
        outpred = paste(target,'predicted_output.xls')
      }
      regres = do_lasso(x=x,y=y,do_standard=do_standard,debugOn=debugOn,defaultFam=defaultFam,optLambda=optLambda,out_pred_file=outpred,
                        descr=descr,indx=i,target=target,alpha=alpha,nfold=nfold,penalty=penalty,out_pred=out_pred,yvarname=target,id=id,mdsplots=mdsplots)
      if (! is.na(regres)) { res = rbind(res,regres) }
    }
  }
  print.noquote('@@@ Results preview:')
  print.noquote(res,digits=5)
  sink()
  return(res)
}


corPlot=function(xdat=c(),main='main title',is_raw=T)
{
  library(pheatmap)
  library(gplots)
  if (is_raw) {
    cxdat = cor(xdat,method="spearman",use="pairwise.complete.obs")
  } else {
    cxdat=xdat
  }
  xro = nrow(cxdat)
  if (xro > 1000) stop("Too many rows for heatmap, who can read?!")
  fontsize_col = 5.0
  pheatmap(cxdat, main=main, show_colnames = F, width=30, height=30,
           fontsize_row=fontsize_col, border_color=NA)
}


runTest = function(n=10)
{
  set.seed (NULL)
  Y = data.frame(y1=runif (n),y2=runif(n))
  Xv <- runif(n*n)
  X <- matrix(Xv, nrow = n, ncol = n)

  mydf <- data.frame(Y, X)

  regres_out = dolasso_generic(predvars=X,depvars=Y,debugOn=T,p.cutoff = 0.05,maxsteps=10000,nfold=10,
                               descr='randomdata',do_standard=do_standard,defaultFam="gaussian",alpha=0.05)
  return(regres_out)
}
]]>
options(width=512)
options(digits=5)
alpha = $alpha
nfold = $nfold
optLambda = "$optLambda"
Out_Dir = "$html_file.files_path"
Input =  "$input1"
indat = read.table(Input,head=T,sep='\t')
datcols = colnames(indat)
myTitle = "$title"
outtab = "$model_file"
do_standard = as.logical("$do_standard")
mdsplots = as.logical("$mdsplots")
fam = "$model.fam"
xvar_cols_in = "$xvar_cols"
force_xvar_cols_in = "$force_xvar_cols"
xvar_cols = as.numeric(strsplit(xvar_cols_in,",")[[1]])
force_xvar_cols = c()
penalties = rep(1,length(datcols))
forced_in = NA

logxform = "$logxform_cols"
if (logxform != "None") {
    logxform_cols = as.numeric(strsplit(logxform,",")[[1]])
    if (length(logxform_cols) > 0) {
         small = 1e-10
         sset = indat[,logxform_cols]
         zeros = which(sset==0,arr.ind=T)
         nz = nrow(zeros)
         if (nz &gt; 0) {
             message(paste('Log transforming encountered',nz,'zeros - added 1e-10'))
             sset[zeros] = sset[zeros] + small
             lset = log(sset)
             indat[,logxform_cols] = lset
             }
         }
}
if (force_xvar_cols_in != "None")
{
  force_xvar_cols = as.numeric(strsplit(force_xvar_cols_in,",")[[1]])
  allx = c(xvar_cols,force_xvar_cols)
  xvar_cols = unique(allx)
  xvar_cols = xvar_cols[order(xvar_cols)]
  penalties[force_xvar_cols] = 0
}
penalty = penalties[xvar_cols]
forcedin = paste(datcols[which(penalties == 0)],collapse=',')
cox_id_col = NA
cox_id = NA

message(paste('@@@ Using alpha =',alpha,'for all models'))
x = indat[,xvar_cols]
nx = nrow(x)
cx = ncol(x)
message(paste('@@@@ Input has',nx,'samples and',cx,'predictors'))
if (cx > nx) {
message('@@@ WARNING: Models will have more variables than cases so glmnet will likely return one of many possible solutions! Please DO NOT expect reliable results - glmnet is clever but not magical @@@')
}

xcolnames = datcols[xvar_cols]

if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
out_full = F
out_full_file = NA
out_pred_file = ""
out_pred = as.logical("$model.output_pred")

#if $model.fam == "binomial" or $model.fam == "cox":
   cox_id_col = $model.cox_id
   cox_id = indat[,cox_id_col]
   if (out_pred == T) {
     out_pred_file="$output_pred_file"
     rownames(x) = cox_id
     }
#end if
#if $model.fam == "cox":
  cox_time = $model.cox_time
  cox_status = $model.cox_status
  out_full = as.logical("$model.output_full")
  if (out_full == T) { out_full_file="$output_full_file" }
  yvar_cols = c(cox_time,cox_status)
  ycolnames = c('time','status')
  istat = as.double(indat[,cox_status])
  itime = as.double(indat[,cox_time])
  predict_at = quantile(itime)
  if ("$model.predict_at" &gt; "")
  {
      pa = "$model.predict_at"
      predict_at = as.numeric(strsplit(pa,",")[[1]])
  }
  y = data.frame(time = itime, status = istat)
  ustat = unique(istat)
  if ((length(ustat) != 2) | (! 1 %in% ustat ) | (! 0 %in% ustat))
  {
   print.noquote(paste('INPUT ERROR: status must have 0 (censored) or 1 (event) but found',paste(ustat,collapse=',') ))
   quit(save='no',status=1)
  }
  y = as.matrix(y)
  x = as.matrix(x)
  print.noquote(paste('@@@ Cox model will predict yvar=',datcols[cox_status],'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames),
    'forced in=',forcedin))
  regres_out = dolasso_cox(x=x,y=y,debugOn=F,maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_full=out_full,out_full_file=out_full_file,
       out_pred=out_pred,out_pred_file=out_pred_file,cox_id=cox_id,descr=myTitle,do_standard=do_standard,alpha=alpha,penalty=penalty,predict_at=predict_at,mdsplots=mdsplots)
#else:
    yvar_cols = "$model.yvar_cols"
    yvar_cols = as.numeric(strsplit(yvar_cols,",")[[1]])
    ycolnames = datcols[yvar_cols]
    print.noquote(paste('@@@',fam,'model will predict yvar=',paste(ycolnames,collapse=','),'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames),
    'forced in=',forcedin))
    y = data.matrix(indat[,yvar_cols])
    print.noquote(paste('Model will use',fam,'link function to predict yvar=',paste(ycolnames,collapse=','),'n preds=',length(xcolnames),'forced in=',forcedin))
    regres_out = dolasso_generic(predvars=x,depvars=y,debugOn=F, maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_pred_file=out_pred_file,
                             descr=myTitle,do_standard=do_standard,defaultFam=fam,alpha=alpha,penalty=penalty,out_pred=out_pred,cox_id=cox_id,mdsplots=mdsplots)
#end if

write.table(format(regres_out,digits=5),outtab,quote=FALSE, sep="\t",row.names=F)
print.noquote('@@@ SessionInfo for this R session:')
sessionInfo()
warnings()

</configfile>
</configfiles>
  <inputs>
     <param name="title" type="text" value="lasso test" label="Title for job outputs" help="Typing a short, meaningful text here will help remind you (and explain to others) what the outputs represent">
      <sanitizer invalid_char="">
        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
      </sanitizer>
    </param>
    <param name="input1"  type="data" format="tabular" label="Select an input tabular text file from your history. Rows represent samples; Columns are measured phenotypes"
    multiple='False' optional="False" help="Tabular text data with samples as rows, phenotypes as columns with a header row of column identifiers" />
    <param name="xvar_cols" label="Select columns containing numeric variables to use as predictor (x) variables" type="data_column" data_ref="input1" numerical="False"
         multiple="True" use_header_names="True" force_select="True" />
    <param name="force_xvar_cols" label="Select numeric columns containing variables ALWAYS included as predictors in cross validation" type="data_column" data_ref="input1" numerical="False"
         multiple="True" use_header_names="True" force_select="False"/>
    <conditional name="model">
        <param name="fam" type="select" label="GLM Link function for models"
             help="Binary dependant variables will automatically be set to Binomial no matter what this is set to">
                <option value="gaussian" selected="true">Gaussian - continuous dependent (y)</option>
                <option value="binomial">Binomial dependent variables</option>
                <option value="poisson">Poisson (eg counts)</option>
                <option value="cox">Cox models - require special setup for y variables - see below</option>
        </param>
        <when value="gaussian">
            <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False"
             multiple="True" use_header_names="True"  help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
            <param name="output_full" type="hidden" value='F' />
            <param name="output_pred" type="hidden" value='F' />
              <param name="cox_id" label="Select column containing a unique sample identifier"
                 help = "Only really needed for output sample specific predicted values downstream."
                 type="data_column" data_ref="input1" numerical="False" force_select="True"
                 multiple="False" use_header_names="True" />
      </when>
        <when value="binomial">
            <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False"
             multiple="True" use_header_names="True"  help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
             <param name="output_full" type="hidden" value='F' />
             <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" >
                <option value="F" selected="true">No predicted value output file</option>
                <option value="T">Create a predicted value output file</option>
             </param>
              <param name="cox_id" label="Select column containing a unique sample identifier"
                 help = "Only really needed for output sample specific predicted values downstream."
                 type="data_column" data_ref="input1" numerical="False" force_select="True"
                 multiple="False" use_header_names="True" />
             <param name="predict_at" type="hidden" value='' />

        </when>
        <when value="poisson">
            <param name="yvar_cols" label="Select columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="True"
             multiple="True" use_header_names="True"  help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
             <param name="output_full" type="hidden" value='F' />
             <param name="output_pred" type="hidden" value='F' />
             <param name="predict_at" type="hidden" value='' />
              <param name="cox_id" label="Select column containing a unique sample identifier"
                 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy"
                 type="data_column" data_ref="input1" numerical="True" force_select="False"
                 multiple="False" use_header_names="True" />
        </when>
        <when value="cox">
             <param name="cox_time" label="Select column containing time under observation for Cox regression"
                 type="data_column" data_ref="input1" numerical="True" force_select="True"
                 multiple="False" use_header_names="True"  help = "This MUST contain a time period - eg continuous years or days to failure or right censoring"/>
             <param name="cox_status" label="Select column containing status = 1 for outcome of interest at the end of the time under observation or 0 for right censoring"
                 type="data_column" data_ref="input1" numerical="True" force_select="True"
                 multiple="False" use_header_names="True"  help = "This MUST contain 1 for subjects who had an event at that time or 0 for a right censored observation"/>
              <param name="cox_id" label="Select column containing a unique sample identifier"
                 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy"
                 type="data_column" data_ref="input1" numerical="False" force_select="False"
                 multiple="False" use_header_names="True" />
             <param name="output_full" type="select" label="Create a tabular output with coefficients for all predictors" >
                <option value="F" selected="true">No full model output file</option>
                <option value="T">Create a full model output file</option>
             </param>
             <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" >
                <option value="F" selected="true">No predicted value output file</option>
                <option value="T">Create a predicted value output file</option>
             </param>
             <param name="predict_at"  type="text" value='' label="Provide a comma separated list of times to make a prediction for each subject"
                 optional="True" help="Default (blank) will return predictions at 0%,25%,50%,75%,100% of the observed times which should be informative" />

        </when>
    </conditional>
    <param name="optLambda" type="select" label="Value to use when reporting optimal model and coefficients" help="minLambda will have more predictors - 1SDLambda will be more parsimonious">
            <option value="lambda.1se" selected="true">Lambda + 1 SE of min MSE or AUC (fewer coefficients - more false negatives)</option>
            <option value="lambda.min">Lambda at min MSE or max AUC (more coefficients - more false positives)</option>
    </param>
    <param name="logxform_cols"  optional="True" label="Select numeric columns to be log transformed before use as predictors or dependent variables" type="data_column"
        data_ref="input1" numerical="True" multiple="True" use_header_names="True" help = "The wisdom of doing this depends entirely on your predictors - eg can help diminish long-tailed outlier influence"
        force_select="False"/>
    <param name="do_standard" type="select" label="Standardise x vars"
         help="If all measurements on same scale, may not be needed. Coefficients are always returned on the original scale.">
            <option value="False" selected="true">No standardisation of predictors</option>l
            <option value="True">Standardise predictors before model</option>
    </param>
    <param name="mdsplots" type="select" label="Generate MDS plots of samples in measurement space and measurements in sample space" >
            <option value="False" selected="true">No MDS plots</option>l
            <option value="True">Yes create MDS plots</option>
    </param>
    <param name="alpha" type="float" value="0.95" min="0.01" max="1.0" label="Alpha - see glmnet docs. 1 for pure lasso. 0.0 for pure ridge regression"
     help="Default 0.95 allows lasso to cope better with expected predictor collinearity. Use (eg) 0.5 for hybrid regularised regression or (eg) 0.025 for ridge regression"/>
    <param name="nfold" type="integer" value="10" label="Number of folds for internal cross validation"
     help="Default of 10 is usually ok"/>
  </inputs>
  <outputs>
    <data format="html" name="html_file" label="${title}.html"/>
    <data format="tabular" name="model_file" label="${title}_modelres.xls"/>
    <data format="tabular" name="output_full_file" label="${title}_full_cox_model.xls">
        <filter>model['output_full'] == 'T'</filter>
    </data>
    <data format="tabular" name="output_pred_file" label="${title}_predicted_from_model.xls">
        <filter>model['output_pred'] == 'T'</filter>
    </data>
  </outputs>
 <tests>
    <test>
     <param name='input1' value='cox_test.xls' ftype='tabular' />
     <param name='treatment_name' value='case' />
     <param name='title' value='Cox glmnet test' />
     <param name='nfold' value='10' />
     <param name='logxform_cols' value='' />
     <param name='alpha' value='0.95' />
     <param name='do_standard' value="True" />
     <param name='cox_time' value='1' />
     <param name='cox_status' value='2' />
     <param name='cox_id' value='1' />
     <param name='predict_at' value='' />
     <param name='fam' value='cox' />
     <param name='yvar_cols' value='' />
     <param name='xvar_cols' value='3,4,5' />
     <param name='force_xvar_cols' value='3' />
     <param name='output_full' value='F' />
     <param name='output_pred' value='F' />
     <output name='model_file' file='coxlassotest_modelres.xls'>
          <assert_contents>
                <has_text text="rhubarb" />
                <has_text text="TRUE" />
                <!-- &#009; is XML escape code for tab -->
                <!-- has_line line="regulator&#009;partial_likelihood&#009;forced_in&#009;glmnet_model&#009;best_lambda" / -->
                <has_line line="regulator&#009;partial_likelihood&#009;forced_in&#009;glmnet_model&#009;best_lambda&#009;lambdaChoice&#009;alpha" />
                <has_n_columns n="7" />
           </assert_contents>
     </output>
     <output name='html_file' file='coxlassotest.html'  compare='diff' lines_diff='16' />
    </test>
</tests>
<help>

**Before you start**

Please read the glmnet documentation @ glmnet_

This Galaxy wrapper merely exposes that code and the glmnet_ documentation is essential reading
before getting useful results here.

**What it does**

From documentation at glmnet_ ::

 Glmnet is a package that fits a generalized linear model via penalized maximum likelihood.
 The regularization path is computed for the lasso or elasticnet penalty at a grid of values for the regularization parameter lambda.
 The algorithm is extremely fast, and can exploit sparsity in the input matrix x.
 It fits linear, logistic and multinomial, poisson, and Cox regression models.
 A variety of predictions can be made from the fitted models.

Internal cross validation is used to optimise the choice of lambda based on CV AUC for logistic (binomial outcome) models, or CV mse for gaussian.

**Warning about the tyrany of dimensionality**

Yes, this package will select 'optimal' models even when you (optimistically) supply more predictors than you have cases.
The model returned is unlikely to represent the only informative regularisation path through your data - if you run repeatedly with
exactly the same settings, you will probably see many different models being selected.
This is not a software bug - the real problem is that you just don't have enough information in your data.

Sufficiently big jobs will take a while (eg each lasso regression with 20k features on 1k samples takes about 2-3 minutes on our aged cluster)

**Input**

Assuming you have more measurements than samples, you supply data as a tabular text file where each row is a sample and columns
are variables. You specify which columns are dependent (predictors) and which are observations for each sample. Each of multiple
dependent variable columns will be run and reported independently. Predictors can be forced in to the model.

**Output**

For each selected dependent regression variable, a brief report of the model coefficients predicted at the
'optimal' nfold CV value of lambda.

**Predicted event probabilities for Cox and Logistic models**

If you want to compare (eg) two competing clinical predictions, there's a companion generic NRI tool
for predicted event probabilities. Estimates dozens of measures of improvement in prediction. Currently only works for identical id subjects
but can probably be extended to independent sample predictions.

Given a model, we can generate a predicted p (for status 1) in binomial or cox frameworks so models can be evaluated in terms of NRI.
Of course, estimates are likely substantially inflated over 'real world' performance by being estimated from the same sample - but you probably
already knew that since you were smart enough to reach this far down into the on screen help. The author salutes you, intrepid reader!

It may seem an odd thing to do, but we can predict p for an event for each subject from our original data, given a parsimonious model. Doing
this for two separate models (eg, forcing in an additional known explanatory measurement to the new model) allows comparison of the two models
predicted status for each subject, or the same model in independent populations to see how badly it does

**Attributions**

glmnet_ is the R package exposed by this Galaxy tool.

Galaxy_ (that's what you are using right now!) for gluing everything together

Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
licensed to you under the LGPL_ like other rgenetics artefacts

.. _LGPL: http://www.gnu.org/copyleft/lesser.html
.. _glmnet: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
.. _Galaxy: http://getgalaxy.org
</help>

<citations>
    <citation type="bibtex">
@Article{Friedman2010, title = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
    author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani},
    journal = {Journal of Statistical Software},
    year = {2010},
    volume = {33},
    number = {1},
    pages = {1--22},
    url = {http://www.jstatsoft.org/v33/i01/}
  }
    </citation>
    <citation type="doi">
10.1093/bioinformatics/bts573
    </citation>
</citations>
</tool>