comparison rglasso_cox.xml @ 0:cf295f36d606 draft

Initial commit for iuc/test rglasso
author fubar
date Sat, 31 Oct 2015 01:07:28 -0400
parents
children 31be675baa50
comparison
equal deleted inserted replaced
-1:000000000000 0:cf295f36d606
1 <tool id="rglasso_cox" name="Lasso" version="0.03">
2 <description>and cox regression using elastic net</description>
3 <requirements>
4 <requirement type="package" version="3.2.2">R_3_2_2</requirement>
5 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
6 <requirement type="package" version="9.10">ghostscript</requirement>
7 <requirement type="package" version="3.2">glmnet_lars_3_2</requirement>
8 </requirements>
9 <command interpreter="python">
10 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rglasso"
11 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
12 </command>
13 <configfiles>
14 <configfile name="runme">
15 <![CDATA[
16 library('glmnet')
17 library('lars')
18 library('survival')
19 library('pec')
20
21
22 message=function(x) {print.noquote(paste(x,sep=''))}
23
24
25 ross.cv.glmnet = function (x, y, weights, offset = NULL, lambda = NULL, type.measure = c("mse",
26 "deviance", "class", "auc", "mae"), nfolds = 10, foldid,
27 grouped = TRUE, keep = FALSE, parallel = FALSE, ...)
28 {
29 if (missing(type.measure))
30 type.measure = "default"
31 else type.measure = match.arg(type.measure)
32 if (!is.null(lambda) && length(lambda) < 2)
33 stop("Need more than one value of lambda for cv.glmnet")
34 N = nrow(x)
35 if (missing(weights))
36 weights = rep(1, N)
37 else weights = as.double(weights)
38 y = drop(y)
39 glmnet.call = match.call(expand.dots = TRUE)
40 sel = match(c("type.measure", "nfolds", "foldid", "grouped",
41 "keep"), names(glmnet.call), F)
42 if (any(sel))
43 glmnet.call = glmnet.call[-sel]
44 glmnet.call[[1]] = as.name("glmnet")
45 glmnet.object = glmnet(x, y, weights = weights, offset = offset,
46 lambda = lambda, ...)
47 glmnet.object\$call = glmnet.call
48 is.offset = glmnet.object\$offset
49 lambda = glmnet.object\$lambda
50 if (inherits(glmnet.object, "multnet")) {
51 nz = predict(glmnet.object, type = "nonzero")
52 nz = sapply(nz, function(x) sapply(x, length))
53 nz = ceiling(apply(nz, 1, median))
54 }
55 else nz = sapply(predict(glmnet.object, type = "nonzero"),
56 length)
57 if (missing(foldid))
58 foldid = sample(rep(seq(nfolds), length = N))
59 else nfolds = max(foldid)
60 if (nfolds < 3)
61 stop("nfolds must be bigger than 3; nfolds=10 recommended")
62 outlist = as.list(seq(nfolds))
63 if (parallel && require(foreach)) {
64 outlist = foreach(i = seq(nfolds), .packages = c("glmnet")) %dopar%
65 {
66 sel = foldid == i
67 if (is.matrix(y))
68 y_sub = y[!sel, ]
69 else y_sub = y[!sel]
70 if (is.offset)
71 offset_sub = as.matrix(offset)[!sel, ]
72 else offset_sub = NULL
73 glmnet(x[!sel, , drop = FALSE], y_sub, lambda = lambda,
74 offset = offset_sub, weights = weights[!sel],
75 ...)
76 }
77 }
78 else {
79 for (i in seq(nfolds)) {
80 sel = foldid == i
81 if (is.matrix(y))
82 y_sub = y[!sel, ]
83 else y_sub = y[!sel]
84 if (is.offset)
85 offset_sub = as.matrix(offset)[!sel, ]
86 else offset_sub = NULL
87 outlist[[i]] = glmnet(x[!sel, , drop = FALSE],
88 y_sub, lambda = lambda, offset = offset_sub,
89 weights = weights[!sel], ...)
90 }
91 }
92 fun = paste("cv", class(glmnet.object)[[1]], sep = ".")
93 cvstuff = do.call(fun, list(outlist, lambda, x, y, weights,
94 offset, foldid, type.measure, grouped, keep))
95 cvm = cvstuff\$cvm
96 cvsd = cvstuff\$cvsd
97 cvname = cvstuff\$name
98
99 out = list(lambda = lambda, cvm = cvm, cvsd = cvsd, cvup = cvm +
100 cvsd, cvlo = cvm - cvsd, nzero = nz, name = cvname, glmnet.fit = glmnet.object)
101 if (keep)
102 out = c(out, list(fit.preval = cvstuff\$fit.preval, foldid = foldid))
103
104 lamin = if (type.measure == "auc")
105 getmin(lambda, -cvm, cvsd)
106 else getmin(lambda, cvm, cvsd)
107 out = c(out, as.list(lamin))
108 hitsse = rep(0,ncol(x))
109 hitsmin = rep(0,ncol(x))
110 names(hitsse) = colnames(x)
111 names(hitsmin) = colnames(x)
112 olmin = lamin\$lambda.min
113 ol1sd = lamin\$lambda.1se
114 lambs = c(olmin,ol1sd)
115 names(lambs) = c('olmin','ol1sd')
116 for (cvfit in outlist) {
117 colmin = which(cvfit\$lambda == olmin)
118 col1se = which(cvfit\$lambda == ol1sd)
119 nzmin = which(cvfit\$beta[,colmin] != 0)
120 nz1se = which(cvfit\$beta[,col1se] != 0)
121 hitsse[nz1se] = hitsse[nz1se] + 1
122 hitsmin[nzmin] = hitsmin[nzmin] + 1
123 }
124 obj = c(out,list(cvhits.1se=hitsse,cvhits.min=hitsmin))
125 class(obj) = "cv.glmnet"
126 obj
127 }
128
129 mdsPlot = function(dm,myTitle,groups=NA,outpdfname,transpose=T)
130 {
131
132 samples = colnames(dm)
133 mt = myTitle
134 pcols=c('maroon')
135 if (! is.na(groups))
136 {
137 gu = unique(groups)
138 colours = rainbow(length(gu),start=0.1,end=0.9)
139 pcols = colours[match(groups,gu)]
140 }
141 mydata = dm
142 if (transpose==T)
143 {
144 mydata = t(dm)
145 }
146 npred = ncol(mydata)
147 d = dist(mydata)
148 fit = cmdscale(d,eig=TRUE, k=min(10,npred-2))
149 xmds = fit\$points[,1]
150 ymds = fit\$points[,2]
151 pdf(outpdfname)
152 plot(xmds, ymds, xlab="Dimension 1", ylab="Dimension 2",
153 main=paste(mt,"MDS Plot"),type="n", col=pcols, cex=0.35)
154 text(xmds, ymds, labels = row.names(mydata), cex=0.35, col=pcols)
155 grid(col="lightgray",lty="dotted")
156 dev.off()
157 }
158
159
160 getpredp_logistic = function(x,yvec,yvarname,id)
161 {
162 yvals = unique(yvec)
163 if (length(yvals) != 2) {
164 message(c('ERROR: y does not have 2 values =',paste(yvals,collapse=',')))
165 return(NA)
166 }
167 cols = colnames(x)
168 if (length(cols) == 0) {
169 message('ERROR: No columns in input x? Cannot predict!')
170 return(NA)
171 }
172 cn = paste(cols, collapse = ' + ')
173
174 formstring=paste("y ~",cn)
175 form = as.formula(formstring)
176 ok = complete.cases(x)
177
178 if (sum(ok) < length(ok)) {
179 x = x[ok,]
180 yvec = yvec[ok]
181 id = id[ok]
182 }
183 nx = data.frame(id=id,x,y=yvec)
184 print('nx,yvec:')
185 print(head(nx,n=3))
186 print(yvec)
187 mdl = glm(form, data=nx, family="binomial", na.action=na.omit)
188 message(c('Model format =',formstring))
189 message(paste('Predictive model details used to generate logistic outcome probabilities for',yvarname,':'))
190 print(summary(md1))
191 print(anova(md1))
192 predp = predict(md1,nx,type="response")
193 p1 = data.frame(id=id,pred_response=predp,obs_response=yvec)
194 return(p1)
195 }
196
197 getpredp_cox = function(x,time,status,id,predict_at)
198 {
199 cols = colnames(x)
200 if (length(cols) == 0) {
201 message('ERROR: No columns in input x? Cannot predict!')
202 return(NA)
203 }
204 cn = paste(colnames(x), collapse = ' + ')
205
206 formstring=paste("Surv(time, status) ~",cn)
207
208 form = as.formula(formstring)
209
210 ok = complete.cases(x)
211
212 if (sum(ok) < length(ok)) {
213 x = x[ok,]
214 time = time[ok]
215 status = status[ok]
216 id = id[ok]
217 }
218 nx = data.frame(x,time=time,status=status)
219 m1 = coxph(form, data=nx,singular.ok=TRUE)
220 print.noquote('Predictive model details used to generate survival probabilities:')
221 print.noquote(m1)
222 predpq = predictSurvProb(object=m1, newdata=nx, times=predict_at)
223 predpq = 1-predpq
224 colnames(predpq) = paste('p_surv_to',predict_at,sep='_')
225 p1 = data.frame(id=id,predpq,time=time,status=status)
226 return(p1)
227 }
228
229
230 dolasso_cox = function(x,y,debugOn=F,maxsteps=10000,nfold=10,xcolnames,ycolnames,optLambda='lambda.1se',out_full=F,out_full_file=NA,
231 out_pred=F,out_pred_file=NA,cox_id=NA, descr='Cox test',do_standard=F,alpha=0.9,penalty,predict_at,mdsplots=F)
232 {
233 logf = file("cox_rglasso.log", open = "a")
234 sink(logf,type = c("output", "message"))
235 res = NULL
236 if (mdsplots==T) {
237 outpdfname = 'cox_x_in_sample_space_MDS.pdf'
238 p = try({ mdsPlot(x,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T)
239 if (class(p) == "try-error")
240 {
241 print.noquote(paste('Unable to produce predictors in sample space mds plot',p))
242 }
243 outpdfname = 'cox_samples_in_x_space_MDS.pdf'
244 p = try({mdsPlot(x,'samples in measurement space',groups=y,outpdfname=outpdfname,transpose=F) },T)
245 if (class(p) == "try-error")
246 {
247 print.noquote(paste('Unable to produce samples in measurement space mds plots',p))
248 }
249 }
250 if (is.na(predict_at)) { predict_at = quantile(y) }
251 message(paste('@@@ Cox model will be predicted at times =',paste(predict_at,collapse=',')))
252 do_standard = do_standard
253 standardize = do_standard
254 normalize = do_standard
255 p = try({larsres = glmnet(x,y,family='cox',standardize=standardize,alpha=alpha,penalty.factor=penalty )},T)
256 if (class(p) == "try-error")
257 {
258 print.noquote('Unable to run cox glmnet on your data')
259 print.noquote(p)
260 sink()
261 return(NA)
262 }
263 if (out_full == T)
264 {
265 b = as.matrix(larsres\$beta)
266 nb = length(colnames(b))
267 bcoef = b[,nb]
268 lastl = larsres\$lambda[length(larsres\$lambda)]
269 allres = data.frame(x=rownames(b),beta=bcoef,lambda=lastl)
270 write.table(format(allres,digits=5),out_full_file,quote=FALSE, sep="\t",row.names=F)
271 }
272
273 outpdf = paste('cox',descr,'glmnetdev.pdf',sep='_')
274 try(
275 {
276 pdf(outpdf)
277 plot(larsres,main='cox glmnet',label=T)
278 grid()
279 dev.off()
280 },T)
281
282 larscv = NA
283
284 p = try({larscv=ross.cv.glmnet(x,y,family=fam,type.measure='deviance',penalty=penalty)},T)
285 if (class(p) == "try-error") {
286 print.noquote(paste('Unable to cross validate your data',p))
287 sink()
288 return(NA)
289 }
290 lse = larscv\$cvhits.1se
291 lmin = larscv\$cvhits.min
292 tot = lse + lmin
293 allhits = data.frame(hits_lambda_1se = lse,hits_lambda_min = lmin)
294 nzhits = allhits[which(tot != 0),]
295 message('Times each predictor was selected in CV models (excluding zero count predictors)')
296 print.noquote(nzhits)
297 out_nz_file = 'cox_cross_validation_model_counts.xls'
298 write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F)
299
300 outpdf = paste('cox',descr,'glmnet_cvdeviance.pdf',sep='_')
301
302 p = try(
303 {
304 pdf(outpdf)
305 plot(larscv,main='Deviance',label=T)
306 grid()
307 dev.off()
308 },T)
309 if (optLambda == 'lambda.min') {
310 best_lambda = larscv\$lambda.min
311 bestcoef = as.matrix(coef(larscv, s = "lambda.min"))
312 } else {
313 best_lambda = larscv\$lambda.1se
314 bestcoef = as.matrix(coef(larscv, s = "lambda.1se"))
315 }
316 inmodel = which(bestcoef != 0)
317 coefs = bestcoef[inmodel]
318 preds = rownames(bestcoef)[inmodel]
319
320 names(coefs) = preds
321 pen = as.logical( ! penalty[inmodel])
322 if (out_pred==T)
323 {
324 if (length(inmodel) > 0 ) {
325 predcols = inmodel
326 xmat = as.matrix(x[,predcols])
327 colnames(xmat) = preds
328 bestpred = getpredp_cox(x=xmat,time=y[,'time'],status=y[,'status'],id=cox_id, predict_at=predict_at)
329 pred = data.frame(responsep=bestpred, best_lambda=best_lambda,lamchoice=optLambda,alpha=alpha)
330 write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F)
331 } else { print.noquote('WARNING: No coefficients in selected model to predict with - no predictions made') }
332 }
333 if (debugOn) {
334 print.noquote(paste('best_lambda=',best_lambda,'saving cox respreds=',paste(names(coefs),collapse=','),'as predictors of survival. Coefs=',paste(coefs,collapse=',')))
335 }
336 p = try({res = data.frame(regulator=names(coefs),partial_likelihood=coefs,forced_in=pen,glmnet_model='cox',best_lambda=best_lambda,
337 lambdaChoice=optLambda,alpha=alpha)},T)
338 if (class(p) == "try-error") {
339 message(paste('@@@ unable to return a dataframe',p))
340 sink()
341 return(NA)
342 }
343 print.noquote('@@@ Results preview:')
344 print.noquote(res,digits=5)
345 sink()
346 return(res)
347
348 }
349
350
351 do_lasso = function(x=NA,y=NA,do_standard=T,debugOn=T,defaultFam="gaussian",optLambda='minLambda',descr='description', indx=1,target='target',sane=F,
352 alpha=0.9,nfold=10,penalty=c(),out_pred=F,out_pred_file='outpred',yvarname='yvar',id=c(),mdsplots=F)
353 {
354 logf = file(paste(target,"rglasso.log",sep='_'), open = "a")
355 sink(logf,type = c("output", "message"))
356 res = NA
357 phe_is_bin = (length(unique(y)) == 2)
358 forcedin = paste(colnames(x)[which(penalty == 0)],collapse=',')
359 fam = "gaussian"
360 if (defaultFam %in% c("poisson","binomial","gaussian","multinomial")) fam=defaultFam
361 if (phe_is_bin == T) {
362 fam = "binomial"
363 }
364 print.noquote(paste('target=',target,'is binary=',phe_is_bin,'dim(x)=',paste(dim(x),collapse=','),'length(y)=',length(y),'force=',forcedin,'fam=',fam))
365 standardize = do_standard
366 p = try({larsres = glmnet(x,y,family=fam,standardize=standardize,maxit=10000,alpha=alpha,penalty.factor=penalty) },T)
367 if (class(p) == "try-error")
368 {
369 print(paste('ERROR: unable to run glmnet for target',target,'error=',p))
370 sink()
371 return(NA)
372 }
373
374 mt = paste('Glmnet fraction deviance for',target)
375 outpdf = paste(target,'glmnetPath.pdf',sep='_')
376 pdf(outpdf)
377 plot(larsres,main=mt,label=T)
378 grid()
379 dev.off()
380
381 outpdf = paste(target,'glmnetDeviance.pdf',sep='_')
382
383 mt2 = paste('Glmnet lambda for',target)
384
385 pdf(outpdf)
386 plot(larsres,xvar="lambda",main=mt2,label=T)
387 grid()
388 dev.off()
389
390 larscv = NA
391 if (fam=="binomial") {
392 tmain = paste(target,'AUC')
393 outpdf = paste(target,'glmnetCV_AUC.pdf',sep='_')
394 p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='auc')},T)
395 } else {
396 tmain = paste(target,'CV MSE')
397 outpdf = paste(target,'glmnetCV_MSE.pdf',sep='_')
398 p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='mse')},T)
399 }
400 if (class(p) == "try-error")
401 {
402 print(paste('ERROR: unable to run cross validation for target',target,'error=',p))
403 sink()
404 return(NA)
405 }
406
407 pdf(outpdf)
408 plot(larscv,main=tmain)
409 grid()
410 dev.off()
411
412 lse = larscv\$cvhits.1se
413 lmin = larscv\$cvhits.min
414 tot = lse + lmin
415 allhits = data.frame(pred=colnames(x),hits_lambda_1se = lse,hits_lambda_min = lmin)
416 nzhits = allhits[which(tot != 0),]
417 message('Total hit count for each predictor over all CV models (excluding zero count predictors)')
418 print.noquote(nzhits)
419 out_nz_file = paste(target,'cross_validation_model_counts.xls',sep='_')
420 write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F)
421
422 ipenalty = c(0,penalty)
423 if (optLambda == 'lambda.min') {
424 best_lambda = larscv\$lambda.min
425 bestpred = as.matrix(coef(larscv, s = "lambda.min"))
426 } else {
427 best_lambda = larscv\$lambda.1se
428 bestpred = as.matrix(coef(larscv, s = "lambda.1se"))
429 }
430 inmodel = which(bestpred != 0)
431 coefs = bestpred[inmodel,1]
432 preds = rownames(bestpred)[inmodel]
433 iforced = ipenalty[inmodel]
434 forced = ! as.logical(iforced)
435 names(coefs) = preds
436 ncoef = length(coefs) - 1
437 if (out_pred==T && fam=="binomial")
438 {
439 print.noquote(paste('Predicting',target,'probabilities from binomial glmnet at alpha',alpha,'and lambda',best_lambda))
440 bestpred = predict.glmnet(larsres,s=best_lambda,newx=x,type="response")
441 bestpred = exp(bestpred)/(1+exp(bestpred))
442 pred = data.frame(id=id,y=y,predp=as.vector(bestpred), best_lambda=best_lambda)
443 write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F)
444 }
445 if (debugOn) {cat(indx,'best_lambda=',best_lambda,'saving',fam,'respreds=',names(coefs),'as predictors of',target,'coefs=',coefs,'\n')}
446 res = try(data.frame(i=indx,pred=target,regulator=names(coefs),coef=coefs,forced_in=forced,glmnet_model=fam,ncoef=ncoef,
447 best_lambda=best_lambda,lambdaChoice=optLambda,alpha=alpha),T)
448 if (class(res) == "try-error") {
449 sink()
450 return(NA) }
451 print.noquote(res)
452 sink()
453 return(res)
454 }
455
456
457 dolasso_generic = function(predvars=NA,depvars=NA,debugOn=T,maxsteps=100, alpha=0.9,nfold=10,xcolnames=c(),ycolnames=c(),optLambda='minLambda', out_pred_file=NA,
458 descr="describe me",do_standard=F,defaultFam="gaussian",penalty=c(),out_pred=F,cox_id=c(),mdsplots=F,xfilt=0.95)
459 {
460 logf = file("rglasso.log", open = "a")
461 sink(logf,type = c("output", "message"))
462 xdat = predvars
463 xm = data.matrix(xdat)
464 res = NULL
465 id = cox_id
466 depnames = ycolnames
467 ndep = length(depnames)
468 if (mdsplots==T) {
469 outpdfname = 'rglasso_x_in_sample_space_MDS.pdf'
470 p = try({ mdsPlot(xm,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T)
471 if (class(p) == "try-error")
472 {
473 print.noquote(paste('Unable to produce predictors in sample space mds plot. Error:',p))
474 }
475 outpdfname = 'rglasso_samples_in_x_space_MDS.pdf'
476 p = try({mdsPlot(xm,'samples in measurement space',groups=NA,outpdfname=outpdfname,transpose=F) },T)
477 if (class(p) == "try-error")
478 {
479 print.noquote(paste('Unable to produce samples in measurement space mds plot. Error:',p))
480 }
481 }
482 ndat = nrow(xm)
483 cfracs = colSums(! is.na(xm))/ndat
484 keepme = (cfracs >= xfilt)
485 print.noquote(paste('Removing', sum(! keepme), 'xvars with more than',xfilt,'fraction missing'))
486 vars = apply(xm,2,var,na.rm=T)
487 xm = xm[,keepme]
488 for (i in c(1:max(1,ndep))) {
489 target = depnames[i]
490 if (length(target) < 1) { target='y' }
491 if (i %% 100 == 0) { cat(i,target,'\n') }
492 if (ndep <= 1) {
493 y=depvars
494 } else {
495 y = depvars[,i]
496 }
497 if (fam == "binomial") {y = as.factor(y)}
498 x = xm
499 id = cox_id
500 if (fam != "cox") {
501 ok = complete.cases(x,y)
502 if (sum(! ok) > 0) {
503 message(paste('@@@ Removing',sum(! ok),'cases with missing y of',length(y),'@@@'))
504 y = y[(ok)]
505 x = x[(ok),]
506 id = id[(ok)]
507 }
508 }
509 ok = complete.cases(y)
510 if (sum(ok) == 0 ) {
511 print(paste("No complete cases found for",target,"in input x dim =",paste(dim(xm),collapse=','),"length y=",length(y)))
512 } else {
513 if (i == 1) { outpred = out_pred_file
514 } else {
515 outpred = paste(target,'predicted_output.xls')
516 }
517 regres = do_lasso(x=x,y=y,do_standard=do_standard,debugOn=debugOn,defaultFam=defaultFam,optLambda=optLambda,out_pred_file=outpred,
518 descr=descr,indx=i,target=target,alpha=alpha,nfold=nfold,penalty=penalty,out_pred=out_pred,yvarname=target,id=id,mdsplots=mdsplots)
519 if (! is.na(regres)) { res = rbind(res,regres) }
520 }
521 }
522 print.noquote('@@@ Results preview:')
523 print.noquote(res,digits=5)
524 sink()
525 return(res)
526 }
527
528
529 corPlot=function(xdat=c(),main='main title',is_raw=T)
530 {
531 library(pheatmap)
532 library(gplots)
533 if (is_raw) {
534 cxdat = cor(xdat,method="spearman",use="pairwise.complete.obs")
535 } else {
536 cxdat=xdat
537 }
538 xro = nrow(cxdat)
539 if (xro > 1000) stop("Too many rows for heatmap, who can read?!")
540 fontsize_col = 5.0
541 pheatmap(cxdat, main=main, show_colnames = F, width=30, height=30,
542 fontsize_row=fontsize_col, border_color=NA)
543 }
544
545
546 runTest = function(n=10)
547 {
548 set.seed (NULL)
549 Y = data.frame(y1=runif (n),y2=runif(n))
550 Xv <- runif(n*n)
551 X <- matrix(Xv, nrow = n, ncol = n)
552
553 mydf <- data.frame(Y, X)
554
555 regres_out = dolasso_generic(predvars=X,depvars=Y,debugOn=T,p.cutoff = 0.05,maxsteps=10000,nfold=10,
556 descr='randomdata',do_standard=do_standard,defaultFam="gaussian",alpha=0.05)
557 return(regres_out)
558 }
559 ]]>
560 options(width=512)
561 options(digits=5)
562 alpha = $alpha
563 nfold = $nfold
564 optLambda = "$optLambda"
565 Out_Dir = "$html_file.files_path"
566 Input = "$input1"
567 indat = read.table(Input,head=T,sep='\t')
568 datcols = colnames(indat)
569 myTitle = "$title"
570 outtab = "$model_file"
571 do_standard = as.logical("$do_standard")
572 mdsplots = as.logical("$mdsplots")
573 fam = "$model.fam"
574 xvar_cols_in = "$xvar_cols"
575 force_xvar_cols_in = "$force_xvar_cols"
576 xvar_cols = as.numeric(strsplit(xvar_cols_in,",")[[1]])
577 force_xvar_cols = c()
578 penalties = rep(1,length(datcols))
579 forced_in = NA
580
581 logxform = "$logxform_cols"
582 if (logxform != "None") {
583 logxform_cols = as.numeric(strsplit(logxform,",")[[1]])
584 if (length(logxform_cols) > 0) {
585 small = 1e-10
586 sset = indat[,logxform_cols]
587 zeros = which(sset==0,arr.ind=T)
588 nz = nrow(zeros)
589 if (nz &gt; 0) {
590 message(paste('Log transforming encountered',nz,'zeros - added 1e-10'))
591 sset[zeros] = sset[zeros] + small
592 lset = log(sset)
593 indat[,logxform_cols] = lset
594 }
595 }
596 }
597 if (force_xvar_cols_in != "None")
598 {
599 force_xvar_cols = as.numeric(strsplit(force_xvar_cols_in,",")[[1]])
600 allx = c(xvar_cols,force_xvar_cols)
601 xvar_cols = unique(allx)
602 xvar_cols = xvar_cols[order(xvar_cols)]
603 penalties[force_xvar_cols] = 0
604 }
605 penalty = penalties[xvar_cols]
606 forcedin = paste(datcols[which(penalties == 0)],collapse=',')
607 cox_id_col = NA
608 cox_id = NA
609
610 message(paste('@@@ Using alpha =',alpha,'for all models'))
611 x = indat[,xvar_cols]
612 nx = nrow(x)
613 cx = ncol(x)
614 message(paste('@@@@ Input has',nx,'samples and',cx,'predictors'))
615 if (cx > nx) {
616 message('@@@ WARNING: Models will have more variables than cases so glmnet will likely return one of many possible solutions! Please DO NOT expect reliable results - glmnet is clever but not magical @@@')
617 }
618
619 xcolnames = datcols[xvar_cols]
620
621 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
622 out_full = F
623 out_full_file = NA
624 out_pred_file = ""
625 out_pred = as.logical("$model.output_pred")
626
627 #if $model.fam == "binomial" or $model.fam == "cox":
628 cox_id_col = $model.cox_id
629 cox_id = indat[,cox_id_col]
630 if (out_pred == T) {
631 out_pred_file="$output_pred_file"
632 rownames(x) = cox_id
633 }
634 #end if
635 #if $model.fam == "cox":
636 cox_time = $model.cox_time
637 cox_status = $model.cox_status
638 out_full = as.logical("$model.output_full")
639 if (out_full == T) { out_full_file="$output_full_file" }
640 yvar_cols = c(cox_time,cox_status)
641 ycolnames = c('time','status')
642 istat = as.double(indat[,cox_status])
643 itime = as.double(indat[,cox_time])
644 predict_at = quantile(itime)
645 if ("$model.predict_at" &gt; "")
646 {
647 pa = "$model.predict_at"
648 predict_at = as.numeric(strsplit(pa,",")[[1]])
649 }
650 y = data.frame(time = itime, status = istat)
651 ustat = unique(istat)
652 if ((length(ustat) != 2) | (! 1 %in% ustat ) | (! 0 %in% ustat))
653 {
654 print.noquote(paste('INPUT ERROR: status must have 0 (censored) or 1 (event) but found',paste(ustat,collapse=',') ))
655 quit(save='no',status=1)
656 }
657 y = as.matrix(y)
658 x = as.matrix(x)
659 print.noquote(paste('@@@ Cox model will predict yvar=',datcols[cox_status],'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames),
660 'forced in=',forcedin))
661 regres_out = dolasso_cox(x=x,y=y,debugOn=F,maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_full=out_full,out_full_file=out_full_file,
662 out_pred=out_pred,out_pred_file=out_pred_file,cox_id=cox_id,descr=myTitle,do_standard=do_standard,alpha=alpha,penalty=penalty,predict_at=predict_at,mdsplots=mdsplots)
663 #else:
664 yvar_cols = "$model.yvar_cols"
665 yvar_cols = as.numeric(strsplit(yvar_cols,",")[[1]])
666 ycolnames = datcols[yvar_cols]
667 print.noquote(paste('@@@',fam,'model will predict yvar=',paste(ycolnames,collapse=','),'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames),
668 'forced in=',forcedin))
669 y = data.matrix(indat[,yvar_cols])
670 print.noquote(paste('Model will use',fam,'link function to predict yvar=',paste(ycolnames,collapse=','),'n preds=',length(xcolnames),'forced in=',forcedin))
671 regres_out = dolasso_generic(predvars=x,depvars=y,debugOn=F, maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_pred_file=out_pred_file,
672 descr=myTitle,do_standard=do_standard,defaultFam=fam,alpha=alpha,penalty=penalty,out_pred=out_pred,cox_id=cox_id,mdsplots=mdsplots)
673 #end if
674
675 write.table(format(regres_out,digits=5),outtab,quote=FALSE, sep="\t",row.names=F)
676 print.noquote('@@@ SessionInfo for this R session:')
677 sessionInfo()
678 warnings()
679
680 </configfile>
681 </configfiles>
682 <inputs>
683 <param name="title" type="text" value="lasso test" label="Title for job outputs" help="Typing a short, meaningful text here will help remind you (and explain to others) what the outputs represent">
684 <sanitizer invalid_char="">
685 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
686 </sanitizer>
687 </param>
688 <param name="input1" type="data" format="tabular" label="Select an input tabular text file from your history. Rows represent samples; Columns are measured phenotypes"
689 multiple='False' optional="False" help="Tabular text data with samples as rows, phenotypes as columns with a header row of column identifiers" />
690 <param name="xvar_cols" label="Select columns containing numeric variables to use as predictor (x) variables" type="data_column" data_ref="input1" numerical="False"
691 multiple="True" use_header_names="True" force_select="True" />
692 <param name="force_xvar_cols" label="Select numeric columns containing variables ALWAYS included as predictors in cross validation" type="data_column" data_ref="input1" numerical="False"
693 multiple="True" use_header_names="True" force_select="False"/>
694 <conditional name="model">
695 <param name="fam" type="select" label="GLM Link function for models"
696 help="Binary dependant variables will automatically be set to Binomial no matter what this is set to">
697 <option value="gaussian" selected="true">Gaussian - continuous dependent (y)</option>
698 <option value="binomial">Binomial dependent variables</option>
699 <option value="poisson">Poisson (eg counts)</option>
700 <option value="cox">Cox models - require special setup for y variables - see below</option>
701 </param>
702 <when value="gaussian">
703 <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False"
704 multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
705 <param name="output_full" type="hidden" value='F' />
706 <param name="output_pred" type="hidden" value='F' />
707 <param name="cox_id" label="Select column containing a unique sample identifier"
708 help = "Only really needed for output sample specific predicted values downstream."
709 type="data_column" data_ref="input1" numerical="False" force_select="True"
710 multiple="False" use_header_names="True" />
711 </when>
712 <when value="binomial">
713 <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False"
714 multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
715 <param name="output_full" type="hidden" value='F' />
716 <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" >
717 <option value="F" selected="true">No predicted value output file</option>
718 <option value="T">Create a predicted value output file</option>
719 </param>
720 <param name="cox_id" label="Select column containing a unique sample identifier"
721 help = "Only really needed for output sample specific predicted values downstream."
722 type="data_column" data_ref="input1" numerical="False" force_select="True"
723 multiple="False" use_header_names="True" />
724 <param name="predict_at" type="hidden" value='' />
725
726 </when>
727 <when value="poisson">
728 <param name="yvar_cols" label="Select columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="True"
729 multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
730 <param name="output_full" type="hidden" value='F' />
731 <param name="output_pred" type="hidden" value='F' />
732 <param name="predict_at" type="hidden" value='' />
733 <param name="cox_id" label="Select column containing a unique sample identifier"
734 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy"
735 type="data_column" data_ref="input1" numerical="True" force_select="False"
736 multiple="False" use_header_names="True" />
737 </when>
738 <when value="cox">
739 <param name="cox_time" label="Select column containing time under observation for Cox regression"
740 type="data_column" data_ref="input1" numerical="True" force_select="True"
741 multiple="False" use_header_names="True" help = "This MUST contain a time period - eg continuous years or days to failure or right censoring"/>
742 <param name="cox_status" label="Select column containing status = 1 for outcome of interest at the end of the time under observation or 0 for right censoring"
743 type="data_column" data_ref="input1" numerical="True" force_select="True"
744 multiple="False" use_header_names="True" help = "This MUST contain 1 for subjects who had an event at that time or 0 for a right censored observation"/>
745 <param name="cox_id" label="Select column containing a unique sample identifier"
746 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy"
747 type="data_column" data_ref="input1" numerical="False" force_select="False"
748 multiple="False" use_header_names="True" />
749 <param name="output_full" type="select" label="Create a tabular output with coefficients for all predictors" >
750 <option value="F" selected="true">No full model output file</option>
751 <option value="T">Create a full model output file</option>
752 </param>
753 <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" >
754 <option value="F" selected="true">No predicted value output file</option>
755 <option value="T">Create a predicted value output file</option>
756 </param>
757 <param name="predict_at" type="text" value='' label="Provide a comma separated list of times to make a prediction for each subject"
758 optional="True" help="Default (blank) will return predictions at 0%,25%,50%,75%,100% of the observed times which should be informative" />
759
760 </when>
761 </conditional>
762 <param name="optLambda" type="select" label="Value to use when reporting optimal model and coefficients" help="minLambda will have more predictors - 1SDLambda will be more parsimonious">
763 <option value="lambda.1se" selected="true">Lambda + 1 SE of min MSE or AUC (fewer coefficients - more false negatives)</option>
764 <option value="lambda.min">Lambda at min MSE or max AUC (more coefficients - more false positives)</option>
765 </param>
766 <param name="logxform_cols" optional="True" label="Select numeric columns to be log transformed before use as predictors or dependent variables" type="data_column"
767 data_ref="input1" numerical="True" multiple="True" use_header_names="True" help = "The wisdom of doing this depends entirely on your predictors - eg can help diminish long-tailed outlier influence"
768 force_select="False"/>
769 <param name="do_standard" type="select" label="Standardise x vars"
770 help="If all measurements on same scale, may not be needed. Coefficients are always returned on the original scale.">
771 <option value="False" selected="true">No standardisation of predictors</option>l
772 <option value="True">Standardise predictors before model</option>
773 </param>
774 <param name="mdsplots" type="select" label="Generate MDS plots of samples in measurement space and measurements in sample space" >
775 <option value="False" selected="true">No MDS plots</option>l
776 <option value="True">Yes create MDS plots</option>
777 </param>
778 <param name="alpha" type="float" value="0.95" min="0.01" max="1.0" label="Alpha - see glmnet docs. 1 for pure lasso. 0.0 for pure ridge regression"
779 help="Default 0.95 allows lasso to cope better with expected predictor collinearity. Use (eg) 0.5 for hybrid regularised regression or (eg) 0.025 for ridge regression"/>
780 <param name="nfold" type="integer" value="10" label="Number of folds for internal cross validation"
781 help="Default of 10 is usually ok"/>
782 </inputs>
783 <outputs>
784 <data format="html" name="html_file" label="${title}.html"/>
785 <data format="tabular" name="model_file" label="${title}_modelres.xls"/>
786 <data format="tabular" name="output_full_file" label="${title}_full_cox_model.xls">
787 <filter>model['output_full'] == 'T'</filter>
788 </data>
789 <data format="tabular" name="output_pred_file" label="${title}_predicted_from_model.xls">
790 <filter>model['output_pred'] == 'T'</filter>
791 </data>
792 </outputs>
793 <tests>
794 <test>
795 <param name='input1' value='cox_test.xls' ftype='tabular' />
796 <param name='treatment_name' value='case' />
797 <param name='title' value='Cox glmnet test' />
798 <param name='nfold' value='10' />
799 <param name='logxform_cols' value='' />
800 <param name='alpha' value='0.95' />
801 <param name='do_standard' value="True" />
802 <param name='cox_time' value='1' />
803 <param name='cox_status' value='2' />
804 <param name='cox_id' value='1' />
805 <param name='predict_at' value='' />
806 <param name='fam' value='cox' />
807 <param name='yvar_cols' value='' />
808 <param name='xvar_cols' value='3,4,5' />
809 <param name='force_xvar_cols' value='3' />
810 <param name='output_full' value='F' />
811 <param name='output_pred' value='F' />
812 <output name='model_file' file='coxlassotest_modelres.xls'>
813 <assert_contents>
814 <has_text text="rhubarb" />
815 <has_text text="TRUE" />
816 <!-- &#009; is XML escape code for tab -->
817 <!-- has_line line="regulator&#009;partial_likelihood&#009;forced_in&#009;glmnet_model&#009;best_lambda" / -->
818 <has_line line="regulator&#009;partial_likelihood&#009;forced_in&#009;glmnet_model&#009;best_lambda&#009;lambdaChoice&#009;alpha" />
819 <has_n_columns n="7" />
820 </assert_contents>
821 </output>
822 <output name='html_file' file='coxlassotest.html' compare='diff' lines_diff='16' />
823 </test>
824 </tests>
825 <help>
826
827 **Before you start**
828
829 Please read the glmnet documentation @ glmnet_
830
831 This Galaxy wrapper merely exposes that code and the glmnet_ documentation is essential reading
832 before getting useful results here.
833
834 **What it does**
835
836 From documentation at glmnet_ ::
837
838 Glmnet is a package that fits a generalized linear model via penalized maximum likelihood.
839 The regularization path is computed for the lasso or elasticnet penalty at a grid of values for the regularization parameter lambda.
840 The algorithm is extremely fast, and can exploit sparsity in the input matrix x.
841 It fits linear, logistic and multinomial, poisson, and Cox regression models.
842 A variety of predictions can be made from the fitted models.
843
844 Internal cross validation is used to optimise the choice of lambda based on CV AUC for logistic (binomial outcome) models, or CV mse for gaussian.
845
846 **Warning about the tyrany of dimensionality**
847
848 Yes, this package will select 'optimal' models even when you (optimistically) supply more predictors than you have cases.
849 The model returned is unlikely to represent the only informative regularisation path through your data - if you run repeatedly with
850 exactly the same settings, you will probably see many different models being selected.
851 This is not a software bug - the real problem is that you just don't have enough information in your data.
852
853 Sufficiently big jobs will take a while (eg each lasso regression with 20k features on 1k samples takes about 2-3 minutes on our aged cluster)
854
855 **Input**
856
857 Assuming you have more measurements than samples, you supply data as a tabular text file where each row is a sample and columns
858 are variables. You specify which columns are dependent (predictors) and which are observations for each sample. Each of multiple
859 dependent variable columns will be run and reported independently. Predictors can be forced in to the model.
860
861 **Output**
862
863 For each selected dependent regression variable, a brief report of the model coefficients predicted at the
864 'optimal' nfold CV value of lambda.
865
866 **Predicted event probabilities for Cox and Logistic models**
867
868 If you want to compare (eg) two competing clinical predictions, there's a companion generic NRI tool
869 for predicted event probabilities. Estimates dozens of measures of improvement in prediction. Currently only works for identical id subjects
870 but can probably be extended to independent sample predictions.
871
872 Given a model, we can generate a predicted p (for status 1) in binomial or cox frameworks so models can be evaluated in terms of NRI.
873 Of course, estimates are likely substantially inflated over 'real world' performance by being estimated from the same sample - but you probably
874 already knew that since you were smart enough to reach this far down into the on screen help. The author salutes you, intrepid reader!
875
876 It may seem an odd thing to do, but we can predict p for an event for each subject from our original data, given a parsimonious model. Doing
877 this for two separate models (eg, forcing in an additional known explanatory measurement to the new model) allows comparison of the two models
878 predicted status for each subject, or the same model in independent populations to see how badly it does
879
880 **Attributions**
881
882 glmnet_ is the R package exposed by this Galaxy tool.
883
884 Galaxy_ (that's what you are using right now!) for gluing everything together
885
886 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
887 licensed to you under the LGPL_ like other rgenetics artefacts
888
889 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
890 .. _glmnet: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
891 .. _Galaxy: http://getgalaxy.org
892 </help>
893
894 <citations>
895 <citation type="bibtex">
896 @Article{Friedman2010, title = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
897 author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani},
898 journal = {Journal of Statistical Software},
899 year = {2010},
900 volume = {33},
901 number = {1},
902 pages = {1--22},
903 url = {http://www.jstatsoft.org/v33/i01/}
904 }
905 </citation>
906 <citation type="doi">
907 10.1093/bioinformatics/bts573
908 </citation>
909 </citations>
910 </tool>