Mercurial > repos > fubar > rglasso
annotate rglasso_cox.xml @ 12:7949cc3b28b0 draft default tip
Deleted selected files
author | fubar |
---|---|
date | Sun, 01 Nov 2015 18:27:03 -0500 |
parents | d351a1ec1e18 |
children |
rev | line source |
---|---|
0 | 1 <tool id="rglasso_cox" name="Lasso" version="0.03"> |
2 <description>and cox regression using elastic net</description> | |
3 <requirements> | |
10 | 4 <requirement type="package" version="3.2.2">R</requirement> |
0 | 5 <requirement type="package" version="1.3.18">graphicsmagick</requirement> |
6 <requirement type="package" version="9.10">ghostscript</requirement> | |
7 <requirement type="package" version="3.2">glmnet_lars_3_2</requirement> | |
8 </requirements> | |
9 <command interpreter="python"> | |
10 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rglasso" | |
11 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" | |
12 </command> | |
13 <configfiles> | |
14 <configfile name="runme"> | |
15 <![CDATA[ | |
11
d351a1ec1e18
GRRR!!! as at nov 2015, glmnet fails to load methods. see https://github.com/dhimmel/elevcan/issues/1
fubar
parents:
10
diff
changeset
|
16 library('methods') |
d351a1ec1e18
GRRR!!! as at nov 2015, glmnet fails to load methods. see https://github.com/dhimmel/elevcan/issues/1
fubar
parents:
10
diff
changeset
|
17 #### as at nov 2015, glmnet fails to load methods. see https://github.com/dhimmel/elevcan/issues/1 |
0 | 18 library('glmnet') |
19 library('lars') | |
20 library('survival') | |
21 library('pec') | |
22 | |
23 | |
24 message=function(x) {print.noquote(paste(x,sep=''))} | |
25 | |
26 | |
27 ross.cv.glmnet = function (x, y, weights, offset = NULL, lambda = NULL, type.measure = c("mse", | |
28 "deviance", "class", "auc", "mae"), nfolds = 10, foldid, | |
29 grouped = TRUE, keep = FALSE, parallel = FALSE, ...) | |
30 { | |
31 if (missing(type.measure)) | |
32 type.measure = "default" | |
33 else type.measure = match.arg(type.measure) | |
34 if (!is.null(lambda) && length(lambda) < 2) | |
35 stop("Need more than one value of lambda for cv.glmnet") | |
36 N = nrow(x) | |
37 if (missing(weights)) | |
38 weights = rep(1, N) | |
39 else weights = as.double(weights) | |
40 y = drop(y) | |
41 glmnet.call = match.call(expand.dots = TRUE) | |
42 sel = match(c("type.measure", "nfolds", "foldid", "grouped", | |
43 "keep"), names(glmnet.call), F) | |
44 if (any(sel)) | |
45 glmnet.call = glmnet.call[-sel] | |
46 glmnet.call[[1]] = as.name("glmnet") | |
47 glmnet.object = glmnet(x, y, weights = weights, offset = offset, | |
48 lambda = lambda, ...) | |
49 glmnet.object\$call = glmnet.call | |
50 is.offset = glmnet.object\$offset | |
51 lambda = glmnet.object\$lambda | |
52 if (inherits(glmnet.object, "multnet")) { | |
53 nz = predict(glmnet.object, type = "nonzero") | |
54 nz = sapply(nz, function(x) sapply(x, length)) | |
55 nz = ceiling(apply(nz, 1, median)) | |
56 } | |
57 else nz = sapply(predict(glmnet.object, type = "nonzero"), | |
58 length) | |
59 if (missing(foldid)) | |
60 foldid = sample(rep(seq(nfolds), length = N)) | |
61 else nfolds = max(foldid) | |
62 if (nfolds < 3) | |
63 stop("nfolds must be bigger than 3; nfolds=10 recommended") | |
64 outlist = as.list(seq(nfolds)) | |
65 if (parallel && require(foreach)) { | |
66 outlist = foreach(i = seq(nfolds), .packages = c("glmnet")) %dopar% | |
67 { | |
68 sel = foldid == i | |
69 if (is.matrix(y)) | |
70 y_sub = y[!sel, ] | |
71 else y_sub = y[!sel] | |
72 if (is.offset) | |
73 offset_sub = as.matrix(offset)[!sel, ] | |
74 else offset_sub = NULL | |
75 glmnet(x[!sel, , drop = FALSE], y_sub, lambda = lambda, | |
76 offset = offset_sub, weights = weights[!sel], | |
77 ...) | |
78 } | |
79 } | |
80 else { | |
81 for (i in seq(nfolds)) { | |
82 sel = foldid == i | |
83 if (is.matrix(y)) | |
84 y_sub = y[!sel, ] | |
85 else y_sub = y[!sel] | |
86 if (is.offset) | |
87 offset_sub = as.matrix(offset)[!sel, ] | |
88 else offset_sub = NULL | |
89 outlist[[i]] = glmnet(x[!sel, , drop = FALSE], | |
90 y_sub, lambda = lambda, offset = offset_sub, | |
91 weights = weights[!sel], ...) | |
92 } | |
93 } | |
94 fun = paste("cv", class(glmnet.object)[[1]], sep = ".") | |
95 cvstuff = do.call(fun, list(outlist, lambda, x, y, weights, | |
96 offset, foldid, type.measure, grouped, keep)) | |
97 cvm = cvstuff\$cvm | |
98 cvsd = cvstuff\$cvsd | |
99 cvname = cvstuff\$name | |
100 | |
101 out = list(lambda = lambda, cvm = cvm, cvsd = cvsd, cvup = cvm + | |
102 cvsd, cvlo = cvm - cvsd, nzero = nz, name = cvname, glmnet.fit = glmnet.object) | |
103 if (keep) | |
104 out = c(out, list(fit.preval = cvstuff\$fit.preval, foldid = foldid)) | |
105 | |
106 lamin = if (type.measure == "auc") | |
107 getmin(lambda, -cvm, cvsd) | |
108 else getmin(lambda, cvm, cvsd) | |
109 out = c(out, as.list(lamin)) | |
110 hitsse = rep(0,ncol(x)) | |
111 hitsmin = rep(0,ncol(x)) | |
112 names(hitsse) = colnames(x) | |
113 names(hitsmin) = colnames(x) | |
114 olmin = lamin\$lambda.min | |
115 ol1sd = lamin\$lambda.1se | |
116 lambs = c(olmin,ol1sd) | |
117 names(lambs) = c('olmin','ol1sd') | |
118 for (cvfit in outlist) { | |
119 colmin = which(cvfit\$lambda == olmin) | |
120 col1se = which(cvfit\$lambda == ol1sd) | |
121 nzmin = which(cvfit\$beta[,colmin] != 0) | |
122 nz1se = which(cvfit\$beta[,col1se] != 0) | |
123 hitsse[nz1se] = hitsse[nz1se] + 1 | |
124 hitsmin[nzmin] = hitsmin[nzmin] + 1 | |
125 } | |
126 obj = c(out,list(cvhits.1se=hitsse,cvhits.min=hitsmin)) | |
127 class(obj) = "cv.glmnet" | |
128 obj | |
129 } | |
130 | |
131 mdsPlot = function(dm,myTitle,groups=NA,outpdfname,transpose=T) | |
132 { | |
133 | |
134 samples = colnames(dm) | |
135 mt = myTitle | |
136 pcols=c('maroon') | |
137 if (! is.na(groups)) | |
138 { | |
139 gu = unique(groups) | |
140 colours = rainbow(length(gu),start=0.1,end=0.9) | |
141 pcols = colours[match(groups,gu)] | |
142 } | |
143 mydata = dm | |
144 if (transpose==T) | |
145 { | |
146 mydata = t(dm) | |
147 } | |
148 npred = ncol(mydata) | |
149 d = dist(mydata) | |
150 fit = cmdscale(d,eig=TRUE, k=min(10,npred-2)) | |
151 xmds = fit\$points[,1] | |
152 ymds = fit\$points[,2] | |
153 pdf(outpdfname) | |
154 plot(xmds, ymds, xlab="Dimension 1", ylab="Dimension 2", | |
155 main=paste(mt,"MDS Plot"),type="n", col=pcols, cex=0.35) | |
156 text(xmds, ymds, labels = row.names(mydata), cex=0.35, col=pcols) | |
157 grid(col="lightgray",lty="dotted") | |
158 dev.off() | |
159 } | |
160 | |
161 | |
162 getpredp_logistic = function(x,yvec,yvarname,id) | |
163 { | |
164 yvals = unique(yvec) | |
165 if (length(yvals) != 2) { | |
166 message(c('ERROR: y does not have 2 values =',paste(yvals,collapse=','))) | |
167 return(NA) | |
168 } | |
169 cols = colnames(x) | |
170 if (length(cols) == 0) { | |
171 message('ERROR: No columns in input x? Cannot predict!') | |
172 return(NA) | |
173 } | |
174 cn = paste(cols, collapse = ' + ') | |
175 | |
176 formstring=paste("y ~",cn) | |
177 form = as.formula(formstring) | |
178 ok = complete.cases(x) | |
179 | |
180 if (sum(ok) < length(ok)) { | |
181 x = x[ok,] | |
182 yvec = yvec[ok] | |
183 id = id[ok] | |
184 } | |
185 nx = data.frame(id=id,x,y=yvec) | |
186 print('nx,yvec:') | |
187 print(head(nx,n=3)) | |
188 print(yvec) | |
189 mdl = glm(form, data=nx, family="binomial", na.action=na.omit) | |
190 message(c('Model format =',formstring)) | |
191 message(paste('Predictive model details used to generate logistic outcome probabilities for',yvarname,':')) | |
192 print(summary(md1)) | |
193 print(anova(md1)) | |
194 predp = predict(md1,nx,type="response") | |
195 p1 = data.frame(id=id,pred_response=predp,obs_response=yvec) | |
196 return(p1) | |
197 } | |
198 | |
199 getpredp_cox = function(x,time,status,id,predict_at) | |
200 { | |
201 cols = colnames(x) | |
202 if (length(cols) == 0) { | |
203 message('ERROR: No columns in input x? Cannot predict!') | |
204 return(NA) | |
205 } | |
206 cn = paste(colnames(x), collapse = ' + ') | |
207 | |
208 formstring=paste("Surv(time, status) ~",cn) | |
209 | |
210 form = as.formula(formstring) | |
211 | |
212 ok = complete.cases(x) | |
213 | |
214 if (sum(ok) < length(ok)) { | |
215 x = x[ok,] | |
216 time = time[ok] | |
217 status = status[ok] | |
218 id = id[ok] | |
219 } | |
220 nx = data.frame(x,time=time,status=status) | |
221 m1 = coxph(form, data=nx,singular.ok=TRUE) | |
222 print.noquote('Predictive model details used to generate survival probabilities:') | |
223 print.noquote(m1) | |
224 predpq = predictSurvProb(object=m1, newdata=nx, times=predict_at) | |
225 predpq = 1-predpq | |
226 colnames(predpq) = paste('p_surv_to',predict_at,sep='_') | |
227 p1 = data.frame(id=id,predpq,time=time,status=status) | |
228 return(p1) | |
229 } | |
230 | |
231 | |
232 dolasso_cox = function(x,y,debugOn=F,maxsteps=10000,nfold=10,xcolnames,ycolnames,optLambda='lambda.1se',out_full=F,out_full_file=NA, | |
233 out_pred=F,out_pred_file=NA,cox_id=NA, descr='Cox test',do_standard=F,alpha=0.9,penalty,predict_at,mdsplots=F) | |
234 { | |
235 logf = file("cox_rglasso.log", open = "a") | |
236 sink(logf,type = c("output", "message")) | |
237 res = NULL | |
238 if (mdsplots==T) { | |
239 outpdfname = 'cox_x_in_sample_space_MDS.pdf' | |
240 p = try({ mdsPlot(x,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T) | |
241 if (class(p) == "try-error") | |
242 { | |
243 print.noquote(paste('Unable to produce predictors in sample space mds plot',p)) | |
244 } | |
245 outpdfname = 'cox_samples_in_x_space_MDS.pdf' | |
246 p = try({mdsPlot(x,'samples in measurement space',groups=y,outpdfname=outpdfname,transpose=F) },T) | |
247 if (class(p) == "try-error") | |
248 { | |
249 print.noquote(paste('Unable to produce samples in measurement space mds plots',p)) | |
250 } | |
251 } | |
252 if (is.na(predict_at)) { predict_at = quantile(y) } | |
253 message(paste('@@@ Cox model will be predicted at times =',paste(predict_at,collapse=','))) | |
254 do_standard = do_standard | |
255 standardize = do_standard | |
256 normalize = do_standard | |
257 p = try({larsres = glmnet(x,y,family='cox',standardize=standardize,alpha=alpha,penalty.factor=penalty )},T) | |
258 if (class(p) == "try-error") | |
259 { | |
260 print.noquote('Unable to run cox glmnet on your data') | |
261 print.noquote(p) | |
262 sink() | |
263 return(NA) | |
264 } | |
265 if (out_full == T) | |
266 { | |
267 b = as.matrix(larsres\$beta) | |
268 nb = length(colnames(b)) | |
269 bcoef = b[,nb] | |
270 lastl = larsres\$lambda[length(larsres\$lambda)] | |
271 allres = data.frame(x=rownames(b),beta=bcoef,lambda=lastl) | |
272 write.table(format(allres,digits=5),out_full_file,quote=FALSE, sep="\t",row.names=F) | |
273 } | |
274 | |
275 outpdf = paste('cox',descr,'glmnetdev.pdf',sep='_') | |
276 try( | |
277 { | |
278 pdf(outpdf) | |
279 plot(larsres,main='cox glmnet',label=T) | |
280 grid() | |
281 dev.off() | |
282 },T) | |
283 | |
284 larscv = NA | |
285 | |
286 p = try({larscv=ross.cv.glmnet(x,y,family=fam,type.measure='deviance',penalty=penalty)},T) | |
287 if (class(p) == "try-error") { | |
288 print.noquote(paste('Unable to cross validate your data',p)) | |
289 sink() | |
290 return(NA) | |
291 } | |
292 lse = larscv\$cvhits.1se | |
293 lmin = larscv\$cvhits.min | |
294 tot = lse + lmin | |
295 allhits = data.frame(hits_lambda_1se = lse,hits_lambda_min = lmin) | |
296 nzhits = allhits[which(tot != 0),] | |
297 message('Times each predictor was selected in CV models (excluding zero count predictors)') | |
298 print.noquote(nzhits) | |
299 out_nz_file = 'cox_cross_validation_model_counts.xls' | |
300 write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F) | |
301 | |
302 outpdf = paste('cox',descr,'glmnet_cvdeviance.pdf',sep='_') | |
303 | |
304 p = try( | |
305 { | |
306 pdf(outpdf) | |
307 plot(larscv,main='Deviance',label=T) | |
308 grid() | |
309 dev.off() | |
310 },T) | |
311 if (optLambda == 'lambda.min') { | |
312 best_lambda = larscv\$lambda.min | |
313 bestcoef = as.matrix(coef(larscv, s = "lambda.min")) | |
314 } else { | |
315 best_lambda = larscv\$lambda.1se | |
316 bestcoef = as.matrix(coef(larscv, s = "lambda.1se")) | |
317 } | |
318 inmodel = which(bestcoef != 0) | |
319 coefs = bestcoef[inmodel] | |
320 preds = rownames(bestcoef)[inmodel] | |
321 | |
322 names(coefs) = preds | |
323 pen = as.logical( ! penalty[inmodel]) | |
324 if (out_pred==T) | |
325 { | |
326 if (length(inmodel) > 0 ) { | |
327 predcols = inmodel | |
328 xmat = as.matrix(x[,predcols]) | |
329 colnames(xmat) = preds | |
330 bestpred = getpredp_cox(x=xmat,time=y[,'time'],status=y[,'status'],id=cox_id, predict_at=predict_at) | |
331 pred = data.frame(responsep=bestpred, best_lambda=best_lambda,lamchoice=optLambda,alpha=alpha) | |
332 write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F) | |
333 } else { print.noquote('WARNING: No coefficients in selected model to predict with - no predictions made') } | |
334 } | |
335 if (debugOn) { | |
336 print.noquote(paste('best_lambda=',best_lambda,'saving cox respreds=',paste(names(coefs),collapse=','),'as predictors of survival. Coefs=',paste(coefs,collapse=','))) | |
337 } | |
338 p = try({res = data.frame(regulator=names(coefs),partial_likelihood=coefs,forced_in=pen,glmnet_model='cox',best_lambda=best_lambda, | |
339 lambdaChoice=optLambda,alpha=alpha)},T) | |
340 if (class(p) == "try-error") { | |
341 message(paste('@@@ unable to return a dataframe',p)) | |
342 sink() | |
343 return(NA) | |
344 } | |
345 print.noquote('@@@ Results preview:') | |
346 print.noquote(res,digits=5) | |
347 sink() | |
348 return(res) | |
349 | |
350 } | |
351 | |
352 | |
353 do_lasso = function(x=NA,y=NA,do_standard=T,debugOn=T,defaultFam="gaussian",optLambda='minLambda',descr='description', indx=1,target='target',sane=F, | |
354 alpha=0.9,nfold=10,penalty=c(),out_pred=F,out_pred_file='outpred',yvarname='yvar',id=c(),mdsplots=F) | |
355 { | |
356 logf = file(paste(target,"rglasso.log",sep='_'), open = "a") | |
357 sink(logf,type = c("output", "message")) | |
358 res = NA | |
359 phe_is_bin = (length(unique(y)) == 2) | |
360 forcedin = paste(colnames(x)[which(penalty == 0)],collapse=',') | |
361 fam = "gaussian" | |
362 if (defaultFam %in% c("poisson","binomial","gaussian","multinomial")) fam=defaultFam | |
363 if (phe_is_bin == T) { | |
364 fam = "binomial" | |
365 } | |
366 print.noquote(paste('target=',target,'is binary=',phe_is_bin,'dim(x)=',paste(dim(x),collapse=','),'length(y)=',length(y),'force=',forcedin,'fam=',fam)) | |
367 standardize = do_standard | |
368 p = try({larsres = glmnet(x,y,family=fam,standardize=standardize,maxit=10000,alpha=alpha,penalty.factor=penalty) },T) | |
369 if (class(p) == "try-error") | |
370 { | |
371 print(paste('ERROR: unable to run glmnet for target',target,'error=',p)) | |
372 sink() | |
373 return(NA) | |
374 } | |
375 | |
376 mt = paste('Glmnet fraction deviance for',target) | |
377 outpdf = paste(target,'glmnetPath.pdf',sep='_') | |
378 pdf(outpdf) | |
379 plot(larsres,main=mt,label=T) | |
380 grid() | |
381 dev.off() | |
382 | |
383 outpdf = paste(target,'glmnetDeviance.pdf',sep='_') | |
384 | |
385 mt2 = paste('Glmnet lambda for',target) | |
386 | |
387 pdf(outpdf) | |
388 plot(larsres,xvar="lambda",main=mt2,label=T) | |
389 grid() | |
390 dev.off() | |
391 | |
392 larscv = NA | |
393 if (fam=="binomial") { | |
394 tmain = paste(target,'AUC') | |
395 outpdf = paste(target,'glmnetCV_AUC.pdf',sep='_') | |
396 p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='auc')},T) | |
397 } else { | |
398 tmain = paste(target,'CV MSE') | |
399 outpdf = paste(target,'glmnetCV_MSE.pdf',sep='_') | |
400 p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='mse')},T) | |
401 } | |
402 if (class(p) == "try-error") | |
403 { | |
404 print(paste('ERROR: unable to run cross validation for target',target,'error=',p)) | |
405 sink() | |
406 return(NA) | |
407 } | |
408 | |
409 pdf(outpdf) | |
410 plot(larscv,main=tmain) | |
411 grid() | |
412 dev.off() | |
413 | |
414 lse = larscv\$cvhits.1se | |
415 lmin = larscv\$cvhits.min | |
416 tot = lse + lmin | |
417 allhits = data.frame(pred=colnames(x),hits_lambda_1se = lse,hits_lambda_min = lmin) | |
418 nzhits = allhits[which(tot != 0),] | |
419 message('Total hit count for each predictor over all CV models (excluding zero count predictors)') | |
420 print.noquote(nzhits) | |
421 out_nz_file = paste(target,'cross_validation_model_counts.xls',sep='_') | |
422 write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F) | |
423 | |
424 ipenalty = c(0,penalty) | |
425 if (optLambda == 'lambda.min') { | |
426 best_lambda = larscv\$lambda.min | |
427 bestpred = as.matrix(coef(larscv, s = "lambda.min")) | |
428 } else { | |
429 best_lambda = larscv\$lambda.1se | |
430 bestpred = as.matrix(coef(larscv, s = "lambda.1se")) | |
431 } | |
432 inmodel = which(bestpred != 0) | |
433 coefs = bestpred[inmodel,1] | |
434 preds = rownames(bestpred)[inmodel] | |
435 iforced = ipenalty[inmodel] | |
436 forced = ! as.logical(iforced) | |
437 names(coefs) = preds | |
438 ncoef = length(coefs) - 1 | |
439 if (out_pred==T && fam=="binomial") | |
440 { | |
441 print.noquote(paste('Predicting',target,'probabilities from binomial glmnet at alpha',alpha,'and lambda',best_lambda)) | |
442 bestpred = predict.glmnet(larsres,s=best_lambda,newx=x,type="response") | |
443 bestpred = exp(bestpred)/(1+exp(bestpred)) | |
444 pred = data.frame(id=id,y=y,predp=as.vector(bestpred), best_lambda=best_lambda) | |
445 write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F) | |
446 } | |
447 if (debugOn) {cat(indx,'best_lambda=',best_lambda,'saving',fam,'respreds=',names(coefs),'as predictors of',target,'coefs=',coefs,'\n')} | |
448 res = try(data.frame(i=indx,pred=target,regulator=names(coefs),coef=coefs,forced_in=forced,glmnet_model=fam,ncoef=ncoef, | |
449 best_lambda=best_lambda,lambdaChoice=optLambda,alpha=alpha),T) | |
450 if (class(res) == "try-error") { | |
451 sink() | |
452 return(NA) } | |
453 print.noquote(res) | |
454 sink() | |
455 return(res) | |
456 } | |
457 | |
458 | |
459 dolasso_generic = function(predvars=NA,depvars=NA,debugOn=T,maxsteps=100, alpha=0.9,nfold=10,xcolnames=c(),ycolnames=c(),optLambda='minLambda', out_pred_file=NA, | |
460 descr="describe me",do_standard=F,defaultFam="gaussian",penalty=c(),out_pred=F,cox_id=c(),mdsplots=F,xfilt=0.95) | |
461 { | |
462 logf = file("rglasso.log", open = "a") | |
463 sink(logf,type = c("output", "message")) | |
464 xdat = predvars | |
465 xm = data.matrix(xdat) | |
466 res = NULL | |
467 id = cox_id | |
468 depnames = ycolnames | |
469 ndep = length(depnames) | |
470 if (mdsplots==T) { | |
471 outpdfname = 'rglasso_x_in_sample_space_MDS.pdf' | |
472 p = try({ mdsPlot(xm,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T) | |
473 if (class(p) == "try-error") | |
474 { | |
475 print.noquote(paste('Unable to produce predictors in sample space mds plot. Error:',p)) | |
476 } | |
477 outpdfname = 'rglasso_samples_in_x_space_MDS.pdf' | |
478 p = try({mdsPlot(xm,'samples in measurement space',groups=NA,outpdfname=outpdfname,transpose=F) },T) | |
479 if (class(p) == "try-error") | |
480 { | |
481 print.noquote(paste('Unable to produce samples in measurement space mds plot. Error:',p)) | |
482 } | |
483 } | |
484 ndat = nrow(xm) | |
485 cfracs = colSums(! is.na(xm))/ndat | |
486 keepme = (cfracs >= xfilt) | |
487 print.noquote(paste('Removing', sum(! keepme), 'xvars with more than',xfilt,'fraction missing')) | |
488 vars = apply(xm,2,var,na.rm=T) | |
489 xm = xm[,keepme] | |
490 for (i in c(1:max(1,ndep))) { | |
491 target = depnames[i] | |
492 if (length(target) < 1) { target='y' } | |
493 if (i %% 100 == 0) { cat(i,target,'\n') } | |
494 if (ndep <= 1) { | |
495 y=depvars | |
496 } else { | |
497 y = depvars[,i] | |
498 } | |
499 if (fam == "binomial") {y = as.factor(y)} | |
500 x = xm | |
501 id = cox_id | |
502 if (fam != "cox") { | |
503 ok = complete.cases(x,y) | |
504 if (sum(! ok) > 0) { | |
505 message(paste('@@@ Removing',sum(! ok),'cases with missing y of',length(y),'@@@')) | |
506 y = y[(ok)] | |
507 x = x[(ok),] | |
508 id = id[(ok)] | |
509 } | |
510 } | |
511 ok = complete.cases(y) | |
512 if (sum(ok) == 0 ) { | |
513 print(paste("No complete cases found for",target,"in input x dim =",paste(dim(xm),collapse=','),"length y=",length(y))) | |
514 } else { | |
515 if (i == 1) { outpred = out_pred_file | |
516 } else { | |
517 outpred = paste(target,'predicted_output.xls') | |
518 } | |
519 regres = do_lasso(x=x,y=y,do_standard=do_standard,debugOn=debugOn,defaultFam=defaultFam,optLambda=optLambda,out_pred_file=outpred, | |
520 descr=descr,indx=i,target=target,alpha=alpha,nfold=nfold,penalty=penalty,out_pred=out_pred,yvarname=target,id=id,mdsplots=mdsplots) | |
521 if (! is.na(regres)) { res = rbind(res,regres) } | |
522 } | |
523 } | |
524 print.noquote('@@@ Results preview:') | |
525 print.noquote(res,digits=5) | |
526 sink() | |
527 return(res) | |
528 } | |
529 | |
530 | |
531 corPlot=function(xdat=c(),main='main title',is_raw=T) | |
532 { | |
533 library(pheatmap) | |
534 library(gplots) | |
535 if (is_raw) { | |
536 cxdat = cor(xdat,method="spearman",use="pairwise.complete.obs") | |
537 } else { | |
538 cxdat=xdat | |
539 } | |
540 xro = nrow(cxdat) | |
541 if (xro > 1000) stop("Too many rows for heatmap, who can read?!") | |
542 fontsize_col = 5.0 | |
543 pheatmap(cxdat, main=main, show_colnames = F, width=30, height=30, | |
544 fontsize_row=fontsize_col, border_color=NA) | |
545 } | |
546 | |
547 | |
548 runTest = function(n=10) | |
549 { | |
550 set.seed (NULL) | |
551 Y = data.frame(y1=runif (n),y2=runif(n)) | |
552 Xv <- runif(n*n) | |
553 X <- matrix(Xv, nrow = n, ncol = n) | |
554 | |
555 mydf <- data.frame(Y, X) | |
556 | |
557 regres_out = dolasso_generic(predvars=X,depvars=Y,debugOn=T,p.cutoff = 0.05,maxsteps=10000,nfold=10, | |
558 descr='randomdata',do_standard=do_standard,defaultFam="gaussian",alpha=0.05) | |
559 return(regres_out) | |
560 } | |
561 ]]> | |
562 options(width=512) | |
563 options(digits=5) | |
564 alpha = $alpha | |
565 nfold = $nfold | |
566 optLambda = "$optLambda" | |
567 Out_Dir = "$html_file.files_path" | |
568 Input = "$input1" | |
569 indat = read.table(Input,head=T,sep='\t') | |
570 datcols = colnames(indat) | |
571 myTitle = "$title" | |
572 outtab = "$model_file" | |
573 do_standard = as.logical("$do_standard") | |
574 mdsplots = as.logical("$mdsplots") | |
575 fam = "$model.fam" | |
576 xvar_cols_in = "$xvar_cols" | |
577 force_xvar_cols_in = "$force_xvar_cols" | |
578 xvar_cols = as.numeric(strsplit(xvar_cols_in,",")[[1]]) | |
579 force_xvar_cols = c() | |
580 penalties = rep(1,length(datcols)) | |
581 forced_in = NA | |
582 | |
583 logxform = "$logxform_cols" | |
584 if (logxform != "None") { | |
585 logxform_cols = as.numeric(strsplit(logxform,",")[[1]]) | |
586 if (length(logxform_cols) > 0) { | |
587 small = 1e-10 | |
588 sset = indat[,logxform_cols] | |
589 zeros = which(sset==0,arr.ind=T) | |
590 nz = nrow(zeros) | |
591 if (nz > 0) { | |
592 message(paste('Log transforming encountered',nz,'zeros - added 1e-10')) | |
593 sset[zeros] = sset[zeros] + small | |
594 lset = log(sset) | |
595 indat[,logxform_cols] = lset | |
596 } | |
597 } | |
598 } | |
599 if (force_xvar_cols_in != "None") | |
600 { | |
601 force_xvar_cols = as.numeric(strsplit(force_xvar_cols_in,",")[[1]]) | |
602 allx = c(xvar_cols,force_xvar_cols) | |
603 xvar_cols = unique(allx) | |
604 xvar_cols = xvar_cols[order(xvar_cols)] | |
605 penalties[force_xvar_cols] = 0 | |
606 } | |
607 penalty = penalties[xvar_cols] | |
608 forcedin = paste(datcols[which(penalties == 0)],collapse=',') | |
609 cox_id_col = NA | |
610 cox_id = NA | |
611 | |
612 message(paste('@@@ Using alpha =',alpha,'for all models')) | |
613 x = indat[,xvar_cols] | |
614 nx = nrow(x) | |
615 cx = ncol(x) | |
616 message(paste('@@@@ Input has',nx,'samples and',cx,'predictors')) | |
617 if (cx > nx) { | |
618 message('@@@ WARNING: Models will have more variables than cases so glmnet will likely return one of many possible solutions! Please DO NOT expect reliable results - glmnet is clever but not magical @@@') | |
619 } | |
620 | |
621 xcolnames = datcols[xvar_cols] | |
622 | |
623 if (file.exists(Out_Dir) == F) dir.create(Out_Dir) | |
624 out_full = F | |
625 out_full_file = NA | |
626 out_pred_file = "" | |
627 out_pred = as.logical("$model.output_pred") | |
628 | |
629 #if $model.fam == "binomial" or $model.fam == "cox": | |
630 cox_id_col = $model.cox_id | |
631 cox_id = indat[,cox_id_col] | |
632 if (out_pred == T) { | |
633 out_pred_file="$output_pred_file" | |
634 rownames(x) = cox_id | |
635 } | |
636 #end if | |
637 #if $model.fam == "cox": | |
638 cox_time = $model.cox_time | |
639 cox_status = $model.cox_status | |
640 out_full = as.logical("$model.output_full") | |
641 if (out_full == T) { out_full_file="$output_full_file" } | |
642 yvar_cols = c(cox_time,cox_status) | |
643 ycolnames = c('time','status') | |
644 istat = as.double(indat[,cox_status]) | |
645 itime = as.double(indat[,cox_time]) | |
646 predict_at = quantile(itime) | |
647 if ("$model.predict_at" > "") | |
648 { | |
649 pa = "$model.predict_at" | |
650 predict_at = as.numeric(strsplit(pa,",")[[1]]) | |
651 } | |
652 y = data.frame(time = itime, status = istat) | |
653 ustat = unique(istat) | |
654 if ((length(ustat) != 2) | (! 1 %in% ustat ) | (! 0 %in% ustat)) | |
655 { | |
656 print.noquote(paste('INPUT ERROR: status must have 0 (censored) or 1 (event) but found',paste(ustat,collapse=',') )) | |
657 quit(save='no',status=1) | |
658 } | |
659 y = as.matrix(y) | |
660 x = as.matrix(x) | |
661 print.noquote(paste('@@@ Cox model will predict yvar=',datcols[cox_status],'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames), | |
662 'forced in=',forcedin)) | |
663 regres_out = dolasso_cox(x=x,y=y,debugOn=F,maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_full=out_full,out_full_file=out_full_file, | |
664 out_pred=out_pred,out_pred_file=out_pred_file,cox_id=cox_id,descr=myTitle,do_standard=do_standard,alpha=alpha,penalty=penalty,predict_at=predict_at,mdsplots=mdsplots) | |
665 #else: | |
666 yvar_cols = "$model.yvar_cols" | |
667 yvar_cols = as.numeric(strsplit(yvar_cols,",")[[1]]) | |
668 ycolnames = datcols[yvar_cols] | |
669 print.noquote(paste('@@@',fam,'model will predict yvar=',paste(ycolnames,collapse=','),'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames), | |
670 'forced in=',forcedin)) | |
671 y = data.matrix(indat[,yvar_cols]) | |
672 print.noquote(paste('Model will use',fam,'link function to predict yvar=',paste(ycolnames,collapse=','),'n preds=',length(xcolnames),'forced in=',forcedin)) | |
673 regres_out = dolasso_generic(predvars=x,depvars=y,debugOn=F, maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_pred_file=out_pred_file, | |
674 descr=myTitle,do_standard=do_standard,defaultFam=fam,alpha=alpha,penalty=penalty,out_pred=out_pred,cox_id=cox_id,mdsplots=mdsplots) | |
675 #end if | |
676 | |
677 write.table(format(regres_out,digits=5),outtab,quote=FALSE, sep="\t",row.names=F) | |
678 print.noquote('@@@ SessionInfo for this R session:') | |
679 sessionInfo() | |
680 warnings() | |
681 | |
682 </configfile> | |
683 </configfiles> | |
684 <inputs> | |
685 <param name="title" type="text" value="lasso test" label="Title for job outputs" help="Typing a short, meaningful text here will help remind you (and explain to others) what the outputs represent"> | |
686 <sanitizer invalid_char=""> | |
687 <valid initial="string.letters,string.digits"><add value="_" /> </valid> | |
688 </sanitizer> | |
689 </param> | |
690 <param name="input1" type="data" format="tabular" label="Select an input tabular text file from your history. Rows represent samples; Columns are measured phenotypes" | |
691 multiple='False' optional="False" help="Tabular text data with samples as rows, phenotypes as columns with a header row of column identifiers" /> | |
692 <param name="xvar_cols" label="Select columns containing numeric variables to use as predictor (x) variables" type="data_column" data_ref="input1" numerical="False" | |
693 multiple="True" use_header_names="True" force_select="True" /> | |
694 <param name="force_xvar_cols" label="Select numeric columns containing variables ALWAYS included as predictors in cross validation" type="data_column" data_ref="input1" numerical="False" | |
695 multiple="True" use_header_names="True" force_select="False"/> | |
696 <conditional name="model"> | |
697 <param name="fam" type="select" label="GLM Link function for models" | |
698 help="Binary dependant variables will automatically be set to Binomial no matter what this is set to"> | |
699 <option value="gaussian" selected="true">Gaussian - continuous dependent (y)</option> | |
700 <option value="binomial">Binomial dependent variables</option> | |
701 <option value="poisson">Poisson (eg counts)</option> | |
702 <option value="cox">Cox models - require special setup for y variables - see below</option> | |
703 </param> | |
704 <when value="gaussian"> | |
705 <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False" | |
706 multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> | |
707 <param name="output_full" type="hidden" value='F' /> | |
708 <param name="output_pred" type="hidden" value='F' /> | |
709 <param name="cox_id" label="Select column containing a unique sample identifier" | |
710 help = "Only really needed for output sample specific predicted values downstream." | |
711 type="data_column" data_ref="input1" numerical="False" force_select="True" | |
712 multiple="False" use_header_names="True" /> | |
713 </when> | |
714 <when value="binomial"> | |
715 <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False" | |
716 multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> | |
717 <param name="output_full" type="hidden" value='F' /> | |
718 <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" > | |
719 <option value="F" selected="true">No predicted value output file</option> | |
720 <option value="T">Create a predicted value output file</option> | |
721 </param> | |
722 <param name="cox_id" label="Select column containing a unique sample identifier" | |
723 help = "Only really needed for output sample specific predicted values downstream." | |
724 type="data_column" data_ref="input1" numerical="False" force_select="True" | |
725 multiple="False" use_header_names="True" /> | |
726 <param name="predict_at" type="hidden" value='' /> | |
727 | |
728 </when> | |
729 <when value="poisson"> | |
730 <param name="yvar_cols" label="Select columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="True" | |
731 multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> | |
732 <param name="output_full" type="hidden" value='F' /> | |
733 <param name="output_pred" type="hidden" value='F' /> | |
734 <param name="predict_at" type="hidden" value='' /> | |
735 <param name="cox_id" label="Select column containing a unique sample identifier" | |
736 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy" | |
737 type="data_column" data_ref="input1" numerical="True" force_select="False" | |
738 multiple="False" use_header_names="True" /> | |
739 </when> | |
740 <when value="cox"> | |
741 <param name="cox_time" label="Select column containing time under observation for Cox regression" | |
742 type="data_column" data_ref="input1" numerical="True" force_select="True" | |
743 multiple="False" use_header_names="True" help = "This MUST contain a time period - eg continuous years or days to failure or right censoring"/> | |
744 <param name="cox_status" label="Select column containing status = 1 for outcome of interest at the end of the time under observation or 0 for right censoring" | |
745 type="data_column" data_ref="input1" numerical="True" force_select="True" | |
746 multiple="False" use_header_names="True" help = "This MUST contain 1 for subjects who had an event at that time or 0 for a right censored observation"/> | |
747 <param name="cox_id" label="Select column containing a unique sample identifier" | |
748 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy" | |
749 type="data_column" data_ref="input1" numerical="False" force_select="False" | |
750 multiple="False" use_header_names="True" /> | |
751 <param name="output_full" type="select" label="Create a tabular output with coefficients for all predictors" > | |
752 <option value="F" selected="true">No full model output file</option> | |
753 <option value="T">Create a full model output file</option> | |
754 </param> | |
755 <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" > | |
756 <option value="F" selected="true">No predicted value output file</option> | |
757 <option value="T">Create a predicted value output file</option> | |
758 </param> | |
759 <param name="predict_at" type="text" value='' label="Provide a comma separated list of times to make a prediction for each subject" | |
760 optional="True" help="Default (blank) will return predictions at 0%,25%,50%,75%,100% of the observed times which should be informative" /> | |
761 | |
762 </when> | |
763 </conditional> | |
764 <param name="optLambda" type="select" label="Value to use when reporting optimal model and coefficients" help="minLambda will have more predictors - 1SDLambda will be more parsimonious"> | |
765 <option value="lambda.1se" selected="true">Lambda + 1 SE of min MSE or AUC (fewer coefficients - more false negatives)</option> | |
766 <option value="lambda.min">Lambda at min MSE or max AUC (more coefficients - more false positives)</option> | |
767 </param> | |
768 <param name="logxform_cols" optional="True" label="Select numeric columns to be log transformed before use as predictors or dependent variables" type="data_column" | |
769 data_ref="input1" numerical="True" multiple="True" use_header_names="True" help = "The wisdom of doing this depends entirely on your predictors - eg can help diminish long-tailed outlier influence" | |
770 force_select="False"/> | |
771 <param name="do_standard" type="select" label="Standardise x vars" | |
772 help="If all measurements on same scale, may not be needed. Coefficients are always returned on the original scale."> | |
773 <option value="False" selected="true">No standardisation of predictors</option>l | |
774 <option value="True">Standardise predictors before model</option> | |
775 </param> | |
776 <param name="mdsplots" type="select" label="Generate MDS plots of samples in measurement space and measurements in sample space" > | |
777 <option value="False" selected="true">No MDS plots</option>l | |
778 <option value="True">Yes create MDS plots</option> | |
779 </param> | |
780 <param name="alpha" type="float" value="0.95" min="0.01" max="1.0" label="Alpha - see glmnet docs. 1 for pure lasso. 0.0 for pure ridge regression" | |
781 help="Default 0.95 allows lasso to cope better with expected predictor collinearity. Use (eg) 0.5 for hybrid regularised regression or (eg) 0.025 for ridge regression"/> | |
782 <param name="nfold" type="integer" value="10" label="Number of folds for internal cross validation" | |
783 help="Default of 10 is usually ok"/> | |
784 </inputs> | |
785 <outputs> | |
786 <data format="html" name="html_file" label="${title}.html"/> | |
787 <data format="tabular" name="model_file" label="${title}_modelres.xls"/> | |
788 <data format="tabular" name="output_full_file" label="${title}_full_cox_model.xls"> | |
789 <filter>model['output_full'] == 'T'</filter> | |
790 </data> | |
791 <data format="tabular" name="output_pred_file" label="${title}_predicted_from_model.xls"> | |
792 <filter>model['output_pred'] == 'T'</filter> | |
793 </data> | |
794 </outputs> | |
795 <tests> | |
796 <test> | |
797 <param name='input1' value='cox_test.xls' ftype='tabular' /> | |
798 <param name='treatment_name' value='case' /> | |
799 <param name='title' value='Cox glmnet test' /> | |
800 <param name='nfold' value='10' /> | |
801 <param name='logxform_cols' value='' /> | |
802 <param name='alpha' value='0.95' /> | |
803 <param name='do_standard' value="True" /> | |
804 <param name='cox_time' value='1' /> | |
805 <param name='cox_status' value='2' /> | |
806 <param name='cox_id' value='1' /> | |
807 <param name='predict_at' value='' /> | |
808 <param name='fam' value='cox' /> | |
809 <param name='yvar_cols' value='' /> | |
810 <param name='xvar_cols' value='3,4,5' /> | |
811 <param name='force_xvar_cols' value='3' /> | |
812 <param name='output_full' value='F' /> | |
813 <param name='output_pred' value='F' /> | |
814 <output name='model_file' file='coxlassotest_modelres.xls'> | |
815 <assert_contents> | |
816 <has_text text="rhubarb" /> | |
817 <has_text text="TRUE" /> | |
818 <!-- 	 is XML escape code for tab --> | |
819 <!-- has_line line="regulator	partial_likelihood	forced_in	glmnet_model	best_lambda" / --> | |
820 <has_line line="regulator	partial_likelihood	forced_in	glmnet_model	best_lambda	lambdaChoice	alpha" /> | |
821 <has_n_columns n="7" /> | |
822 </assert_contents> | |
823 </output> | |
824 <output name='html_file' file='coxlassotest.html' compare='diff' lines_diff='16' /> | |
825 </test> | |
826 </tests> | |
827 <help> | |
828 | |
829 **Before you start** | |
830 | |
831 Please read the glmnet documentation @ glmnet_ | |
832 | |
833 This Galaxy wrapper merely exposes that code and the glmnet_ documentation is essential reading | |
834 before getting useful results here. | |
835 | |
836 **What it does** | |
837 | |
838 From documentation at glmnet_ :: | |
839 | |
840 Glmnet is a package that fits a generalized linear model via penalized maximum likelihood. | |
841 The regularization path is computed for the lasso or elasticnet penalty at a grid of values for the regularization parameter lambda. | |
842 The algorithm is extremely fast, and can exploit sparsity in the input matrix x. | |
843 It fits linear, logistic and multinomial, poisson, and Cox regression models. | |
844 A variety of predictions can be made from the fitted models. | |
845 | |
846 Internal cross validation is used to optimise the choice of lambda based on CV AUC for logistic (binomial outcome) models, or CV mse for gaussian. | |
847 | |
848 **Warning about the tyrany of dimensionality** | |
849 | |
850 Yes, this package will select 'optimal' models even when you (optimistically) supply more predictors than you have cases. | |
851 The model returned is unlikely to represent the only informative regularisation path through your data - if you run repeatedly with | |
852 exactly the same settings, you will probably see many different models being selected. | |
853 This is not a software bug - the real problem is that you just don't have enough information in your data. | |
854 | |
855 Sufficiently big jobs will take a while (eg each lasso regression with 20k features on 1k samples takes about 2-3 minutes on our aged cluster) | |
856 | |
857 **Input** | |
858 | |
859 Assuming you have more measurements than samples, you supply data as a tabular text file where each row is a sample and columns | |
860 are variables. You specify which columns are dependent (predictors) and which are observations for each sample. Each of multiple | |
861 dependent variable columns will be run and reported independently. Predictors can be forced in to the model. | |
862 | |
863 **Output** | |
864 | |
865 For each selected dependent regression variable, a brief report of the model coefficients predicted at the | |
866 'optimal' nfold CV value of lambda. | |
867 | |
868 **Predicted event probabilities for Cox and Logistic models** | |
869 | |
870 If you want to compare (eg) two competing clinical predictions, there's a companion generic NRI tool | |
871 for predicted event probabilities. Estimates dozens of measures of improvement in prediction. Currently only works for identical id subjects | |
872 but can probably be extended to independent sample predictions. | |
873 | |
874 Given a model, we can generate a predicted p (for status 1) in binomial or cox frameworks so models can be evaluated in terms of NRI. | |
875 Of course, estimates are likely substantially inflated over 'real world' performance by being estimated from the same sample - but you probably | |
876 already knew that since you were smart enough to reach this far down into the on screen help. The author salutes you, intrepid reader! | |
877 | |
878 It may seem an odd thing to do, but we can predict p for an event for each subject from our original data, given a parsimonious model. Doing | |
879 this for two separate models (eg, forcing in an additional known explanatory measurement to the new model) allows comparison of the two models | |
880 predicted status for each subject, or the same model in independent populations to see how badly it does | |
881 | |
882 **Attributions** | |
883 | |
884 glmnet_ is the R package exposed by this Galaxy tool. | |
885 | |
886 Galaxy_ (that's what you are using right now!) for gluing everything together | |
887 | |
888 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is | |
889 licensed to you under the LGPL_ like other rgenetics artefacts | |
890 | |
891 .. _LGPL: http://www.gnu.org/copyleft/lesser.html | |
892 .. _glmnet: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html | |
893 .. _Galaxy: http://getgalaxy.org | |
894 </help> | |
895 | |
896 <citations> | |
897 <citation type="bibtex"> | |
898 @Article{Friedman2010, title = {Regularization Paths for Generalized Linear Models via Coordinate Descent}, | |
899 author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani}, | |
900 journal = {Journal of Statistical Software}, | |
901 year = {2010}, | |
902 volume = {33}, | |
903 number = {1}, | |
904 pages = {1--22}, | |
905 url = {http://www.jstatsoft.org/v33/i01/} | |
906 } | |
907 </citation> | |
908 <citation type="doi"> | |
909 10.1093/bioinformatics/bts573 | |
910 </citation> | |
911 </citations> | |
912 </tool> |