comparison rglasso_cox.xml @ 19:8c31e2aac682 draft

Uploaded
author iuc
date Wed, 29 Apr 2015 12:07:11 -0400
parents 0e87f636bdd8
children bb725f6d6d38
comparison
equal deleted inserted replaced
18:23155115ed77 19:8c31e2aac682
1 <tool id="rglasso_cox" name="Lasso" version="0.03">
2 <description>and cox regression using elastic net</description>
3 <requirements>
4 <requirement type="package" version="3.1.1">R_3_1_1</requirement>
5 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
6 <requirement type="package" version="9.10">ghostscript</requirement>
7 <requirement type="package" version="2.14">glmnet_lars_2_14</requirement>
8 </requirements>
9 <command interpreter="python">
10 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rglasso"
11 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
12 </command>
13 <inputs>
14 <param name="title" type="text" value="lasso test" size="80" label="Title for job outputs" help="Typing a short, meaningful text here will help remind you (and explain to others) what the outputs represent">
15 <sanitizer invalid_char="">
16 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
17 </sanitizer>
18 </param>
19 <param name="input1" type="data" format="tabular" label="Select an input tabular text file from your history. Rows represent samples; Columns are measured phenotypes"
20 multiple='False' optional="False" help="Tabular text data with samples as rows, phenotypes as columns with a header row of column identifiers" />
21 <param name="xvar_cols" label="Select columns containing numeric variables to use as predictor (x) variables" type="data_column" data_ref="input1" numerical="False"
22 multiple="True" use_header_names="True" force_select="True" />
23 <param name="force_xvar_cols" label="Select numeric columns containing variables ALWAYS included as predictors in cross validation" type="data_column" data_ref="input1" numerical="False"
24 multiple="True" use_header_names="True" force_select="False"/>
25 <conditional name="model">
26 <param name="fam" type="select" label="GLM Link function for models"
27 help="Binary dependant variables will automatically be set to Binomial no matter what this is set to">
28 <option value="gaussian" selected="true">Gaussian - continuous dependent (y)</option>
29 <option value="binomial">Binomial dependent variables</option>
30 <option value="poisson">Poisson (eg counts)</option>
31 <option value="cox">Cox models - require special setup for y variables - see below</option>
32 </param>
33 <when value="gaussian">
34 <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False"
35 multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
36 <param name="output_full" type="hidden" value='F' />
37 <param name="output_pred" type="hidden" value='F' />
38 <param name="cox_id" label="Select column containing a unique sample identifier"
39 help = "Only really needed for output sample specific predicted values downstream."
40 type="data_column" data_ref="input1" numerical="False" force_select="True"
41 multiple="False" use_header_names="True" />
42 </when>
43 <when value="binomial">
44 <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False"
45 multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
46 <param name="output_full" type="hidden" value='F' />
47 <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" >
48 <option value="F" selected="true">No predicted value output file</option>
49 <option value="T">Create a predicted value output file</option>
50 </param>
51 <param name="cox_id" label="Select column containing a unique sample identifier"
52 help = "Only really needed for output sample specific predicted values downstream."
53 type="data_column" data_ref="input1" numerical="False" force_select="True"
54 multiple="False" use_header_names="True" />
55 <param name="predict_at" type="hidden" value='' />
56
57 </when>
58 <when value="poisson">
59 <param name="yvar_cols" label="Select columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="True"
60 multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/>
61 <param name="output_full" type="hidden" value='F' />
62 <param name="output_pred" type="hidden" value='F' />
63 <param name="predict_at" type="hidden" value='' />
64 <param name="cox_id" label="Select column containing a unique sample identifier"
65 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy"
66 type="data_column" data_ref="input1" numerical="True" force_select="False"
67 multiple="False" use_header_names="True" />
68 </when>
69 <when value="cox">
70 <param name="cox_time" label="Select column containing time under observation for Cox regression"
71 type="data_column" data_ref="input1" numerical="True" force_select="True"
72 multiple="False" use_header_names="True" help = "This MUST contain a time period - eg continuous years or days to failure or right censoring"/>
73 <param name="cox_status" label="Select column containing status = 1 for outcome of interest at the end of the time under observation or 0 for right censoring"
74 type="data_column" data_ref="input1" numerical="True" force_select="True"
75 multiple="False" use_header_names="True" help = "This MUST contain 1 for subjects who had an event at that time or 0 for a right censored observation"/>
76 <param name="cox_id" label="Select column containing a unique sample identifier"
77 help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy"
78 type="data_column" data_ref="input1" numerical="False" force_select="False"
79 multiple="False" use_header_names="True" />
80 <param name="output_full" type="select" label="Create a tabular output with coefficients for all predictors" >
81 <option value="F" selected="true">No full model output file</option>
82 <option value="T">Create a full model output file</option>
83 </param>
84 <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" >
85 <option value="F" selected="true">No predicted value output file</option>
86 <option value="T">Create a predicted value output file</option>
87 </param>
88 <param name="predict_at" type="text" value='' label="Provide a comma separated list of times to make a prediction for each subject"
89 optional="True" help="Default (blank) will return predictions at 0%,25%,50%,75%,100% of the observed times which should be informative" />
90
91 </when>
92 </conditional>
93 <param name="optLambda" type="select" label="Value to use when reporting optimal model and coefficients" help="minLambda will have more predictors - 1SDLambda will be more parsimonious">
94 <option value="lambda.1se" selected="true">Lambda + 1 SE of min MSE or AUC (fewer coefficients - more false negatives)</option>
95 <option value="lambda.min">Lambda at min MSE or max AUC (more coefficients - more false positives)</option>
96 </param>
97 <param name="logxform_cols" optional="True" label="Select numeric columns to be log transformed before use as predictors or dependent variables" type="data_column"
98 data_ref="input1" numerical="True" multiple="True" use_header_names="True" help = "The wisdom of doing this depends entirely on your predictors - eg can help diminish long-tailed outlier influence"
99 force_select="False"/>
100 <param name="do_standard" type="select" label="Standardise x vars"
101 help="If all measurements on same scale, may not be needed. Coefficients are always returned on the original scale.">
102 <option value="False" selected="true">No standardisation of predictors</option>l
103 <option value="True">Standardise predictors before model</option>
104 </param>
105 <param name="mdsplots" type="select" label="Generate MDS plots of samples in measurement space and measurements in sample space" >
106 <option value="False" selected="true">No MDS plots</option>l
107 <option value="True">Yes create MDS plots</option>
108 </param>
109 <param name="alpha" type="float" value="0.95" size="5" min="0.01" max="1.0" label="Alpha - see glmnet docs. 1 for pure lasso. 0.0 for pure ridge regression"
110 help="Default 0.95 allows lasso to cope better with expected predictor collinearity. Use (eg) 0.5 for hybrid regularised regression or (eg) 0.025 for ridge regression"/>
111 <param name="nfold" type="integer" value="10" size="5" label="Number of folds for internal cross validation"
112 help="Default of 10 is usually ok"/>
113 </inputs>
114 <outputs>
115 <data format="html" name="html_file" label="${title}.html"/>
116 <data format="tabular" name="model_file" label="${title}_modelres.xls"/>
117 <data format="tabular" name="output_full_file" label="${title}_full_cox_model.xls">
118 <filter>model['output_full'] == 'T'</filter>
119 </data>
120 <data format="tabular" name="output_pred_file" label="${title}_predicted_from_model.xls">
121 <filter>model['output_pred'] == 'T'</filter>
122 </data>
123 </outputs>
124 <tests>
125 <test>
126 <param name='input1' value='cox_test.xls' ftype='tabular' />
127 <param name='treatment_name' value='case' />
128 <param name='title' value='Cox glmnet test' />
129 <param name='nfold' value='10' />
130 <param name='logxform_cols' value='' />
131 <param name='alpha' value='0.95' />
132 <param name='do_standard' value="True" />
133 <param name='cox_time' value='1' />
134 <param name='cox_status' value='2' />
135 <param name='cox_id' value='1' />
136 <param name='predict_at' value='' />
137 <param name='fam' value='cox' />
138 <param name='yvar_cols' value='' />
139 <param name='xvar_cols' value='3,4,5' />
140 <param name='force_xvar_cols' value='3' />
141 <param name='output_full' value='F' />
142 <param name='output_pred' value='F' />
143 <output name='model_file' file='coxlassotest_modelres.xls'>
144 <assert_contents>
145 <has_text text="rhubarb" />
146 <has_text text="TRUE" />
147 <!-- &#009; is XML escape code for tab -->
148 <!-- has_line line="regulator&#009;partial_likelihood&#009;forced_in&#009;glmnet_model&#009;best_lambda" / -->
149 <has_line line="regulator&#009;partial_likelihood&#009;forced_in&#009;glmnet_model&#009;best_lambda&#009;lambdaChoice&#009;alpha" />
150 <has_n_columns n="7" />
151 </assert_contents>
152 </output>
153 <output name='html_file' file='coxlassotest.html' compare='diff' lines_diff='16' />
154 </test>
155 </tests>
156 <help>
157
158 **Before you start**
159
160 Please read the glmnet documentation @ glmnet_
161
162 This Galaxy wrapper merely exposes that code and the glmnet_ documentation is essential reading
163 before getting useful results here.
164
165 **What it does**
166
167 From documentation at glmnet_ ::
168
169 Glmnet is a package that fits a generalized linear model via penalized maximum likelihood.
170 The regularization path is computed for the lasso or elasticnet penalty at a grid of values for the regularization parameter lambda.
171 The algorithm is extremely fast, and can exploit sparsity in the input matrix x.
172 It fits linear, logistic and multinomial, poisson, and Cox regression models.
173 A variety of predictions can be made from the fitted models.
174
175 Internal cross validation is used to optimise the choice of lambda based on CV AUC for logistic (binomial outcome) models, or CV mse for gaussian.
176
177 **Warning about the tyrany of dimensionality**
178
179 Yes, this package will select 'optimal' models even when you (optimistically) supply more predictors than you have cases.
180 The model returned is unlikely to represent the only informative regularisation path through your data - if you run repeatedly with
181 exactly the same settings, you will probably see many different models being selected.
182 This is not a software bug - the real problem is that you just don't have enough information in your data.
183
184 Sufficiently big jobs will take a while (eg each lasso regression with 20k features on 1k samples takes about 2-3 minutes on our aged cluster)
185
186 **Input**
187
188 Assuming you have more measurements than samples, you supply data as a tabular text file where each row is a sample and columns
189 are variables. You specify which columns are dependent (predictors) and which are observations for each sample. Each of multiple
190 dependent variable columns will be run and reported independently. Predictors can be forced in to the model.
191
192 **Output**
193
194 For each selected dependent regression variable, a brief report of the model coefficients predicted at the
195 'optimal' nfold CV value of lambda.
196
197 **Predicted event probabilities for Cox and Logistic models**
198
199 If you want to compare (eg) two competing clinical predictions, there's a companion generic NRI tool
200 for predicted event probabilities. Estimates dozens of measures of improvement in prediction. Currently only works for identical id subjects
201 but can probably be extended to independent sample predictions.
202
203 Given a model, we can generate a predicted p (for status 1) in binomial or cox frameworks so models can be evaluated in terms of NRI.
204 Of course, estimates are likely substantially inflated over 'real world' performance by being estimated from the same sample - but you probably
205 already knew that since you were smart enough to reach this far down into the on screen help. The author salutes you, intrepid reader!
206
207 It may seem an odd thing to do, but we can predict p for an event for each subject from our original data, given a parsimonious model. Doing
208 this for two separate models (eg, forcing in an additional known explanatory measurement to the new model) allows comparison of the two models
209 predicted status for each subject, or the same model in independent populations to see how badly it does
210
211 **Attributions**
212
213 glmnet_ is the R package exposed by this Galaxy tool.
214
215 Galaxy_ (that's what you are using right now!) for gluing everything together
216
217 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
218 licensed to you under the LGPL_ like other rgenetics artefacts
219
220 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
221 .. _glmnet: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
222 .. _Galaxy: http://getgalaxy.org
223
224
225 </help>
226
227 <configfiles>
228 <configfile name="runme">
229 <![CDATA[
230 library('glmnet')
231 library('lars')
232 library('survival')
233 library('pec')
234
235
236 message=function(x) {print.noquote(paste(x,sep=''))}
237
238
239 ross.cv.glmnet = function (x, y, weights, offset = NULL, lambda = NULL, type.measure = c("mse",
240 "deviance", "class", "auc", "mae"), nfolds = 10, foldid,
241 grouped = TRUE, keep = FALSE, parallel = FALSE, ...)
242 {
243 if (missing(type.measure))
244 type.measure = "default"
245 else type.measure = match.arg(type.measure)
246 if (!is.null(lambda) && length(lambda) < 2)
247 stop("Need more than one value of lambda for cv.glmnet")
248 N = nrow(x)
249 if (missing(weights))
250 weights = rep(1, N)
251 else weights = as.double(weights)
252 y = drop(y)
253 glmnet.call = match.call(expand.dots = TRUE)
254 sel = match(c("type.measure", "nfolds", "foldid", "grouped",
255 "keep"), names(glmnet.call), F)
256 if (any(sel))
257 glmnet.call = glmnet.call[-sel]
258 glmnet.call[[1]] = as.name("glmnet")
259 glmnet.object = glmnet(x, y, weights = weights, offset = offset,
260 lambda = lambda, ...)
261 glmnet.object\$call = glmnet.call
262 is.offset = glmnet.object\$offset
263 lambda = glmnet.object\$lambda
264 if (inherits(glmnet.object, "multnet")) {
265 nz = predict(glmnet.object, type = "nonzero")
266 nz = sapply(nz, function(x) sapply(x, length))
267 nz = ceiling(apply(nz, 1, median))
268 }
269 else nz = sapply(predict(glmnet.object, type = "nonzero"),
270 length)
271 if (missing(foldid))
272 foldid = sample(rep(seq(nfolds), length = N))
273 else nfolds = max(foldid)
274 if (nfolds < 3)
275 stop("nfolds must be bigger than 3; nfolds=10 recommended")
276 outlist = as.list(seq(nfolds))
277 if (parallel && require(foreach)) {
278 outlist = foreach(i = seq(nfolds), .packages = c("glmnet")) %dopar%
279 {
280 sel = foldid == i
281 if (is.matrix(y))
282 y_sub = y[!sel, ]
283 else y_sub = y[!sel]
284 if (is.offset)
285 offset_sub = as.matrix(offset)[!sel, ]
286 else offset_sub = NULL
287 glmnet(x[!sel, , drop = FALSE], y_sub, lambda = lambda,
288 offset = offset_sub, weights = weights[!sel],
289 ...)
290 }
291 }
292 else {
293 for (i in seq(nfolds)) {
294 sel = foldid == i
295 if (is.matrix(y))
296 y_sub = y[!sel, ]
297 else y_sub = y[!sel]
298 if (is.offset)
299 offset_sub = as.matrix(offset)[!sel, ]
300 else offset_sub = NULL
301 outlist[[i]] = glmnet(x[!sel, , drop = FALSE],
302 y_sub, lambda = lambda, offset = offset_sub,
303 weights = weights[!sel], ...)
304 }
305 }
306 fun = paste("cv", class(glmnet.object)[[1]], sep = ".")
307 cvstuff = do.call(fun, list(outlist, lambda, x, y, weights,
308 offset, foldid, type.measure, grouped, keep))
309 cvm = cvstuff\$cvm
310 cvsd = cvstuff\$cvsd
311 cvname = cvstuff\$name
312
313 out = list(lambda = lambda, cvm = cvm, cvsd = cvsd, cvup = cvm +
314 cvsd, cvlo = cvm - cvsd, nzero = nz, name = cvname, glmnet.fit = glmnet.object)
315 if (keep)
316 out = c(out, list(fit.preval = cvstuff\$fit.preval, foldid = foldid))
317
318 lamin = if (type.measure == "auc")
319 getmin(lambda, -cvm, cvsd)
320 else getmin(lambda, cvm, cvsd)
321 out = c(out, as.list(lamin))
322 hitsse = rep(0,ncol(x))
323 hitsmin = rep(0,ncol(x))
324 names(hitsse) = colnames(x)
325 names(hitsmin) = colnames(x)
326 olmin = lamin\$lambda.min
327 ol1sd = lamin\$lambda.1se
328 lambs = c(olmin,ol1sd)
329 names(lambs) = c('olmin','ol1sd')
330 for (cvfit in outlist) {
331 colmin = which(cvfit\$lambda == olmin)
332 col1se = which(cvfit\$lambda == ol1sd)
333 nzmin = which(cvfit\$beta[,colmin] != 0)
334 nz1se = which(cvfit\$beta[,col1se] != 0)
335 hitsse[nz1se] = hitsse[nz1se] + 1
336 hitsmin[nzmin] = hitsmin[nzmin] + 1
337 }
338 obj = c(out,list(cvhits.1se=hitsse,cvhits.min=hitsmin))
339 class(obj) = "cv.glmnet"
340 obj
341 }
342
343 mdsPlot = function(dm,myTitle,groups=NA,outpdfname,transpose=T)
344 {
345
346 samples = colnames(dm)
347 mt = myTitle
348 pcols=c('maroon')
349 if (! is.na(groups))
350 {
351 gu = unique(groups)
352 colours = rainbow(length(gu),start=0.1,end=0.9)
353 pcols = colours[match(groups,gu)]
354 }
355 mydata = dm
356 if (transpose==T)
357 {
358 mydata = t(dm)
359 }
360 npred = ncol(mydata)
361 d = dist(mydata)
362 fit = cmdscale(d,eig=TRUE, k=min(10,npred-2))
363 xmds = fit\$points[,1]
364 ymds = fit\$points[,2]
365 pdf(outpdfname)
366 plot(xmds, ymds, xlab="Dimension 1", ylab="Dimension 2",
367 main=paste(mt,"MDS Plot"),type="n", col=pcols, cex=0.35)
368 text(xmds, ymds, labels = row.names(mydata), cex=0.35, col=pcols)
369 grid(col="lightgray",lty="dotted")
370 dev.off()
371 }
372
373
374 getpredp_logistic = function(x,yvec,yvarname,id)
375 {
376 yvals = unique(yvec)
377 if (length(yvals) != 2) {
378 message(c('ERROR: y does not have 2 values =',paste(yvals,collapse=',')))
379 return(NA)
380 }
381 cols = colnames(x)
382 if (length(cols) == 0) {
383 message('ERROR: No columns in input x? Cannot predict!')
384 return(NA)
385 }
386 cn = paste(cols, collapse = ' + ')
387
388 formstring=paste("y ~",cn)
389 form = as.formula(formstring)
390 ok = complete.cases(x)
391
392 if (sum(ok) < length(ok)) {
393 x = x[ok,]
394 yvec = yvec[ok]
395 id = id[ok]
396 }
397 nx = data.frame(id=id,x,y=yvec)
398 print('nx,yvec:')
399 print(head(nx,n=3))
400 print(yvec)
401 mdl = glm(form, data=nx, family="binomial", na.action=na.omit)
402 message(c('Model format =',formstring))
403 message(paste('Predictive model details used to generate logistic outcome probabilities for',yvarname,':'))
404 print(summary(md1))
405 print(anova(md1))
406 predp = predict(md1,nx,type="response")
407 p1 = data.frame(id=id,pred_response=predp,obs_response=yvec)
408 return(p1)
409 }
410
411
412
413
414 getpredp_cox = function(x,time,status,id,predict_at)
415 {
416 cols = colnames(x)
417 if (length(cols) == 0) {
418 message('ERROR: No columns in input x? Cannot predict!')
419 return(NA)
420 }
421 cn = paste(colnames(x), collapse = ' + ')
422
423 formstring=paste("Surv(time, status) ~",cn)
424
425 form = as.formula(formstring)
426
427 ok = complete.cases(x)
428
429 if (sum(ok) < length(ok)) {
430 x = x[ok,]
431 time = time[ok]
432 status = status[ok]
433 id = id[ok]
434 }
435 nx = data.frame(x,time=time,status=status)
436 m1 = coxph(form, data=nx,singular.ok=TRUE)
437 print.noquote('Predictive model details used to generate survival probabilities:')
438 print.noquote(m1)
439 predpq = predictSurvProb(object=m1, newdata=nx, times=predict_at)
440 predpq = 1-predpq
441 colnames(predpq) = paste('p_surv_to',predict_at,sep='_')
442 p1 = data.frame(id=id,predpq,time=time,status=status)
443 return(p1)
444 }
445
446
447
448 dolasso_cox = function(x,y,debugOn=F,maxsteps=10000,nfold=10,xcolnames,ycolnames,optLambda='lambda.1se',out_full=F,out_full_file=NA,
449 out_pred=F,out_pred_file=NA,cox_id=NA, descr='Cox test',do_standard=F,alpha=0.9,penalty,predict_at,mdsplots=F)
450 {
451 logf = file("cox_rglasso.log", open = "a")
452 sink(logf,type = c("output", "message"))
453 res = NULL
454 if (mdsplots==T) {
455 outpdfname = 'cox_x_in_sample_space_MDS.pdf'
456 p = try({ mdsPlot(x,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T)
457 if (class(p) == "try-error")
458 {
459 print.noquote(paste('Unable to produce predictors in sample space mds plot',p))
460 }
461 outpdfname = 'cox_samples_in_x_space_MDS.pdf'
462 p = try({mdsPlot(x,'samples in measurement space',groups=y,outpdfname=outpdfname,transpose=F) },T)
463 if (class(p) == "try-error")
464 {
465 print.noquote(paste('Unable to produce samples in measurement space mds plots',p))
466 }
467 }
468 if (is.na(predict_at)) { predict_at = quantile(y) }
469 message(paste('@@@ Cox model will be predicted at times =',paste(predict_at,collapse=',')))
470 do_standard = do_standard
471 standardize = do_standard
472 normalize = do_standard
473 p = try({larsres = glmnet(x,y,family='cox',standardize=standardize,alpha=alpha,penalty.factor=penalty )},T)
474 if (class(p) == "try-error")
475 {
476 print.noquote('Unable to run cox glmnet on your data')
477 print.noquote(p)
478 sink()
479 return(NA)
480 }
481 if (out_full == T)
482 {
483 b = as.matrix(larsres\$beta)
484 nb = length(colnames(b))
485 bcoef = b[,nb]
486 lastl = larsres\$lambda[length(larsres\$lambda)]
487 allres = data.frame(x=rownames(b),beta=bcoef,lambda=lastl)
488 write.table(format(allres,digits=5),out_full_file,quote=FALSE, sep="\t",row.names=F)
489 }
490
491 outpdf = paste('cox',descr,'glmnetdev.pdf',sep='_')
492 try(
493 {
494 pdf(outpdf)
495 plot(larsres,main='cox glmnet',label=T)
496 grid()
497 dev.off()
498 },T)
499
500 larscv = NA
501
502 p = try({larscv=ross.cv.glmnet(x,y,family=fam,type.measure='deviance',penalty=penalty)},T)
503 if (class(p) == "try-error") {
504 print.noquote(paste('Unable to cross validate your data',p))
505 sink()
506 return(NA)
507 }
508 lse = larscv\$cvhits.1se
509 lmin = larscv\$cvhits.min
510 tot = lse + lmin
511 allhits = data.frame(hits_lambda_1se = lse,hits_lambda_min = lmin)
512 nzhits = allhits[which(tot != 0),]
513 message('Times each predictor was selected in CV models (excluding zero count predictors)')
514 print.noquote(nzhits)
515 out_nz_file = 'cox_cross_validation_model_counts.xls'
516 write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F)
517
518 outpdf = paste('cox',descr,'glmnet_cvdeviance.pdf',sep='_')
519
520 p = try(
521 {
522 pdf(outpdf)
523 plot(larscv,main='Deviance',label=T)
524 grid()
525 dev.off()
526 },T)
527 if (optLambda == 'lambda.min') {
528 best_lambda = larscv\$lambda.min
529 bestcoef = as.matrix(coef(larscv, s = "lambda.min"))
530 } else {
531 best_lambda = larscv\$lambda.1se
532 bestcoef = as.matrix(coef(larscv, s = "lambda.1se"))
533 }
534 inmodel = which(bestcoef != 0)
535 coefs = bestcoef[inmodel]
536 preds = rownames(bestcoef)[inmodel]
537
538 names(coefs) = preds
539 pen = as.logical( ! penalty[inmodel])
540 if (out_pred==T)
541 {
542 if (length(inmodel) > 0 ) {
543 predcols = inmodel
544 xmat = as.matrix(x[,predcols])
545 colnames(xmat) = preds
546 bestpred = getpredp_cox(x=xmat,time=y[,'time'],status=y[,'status'],id=cox_id, predict_at=predict_at)
547 pred = data.frame(responsep=bestpred, best_lambda=best_lambda,lamchoice=optLambda,alpha=alpha)
548 write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F)
549 } else { print.noquote('WARNING: No coefficients in selected model to predict with - no predictions made') }
550 }
551 if (debugOn) {
552 print.noquote(paste('best_lambda=',best_lambda,'saving cox respreds=',paste(names(coefs),collapse=','),'as predictors of survival. Coefs=',paste(coefs,collapse=',')))
553 }
554 p = try({res = data.frame(regulator=names(coefs),partial_likelihood=coefs,forced_in=pen,glmnet_model='cox',best_lambda=best_lambda,
555 lambdaChoice=optLambda,alpha=alpha)},T)
556 if (class(p) == "try-error") {
557 message(paste('@@@ unable to return a dataframe',p))
558 sink()
559 return(NA)
560 }
561 print.noquote('@@@ Results preview:')
562 print.noquote(res,digits=5)
563 sink()
564 return(res)
565
566 }
567
568
569 do_lasso = function(x=NA,y=NA,do_standard=T,debugOn=T,defaultFam="gaussian",optLambda='minLambda',descr='description', indx=1,target='target',sane=F,
570 alpha=0.9,nfold=10,penalty=c(),out_pred=F,out_pred_file='outpred',yvarname='yvar',id=c(),mdsplots=F)
571 {
572 logf = file(paste(target,"rglasso.log",sep='_'), open = "a")
573 sink(logf,type = c("output", "message"))
574 res = NA
575 phe_is_bin = (length(unique(y)) == 2)
576 forcedin = paste(colnames(x)[which(penalty == 0)],collapse=',')
577 fam = "gaussian"
578 if (defaultFam %in% c("poisson","binomial","gaussian","multinomial")) fam=defaultFam
579 if (phe_is_bin == T) {
580 fam = "binomial"
581 }
582 print.noquote(paste('target=',target,'is binary=',phe_is_bin,'dim(x)=',paste(dim(x),collapse=','),'length(y)=',length(y),'force=',forcedin,'fam=',fam))
583 standardize = do_standard
584 p = try({larsres = glmnet(x,y,family=fam,standardize=standardize,maxit=10000,alpha=alpha,penalty.factor=penalty) },T)
585 if (class(p) == "try-error")
586 {
587 print(paste('ERROR: unable to run glmnet for target',target,'error=',p))
588 sink()
589 return(NA)
590 }
591
592 mt = paste('Glmnet fraction deviance for',target)
593 outpdf = paste(target,'glmnetPath.pdf',sep='_')
594 pdf(outpdf)
595 plot(larsres,main=mt,label=T)
596 grid()
597 dev.off()
598
599 outpdf = paste(target,'glmnetDeviance.pdf',sep='_')
600
601 mt2 = paste('Glmnet lambda for',target)
602
603 pdf(outpdf)
604 plot(larsres,xvar="lambda",main=mt2,label=T)
605 grid()
606 dev.off()
607
608 larscv = NA
609 if (fam=="binomial") {
610 tmain = paste(target,'AUC')
611 outpdf = paste(target,'glmnetCV_AUC.pdf',sep='_')
612 p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='auc')},T)
613 } else {
614 tmain = paste(target,'CV MSE')
615 outpdf = paste(target,'glmnetCV_MSE.pdf',sep='_')
616 p = try({larscv = ross.cv.glmnet(x=x,y=y,family=fam,type.measure='mse')},T)
617 }
618 if (class(p) == "try-error")
619 {
620 print(paste('ERROR: unable to run cross validation for target',target,'error=',p))
621 sink()
622 return(NA)
623 }
624
625
626 pdf(outpdf)
627 plot(larscv,main=tmain)
628 grid()
629 dev.off()
630
631 lse = larscv\$cvhits.1se
632 lmin = larscv\$cvhits.min
633 tot = lse + lmin
634 allhits = data.frame(pred=colnames(x),hits_lambda_1se = lse,hits_lambda_min = lmin)
635 nzhits = allhits[which(tot != 0),]
636 message('Total hit count for each predictor over all CV models (excluding zero count predictors)')
637 print.noquote(nzhits)
638 out_nz_file = paste(target,'cross_validation_model_counts.xls',sep='_')
639 write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F)
640
641 ipenalty = c(0,penalty)
642 if (optLambda == 'lambda.min') {
643 best_lambda = larscv\$lambda.min
644 bestpred = as.matrix(coef(larscv, s = "lambda.min"))
645 } else {
646 best_lambda = larscv\$lambda.1se
647 bestpred = as.matrix(coef(larscv, s = "lambda.1se"))
648 }
649 inmodel = which(bestpred != 0)
650 coefs = bestpred[inmodel,1]
651 preds = rownames(bestpred)[inmodel]
652 iforced = ipenalty[inmodel]
653 forced = ! as.logical(iforced)
654 names(coefs) = preds
655 ncoef = length(coefs) - 1
656 if (out_pred==T && fam=="binomial")
657 {
658 print.noquote(paste('Predicting',target,'probabilities from binomial glmnet at alpha',alpha,'and lambda',best_lambda))
659 bestpred = predict.glmnet(larsres,s=best_lambda,newx=x,type="response")
660 bestpred = exp(bestpred)/(1+exp(bestpred))
661 pred = data.frame(id=id,y=y,predp=as.vector(bestpred), best_lambda=best_lambda)
662 write.table(pred,out_pred_file,quote=FALSE, sep="\t",row.names=F)
663 }
664 if (debugOn) {cat(indx,'best_lambda=',best_lambda,'saving',fam,'respreds=',names(coefs),'as predictors of',target,'coefs=',coefs,'\n')}
665 res = try(data.frame(i=indx,pred=target,regulator=names(coefs),coef=coefs,forced_in=forced,glmnet_model=fam,ncoef=ncoef,
666 best_lambda=best_lambda,lambdaChoice=optLambda,alpha=alpha),T)
667 if (class(res) == "try-error") {
668 sink()
669 return(NA) }
670 print.noquote(res)
671 sink()
672 return(res)
673 }
674
675
676 dolasso_generic = function(predvars=NA,depvars=NA,debugOn=T,maxsteps=100, alpha=0.9,nfold=10,xcolnames=c(),ycolnames=c(),optLambda='minLambda', out_pred_file=NA,
677 descr="describe me",do_standard=F,defaultFam="gaussian",penalty=c(),out_pred=F,cox_id=c(),mdsplots=F,xfilt=0.95)
678 {
679 logf = file("rglasso.log", open = "a")
680 sink(logf,type = c("output", "message"))
681 xdat = predvars
682 xm = data.matrix(xdat)
683 res = NULL
684 id = cox_id
685 depnames = ycolnames
686 ndep = length(depnames)
687 if (mdsplots==T) {
688 outpdfname = 'rglasso_x_in_sample_space_MDS.pdf'
689 p = try({ mdsPlot(xm,'measurements in sample space',groups=NA,outpdfname=outpdfname,transpose=T) },T)
690 if (class(p) == "try-error")
691 {
692 print.noquote(paste('Unable to produce predictors in sample space mds plot. Error:',p))
693 }
694 outpdfname = 'rglasso_samples_in_x_space_MDS.pdf'
695 p = try({mdsPlot(xm,'samples in measurement space',groups=NA,outpdfname=outpdfname,transpose=F) },T)
696 if (class(p) == "try-error")
697 {
698 print.noquote(paste('Unable to produce samples in measurement space mds plot. Error:',p))
699 }
700 }
701 ndat = nrow(xm)
702 cfracs = colSums(! is.na(xm))/ndat
703 keepme = (cfracs >= xfilt)
704 print.noquote(paste('Removing', sum(! keepme), 'xvars with more than',xfilt,'fraction missing'))
705 vars = apply(xm,2,var,na.rm=T)
706 xm = xm[,keepme]
707 for (i in c(1:max(1,ndep))) {
708 target = depnames[i]
709 if (length(target) < 1) { target='y' }
710 if (i %% 100 == 0) { cat(i,target,'\n') }
711 if (ndep <= 1) {
712 y=depvars
713 } else {
714 y = depvars[,i]
715 }
716 if (fam == "binomial") {y = as.factor(y)}
717 x = xm
718 id = cox_id
719 if (fam != "cox") {
720 ok = complete.cases(x,y)
721 if (sum(! ok) > 0) {
722 message(paste('@@@ Removing',sum(! ok),'cases with missing y of',length(y),'@@@'))
723 y = y[(ok)]
724 x = x[(ok),]
725 id = id[(ok)]
726 }
727 }
728 ok = complete.cases(y)
729 if (sum(ok) == 0 ) {
730 print(paste("No complete cases found for",target,"in input x dim =",paste(dim(xm),collapse=','),"length y=",length(y)))
731 } else {
732 if (i == 1) { outpred = out_pred_file
733 } else {
734 outpred = paste(target,'predicted_output.xls')
735 }
736 regres = do_lasso(x=x,y=y,do_standard=do_standard,debugOn=debugOn,defaultFam=defaultFam,optLambda=optLambda,out_pred_file=outpred,
737 descr=descr,indx=i,target=target,alpha=alpha,nfold=nfold,penalty=penalty,out_pred=out_pred,yvarname=target,id=id,mdsplots=mdsplots)
738 if (! is.na(regres)) { res = rbind(res,regres) }
739 }
740 }
741 print.noquote('@@@ Results preview:')
742 print.noquote(res,digits=5)
743 sink()
744 return(res)
745 }
746
747
748 corPlot=function(xdat=c(),main='main title',is_raw=T)
749 {
750 library(pheatmap)
751 library(gplots)
752 if (is_raw) {
753 cxdat = cor(xdat,method="spearman",use="pairwise.complete.obs")
754 } else {
755 cxdat=xdat
756 }
757 xro = nrow(cxdat)
758 if (xro > 1000) stop("Too many rows for heatmap, who can read?!")
759 fontsize_col = 5.0
760 pheatmap(cxdat, main=main, show_colnames = F, width=30, height=30,
761 fontsize_row=fontsize_col, border_color=NA)
762 }
763
764
765
766
767 runTest = function(n=10)
768 {
769 set.seed (NULL)
770 Y = data.frame(y1=runif (n),y2=runif(n))
771 Xv <- runif(n*n)
772 X <- matrix(Xv, nrow = n, ncol = n)
773
774 mydf <- data.frame(Y, X)
775
776 regres_out = dolasso_generic(predvars=X,depvars=Y,debugOn=T,p.cutoff = 0.05,maxsteps=10000,nfold=10,
777 descr='randomdata',do_standard=do_standard,defaultFam="gaussian",alpha=0.05)
778 return(regres_out)
779 }
780 ]]>
781 options(width=512)
782 options(digits=5)
783 alpha = $alpha
784 nfold = $nfold
785 optLambda = "$optLambda"
786 Out_Dir = "$html_file.files_path"
787 Input = "$input1"
788 indat = read.table(Input,head=T,sep='\t')
789 datcols = colnames(indat)
790 myTitle = "$title"
791 outtab = "$model_file"
792 do_standard = as.logical("$do_standard")
793 mdsplots = as.logical("$mdsplots")
794 fam = "$model.fam"
795 xvar_cols_in = "$xvar_cols"
796 force_xvar_cols_in = "$force_xvar_cols"
797 xvar_cols = as.numeric(strsplit(xvar_cols_in,",")[[1]])
798 force_xvar_cols = c()
799 penalties = rep(1,length(datcols))
800 forced_in = NA
801
802 logxform = "$logxform_cols"
803 if (logxform != "None") {
804 logxform_cols = as.numeric(strsplit(logxform,",")[[1]])
805 if (length(logxform_cols) > 0) {
806 small = 1e-10
807 sset = indat[,logxform_cols]
808 zeros = which(sset==0,arr.ind=T)
809 nz = nrow(zeros)
810 if (nz &gt; 0) {
811 message(paste('Log transforming encountered',nz,'zeros - added 1e-10'))
812 sset[zeros] = sset[zeros] + small
813 lset = log(sset)
814 indat[,logxform_cols] = lset
815 }
816 }
817 }
818 if (force_xvar_cols_in != "None")
819 {
820 force_xvar_cols = as.numeric(strsplit(force_xvar_cols_in,",")[[1]])
821 allx = c(xvar_cols,force_xvar_cols)
822 xvar_cols = unique(allx)
823 xvar_cols = xvar_cols[order(xvar_cols)]
824 penalties[force_xvar_cols] = 0
825 }
826 penalty = penalties[xvar_cols]
827 forcedin = paste(datcols[which(penalties == 0)],collapse=',')
828 cox_id_col = NA
829 cox_id = NA
830
831 message(paste('@@@ Using alpha =',alpha,'for all models'))
832 x = indat[,xvar_cols]
833 nx = nrow(x)
834 cx = ncol(x)
835 message(paste('@@@@ Input has',nx,'samples and',cx,'predictors'))
836 if (cx > nx) {
837 message('@@@ WARNING: Models will have more variables than cases so glmnet will likely return one of many possible solutions! Please DO NOT expect reliable results - glmnet is clever but not magical @@@')
838 }
839
840 xcolnames = datcols[xvar_cols]
841
842 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
843 out_full = F
844 out_full_file = NA
845 out_pred_file = ""
846 out_pred = as.logical("$model.output_pred")
847
848 #if $model.fam == "binomial" or $model.fam == "cox":
849 cox_id_col = $model.cox_id
850 cox_id = indat[,cox_id_col]
851 if (out_pred == T) {
852 out_pred_file="$output_pred_file"
853 rownames(x) = cox_id
854 }
855 #end if
856 #if $model.fam == "cox":
857 cox_time = $model.cox_time
858 cox_status = $model.cox_status
859 out_full = as.logical("$model.output_full")
860 if (out_full == T) { out_full_file="$output_full_file" }
861 yvar_cols = c(cox_time,cox_status)
862 ycolnames = c('time','status')
863 istat = as.double(indat[,cox_status])
864 itime = as.double(indat[,cox_time])
865 predict_at = quantile(itime)
866 if ("$model.predict_at" &gt; "")
867 {
868 pa = "$model.predict_at"
869 predict_at = as.numeric(strsplit(pa,",")[[1]])
870 }
871 y = data.frame(time = itime, status = istat)
872 ustat = unique(istat)
873 if ((length(ustat) != 2) | (! 1 %in% ustat ) | (! 0 %in% ustat))
874 {
875 print.noquote(paste('INPUT ERROR: status must have 0 (censored) or 1 (event) but found',paste(ustat,collapse=',') ))
876 quit(save='no',status=1)
877 }
878 y = as.matrix(y)
879 x = as.matrix(x)
880 print.noquote(paste('@@@ Cox model will predict yvar=',datcols[cox_status],'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames),
881 'forced in=',forcedin))
882 regres_out = dolasso_cox(x=x,y=y,debugOn=F,maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_full=out_full,out_full_file=out_full_file,
883 out_pred=out_pred,out_pred_file=out_pred_file,cox_id=cox_id,descr=myTitle,do_standard=do_standard,alpha=alpha,penalty=penalty,predict_at=predict_at,mdsplots=mdsplots)
884 #else:
885 yvar_cols = "$model.yvar_cols"
886 yvar_cols = as.numeric(strsplit(yvar_cols,",")[[1]])
887 ycolnames = datcols[yvar_cols]
888 print.noquote(paste('@@@',fam,'model will predict yvar=',paste(ycolnames,collapse=','),'using cols=',paste(xcolnames,collapse=','),'n preds=',length(xcolnames),
889 'forced in=',forcedin))
890 y = data.matrix(indat[,yvar_cols])
891 print.noquote(paste('Model will use',fam,'link function to predict yvar=',paste(ycolnames,collapse=','),'n preds=',length(xcolnames),'forced in=',forcedin))
892 regres_out = dolasso_generic(predvars=x,depvars=y,debugOn=F, maxsteps=10000,nfold=nfold,xcolnames=xcolnames,ycolnames=ycolnames,optLambda=optLambda,out_pred_file=out_pred_file,
893 descr=myTitle,do_standard=do_standard,defaultFam=fam,alpha=alpha,penalty=penalty,out_pred=out_pred,cox_id=cox_id,mdsplots=mdsplots)
894 #end if
895
896 write.table(format(regres_out,digits=5),outtab,quote=FALSE, sep="\t",row.names=F)
897 print.noquote('@@@ SessionInfo for this R session:')
898 sessionInfo()
899 warnings()
900
901 </configfile>
902 </configfiles>
903 <citations>
904 <citation type="bibtex">
905 @Article{Friedman2010, title = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
906 author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani},
907 journal = {Journal of Statistical Software},
908 year = {2010},
909 volume = {33},
910 number = {1},
911 pages = {1--22},
912 url = {http://www.jstatsoft.org/v33/i01/}
913 }
914 </citation>
915 <citation type="doi">
916 10.1093/bioinformatics/bts573
917 </citation>
918 </citations>
919 </tool>
920
921
922