Mercurial > repos > fubar > rglasso_1_9_8
changeset 20:bb725f6d6d38 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/rglasso commit 344140b8df53b8b7024618bb04594607a045c03a
author | iuc |
---|---|
date | Mon, 04 May 2015 22:47:29 -0400 (2015-05-05) |
parents | 8c31e2aac682 |
children | |
files | rg_nri.xml rglasso_cox.xml |
diffstat | 2 files changed, 494 insertions(+), 506 deletions(-) [+] |
line wrap: on
line diff
--- a/rg_nri.xml Wed Apr 29 12:07:11 2015 -0400 +++ b/rg_nri.xml Mon May 04 22:47:29 2015 -0400 @@ -7,145 +7,12 @@ <requirement type="package" version="2.14">glmnet_lars_2_14</requirement> </requirements> <command interpreter="python"> - rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rg_NRI" + rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rg_NRI" --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" </command> - <inputs> - <param name="title" type="text" value="NRI test" size="80" label="Plot Title" help="Will appear as the title for the comparison plot"/> - <param name="input1" type="data" format="tabular" label="Select a tabular file from the baseline model with predicted and observed outcome column for each subject" - multiple='False' help="Observed and predicted status columns must be selected from this file below - NOTE both models must be in same order with exact matches in all observed outcomes" optional="False"/> - <param name="input1_observed" label="Select column containing observed outcome (0 for no event, 1 for an event)" type="data_column" data_ref="input1" numerical="True" - multiple="False" use_header_names="True" optional="False" help = "Observed outcomes are compared in the two files to check that the datasets are from the same data"/> - <param name="input1_predicted" label="Select column containing predicted event probabilies from baseline model" type="data_column" data_ref="input1" numerical="True" - multiple="False" use_header_names="True" optional="False" help="Must be in range 0-1"/> - <param name="input1_id" label="Select column containing subject ID from baseline model" type="data_column" data_ref="input1" numerical="True" - multiple="False" use_header_names="True" optional="False" help="Subect IDs are needed to match subjects to compare predictions in the two inputs"/> - <param name="input2" type="data" format="tabular" label="Select a tabular file from the new model with predicted and observed outcome columns for each subject" - multiple='False' help="Observed and predicted status columns must be selected from this file below" /> - <param name="input2_observed" label="Select column containing observed outcome (0 for no event, 1 for an event)" type="data_column" data_ref="input2" numerical="True" - multiple="False" use_header_names="True" optional="False" help = "Observed outcomes are compared in the two files to check that the datasets are from the same data"/> - <param name="input2_predicted" label="Select column containing predicted event probabilities from the new model" type="data_column" data_ref="input2" numerical="True" - multiple="False" use_header_names="True" optional="False" help="Must be in range 0-1"/> - <param name="input2_id" label="Select column containing subject ID from the new model" type="data_column" data_ref="input2" numerical="True" - multiple="False" use_header_names="True" optional="False" help="Subect IDs are needed to match subjects to compare predictions in the two inputs"/> - <conditional name="CImeth"> - <param name="cis" type="select" label="CI calculation method" - help="Bootstrap will take time - a long time for thousands - asymptotic is quick and informative"> - <option value="asymptotic" selected="true">Asymptotic estimate</option> - <option value="boot">Bootstrap for empirical CIs</option> - </param> - <when value="boot"> - <param name="nboot" type="integer" value="1000" label="Number of bootstrap replicates"/> - </when> - <when value="asymptotic"> - <param name="nboot" type="hidden" value="1000"/> - </when> - </conditional> - </inputs> - <outputs> - <data format="html" name="html_file" label="${title}.html"/> - <data format="tabular" name="nri_file" label="${title}_nrires.xls"/> - </outputs> - <tests> - <test> - <param name='title' value='nri_test1' /> - <param name='input1' value='nri_test1.xls' ftype='tabular' /> - <param name='input2' value='nri_test1.xls' ftype='tabular' /> - <param name='input1_id' value="1" /> - <param name='input1_observed' value="2" /> - <param name='input1_predicted' value="3" /> - <param name='input2_observed' value="2" /> - <param name='input2_predicted' value="4" /> - <output name='html_file' file='nri_test1_out.html' compare='diff' lines_diff='10' /> - <output name='nri_file' file='nri_test1_out.xls' /> - </test> -</tests> -<help> - -**Before you start** - -This is a simple tool to calculate various measures of improvement in prediction between two models described in pickering_paper_ -It is based on an R script pickering_code_ written by Dr John W Pickering and Dr David Cairns from sunny Otago University which -has been debugged and slightly adjusted to fit a Galaxy tool wrapper. - - -**What it does** - -Copied from the documentation in pickering_code_ :: - - - Functions to create risk assessment plots and associated summary statistics - - - (c) 2012 Dr John W Pickering, john.pickering@otago.ac.nz, and Dr David Cairns - Last modified August 2014 - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the distribution - - FUNCTIONS - raplot - Produces a Risk Assessment Plot and outputs the coordinates of the four curves - Based on: Pickering, J. W. and Endre, Z. H. (2012). New Metrics for Assessing Diagnostic Potential of - Candidate Biomarkers. Clinical Journal of the American Society of Nephrology, 7, 1355–1364. doi:10.2215/CJN.09590911 - - statistics.raplot - Produces the NRIs, IDIs, IS, IP, AUCs. - Based on: Pencina, M. J., D'Agostino, R. B. and Steyerberg, E. W. (2011). Extensions of net reclassification improvement calculations to - measure usefulness of new biomarkers. Statistics in Medicine, 30(1), 11–21. doi:10.1002/sim.4085 - Pencina, M. J., D'Agostino, R. B. and Vasan, R. S. (2008). Evaluating the added predictive ability of a new marker: From area under the - ROC curve to reclassification and beyond. - Statistics in Medicine, 27(2), 157–172. doi:10.1002/sim.2929 - DeLong, E., DeLong, D. and Clarke-Pearson, D. (1988). Comparing the areas under 2 or more correlated receiver operating characteristic curves - a nonparametric approach. - Biometrics, 44(3), 837–845. - - summary.raplot - Produces the NRIs, IDIs, IS, IP, AUCs with confidence intervals using a bootstrap or asymptotic procedure. (I prefer bootstrap which is chosed by cis=c("boot")) - - - Required arguments for all functions: - x1 is calculated risk (eg from a glm) for the null model, i.e. predict(,type="response") on a glm object - x2 is calculated risk (eg from a glm) for the alternative model - y is the case-control indicator (0 for controls, 1 for cases) - Optional argument - t are the boundaries of the risks for each group (ie 0, 1 and the thresholds beteween. eg c(0,0,3,0,7,1)). If missing, defaults to c(0, the incidence, 1) - - -**Input** - -The observed and predicted outcomes from two models to be compared. - -**Output** - -Lots'o'measures (TM) see pickering_paper_ for details - -**Attributions** - -pickering_paper_ is the paper the caclulations performed by this tool is based on - -pickering_code_ is the R function from John Pickering exposed by this Galaxy tool with minor modifications and hacks by Ross Lazarus. - -Galaxy_ (that's what you are using right now!) for gluing everything together - -Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is -licensed to you under the LGPL_ like other rgenetics artefacts - -.. _LGPL: http://www.gnu.org/copyleft/lesser.html -.. _pickering_code: http://www.researchgate.net/publication/264672640_R_function_for_Risk_Assessment_Plot__reclassification_metrics_NRI_IDI_cfNRI -.. _pickering_paper: http://cjasn.asnjournals.org/content/early/2012/05/24/CJN.09590911.full -.. _Galaxy: http://getgalaxy.org - - -</help> - <configfiles> <configfile name="runme"> - + <![CDATA[ ### http://www.researchgate.net/publication/264672640_R_function_for_Risk_Assessment_Plot__reclassification_metrics_NRI_IDI_cfNRI code @@ -159,13 +26,13 @@ ### ### (c) 2012 Dr John W Pickering, john.pickering@otago.ac.nz, and Dr David Cairns ### Last modified August 2014 -### -### Redistribution and use in source and binary forms, with or without +### +### Redistribution and use in source and binary forms, with or without ### modification, are permitted provided that the following conditions are met: -### * Redistributions of source code must retain the above copyright +### * Redistributions of source code must retain the above copyright ### notice, this list of conditions and the following disclaimer. -### * Redistributions in binary form must reproduce the above copyright -### notice, this list of conditions and the following disclaimer in +### * Redistributions in binary form must reproduce the above copyright +### notice, this list of conditions and the following disclaimer in ### the documentation and/or other materials provided with the distribution ### FUNCTIONS @@ -174,7 +41,7 @@ ### Based on: Pickering, J. W. and Endre, Z. H. (2012). New Metrics for Assessing Diagnostic Potential of ### Candidate Biomarkers. Clinical Journal of the American Society of Nephrology, 7, 1355–1364. doi:10.2215/CJN.09590911 ### -### statistics.raplot +### statistics.raplot ### Produces the NRIs, IDIs, IS, IP, AUCs. ### Based on: Pencina, M. J., D'Agostino, R. B. and Steyerberg, E. W. (2011). ### Extensions of net reclassification improvement calculations to measure usefulness of new biomarkers. Statistics in Medicine, 30(1), 11–21. doi:10.1002/sim.4085 @@ -183,7 +50,7 @@ ### DeLong, E., DeLong, D. and Clarke-Pearson, D. (1988). Comparing the areas under 2 or more correlated receiver operating characteristic curves - a nonparametric approach. ### Biometrics, 44(3), 837–845. ### -### summary.raplot +### summary.raplot ### Produces the NRIs, IDIs, IS, IP, AUCs with confidence intervals using a bootstrap or asymptotic procedure. (I prefer bootstrap which is chosed by cis=c("boot")) @@ -229,13 +96,13 @@ ### actual plotting pdf(outplot) - plot(thresh.model1, sens.model1, xlim = c(0, 1), ylim = c(0, 1), type = "n", + plot(thresh.model1, sens.model1, xlim = c(0, 1), ylim = c(0, 1), type = "n", lty = 2, lwd = 2, xlab = "Risk of Event", ylab = "", col = "black", main=title) grid() - polygon(x = c(thresh.model1, thresh.model2[n.model2:1]), + polygon(x = c(thresh.model1, thresh.model2[n.model2:1]), y = c(sens.model1, sens.model2[n.model2:1]), border = NA, col = gray(0.8)) - polygon(x = c(thresh.model1, thresh.model2[n.model2:1]), + polygon(x = c(thresh.model1, thresh.model2[n.model2:1]), y = c(spec.model1, spec.model2[n.model2:1]), border = NA, col = gray(0.8)) lines(thresh.model1, sens.model1, type = "l", lty = 2, lwd = 2, col = "black") @@ -246,9 +113,9 @@ text(x = -0.15, y = 0.4, labels = "Sensitivity, ", col = "black", xpd = TRUE, srt = 90) text(x = -0.15, y = 0.4 + 0.175, labels = "1-Specificity", col = "red", xpd = TRUE, srt = 90) - legend("topleft", c("Event: New model", "Event: Baseline model", - "No Event: New model", "No Event: Baseline model"), - col = c("black", "black", "red", "red"), + legend("topleft", c("Event: New model", "Event: Baseline model", + "No Event: New model", "No Event: Baseline model"), + col = c("black", "black", "red", "red"), lty = c(1,2, 1, 2), lwd = 2, bty = "n") dev.off() return(data.frame("Null.p.sens"=thresh.model1, @@ -267,8 +134,8 @@ ### statistics from a raplot (is an adaptation of improveProb() from Hmisc) statistics.raplot = function(x1, x2, y, threshvec) -{ - +{ + s = is.na(x1 + x2 + y) ###Remove rows with missing data if (any(s)) { smiss = sum(s) @@ -302,7 +169,7 @@ na = sum(a) nb = sum(b) d = x2 - x1 - ### NRI + ### NRI n.thresh=length(threshvec)-1 risk.class.x1.ev=cut2(x1[a],threshvec) risk.class.x2.ev=cut2(x2[a],threshvec) @@ -317,18 +184,18 @@ pup.ev=0 pdown.ev=0 for (i in 1:(n.thresh-1)) { pup.ev = pup.ev + sum(cM.ev\$table[(i+1):n.thresh,i])} - for (i in 2:n.thresh) { pdown.ev = pdown.ev + sum(cM.ev\$table[1:(i-1),i])} - pup.ev=pup.ev/na + for (i in 2:n.thresh) { pdown.ev = pdown.ev + sum(cM.ev\$table[1:(i-1),i])} + pup.ev=pup.ev/na pdown.ev=pdown.ev/na risk.class.x1.ne=cut2(x1[b],threshvec) - risk.class.x2.ne=cut2(x2[b],threshvec) + risk.class.x2.ne=cut2(x2[b],threshvec) levels(risk.class.x1.ne)=thresh levels(risk.class.x2.ne)=thresh cM.ne=confusionMatrix(risk.class.x2.ne,risk.class.x1.ne) pup.ne=0 pdown.ne=0 for (i in 1:(n.thresh-1)){pup.ne=pup.ev+sum(cM.ne\$table[(i+1):n.thresh,i])} - for (i in 2:n.thresh){pdown.ne=pdown.ne+sum(cM.ne\$table[1:(i-1),i])} + for (i in 2:n.thresh){pdown.ne=pdown.ne+sum(cM.ne\$table[1:(i-1),i])} pdown.ne=pdown.ne/nb pup.ne=pup.ne/nb nri = pup.ev - pdown.ev - (pup.ne - pdown.ne) @@ -339,7 +206,7 @@ z.nri.ev = nri.ev/se.nri.ev nri.ne = pdown.ne - pup.ne se.nri.ne = sqrt((pdown.ne + pup.ne)/nb) - z.nri.ne = nri.ne/se.nri.ne + z.nri.ne = nri.ne/se.nri.ne ### Category Free NRI calculations cfpup.ev = mean(d[a] > 0) cfpup.ne = mean(d[b] > 0) @@ -400,31 +267,31 @@ ### Output output = c(n, na, nb, pup.ev, pup.ne, pdown.ev, pdown.ne, nri, se.nri, z.nri, - nri.ev, se.nri.ev, z.nri.ev, nri.ne, se.nri.ne, z.nri.ne, + nri.ev, se.nri.ev, z.nri.ev, nri.ne, se.nri.ne, z.nri.ne, cfpup.ev, cfpup.ne, cfpdown.ev, cfpdown.ne, cfnri, se.cfnri, z.cfnri, - cfnri.ev, se.cfnri.ev, z.cfnri.ev, cfnri.ne, se.cfnri.ne, z.cfnri.ne, - improveSens, improveSpec, idi.ev, se.idi.ev, z.idi.ev, idi.ne, - se.idi.ne, z.idi.ne, idi, se.idi, z.idi, is.x1, NA, is.x2, NA, - ip.x1, NA, ip.x2, NA, auc.x1, se.auc.x1, auc.x2, se.auc.x2, + cfnri.ev, se.cfnri.ev, z.cfnri.ev, cfnri.ne, se.cfnri.ne, z.cfnri.ne, + improveSens, improveSpec, idi.ev, se.idi.ev, z.idi.ev, idi.ne, + se.idi.ne, z.idi.ne, idi, se.idi, z.idi, is.x1, NA, is.x2, NA, + ip.x1, NA, ip.x2, NA, auc.x1, se.auc.x1, auc.x2, se.auc.x2, roc.test.x1.x2\$p.value,incidence) - names(output) = c("n", "na", "nb", "pup.ev", "pup.ne", "pdown.ev", "pdown.ne", + names(output) = c("n", "na", "nb", "pup.ev", "pup.ne", "pdown.ev", "pdown.ne", "nri", "se.nri", "z.nri", "nri.ev", "se.nri.ev", "z.nri.ev", "nri.ne", "se.nri.ne", "z.nri.ne", - "cfpup.ev", "cfpup.ne", "cfpdown.ev", "cfpdown.ne", + "cfpup.ev", "cfpup.ne", "cfpdown.ev", "cfpdown.ne", "cfnri", "se.cfnri", "z.cfnri", "cfnri.ev", "se.cfnri.ev", "z.cfnri.ev", "cfnri.ne", "se.cfnri.ne", "z.cfnri.ne", "improveSens", "improveSpec", - "idi.ev", "se.idi.ev", "z.idi.ev", "idi.ne", "se.idi.ne", - "z.idi.ne", "idi", "se.idi", "z.idi", "is.x1", "se.is.x1", - "is.x2", "se.is.x2", "ip.x1", "se.ip.x1", "ip.x2", "se.ip.x2", - "auc.x1", "se.auc.x1", "auc.x2", "se.auc.x2", + "idi.ev", "se.idi.ev", "z.idi.ev", "idi.ne", "se.idi.ne", + "z.idi.ne", "idi", "se.idi", "z.idi", "is.x1", "se.is.x1", + "is.x2", "se.is.x2", "ip.x1", "se.ip.x1", "ip.x2", "se.ip.x2", + "auc.x1", "se.auc.x1", "auc.x2", "se.auc.x2", "roc.test.x1.x2.pvalue","incidence") resdf = data.frame(N=n, Na=na, Nb=nb, pup.ev=pup.ev, pup.ne=pup.ne, pdown.ev=pdown.ev, pdown.ne=pdown.ne, NRI=nri, NRI.se=se.nri, NRI.z=z.nri, - NRI.ev=nri.ev, NRI.ev.se=se.nri.ev, NRI.ev.z=z.nri.ev, NRI.ne=nri.ne, NRI.ne.se=se.nri.ne, NRI.ne.z=z.nri.ne, + NRI.ev=nri.ev, NRI.ev.se=se.nri.ev, NRI.ev.z=z.nri.ev, NRI.ne=nri.ne, NRI.ne.se=se.nri.ne, NRI.ne.z=z.nri.ne, cfpup.ev=cfpup.ev, cfpup.ne=cfpup.ne, cfpdown.ev=cfpdown.ev, cfpdown.ne=cfpdown.ne, CFNRI=cfnri, CFNRI.se=se.cfnri, CFNRI.z=z.cfnri, - CFNRI.ev=cfnri.ev, CFNRI.ev.se=se.cfnri.ev, CFNRI.ev.z=z.cfnri.ev, CFNRI.ne=cfnri.ne, CFNRI.ne.se=se.cfnri.ne, CFNRI.ne.z=z.cfnri.ne, - improvSens=improveSens, improvSpec=improveSpec, IDI.ev=idi.ev, IDI.ev.se=se.idi.ev, IDI.ev.z=z.idi.ev, IDI.ne=idi.ne, - IDI.ne.se=se.idi.ne, IDI.ne.z=z.idi.ne, IDI=idi, IDI.se=se.idi, IDI.z=z.idi, isx1=is.x1, isx2=is.x2, - ipxi=ip.x1, ipx2=ip.x2, AUC.x1=auc.x1, AUC.x1.se=se.auc.x1, AUC.x2=auc.x2, AUC.x2.se=se.auc.x2, + CFNRI.ev=cfnri.ev, CFNRI.ev.se=se.cfnri.ev, CFNRI.ev.z=z.cfnri.ev, CFNRI.ne=cfnri.ne, CFNRI.ne.se=se.cfnri.ne, CFNRI.ne.z=z.cfnri.ne, + improvSens=improveSens, improvSpec=improveSpec, IDI.ev=idi.ev, IDI.ev.se=se.idi.ev, IDI.ev.z=z.idi.ev, IDI.ne=idi.ne, + IDI.ne.se=se.idi.ne, IDI.ne.z=z.idi.ne, IDI=idi, IDI.se=se.idi, IDI.z=z.idi, isx1=is.x1, isx2=is.x2, + ipxi=ip.x1, ipx2=ip.x2, AUC.x1=auc.x1, AUC.x1.se=se.auc.x1, AUC.x2=auc.x2, AUC.x2.se=se.auc.x2, roctestpval=roc.test.x1.x2\$p.value,incidence=incidence) tr = t(resdf) tresdf = data.frame(measure=colnames(resdf),value=tr[,1]) @@ -453,7 +320,7 @@ boot.index = sample(length(y), replace = TRUE) risk.model1.boot = x1[boot.index] risk.model2.boot = x2[boot.index] - cc.status.boot = y[boot.index] + cc.status.boot = y[boot.index] r = statistics.raplot(x1 = risk.model1.boot, x2 = risk.model2.boot, y = cc.status.boot) results.boot[i, ] = r\$output } @@ -478,87 +345,87 @@ results.matrix[2, ] = c("Events (n)", results["na"]) results.matrix[3, ] = c("Non-events (n)", results["nb"]) results.matrix[4, ] = c("Category free NRI and summary statistics","-------------------------") - results.matrix[5, ] = c("cfNRI events (%)", - paste(round(100*results["cfnri.ev"], dp-2), " (", + results.matrix[5, ] = c("cfNRI events (%)", + paste(round(100*results["cfnri.ev"], dp-2), " (", round(100*results["cfnri.ev"] - z * 100*results["se.cfnri.ev"], dp-2), - " to ", round(100*results["cfnri.ev"] + + " to ", round(100*results["cfnri.ev"] + z * 100*results["se.cfnri.ev"], dp-2), ")", sep = "")) - results.matrix[6, ] = c("cfNRI non-events (%)", + results.matrix[6, ] = c("cfNRI non-events (%)", paste(round(100*results["cfnri.ne"], dp-2), " (", round(100*results["cfnri.ne"] - z * 100*results["se.cfnri.ne"], dp)-2, - " to ", round(100*results["cfnri.ne"] + z * 100*results["se.cfnri.ne"], - dp-2), ")", sep = "")) - results.matrix[7, ] = c("cfNRI (%)", - paste(round(100*results["cfnri"], dp-2), " (", - round(100*results["cfnri"] - z * 100*results["se.cfnri"], dp-2), - " to ", round(100*results["cfnri"] + z * 100*results["se.cfnri"], + " to ", round(100*results["cfnri.ne"] + z * 100*results["se.cfnri.ne"], + dp-2), ")", sep = "")) + results.matrix[7, ] = c("cfNRI (%)", + paste(round(100*results["cfnri"], dp-2), " (", + round(100*results["cfnri"] - z * 100*results["se.cfnri"], dp-2), + " to ", round(100*results["cfnri"] + z * 100*results["se.cfnri"], dp-2), ")", sep = "")) results.matrix[8, ] = c("NRI and summary statistics","-------------------------") - results.matrix[9, ] = c("NRI events (%)", - paste(round(100*results["nri.ev"], dp-2), " (", + results.matrix[9, ] = c("NRI events (%)", + paste(round(100*results["nri.ev"], dp-2), " (", round(100*results["nri.ev"] - z * 100*results["se.nri.ev"], dp-2), - " to ", round(100*results["nri.ev"] + + " to ", round(100*results["nri.ev"] + z * 100*results["se.nri.ev"], dp-2), ")", sep = "")) - results.matrix[10, ] = c("NRI non-events (%)", + results.matrix[10, ] = c("NRI non-events (%)", paste(round(100*results["nri.ne"], dp-2), " (", round(100*results["nri.ne"] - z * 100*results["se.nri.ne"], dp-2), - " to ", round(100*results["nri.ne"] + z * 100*results["se.nri.ne"], - dp-2), ")", sep = "")) - results.matrix[11, ] = c("NRI (%)", - paste(round(100*results["nri"], dp-2), " (", - round(100*results["nri"] - z * 100*results["se.nri"], dp-2), - " to ", round(100*results["nri"] + z * 100*results["se.nri"], + " to ", round(100*results["nri.ne"] + z * 100*results["se.nri.ne"], + dp-2), ")", sep = "")) + results.matrix[11, ] = c("NRI (%)", + paste(round(100*results["nri"], dp-2), " (", + round(100*results["nri"] - z * 100*results["se.nri"], dp-2), + " to ", round(100*results["nri"] + z * 100*results["se.nri"], dp-2), ")", sep = "")) results.matrix[12, ] = c("IDI and summary statistics","-------------------------") - results.matrix[13, ] = c("IDI events", - paste(round(results["idi.ev"], dp), " (", - round(results["idi.ev"] - z * results["se.idi.ev"], dp), - " to ", round(results["idi.ev"] + z * results["se.idi.ev"], + results.matrix[13, ] = c("IDI events", + paste(round(results["idi.ev"], dp), " (", + round(results["idi.ev"] - z * results["se.idi.ev"], dp), + " to ", round(results["idi.ev"] + z * results["se.idi.ev"], dp), ")", sep = "")) - results.matrix[14, ] = c("IDI non-events", - paste(round(results["idi.ne"], dp), " (", - round(results["idi.ne"] - z * results["se.idi.ne"], dp), - " to ", round(results["idi.ne"] + z * results["se.idi.ne"], + results.matrix[14, ] = c("IDI non-events", + paste(round(results["idi.ne"], dp), " (", + round(results["idi.ne"] - z * results["se.idi.ne"], dp), + " to ", round(results["idi.ne"] + z * results["se.idi.ne"], dp), ")", sep = "")) - results.matrix[15, ] = c("IDI", - paste(round(results["idi"], dp), " (", - round(results["idi"] - z * results["se.idi"], dp), - " to ", round(results["idi"] + z * results["se.idi"], + results.matrix[15, ] = c("IDI", + paste(round(results["idi"], dp), " (", + round(results["idi"] - z * results["se.idi"], dp), + " to ", round(results["idi"] + z * results["se.idi"], dp), ")", sep = "")) - results.matrix[16, ] = c("IS (null model)", - paste(round(results["is.x1"], dp), " (", - round(results["is.x1"] - z * results["se.is.x1"], dp), - " to ", round(results["is.x1"] + z * results["se.is.x1"], + results.matrix[16, ] = c("IS (null model)", + paste(round(results["is.x1"], dp), " (", + round(results["is.x1"] - z * results["se.is.x1"], dp), + " to ", round(results["is.x1"] + z * results["se.is.x1"], dp), ")", sep = "")) - results.matrix[17, ] = c("IS (alt model)", - paste(round(results["is.x2"], dp), " (", - round(results["is.x2"] - z * results["se.is.x2"], dp), - " to ", round(results["is.x2"] + z * results["se.is.x2"], + results.matrix[17, ] = c("IS (alt model)", + paste(round(results["is.x2"], dp), " (", + round(results["is.x2"] - z * results["se.is.x2"], dp), + " to ", round(results["is.x2"] + z * results["se.is.x2"], dp), ")", sep = "")) - results.matrix[18, ] = c("IP (null model)", - paste(round(results["ip.x1"], dp), " (", - round(results["ip.x1"] - z * results["se.ip.x1"], dp), - " to ", round(results["ip.x1"] + z * results["se.ip.x1"], + results.matrix[18, ] = c("IP (null model)", + paste(round(results["ip.x1"], dp), " (", + round(results["ip.x1"] - z * results["se.ip.x1"], dp), + " to ", round(results["ip.x1"] + z * results["se.ip.x1"], dp), ")", sep = "")) - results.matrix[19, ] = c("IP (alt model)", - paste(round(results["ip.x2"], dp), " (", - round(results["ip.x2"] - z * results["se.ip.x2"], dp), - " to ", round(results["ip.x2"] + z * results["se.ip.x2"], + results.matrix[19, ] = c("IP (alt model)", + paste(round(results["ip.x2"], dp), " (", + round(results["ip.x2"] - z * results["se.ip.x2"], dp), + " to ", round(results["ip.x2"] + z * results["se.ip.x2"], dp), ")", sep = "")) results.matrix[20, ] = c("AUC","-------------------------") - results.matrix[21, ] = c("AUC (null model)", - paste(round(results["auc.x1"], dp), " (", - round(results["auc.x1"] - z * results["se.auc.x1"], dp), - " to ", round(results["auc.x1"] + z * results["se.auc.x1"], + results.matrix[21, ] = c("AUC (null model)", + paste(round(results["auc.x1"], dp), " (", + round(results["auc.x1"] - z * results["se.auc.x1"], dp), + " to ", round(results["auc.x1"] + z * results["se.auc.x1"], dp), ")", sep = "")) - results.matrix[22, ] = c("AUC (alt model)", - paste(round(results["auc.x2"], dp), " (", - round(results["auc.x2"] - z * results["se.auc.x2"], dp), - " to ", round(results["auc.x2"] + z * results["se.auc.x2"], + results.matrix[22, ] = c("AUC (alt model)", + paste(round(results["auc.x2"], dp), " (", + round(results["auc.x2"] - z * results["se.auc.x2"], dp), + " to ", round(results["auc.x2"] + z * results["se.auc.x2"], dp), ")", sep = "")) results.matrix[23, ] = c("difference (P)", round(results["roc.test.x1.x2.pvalue"], dp)) results.matrix[24, ] = c("Incidence", round(results["incidence"], dp)) - + return(results.matrix) } @@ -624,6 +491,139 @@ sink() </configfile> </configfiles> + <inputs> + <param name="title" type="text" value="NRI test" size="80" label="Plot Title" help="Will appear as the title for the comparison plot"/> + <param name="input1" type="data" format="tabular" label="Select a tabular file from the baseline model with predicted and observed outcome column for each subject" + multiple='False' help="Observed and predicted status columns must be selected from this file below - NOTE both models must be in same order with exact matches in all observed outcomes" optional="False"/> + <param name="input1_observed" label="Select column containing observed outcome (0 for no event, 1 for an event)" type="data_column" data_ref="input1" numerical="True" + multiple="False" use_header_names="True" optional="False" help = "Observed outcomes are compared in the two files to check that the datasets are from the same data"/> + <param name="input1_predicted" label="Select column containing predicted event probabilies from baseline model" type="data_column" data_ref="input1" numerical="True" + multiple="False" use_header_names="True" optional="False" help="Must be in range 0-1"/> + <param name="input1_id" label="Select column containing subject ID from baseline model" type="data_column" data_ref="input1" numerical="True" + multiple="False" use_header_names="True" optional="False" help="Subect IDs are needed to match subjects to compare predictions in the two inputs"/> + <param name="input2" type="data" format="tabular" label="Select a tabular file from the new model with predicted and observed outcome columns for each subject" + multiple='False' help="Observed and predicted status columns must be selected from this file below" /> + <param name="input2_observed" label="Select column containing observed outcome (0 for no event, 1 for an event)" type="data_column" data_ref="input2" numerical="True" + multiple="False" use_header_names="True" optional="False" help = "Observed outcomes are compared in the two files to check that the datasets are from the same data"/> + <param name="input2_predicted" label="Select column containing predicted event probabilities from the new model" type="data_column" data_ref="input2" numerical="True" + multiple="False" use_header_names="True" optional="False" help="Must be in range 0-1"/> + <param name="input2_id" label="Select column containing subject ID from the new model" type="data_column" data_ref="input2" numerical="True" + multiple="False" use_header_names="True" optional="False" help="Subect IDs are needed to match subjects to compare predictions in the two inputs"/> + <conditional name="CImeth"> + <param name="cis" type="select" label="CI calculation method" + help="Bootstrap will take time - a long time for thousands - asymptotic is quick and informative"> + <option value="asymptotic" selected="true">Asymptotic estimate</option> + <option value="boot">Bootstrap for empirical CIs</option> + </param> + <when value="boot"> + <param name="nboot" type="integer" value="1000" label="Number of bootstrap replicates"/> + </when> + <when value="asymptotic"> + <param name="nboot" type="hidden" value="1000"/> + </when> + </conditional> + </inputs> + <outputs> + <data format="html" name="html_file" label="${title}.html"/> + <data format="tabular" name="nri_file" label="${title}_nrires.xls"/> + </outputs> + <tests> + <test> + <param name='title' value='nri_test1' /> + <param name='input1' value='nri_test1.xls' ftype='tabular' /> + <param name='input2' value='nri_test1.xls' ftype='tabular' /> + <param name='input1_id' value="1" /> + <param name='input1_observed' value="2" /> + <param name='input1_predicted' value="3" /> + <param name='input2_observed' value="2" /> + <param name='input2_predicted' value="4" /> + <output name='html_file' file='nri_test1_out.html' compare='diff' lines_diff='10' /> + <output name='nri_file' file='nri_test1_out.xls' /> + </test> +</tests> +<help> + +**Before you start** + +This is a simple tool to calculate various measures of improvement in prediction between two models described in pickering_paper_ +It is based on an R script pickering_code_ written by Dr John W Pickering and Dr David Cairns from sunny Otago University which +has been debugged and slightly adjusted to fit a Galaxy tool wrapper. + + +**What it does** + +Copied from the documentation in pickering_code_ :: + + + Functions to create risk assessment plots and associated summary statistics + + + (c) 2012 Dr John W Pickering, john.pickering@otago.ac.nz, and Dr David Cairns + Last modified August 2014 + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the distribution + + FUNCTIONS + raplot + Produces a Risk Assessment Plot and outputs the coordinates of the four curves + Based on: Pickering, J. W. and Endre, Z. H. (2012). New Metrics for Assessing Diagnostic Potential of + Candidate Biomarkers. Clinical Journal of the American Society of Nephrology, 7, 1355–1364. doi:10.2215/CJN.09590911 + + statistics.raplot + Produces the NRIs, IDIs, IS, IP, AUCs. + Based on: Pencina, M. J., D'Agostino, R. B. and Steyerberg, E. W. (2011). Extensions of net reclassification improvement calculations to + measure usefulness of new biomarkers. Statistics in Medicine, 30(1), 11–21. doi:10.1002/sim.4085 + Pencina, M. J., D'Agostino, R. B. and Vasan, R. S. (2008). Evaluating the added predictive ability of a new marker: From area under the + ROC curve to reclassification and beyond. + Statistics in Medicine, 27(2), 157–172. doi:10.1002/sim.2929 + DeLong, E., DeLong, D. and Clarke-Pearson, D. (1988). Comparing the areas under 2 or more correlated receiver operating characteristic curves - a nonparametric approach. + Biometrics, 44(3), 837–845. + + summary.raplot + Produces the NRIs, IDIs, IS, IP, AUCs with confidence intervals using a bootstrap or asymptotic procedure. (I prefer bootstrap which is chosed by cis=c("boot")) + + + Required arguments for all functions: + x1 is calculated risk (eg from a glm) for the null model, i.e. predict(,type="response") on a glm object + x2 is calculated risk (eg from a glm) for the alternative model + y is the case-control indicator (0 for controls, 1 for cases) + Optional argument + t are the boundaries of the risks for each group (ie 0, 1 and the thresholds beteween. eg c(0,0,3,0,7,1)). If missing, defaults to c(0, the incidence, 1) + + +**Input** + +The observed and predicted outcomes from two models to be compared. + +**Output** + +Lots'o'measures (TM) see pickering_paper_ for details + +**Attributions** + +pickering_paper_ is the paper the caclulations performed by this tool is based on + +pickering_code_ is the R function from John Pickering exposed by this Galaxy tool with minor modifications and hacks by Ross Lazarus. + +Galaxy_ (that's what you are using right now!) for gluing everything together + +Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is +licensed to you under the LGPL_ like other rgenetics artefacts + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html +.. _pickering_code: http://www.researchgate.net/publication/264672640_R_function_for_Risk_Assessment_Plot__reclassification_metrics_NRI_IDI_cfNRI +.. _pickering_paper: http://cjasn.asnjournals.org/content/early/2012/05/24/CJN.09590911.full +.. _Galaxy: http://getgalaxy.org + + +</help> + <citations> <citation type="doi">doi: 10.2215/CJN.09590911</citation> </citations>
--- a/rglasso_cox.xml Wed Apr 29 12:07:11 2015 -0400 +++ b/rglasso_cox.xml Mon May 04 22:47:29 2015 -0400 @@ -7,223 +7,9 @@ <requirement type="package" version="2.14">glmnet_lars_2_14</requirement> </requirements> <command interpreter="python"> - rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rglasso" + rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "rglasso" --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" </command> - <inputs> - <param name="title" type="text" value="lasso test" size="80" label="Title for job outputs" help="Typing a short, meaningful text here will help remind you (and explain to others) what the outputs represent"> - <sanitizer invalid_char=""> - <valid initial="string.letters,string.digits"><add value="_" /> </valid> - </sanitizer> - </param> - <param name="input1" type="data" format="tabular" label="Select an input tabular text file from your history. Rows represent samples; Columns are measured phenotypes" - multiple='False' optional="False" help="Tabular text data with samples as rows, phenotypes as columns with a header row of column identifiers" /> - <param name="xvar_cols" label="Select columns containing numeric variables to use as predictor (x) variables" type="data_column" data_ref="input1" numerical="False" - multiple="True" use_header_names="True" force_select="True" /> - <param name="force_xvar_cols" label="Select numeric columns containing variables ALWAYS included as predictors in cross validation" type="data_column" data_ref="input1" numerical="False" - multiple="True" use_header_names="True" force_select="False"/> - <conditional name="model"> - <param name="fam" type="select" label="GLM Link function for models" - help="Binary dependant variables will automatically be set to Binomial no matter what this is set to"> - <option value="gaussian" selected="true">Gaussian - continuous dependent (y)</option> - <option value="binomial">Binomial dependent variables</option> - <option value="poisson">Poisson (eg counts)</option> - <option value="cox">Cox models - require special setup for y variables - see below</option> - </param> - <when value="gaussian"> - <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False" - multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> - <param name="output_full" type="hidden" value='F' /> - <param name="output_pred" type="hidden" value='F' /> - <param name="cox_id" label="Select column containing a unique sample identifier" - help = "Only really needed for output sample specific predicted values downstream." - type="data_column" data_ref="input1" numerical="False" force_select="True" - multiple="False" use_header_names="True" /> - </when> - <when value="binomial"> - <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False" - multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> - <param name="output_full" type="hidden" value='F' /> - <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" > - <option value="F" selected="true">No predicted value output file</option> - <option value="T">Create a predicted value output file</option> - </param> - <param name="cox_id" label="Select column containing a unique sample identifier" - help = "Only really needed for output sample specific predicted values downstream." - type="data_column" data_ref="input1" numerical="False" force_select="True" - multiple="False" use_header_names="True" /> - <param name="predict_at" type="hidden" value='' /> - - </when> - <when value="poisson"> - <param name="yvar_cols" label="Select columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="True" - multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> - <param name="output_full" type="hidden" value='F' /> - <param name="output_pred" type="hidden" value='F' /> - <param name="predict_at" type="hidden" value='' /> - <param name="cox_id" label="Select column containing a unique sample identifier" - help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy" - type="data_column" data_ref="input1" numerical="True" force_select="False" - multiple="False" use_header_names="True" /> - </when> - <when value="cox"> - <param name="cox_time" label="Select column containing time under observation for Cox regression" - type="data_column" data_ref="input1" numerical="True" force_select="True" - multiple="False" use_header_names="True" help = "This MUST contain a time period - eg continuous years or days to failure or right censoring"/> - <param name="cox_status" label="Select column containing status = 1 for outcome of interest at the end of the time under observation or 0 for right censoring" - type="data_column" data_ref="input1" numerical="True" force_select="True" - multiple="False" use_header_names="True" help = "This MUST contain 1 for subjects who had an event at that time or 0 for a right censored observation"/> - <param name="cox_id" label="Select column containing a unique sample identifier" - help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy" - type="data_column" data_ref="input1" numerical="False" force_select="False" - multiple="False" use_header_names="True" /> - <param name="output_full" type="select" label="Create a tabular output with coefficients for all predictors" > - <option value="F" selected="true">No full model output file</option> - <option value="T">Create a full model output file</option> - </param> - <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" > - <option value="F" selected="true">No predicted value output file</option> - <option value="T">Create a predicted value output file</option> - </param> - <param name="predict_at" type="text" value='' label="Provide a comma separated list of times to make a prediction for each subject" - optional="True" help="Default (blank) will return predictions at 0%,25%,50%,75%,100% of the observed times which should be informative" /> - - </when> - </conditional> - <param name="optLambda" type="select" label="Value to use when reporting optimal model and coefficients" help="minLambda will have more predictors - 1SDLambda will be more parsimonious"> - <option value="lambda.1se" selected="true">Lambda + 1 SE of min MSE or AUC (fewer coefficients - more false negatives)</option> - <option value="lambda.min">Lambda at min MSE or max AUC (more coefficients - more false positives)</option> - </param> - <param name="logxform_cols" optional="True" label="Select numeric columns to be log transformed before use as predictors or dependent variables" type="data_column" - data_ref="input1" numerical="True" multiple="True" use_header_names="True" help = "The wisdom of doing this depends entirely on your predictors - eg can help diminish long-tailed outlier influence" - force_select="False"/> - <param name="do_standard" type="select" label="Standardise x vars" - help="If all measurements on same scale, may not be needed. Coefficients are always returned on the original scale."> - <option value="False" selected="true">No standardisation of predictors</option>l - <option value="True">Standardise predictors before model</option> - </param> - <param name="mdsplots" type="select" label="Generate MDS plots of samples in measurement space and measurements in sample space" > - <option value="False" selected="true">No MDS plots</option>l - <option value="True">Yes create MDS plots</option> - </param> - <param name="alpha" type="float" value="0.95" size="5" min="0.01" max="1.0" label="Alpha - see glmnet docs. 1 for pure lasso. 0.0 for pure ridge regression" - help="Default 0.95 allows lasso to cope better with expected predictor collinearity. Use (eg) 0.5 for hybrid regularised regression or (eg) 0.025 for ridge regression"/> - <param name="nfold" type="integer" value="10" size="5" label="Number of folds for internal cross validation" - help="Default of 10 is usually ok"/> - </inputs> - <outputs> - <data format="html" name="html_file" label="${title}.html"/> - <data format="tabular" name="model_file" label="${title}_modelres.xls"/> - <data format="tabular" name="output_full_file" label="${title}_full_cox_model.xls"> - <filter>model['output_full'] == 'T'</filter> - </data> - <data format="tabular" name="output_pred_file" label="${title}_predicted_from_model.xls"> - <filter>model['output_pred'] == 'T'</filter> - </data> - </outputs> - <tests> - <test> - <param name='input1' value='cox_test.xls' ftype='tabular' /> - <param name='treatment_name' value='case' /> - <param name='title' value='Cox glmnet test' /> - <param name='nfold' value='10' /> - <param name='logxform_cols' value='' /> - <param name='alpha' value='0.95' /> - <param name='do_standard' value="True" /> - <param name='cox_time' value='1' /> - <param name='cox_status' value='2' /> - <param name='cox_id' value='1' /> - <param name='predict_at' value='' /> - <param name='fam' value='cox' /> - <param name='yvar_cols' value='' /> - <param name='xvar_cols' value='3,4,5' /> - <param name='force_xvar_cols' value='3' /> - <param name='output_full' value='F' /> - <param name='output_pred' value='F' /> - <output name='model_file' file='coxlassotest_modelres.xls'> - <assert_contents> - <has_text text="rhubarb" /> - <has_text text="TRUE" /> - <!-- 	 is XML escape code for tab --> - <!-- has_line line="regulator	partial_likelihood	forced_in	glmnet_model	best_lambda" / --> - <has_line line="regulator	partial_likelihood	forced_in	glmnet_model	best_lambda	lambdaChoice	alpha" /> - <has_n_columns n="7" /> - </assert_contents> - </output> - <output name='html_file' file='coxlassotest.html' compare='diff' lines_diff='16' /> - </test> -</tests> -<help> - -**Before you start** - -Please read the glmnet documentation @ glmnet_ - -This Galaxy wrapper merely exposes that code and the glmnet_ documentation is essential reading -before getting useful results here. - -**What it does** - -From documentation at glmnet_ :: - - Glmnet is a package that fits a generalized linear model via penalized maximum likelihood. - The regularization path is computed for the lasso or elasticnet penalty at a grid of values for the regularization parameter lambda. - The algorithm is extremely fast, and can exploit sparsity in the input matrix x. - It fits linear, logistic and multinomial, poisson, and Cox regression models. - A variety of predictions can be made from the fitted models. - -Internal cross validation is used to optimise the choice of lambda based on CV AUC for logistic (binomial outcome) models, or CV mse for gaussian. - -**Warning about the tyrany of dimensionality** - -Yes, this package will select 'optimal' models even when you (optimistically) supply more predictors than you have cases. -The model returned is unlikely to represent the only informative regularisation path through your data - if you run repeatedly with -exactly the same settings, you will probably see many different models being selected. -This is not a software bug - the real problem is that you just don't have enough information in your data. - -Sufficiently big jobs will take a while (eg each lasso regression with 20k features on 1k samples takes about 2-3 minutes on our aged cluster) - -**Input** - -Assuming you have more measurements than samples, you supply data as a tabular text file where each row is a sample and columns -are variables. You specify which columns are dependent (predictors) and which are observations for each sample. Each of multiple -dependent variable columns will be run and reported independently. Predictors can be forced in to the model. - -**Output** - -For each selected dependent regression variable, a brief report of the model coefficients predicted at the -'optimal' nfold CV value of lambda. - -**Predicted event probabilities for Cox and Logistic models** - -If you want to compare (eg) two competing clinical predictions, there's a companion generic NRI tool -for predicted event probabilities. Estimates dozens of measures of improvement in prediction. Currently only works for identical id subjects -but can probably be extended to independent sample predictions. - -Given a model, we can generate a predicted p (for status 1) in binomial or cox frameworks so models can be evaluated in terms of NRI. -Of course, estimates are likely substantially inflated over 'real world' performance by being estimated from the same sample - but you probably -already knew that since you were smart enough to reach this far down into the on screen help. The author salutes you, intrepid reader! - -It may seem an odd thing to do, but we can predict p for an event for each subject from our original data, given a parsimonious model. Doing -this for two separate models (eg, forcing in an additional known explanatory measurement to the new model) allows comparison of the two models -predicted status for each subject, or the same model in independent populations to see how badly it does - -**Attributions** - -glmnet_ is the R package exposed by this Galaxy tool. - -Galaxy_ (that's what you are using right now!) for gluing everything together - -Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is -licensed to you under the LGPL_ like other rgenetics artefacts - -.. _LGPL: http://www.gnu.org/copyleft/lesser.html -.. _glmnet: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html -.. _Galaxy: http://getgalaxy.org - - -</help> - <configfiles> <configfile name="runme"> <![CDATA[ @@ -236,27 +22,27 @@ message=function(x) {print.noquote(paste(x,sep=''))} -ross.cv.glmnet = function (x, y, weights, offset = NULL, lambda = NULL, type.measure = c("mse", - "deviance", "class", "auc", "mae"), nfolds = 10, foldid, - grouped = TRUE, keep = FALSE, parallel = FALSE, ...) +ross.cv.glmnet = function (x, y, weights, offset = NULL, lambda = NULL, type.measure = c("mse", + "deviance", "class", "auc", "mae"), nfolds = 10, foldid, + grouped = TRUE, keep = FALSE, parallel = FALSE, ...) { - if (missing(type.measure)) + if (missing(type.measure)) type.measure = "default" else type.measure = match.arg(type.measure) - if (!is.null(lambda) && length(lambda) < 2) + if (!is.null(lambda) && length(lambda) < 2) stop("Need more than one value of lambda for cv.glmnet") N = nrow(x) - if (missing(weights)) + if (missing(weights)) weights = rep(1, N) else weights = as.double(weights) y = drop(y) glmnet.call = match.call(expand.dots = TRUE) - sel = match(c("type.measure", "nfolds", "foldid", "grouped", + sel = match(c("type.measure", "nfolds", "foldid", "grouped", "keep"), names(glmnet.call), F) - if (any(sel)) + if (any(sel)) glmnet.call = glmnet.call[-sel] glmnet.call[[1]] = as.name("glmnet") - glmnet.object = glmnet(x, y, weights = weights, offset = offset, + glmnet.object = glmnet(x, y, weights = weights, offset = offset, lambda = lambda, ...) glmnet.object\$call = glmnet.call is.offset = glmnet.object\$offset @@ -266,56 +52,56 @@ nz = sapply(nz, function(x) sapply(x, length)) nz = ceiling(apply(nz, 1, median)) } - else nz = sapply(predict(glmnet.object, type = "nonzero"), + else nz = sapply(predict(glmnet.object, type = "nonzero"), length) - if (missing(foldid)) + if (missing(foldid)) foldid = sample(rep(seq(nfolds), length = N)) else nfolds = max(foldid) - if (nfolds < 3) + if (nfolds < 3) stop("nfolds must be bigger than 3; nfolds=10 recommended") outlist = as.list(seq(nfolds)) if (parallel && require(foreach)) { - outlist = foreach(i = seq(nfolds), .packages = c("glmnet")) %dopar% + outlist = foreach(i = seq(nfolds), .packages = c("glmnet")) %dopar% { sel = foldid == i - if (is.matrix(y)) + if (is.matrix(y)) y_sub = y[!sel, ] else y_sub = y[!sel] - if (is.offset) + if (is.offset) offset_sub = as.matrix(offset)[!sel, ] else offset_sub = NULL - glmnet(x[!sel, , drop = FALSE], y_sub, lambda = lambda, - offset = offset_sub, weights = weights[!sel], + glmnet(x[!sel, , drop = FALSE], y_sub, lambda = lambda, + offset = offset_sub, weights = weights[!sel], ...) } } else { for (i in seq(nfolds)) { sel = foldid == i - if (is.matrix(y)) + if (is.matrix(y)) y_sub = y[!sel, ] else y_sub = y[!sel] - if (is.offset) + if (is.offset) offset_sub = as.matrix(offset)[!sel, ] else offset_sub = NULL - outlist[[i]] = glmnet(x[!sel, , drop = FALSE], - y_sub, lambda = lambda, offset = offset_sub, + outlist[[i]] = glmnet(x[!sel, , drop = FALSE], + y_sub, lambda = lambda, offset = offset_sub, weights = weights[!sel], ...) } } fun = paste("cv", class(glmnet.object)[[1]], sep = ".") - cvstuff = do.call(fun, list(outlist, lambda, x, y, weights, + cvstuff = do.call(fun, list(outlist, lambda, x, y, weights, offset, foldid, type.measure, grouped, keep)) cvm = cvstuff\$cvm cvsd = cvstuff\$cvsd cvname = cvstuff\$name - out = list(lambda = lambda, cvm = cvm, cvsd = cvsd, cvup = cvm + + out = list(lambda = lambda, cvm = cvm, cvsd = cvsd, cvup = cvm + cvsd, cvlo = cvm - cvsd, nzero = nz, name = cvname, glmnet.fit = glmnet.object) - if (keep) + if (keep) out = c(out, list(fit.preval = cvstuff\$fit.preval, foldid = foldid)) - lamin = if (type.measure == "auc") + lamin = if (type.measure == "auc") getmin(lambda, -cvm, cvsd) else getmin(lambda, cvm, cvsd) out = c(out, as.list(lamin)) @@ -342,7 +128,7 @@ mdsPlot = function(dm,myTitle,groups=NA,outpdfname,transpose=T) { - + samples = colnames(dm) mt = myTitle pcols=c('maroon') @@ -355,11 +141,11 @@ mydata = dm if (transpose==T) { - mydata = t(dm) + mydata = t(dm) } npred = ncol(mydata) - d = dist(mydata) - fit = cmdscale(d,eig=TRUE, k=min(10,npred-2)) + d = dist(mydata) + fit = cmdscale(d,eig=TRUE, k=min(10,npred-2)) xmds = fit\$points[,1] ymds = fit\$points[,2] pdf(outpdfname) @@ -388,7 +174,7 @@ formstring=paste("y ~",cn) form = as.formula(formstring) ok = complete.cases(x) - + if (sum(ok) < length(ok)) { x = x[ok,] yvec = yvec[ok] @@ -408,9 +194,6 @@ return(p1) } - - - getpredp_cox = function(x,time,status,id,predict_at) { cols = colnames(x) @@ -419,13 +202,13 @@ return(NA) } cn = paste(colnames(x), collapse = ' + ') - + formstring=paste("Surv(time, status) ~",cn) - + form = as.formula(formstring) - + ok = complete.cases(x) - + if (sum(ok) < length(ok)) { x = x[ok,] time = time[ok] @@ -444,7 +227,6 @@ } - dolasso_cox = function(x,y,debugOn=F,maxsteps=10000,nfold=10,xcolnames,ycolnames,optLambda='lambda.1se',out_full=F,out_full_file=NA, out_pred=F,out_pred_file=NA,cox_id=NA, descr='Cox test',do_standard=F,alpha=0.9,penalty,predict_at,mdsplots=F) { @@ -457,7 +239,7 @@ if (class(p) == "try-error") { print.noquote(paste('Unable to produce predictors in sample space mds plot',p)) - } + } outpdfname = 'cox_samples_in_x_space_MDS.pdf' p = try({mdsPlot(x,'samples in measurement space',groups=y,outpdfname=outpdfname,transpose=F) },T) if (class(p) == "try-error") @@ -468,7 +250,7 @@ if (is.na(predict_at)) { predict_at = quantile(y) } message(paste('@@@ Cox model will be predicted at times =',paste(predict_at,collapse=','))) do_standard = do_standard - standardize = do_standard + standardize = do_standard normalize = do_standard p = try({larsres = glmnet(x,y,family='cox',standardize=standardize,alpha=alpha,penalty.factor=penalty )},T) if (class(p) == "try-error") @@ -492,13 +274,13 @@ try( { pdf(outpdf) - plot(larsres,main='cox glmnet',label=T) + plot(larsres,main='cox glmnet',label=T) grid() dev.off() },T) - + larscv = NA - + p = try({larscv=ross.cv.glmnet(x,y,family=fam,type.measure='deviance',penalty=penalty)},T) if (class(p) == "try-error") { print.noquote(paste('Unable to cross validate your data',p)) @@ -588,23 +370,23 @@ sink() return(NA) } - + mt = paste('Glmnet fraction deviance for',target) outpdf = paste(target,'glmnetPath.pdf',sep='_') pdf(outpdf) plot(larsres,main=mt,label=T) grid() dev.off() - + outpdf = paste(target,'glmnetDeviance.pdf',sep='_') - + mt2 = paste('Glmnet lambda for',target) - + pdf(outpdf) plot(larsres,xvar="lambda",main=mt2,label=T) grid() dev.off() - + larscv = NA if (fam=="binomial") { tmain = paste(target,'AUC') @@ -621,13 +403,12 @@ sink() return(NA) } - - + pdf(outpdf) plot(larscv,main=tmain) grid() dev.off() - + lse = larscv\$cvhits.1se lmin = larscv\$cvhits.min tot = lse + lmin @@ -637,7 +418,7 @@ print.noquote(nzhits) out_nz_file = paste(target,'cross_validation_model_counts.xls',sep='_') write.table(nzhits,out_nz_file,quote=FALSE, sep="\t",row.names=F) - + ipenalty = c(0,penalty) if (optLambda == 'lambda.min') { best_lambda = larscv\$lambda.min @@ -747,7 +528,7 @@ corPlot=function(xdat=c(),main='main title',is_raw=T) { - library(pheatmap) + library(pheatmap) library(gplots) if (is_raw) { cxdat = cor(xdat,method="spearman",use="pairwise.complete.obs") @@ -762,17 +543,15 @@ } - - runTest = function(n=10) -{ +{ set.seed (NULL) Y = data.frame(y1=runif (n),y2=runif(n)) Xv <- runif(n*n) X <- matrix(Xv, nrow = n, ncol = n) - + mydf <- data.frame(Y, X) - + regres_out = dolasso_generic(predvars=X,depvars=Y,debugOn=T,p.cutoff = 0.05,maxsteps=10000,nfold=10, descr='randomdata',do_standard=do_standard,defaultFam="gaussian",alpha=0.05) return(regres_out) @@ -900,6 +679,218 @@ </configfile> </configfiles> + <inputs> + <param name="title" type="text" value="lasso test" size="80" label="Title for job outputs" help="Typing a short, meaningful text here will help remind you (and explain to others) what the outputs represent"> + <sanitizer invalid_char=""> + <valid initial="string.letters,string.digits"><add value="_" /> </valid> + </sanitizer> + </param> + <param name="input1" type="data" format="tabular" label="Select an input tabular text file from your history. Rows represent samples; Columns are measured phenotypes" + multiple='False' optional="False" help="Tabular text data with samples as rows, phenotypes as columns with a header row of column identifiers" /> + <param name="xvar_cols" label="Select columns containing numeric variables to use as predictor (x) variables" type="data_column" data_ref="input1" numerical="False" + multiple="True" use_header_names="True" force_select="True" /> + <param name="force_xvar_cols" label="Select numeric columns containing variables ALWAYS included as predictors in cross validation" type="data_column" data_ref="input1" numerical="False" + multiple="True" use_header_names="True" force_select="False"/> + <conditional name="model"> + <param name="fam" type="select" label="GLM Link function for models" + help="Binary dependant variables will automatically be set to Binomial no matter what this is set to"> + <option value="gaussian" selected="true">Gaussian - continuous dependent (y)</option> + <option value="binomial">Binomial dependent variables</option> + <option value="poisson">Poisson (eg counts)</option> + <option value="cox">Cox models - require special setup for y variables - see below</option> + </param> + <when value="gaussian"> + <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False" + multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> + <param name="output_full" type="hidden" value='F' /> + <param name="output_pred" type="hidden" value='F' /> + <param name="cox_id" label="Select column containing a unique sample identifier" + help = "Only really needed for output sample specific predicted values downstream." + type="data_column" data_ref="input1" numerical="False" force_select="True" + multiple="False" use_header_names="True" /> + </when> + <when value="binomial"> + <param name="yvar_cols" label="Select numeric columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="False" + multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> + <param name="output_full" type="hidden" value='F' /> + <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" > + <option value="F" selected="true">No predicted value output file</option> + <option value="T">Create a predicted value output file</option> + </param> + <param name="cox_id" label="Select column containing a unique sample identifier" + help = "Only really needed for output sample specific predicted values downstream." + type="data_column" data_ref="input1" numerical="False" force_select="True" + multiple="False" use_header_names="True" /> + <param name="predict_at" type="hidden" value='' /> + + </when> + <when value="poisson"> + <param name="yvar_cols" label="Select columns containing variables to use as the dependent (y) in elasticnet" type="data_column" data_ref="input1" numerical="True" + multiple="True" use_header_names="True" help = "If multiple, each will be modelled against all the x variables and reported separately." force_select="True"/> + <param name="output_full" type="hidden" value='F' /> + <param name="output_pred" type="hidden" value='F' /> + <param name="predict_at" type="hidden" value='' /> + <param name="cox_id" label="Select column containing a unique sample identifier" + help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy" + type="data_column" data_ref="input1" numerical="True" force_select="False" + multiple="False" use_header_names="True" /> + </when> + <when value="cox"> + <param name="cox_time" label="Select column containing time under observation for Cox regression" + type="data_column" data_ref="input1" numerical="True" force_select="True" + multiple="False" use_header_names="True" help = "This MUST contain a time period - eg continuous years or days to failure or right censoring"/> + <param name="cox_status" label="Select column containing status = 1 for outcome of interest at the end of the time under observation or 0 for right censoring" + type="data_column" data_ref="input1" numerical="True" force_select="True" + multiple="False" use_header_names="True" help = "This MUST contain 1 for subjects who had an event at that time or 0 for a right censored observation"/> + <param name="cox_id" label="Select column containing a unique sample identifier" + help = "Optional. Only really needed for output sample specific predicted values downstream. Free - enjoy" + type="data_column" data_ref="input1" numerical="False" force_select="False" + multiple="False" use_header_names="True" /> + <param name="output_full" type="select" label="Create a tabular output with coefficients for all predictors" > + <option value="F" selected="true">No full model output file</option> + <option value="T">Create a full model output file</option> + </param> + <param name="output_pred" type="select" label="Create a tabular output with predicted values for each subject from the optimal model for (eg) NRI estimates" > + <option value="F" selected="true">No predicted value output file</option> + <option value="T">Create a predicted value output file</option> + </param> + <param name="predict_at" type="text" value='' label="Provide a comma separated list of times to make a prediction for each subject" + optional="True" help="Default (blank) will return predictions at 0%,25%,50%,75%,100% of the observed times which should be informative" /> + + </when> + </conditional> + <param name="optLambda" type="select" label="Value to use when reporting optimal model and coefficients" help="minLambda will have more predictors - 1SDLambda will be more parsimonious"> + <option value="lambda.1se" selected="true">Lambda + 1 SE of min MSE or AUC (fewer coefficients - more false negatives)</option> + <option value="lambda.min">Lambda at min MSE or max AUC (more coefficients - more false positives)</option> + </param> + <param name="logxform_cols" optional="True" label="Select numeric columns to be log transformed before use as predictors or dependent variables" type="data_column" + data_ref="input1" numerical="True" multiple="True" use_header_names="True" help = "The wisdom of doing this depends entirely on your predictors - eg can help diminish long-tailed outlier influence" + force_select="False"/> + <param name="do_standard" type="select" label="Standardise x vars" + help="If all measurements on same scale, may not be needed. Coefficients are always returned on the original scale."> + <option value="False" selected="true">No standardisation of predictors</option>l + <option value="True">Standardise predictors before model</option> + </param> + <param name="mdsplots" type="select" label="Generate MDS plots of samples in measurement space and measurements in sample space" > + <option value="False" selected="true">No MDS plots</option>l + <option value="True">Yes create MDS plots</option> + </param> + <param name="alpha" type="float" value="0.95" size="5" min="0.01" max="1.0" label="Alpha - see glmnet docs. 1 for pure lasso. 0.0 for pure ridge regression" + help="Default 0.95 allows lasso to cope better with expected predictor collinearity. Use (eg) 0.5 for hybrid regularised regression or (eg) 0.025 for ridge regression"/> + <param name="nfold" type="integer" value="10" size="5" label="Number of folds for internal cross validation" + help="Default of 10 is usually ok"/> + </inputs> + <outputs> + <data format="html" name="html_file" label="${title}.html"/> + <data format="tabular" name="model_file" label="${title}_modelres.xls"/> + <data format="tabular" name="output_full_file" label="${title}_full_cox_model.xls"> + <filter>model['output_full'] == 'T'</filter> + </data> + <data format="tabular" name="output_pred_file" label="${title}_predicted_from_model.xls"> + <filter>model['output_pred'] == 'T'</filter> + </data> + </outputs> + <tests> + <test> + <param name='input1' value='cox_test.xls' ftype='tabular' /> + <param name='treatment_name' value='case' /> + <param name='title' value='Cox glmnet test' /> + <param name='nfold' value='10' /> + <param name='logxform_cols' value='' /> + <param name='alpha' value='0.95' /> + <param name='do_standard' value="True" /> + <param name='cox_time' value='1' /> + <param name='cox_status' value='2' /> + <param name='cox_id' value='1' /> + <param name='predict_at' value='' /> + <param name='fam' value='cox' /> + <param name='yvar_cols' value='' /> + <param name='xvar_cols' value='3,4,5' /> + <param name='force_xvar_cols' value='3' /> + <param name='output_full' value='F' /> + <param name='output_pred' value='F' /> + <output name='model_file' file='coxlassotest_modelres.xls'> + <assert_contents> + <has_text text="rhubarb" /> + <has_text text="TRUE" /> + <!-- 	 is XML escape code for tab --> + <!-- has_line line="regulator	partial_likelihood	forced_in	glmnet_model	best_lambda" / --> + <has_line line="regulator	partial_likelihood	forced_in	glmnet_model	best_lambda	lambdaChoice	alpha" /> + <has_n_columns n="7" /> + </assert_contents> + </output> + <output name='html_file' file='coxlassotest.html' compare='diff' lines_diff='16' /> + </test> +</tests> +<help> + +**Before you start** + +Please read the glmnet documentation @ glmnet_ + +This Galaxy wrapper merely exposes that code and the glmnet_ documentation is essential reading +before getting useful results here. + +**What it does** + +From documentation at glmnet_ :: + + Glmnet is a package that fits a generalized linear model via penalized maximum likelihood. + The regularization path is computed for the lasso or elasticnet penalty at a grid of values for the regularization parameter lambda. + The algorithm is extremely fast, and can exploit sparsity in the input matrix x. + It fits linear, logistic and multinomial, poisson, and Cox regression models. + A variety of predictions can be made from the fitted models. + +Internal cross validation is used to optimise the choice of lambda based on CV AUC for logistic (binomial outcome) models, or CV mse for gaussian. + +**Warning about the tyrany of dimensionality** + +Yes, this package will select 'optimal' models even when you (optimistically) supply more predictors than you have cases. +The model returned is unlikely to represent the only informative regularisation path through your data - if you run repeatedly with +exactly the same settings, you will probably see many different models being selected. +This is not a software bug - the real problem is that you just don't have enough information in your data. + +Sufficiently big jobs will take a while (eg each lasso regression with 20k features on 1k samples takes about 2-3 minutes on our aged cluster) + +**Input** + +Assuming you have more measurements than samples, you supply data as a tabular text file where each row is a sample and columns +are variables. You specify which columns are dependent (predictors) and which are observations for each sample. Each of multiple +dependent variable columns will be run and reported independently. Predictors can be forced in to the model. + +**Output** + +For each selected dependent regression variable, a brief report of the model coefficients predicted at the +'optimal' nfold CV value of lambda. + +**Predicted event probabilities for Cox and Logistic models** + +If you want to compare (eg) two competing clinical predictions, there's a companion generic NRI tool +for predicted event probabilities. Estimates dozens of measures of improvement in prediction. Currently only works for identical id subjects +but can probably be extended to independent sample predictions. + +Given a model, we can generate a predicted p (for status 1) in binomial or cox frameworks so models can be evaluated in terms of NRI. +Of course, estimates are likely substantially inflated over 'real world' performance by being estimated from the same sample - but you probably +already knew that since you were smart enough to reach this far down into the on screen help. The author salutes you, intrepid reader! + +It may seem an odd thing to do, but we can predict p for an event for each subject from our original data, given a parsimonious model. Doing +this for two separate models (eg, forcing in an additional known explanatory measurement to the new model) allows comparison of the two models +predicted status for each subject, or the same model in independent populations to see how badly it does + +**Attributions** + +glmnet_ is the R package exposed by this Galaxy tool. + +Galaxy_ (that's what you are using right now!) for gluing everything together + +Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is +licensed to you under the LGPL_ like other rgenetics artefacts + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html +.. _glmnet: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html +.. _Galaxy: http://getgalaxy.org +</help> + <citations> <citation type="bibtex"> @Article{Friedman2010, title = {Regularization Paths for Generalized Linear Models via Coordinate Descent}, @@ -917,6 +908,3 @@ </citation> </citations> </tool> - - -