Mercurial > repos > deepakjadmin > r_caret_test
comparison caret_regression/tool2/templateLibrary.py @ 0:68300206e90d draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Thu, 05 Nov 2015 02:41:30 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:68300206e90d |
|---|---|
| 1 def __template4Rnw(): | |
| 2 | |
| 3 template4Rnw = r'''%% Regression Modeling Script | |
| 4 %% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com) | |
| 5 %% Version: 1.00 | |
| 6 %% Created on: 2010/10/02 | |
| 7 %% | |
| 8 %% Lynn Group | |
| 9 %% Version: 2.00 | |
| 10 %% Created on: 2014/11/15 | |
| 11 | |
| 12 %% This is an Sweave template for building and describing | |
| 13 %% classification models. It mixes R and LaTeX code. The document can | |
| 14 %% be processing using R's Sweave function to produce a tex file. | |
| 15 %% | |
| 16 %% The inputs are: | |
| 17 %% - the initial data set in a data frame called 'rawData' | |
| 18 %% - a numeric column in the data set called 'outcome'. this should be the | |
| 19 %% outcome variable | |
| 20 %% - all other columns in rawData should be predictor variables | |
| 21 %% - the type of model should be in a variable called 'modName'. | |
| 22 %% | |
| 23 %% The script attempts to make some intelligent choices based on the | |
| 24 %% model being used. For example, if modName is "pls", the script will | |
| 25 %% automatically center and scale the predictor data. There are | |
| 26 %% situations where these choices can (and should be) changed. | |
| 27 %% | |
| 28 %% There are other options that may make sense to change. For example, | |
| 29 %% the user may want to adjust the type of resampling. To find these | |
| 30 %% parts of the script, search on the string 'OPTION'. These parts of | |
| 31 %% the code will document the options. | |
| 32 | |
| 33 \documentclass[12pt]{report} | |
| 34 \usepackage{amsmath} | |
| 35 \usepackage[pdftex]{graphicx} | |
| 36 \usepackage{color} | |
| 37 \usepackage{ctable} | |
| 38 \usepackage{xspace} | |
| 39 \usepackage{fancyvrb} | |
| 40 \usepackage{fancyhdr} | |
| 41 \usepackage{lastpage} | |
| 42 \usepackage{longtable} | |
| 43 \usepackage{algorithm2e} | |
| 44 \usepackage[ | |
| 45 colorlinks=true, | |
| 46 linkcolor=blue, | |
| 47 citecolor=blue, | |
| 48 urlcolor=blue] | |
| 49 {hyperref} | |
| 50 \usepackage{lscape} | |
| 51 | |
| 52 \usepackage{Sweave} | |
| 53 \SweaveOpts{keep.source = TRUE} | |
| 54 | |
| 55 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
| 56 | |
| 57 % define new colors for use | |
| 58 \definecolor{darkgreen}{rgb}{0,0.6,0} | |
| 59 \definecolor{darkred}{rgb}{0.6,0.0,0} | |
| 60 \definecolor{lightbrown}{rgb}{1,0.9,0.8} | |
| 61 \definecolor{brown}{rgb}{0.6,0.3,0.3} | |
| 62 \definecolor{darkblue}{rgb}{0,0,0.8} | |
| 63 \definecolor{darkmagenta}{rgb}{0.5,0,0.5} | |
| 64 | |
| 65 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
| 66 | |
| 67 \newcommand{\bld}[1]{\mbox{\boldmath $#1$}} | |
| 68 \newcommand{\shell}[1]{\mbox{$#1$}} | |
| 69 \renewcommand{\vec}[1]{\mbox{\bf {#1}}} | |
| 70 | |
| 71 \newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize} | |
| 72 \newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize} | |
| 73 | |
| 74 \newcommand{\halfs}{\frac{1}{2}} | |
| 75 | |
| 76 \setlength{\oddsidemargin}{-.25 truein} | |
| 77 \setlength{\evensidemargin}{0truein} | |
| 78 \setlength{\topmargin}{-0.2truein} | |
| 79 \setlength{\textwidth}{7 truein} | |
| 80 \setlength{\textheight}{8.5 truein} | |
| 81 \setlength{\parindent}{0.20truein} | |
| 82 \setlength{\parskip}{0.10truein} | |
| 83 | |
| 84 \setcounter{LTchunksize}{50} | |
| 85 | |
| 86 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
| 87 \pagestyle{fancy} | |
| 88 \lhead{} | |
| 89 %% OPTION Report header name | |
| 90 \chead{Regression Model Script} | |
| 91 \rhead{} | |
| 92 \lfoot{} | |
| 93 \cfoot{} | |
| 94 \rfoot{\thepage\ of \pageref{LastPage}} | |
| 95 \renewcommand{\headrulewidth}{1pt} | |
| 96 \renewcommand{\footrulewidth}{1pt} | |
| 97 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
| 98 | |
| 99 %% OPTION Report title and modeler name | |
| 100 \title{Regression Model Script using $METHOD } | |
| 101 \author{"M. Kuhn and Lynn Group, SCIS, JNU, New Delhi"} | |
| 102 | |
| 103 \begin{document} | |
| 104 | |
| 105 \maketitle | |
| 106 | |
| 107 \thispagestyle{empty} | |
| 108 | |
| 109 <<startup, eval= TRUE, results = hide, echo = FALSE>>= | |
| 110 library(Hmisc) | |
| 111 library(caret) | |
| 112 versionTest <- compareVersion(packageDescription("caret")$Version, | |
| 113 "4.65") | |
| 114 if(versionTest < 0) stop("caret version 4.65 or later is required") | |
| 115 | |
| 116 library(RColorBrewer) | |
| 117 | |
| 118 | |
| 119 listString <- function (x, period = FALSE, verbose = FALSE) | |
| 120 { | |
| 121 if (verbose) cat("\n entering listString\n") | |
| 122 flush.console() | |
| 123 if (!is.character(x)) | |
| 124 x <- as.character(x) | |
| 125 numElements <- length(x) | |
| 126 out <- if (length(x) > 0) { | |
| 127 switch(min(numElements, 3), x, paste(x, collapse = " and "), | |
| 128 { | |
| 129 x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "") | |
| 130 paste(x, collapse = " ") | |
| 131 }) | |
| 132 } | |
| 133 else "" | |
| 134 if (period) out <- paste(out, ".", sep = "") | |
| 135 if (verbose) cat(" leaving listString\n\n") | |
| 136 flush.console() | |
| 137 out | |
| 138 } | |
| 139 | |
| 140 resampleStats <- function(x, digits = 3) | |
| 141 { | |
| 142 bestPerf <- x$bestTune | |
| 143 colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf)) | |
| 144 out <- merge(x$results, bestPerf) | |
| 145 out <- out[, colnames(out) %in% x$perfNames] | |
| 146 names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE) | |
| 147 names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE) | |
| 148 names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE) | |
| 149 names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE) | |
| 150 names(out) <- gsub("Kappa", "Kappa statistics", names(out), fixed = TRUE) | |
| 151 names(out) <- gsub("RMSE", "root mean squared error", names(out), fixed = TRUE) | |
| 152 names(out) <- gsub("Rsquared", "$R^2$", names(out), fixed = TRUE) | |
| 153 | |
| 154 out <- format(out, digits = digits) | |
| 155 listString(paste(names(out), "was", out)) | |
| 156 } | |
| 157 | |
| 158 latticeBubble <- function(x, y, z, offset = .5, splits = 10, | |
| 159 pal = colorRampPalette(brewer.pal(9,"YlOrRd")[-(1:2)]), | |
| 160 ...) | |
| 161 { | |
| 162 cexValues <- rank(z)/length(z) + offset | |
| 163 splits <- unique(quantile(z, probs = seq(0, 1, length = splits))) | |
| 164 splitup <- cut(z, breaks = splits, include.lowest = TRUE) | |
| 165 cols <- pal(length(levels(splitup))) | |
| 166 colValues <- cols[as.numeric(splitup)] | |
| 167 if(is.data.frame(x)) | |
| 168 { | |
| 169 out <- splom(~x, col = colValues, cex = cexValues, ...) | |
| 170 | |
| 171 } else out <- xyplot(y~x, col = colValues, cex = cexValues, ...) | |
| 172 out | |
| 173 | |
| 174 } | |
| 175 | |
| 176 | |
| 177 ##OPTION: model name: see ?train for more values/models | |
| 178 modName <- "$METHOD" | |
| 179 load("$RDATA") | |
| 180 rawData <- dataX | |
| 181 rawData$$outcome <- dataY | |
| 182 | |
| 183 | |
| 184 | |
| 185 @ | |
| 186 | |
| 187 | |
| 188 \section*{Data Sets}\label{S:data} | |
| 189 | |
| 190 %% OPTION: provide some background on the problem, the experimental | |
| 191 %% data, how the compounds were selected etc | |
| 192 | |
| 193 | |
| 194 <<getDataInfo, eval = $GETDATAINFOEVAL, echo = $GETDATAINFOECHO, results = $GETDATAINFORESULT>>= | |
| 195 if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set") | |
| 196 if(!is.numeric(rawData$outcome)) stop("the outcome should be a numeric vector") | |
| 197 | |
| 198 numSamples <- nrow(rawData) | |
| 199 numPredictors <- ncol(rawData) - 1 | |
| 200 predictorNames <- names(rawData)[names(rawData) != "outcome"] | |
| 201 | |
| 202 isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) | |
| 203 if(any(!isNum)) stop("all predictors in rawData should be numeric") | |
| 204 | |
| 205 @ | |
| 206 | |
| 207 | |
| 208 <<missingFilter, eval = $MISSINGFILTEREVAL, echo = $MISSINGFILTERECHO, results = $MISSINGFILTERRESULT>>= | |
| 209 | |
| 210 colRate <- apply(rawData[, predictorNames, drop = FALSE], | |
| 211 2, function(x) mean(is.na(x))) | |
| 212 | |
| 213 ##OPTION thresholds can be changed | |
| 214 colExclude <- colRate > $MISSINGFILTERTHRESHC | |
| 215 | |
| 216 missingText <- "" | |
| 217 | |
| 218 if(any(colExclude)) | |
| 219 { | |
| 220 missingText <- paste(missingText, | |
| 221 ifelse(sum(colExclude) > 1, | |
| 222 " There were ", | |
| 223 " There was "), | |
| 224 sum(colExclude), | |
| 225 ifelse(sum(colExclude) > 1, | |
| 226 " predictors ", | |
| 227 " predictor "), | |
| 228 "with an excessive number of ", | |
| 229 "missing data. ", | |
| 230 ifelse(sum(colExclude) > 1, | |
| 231 " These were excluded. ", | |
| 232 " This was excluded. ")) | |
| 233 predictorNames <- predictorNames[!colExclude] | |
| 234 rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE] | |
| 235 } | |
| 236 | |
| 237 | |
| 238 rowRate <- apply(rawData[, predictorNames, drop = FALSE], | |
| 239 1, function(x) mean(is.na(x))) | |
| 240 | |
| 241 rowExclude <- rowRate > $MISSINGFILTERTHRESHR | |
| 242 | |
| 243 | |
| 244 if(any(rowExclude)) | |
| 245 { | |
| 246 missingText <- paste(missingText, | |
| 247 ifelse(sum(rowExclude) > 1, | |
| 248 " There were ", | |
| 249 " There was "), | |
| 250 sum(colExclude), | |
| 251 ifelse(sum(rowExclude) > 1, | |
| 252 " samples ", | |
| 253 " sample "), | |
| 254 "with an excessive number of ", | |
| 255 "missing data. ", | |
| 256 ifelse(sum(rowExclude) > 1, | |
| 257 " These were excluded. ", | |
| 258 " This was excluded. "), | |
| 259 "After filtering, ", | |
| 260 sum(!rowExclude), | |
| 261 " samples remained.") | |
| 262 rawData <- rawData[!rowExclude, ] | |
| 263 hasMissing <- apply(rawData[, predictorNames, drop = FALSE], | |
| 264 1, function(x) mean(is.na(x))) | |
| 265 } else { | |
| 266 hasMissing <- apply(rawData[, predictorNames, drop = FALSE], | |
| 267 1, function(x) any(is.na(x))) | |
| 268 missingText <- paste(missingText, | |
| 269 ifelse(missingText == "", | |
| 270 "There ", | |
| 271 "Subsequently, there "), | |
| 272 ifelse(sum(hasMissing) == 1, | |
| 273 "was ", | |
| 274 "were "), | |
| 275 ifelse(sum(hasMissing) > 0, | |
| 276 sum(hasMissing), | |
| 277 "no"), | |
| 278 ifelse(sum(hasMissing) == 1, | |
| 279 "sample ", | |
| 280 "samples "), | |
| 281 "with missing values.") | |
| 282 rawData <- rawData[complete.cases(rawData),] | |
| 283 } | |
| 284 | |
| 285 dataDist <- summary(rawData$outcome) | |
| 286 dataSD <- sd(rawData$outcome, na.rm = TRUE) | |
| 287 dataText <- paste("The average outcome value was ", | |
| 288 dataDist["Mean"], | |
| 289 " and a standard deviation of ", | |
| 290 dataSD, ". The minimum and maximum values were ", | |
| 291 dataDist["Min."], " and ", dataDist["Max."], | |
| 292 ", respectively. Figure \\\\ref{F:dens} shows a ", | |
| 293 " density plot (i.e. a smooth histogram) of the response.", | |
| 294 sep = "") | |
| 295 | |
| 296 rawData1 <- rawData[,1:length(rawData)-1] | |
| 297 rawData2 <- rawData[,length(rawData)] | |
| 298 | |
| 299 set.seed(222) | |
| 300 nzv1 <- nearZeroVar(rawData1) | |
| 301 if(length(nzv1) > 0) | |
| 302 { | |
| 303 nzvVars1 <- names(rawData1)[nzv1] | |
| 304 rawData <- rawData1[,-nzv1] | |
| 305 rawData$outcome <- rawData2 | |
| 306 nzvText1 <- paste("There were ", | |
| 307 length(nzv1), | |
| 308 " predictors that were removed from original data due to", | |
| 309 " severely unbalanced distributions that", | |
| 310 " could negatively affect the model fit", | |
| 311 ifelse(length(nzv1) > 10, | |
| 312 ".", | |
| 313 paste(": ", | |
| 314 listString(nzvVars1), | |
| 315 ".", | |
| 316 sep = "")), | |
| 317 sep = "") | |
| 318 | |
| 319 } else { | |
| 320 rawData <- rawData1 | |
| 321 rawData$outcome <- rawData2 | |
| 322 nzvText1 <- "" | |
| 323 | |
| 324 } | |
| 325 | |
| 326 remove("rawData1") | |
| 327 remove("rawData2") | |
| 328 | |
| 329 @ | |
| 330 | |
| 331 The initial data set consisted of \Sexpr{numSamples} samples and | |
| 332 \Sexpr{numPredictors} predictor variables. \Sexpr{dataText} \Sexpr{missingText} | |
| 333 \Sexpr{nzvText1} | |
| 334 | |
| 335 \setkeys{Gin}{width = 0.8\textwidth} | |
| 336 \begin{figure}[b] | |
| 337 \begin{center} | |
| 338 | |
| 339 <<densityplot, echo = FALSE, results = hide, fig = TRUE, width = 8, height = 4.5>>= | |
| 340 trellis.par.set(caretTheme(), warn = TRUE) | |
| 341 print(densityplot(~rawData$outcome, pch = "|", adjust = 1.25, xlab = "")) | |
| 342 @ | |
| 343 \caption[Data Density]{A density plot of the response. The marks | |
| 344 along the $x$--axis show the locations of the data points.} | |
| 345 \label{F:dens} | |
| 346 \end{center} | |
| 347 \end{figure} | |
| 348 | |
| 349 | |
| 350 <<pca, eval= $PCAEVAL, echo = $PCAECHO, results = $PCARESULT>>= | |
| 351 predictorNames <- names(rawData)[names(rawData) != "outcome"] | |
| 352 numPredictors <- length(predictorNames) | |
| 353 | |
| 354 predictors <- rawData[, predictorNames, drop = FALSE] | |
| 355 ## PCA will fail with predictors having less than 2 unique values | |
| 356 isZeroVar <- apply(predictors, 2, | |
| 357 function(x) length(unique(x)) < 2) | |
| 358 if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE] | |
| 359 ## For whatever, only the formula interface to prcomp | |
| 360 ## handles missing values | |
| 361 pcaForm <- as.formula( | |
| 362 paste("~", | |
| 363 paste(names(predictors), collapse = "+"))) | |
| 364 pca <- prcomp(pcaForm, | |
| 365 data = predictors, | |
| 366 center = TRUE, | |
| 367 scale. = TRUE, | |
| 368 na.action = na.omit) | |
| 369 ## OPTION: the number of components plotted/discussed can be set | |
| 370 numPCAcomp <- $PCACOMP | |
| 371 pctVar <- pca$sdev^2/sum(pca$sdev^2)*100 | |
| 372 pcaText <- paste(round(pctVar[1:numPCAcomp], 1), | |
| 373 "\\\\%", | |
| 374 sep = "") | |
| 375 pcaText <- listString(pcaText) | |
| 376 @ | |
| 377 | |
| 378 To get an initial assessment of the separability of the classes, | |
| 379 principal component analysis (PCA) was used to distill the | |
| 380 \Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp} | |
| 381 surrogate variables (i.e. the principal components) in a manner that | |
| 382 attempts to maximize the amount of information preserved from the | |
| 383 original predictor set. Figure \ref{F:inititalPCA} contains plots of | |
| 384 the first \Sexpr{numPCAcomp} components, which accounted for | |
| 385 \Sexpr{pcaText} percent of the variability in the original predictors | |
| 386 (respectively). | |
| 387 | |
| 388 %% OPTION: remark on how well (or poorly) the data separated | |
| 389 | |
| 390 \setkeys{Gin}{width = 0.8\textwidth} | |
| 391 \begin{figure}[p] | |
| 392 \begin{center} | |
| 393 | |
| 394 <<pcaPlot, eval = $PCAPLOTEVAL, echo = $PCAPLOTECHO, results = $PCAPLOTRESULT, fig = $PCAPLOTFIG, width = 8, height = 8>>= | |
| 395 trellis.par.set(caretTheme(), warn = TRUE) | |
| 396 if(numPCAcomp == 2) | |
| 397 { | |
| 398 axisRange <- extendrange(pca$x[, 1:2]) | |
| 399 print( | |
| 400 latticeBubble(x = as.data.frame(pca$x)$PC1, | |
| 401 y = as.data.frame(pca$x)$PC2, | |
| 402 z = rawData$outcome, | |
| 403 type = c("p", "g"), | |
| 404 xlab = "PC1", ylab = "PC2", | |
| 405 xlim = axisRange, | |
| 406 ylim = axisRange)) | |
| 407 | |
| 408 } else { | |
| 409 axisRange <- extendrange(pca$x[, 1:numPCAcomp]) | |
| 410 print( | |
| 411 latticeBubble(x = as.data.frame(pca$x)[,1:numPCAcomp], | |
| 412 | |
| 413 z = rawData$outcome, | |
| 414 type = c("p", "g"), | |
| 415 xlab = "PC1", ylab = "PC2", | |
| 416 xlim = axisRange, | |
| 417 ylim = axisRange)) | |
| 418 | |
| 419 | |
| 420 } | |
| 421 @ | |
| 422 \caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp} | |
| 423 principal components for the original data set. Smaller, lighter | |
| 424 points indicate smaller values of the response while darker, | |
| 425 larger points correspond to larger values of the outcome} | |
| 426 \label{F:inititalPCA} | |
| 427 \end{center} | |
| 428 \end{figure} | |
| 429 | |
| 430 | |
| 431 <<initialDataSplit, eval = $INITIALDATASPLITEVAL, echo = $INITIALDATASPLITECHO, results = $INITIALDATASPLITRESULT>>= | |
| 432 | |
| 433 ## OPTION: in small samples sizes, you may not want to set aside a | |
| 434 ## training set and focus on the resampling results. | |
| 435 numSamples <- nrow(rawData) | |
| 436 | |
| 437 predictorNames <- names(rawData)[names(rawData) != "outcome"] | |
| 438 numPredictors <- length(predictorNames) | |
| 439 | |
| 440 # pctTrain <- .15 | |
| 441 pctTrain <- $PERCENT | |
| 442 | |
| 443 if(pctTrain < 1) | |
| 444 { | |
| 445 ## OPTION: seed number can be changed | |
| 446 set.seed(1) | |
| 447 inTrain <- createDataPartition(rawData$outcome, | |
| 448 p = pctTrain, | |
| 449 list = FALSE) | |
| 450 trainX <- rawData[ inTrain, predictorNames] | |
| 451 testX <- rawData[-inTrain, predictorNames] | |
| 452 trainY <- rawData[ inTrain, "outcome"] | |
| 453 testY <- rawData[-inTrain, "outcome"] | |
| 454 splitText <- paste("The original data were split into ", | |
| 455 "a training set ($n$=", | |
| 456 nrow(trainX), | |
| 457 ") and a test set ($n$=", | |
| 458 nrow(testX), | |
| 459 ") in a manner that preserved the ", | |
| 460 "distribution of the response.", | |
| 461 sep = "") | |
| 462 isZeroVar <- apply(trainX, 2, | |
| 463 function(x) length(unique(x)) < 2) | |
| 464 if(any(isZeroVar)) | |
| 465 { | |
| 466 trainX <- trainX[, !isZeroVar, drop = FALSE] | |
| 467 testX <- testX[, !isZeroVar, drop = FALSE] | |
| 468 } | |
| 469 | |
| 470 } else { | |
| 471 trainX <- rawData[, predictorNames] | |
| 472 testX <- NULL | |
| 473 trainY <- rawData[, "outcome"] | |
| 474 testY <- NULL | |
| 475 splitText <- "The entire data set was used as the training set." | |
| 476 } | |
| 477 | |
| 478 remove("rawData") | |
| 479 | |
| 480 @ | |
| 481 | |
| 482 \Sexpr{splitText} | |
| 483 The data set for model building consisted of \Sexpr{numSamples} samples and | |
| 484 \Sexpr{numPredictors} predictor variables. | |
| 485 | |
| 486 | |
| 487 | |
| 488 <<nzv, eval= $NZVEVAL, results = $NZVRESULT, echo = $NZVECHO>>= | |
| 489 ## OPTION: other pre-processing steps can be used | |
| 490 ppSteps <- caret:::suggestions(modName) | |
| 491 | |
| 492 set.seed(2) | |
| 493 if(ppSteps["nzv"]) | |
| 494 { | |
| 495 nzv <- nearZeroVar(trainX) | |
| 496 if(length(nzv) > 0) | |
| 497 { | |
| 498 nzvVars <- names(trainX)[nzv] | |
| 499 trainX <- trainX[, -nzv] | |
| 500 nzvText <- paste("There were ", | |
| 501 length(nzv), | |
| 502 " predictors that were removed due to", | |
| 503 " severely unbalanced distributions that", | |
| 504 " could negatively affect the model fit", | |
| 505 ifelse(length(nzv) > 10, | |
| 506 ".", | |
| 507 paste(": ", | |
| 508 listString(nzvVars), | |
| 509 ".", | |
| 510 sep = "")), | |
| 511 sep = "") | |
| 512 testX <- testX[, -nzv] | |
| 513 } else nzvText <- "" | |
| 514 } else nzvText <- "" | |
| 515 @ | |
| 516 | |
| 517 \Sexpr{nzvText} | |
| 518 | |
| 519 <<corrFilter, eval = $CORRFILTEREVAL, results = $CORRFILTERRESULT, echo = $CORRFILTERECHO>>= | |
| 520 | |
| 521 if(ppSteps["corr"]) | |
| 522 { | |
| 523 ## OPTION: | |
| 524 ##corrThresh <- .75 | |
| 525 corrThresh <- $THRESHHOLDCOR | |
| 526 highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"), | |
| 527 corrThresh) | |
| 528 if(length(highCorr) > 0) | |
| 529 { | |
| 530 corrVars <- names(trainX)[highCorr] | |
| 531 trainX <- trainX[, -highCorr] | |
| 532 corrText <- paste("There were ", | |
| 533 length(highCorr), | |
| 534 " predictors that were removed due to", | |
| 535 " large between--predictor correlations that", | |
| 536 " could negatively affect the model fit", | |
| 537 ifelse(length(highCorr) > 10, | |
| 538 ".", | |
| 539 paste(": ", | |
| 540 listString(highCorr), | |
| 541 ".", | |
| 542 sep = "")), | |
| 543 " Removing these predictors forced", | |
| 544 " all pair--wise correlations to be", | |
| 545 " less than ", | |
| 546 corrThresh, | |
| 547 ".", | |
| 548 sep = "") | |
| 549 testX <- testX[, -highCorr] | |
| 550 } else corrText <- "" | |
| 551 }else corrText <- "" | |
| 552 @ | |
| 553 | |
| 554 \Sexpr{corrText} | |
| 555 | |
| 556 <<preProc, eval = $PREPROCEVAL, echo = $PREPROCECHO, results = $PREPROCRESULT>>= | |
| 557 | |
| 558 ppMethods <- NULL | |
| 559 if(ppSteps["center"]) ppMethods <- c(ppMethods, "center") | |
| 560 if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale") | |
| 561 if(any(hasMissing) > 0) ppMethods <- c(ppMethods, "knnImpute") | |
| 562 ##OPTION other methods, such as spatial sign, can be added to this list | |
| 563 | |
| 564 if(length(ppMethods) > 0) | |
| 565 { | |
| 566 ppInfo <- preProcess(trainX, method = ppMethods) | |
| 567 trainX <- predict(ppInfo, trainX) | |
| 568 if(pctTrain < 1) testX <- predict(ppInfo, testX) | |
| 569 ppText <- paste("The following pre--processing methods were", | |
| 570 " applied to the training", | |
| 571 ifelse(pctTrain < 1, " and test", ""), | |
| 572 " data: ", | |
| 573 listString(ppMethods), | |
| 574 ".", | |
| 575 sep = "") | |
| 576 ppText <- gsub("center", "mean centering", ppText) | |
| 577 ppText <- gsub("scale", "scaling to unit variance", ppText) | |
| 578 ppText <- gsub("knnImpute", | |
| 579 paste(ppInfo$k, "--nearest neighbor imputation", sep = ""), | |
| 580 ppText) | |
| 581 ppText <- gsub("spatialSign", "the spatial sign transformation", ppText) | |
| 582 ppText <- gsub("pca", "principal component feature extraction", ppText) | |
| 583 ppText <- gsub("ica", "independent component feature extraction", ppText) | |
| 584 } else { | |
| 585 ppInfo <- NULL | |
| 586 ppText <- "" | |
| 587 } | |
| 588 | |
| 589 predictorNames <- names(trainX) | |
| 590 if(nzvText != "" | corrText != "" | ppText != "") | |
| 591 { | |
| 592 varText <- paste("After pre--processing, ", | |
| 593 ncol(trainX), | |
| 594 "predictors remained for modeling.") | |
| 595 } else varText <- "" | |
| 596 | |
| 597 @ | |
| 598 | |
| 599 \Sexpr{ppText} \Sexpr{varText} | |
| 600 | |
| 601 \clearpage | |
| 602 \section*{Model Building} | |
| 603 | |
| 604 <<setupWorkers, eval = TRUE, echo = $SETUPWORKERSECHO, results = $SETUPWORKERSRESULT>>= | |
| 605 | |
| 606 numWorkers <- $NUMWORKERS | |
| 607 ##OPTION: turn up numWorkers to use MPI | |
| 608 if(numWorkers > 1) | |
| 609 { | |
| 610 mpiCalcs <- function(X, FUN, ...) | |
| 611 { | |
| 612 theDots <- list(...) | |
| 613 parLapply(theDots$cl, X, FUN) | |
| 614 } | |
| 615 | |
| 616 library(snow) | |
| 617 cl <- makeCluster(numWorkers, "MPI") | |
| 618 } | |
| 619 @ | |
| 620 | |
| 621 <<setupResampling, echo = $SETUPRESAMPLINGECHO, results = $SETUPRESAMPLINGRESULT>>= | |
| 622 ##<<setupResampling, echo = FALSE, results = hide>>= | |
| 623 ##OPTION: the resampling options can be changed. See | |
| 624 ## ?trainControl for details | |
| 625 resampName <- "repeatedcv" | |
| 626 resampNumber <- $RESAMPLENUMBER | |
| 627 numRepeat <- 3 | |
| 628 resampP <- $RESAMPLENUMBERPERCENT | |
| 629 | |
| 630 modelInfo <- modelLookup(modName) | |
| 631 | |
| 632 set.seed(3) | |
| 633 ctlObj <- trainControl(method = resampName, | |
| 634 number = resampNumber, | |
| 635 repeats = numRepeat, | |
| 636 p = resampP) | |
| 637 | |
| 638 | |
| 639 ##OPTION select other performance metrics as needed | |
| 640 optMetric <- "RMSE" | |
| 641 | |
| 642 if(numWorkers > 1) | |
| 643 { | |
| 644 ctlObj$workers <- numWorkers | |
| 645 ctlObj$computeFunction <- mpiCalcs | |
| 646 ctlObj$computeArgs <- list(cl = cl) | |
| 647 } | |
| 648 @ | |
| 649 | |
| 650 <<setupGrid, results = $SETUPGRIDRESULT, echo = $SETUPGRIDECHO>>= | |
| 651 | |
| 652 ##OPTION expand or contract these grids as needed (or | |
| 653 ## add more models | |
| 654 | |
| 655 gridSize <- $SETUPGRIDSIZE | |
| 656 | |
| 657 if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "ctree2", "ctree")) gridSize <- 5 | |
| 658 if(modName %in% c("earth")) gridSize <- 7 | |
| 659 if(modName %in% c("knn", "glmboost", "rf", "nodeHarvest")) gridSize <- 10 | |
| 660 | |
| 661 if(modName %in% c("rpart")) gridSize <- 15 | |
| 662 if(modName %in% c("pls", "lars2", "lars")) gridSize <- min(20, ncol(trainX)) | |
| 663 | |
| 664 if(modName == "gbm") | |
| 665 { | |
| 666 tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2 , | |
| 667 .n.trees = (1:10)*20, | |
| 668 .shrinkage = .1) | |
| 669 } | |
| 670 | |
| 671 if(modName == "nnet") | |
| 672 { | |
| 673 tGrid <- expand.grid(.size = -1 + (1:5)*2 , | |
| 674 .decay = c(0, .001, .01, .1)) | |
| 675 } | |
| 676 | |
| 677 @ | |
| 678 | |
| 679 | |
| 680 <<fitModel, results = $FITMODELRESULT, echo = $FITMODELECHO, eval = $FITMODELEVAL>>= | |
| 681 | |
| 682 ##OPTION alter as needed | |
| 683 | |
| 684 set.seed(4) | |
| 685 modelFit <- switch(modName, | |
| 686 gbm = | |
| 687 { | |
| 688 mix <- sample(seq(along = trainY)) | |
| 689 train( | |
| 690 trainX[mix,], trainY[mix], modName, | |
| 691 verbose = FALSE, | |
| 692 bag.fraction = .9, | |
| 693 metric = optMetric, | |
| 694 trControl = ctlObj, | |
| 695 tuneGrid = tGrid) | |
| 696 }, | |
| 697 | |
| 698 nnet = | |
| 699 { | |
| 700 train( | |
| 701 trainX, trainY, modName, | |
| 702 metric = optMetric, | |
| 703 linout = TRUE, | |
| 704 trace = FALSE, | |
| 705 maxiter = 1000, | |
| 706 MaxNWts = 5000, | |
| 707 trControl = ctlObj, | |
| 708 tuneGrid = tGrid) | |
| 709 | |
| 710 }, | |
| 711 | |
| 712 svmRadial =, svmPoly =, svmLinear = | |
| 713 { | |
| 714 train( | |
| 715 trainX, trainY, modName, | |
| 716 metric = optMetric, | |
| 717 scaled = TRUE, | |
| 718 trControl = ctlObj, | |
| 719 tuneLength = gridSize) | |
| 720 }, | |
| 721 { | |
| 722 train(trainX, trainY, modName, | |
| 723 trControl = ctlObj, | |
| 724 metric = optMetric, | |
| 725 tuneLength = gridSize) | |
| 726 }) | |
| 727 | |
| 728 @ | |
| 729 | |
| 730 <<modelDescr, echo = $MODELDESCRECHO, results = $MODELDESCRRESULT>>= | |
| 731 | |
| 732 | |
| 733 summaryText <- "" | |
| 734 | |
| 735 resampleName <- switch(tolower(modelFit$control$method), | |
| 736 boot = paste("the bootstrap (", length(modelFit$control$index), " reps)", sep = ""), | |
| 737 boot632 = paste("the bootstrap 632 rule (", length(modelFit$control$index), " reps)", sep = ""), | |
| 738 cv = paste("cross-validation (", modelFit$control$number, " fold)", sep = ""), | |
| 739 repeatedcv = paste("cross-validation (", modelFit$control$number, " fold, repeated ", | |
| 740 modelFit$control$repeats, " times)", sep = ""), | |
| 741 lgocv = paste("repeated train/test splits (", length(modelFit$control$index), " reps, ", | |
| 742 round(modelFit$control$p, 2), "$\\%$)", sep = "")) | |
| 743 | |
| 744 tuneVars <- latexTranslate(tolower(modelInfo$label)) | |
| 745 tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE) | |
| 746 if(ncol(modelFit$bestTune) == 1 && colnames(modelFit$bestTune) == ".parameter") | |
| 747 { | |
| 748 summaryText <- paste(summaryText, | |
| 749 "\n\n", | |
| 750 "There are no tuning parameters associated with this model.", | |
| 751 "To characterize the model performance on the training set,", | |
| 752 resampleName, | |
| 753 "was used.", | |
| 754 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}", | |
| 755 "show summaries of the resampling results. ") | |
| 756 | |
| 757 } else { | |
| 758 summaryText <- paste("There", | |
| 759 ifelse(nrow(modelInfo) > 1, "are", "is"), | |
| 760 nrow(modelInfo), | |
| 761 ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"), | |
| 762 "associated with this model:", | |
| 763 listString(tuneVars, period = TRUE)) | |
| 764 | |
| 765 | |
| 766 | |
| 767 paramNames <- gsub(".", "", names(modelFit$bestTune), fixed = TRUE) | |
| 768 for(i in seq(along = paramNames)) | |
| 769 { | |
| 770 check <- modelInfo$parameter %in% paramNames[i] | |
| 771 if(any(check)) | |
| 772 { | |
| 773 paramNames[i] <- modelInfo$label[which(check)] | |
| 774 } | |
| 775 } | |
| 776 | |
| 777 paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE) | |
| 778 ## Check to see if there was only one combination fit | |
| 779 summaryText <- paste(summaryText, | |
| 780 "To choose", | |
| 781 ifelse(nrow(modelInfo) > 1, | |
| 782 "appropriate values of the tuning parameters,", | |
| 783 "an appropriate value of the tuning parameter,"), | |
| 784 resampleName, | |
| 785 "was used to generated a profile of performance across the", | |
| 786 nrow(modelFit$results), | |
| 787 ifelse(nrow(modelInfo) > 1, | |
| 788 "combinations of the tuning parameters.", | |
| 789 "candidate values."), | |
| 790 | |
| 791 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show", | |
| 792 "summaries of the resampling profile. ", "The final model fitted to the entire training set was:", | |
| 793 listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$bestTune[1,]), period = TRUE)) | |
| 794 | |
| 795 } | |
| 796 @ | |
| 797 | |
| 798 \Sexpr{summaryText} | |
| 799 | |
| 800 <<resampTable, echo = $RESAMPTABLEECHO, results = $RESAMPTABLERESULT>>= | |
| 801 | |
| 802 tableData <- modelFit$results | |
| 803 | |
| 804 if(all(modelInfo$parameter == "parameter")) | |
| 805 { | |
| 806 tableData <- tableData[,-1, drop = FALSE] | |
| 807 colNums <- c(length(modelFit$perfNames), length(modelFit$perfNames)) | |
| 808 colLabels <- c("Mean", "Standard Deviation") | |
| 809 constString <- "" | |
| 810 isConst <- NULL | |
| 811 } else { | |
| 812 | |
| 813 isConst <- apply(tableData[, modelInfo$parameter, drop = FALSE], | |
| 814 2, | |
| 815 function(x) length(unique(x)) == 1) | |
| 816 | |
| 817 numParamInTable <- sum(!isConst) | |
| 818 | |
| 819 if(any(isConst)) | |
| 820 { | |
| 821 constParam <- modelInfo$parameter[isConst] | |
| 822 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE] | |
| 823 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE] | |
| 824 constString <- paste("The tuning", | |
| 825 ifelse(sum(isConst) > 1, | |
| 826 "parmeters", | |
| 827 "parameter"), | |
| 828 listString(paste("``", names(constValues), "''", sep = "")), | |
| 829 ifelse(sum(isConst) > 1, | |
| 830 "were", | |
| 831 "was"), | |
| 832 "held constant at", | |
| 833 ifelse(sum(isConst) > 1, | |
| 834 "a value of", | |
| 835 "values of"), | |
| 836 listString(constValues[1,])) | |
| 837 | |
| 838 } else constString <- "" | |
| 839 | |
| 840 cn <- colnames(tableData) | |
| 841 for(i in seq(along = cn)) | |
| 842 { | |
| 843 check <- modelInfo$parameter %in% cn[i] | |
| 844 if(any(check)) | |
| 845 { | |
| 846 cn[i] <- modelInfo$label[which(check)] | |
| 847 } | |
| 848 } | |
| 849 colnames(tableData) <- cn | |
| 850 | |
| 851 colNums <- c(numParamInTable, | |
| 852 length(modelFit$perfNames), | |
| 853 length(modelFit$perfNames)) | |
| 854 colLabels <- c("", "Mean", "Standard Deviation") | |
| 855 } | |
| 856 | |
| 857 colnames(tableData) <- gsub("SD$", "", colnames(tableData)) | |
| 858 colnames(tableData) <- latexTranslate(colnames(tableData)) | |
| 859 rownames(tableData) <- latexTranslate(rownames(tableData)) | |
| 860 | |
| 861 latex(tableData, | |
| 862 rowname = NULL, | |
| 863 file = "", | |
| 864 cgroup = colLabels, | |
| 865 n.cgroup = colNums, | |
| 866 where = "h!", | |
| 867 digits = 4, | |
| 868 longtable = nrow(tableData) > 30, | |
| 869 caption = paste(resampleName, "results from the model fit.", constString), | |
| 870 label = "T:resamps") | |
| 871 @ | |
| 872 | |
| 873 \setkeys{Gin}{ width = 0.9\textwidth} | |
| 874 \begin{figure}[b] | |
| 875 \begin{center} | |
| 876 | |
| 877 <<profilePlot, echo = $PROFILEPLOTECHO, fig = $PROFILEPLOTFIG, width = 8, height = 6>>= | |
| 878 | |
| 879 trellis.par.set(caretTheme(), warn = TRUE) | |
| 880 if(all(modelInfo$parameter == "parameter") | all(isConst) | modName == "nb") | |
| 881 { | |
| 882 resultsPlot <- resampleHist(modelFit) | |
| 883 plotCaption <- paste("Distributions of model performance from the ", | |
| 884 "training set estimated using ", | |
| 885 resampleName) | |
| 886 } else { | |
| 887 if(modName %in% c("svmPoly", "svmRadial", "svmLinear")) | |
| 888 { | |
| 889 resultsPlot <- plot(modelFit, | |
| 890 metric = optMetric, | |
| 891 xTrans = function(x) log10(x)) | |
| 892 resultsPlot <- update(resultsPlot, | |
| 893 type = c("g", "p", "l"), | |
| 894 ylab = paste(optMetric, " (", resampleName, ")", sep = "")) | |
| 895 | |
| 896 } else { | |
| 897 resultsPlot <- plot(modelFit, | |
| 898 metric = optMetric) | |
| 899 resultsPlot <- update(resultsPlot, | |
| 900 type = c("g", "p", "l"), | |
| 901 ylab = paste(optMetric, " (", resampleName, ")", sep = "")) | |
| 902 } | |
| 903 plotCaption <- paste("A plot of the estimates of the", | |
| 904 optMetric, | |
| 905 "values calculated using", | |
| 906 resampleName) | |
| 907 } | |
| 908 print(resultsPlot) | |
| 909 @ | |
| 910 \caption[Performance Plot]{\Sexpr{plotCaption}.} | |
| 911 \label{F:profile} | |
| 912 \end{center} | |
| 913 \end{figure} | |
| 914 | |
| 915 <<stopWorkers, echo = $STOPWORKERSECHO, results = $STOPWORKERSRESULT>>= | |
| 916 ##<<stopWorkers, echo = FALSE, results = hide>>= | |
| 917 if(numWorkers > 1) stopCluster(cl) | |
| 918 @ | |
| 919 | |
| 920 <<testPred, results = $TESTPREDRESULT, echo = $TESTPREDECHO>>= | |
| 921 | |
| 922 | |
| 923 if(pctTrain < 1) | |
| 924 { | |
| 925 cat("\\clearpage\n\\section*{Test Set Results}\n\n") | |
| 926 | |
| 927 testPreds <- extractPrediction(list(fit = modelFit), | |
| 928 testX = testX, testY = testY) | |
| 929 testPreds <- subset(testPreds, dataType == "Test") | |
| 930 values <- modelFit$control$summaryFunction(testPreds) | |
| 931 names(values) <- gsub("RMSE", "root mean squared error", names(values), fixed = TRUE) | |
| 932 names(values) <- gsub("Rsquared", "$R^2$", names(values), fixed = TRUE) | |
| 933 values <- format(values, digits = 3) | |
| 934 | |
| 935 testString <- paste("Based on the test set of", | |
| 936 nrow(testX), | |
| 937 "samples,", | |
| 938 listString(paste(names(values), "was", values), period = TRUE), | |
| 939 " A plot of the observed and predicted outcomes for the test set ", | |
| 940 "is given in Figure \\\\ref{F:obsPred}.") | |
| 941 testString <- paste(testString, | |
| 942 " Using ", resampleName, | |
| 943 ", the training set estimates were ", | |
| 944 resampleStats(modelFit), | |
| 945 ".", | |
| 946 sep = "") | |
| 947 | |
| 948 axisRange <- extendrange(testPreds[, c("obs", "pred")]) | |
| 949 obsPred <- xyplot(obs ~ pred, | |
| 950 data = testPreds, | |
| 951 xlim = axisRange, | |
| 952 ylim = axisRange, | |
| 953 panel = function(x, y) | |
| 954 { | |
| 955 panel.abline(0, 1, col = "darkgrey", lty = 2) | |
| 956 panel.xyplot(x, y, type = c("p", "g")) | |
| 957 panel.loess(x, y, col = "darkred", lwd = 2) | |
| 958 | |
| 959 | |
| 960 }, | |
| 961 ylab = "Observed Response", | |
| 962 xlab = "Predicted Response") | |
| 963 | |
| 964 pdf("obsPred.pdf", height = 8, width = 8) | |
| 965 trellis.par.set(caretTheme()) | |
| 966 print(obsPred) | |
| 967 dev.off() | |
| 968 | |
| 969 } else testString <- "" | |
| 970 @ | |
| 971 \Sexpr{testString} | |
| 972 | |
| 973 | |
| 974 <<classProbsTex, results = $CLASSPROBSTEXRESULT, echo = $CLASSPROBSTEXECHO>>= | |
| 975 | |
| 976 | |
| 977 if(pctTrain < 1) | |
| 978 { | |
| 979 cat( | |
| 980 paste("\\begin{figure}[p]\n", | |
| 981 "\\begin{center}\n", | |
| 982 "\\includegraphics{obsPred}", | |
| 983 "\\caption[Observed V Fitted Values]{", | |
| 984 "The observed and predicted responses. ", | |
| 985 "The grey line is the line of identity while the", | |
| 986 "solid red line is a smoothed trend line.}\n", | |
| 987 "\\label{F:obsPred}\n", | |
| 988 "\\end{center}\n", | |
| 989 "\\end{figure}")) | |
| 990 } | |
| 991 | |
| 992 @ | |
| 993 | |
| 994 \section*{Versions} | |
| 995 | |
| 996 <<versions, echo = FALSE, results = tex>>= | |
| 997 toLatex(sessionInfo()) | |
| 998 | |
| 999 @ | |
| 1000 | |
| 1001 <<save-data, echo = $SAVEDATAECHO, results = $SAVEDATARESULT>>= | |
| 1002 ## change this to the name of modName.... | |
| 1003 Fit<-modelFit | |
| 1004 save(Fit,file="$METHOD-Fit.RData") | |
| 1005 @ | |
| 1006 The model was built using $METHOD and is saved as $METHOD-Fit.RData for reuse. This contains the variable Fit. | |
| 1007 | |
| 1008 | |
| 1009 \end{document}''' | |
| 1010 return template4Rnw |
