comparison templateLibrary.py @ 0:b82c88293260 draft

Uploaded
author deepakjadmin
date Fri, 22 Jan 2016 14:16:12 -0500
parents
children 251849bed194
comparison
equal deleted inserted replaced
-1:000000000000 0:b82c88293260
1 def __template4Rnw():
2
3 template4Rnw = r'''%% Regression Modeling Script
4 %% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com)
5 %% Version: 1.00
6 %% Created on: 2010/10/02
7 %%
8 %% Lynn Group
9 %% Version: 2.00
10 %% Created on: 2014/11/15
11
12 %% This is an Sweave template for building and describing
13 %% classification models. It mixes R and LaTeX code. The document can
14 %% be processing using R's Sweave function to produce a tex file.
15 %%
16 %% The inputs are:
17 %% - the initial data set in a data frame called 'rawData'
18 %% - a numeric column in the data set called 'outcome'. this should be the
19 %% outcome variable
20 %% - all other columns in rawData should be predictor variables
21 %% - the type of model should be in a variable called 'modName'.
22 %%
23 %% The script attempts to make some intelligent choices based on the
24 %% model being used. For example, if modName is "pls", the script will
25 %% automatically center and scale the predictor data. There are
26 %% situations where these choices can (and should be) changed.
27 %%
28 %% There are other options that may make sense to change. For example,
29 %% the user may want to adjust the type of resampling. To find these
30 %% parts of the script, search on the string 'OPTION'. These parts of
31 %% the code will document the options.
32
33 \documentclass[12pt]{report}
34 \usepackage{amsmath}
35 \usepackage[pdftex]{graphicx}
36 \usepackage{color}
37 \usepackage{ctable}
38 \usepackage{xspace}
39 \usepackage{fancyvrb}
40 \usepackage{fancyhdr}
41 \usepackage{lastpage}
42 \usepackage{longtable}
43 \usepackage{algorithm2e}
44 \usepackage[
45 colorlinks=true,
46 linkcolor=blue,
47 citecolor=blue,
48 urlcolor=blue]
49 {hyperref}
50 \usepackage{lscape}
51
52 \usepackage{Sweave}
53 \SweaveOpts{keep.source = TRUE}
54
55 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
56
57 % define new colors for use
58 \definecolor{darkgreen}{rgb}{0,0.6,0}
59 \definecolor{darkred}{rgb}{0.6,0.0,0}
60 \definecolor{lightbrown}{rgb}{1,0.9,0.8}
61 \definecolor{brown}{rgb}{0.6,0.3,0.3}
62 \definecolor{darkblue}{rgb}{0,0,0.8}
63 \definecolor{darkmagenta}{rgb}{0.5,0,0.5}
64
65 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
66
67 \newcommand{\bld}[1]{\mbox{\boldmath $#1$}}
68 \newcommand{\shell}[1]{\mbox{$#1$}}
69 \renewcommand{\vec}[1]{\mbox{\bf {#1}}}
70
71 \newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize}
72 \newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize}
73
74 \newcommand{\halfs}{\frac{1}{2}}
75
76 \setlength{\oddsidemargin}{-.25 truein}
77 \setlength{\evensidemargin}{0truein}
78 \setlength{\topmargin}{-0.2truein}
79 \setlength{\textwidth}{7 truein}
80 \setlength{\textheight}{8.5 truein}
81 \setlength{\parindent}{0.20truein}
82 \setlength{\parskip}{0.10truein}
83
84 \setcounter{LTchunksize}{50}
85
86 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
87 \pagestyle{fancy}
88 \lhead{}
89 %% OPTION Report header name
90 \chead{Regression Model Script}
91 \rhead{}
92 \lfoot{}
93 \cfoot{}
94 \rfoot{\thepage\ of \pageref{LastPage}}
95 \renewcommand{\headrulewidth}{1pt}
96 \renewcommand{\footrulewidth}{1pt}
97 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
98
99 %% OPTION Report title and modeler name
100 \title{Regression Model Script using $METHOD }
101 \author{"M. Kuhn and Lynn Group, SCIS, JNU, New Delhi"}
102
103 \begin{document}
104
105 \maketitle
106
107 \thispagestyle{empty}
108
109 <<startup, eval= TRUE, results = hide, echo = FALSE>>=
110 library(Hmisc)
111 library(caret)
112 versionTest <- compareVersion(packageDescription("caret")$Version,
113 "4.65")
114 if(versionTest < 0) stop("caret version 4.65 or later is required")
115
116 library(RColorBrewer)
117
118
119 listString <- function (x, period = FALSE, verbose = FALSE)
120 {
121 if (verbose) cat("\n entering listString\n")
122 flush.console()
123 if (!is.character(x))
124 x <- as.character(x)
125 numElements <- length(x)
126 out <- if (length(x) > 0) {
127 switch(min(numElements, 3), x, paste(x, collapse = " and "),
128 {
129 x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "")
130 paste(x, collapse = " ")
131 })
132 }
133 else ""
134 if (period) out <- paste(out, ".", sep = "")
135 if (verbose) cat(" leaving listString\n\n")
136 flush.console()
137 out
138 }
139
140 resampleStats <- function(x, digits = 3)
141 {
142 bestPerf <- x$bestTune
143 colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf))
144 out <- merge(x$results, bestPerf)
145 out <- out[, colnames(out) %in% x$perfNames]
146 names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE)
147 names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE)
148 names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE)
149 names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE)
150 names(out) <- gsub("Kappa", "Kappa statistics", names(out), fixed = TRUE)
151 names(out) <- gsub("RMSE", "root mean squared error", names(out), fixed = TRUE)
152 names(out) <- gsub("Rsquared", "$R^2$", names(out), fixed = TRUE)
153
154 out <- format(out, digits = digits)
155 listString(paste(names(out), "was", out))
156 }
157
158 latticeBubble <- function(x, y, z, offset = .5, splits = 10,
159 pal = colorRampPalette(brewer.pal(9,"YlOrRd")[-(1:2)]),
160 ...)
161 {
162 cexValues <- rank(z)/length(z) + offset
163 splits <- unique(quantile(z, probs = seq(0, 1, length = splits)))
164 splitup <- cut(z, breaks = splits, include.lowest = TRUE)
165 cols <- pal(length(levels(splitup)))
166 colValues <- cols[as.numeric(splitup)]
167 if(is.data.frame(x))
168 {
169 out <- splom(~x, col = colValues, cex = cexValues, ...)
170
171 } else out <- xyplot(y~x, col = colValues, cex = cexValues, ...)
172 out
173
174 }
175
176
177 ##OPTION: model name: see ?train for more values/models
178 modName <- "$METHOD"
179 load("$RDATA")
180 rawData <- dataX
181 rawData$$outcome <- dataY
182
183
184
185 @
186
187
188 \section*{Data Sets}\label{S:data}
189
190 %% OPTION: provide some background on the problem, the experimental
191 %% data, how the compounds were selected etc
192
193
194 <<getDataInfo, eval = $GETDATAINFOEVAL, echo = $GETDATAINFOECHO, results = $GETDATAINFORESULT>>=
195 if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set")
196 if(!is.numeric(rawData$outcome)) stop("the outcome should be a numeric vector")
197
198 numSamples <- nrow(rawData)
199 numPredictors <- ncol(rawData) - 1
200 predictorNames <- names(rawData)[names(rawData) != "outcome"]
201
202 isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
203 if(any(!isNum)) stop("all predictors in rawData should be numeric")
204
205 @
206
207
208 <<missingFilter, eval = $MISSINGFILTEREVAL, echo = $MISSINGFILTERECHO, results = $MISSINGFILTERRESULT>>=
209
210 colRate <- apply(rawData[, predictorNames, drop = FALSE],
211 2, function(x) mean(is.na(x)))
212
213 ##OPTION thresholds can be changed
214 colExclude <- colRate > $MISSINGFILTERTHRESHC
215
216 missingText <- ""
217
218 if(any(colExclude))
219 {
220 missingText <- paste(missingText,
221 ifelse(sum(colExclude) > 1,
222 " There were ",
223 " There was "),
224 sum(colExclude),
225 ifelse(sum(colExclude) > 1,
226 " predictors ",
227 " predictor "),
228 "with an excessive number of ",
229 "missing data. ",
230 ifelse(sum(colExclude) > 1,
231 " These were excluded. ",
232 " This was excluded. "))
233 predictorNames <- predictorNames[!colExclude]
234 rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE]
235 }
236
237
238 rowRate <- apply(rawData[, predictorNames, drop = FALSE],
239 1, function(x) mean(is.na(x)))
240
241 rowExclude <- rowRate > $MISSINGFILTERTHRESHR
242
243
244 if(any(rowExclude))
245 {
246 missingText <- paste(missingText,
247 ifelse(sum(rowExclude) > 1,
248 " There were ",
249 " There was "),
250 sum(colExclude),
251 ifelse(sum(rowExclude) > 1,
252 " samples ",
253 " sample "),
254 "with an excessive number of ",
255 "missing data. ",
256 ifelse(sum(rowExclude) > 1,
257 " These were excluded. ",
258 " This was excluded. "),
259 "After filtering, ",
260 sum(!rowExclude),
261 " samples remained.")
262 rawData <- rawData[!rowExclude, ]
263 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
264 1, function(x) mean(is.na(x)))
265 } else {
266 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
267 1, function(x) any(is.na(x)))
268 missingText <- paste(missingText,
269 ifelse(missingText == "",
270 "There ",
271 "Subsequently, there "),
272 ifelse(sum(hasMissing) == 1,
273 "was ",
274 "were "),
275 ifelse(sum(hasMissing) > 0,
276 sum(hasMissing),
277 "no"),
278 ifelse(sum(hasMissing) == 1,
279 "sample ",
280 "samples "),
281 "with missing values.")
282 rawData <- rawData[complete.cases(rawData),]
283 }
284
285 dataDist <- summary(rawData$outcome)
286 dataSD <- sd(rawData$outcome, na.rm = TRUE)
287 dataText <- paste("The average outcome value was ",
288 dataDist["Mean"],
289 " and a standard deviation of ",
290 dataSD, ". The minimum and maximum values were ",
291 dataDist["Min."], " and ", dataDist["Max."],
292 ", respectively. Figure \\\\ref{F:dens} shows a ",
293 " density plot (i.e. a smooth histogram) of the response.",
294 sep = "")
295
296 rawData1 <- rawData[,1:length(rawData)-1]
297 rawData2 <- rawData[,length(rawData)]
298
299 set.seed(222)
300 nzv1 <- nearZeroVar(rawData1)
301 if(length(nzv1) > 0)
302 {
303 nzvVars1 <- names(rawData1)[nzv1]
304 rawData <- rawData1[,-nzv1]
305 rawData$outcome <- rawData2
306 nzvText1 <- paste("There were ",
307 length(nzv1),
308 " predictors that were removed from original data due to",
309 " severely unbalanced distributions that",
310 " could negatively affect the model fit",
311 ifelse(length(nzv1) > 10,
312 ".",
313 paste(": ",
314 listString(nzvVars1),
315 ".",
316 sep = "")),
317 sep = "")
318
319 } else {
320 rawData <- rawData1
321 rawData$outcome <- rawData2
322 nzvText1 <- ""
323
324 }
325
326 remove("rawData1")
327 remove("rawData2")
328
329 @
330
331 The initial data set consisted of \Sexpr{numSamples} samples and
332 \Sexpr{numPredictors} predictor variables. \Sexpr{dataText} \Sexpr{missingText}
333 \Sexpr{nzvText1}
334
335 \setkeys{Gin}{width = 0.8\textwidth}
336 \begin{figure}[b]
337 \begin{center}
338
339 <<densityplot, echo = FALSE, results = hide, fig = TRUE, width = 8, height = 4.5>>=
340 trellis.par.set(caretTheme(), warn = TRUE)
341 print(densityplot(~rawData$outcome, pch = "|", adjust = 1.25, xlab = ""))
342 @
343 \caption[Data Density]{A density plot of the response. The marks
344 along the $x$--axis show the locations of the data points.}
345 \label{F:dens}
346 \end{center}
347 \end{figure}
348
349
350 <<pca, eval= $PCAEVAL, echo = $PCAECHO, results = $PCARESULT>>=
351 predictorNames <- names(rawData)[names(rawData) != "outcome"]
352 numPredictors <- length(predictorNames)
353
354 predictors <- rawData[, predictorNames, drop = FALSE]
355 ## PCA will fail with predictors having less than 2 unique values
356 isZeroVar <- apply(predictors, 2,
357 function(x) length(unique(x)) < 2)
358 if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE]
359 ## For whatever, only the formula interface to prcomp
360 ## handles missing values
361 pcaForm <- as.formula(
362 paste("~",
363 paste(names(predictors), collapse = "+")))
364 pca <- prcomp(pcaForm,
365 data = predictors,
366 center = TRUE,
367 scale. = TRUE,
368 na.action = na.omit)
369 ## OPTION: the number of components plotted/discussed can be set
370 numPCAcomp <- $PCACOMP
371 pctVar <- pca$sdev^2/sum(pca$sdev^2)*100
372 pcaText <- paste(round(pctVar[1:numPCAcomp], 1),
373 "\\\\%",
374 sep = "")
375 pcaText <- listString(pcaText)
376 @
377
378 To get an initial assessment of the separability of the classes,
379 principal component analysis (PCA) was used to distill the
380 \Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp}
381 surrogate variables (i.e. the principal components) in a manner that
382 attempts to maximize the amount of information preserved from the
383 original predictor set. Figure \ref{F:inititalPCA} contains plots of
384 the first \Sexpr{numPCAcomp} components, which accounted for
385 \Sexpr{pcaText} percent of the variability in the original predictors
386 (respectively).
387
388 %% OPTION: remark on how well (or poorly) the data separated
389
390 \setkeys{Gin}{width = 0.8\textwidth}
391 \begin{figure}[p]
392 \begin{center}
393
394 <<pcaPlot, eval = $PCAPLOTEVAL, echo = $PCAPLOTECHO, results = $PCAPLOTRESULT, fig = $PCAPLOTFIG, width = 8, height = 8>>=
395 trellis.par.set(caretTheme(), warn = TRUE)
396 if(numPCAcomp == 2)
397 {
398 axisRange <- extendrange(pca$x[, 1:2])
399 print(
400 latticeBubble(x = as.data.frame(pca$x)$PC1,
401 y = as.data.frame(pca$x)$PC2,
402 z = rawData$outcome,
403 type = c("p", "g"),
404 xlab = "PC1", ylab = "PC2",
405 xlim = axisRange,
406 ylim = axisRange))
407
408 } else {
409 axisRange <- extendrange(pca$x[, 1:numPCAcomp])
410 print(
411 latticeBubble(x = as.data.frame(pca$x)[,1:numPCAcomp],
412
413 z = rawData$outcome,
414 type = c("p", "g"),
415 xlab = "PC1", ylab = "PC2",
416 xlim = axisRange,
417 ylim = axisRange))
418
419
420 }
421 @
422 \caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp}
423 principal components for the original data set. Smaller, lighter
424 points indicate smaller values of the response while darker,
425 larger points correspond to larger values of the outcome}
426 \label{F:inititalPCA}
427 \end{center}
428 \end{figure}
429
430
431 <<initialDataSplit, eval = $INITIALDATASPLITEVAL, echo = $INITIALDATASPLITECHO, results = $INITIALDATASPLITRESULT>>=
432
433 ## OPTION: in small samples sizes, you may not want to set aside a
434 ## training set and focus on the resampling results.
435 numSamples <- nrow(rawData)
436
437 predictorNames <- names(rawData)[names(rawData) != "outcome"]
438 numPredictors <- length(predictorNames)
439
440 # pctTrain <- .15
441 pctTrain <- $PERCENT
442
443 if(pctTrain < 1)
444 {
445 ## OPTION: seed number can be changed
446 set.seed(1)
447 inTrain <- createDataPartition(rawData$outcome,
448 p = pctTrain,
449 list = FALSE)
450 trainX <- rawData[ inTrain, predictorNames]
451 testX <- rawData[-inTrain, predictorNames]
452 trainY <- rawData[ inTrain, "outcome"]
453 testY <- rawData[-inTrain, "outcome"]
454 splitText <- paste("The original data were split into ",
455 "a training set ($n$=",
456 nrow(trainX),
457 ") and a test set ($n$=",
458 nrow(testX),
459 ") in a manner that preserved the ",
460 "distribution of the response.",
461 sep = "")
462 isZeroVar <- apply(trainX, 2,
463 function(x) length(unique(x)) < 2)
464 if(any(isZeroVar))
465 {
466 trainX <- trainX[, !isZeroVar, drop = FALSE]
467 testX <- testX[, !isZeroVar, drop = FALSE]
468 }
469
470 } else {
471 trainX <- rawData[, predictorNames]
472 testX <- NULL
473 trainY <- rawData[, "outcome"]
474 testY <- NULL
475 splitText <- "The entire data set was used as the training set."
476 }
477
478 remove("rawData")
479
480 @
481
482 \Sexpr{splitText}
483 The data set for model building consisted of \Sexpr{numSamples} samples and
484 \Sexpr{numPredictors} predictor variables.
485
486
487
488 <<nzv, eval= $NZVEVAL, results = $NZVRESULT, echo = $NZVECHO>>=
489 ## OPTION: other pre-processing steps can be used
490 ppSteps <- caret:::suggestions(modName)
491
492 set.seed(2)
493 if(ppSteps["nzv"])
494 {
495 nzv <- nearZeroVar(trainX)
496 if(length(nzv) > 0)
497 {
498 nzvVars <- names(trainX)[nzv]
499 trainX <- trainX[, -nzv]
500 nzvText <- paste("There were ",
501 length(nzv),
502 " predictors that were removed due to",
503 " severely unbalanced distributions that",
504 " could negatively affect the model fit",
505 ifelse(length(nzv) > 10,
506 ".",
507 paste(": ",
508 listString(nzvVars),
509 ".",
510 sep = "")),
511 sep = "")
512 testX <- testX[, -nzv]
513 } else nzvText <- ""
514 } else nzvText <- ""
515 @
516
517 \Sexpr{nzvText}
518
519 <<corrFilter, eval = $CORRFILTEREVAL, results = $CORRFILTERRESULT, echo = $CORRFILTERECHO>>=
520
521 if(ppSteps["corr"])
522 {
523 ## OPTION:
524 ##corrThresh <- .75
525 corrThresh <- $THRESHHOLDCOR
526 highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"),
527 corrThresh)
528 if(length(highCorr) > 0)
529 {
530 corrVars <- names(trainX)[highCorr]
531 trainX <- trainX[, -highCorr]
532 corrText <- paste("There were ",
533 length(highCorr),
534 " predictors that were removed due to",
535 " large between--predictor correlations that",
536 " could negatively affect the model fit",
537 ifelse(length(highCorr) > 10,
538 ".",
539 paste(": ",
540 listString(highCorr),
541 ".",
542 sep = "")),
543 " Removing these predictors forced",
544 " all pair--wise correlations to be",
545 " less than ",
546 corrThresh,
547 ".",
548 sep = "")
549 testX <- testX[, -highCorr]
550 } else corrText <- ""
551 }else corrText <- ""
552 @
553
554 \Sexpr{corrText}
555
556 <<preProc, eval = $PREPROCEVAL, echo = $PREPROCECHO, results = $PREPROCRESULT>>=
557
558 ppMethods <- NULL
559 if(ppSteps["center"]) ppMethods <- c(ppMethods, "center")
560 if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale")
561 if(any(hasMissing) > 0) ppMethods <- c(ppMethods, "knnImpute")
562 ##OPTION other methods, such as spatial sign, can be added to this list
563
564 if(length(ppMethods) > 0)
565 {
566 ppInfo <- preProcess(trainX, method = ppMethods)
567 trainX <- predict(ppInfo, trainX)
568 if(pctTrain < 1) testX <- predict(ppInfo, testX)
569 ppText <- paste("The following pre--processing methods were",
570 " applied to the training",
571 ifelse(pctTrain < 1, " and test", ""),
572 " data: ",
573 listString(ppMethods),
574 ".",
575 sep = "")
576 ppText <- gsub("center", "mean centering", ppText)
577 ppText <- gsub("scale", "scaling to unit variance", ppText)
578 ppText <- gsub("knnImpute",
579 paste(ppInfo$k, "--nearest neighbor imputation", sep = ""),
580 ppText)
581 ppText <- gsub("spatialSign", "the spatial sign transformation", ppText)
582 ppText <- gsub("pca", "principal component feature extraction", ppText)
583 ppText <- gsub("ica", "independent component feature extraction", ppText)
584 } else {
585 ppInfo <- NULL
586 ppText <- ""
587 }
588
589 predictorNames <- names(trainX)
590 if(nzvText != "" | corrText != "" | ppText != "")
591 {
592 varText <- paste("After pre--processing, ",
593 ncol(trainX),
594 "predictors remained for modeling.")
595 } else varText <- ""
596
597 @
598
599 \Sexpr{ppText} \Sexpr{varText}
600
601 \clearpage
602 \section*{Model Building}
603
604 <<setupWorkers, eval = TRUE, echo = $SETUPWORKERSECHO, results = $SETUPWORKERSRESULT>>=
605
606 numWorkers <- $NUMWORKERS
607 ##OPTION: turn up numWorkers to use MPI
608 if(numWorkers > 1)
609 {
610 mpiCalcs <- function(X, FUN, ...)
611 {
612 theDots <- list(...)
613 parLapply(theDots$cl, X, FUN)
614 }
615
616 library(snow)
617 cl <- makeCluster(numWorkers, "MPI")
618 }
619 @
620
621 <<setupResampling, echo = $SETUPRESAMPLINGECHO, results = $SETUPRESAMPLINGRESULT>>=
622 ##<<setupResampling, echo = FALSE, results = hide>>=
623 ##OPTION: the resampling options can be changed. See
624 ## ?trainControl for details
625 resampName <- "repeatedcv"
626 resampNumber <- $RESAMPLENUMBER
627 numRepeat <- 3
628 resampP <- $RESAMPLENUMBERPERCENT
629
630 modelInfo <- modelLookup(modName)
631
632 set.seed(3)
633 ctlObj <- trainControl(method = resampName,
634 number = resampNumber,
635 repeats = numRepeat,
636 p = resampP)
637
638
639 ##OPTION select other performance metrics as needed
640 optMetric <- "RMSE"
641
642 if(numWorkers > 1)
643 {
644 ctlObj$workers <- numWorkers
645 ctlObj$computeFunction <- mpiCalcs
646 ctlObj$computeArgs <- list(cl = cl)
647 }
648 @
649
650 <<setupGrid, results = $SETUPGRIDRESULT, echo = $SETUPGRIDECHO>>=
651
652 ##OPTION expand or contract these grids as needed (or
653 ## add more models
654
655 gridSize <- $SETUPGRIDSIZE
656
657 if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "ctree2", "ctree")) gridSize <- 5
658 if(modName %in% c("earth")) gridSize <- 7
659 if(modName %in% c("knn", "glmboost", "rf", "nodeHarvest")) gridSize <- 10
660
661 if(modName %in% c("rpart")) gridSize <- 15
662 if(modName %in% c("pls", "lars2", "lars")) gridSize <- min(20, ncol(trainX))
663
664 if(modName == "gbm")
665 {
666 tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2 ,
667 .n.trees = (1:10)*20,
668 .shrinkage = .1)
669 }
670
671 if(modName == "nnet")
672 {
673 tGrid <- expand.grid(.size = -1 + (1:5)*2 ,
674 .decay = c(0, .001, .01, .1))
675 }
676
677 @
678
679
680 <<fitModel, results = $FITMODELRESULT, echo = $FITMODELECHO, eval = $FITMODELEVAL>>=
681
682 ##OPTION alter as needed
683
684 set.seed(4)
685 modelFit <- switch(modName,
686 gbm =
687 {
688 mix <- sample(seq(along = trainY))
689 train(
690 trainX[mix,], trainY[mix], modName,
691 verbose = FALSE,
692 bag.fraction = .9,
693 metric = optMetric,
694 trControl = ctlObj,
695 tuneGrid = tGrid)
696 },
697
698 nnet =
699 {
700 train(
701 trainX, trainY, modName,
702 metric = optMetric,
703 linout = TRUE,
704 trace = FALSE,
705 maxiter = 1000,
706 MaxNWts = 5000,
707 trControl = ctlObj,
708 tuneGrid = tGrid)
709
710 },
711
712 svmRadial =, svmPoly =, svmLinear =
713 {
714 train(
715 trainX, trainY, modName,
716 metric = optMetric,
717 scaled = TRUE,
718 trControl = ctlObj,
719 tuneLength = gridSize)
720 },
721 {
722 train(trainX, trainY, modName,
723 trControl = ctlObj,
724 metric = optMetric,
725 tuneLength = gridSize)
726 })
727
728 @
729
730 <<modelDescr, echo = $MODELDESCRECHO, results = $MODELDESCRRESULT>>=
731
732
733 summaryText <- ""
734
735 resampleName <- switch(tolower(modelFit$control$method),
736 boot = paste("the bootstrap (", length(modelFit$control$index), " reps)", sep = ""),
737 boot632 = paste("the bootstrap 632 rule (", length(modelFit$control$index), " reps)", sep = ""),
738 cv = paste("cross-validation (", modelFit$control$number, " fold)", sep = ""),
739 repeatedcv = paste("cross-validation (", modelFit$control$number, " fold, repeated ",
740 modelFit$control$repeats, " times)", sep = ""),
741 lgocv = paste("repeated train/test splits (", length(modelFit$control$index), " reps, ",
742 round(modelFit$control$p, 2), "$\\%$)", sep = ""))
743
744 tuneVars <- latexTranslate(tolower(modelInfo$label))
745 tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE)
746 if(ncol(modelFit$bestTune) == 1 && colnames(modelFit$bestTune) == ".parameter")
747 {
748 summaryText <- paste(summaryText,
749 "\n\n",
750 "There are no tuning parameters associated with this model.",
751 "To characterize the model performance on the training set,",
752 resampleName,
753 "was used.",
754 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}",
755 "show summaries of the resampling results. ")
756
757 } else {
758 summaryText <- paste("There",
759 ifelse(nrow(modelInfo) > 1, "are", "is"),
760 nrow(modelInfo),
761 ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"),
762 "associated with this model:",
763 listString(tuneVars, period = TRUE))
764
765
766
767 paramNames <- gsub(".", "", names(modelFit$bestTune), fixed = TRUE)
768 for(i in seq(along = paramNames))
769 {
770 check <- modelInfo$parameter %in% paramNames[i]
771 if(any(check))
772 {
773 paramNames[i] <- modelInfo$label[which(check)]
774 }
775 }
776
777 paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE)
778 ## Check to see if there was only one combination fit
779 summaryText <- paste(summaryText,
780 "To choose",
781 ifelse(nrow(modelInfo) > 1,
782 "appropriate values of the tuning parameters,",
783 "an appropriate value of the tuning parameter,"),
784 resampleName,
785 "was used to generated a profile of performance across the",
786 nrow(modelFit$results),
787 ifelse(nrow(modelInfo) > 1,
788 "combinations of the tuning parameters.",
789 "candidate values."),
790
791 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show",
792 "summaries of the resampling profile. ", "The final model fitted to the entire training set was:",
793 listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$bestTune[1,]), period = TRUE))
794
795 }
796 @
797
798 \Sexpr{summaryText}
799
800 <<resampTable, echo = $RESAMPTABLEECHO, results = $RESAMPTABLERESULT>>=
801
802 tableData <- modelFit$results
803
804 if(all(modelInfo$parameter == "parameter"))
805 {
806 tableData <- tableData[,-1, drop = FALSE]
807 colNums <- c(length(modelFit$perfNames), length(modelFit$perfNames))
808 colLabels <- c("Mean", "Standard Deviation")
809 constString <- ""
810 isConst <- NULL
811 } else {
812
813 isConst <- apply(tableData[, modelInfo$parameter, drop = FALSE],
814 2,
815 function(x) length(unique(x)) == 1)
816
817 numParamInTable <- sum(!isConst)
818
819 if(any(isConst))
820 {
821 constParam <- modelInfo$parameter[isConst]
822 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
823 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
824 constString <- paste("The tuning",
825 ifelse(sum(isConst) > 1,
826 "parmeters",
827 "parameter"),
828 listString(paste("``", names(constValues), "''", sep = "")),
829 ifelse(sum(isConst) > 1,
830 "were",
831 "was"),
832 "held constant at",
833 ifelse(sum(isConst) > 1,
834 "a value of",
835 "values of"),
836 listString(constValues[1,]))
837
838 } else constString <- ""
839
840 cn <- colnames(tableData)
841 for(i in seq(along = cn))
842 {
843 check <- modelInfo$parameter %in% cn[i]
844 if(any(check))
845 {
846 cn[i] <- modelInfo$label[which(check)]
847 }
848 }
849 colnames(tableData) <- cn
850
851 colNums <- c(numParamInTable,
852 length(modelFit$perfNames),
853 length(modelFit$perfNames))
854 colLabels <- c("", "Mean", "Standard Deviation")
855 }
856
857 colnames(tableData) <- gsub("SD$", "", colnames(tableData))
858 colnames(tableData) <- latexTranslate(colnames(tableData))
859 rownames(tableData) <- latexTranslate(rownames(tableData))
860
861 latex(tableData,
862 rowname = NULL,
863 file = "",
864 cgroup = colLabels,
865 n.cgroup = colNums,
866 where = "h!",
867 digits = 4,
868 longtable = nrow(tableData) > 30,
869 caption = paste(resampleName, "results from the model fit.", constString),
870 label = "T:resamps")
871 @
872
873 \setkeys{Gin}{ width = 0.9\textwidth}
874 \begin{figure}[b]
875 \begin{center}
876
877 <<profilePlot, echo = $PROFILEPLOTECHO, fig = $PROFILEPLOTFIG, width = 8, height = 6>>=
878
879 trellis.par.set(caretTheme(), warn = TRUE)
880 if(all(modelInfo$parameter == "parameter") | all(isConst) | modName == "nb")
881 {
882 resultsPlot <- resampleHist(modelFit)
883 plotCaption <- paste("Distributions of model performance from the ",
884 "training set estimated using ",
885 resampleName)
886 } else {
887 if(modName %in% c("svmPoly", "svmRadial", "svmLinear"))
888 {
889 resultsPlot <- plot(modelFit,
890 metric = optMetric,
891 xTrans = function(x) log10(x))
892 resultsPlot <- update(resultsPlot,
893 type = c("g", "p", "l"),
894 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
895
896 } else {
897 resultsPlot <- plot(modelFit,
898 metric = optMetric)
899 resultsPlot <- update(resultsPlot,
900 type = c("g", "p", "l"),
901 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
902 }
903 plotCaption <- paste("A plot of the estimates of the",
904 optMetric,
905 "values calculated using",
906 resampleName)
907 }
908 print(resultsPlot)
909 @
910 \caption[Performance Plot]{\Sexpr{plotCaption}.}
911 \label{F:profile}
912 \end{center}
913 \end{figure}
914
915 <<stopWorkers, echo = $STOPWORKERSECHO, results = $STOPWORKERSRESULT>>=
916 ##<<stopWorkers, echo = FALSE, results = hide>>=
917 if(numWorkers > 1) stopCluster(cl)
918 @
919
920 <<testPred, results = $TESTPREDRESULT, echo = $TESTPREDECHO>>=
921
922
923 if(pctTrain < 1)
924 {
925 cat("\\clearpage\n\\section*{Test Set Results}\n\n")
926
927 testPreds <- extractPrediction(list(fit = modelFit),
928 testX = testX, testY = testY)
929 testPreds <- subset(testPreds, dataType == "Test")
930 values <- modelFit$control$summaryFunction(testPreds)
931 names(values) <- gsub("RMSE", "root mean squared error", names(values), fixed = TRUE)
932 names(values) <- gsub("Rsquared", "$R^2$", names(values), fixed = TRUE)
933 values <- format(values, digits = 3)
934
935 testString <- paste("Based on the test set of",
936 nrow(testX),
937 "samples,",
938 listString(paste(names(values), "was", values), period = TRUE),
939 " A plot of the observed and predicted outcomes for the test set ",
940 "is given in Figure \\\\ref{F:obsPred}.")
941 testString <- paste(testString,
942 " Using ", resampleName,
943 ", the training set estimates were ",
944 resampleStats(modelFit),
945 ".",
946 sep = "")
947
948 axisRange <- extendrange(testPreds[, c("obs", "pred")])
949 obsPred <- xyplot(obs ~ pred,
950 data = testPreds,
951 xlim = axisRange,
952 ylim = axisRange,
953 panel = function(x, y)
954 {
955 panel.abline(0, 1, col = "darkgrey", lty = 2)
956 panel.xyplot(x, y, type = c("p", "g"))
957 panel.loess(x, y, col = "darkred", lwd = 2)
958
959
960 },
961 ylab = "Observed Response",
962 xlab = "Predicted Response")
963
964 pdf("obsPred.pdf", height = 8, width = 8)
965 trellis.par.set(caretTheme())
966 print(obsPred)
967 dev.off()
968
969 } else testString <- ""
970 @
971 \Sexpr{testString}
972
973
974 <<classProbsTex, results = $CLASSPROBSTEXRESULT, echo = $CLASSPROBSTEXECHO>>=
975
976
977 if(pctTrain < 1)
978 {
979 cat(
980 paste("\\begin{figure}[p]\n",
981 "\\begin{center}\n",
982 "\\includegraphics{obsPred}",
983 "\\caption[Observed V Fitted Values]{",
984 "The observed and predicted responses. ",
985 "The grey line is the line of identity while the",
986 "solid red line is a smoothed trend line.}\n",
987 "\\label{F:obsPred}\n",
988 "\\end{center}\n",
989 "\\end{figure}"))
990 }
991
992 @
993
994 \section*{Versions}
995
996 <<versions, echo = FALSE, results = tex>>=
997 toLatex(sessionInfo())
998
999 @
1000
1001 <<save-data, echo = $SAVEDATAECHO, results = $SAVEDATARESULT>>=
1002 ## change this to the name of modName....
1003 Fit<-modelFit
1004 save(Fit,file="$METHOD-Fit.RData")
1005 @
1006 The model was built using $METHOD and is saved as $METHOD-Fit.RData for reuse. This contains the variable Fit.
1007
1008
1009 \end{document}'''
1010 return template4Rnw