comparison caret_future/tool2/templateLibrary.py.orig @ 0:a4a2ad5a214e draft default tip

Uploaded
author deepakjadmin
date Thu, 05 Nov 2015 02:37:56 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a4a2ad5a214e
1 def __template4Rnw():
2
3 template4Rnw = r'''%% Classification Modeling Script
4 %% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com)
5 %% Version: 1.00
6 %% Created on: 2010/10/02
7 %%
8 %% This is an Sweave template for building and describing
9 %% classification models. It mixes R and LaTeX code. The document can
10 %% be processing using R's Sweave function to produce a tex file.
11 %%
12 %% The inputs are:
13 %% - the initial data set in a data frame called 'rawData'
14 %% - a factor column in the data set called 'class'. this should be the
15 %% outcome variable
16 %% - all other columns in rawData should be predictor variables
17 %% - the type of model should be in a variable called 'modName'.
18 %%
19 %% The script attempts to make some intelligent choices based on the
20 %% model being used. For example, if modName is "pls", the script will
21 %% automatically center and scale the predictor data. There are
22 %% situations where these choices can (and should be) changed.
23 %%
24 %% There are other options that may make sense to change. For example,
25 %% the user may want to adjust the type of resampling. To find these
26 %% parts of the script, search on the string 'OPTION'. These parts of
27 %% the code will document the options.
28
29 \documentclass[14pt]{report}
30 \usepackage{amsmath}
31 \usepackage[pdftex]{graphicx}
32 \usepackage{color}
33 \usepackage{ctable}
34 \usepackage{xspace}
35 \usepackage{fancyvrb}
36 \usepackage{fancyhdr}
37 \usepackage{lastpage}
38 \usepackage{longtable}
39 \usepackage{algorithm2e}
40 \usepackage[
41 colorlinks=true,
42 linkcolor=blue,
43 citecolor=blue,
44 urlcolor=blue]
45 {hyperref}
46 \usepackage{lscape}
47 \usepackage{Sweave}
48 \SweaveOpts{keep.source = TRUE}
49
50 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
51
52 % define new colors for use
53 \definecolor{darkgreen}{rgb}{0,0.6,0}
54 \definecolor{darkred}{rgb}{0.6,0.0,0}
55 \definecolor{lightbrown}{rgb}{1,0.9,0.8}
56 \definecolor{brown}{rgb}{0.6,0.3,0.3}
57 \definecolor{darkblue}{rgb}{0,0,0.8}
58 \definecolor{darkmagenta}{rgb}{0.5,0,0.5}
59
60 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
61
62 \newcommand{\bld}[1]{\mbox{\boldmath $$#1$$}}
63 \newcommand{\shell}[1]{\mbox{$$#1$$}}
64 \renewcommand{\vec}[1]{\mbox{\bf {#1}}}
65
66 \newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize}
67 \newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize}
68
69 \newcommand{\halfs}{\frac{1}{2}}
70
71 \setlength{\oddsidemargin}{-.25 truein}
72 \setlength{\evensidemargin}{0truein}
73 \setlength{\topmargin}{-0.2truein}
74 \setlength{\textwidth}{7 truein}
75 \setlength{\textheight}{8.5 truein}
76 \setlength{\parindent}{0.20truein}
77 \setlength{\parskip}{0.10truein}
78
79 \setcounter{LTchunksize}{50}
80
81 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
82 \pagestyle{fancy}
83 \lhead{}
84 %% OPTION Report header name
85 \chead{Classification Model Script}
86 \rhead{}
87 \lfoot{}
88 \cfoot{}
89 \rfoot{\thepage\ of \pageref{LastPage}}
90 \renewcommand{\headrulewidth}{1pt}
91 \renewcommand{\footrulewidth}{1pt}
92 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
93
94 %% OPTION Report title and modeler name
95 \title{Classification Model Script using $METHOD}
96 \author{"Lynn Group with M. Kuhn, SCIS, JNU, New Delhi"}
97
98 \begin{document}
99
100 \maketitle
101
102 \thispagestyle{empty}
103 <<dummy, eval=TRUE, echo=FALSE, results=hide>>=
104 # sets values for variables used later in the program to prevent the \Sexpr error on parsing with Sweave
105 numSamples=''
106 classDistString=''
107 missingText=''
108 numPredictors=''
109 numPCAcomp=''
110 pcaText=''
111 nzvText=''
112 corrText=''
113 ppText=''
114 varText=''
115 splitText="Dummy Text"
116 nirText="Dummy Text"
117 # pctTrain is a variable that is initialised in Data splitting, and reused later in testPred
118 pctTrain=0.8
119 Smpling=''
120 nzvText1=''
121 classDistString1=''
122 dwnsmpl=''
123 upsmpl=''
124
125 @
126 <<startup, eval= TRUE, results = hide, echo = FALSE>>=
127 library(Hmisc)
128 library(caret)
129 library(pROC)
130 versionTest <- compareVersion(packageDescription("caret")$$Version,
131 "4.65")
132 if(versionTest < 0) stop("caret version 4.65 or later is required")
133
134 library(RColorBrewer)
135
136
137 listString <- function (x, period = FALSE, verbose = FALSE)
138 {
139 if (verbose) cat("\n entering listString\n")
140 flush.console()
141 if (!is.character(x))
142 x <- as.character(x)
143 numElements <- length(x)
144 out <- if (length(x) > 0) {
145 switch(min(numElements, 3), x, paste(x, collapse = " and "),
146 {
147 x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "")
148 paste(x, collapse = " ")
149 })
150 }
151 else ""
152 if (period) out <- paste(out, ".", sep = "")
153 if (verbose) cat(" leaving listString\n\n")
154 flush.console()
155 out
156 }
157
158 resampleStats <- function(x, digits = 3)
159 {
160 bestPerf <- x$$bestTune
161 colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf))
162 out <- merge(x$$results, bestPerf)
163 out <- out[, colnames(out) %in% x$$perfNames]
164 names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE)
165 names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE)
166 names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE)
167 names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE)
168 names(out) <- gsub("Kappa", "Kappa statistics", names(out), fixed = TRUE)
169
170 out <- format(out, digits = digits)
171 listString(paste(names(out), "was", out))
172 }
173
174 twoClassNoProbs <- function (data, lev = NULL, model = NULL)
175 {
176 out <- c(sensitivity(data[, "pred"], data[, "obs"], lev[1]),
177 specificity(data[, "pred"], data[, "obs"], lev[2]),
178 confusionMatrix(data[, "pred"], data[, "obs"])$$overall["Kappa"])
179
180 names(out) <- c("Sens", "Spec", "Kappa")
181 out
182 }
183
184
185
186 ##OPTION: model name: see ?train for more values/models
187 modName <- "$METHOD"
188
189
190 load("$RDATA")
191 rawData <- dataX
192 rawData$$outcome <- dataY
193
194 @
195
196
197 \section*{Data Sets}\label{S:data}
198
199 %% OPTION: provide some background on the problem, the experimental
200 %% data, how the compounds were selected etc
201
202 <<getDataInfo, eval = $GETDATAINFOEVAL, echo = $GETDATAINFOECHO, results = $GETDATAINFORESULT>>=
203 if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set")
204 if(!is.factor(rawData$$outcome)) stop("the outcome should be a factor vector")
205
206 ## OPTION: when there are only two classes, the first level of the
207 ## factor is used as the "positive" or "event" for calculating
208 ## sensitivity and specificity. Adjust the outcome factor accordingly.
209 numClasses <- length(levels(rawData$$outcome))
210 numSamples <- nrow(rawData)
211 numPredictors <- ncol(rawData) - 1
212 predictorNames <- names(rawData)[names(rawData) != "outcome"]
213
214 isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
215 if(any(!isNum)) stop("all predictors in rawData should be numeric")
216
217 classTextCheck <- all.equal(levels(rawData$$outcome), make.names(levels(rawData$$outcome)))
218 if(!classTextCheck) warning("the class levels are not valid R variable names; this may cause errors")
219
220 ## Get the class distribution
221 classDist <- table(rawData$$outcome)
222 classDistString <- paste("``",
223 names(classDist),
224 "'' ($$n$$=",
225 classDist,
226 ")",
227 sep = "")
228 classDistString <- listString(classDistString)
229 @
230
231 <<missingFilter, eval = $MISSINGFILTEREVAL, echo = $MISSINGFILTERECHO, results = $MISSINGFILTERRESULT>>=
232 colRate <- apply(rawData[, predictorNames, drop = FALSE],
233 2, function(x) mean(is.na(x)))
234
235 ##OPTION thresholds can be changed
236 colExclude <- colRate > $MISSINGFILTERTHRESHC
237
238 missingText <- ""
239
240 if(any(colExclude))
241 {
242 missingText <- paste(missingText,
243 ifelse(sum(colExclude) > 1,
244 " There were ",
245 " There was "),
246 sum(colExclude),
247 ifelse(sum(colExclude) > 1,
248 " predictors ",
249 " predictor "),
250 "with an excessive number of ",
251 "missing data. ",
252 ifelse(sum(colExclude) > 1,
253 " These were excluded. ",
254 " This was excluded. "))
255 predictorNames <- predictorNames[!colExclude]
256 rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE]
257 }
258
259
260 rowRate <- apply(rawData[, predictorNames, drop = FALSE],
261 1, function(x) mean(is.na(x)))
262
263 rowExclude <- rowRate > $MISSINGFILTERTHRESHR
264
265
266 if(any(rowExclude)) {
267 missingText <- paste(missingText,
268 ifelse(sum(rowExclude) > 1,
269 " There were ",
270 " There was "),
271 sum(colExclude),
272 ifelse(sum(rowExclude) > 1,
273 " samples ",
274 " sample "),
275 "with an excessive number of ",
276 "missing data. ",
277 ifelse(sum(rowExclude) > 1,
278 " These were excluded. ",
279 " This was excluded. "),
280 "After filtering, ",
281 sum(!rowExclude),
282 " samples remained.")
283 rawData <- rawData[!rowExclude, ]
284 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
285 1, function(x) mean(is.na(x)))
286 } else {
287 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
288 1, function(x) any(is.na(x)))
289 missingText <- paste(missingText,
290 ifelse(missingText == "",
291 "There ",
292 "Subsequently, there "),
293 ifelse(sum(hasMissing) == 1,
294 "was ",
295 "were "),
296 ifelse(sum(hasMissing) > 0,
297 sum(hasMissing),
298 "no"),
299 ifelse(sum(hasMissing) == 1,
300 "sample ",
301 "samples "),
302 "with missing values.")
303
304 rawData <- rawData[complete.cases(rawData),]
305
306 }
307
308 rawData1 <- rawData[,1:length(rawData)-1]
309 rawData2 <- rawData[,length(rawData)]
310
311 set.seed(222)
312 nzv1 <- nearZeroVar(rawData1)
313 if(length(nzv1) > 0)
314 {
315 nzvVars1 <- names(rawData1)[nzv1]
316 rawData <- rawData1[, -nzv1]
317 rawData$outcome <- rawData2
318 nzvText1 <- paste("There were ",
319 length(nzv1),
320 " predictors that were removed from original data due to",
321 " severely unbalanced distributions that",
322 " could negatively affect the model fit",
323 ifelse(length(nzv1) > 10,
324 ".",
325 paste(": ",
326 listString(nzvVars1),
327 ".",
328 sep = "")),
329 sep = "")
330
331 } else {
332 rawData <- rawData1
333 rawData$outcome <- rawData2
334 nzvText1 <- ""
335
336 }
337
338 remove("rawData1")
339 remove("rawData2")
340
341 @
342
343 The initial data set consisted of \Sexpr{numSamples} samples and
344 \Sexpr{numPredictors} predictor variables. The breakdown of the
345 outcome data classes were: \Sexpr{classDistString}.
346
347 \Sexpr{missingText}
348
349 \Sexpr{nzvText1}
350
351 <<pca, eval= $PCAEVAL, echo = $PCAECHO, results = $PCARESULT>>=
352
353 predictorNames <- names(rawData)[names(rawData) != "outcome"]
354 numPredictors <- length(predictorNames)
355 predictors <- rawData[, predictorNames, drop = FALSE]
356 ## PCA will fail with predictors having less than 2 unique values
357 isZeroVar <- apply(predictors, 2,
358 function(x) length(unique(x)) < 2)
359 if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE]
360 ## For whatever, only the formula interface to prcomp
361 ## handles missing values
362 pcaForm <- as.formula(
363 paste("~",
364 paste(names(predictors), collapse = "+")))
365 pca <- prcomp(pcaForm,
366 data = predictors,
367 center = TRUE,
368 scale. = TRUE,
369 na.action = na.omit)
370 ## OPTION: the number of components plotted/discussed can be set
371 numPCAcomp <- $PCACOMP
372 pctVar <- pca$$sdev^2/sum(pca$$sdev^2)*100
373 pcaText <- paste(round(pctVar[1:numPCAcomp], 1),
374 "\\\\%",
375 sep = "")
376 pcaText <- listString(pcaText)
377 @
378
379 To get an initial assessment of the separability of the classes,
380 principal component analysis (PCA) was used to distill the
381 \Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp}
382 surrogate variables (i.e. the principal components) in a manner that
383 attempts to maximize the amount of information preserved from the
384 original predictor set. Figure \ref{F:inititalPCA} contains plots of
385 the first \Sexpr{numPCAcomp} components, which accounted for
386 \Sexpr{pcaText} percent of the variability in the original predictors
387 (respectively).
388
389
390 %% OPTION: remark on how well (or poorly) the data separated
391
392 \setkeys{Gin}{width = 0.8\textwidth}
393 \begin{figure}[p]
394 \begin{center}
395
396 <<pcaPlot, eval = $PCAPLOTEVAL, echo = $PCAPLOTECHO, results = $PCAPLOTRESULT, fig = $PCAPLOTFIG, width = 8, height = 8>>=
397 trellis.par.set(caretTheme(), warn = TRUE)
398 if(numPCAcomp == 2)
399 {
400 axisRange <- extendrange(pca$$x[, 1:2])
401 print(
402 xyplot(PC1 ~ PC2,
403 data = as.data.frame(pca$$x),
404 type = c("p", "g"),
405 groups = rawData$$outcome,
406 auto.key = list(columns = 2),
407 xlim = axisRange,
408 ylim = axisRange))
409 } else {
410 axisRange <- extendrange(pca$$x[, 1:numPCAcomp])
411 print(
412 splom(~as.data.frame(pca$$x)[, 1:numPCAcomp],
413 type = c("p", "g"),
414 groups = rawData$$outcome,
415 auto.key = list(columns = 2),
416 as.table = TRUE,
417 prepanel.limits = function(x) axisRange
418 ))
419
420 }
421
422 @
423
424 \caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp}
425 principal components for the original data set.}
426 \label{F:inititalPCA}
427 \end{center}
428 \end{figure}
429
430
431
432 <<initialDataSplit, eval = $INITIALDATASPLITEVAL, echo = $INITIALDATASPLITECHO, results = $INITIALDATASPLITRESULT>>=
433
434 ## OPTION: in small samples sizes, you may not want to set aside a
435 ## training set and focus on the resampling results.
436
437 set.seed(1234)
438 dataX <- rawData[,1:length(rawData)-1]
439 dataY <- rawData[,length(rawData)]
440
441 Smpling <- "$SAAMPLING"
442
443 if(Smpling=="downsampling")
444 {
445 dwnsmpl <- downSample(dataX,dataY)
446 rawData <- dwnsmpl[,1:length(dwnsmpl)-1]
447 rawData$outcome <- dwnsmpl[,length(dwnsmpl)]
448 remove("dwnsmpl")
449 remove("dataX")
450 remove("dataY")
451 }else if(Smpling=="upsampling"){
452 upsmpl <- upSample(dataX,dataY)
453 rawData <- upsmpl[,1:length(upsmpl)-1]
454 rawData$outcome <- upsmpl[,length(upsmpl)]
455 remove("upsmpl")
456 remove("dataX")
457 remove("dataY")
458 }else{remove("dataX")
459 remove("dataY")
460 }
461
462
463
464 numSamples <- nrow(rawData)
465
466 predictorNames <- names(rawData)[names(rawData) != "outcome"]
467 numPredictors <- length(predictorNames)
468
469
470 classDist1 <- table(rawData$outcome)
471 classDistString1 <- paste("``",
472 names(classDist1),
473 "'' ($n$=",
474 classDist1,
475 ")",
476 sep = "")
477 classDistString1 <- listString(classDistString1)
478
479 pctTrain <- $PERCENT
480
481 if(pctTrain < 1)
482 {
483 ## OPTION: seed number can be changed
484 set.seed(1)
485 inTrain <- createDataPartition(rawData$$outcome,
486 p = pctTrain,
487 list = FALSE)
488 trainX <- rawData[ inTrain, predictorNames]
489 testX <- rawData[-inTrain, predictorNames]
490 trainY <- rawData[ inTrain, "outcome"]
491 testY <- rawData[-inTrain, "outcome"]
492 splitText <- paste("The original data were split into ",
493 "a training set ($$n$$=",
494 nrow(trainX),
495 ") and a test set ($$n$$=",
496 nrow(testX),
497 ") in a manner that preserved the ",
498 "distribution of the classes.",
499 sep = "")
500 isZeroVar <- apply(trainX, 2,
501 function(x) length(unique(x)) < 2)
502 if(any(isZeroVar))
503 {
504 trainX <- trainX[, !isZeroVar, drop = FALSE]
505 testX <- testX[, !isZeroVar, drop = FALSE]
506 }
507
508 } else {
509 trainX <- rawData[, predictorNames]
510 testX <- NULL
511 trainY <- rawData[, "outcome"]
512 testY <- NULL
513 splitText <- "The entire data set was used as the training set."
514 }
515 trainDist <- table(trainY)
516 nir <- max(trainDist)/length(trainY)*100
517 niClass <- names(trainDist)[which.max(trainDist)]
518 nirText <- paste("The non--information rate is the accuracy that can be ",
519 "achieved by predicting all samples using the most ",
520 "dominant class. For these data, the rate is ",
521 round(nir, 2), "\\\\% using the ``",
522 niClass,
523 "'' class.",
524 sep = "")
525
526 remove("rawData")
527
528 @
529
530 \Sexpr{splitText}
531
532 \Sexpr{nirText}
533
534 The data set for model building consisted of \Sexpr{numSamples} samples and
535 \Sexpr{numPredictors} predictor variables. The breakdown of the
536 outcome data classes were: \Sexpr{classDistString1}.
537
538 <<nzv, eval= $NZVEVAL, results = $NZVRESULT, echo = $NZVECHO>>=
539 ## OPTION: other pre-processing steps can be used
540 ppSteps <- caret:::suggestions(modName)
541
542 set.seed(2)
543 if(ppSteps["nzv"])
544 {
545 nzv <- nearZeroVar(trainX)
546 if(length(nzv) > 0)
547 {
548 nzvVars <- names(trainX)[nzv]
549 trainX <- trainX[, -nzv]
550 nzvText <- paste("There were ",
551 length(nzv),
552 " predictors that were removed from train set due to",
553 " severely unbalanced distributions that",
554 " could negatively affect the model",
555 ifelse(length(nzv) > 10,
556 ".",
557 paste(": ",
558 listString(nzvVars),
559 ".",
560 sep = "")),
561 sep = "")
562 testX <- testX[, -nzv]
563 } else nzvText <- ""
564 } else nzvText <- ""
565 @
566
567 \Sexpr{nzvText}
568
569
570 <<corrFilter, eval = $CORRFILTEREVAL, results = $CORRFILTERRESULT, echo = $CORRFILTERECHO>>=
571 if(ppSteps["corr"])
572 {
573 ## OPTION:
574 corrThresh <- $THRESHHOLDCOR
575 highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"),
576 corrThresh)
577 if(length(highCorr) > 0)
578 {
579 corrVars <- names(trainX)[highCorr]
580 trainX <- trainX[, -highCorr]
581 corrText <- paste("There were ",
582 length(highCorr),
583 " predictors that were removed due to",
584 " large between--predictor correlations that",
585 " could negatively affect the model fit",
586 ifelse(length(highCorr) > 10,
587 ".",
588 paste(": ",
589 listString(highCorr),
590 ".",
591 sep = "")),
592 " Removing these predictors forced",
593 " all pair--wise correlations to be",
594 " less than ",
595 corrThresh,
596 ".",
597 sep = "")
598 testX <- testX[, -highCorr]
599 } else corrText <- "No correlation among data on given threshold"
600 }else corrText <- ""
601 @
602
603 \Sexpr{corrText}
604
605 <<preProc, eval = $PREPROCEVAL, echo = $PREPROCECHO, results = $PREPROCRESULT>>=
606 ppMethods <- NULL
607 if(ppSteps["center"]) ppMethods <- c(ppMethods, "center")
608 if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale")
609 if(any(hasMissing) > 0) ppMethods <- c(ppMethods, "knnImpute")
610 ##OPTION other methods, such as spatial sign, can be added to this list
611
612 if(length(ppMethods) > 0)
613 {
614 ppInfo <- preProcess(trainX, method = ppMethods)
615 trainX <- predict(ppInfo, trainX)
616 if(pctTrain < 1) testX <- predict(ppInfo, testX)
617 ppText <- paste("The following pre--processing methods were",
618 " applied to the training",
619 ifelse(pctTrain < 1, " and test", ""),
620 " data: ",
621 listString(ppMethods),
622 ".",
623 sep = "")
624 ppText <- gsub("center", "mean centering", ppText)
625 ppText <- gsub("scale", "scaling to unit variance", ppText)
626 ppText <- gsub("knnImpute",
627 paste(ppInfo$$k, "--nearest neighbor imputation", sep = ""),
628 ppText)
629 ppText <- gsub("spatialSign", "the spatial sign transformation", ppText)
630 ppText <- gsub("pca", "principal component feature extraction", ppText)
631 ppText <- gsub("ica", "independent component feature extraction", ppText)
632 } else {
633 ppInfo <- NULL
634 ppText <- ""
635 }
636
637 predictorNames <- names(trainX)
638 if(nzvText != "" | corrText != "" | ppText != "")
639 {
640 varText <- paste("After pre--processing, ",
641 ncol(trainX),
642 "predictors remained for modeling.")
643 } else varText <- ""
644
645 @
646
647 \Sexpr{ppText}
648 \Sexpr{varText}
649
650 \clearpage
651
652 \section*{Model Building}
653
654 <<setupWorkers, eval = TRUE, echo = $SETUPWORKERSECHO, results = $SETUPWORKERSRESULT>>=
655 numWorkers <- $NUMWORKERS
656 ##OPTION: turn up numWorkers to use MPI
657 if(numWorkers > 1)
658 {
659 mpiCalcs <- function(X, FUN, ...)
660 {
661 theDots <- list(...)
662 parLapply(theDots$$cl, X, FUN)
663 }
664
665 library(snow)
666 cl <- makeCluster(numWorkers, "MPI")
667 }
668 @
669
670 <<setupResampling, echo = $SETUPRESAMPLINGECHO, results = $SETUPRESAMPLINGRESULT>>=
671 ##OPTION: the resampling options can be changed. See
672 ## ?trainControl for details
673
674 resampName <- "$RESAMPNAME"
675 resampNumber <- $RESAMPLENUMBER
676 numRepeat <- $NUMREPEAT
677 resampP <- $RESAMPLENUMBERPERCENT
678
679 modelInfo <- modelLookup(modName)
680
681 if(numClasses == 2)
682 {
683 foo <- if(any(modelInfo$$probModel)) twoClassSummary else twoClassNoProbs
684 } else foo <- defaultSummary
685
686 set.seed(3)
687 ctlObj <- trainControl(method = resampName,
688 number = resampNumber,
689 repeats = numRepeat,
690 p = resampP,
691 classProbs = any(modelInfo$$probModel),
692 summaryFunction = foo)
693
694
695 ##OPTION select other performance metrics as needed
696 optMetric <- if(numClasses == 2 & any(modelInfo$$probModel)) "ROC" else "Kappa"
697
698 if(numWorkers > 1)
699 {
700 ctlObj$$workers <- numWorkers
701 ctlObj$$computeFunction <- mpiCalcs
702 ctlObj$$computeArgs <- list(cl = cl)
703 }
704 @
705
706 <<setupGrid, results = $SETUPGRIDRESULT, echo = $SETUPGRIDECHO>>=
707 ##OPTION expand or contract these grids as needed (or
708 ## add more models
709
710 gridSize <- $SETUPGRIDSIZE
711
712 if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "lvq", "ctree2", "ctree")) gridSize <- 5
713 if(modName %in% c("earth", "fda")) gridSize <- 7
714 if(modName %in% c("knn", "rocc", "glmboost", "rf", "nodeHarvest")) gridSize <- 10
715
716 if(modName %in% c("nb")) gridSize <- 2
717 if(modName %in% c("pam", "rpart")) gridSize <- 15
718 if(modName %in% c("pls")) gridSize <- min(20, ncol(trainX))
719
720 if(modName == "gbm")
721 {
722 tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2 ,
723 .n.trees = (1:10)*20,
724 .shrinkage = .1)
725 }
726
727 if(modName == "nnet")
728 {
729 tGrid <- expand.grid(.size = -1 + (1:5)*2 ,
730 .decay = c(0, .001, .01, .1))
731 }
732
733 if(modName == "ada")
734 {
735 tGrid <- expand.grid(.maxdepth = 1, .iter = c(100,200,300,400), .nu = 1 )
736
737 }
738
739
740 @
741
742 <<fitModel, results = $FITMODELRESULT, echo = $FITMODELECHO, eval = $FITMODELEVAL>>=
743 ##OPTION alter as needed
744
745 set.seed(4)
746 modelFit <- switch(modName,
747 gbm =
748 {
749 mix <- sample(seq(along = trainY))
750 train(
751 trainX[mix,], trainY[mix], modName,
752 verbose = FALSE,
753 bag.fraction = .9,
754 metric = optMetric,
755 trControl = ctlObj,
756 tuneGrid = tGrid)
757 },
758
759 multinom =
760 {
761 train(
762 trainX, trainY, modName,
763 trace = FALSE,
764 metric = optMetric,
765 maxiter = 1000,
766 MaxNWts = 5000,
767 trControl = ctlObj,
768 tuneLength = gridSize)
769 },
770
771 nnet =
772 {
773 train(
774 trainX, trainY, modName,
775 metric = optMetric,
776 linout = FALSE,
777 trace = FALSE,
778 maxiter = 1000,
779 MaxNWts = 5000,
780 trControl = ctlObj,
781 tuneGrid = tGrid)
782
783 },
784
785 svmRadial =, svmPoly =, svmLinear =
786 {
787 train(
788 trainX, trainY, modName,
789 metric = optMetric,
790 scaled = TRUE,
791 trControl = ctlObj,
792 tuneLength = gridSize)
793 },
794 {
795 train(trainX, trainY, modName,
796 trControl = ctlObj,
797 metric = optMetric,
798 tuneLength = gridSize)
799 })
800
801 @
802
803 <<modelDescr, echo = $MODELDESCRECHO, results = $MODELDESCRRESULT>>=
804 summaryText <- ""
805
806 resampleName <- switch(tolower(modelFit$$control$$method),
807 boot = paste("the bootstrap (", length(modelFit$$control$$index), " reps)", sep = ""),
808 boot632 = paste("the bootstrap 632 rule (", length(modelFit$$control$$index), " reps)", sep = ""),
809 cv = paste("cross-validation (", modelFit$$control$$number, " fold)", sep = ""),
810 repeatedcv = paste("cross-validation (", modelFit$$control$$number, " fold, repeated ",
811 modelFit$$control$$repeats, " times)", sep = ""),
812 lgocv = paste("repeated train/test splits (", length(modelFit$$control$$index), " reps, ",
813 round(modelFit$$control$$p, 2), "$$\\%$$)", sep = ""))
814
815 tuneVars <- latexTranslate(tolower(modelInfo$$label))
816 tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE)
817 if(ncol(modelFit$$bestTune) == 1 && colnames(modelFit$$bestTune) == ".parameter")
818 {
819 summaryText <- paste(summaryText,
820 "\n\n",
821 "There are no tuning parameters associated with this model.",
822 "To characterize the model performance on the training set,",
823 resampleName,
824 "was used.",
825 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}",
826 "show summaries of the resampling results. ")
827
828 } else {
829 summaryText <- paste("There",
830 ifelse(nrow(modelInfo) > 1, "are", "is"),
831 nrow(modelInfo),
832 ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"),
833 "associated with this model:",
834 listString(tuneVars, period = TRUE))
835
836
837
838 paramNames <- gsub(".", "", names(modelFit$$bestTune), fixed = TRUE)
839 for(i in seq(along = paramNames))
840 {
841 check <- modelInfo$$parameter %in% paramNames[i]
842 if(any(check))
843 {
844 paramNames[i] <- modelInfo$$label[which(check)]
845 }
846 }
847
848 paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE)
849 ## Check to see if there was only one combination fit
850 summaryText <- paste(summaryText,
851 "To choose",
852 ifelse(nrow(modelInfo) > 1,
853 "appropriate values of the tuning parameters,",
854 "an appropriate value of the tuning parameter,"),
855 resampleName,
856 "was used to generated a profile of performance across the",
857 nrow(modelFit$$results),
858 ifelse(nrow(modelInfo) > 1,
859 "combinations of the tuning parameters.",
860 "candidate values."),
861
862 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show",
863 "summaries of the resampling profile. ", "The final model fitted to the entire training set was:",
864 listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$$bestTune[1,]), period = TRUE))
865
866 }
867 @
868
869 \Sexpr{summaryText}
870
871 <<resampTable, echo = $RESAMPTABLEECHO, results = $RESAMPTABLERESULT>>=
872 tableData <- modelFit$$results
873
874 if(all(modelInfo$$parameter == "parameter") && resampName == "boot632")
875 {
876 tableData <- tableData[,-1, drop = FALSE]
877 colNums <- c( length(modelFit$perfNames), length(modelFit$perfNames), length(modelFit$perfNames))
878 colLabels <- c("Mean", "Standard Deviation","Apparant")
879 constString <- ""
880 isConst <- NULL
881 }else if (all(modelInfo$$parameter == "parameter") && (resampName == "boot" | resampName == "cv" | resampName = "repeatedcv" )){
882 tableData <- tableData[,-1, drop = FALSE]
883 colNums <- c(length(modelFit$perfNames), length(modelFit$perfNames))
884 colLabels <- c("Mean", "Standard Deviation")
885 constString <- ""
886 isConst <- NULL
887 }else if (all(modelInfo$$parameter == "parameter") && resampName == "LOOCV" ){
888 tableData <- tableData[,-1, drop = FALSE]
889 colNums <- length(modelFit$perfNames)
890 colLabels <- c("Measures")
891 constString <- ""
892 isConst <- NULL
893 }else if (all(modelInfo$$parameter != "parameter") && (resampName == "boot" | resampName == "cv" | resampName = "repeatedcv" )){
894 isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
895 2,
896 function(x) length(unique(x)) == 1)
897
898 numParamInTable <- sum(!isConst)
899
900 if(any(isConst))
901 {
902 constParam <- modelInfo$$parameter[isConst]
903 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
904 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
905 constString <- paste("The tuning",
906 ifelse(sum(isConst) > 1,
907 "parmeters",
908 "parameter"),
909 listString(paste("``", names(constValues), "''", sep = "")),
910 ifelse(sum(isConst) > 1,
911 "were",
912 "was"),
913 "held constant at",
914 ifelse(sum(isConst) > 1,
915 (sum(isConst) > 1,
916 "a value of",
917 "values of"),
918 listString(constValues[1,]))
919
920 } else constString <- ""
921
922 cn <- colnames(tableData)
923 for(i in seq(along = cn))
924 {
925 check <- modelInfo$$parameter %in% cn[i]
926 if(any(check))
927 {
928 cn[i] <- modelInfo$$label[which(check)]
929 }
930 }
931 colnames(tableData) <- cn
932
933 colNums <- c(numParamInTable,
934 length(modelFit$perfNames),
935 length(modelFit$perfNames))
936 colLabels <- c("", "Meaures", "Standard Deviation")
937
938 } else if (all(modelInfo$$parameter != "parameter") && (resampName == "LOOCV" )){
939 isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
940 2,
941 function(x) length(unique(x)) == 1)
942
943 numParamInTable <- sum(!isConst)
944
945 if(any(isConst))
946 {
947 constParam <- modelInfo$$parameter[isConst]
948 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
949 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
950 constString <- paste("The tuning",
951 ifelse(sum(isConst) > 1,
952 "parmeters",
953 "parameter"),
954 listString(paste("``", names(constValues), "''", sep = "")),
955 ifelse(sum(isConst) > 1,
956 "were",
957 "was"),
958 "held constant at",
959 ifelse(sum(isConst) > 1,
960 (sum(isConst) > 1,
961 "a value of",
962 "values of"),
963 listString(constValues[1,]))
964
965 } else constString <- ""
966
967 cn <- colnames(tableData)
968 for(i in seq(along = cn))
969 {
970 check <- modelInfo$$parameter %in% cn[i]
971 if(any(check))
972 {
973 cn[i] <- modelInfo$$label[which(check)]
974 }
975 }
976 colnames(tableData) <- cn
977
978 colNums <- c(numParamInTable,
979 length(modelFit$perfNames))
980 colLabels <- c("", "Measures" )
981
982
983 } else if (all(modelInfo$$parameter != "parameter") && (resampName == "boot632" )) {
984 isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
985 2,
986 function(x) length(unique(x)) == 1)
987
988 numParamInTable <- sum(!isConst)
989
990 if(any(isConst))
991 {
992 constParam <- modelInfo$$parameter[isConst]
993 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
994 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
995 constString <- paste("The tuning",
996 ifelse(sum(isConst) > 1,
997 "parmeters",
998 "parameter"),
999 listString(paste("``", names(constValues), "''", sep = "")),
1000 ifelse(sum(isConst) > 1,
1001 "were",
1002 "was"),
1003 "held constant at",
1004 ifelse(sum(isConst) > 1,
1005 (sum(isConst) > 1,
1006 "a value of",
1007 "values of"),
1008 listString(constValues[1,]))
1009
1010 } else constString <- ""
1011
1012 cn <- colnames(tableData)
1013 for(i in seq(along = cn))
1014 {
1015 check <- modelInfo$$parameter %in% cn[i]
1016 if(any(check))
1017 {
1018 cn[i] <- modelInfo$$label[which(check)]
1019 }
1020 }
1021 colnames(tableData) <- cn
1022
1023 colNums <- c(numParamInTable,
1024 length(modelFit$$perfNames),
1025 length(modelFit$$perfNames),
1026 length(modelFit$$perfNames))
1027 colLabels <- c("", "Mean", "Standard Deviation", "Apparant")
1028 } else {
1029 constString <- paste("you played with wrong parameters in resampling method")
1030 }
1031
1032
1033
1034
1035 colnames(tableData) <- gsub("SD$$", "", colnames(tableData))
1036 colnames(tableData) <- gsub("Apparent$$", "", colnames(tableData))
1037 colnames(tableData) <- latexTranslate(colnames(tableData))
1038 rownames(tableData) <- latexTranslate(rownames(tableData))
1039
1040 latex(tableData,
1041 rowname = NULL,
1042 file = "",
1043 cgroup = colLabels,
1044 n.cgroup = colNums,
1045 where = "h!",
1046 digits = 4,
1047 longtable = nrow(tableData) > 30,
1048 caption = paste(resampleName, "results from the model fit.", constString),
1049 label = "T:resamps")
1050 @
1051
1052 \setkeys{Gin}{ width = 0.9\textwidth}
1053 \begin{figure}[b]
1054 \begin{center}
1055
1056 <<profilePlot, echo = $PROFILEPLOTECHO, fig = $PROFILEPLOTFIG, width = 8, height = 6>>=
1057 trellis.par.set(caretTheme(), warn = TRUE)
1058 if(all(modelInfo$$parameter == "parameter") | all(isConst) | modName == "nb")
1059 {
1060 resultsPlot <- resampleHist(modelFit)
1061 plotCaption <- paste("Distributions of model performance from the ",
1062 "training set estimated using ",
1063 resampleName)
1064 } else {
1065 if(modName %in% c("svmPoly", "svmRadial", "svmLinear"))
1066 {
1067 resultsPlot <- plot(modelFit,
1068 metric = optMetric,
1069 xTrans = function(x) log10(x))
1070 resultsPlot <- update(resultsPlot,
1071 type = c("g", "p", "l"),
1072 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
1073
1074 } else {
1075 resultsPlot <- plot(modelFit,
1076 metric = optMetric)
1077 resultsPlot <- update(resultsPlot,
1078 type = c("g", "p", "l"),
1079 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
1080 }
1081 plotCaption <- paste("A plot of the estimates of the",
1082 optMetric,
1083 "values calculated using",
1084 resampleName)
1085 }
1086 print(resultsPlot)
1087 @
1088 \caption[Performance Plot]{\Sexpr{plotCaption}.}
1089 \label{F:profile}
1090 \end{center}
1091 \end{figure}
1092
1093
1094 <<stopWorkers, echo = $STOPWORKERSECHO, results = $STOPWORKERSRESULT>>=
1095 if(numWorkers > 1) stopCluster(cl)
1096 @
1097
1098 <<testPred, results = $TESTPREDRESULT, echo = $TESTPREDECHO>>=
1099 if(pctTrain < 1)
1100 {
1101 cat("\\clearpage\n\\section*{Test Set Results}\n\n")
1102 classPred <- predict(modelFit, testX)
1103 cm <- confusionMatrix(classPred, testY)
1104 values <- cm$$overall[c("Accuracy", "Kappa", "AccuracyPValue", "McnemarPValue")]
1105
1106 values <- values[!is.na(values) & !is.nan(values)]
1107 values <- c(format(values[1:2], digits = 3),
1108 format.pval(values[-(1:2)], digits = 5))
1109 nms <- c("the overall accuracy", "the Kappa statistic",
1110 "the $$p$$--value that accuracy is greater than the no--information rate",
1111 "the $$p$$--value of concordance from McNemar's test")
1112 nms <- nms[seq(along = values)]
1113 names(values) <- nms
1114
1115 if(any(modelInfo$$probModel))
1116 {
1117 classProbs <- extractProb(list(fit = modelFit),
1118 testX = testX,
1119 testY = testY)
1120 classProbs <- subset(classProbs, dataType == "Test")
1121 if(numClasses == 2)
1122 {
1123 tmp <- twoClassSummary(classProbs, lev = levels(classProbs$$obs))
1124 tmp <- c(format(tmp, digits = 3))
1125 names(tmp) <- c("the sensitivity", "the specificity",
1126 "the area under the ROC curve")
1127 values <- c(values, tmp)
1128
1129 }
1130 probPlot <- plotClassProbs(classProbs)
1131 }
1132 testString <- paste("Based on the test set of",
1133 nrow(testX),
1134 "samples,",
1135 listString(paste(names(values), "was", values), period = TRUE),
1136 "The confusion matrix for the test set is shown in Table",
1137 "\\\\ref{T:cm}.")
1138 testString <- paste(testString,
1139 " Using ", resampleName,
1140 ", the training set estimates were ",
1141 resampleStats(modelFit),
1142 ".",
1143 sep = "")
1144
1145 if(any(modelInfo$$probModel)) testString <- paste(testString,
1146 "Histograms of the class probabilities",
1147 "for the test set samples are shown in",
1148 "Figure \\\\ref{F:probs}",
1149 ifelse(numClasses == 2,
1150 " and the test set ROC curve is in Figure \\\\ref{F:roc}.",
1151 "."))
1152
1153
1154
1155 latex(cm$$table,
1156 title = "",
1157 file = "",
1158 where = "h",
1159 cgroup = "Observed Values",
1160 n.cgroup = numClasses,
1161 caption = "The confusion matrix for the test set",
1162 label = "T:cm")
1163
1164 } else testString <- ""
1165 @
1166 \Sexpr{testString}
1167
1168
1169 <<classProbsTex, results = $CLASSPROBSTEXRESULT, echo = $CLASSPROBSTEXECHO>>=
1170 if(any(modelInfo$$probModel))
1171 {
1172 cat(
1173 paste("\\begin{figure}[p]\n",
1174 "\\begin{center}\n",
1175 "\\includegraphics{classProbs}",
1176 "\\caption[PCA Plot]{Class probabilities",
1177 "for the test set. Each panel contains ",
1178 "separate classes}\n",
1179 "\\label{F:probs}\n",
1180 "\\end{center}\n",
1181 "\\end{figure}"))
1182 }
1183 if(any(modelInfo$$probModel) & numClasses == 2)
1184 {
1185 cat(
1186 paste("\\begin{figure}[p]\n",
1187 "\\begin{center}\n",
1188 "\\includegraphics[clip, width = .8\\textwidth]{roc}",
1189 "\\caption[ROC Plot]{ROC Curve",
1190 "for the test set.}\n",
1191 "\\label{F:roc}\n",
1192 "\\end{center}\n",
1193 "\\end{figure}"))
1194 }
1195 @
1196 <<classProbsTex, results = $CLASSPROBSTEXRESULT1, echo = $CLASSPROBSTEXECHO1 >>=
1197 if(any(modelInfo$$probModel))
1198 {
1199 pdf("classProbs.pdf", height = 7, width = 7)
1200 trellis.par.set(caretTheme(), warn = FALSE)
1201 print(probPlot)
1202 dev.off()
1203 }
1204
1205 if(any(modelInfo$$probModel) & numClasses == 2)
1206 { resPonse<-testY
1207 preDictor<-classProbs[, levels(trainY)[1]]
1208 pdf("roc.pdf", height = 8, width = 8)
1209 # from pROC example at http://web.expasy.org/pROC/screenshots.htm
1210 plot.roc(resPonse, preDictor, # data
1211 percent=TRUE, # show all values in percent
1212 partial.auc=c(100, 90), partial.auc.correct=TRUE, # define a partial AUC (pAUC)
1213 print.auc=TRUE, #display pAUC value on the plot with following options:
1214 print.auc.pattern="Corrected pAUC (100-90%% SP):\n%.1f%%", print.auc.col="#1c61b6",
1215 auc.polygon=TRUE, auc.polygon.col="#1c61b6", # show pAUC as a polygon
1216 max.auc.polygon=TRUE, max.auc.polygon.col="#1c61b622", # also show the 100% polygon
1217 main="Partial AUC (pAUC)")
1218 plot.roc(resPonse, preDictor,
1219 percent=TRUE, add=TRUE, type="n", # add to plot, but don't re-add the ROC itself (useless)
1220 partial.auc=c(100, 90), partial.auc.correct=TRUE,
1221 partial.auc.focus="se", # focus pAUC on the sensitivity
1222 print.auc=TRUE, print.auc.pattern="Corrected pAUC (100-90%% SE):\n%.1f%%", print.auc.col="#008600",
1223 print.auc.y=40, # do not print auc over the previous one
1224 auc.polygon=TRUE, auc.polygon.col="#008600",
1225 max.auc.polygon=TRUE, max.auc.polygon.col="#00860022")
1226 dev.off()
1227 }
1228
1229
1230 @
1231
1232 \section*{Versions}
1233
1234 <<versions, echo = FALSE, results = tex>>=
1235 toLatex(sessionInfo())
1236
1237 @
1238
1239 <<save-data, echo = $SAVEDATAECHO, results = $SAVEDATARESULT>>=
1240 ## change this to the name of modName....
1241 Fit<-modelFit
1242 save(Fit,file="$METHOD-Fit.RData")
1243 @
1244 The model was built using $METHOD and is saved as $METHOD-Fit.RData for reuse. This contains the variable Fit.
1245
1246 \end{document}'''
1247
1248 return template4Rnw