templateLibrary.py @ 6:f8c9f83c7abb (draft)

changeset message: Uploaded
author: deepakjadmin
date: Mon, 04 Apr 2016 02:55:52 -0400
children: b9ca220a8090
comparison: 5:1118410b26dc vs 6:f8c9f83c7abb
1 def __template4Rnw():
2
3 template4Rnw = r'''%% Classification Modeling Script
4 %% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com)
5 %% Version: 1.00
6 %% Created on: 2010/10/02
7 %%
8 %% This is an Sweave template for building and describing
9 %% classification models. It mixes R and LaTeX code. The document can
10 %% be processed using R's Sweave function to produce a tex file.
11 %%
12 %% The inputs are:
13 %% - the initial data set in a data frame called 'rawData'
14 %% - a factor column in the data set called 'outcome'. This should be the
15 %% class (outcome) variable
16 %% - all other columns in rawData should be predictor variables
17 %% - the type of model should be in a variable called 'modName'.
18 %%
19 %% The script attempts to make some intelligent choices based on the
20 %% model being used. For example, if modName is "pls", the script will
21 %% automatically center and scale the predictor data. There are
22 %% situations where these choices can (and should) be changed.
23 %%
24 %% There are other options that may make sense to change. For example,
25 %% the user may want to adjust the type of resampling. To find these
26 %% parts of the script, search for the string 'OPTION'. These parts of
27 %% the code document the options.
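%%
%% A minimal sketch of the assumed inputs (names follow the startup chunk
%% below; the object and file names on the right are hypothetical): before
%% Sweave is run, the RData file named by the RDATA placeholder must provide
%% 'dataX' (a data frame of numeric predictors) and 'dataY' (a factor of
%% class labels), e.g.
%%   dataX <- descriptors                # numeric predictor columns only
%%   dataY <- factor(labels)             # outcome as a factor
%%   save(dataX, dataY, file = "input.RData")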
28
29 \documentclass[14pt]{report}
30 \usepackage{amsmath}
31 \usepackage[pdftex]{graphicx}
32 \usepackage{color}
33 \usepackage{ctable}
34 \usepackage{xspace}
35 \usepackage{fancyvrb}
36 \usepackage{fancyhdr}
37 \usepackage{lastpage}
38 \usepackage{longtable}
39 \usepackage{algorithm2e}
40 \usepackage[
41 colorlinks=true,
42 linkcolor=blue,
43 citecolor=blue,
44 urlcolor=blue]
45 {hyperref}
46 \usepackage{lscape}
47 \usepackage{Sweave}
48 \SweaveOpts{keep.source = TRUE}
49
50 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
51
52 % define new colors for use
53 \definecolor{darkgreen}{rgb}{0,0.6,0}
54 \definecolor{darkred}{rgb}{0.6,0.0,0}
55 \definecolor{lightbrown}{rgb}{1,0.9,0.8}
56 \definecolor{brown}{rgb}{0.6,0.3,0.3}
57 \definecolor{darkblue}{rgb}{0,0,0.8}
58 \definecolor{darkmagenta}{rgb}{0.5,0,0.5}
59
60 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
61
62 \newcommand{\bld}[1]{\mbox{\boldmath $$#1$$}}
63 \newcommand{\shell}[1]{\mbox{$$#1$$}}
64 \renewcommand{\vec}[1]{\mbox{\bf {#1}}}
65
66 \newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize}
67 \newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize}
68
69 \newcommand{\halfs}{\frac{1}{2}}
70
71 \setlength{\oddsidemargin}{-.25 truein}
72 \setlength{\evensidemargin}{0truein}
73 \setlength{\topmargin}{-0.2truein}
74 \setlength{\textwidth}{7 truein}
75 \setlength{\textheight}{8.5 truein}
76 \setlength{\parindent}{0.20truein}
77 \setlength{\parskip}{0.10truein}
78
79 \setcounter{LTchunksize}{50}
80
81 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
82 \pagestyle{fancy}
83 \lhead{}
84 %% OPTION Report header name
85 \chead{Classification Model Script}
86 \rhead{}
87 \lfoot{}
88 \cfoot{}
89 \rfoot{\thepage\ of \pageref{LastPage}}
90 \renewcommand{\headrulewidth}{1pt}
91 \renewcommand{\footrulewidth}{1pt}
92 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
93
94 %% OPTION Report title and modeler name
95 \title{Classification Model Script using $METHOD}
96 \author{Lynn Group with M. Kuhn, SCIS, JNU, New Delhi}
97
98 \begin{document}
99
100 \maketitle
101
102 \thispagestyle{empty}
103 <<dummy, eval=TRUE, echo=FALSE, results=hide>>=
104 # set default values for variables referenced in \Sexpr expressions below, so Sweave does not fail if the chunks that define them are not evaluated
105 numSamples=''
106 classDistString=''
107 missingText=''
108 numPredictors=''
109 numPCAcomp=''
110 pcaText=''
111 nzvText=''
112 corrText=''
113 ppText=''
114 varText=''
115 splitText="Dummy Text"
116 nirText="Dummy Text"
117 # pctTrain is a variable that is initialised in Data splitting, and reused later in testPred
118 pctTrain=0.8
119 Smpling=''
120 nzvText1=''
121 classDistString1=''
122 dwnsmpl=''
123 upsmpl=''
124
125 @
126 <<startup, eval= TRUE, results = hide, echo = FALSE>>=
127 library(Hmisc)
128 library(caret)
129 library(pROC)
130 versionTest <- compareVersion(packageDescription("caret")$$Version,
131 "4.65")
132 if(versionTest < 0) stop("caret version 4.65 or later is required")
133
134 library(RColorBrewer)
135
136
137 listString <- function (x, period = FALSE, verbose = FALSE)
138 {
139 if (verbose) cat("\n entering listString\n")
140 flush.console()
141 if (!is.character(x))
142 x <- as.character(x)
143 numElements <- length(x)
144 out <- if (length(x) > 0) {
145 switch(min(numElements, 3), x, paste(x, collapse = " and "),
146 {
147 x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "")
148 paste(x, collapse = " ")
149 })
150 }
151 else ""
152 if (period) out <- paste(out, ".", sep = "")
153 if (verbose) cat(" leaving listString\n\n")
154 flush.console()
155 out
156 }
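## Example (illustrative): listString(c("x1", "x2", "x3")) returns "x1, x2 and x3";
## period = TRUE appends a trailing ".".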
157
158 resampleStats <- function(x, digits = 3)
159 {
160 bestPerf <- x$$bestTune
161 colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf))
162 out <- merge(x$$results, bestPerf)
163 out <- out[, colnames(out) %in% x$$perfNames]
164 names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE)
165 names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE)
166 names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE)
167 names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE)
168 names(out) <- gsub("Kappa", "Kappa statistics", names(out), fixed = TRUE)
169
170 out <- format(out, digits = digits)
171 listString(paste(names(out), "was", out))
172 }
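## Example of the generated text (values are illustrative): "area under the ROC
## curve was 0.872, sensitivity was 0.801 and specificity was 0.764".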
173
174 twoClassNoProbs <- function (data, lev = NULL, model = NULL)
175 {
176 out <- c(sensitivity(data[, "pred"], data[, "obs"], lev[1]),
177 specificity(data[, "pred"], data[, "obs"], lev[2]),
178 confusionMatrix(data[, "pred"], data[, "obs"])$$overall["Kappa"])
179
180 names(out) <- c("Sens", "Spec", "Kappa")
181 out
182 }
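## Used as the summaryFunction in trainControl() below when the chosen model
## cannot produce class probabilities (see the setupResampling chunk).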
183
184
185
186 ##OPTION: model name: see ?train for more values/models
187 modName <- "$METHOD"
188
189
190 load("$RDATA")
191 rawData <- dataX
192 rawData$$outcome <- dataY
193
194 @
195
196
197 \section*{Data Sets}\label{S:data}
198
199 %% OPTION: provide some background on the problem, the experimental
200 %% data, how the compounds were selected etc
201
202 <<getDataInfo, eval = $GETDATAINFOEVAL, echo = $GETDATAINFOECHO, results = $GETDATAINFORESULT>>=
203 if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set")
204 if(!is.factor(rawData$$outcome)) stop("the outcome should be a factor vector")
205
206 ## OPTION: when there are only two classes, the first level of the
207 ## factor is used as the "positive" or "event" for calculating
208 ## sensitivity and specificity. Adjust the outcome factor accordingly.
209 numClasses <- length(levels(rawData$$outcome))
210 numSamples <- nrow(rawData)
211 numPredictors <- ncol(rawData) - 1
212 predictorNames <- names(rawData)[names(rawData) != "outcome"]
213
214 isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
215 if(any(!isNum)) stop("all predictors in rawData should be numeric")
216
217 classTextCheck <- all.equal(levels(rawData$$outcome), make.names(levels(rawData$$outcome)))
218 if(!isTRUE(classTextCheck)) warning("the class levels are not valid R variable names; this may cause errors")
219
220 ## Get the class distribution
221 classDist <- table(rawData$$outcome)
222 classDistString <- paste("``",
223 names(classDist),
224 "'' ($$n$$=",
225 classDist,
226 ")",
227 sep = "")
228 classDistString <- listString(classDistString)
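## classDistString renders in the text below as, e.g.,
## "``active'' ($$n$$=40) and ``inactive'' ($$n$$=60)" (class names and counts are illustrative)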
229 @
230
231 <<missingFilter, eval = $MISSINGFILTEREVAL, echo = $MISSINGFILTERECHO, results = $MISSINGFILTERRESULT>>=
232 colRate <- apply(rawData[, predictorNames, drop = FALSE],
233 2, function(x) mean(is.na(x)))
234
235 ##OPTION thresholds can be changed
236 colExclude <- colRate > $MISSINGFILTERTHRESHC
237
238 missingText <- ""
239
240 if(any(colExclude))
241 {
242 missingText <- paste(missingText,
243 ifelse(sum(colExclude) > 1,
244 " There were ",
245 " There was "),
246 sum(colExclude),
247 ifelse(sum(colExclude) > 1,
248 " predictors ",
249 " predictor "),
250 "with an excessive number of ",
251 "missing data. ",
252 ifelse(sum(colExclude) > 1,
253 " These were excluded. ",
254 " This was excluded. "))
255 predictorNames <- predictorNames[!colExclude]
256 rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE]
257 }
258
259
260 rowRate <- apply(rawData[, predictorNames, drop = FALSE],
261 1, function(x) mean(is.na(x)))
262
263 rowExclude <- rowRate > $MISSINGFILTERTHRESHR
264
265
266 if(any(rowExclude)) {
267 missingText <- paste(missingText,
268 ifelse(sum(rowExclude) > 1,
269 " There were ",
270 " There was "),
271 sum(rowExclude),
272 ifelse(sum(rowExclude) > 1,
273 " samples ",
274 " sample "),
275 "with an excessive number of ",
276 "missing data. ",
277 ifelse(sum(rowExclude) > 1,
278 " These were excluded. ",
279 " This was excluded. "),
280 "After filtering, ",
281 sum(!rowExclude),
282 " samples remained.")
283 rawData <- rawData[!rowExclude, ]
284 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
285 1, function(x) mean(is.na(x)))
286 } else {
287 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
288 1, function(x) any(is.na(x)))
289 missingText <- paste(missingText,
290 ifelse(missingText == "",
291 "There ",
292 "Subsequently, there "),
293 ifelse(sum(hasMissing) == 1,
294 "was ",
295 "were "),
296 ifelse(sum(hasMissing) > 0,
297 sum(hasMissing),
298 "no"),
299 ifelse(sum(hasMissing) == 1,
300 "sample ",
301 "samples "),
302 "with missing values.")
303
304 rawData <- rawData[complete.cases(rawData),]
305
306 }
307
308 rawData1 <- rawData[, 1:(length(rawData)-1)]
309 rawData2 <- rawData[, length(rawData)]
310
311 set.seed(222)
312 nzv1 <- nearZeroVar(rawData1)
313 if(length(nzv1) > 0)
314 {
315 nzvVars1 <- names(rawData1)[nzv1]
316 rawData <- rawData1[, -nzv1]
317 rawData$$outcome <- rawData2
318 nzvText1 <- paste("There were ",
319 length(nzv1),
320 " predictors that were removed from original data due to",
321 " severely unbalanced distributions that",
322 " could negatively affect the model fit",
323 ifelse(length(nzv1) > 10,
324 ".",
325 paste(": ",
326 listString(nzvVars1),
327 ".",
328 sep = "")),
329 sep = "")
330
331 } else {
332 rawData <- rawData1
333 rawData$$outcome <- rawData2
334 nzvText1 <- ""
335
336 }
337
338 remove("rawData1")
339 remove("rawData2")
340
341 @
342
343 The initial data set consisted of \Sexpr{numSamples} samples and
344 \Sexpr{numPredictors} predictor variables. The breakdown of the
345 outcome data classes was: \Sexpr{classDistString}.
346
347 \Sexpr{missingText}
348
349 \Sexpr{nzvText1}
350
351 <<pca, eval= $PCAEVAL, echo = $PCAECHO, results = $PCARESULT>>=
352
353 predictorNames <- names(rawData)[names(rawData) != "outcome"]
354 numPredictors <- length(predictorNames)
355 predictors <- rawData[, predictorNames, drop = FALSE]
356 ## PCA will fail with predictors having less than 2 unique values
357 isZeroVar <- apply(predictors, 2,
358 function(x) length(unique(x)) < 2)
359 if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE]
360 ## For whatever reason, only the formula interface to prcomp
361 ## handles missing values
362 pcaForm <- as.formula(
363 paste("~",
364 paste(names(predictors), collapse = "+")))
365 pca <- prcomp(pcaForm,
366 data = predictors,
367 center = TRUE,
368 scale. = TRUE,
369 na.action = na.omit)
370 ## OPTION: the number of components plotted/discussed can be set
371 numPCAcomp <- $PCACOMP
372 pctVar <- pca$$sdev^2/sum(pca$$sdev^2)*100
373 pcaText <- paste(round(pctVar[1:numPCAcomp], 1),
374 "\\\\%",
375 sep = "")
376 pcaText <- listString(pcaText)
377 @
378
379 To get an initial assessment of the separability of the classes,
380 principal component analysis (PCA) was used to distill the
381 \Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp}
382 surrogate variables (i.e. the principal components) in a manner that
383 attempts to maximize the amount of information preserved from the
384 original predictor set. Figure \ref{F:inititalPCA} contains plots of
385 the first \Sexpr{numPCAcomp} components, which accounted for
386 \Sexpr{pcaText} of the variability in the original predictors,
387 respectively.
388
389
390 %% OPTION: remark on how well (or poorly) the data separated
391
392 \setkeys{Gin}{width = 0.8\textwidth}
393 \begin{figure}[p]
394 \begin{center}
395
396 <<pcaPlot, eval = $PCAPLOTEVAL, echo = $PCAPLOTECHO, results = $PCAPLOTRESULT, fig = $PCAPLOTFIG, width = 8, height = 8>>=
397 trellis.par.set(caretTheme(), warn = TRUE)
398 if(numPCAcomp == 2)
399 {
400 axisRange <- extendrange(pca$$x[, 1:2])
401 print(
402 xyplot(PC1 ~ PC2,
403 data = as.data.frame(pca$$x),
404 type = c("p", "g"),
405 groups = rawData$$outcome,
406 auto.key = list(columns = 2),
407 xlim = axisRange,
408 ylim = axisRange))
409 } else {
410 axisRange <- extendrange(pca$$x[, 1:numPCAcomp])
411 print(
412 splom(~as.data.frame(pca$$x)[, 1:numPCAcomp],
413 type = c("p", "g"),
414 groups = rawData$$outcome,
415 auto.key = list(columns = 2),
416 as.table = TRUE,
417 prepanel.limits = function(x) axisRange
418 ))
419
420 }
421
422 @
423
424 \caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp}
425 principal components for the original data set.}
426 \label{F:inititalPCA}
427 \end{center}
428 \end{figure}
429
430
431
432 <<initialDataSplit, eval = $INITIALDATASPLITEVAL, echo = $INITIALDATASPLITECHO, results = $INITIALDATASPLITRESULT>>=
433
434 ## OPTION: with small sample sizes, you may not want to set aside a
435 ## test set, and may instead focus on the resampling results.
436
437 set.seed(1234)
438 dataX <- rawData[, 1:(length(rawData)-1)]
439 dataY <- rawData[, length(rawData)]
440
441 Smpling <- "$SAAMPLING"
442
443 if(Smpling=="downsampling")
444 {
445 dwnsmpl <- downSample(dataX,dataY)
446 rawData <- dwnsmpl[, 1:(length(dwnsmpl)-1)]
447 rawData$$outcome <- dwnsmpl[, length(dwnsmpl)]
448 remove("dwnsmpl")
449 remove("dataX")
450 remove("dataY")
451 }else if(Smpling=="upsampling"){
452 upsmpl <- upSample(dataX,dataY)
453 rawData <- upsmpl[, 1:(length(upsmpl)-1)]
454 rawData$$outcome <- upsmpl[, length(upsmpl)]
455 remove("upsmpl")
456 remove("dataX")
457 remove("dataY")
458 }else{remove("dataX")
459 remove("dataY")
460 }
461
462
463
464 numSamples <- nrow(rawData)
465
466 predictorNames <- names(rawData)[names(rawData) != "outcome"]
467 numPredictors <- length(predictorNames)
468
469
470 classDist1 <- table(rawData$$outcome)
471 classDistString1 <- paste("``",
472 names(classDist1),
473 "'' ($n$=",
474 classDist1,
475 ")",
476 sep = "")
477 classDistString1 <- listString(classDistString1)
478
479 pctTrain <- $PERCENT
480
481 if(pctTrain < 1)
482 {
483 ## OPTION: seed number can be changed
484 set.seed(1)
485 inTrain <- createDataPartition(rawData$$outcome,
486 p = pctTrain,
487 list = FALSE)
488 trainX <- rawData[ inTrain, predictorNames]
489 testX <- rawData[-inTrain, predictorNames]
490 trainY <- rawData[ inTrain, "outcome"]
491 testY <- rawData[-inTrain, "outcome"]
492 splitText <- paste("The original data were split into ",
493 "a training set ($$n$$=",
494 nrow(trainX),
495 ") and a test set ($$n$$=",
496 nrow(testX),
497 ") in a manner that preserved the ",
498 "distribution of the classes.",
499 sep = "")
500 isZeroVar <- apply(trainX, 2,
501 function(x) length(unique(x)) < 2)
502 if(any(isZeroVar))
503 {
504 trainX <- trainX[, !isZeroVar, drop = FALSE]
505 testX <- testX[, !isZeroVar, drop = FALSE]
506 }
507
508 } else {
509 trainX <- rawData[, predictorNames]
510 testX <- NULL
511 trainY <- rawData[, "outcome"]
512 testY <- NULL
513 splitText <- "The entire data set was used as the training set."
514 }
515 trainDist <- table(trainY)
516 nir <- max(trainDist)/length(trainY)*100
517 niClass <- names(trainDist)[which.max(trainDist)]
518 nirText <- paste("The non--information rate is the accuracy that can be ",
519 "achieved by predicting all samples using the most ",
520 "dominant class. For these data, the rate is ",
521 round(nir, 2), "\\\\% using the ``",
522 niClass,
523 "'' class.",
524 sep = "")
525
526 remove("rawData")
527
528 if((!is.null(testX)) && (!is.null(testY))){
529 save(trainX,trainY,testX,testY,file="datasets.RData")
530 } else {
531 save(trainX,trainY,file="datasets.RData")
532 }
533
534 @
535
536 \Sexpr{splitText}
537
538 \Sexpr{nirText}
539
540 The data set for model building consisted of \Sexpr{numSamples} samples and
541 \Sexpr{numPredictors} predictor variables. The breakdown of the
542 outcome data classes was: \Sexpr{classDistString1}.
543
544 <<nzv, eval= $NZVEVAL, results = $NZVRESULT, echo = $NZVECHO>>=
545 ## OPTION: other pre-processing steps can be used
546 ppSteps <- caret:::suggestions(modName)
547
548 set.seed(2)
549 if(ppSteps["nzv"])
550 {
551 nzv <- nearZeroVar(trainX)
552 if(length(nzv) > 0)
553 {
554 nzvVars <- names(trainX)[nzv]
555 trainX <- trainX[, -nzv]
556 nzvText <- paste("There were ",
557 length(nzv),
558 " predictors that were removed from train set due to",
559 " severely unbalanced distributions that",
560 " could negatively affect the model",
561 ifelse(length(nzv) > 10,
562 ".",
563 paste(": ",
564 listString(nzvVars),
565 ".",
566 sep = "")),
567 sep = "")
568 if(pctTrain < 1) testX <- testX[, -nzv]
569 } else nzvText <- ""
570 } else nzvText <- ""
571 @
572
573 \Sexpr{nzvText}
574
575
576 <<corrFilter, eval = $CORRFILTEREVAL, results = $CORRFILTERRESULT, echo = $CORRFILTERECHO>>=
577 if(ppSteps["corr"])
578 {
579 ## OPTION:
580 corrThresh <- $THRESHHOLDCOR
581 highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"),
582 corrThresh)
583 if(length(highCorr) > 0)
584 {
585 corrVars <- names(trainX)[highCorr]
586 trainX <- trainX[, -highCorr]
587 corrText <- paste("There were ",
588 length(highCorr),
589 " predictors that were removed due to",
590 " large between--predictor correlations that",
591 " could negatively affect the model fit",
592 ifelse(length(highCorr) > 10,
593 ".",
594 paste(": ",
595 listString(corrVars),
596 ".",
597 sep = "")),
598 " Removing these predictors forced",
599 " all pair--wise correlations to be",
600 " less than ",
601 corrThresh,
602 ".",
603 sep = "")
604 if(pctTrain < 1) testX <- testX[, -highCorr]
605 } else corrText <- "No between--predictor correlations exceeded the given threshold."
606 }else corrText <- ""
607 @
608
609 \Sexpr{corrText}
610
611 <<preProc, eval = $PREPROCEVAL, echo = $PREPROCECHO, results = $PREPROCRESULT>>=
612 ppMethods <- NULL
613 if(ppSteps["center"]) ppMethods <- c(ppMethods, "center")
614 if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale")
615 if(any(hasMissing)) ppMethods <- c(ppMethods, "knnImpute")
616 ##OPTION other methods, such as spatial sign, can be added to this list
617
618 if(length(ppMethods) > 0)
619 {
620 ppInfo <- preProcess(trainX, method = ppMethods)
621 trainX <- predict(ppInfo, trainX)
622 if(pctTrain < 1) testX <- predict(ppInfo, testX)
623 ppText <- paste("The following pre--processing methods were",
624 " applied to the training",
625 ifelse(pctTrain < 1, " and test", ""),
626 " data: ",
627 listString(ppMethods),
628 ".",
629 sep = "")
630 ppText <- gsub("center", "mean centering", ppText)
631 ppText <- gsub("scale", "scaling to unit variance", ppText)
632 ppText <- gsub("knnImpute",
633 paste(ppInfo$$k, "--nearest neighbor imputation", sep = ""),
634 ppText)
635 ppText <- gsub("spatialSign", "the spatial sign transformation", ppText)
636 ppText <- gsub("pca", "principal component feature extraction", ppText)
637 ppText <- gsub("ica", "independent component feature extraction", ppText)
638 } else {
639 ppInfo <- NULL
640 ppText <- ""
641 }
642
643 predictorNames <- names(trainX)
644 if(nzvText != "" | corrText != "" | ppText != "")
645 {
646 varText <- paste("After pre--processing,",
647 ncol(trainX),
648 "predictors remained for modeling.")
649 } else varText <- ""
650
651 @
652
653 \Sexpr{ppText}
654 \Sexpr{varText}
655
656 \clearpage
657
658 \section*{Model Building}
659
660 <<setupWorkers, eval = TRUE, echo = $SETUPWORKERSECHO, results = $SETUPWORKERSRESULT>>=
661 numWorkers <- $NUMWORKERS
662 ##OPTION: turn up numWorkers to use MPI
663 if(numWorkers > 1)
664 {
665 mpiCalcs <- function(X, FUN, ...)
666 {
667 theDots <- list(...)
668 parLapply(theDots$$cl, X, FUN)
669 }
670
671 library(snow)
672 cl <- makeCluster(numWorkers, "MPI")
673 }
674 @
675
676 <<setupResampling, echo = $SETUPRESAMPLINGECHO, results = $SETUPRESAMPLINGRESULT>>=
677 ##OPTION: the resampling options can be changed. See
678 ## ?trainControl for details
679
680 resampName <- "$RESAMPNAME"
681 resampNumber <- $RESAMPLENUMBER
682 numRepeat <- $NUMREPEAT
683 resampP <- $RESAMPLENUMBERPERCENT
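## For example, resampName = "repeatedcv" with resampNumber = 10 and numRepeat = 5
## requests 10-fold cross-validation repeated 5 times (values shown are illustrative).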
684
685 modelInfo <- modelLookup(modName)
686
687 if(numClasses == 2)
688 {
689 foo <- if(any(modelInfo$$probModel)) twoClassSummary else twoClassNoProbs
690 } else foo <- defaultSummary
691
692 set.seed(3)
693 ctlObj <- trainControl(method = resampName,
694 number = resampNumber,
695 repeats = numRepeat,
696 p = resampP,
697 classProbs = any(modelInfo$$probModel),
698 summaryFunction = foo)
699
700
701 ##OPTION select other performance metrics as needed
702 optMetric <- if(numClasses == 2 & any(modelInfo$$probModel)) "ROC" else "Kappa"
703
704 if(numWorkers > 1)
705 {
706 ctlObj$$workers <- numWorkers
707 ctlObj$$computeFunction <- mpiCalcs
708 ctlObj$$computeArgs <- list(cl = cl)
709 }
710 @
711
712 <<setupGrid, results = $SETUPGRIDRESULT, echo = $SETUPGRIDECHO>>=
713 ##OPTION expand or contract these grids as needed (or
714 ## add more models
715
716 gridSize <- $SETUPGRIDSIZE
717
718 if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "lvq", "ctree2", "ctree")) gridSize <- 5
719 if(modName %in% c("earth", "fda")) gridSize <- 7
720 if(modName %in% c("knn", "rocc", "glmboost", "rf", "nodeHarvest")) gridSize <- 10
721
722 if(modName %in% c("nb")) gridSize <- 2
723 if(modName %in% c("pam", "rpart")) gridSize <- 15
724 if(modName %in% c("pls")) gridSize <- min(20, ncol(trainX))
725
726 if(modName == "gbm")
727 {
728 tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2 ,
729 .n.trees = (1:10)*20,
730 .shrinkage = .1)
731 }
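## The grid above evaluates 5 x 10 x 1 = 50 candidate gbm models (interaction
## depths 1, 3, 5, 7 and 9; 20 to 200 trees in steps of 20; shrinkage fixed at 0.1).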
732
733 if(modName == "nnet")
734 {
735 tGrid <- expand.grid(.size = -1 + (1:5)*2 ,
736 .decay = c(0, .001, .01, .1))
737 }
738
739 if(modName == "ada")
740 {
741 tGrid <- expand.grid(.maxdepth = 1, .iter = c(100,200,300,400), .nu = 1 )
742
743 }
744
745
746 @
747
748 <<fitModel, results = $FITMODELRESULT, echo = $FITMODELECHO, eval = $FITMODELEVAL>>=
749 ##OPTION alter as needed
750
751 set.seed(4)
752 modelFit <- switch(modName,
753 gbm =
754 {
755 mix <- sample(seq(along = trainY))
756 train(
757 trainX[mix,], trainY[mix], modName,
758 verbose = FALSE,
759 bag.fraction = .9,
760 metric = optMetric,
761 trControl = ctlObj,
762 tuneGrid = tGrid)
763 },
764
765 multinom =
766 {
767 train(
768 trainX, trainY, modName,
769 trace = FALSE,
770 metric = optMetric,
771 maxit = 1000,
772 MaxNWts = 5000,
773 trControl = ctlObj,
774 tuneLength = gridSize)
775 },
776
777 nnet =
778 {
779 train(
780 trainX, trainY, modName,
781 metric = optMetric,
782 linout = FALSE,
783 trace = FALSE,
784 maxit = 1000,
785 MaxNWts = 5000,
786 trControl = ctlObj,
787 tuneGrid = tGrid)
788
789 },
790
791 svmRadial =, svmPoly =, svmLinear =
792 {
793 train(
794 trainX, trainY, modName,
795 metric = optMetric,
796 scaled = TRUE,
797 trControl = ctlObj,
798 tuneLength = gridSize)
799 },
800 {
801 train(trainX, trainY, modName,
802 trControl = ctlObj,
803 metric = optMetric,
804 tuneLength = gridSize)
805 })
806
807 @
808
809 <<modelDescr, echo = $MODELDESCRECHO, results = $MODELDESCRRESULT>>=
810 summaryText <- ""
811
812 resampleName <- switch(tolower(modelFit$$control$$method),
813 boot = paste("the bootstrap (", length(modelFit$$control$$index), " reps)", sep = ""),
814 boot632 = paste("the bootstrap 632 rule (", length(modelFit$$control$$index), " reps)", sep = ""),
815 cv = paste("cross-validation (", modelFit$$control$$number, " fold)", sep = ""),
816 repeatedcv = paste("cross-validation (", modelFit$$control$$number, " fold, repeated ",
817 modelFit$$control$$repeats, " times)", sep = ""),
818 lgocv = paste("repeated train/test splits (", length(modelFit$$control$$index), " reps, ",
819 round(modelFit$$control$$p, 2), "$$\\%$$)", sep = ""))
820
821 tuneVars <- latexTranslate(tolower(modelInfo$$label))
822 tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE)
823 if(ncol(modelFit$$bestTune) == 1 && colnames(modelFit$$bestTune) == ".parameter")
824 {
825 summaryText <- paste(summaryText,
826 "\n\n",
827 "There are no tuning parameters associated with this model.",
828 "To characterize the model performance on the training set,",
829 resampleName,
830 "was used.",
831 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}",
832 "show summaries of the resampling results. ")
833
834 } else {
835 summaryText <- paste("There",
836 ifelse(nrow(modelInfo) > 1, "are", "is"),
837 nrow(modelInfo),
838 ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"),
839 "associated with this model:",
840 listString(tuneVars, period = TRUE))
841
842
843
844 paramNames <- gsub(".", "", names(modelFit$$bestTune), fixed = TRUE)
845 ## (i in seq(along = paramNames))
846 ## {
847 ## check <- modelInfo$$parameter %in% paramNames[i]
848 ## if(any(check))
849 ## {
850 ## paramNames[i] <- modelInfo$$label[which(check)]
851 ## }
852 ## }
853
854 paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE)
855 ## Check to see if there was only one combination fit
856 summaryText <- paste(summaryText,
857 "To choose",
858 ifelse(nrow(modelInfo) > 1,
859 "appropriate values of the tuning parameters,",
860 "an appropriate value of the tuning parameter,"),
861 resampleName,
862 "was used to generated a profile of performance across the",
863 nrow(modelFit$$results),
864 ifelse(nrow(modelInfo) > 1,
865 "combinations of the tuning parameters.",
866 "candidate values."),
867
868 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show",
869 "summaries of the resampling profile. ", "The final model fitted to the entire training set was:",
870 listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$$bestTune[1,]), period = TRUE))
871
872 }
873 @
874
875 \Sexpr{summaryText}
876
877 <<resampTable, echo = $RESAMPTABLEECHO, results = $RESAMPTABLERESULT>>=
878 tableData <- modelFit$$results
879
880 if(all(modelInfo$$parameter == "parameter") && resampName == "boot632")
881 {
882 tableData <- tableData[,-1, drop = FALSE]
883 colNums <- c( length(modelFit$$perfNames), length(modelFit$$perfNames), length(modelFit$$perfNames))
884 colLabels <- c("Mean", "Standard Deviation", "Apparent")
885 constString <- ""
886 isConst <- NULL
887 } else if (all(modelInfo$$parameter == "parameter") && (resampName == "boot" | resampName == "cv" | resampName == "repeatedcv" )){
888 tableData <- tableData[,-1, drop = FALSE]
889 colNums <- c(length(modelFit$$perfNames), length(modelFit$$perfNames))
890 colLabels <- c("Mean", "Standard Deviation")
891 constString <- ""
892 isConst <- NULL
893 } else if (all(modelInfo$$parameter == "parameter") && resampName == "LOOCV" ){
894 tableData <- tableData[,-1, drop = FALSE]
895 colNums <- length(modelFit$$perfNames)
896 colLabels <- c("Measures")
897 constString <- ""
898 isConst <- NULL
899 } else {
900 if (all(modelInfo$$parameter != "parameter") && resampName == "boot632" ){
901 isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
902 2,
903 function(x) length(unique(x)) == 1)
904
905 numParamInTable <- sum(!isConst)
906
907 if(any(isConst))
908 {
909 constParam <- modelInfo$$parameter[isConst]
910 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
911 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
912 constString <- paste("The tuning",
913 ifelse(sum(isConst) > 1,
914 "parmeters",
915 "parameter"),
916 listString(paste("``", names(constValues), "''", sep = "")),
917 ifelse(sum(isConst) > 1,
918 "were",
919 "was"),
920 "held constant at",
921 ifelse(sum(isConst) > 1,
922 "a value of",
923 "values of"),
924 listString(constValues[1,]))
925
926 } else constString <- ""
927
928 cn <- colnames(tableData)
929 ## for(i in seq(along = cn))
930 ## {
931 ## check <- modelInfo$$parameter %in% cn[i]
932 ## if(any(check))
933 ## {
934 ## cn[i] <- modelInfo$$label[which(check)]
935 ## }
936 ## }
937 ## colnames(tableData) <- cn
938
939 colNums <- c(numParamInTable,
940 length(modelFit$$perfNames),
941 length(modelFit$$perfNames),
942 length(modelFit$$perfNames))
943 colLabels <- c("", "Mean", "Standard Deviation", "Apparent")
944
945 }else if (all(modelInfo$$parameter != "parameter") && (resampName == "boot" | resampName == "repeatedcv" | resampName == "cv") ){
946 isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
947 2,
948 function(x) length(unique(x)) == 1)
949
950 numParamInTable <- sum(!isConst)
951
952 if(any(isConst))
953 {
954 constParam <- modelInfo$$parameter[isConst]
955 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
956 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
957 constString <- paste("The tuning",
958 ifelse(sum(isConst) > 1,
959 "parmeters",
960 "parameter"),
961 listString(paste("``", names(constValues), "''", sep = "")),
962 ifelse(sum(isConst) > 1,
963 "were",
964 "was"),
965 "held constant at",
966 ifelse(sum(isConst) > 1,
967 "a value of",
968 "values of"),
969 listString(constValues[1,]))
970
971 } else constString <- ""
972
973 cn <- colnames(tableData)
974 ## for(i in seq(along = cn))
975 ## {
976 ## check <- modelInfo$$parameter %in% cn[i]
977 ## if(any(check))
978 ## {
979 ## cn[i] <- modelInfo$$label[which(check)]
980 ## }
981 ## }
982 ## colnames(tableData) <- cn
983
984 colNums <- c(numParamInTable,
985 length(modelFit$$perfNames),
986 length(modelFit$$perfNames))
987 colLabels <- c("", "Mean", "Standard Deviation")
988
989 }
990 else if (all(modelInfo$$parameter != "parameter") && resampName == "LOOCV"){
991 isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
992 2,
993 function(x) length(unique(x)) == 1)
994
995 numParamInTable <- sum(!isConst)
996
997 if(any(isConst))
998 {
999 constParam <- modelInfo$$parameter[isConst]
1000 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
1001 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
1002 constString <- paste("The tuning",
1003 ifelse(sum(isConst) > 1,
1004 "parmeters",
1005 "parameter"),
1006 listString(paste("``", names(constValues), "''", sep = "")),
1007 ifelse(sum(isConst) > 1,
1008 "were",
1009 "was"),
1010 "held constant at",
1011 ifelse(sum(isConst) > 1,
1012 "a value of",
1013 "values of"),
1014 listString(constValues[1,]))
1015
1016 } else constString <- ""
1017
1018 cn <- colnames(tableData)
1019 ## for(i in seq(along = cn))
1020 ## {
1021 ## check <- modelInfo$$parameter %in% cn[i]
1022 ## if(any(check))
1023 ## {
1024 ## cn[i] <- modelInfo$$label[which(check)]
1025 ## }
1026 ## }
1027 ## colnames(tableData) <- cn
1028
1029 colNums <- c(numParamInTable,
1030 length(modelFit$$perfNames))
1031 colLabels <- c("", "Measures")
1032
1033 }
1034
1035 }
1036
1037
1038
1039 colnames(tableData) <- gsub("SD$$", "", colnames(tableData))
1040 colnames(tableData) <- gsub("Apparent$$", "", colnames(tableData))
1041 colnames(tableData) <- latexTranslate(colnames(tableData))
1042 rownames(tableData) <- latexTranslate(rownames(tableData))
1043
1044 latex(tableData,
1045 rowname = NULL,
1046 file = "",
1047 cgroup = colLabels,
1048 n.cgroup = colNums,
1049 where = "h!",
1050 digits = 4,
1051 longtable = nrow(tableData) > 30,
1052 caption = paste(resampleName, "results from the model fit.", constString),
1053 label = "T:resamps")
1054 @
1055
1056 \setkeys{Gin}{ width = 0.9\textwidth}
1057 \begin{figure}[b]
1058 \begin{center}
1059
1060 <<profilePlot, echo = $PROFILEPLOTECHO, fig = $PROFILEPLOTFIG, width = 8, height = 6>>=
1061 trellis.par.set(caretTheme(), warn = TRUE)
1062 if(all(modelInfo$$parameter == "parameter") | all(isConst) | modName == "nb")
1063 {
1064 resultsPlot <- resampleHist(modelFit)
1065 plotCaption <- paste("Distributions of model performance from the ",
1066 "training set estimated using ",
1067 resampleName)
1068 } else {
1069 if(modName %in% c("svmPoly", "svmRadial", "svmLinear"))
1070 {
1071 resultsPlot <- plot(modelFit,
1072 metric = optMetric,
1073 xTrans = function(x) log10(x))
1074 resultsPlot <- update(resultsPlot,
1075 type = c("g", "p", "l"),
1076 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
1077
1078 } else {
1079 resultsPlot <- plot(modelFit,
1080 metric = optMetric)
1081 resultsPlot <- update(resultsPlot,
1082 type = c("g", "p", "l"),
1083 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
1084 }
1085 plotCaption <- paste("A plot of the estimates of the",
1086 optMetric,
1087 "values calculated using",
1088 resampleName)
1089 }
1090 print(resultsPlot)
1091 @
1092 \caption[Performance Plot]{\Sexpr{plotCaption}.}
1093 \label{F:profile}
1094 \end{center}
1095 \end{figure}
1096
1097
1098 <<stopWorkers, echo = $STOPWORKERSECHO, results = $STOPWORKERSRESULT>>=
1099 if(numWorkers > 1) stopCluster(cl)
1100 @
1101
1102 <<testPred, results = $TESTPREDRESULT, echo = $TESTPREDECHO>>=
1103 if(pctTrain < 1)
1104 {
1105 cat("\\clearpage\n\\section*{Test Set Results}\n\n")
1106 classPred <- predict(modelFit, testX)
1107 cm <- confusionMatrix(classPred, testY)
1108 values <- cm$$overall[c("Accuracy", "Kappa", "AccuracyPValue", "McnemarPValue")]
1109
1110 values <- values[!is.na(values) & !is.nan(values)]
1111 values <- c(format(values[1:2], digits = 3),
1112 format.pval(values[-(1:2)], digits = 5))
1113 nms <- c("the overall accuracy", "the Kappa statistic",
1114 "the $$p$$--value that accuracy is greater than the no--information rate",
1115 "the $$p$$--value of concordance from McNemar's test")
1116 nms <- nms[seq(along = values)]
1117 names(values) <- nms
1118
1119 if(any(modelInfo$$probModel))
1120 {
1121 classProbs <- extractProb(list(fit = modelFit),
1122 testX = testX,
1123 testY = testY)
1124 classProbs <- subset(classProbs, dataType == "Test")
1125 if(numClasses == 2)
1126 {
1127 tmp <- twoClassSummary(classProbs, lev = levels(classProbs$$obs))
1128 tmp <- c(format(tmp, digits = 3))
1129 names(tmp) <- c("the area under the ROC curve", "the sensitivity", "the specificity")
1130
1131 values <- c(values, tmp)
1132
1133 }
1134 probPlot <- plotClassProbs(classProbs)
1135 }
1136 testString <- paste("Based on the test set of",
1137 nrow(testX),
1138 "samples,",
1139 listString(paste(names(values), "was", values), period = TRUE),
1140 "The confusion matrix for the test set is shown in Table",
1141 "\\\\ref{T:cm}.")
1142 testString <- paste(testString,
1143 " Using ", resampleName,
1144 ", the training set estimates were ",
1145 resampleStats(modelFit),
1146 ".",
1147 sep = "")
1148
1149 if(any(modelInfo$$probModel)) testString <- paste(testString,
1150 "Histograms of the class probabilities",
1151 "for the test set samples are shown in",
1152 "Figure \\\\ref{F:probs}",
1153 ifelse(numClasses == 2,
1154 " and the test set ROC curve is in Figure \\\\ref{F:roc}.",
1155 "."))
1156
1157
1158
1159 latex(cm$$table,
1160 title = "",
1161 file = "",
1162 where = "h",
1163 cgroup = "Observed Values",
1164 n.cgroup = numClasses,
1165 caption = "The confusion matrix for the test set",
1166 label = "T:cm")
1167
1168 } else testString <- ""
1169 @
1170 \Sexpr{testString}
1171
1172
1173 <<classProbsTex, results = $CLASSPROBSTEXRESULT, echo = $CLASSPROBSTEXECHO>>=
1174 if(any(modelInfo$$probModel) && pctTrain < 1) {
1175 cat(
1176 paste("\\begin{figure}[p]\n",
1177 "\\begin{center}\n",
1178 "\\includegraphics{classProbs}",
1179 "\\caption[PCA Plot]{Class probabilities",
1180 "for the test set. Each panel contains ",
1181 "separate classes}\n",
1182 "\\label{F:probs}\n",
1183 "\\end{center}\n",
1184 "\\end{figure}"))
1185 }
1186 if(any(modelInfo$$probModel) & numClasses == 2 & pctTrain < 1 )
1187 {
1188 cat(
1189 paste("\\begin{figure}[p]\n",
1190 "\\begin{center}\n",
1191 "\\includegraphics[clip, width = .8\\textwidth]{roc}",
1192 "\\caption[ROC Plot]{ROC Curve",
1193 "for the test set.}\n",
1194 "\\label{F:roc}\n",
1195 "\\end{center}\n",
1196 "\\end{figure}"))
1197 } else {
1198 cat (paste(""))
1199 }
1200
1201 @
1202 <<classProbsPdf, results = $CLASSPROBSTEXRESULT1, echo = $CLASSPROBSTEXECHO1>>=
1203 if(any(modelInfo$$probModel) && pctTrain < 1) {
1204 pdf("classProbs.pdf", height = 7, width = 7)
1205 trellis.par.set(caretTheme(), warn = FALSE)
1206 print(probPlot)
1207 dev.off()
1208 }
1209 if(any(modelInfo$$probModel) & numClasses == 2 & pctTrain < 1) {
1210 resPonse<-testY
1211 preDictor<-classProbs[, levels(trainY)[1]]
1212 pdf("roc.pdf", height = 8, width = 8)
1213 # from pROC example at http://web.expasy.org/pROC/screenshots.htm
1214 plot.roc(resPonse, preDictor, # data
1215 percent=TRUE, # show all values in percent
1216 partial.auc=c(100, 90), partial.auc.correct=TRUE, # define a partial AUC (pAUC)
1217 print.auc=TRUE, #display pAUC value on the plot with following options:
1218 print.auc.pattern="Corrected pAUC (100-90%% SP):\n%.1f%%", print.auc.col="#1c61b6",
1219 auc.polygon=TRUE, auc.polygon.col="#1c61b6", # show pAUC as a polygon
1220 max.auc.polygon=TRUE, max.auc.polygon.col="#1c61b622", # also show the 100% polygon
1221 main="Partial AUC (pAUC)")
1222 plot.roc(resPonse, preDictor,
1223 percent=TRUE, add=TRUE, type="n", # add to plot, but don't re-add the ROC itself (useless)
1224 partial.auc=c(100, 90), partial.auc.correct=TRUE,
1225 partial.auc.focus="se", # focus pAUC on the sensitivity
1226 print.auc=TRUE, print.auc.pattern="Corrected pAUC (100-90%% SE):\n%.1f%%", print.auc.col="#008600",
1227 print.auc.y=40, # do not print auc over the previous one
1228 auc.polygon=TRUE, auc.polygon.col="#008600",
1229 max.auc.polygon=TRUE, max.auc.polygon.col="#00860022")
1230 dev.off()
1231 } else {
1232 cat("")
1233 }
1234
1235 @
1236
1237 \section*{Versions}
1238
1239 <<versions, echo = FALSE, results = tex>>=
1240 toLatex(sessionInfo())
1241
1242 @
1243
1244 <<save-data, echo = $SAVEDATAECHO, results = $SAVEDATARESULT>>=
1245 ## OPTION: the fitted model is saved under the generic name 'Fit'; rename it after modName if desired
1246 Fit<-modelFit
1247 save(Fit,file="$METHOD-Fit.RData")
1248 @
1249 The model was built using $METHOD and saved in the file $METHOD-Fit.RData for reuse. This file contains the variable Fit.
1250
1251 \end{document}'''
1252
1253 return template4Rnw
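
The $$ escapes and the $NAME placeholders in the template suggest it is meant to be filled with Python's string.Template before the result is run through Sweave. A minimal sketch of that assumed workflow (the placeholder values are hypothetical, only a few of the placeholders are shown, and safe_substitute leaves the rest untouched):

from string import Template

# Fill a few of the template's placeholders by way of example; safe_substitute
# leaves any placeholder that is not supplied untouched.
rnw = Template(__template4Rnw())
script = rnw.safe_substitute(METHOD="pls",          # caret model name
                             RDATA="input.RData",   # file providing dataX and dataY
                             PERCENT="0.8",         # fraction of samples used for training
                             SAAMPLING="none")      # or "downsampling" / "upsampling"

# Write the generated Sweave source; it would then be processed in R
# with Sweave("pls_model.Rnw").
with open("pls_model.Rnw", "w") as handle:
    handle.write(script)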