This page contains the test models, the code and the model selection process details for the accompanying project Predicting Barbell Lift Quality from Accelerometer Data

 

Data preprocessing

library(caret); library(dplyr)

trainingSet <- read.csv('pml-training.csv'); testingSet <- read.csv('pml-testing.csv')

# remove irrelevant columns
trainingSet <- trainingSet[, -(1:7)]; testingSet <- testingSet[, -(1:7)]

# remove columns with missing / invalid data
badCols <- colSums(is.na(trainingSet) | trainingSet == '' | trainingSet == '#DIV/0!')
badCols <- badCols[badCols > (dim(trainingSet)[1] * .95)]
library(dplyr)
trainingSet <- trainingSet %>% select(-all_of(names(badCols)))
testingSet <- testingSet %>%  select(-all_of(names(badCols)))

# remove highly correlated columns
library(caret)
collinear <- findCorrelation(cor(trainingSet[, -53]), cutoff=0.95)
trainingSet <- trainingSet[, -collinear]; testingSet <- testingSet[, -collinear]


# factor the output column
trainingSet$classe <- factor(trainingSet$classe)

## divide the data into training and validation sets
set.seed(345325)
inTrain <- createDataPartition(y=trainingSet$classe, p=0.80, list=F)
trainingSet <- trainingSet[inTrain,]; validationSet <- trainingSet[-inTrain, ]

 

Test Models

Random Forests

Random Forests [ntree: 100, metric:Accuracy]

set.seed(3333)
seedV <- vector(mode = 'list', length=2); seedV[[1]] <- sample.int(n=1000, 7); seedV[[2]] <- sample.int(n=1000, 1)
tuningGrid <- data.frame(mtry=6:12)

startTime <- proc.time()
mrf100 <- train(classe ~ ., data=trainingSet, method='rf', nodesize=5, 
                  metric='Accuracy', importance=T,
                  trControl= trainControl(method='oob', allowParallel=T, seeds=seedV),
                  tuneGrid= tuningGrid, ntree=100,
                  allowParallel=T)
mrf100time <- proc.time() - startTime
mrf100
## Random Forest 
## 
## 15699 samples
##    48 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    6    0.9927384  0.9908139
##    7    0.9938850  0.9922645
##    8    0.9928021  0.9908950
##    9    0.9932480  0.9914589
##   10    0.9946493  0.9932318
##   11    0.9943945  0.9929094
##   12    0.9942035  0.9926675
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.

 

Random Forests [ntree: 150, metric:Accuracy]

set.seed(3333)
startTime <- proc.time()
mrf150 <- train(classe ~ ., data=trainingSet, method='rf', nodesize=5, 
                  metric='Accuracy', importance=T,
                  trControl= trainControl(method='oob', allowParallel=T, seeds=seedV),
                  tuneGrid= tuningGrid, ntree=150,
                  allowParallel=T)
mrf150time <- proc.time() - startTime
mrf150
## Random Forest 
## 
## 15699 samples
##    48 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    6    0.9938213  0.9921839
##    7    0.9942035  0.9926676
##    8    0.9938213  0.9921841
##    9    0.9940761  0.9925065
##   10    0.9947767  0.9933929
##   11    0.9943945  0.9929094
##   12    0.9946493  0.9932318
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.

 

Random Forests [ntree: 200, metric:Accuracy]

set.seed(3333)
startTime <- proc.time()
mrf200 <- train(classe ~ ., data=trainingSet, method='rf', nodesize=5, 
                  metric='Accuracy', importance=T,
                  trControl= trainControl(method='oob', allowParallel=T, seeds=seedV),
                  tuneGrid= tuningGrid, ntree=200,
                  allowParallel=T)
mrf200time <- proc.time() - startTime
mrf200
## Random Forest 
## 
## 15699 samples
##    48 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    6    0.9939487  0.9923451
##    7    0.9942035  0.9926675
##    8    0.9942035  0.9926675
##    9    0.9936939  0.9920228
##   10    0.9948404  0.9934735
##   11    0.9950315  0.9937151
##   12    0.9947767  0.9933930
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 11.

 

Gradient Boosting

[interaction depth: 1:3, ntree: 50:150, shrinkage: 0.1]

tuningGrid <- expand.grid(
    interaction.depth= 1:3,
    n.trees = (1:3)*50,
    shrinkage = .1,
    n.minobsinnode = 10)

set.seed(3333)
seedV <- vector(mode = 'list', length=51); 
for(i in 1:50) seedV[[i]] <- sample.int(n=1000, 400); seedV[[51]] <- sample.int(n=1000, 1)

startTime <- proc.time()
mgbm <- train(classe ~ ., data=trainingSet, method='gbm', metric='Accuracy', 
                   trControl= trainControl(method='repeatedcv', repeats=5, allowParallel=T, seeds=seedV),
                   tuneGrid=tuningGrid,  verbose=F)
mgbmTime <- proc.time() - startTime
mgbm
## Stochastic Gradient Boosting 
## 
## 15699 samples
##    48 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 14129, 14130, 14128, 14128, 14131, 14130, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.7388364  0.6689213
##   1                  100      0.8120013  0.7620552
##   1                  150      0.8456335  0.8046150
##   2                   50      0.8496209  0.8094589
##   2                  100      0.9028847  0.8770850
##   2                  150      0.9278675  0.9087181
##   3                   50      0.8926426  0.8640696
##   3                  100      0.9400212  0.9240919
##   3                  150      0.9599584  0.9493371
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  3, shrinkage = 0.1 and n.minobsinnode = 10.

Testing the Models

Model mrf100

mrf100Preds <- predict(mrf100$finalModel, validationSet)
crf100 <- confusionMatrix(mrf100Preds, validationSet$classe)
mrf100Accuracy <- round(sum(mrf100Preds == validationSet$classe) / nrow(validationSet), digits= 2)
mrf100Error <- 1 - mrf100Accuracy
crf100
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   A   B   C   D   E
##          A 891   0   0   0   0
##          B   0 604   0   0   0
##          C   0   0 574   0   0
##          D   0   0   0 491   0
##          E   0   0   0   0 574
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9988, 1)
##     No Information Rate : 0.2843     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   1.0000   1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Prevalence             0.2843   0.1927   0.1832   0.1567   0.1832
## Detection Rate         0.2843   0.1927   0.1832   0.1567   0.1832
## Detection Prevalence   0.2843   0.1927   0.1832   0.1567   0.1832
## Balanced Accuracy      1.0000   1.0000   1.0000   1.0000   1.0000

 
Model mrf150

mrf150Preds <- predict(mrf150$finalModel, validationSet)
crf150 <- confusionMatrix(mrf150Preds, validationSet$classe)
mrf150Accuracy <- round(sum(mrf150Preds == validationSet$classe) / nrow(validationSet), digits= 2)
mrf150Error <- 1 - mrf150Accuracy
crf150
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   A   B   C   D   E
##          A 891   0   0   0   0
##          B   0 604   0   0   0
##          C   0   0 574   0   0
##          D   0   0   0 491   0
##          E   0   0   0   0 574
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9988, 1)
##     No Information Rate : 0.2843     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   1.0000   1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Prevalence             0.2843   0.1927   0.1832   0.1567   0.1832
## Detection Rate         0.2843   0.1927   0.1832   0.1567   0.1832
## Detection Prevalence   0.2843   0.1927   0.1832   0.1567   0.1832
## Balanced Accuracy      1.0000   1.0000   1.0000   1.0000   1.0000

 
Model mrf200

mrf200Preds <- predict(mrf200$finalModel, validationSet)
crf200 <- confusionMatrix(mrf200Preds, validationSet$classe)
mrf200Accuracy <- round(sum(mrf200Preds == validationSet$classe) / nrow(validationSet), digits= 2)
mrf200Error <- 1 - mrf200Accuracy
crf200
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   A   B   C   D   E
##          A 891   0   0   0   0
##          B   0 604   0   0   0
##          C   0   0 574   0   0
##          D   0   0   0 491   0
##          E   0   0   0   0 574
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9988, 1)
##     No Information Rate : 0.2843     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   1.0000   1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Prevalence             0.2843   0.1927   0.1832   0.1567   0.1832
## Detection Rate         0.2843   0.1927   0.1832   0.1567   0.1832
## Detection Prevalence   0.2843   0.1927   0.1832   0.1567   0.1832
## Balanced Accuracy      1.0000   1.0000   1.0000   1.0000   1.0000

 

Model mgmb

mgbmPreds <- predict(mgbm, validationSet)
cmgbm <- confusionMatrix(mgbmPreds, validationSet$classe)
mgbmAccuracy <- round(sum(mgbmPreds == validationSet$classe) / nrow(validationSet), digits= 2)
mgbmError <- 1 - mgbmAccuracy
cmgbm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   A   B   C   D   E
##          A 882   8   0   1   2
##          B   6 586  11   3   5
##          C   2   8 559  11   7
##          D   1   0   4 471   3
##          E   0   2   0   5 557
## 
## Overall Statistics
##                                         
##                Accuracy : 0.9748        
##                  95% CI : (0.9687, 0.98)
##     No Information Rate : 0.2843        
##     P-Value [Acc > NIR] : < 2e-16       
##                                         
##                   Kappa : 0.9681        
##                                         
##  Mcnemar's Test P-Value : 0.03108       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9899   0.9702   0.9739   0.9593   0.9704
## Specificity            0.9951   0.9901   0.9891   0.9970   0.9973
## Pos Pred Value         0.9877   0.9591   0.9523   0.9833   0.9876
## Neg Pred Value         0.9960   0.9929   0.9941   0.9925   0.9934
## Prevalence             0.2843   0.1927   0.1832   0.1567   0.1832
## Detection Rate         0.2814   0.1870   0.1784   0.1503   0.1777
## Detection Prevalence   0.2849   0.1950   0.1873   0.1528   0.1800
## Balanced Accuracy      0.9925   0.9802   0.9815   0.9781   0.9838

 

Selecting the model

mbest <- mgbm$bestTune
mrf100M <- c(Algorithm = 'Random Forest', ntree = 100, mtry = mrf100$bestTune['mtry'], shrinkage = NA,
             interaction.depth = NA, Metric = 'Accuracy', Accuracy = '99.46%',
             Compute.Time = round(mrf100time['elapsed'], 2), Validation.Accuracy = round(mrf100Accuracy, 2),
             Misclassification.Error = round(mrf100Error, 2))
mrf150M <- c(Algorithm = 'Random Forest', ntree = 150, mtry = mrf150$bestTune['mtry'], shrinkage = NA,
             interaction.depth = NA, Metric = 'Accuracy', Accuracy = '99.48%',
             Compute.Time = mrf150time['elapsed'], Validation.Accuracy = round(mrf150Accuracy, 2),
             Misclassification.Error = round(mrf150Error, 2))
mrf200M <- c(Algorithm = 'Random Forest', ntree = 200, mtry = mrf200$bestTune['mtry'], shrinkage = NA,
             interaction.depth = NA, Metric = 'Accuracy', Accuracy = '99.50%', 
             Compute.Time = mrf200time['elapsed'], Validation.Accuracy = round(mrf200Accuracy, 2),
             Misclassification.Error = round(mrf200Error, 2))
mgbmM <-   c(Algorithm = 'Gradient Boosting', ntree = mbest['n.trees'], mtry = NA, shrinkage = mbest['shrinkage'],
             interaction.depth = mbest['interaction.depth'], Metric = 'Accuracy', Accuracy = '96.00%',
             Compute.Time = mgbmTime['elapsed'], Validation.Accuracy = round(mgbmAccuracy, 2),
             Misclassification.Error = round(mgbmError, 2))

models <- cbind(mrf100M, mrf150M, mrf200M, mgbmM)
colnames(models) <- c('Rf ntree 100', 'Rf ntree 150', 'Rf ntree 200', 'Gradient Boosting')
kable(models)
Rf ntree 100 Rf ntree 150 Rf ntree 200 Gradient Boosting
Algorithm Random Forest Random Forest Random Forest Gradient Boosting
ntree 100 150 200 150
mtry.mtry 10 10 11 NA
shrinkage NA NA NA 0.1
interaction.depth NA NA NA 3
Metric Accuracy Accuracy Accuracy Accuracy
Accuracy 99.46% 99.48% 99.50% 96.00%
Compute.Time.elapsed 96.34 141.64 188.92 2056.04
Validation.Accuracy 1 1 1 0.97
Misclassification.Error 0 0 0 0.03

** In the table above, while Accuracy is the accuracy of the model as provided by the train function, Validation.Accuracy is calculated based on predictions obtained on the validation set.