library(ggplot2) # for plots
library(dplyr) # for wrangling
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret) # for machine learning algorithms
## Loading required package: lattice
library(fivethirtyeight)
data(candy_rankings)
head(candy_rankings)
## # A tibble: 6 x 13
## competitorname chocolate fruity caramel peanutyalmondy nougat
## <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 100 Grand TRUE FALSE TRUE FALSE FALSE
## 2 3 Musketeers TRUE FALSE FALSE FALSE TRUE
## 3 One dime FALSE FALSE FALSE FALSE FALSE
## 4 One quarter FALSE FALSE FALSE FALSE FALSE
## 5 Air Heads FALSE TRUE FALSE FALSE FALSE
## 6 Almond Joy TRUE FALSE FALSE TRUE FALSE
## # … with 7 more variables: crispedricewafer <lgl>, hard <lgl>, bar <lgl>,
## # pluribus <lgl>, sugarpercent <dbl>, pricepercent <dbl>,
## # winpercent <dbl>
(@) **Process and get to know the data**
Here we determine which of the variables in the dataset could serve as an identifying variable.
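As a quick check (a minimal sketch using dplyr, which is already loaded): competitorname is the only character column, and it takes a distinct value on every row, so it can serve as the identifier.
# TRUE if competitorname uniquely identifies each candy
n_distinct(candy_rankings$competitorname) == nrow(candy_rankings)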
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
# Note: dist() coerces the character competitorname column, producing NAs; we fix this below
candy_cluster <- hclust(dist(candy_rankings), method = "complete")
## Warning in dist(candy_rankings): NAs introduced by coercion
plot(candy_cluster)
What characteristics do the most likeable candies share? Perhaps the more caramel a candy contains, the more popular it is.
Now we check the scales of our features. It makes sense to *scale* these variables before running the hierarchical algorithm: scaling puts all of them on a common scale, so the distances are comparable and no single variable dominates simply because it is measured in larger units.
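To see why, compare the spread of the raw features: winpercent lives on a 0-100 scale while everything else lies between 0 and 1, so unscaled Euclidean distances would be driven almost entirely by winpercent. A minimal sketch:
# Standard deviation of each feature before scaling
sapply(candy_rankings %>% select(-competitorname), sd)
# After scale(), every column has mean 0 and standard deviation 1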
library(tibble)
# Move competitorname into the row names so dist() sees only numeric/logical columns
candy_cluster <- candy_rankings %>%
  column_to_rownames("competitorname")
hier_model <- hclust(dist(scale(candy_cluster)), method = "complete")
heatmap(data.matrix(scale(candy_cluster)), Colv = NA, Rowv = NA)
Dendrogram of scaled dataset using complete linkage method.
plot(hier_model, cex=0.8)
cluster_4 <- as.factor(cutree(hier_model, k = 4))
plot(cluster_4)
cluster_6 <- as.factor(cutree(hier_model, k=6))
plot(cluster_6)
Construct a dendrogram using the single linkage approach.
Single linkage uses the minimum distance between clusters, whereas complete linkage uses the maximum distance. This dendrogram is much lower in height, and it can be cut into fewer clusters.
hier_model_single <- hclust(dist(scale(candy_cluster)), method = "single")
plot(hier_model_single, cex=0.8)
Centroid linkage
Centroid linkage finds the centroid of each cluster and calculates the distance between the centroids of two clusters. This dendrogram is more scattered, and in the initial split one side is much denser than the other.
hier_model_center <- hclust(dist(scale(candy_cluster)), method = "centroid")
plot(hier_model_center, cex=0.8)
Average linkage
Average linkage computes all pairwise distances between points belonging to two different clusters and then takes the average. This dendrogram is also lopsided, but we can cut it off at a smaller number of clusters.
hier_model_average <- hclust(dist(scale(candy_cluster)), method = "average")
plot(hier_model_average, cex=0.8)
Complete linkage seems like the best type of linkage here: it produces a balanced dendrogram that can be cut into a relatively small number of clusters.
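As an optional numeric check on this choice (a sketch, not part of the original workflow), the cophenetic correlation measures how faithfully each dendrogram preserves the original pairwise distances:
# Cophenetic correlation: higher means the dendrogram better preserves the distances
d <- dist(scale(candy_cluster))
cor(d, cophenetic(hier_model))         # complete
cor(d, cophenetic(hier_model_single))  # single
cor(d, cophenetic(hier_model_average)) # average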
Now let’s try the K-means approach. The first step is to pick K. So how do we choose?
set.seed(253)
# Fit K-means on the scaled features with 3 and 6 clusters
kmeans_model <- kmeans(scale(candy_cluster), centers = 3)
kmeans_model_2 <- kmeans(scale(candy_cluster), centers = 6)
# Total within-cluster sum of squares for K = 1 and K = 2
SS <- rep(0, 2)
for(i in 1:2){
  SS[i] <- kmeans(scale(candy_cluster), centers = i)$tot.withinss
}
tune_data <- data.frame(K = 1:2, SS)
as.factor(kmeans_model$cluster)
## 100 Grand 3 Musketeers
## 1 1
## One dime One quarter
## 2 2
## Air Heads Almond Joy
## 2 1
## Baby Ruth Boston Baked Beans
## 1 2
## Candy Corn Caramel Apple Pops
## 2 2
## Charleston Chew Chewey Lemonhead Fruit Mix
## 1 2
## Chiclets Dots
## 2 2
## Dum Dums Fruit Chews
## 3 2
## Fun Dip Gobstopper
## 3 3
## Haribo Gold Bears Haribo Happy Cola
## 2 2
## Haribo Sour Bears Haribo Twin Snakes
## 2 2
## Hershey's Kisses Hershey's Krackel
## 2 1
## Hershey's Milk Chocolate Hershey's Special Dark
## 1 1
## Jawbusters Junior Mints
## 3 2
## Kit Kat Laffy Taffy
## 1 2
## Lemonhead Lifesavers big ring gummies
## 3 2
## Peanut butter M&M's M&M's
## 1 1
## Mike & Ike Milk Duds
## 2 2
## Milky Way Milky Way Midnight
## 1 1
## Milky Way Simply Caramel Mounds
## 1 1
## Mr Good Bar Nerds
## 1 3
## Nestle Butterfinger Nestle Crunch
## 1 1
## Nik L Nip Now & Later
## 2 2
## Payday Peanut M&Ms
## 1 1
## Pixie Sticks Pop Rocks
## 2 3
## Red vines Reese's Miniatures
## 2 1
## Reese's Peanut Butter cup Reese's pieces
## 1 1
## Reese's stuffed with pieces Ring pop
## 1 3
## Rolo Root Beer Barrels
## 1 3
## Runts Sixlets
## 3 2
## Skittles original Skittles wildberry
## 2 2
## Nestle Smarties Smarties candy
## 2 3
## Snickers Snickers Crisper
## 1 1
## Sour Patch Kids Sour Patch Tricksters
## 2 2
## Starburst Strawberry bon bons
## 2 3
## Sugar Babies Sugar Daddy
## 2 2
## Super Bubble Swedish Fish
## 2 2
## Tootsie Pop Tootsie Roll Juniors
## 3 2
## Tootsie Roll Midgies Tootsie Roll Snack Bars
## 2 1
## Trolli Sour Bites Twix
## 2 1
## Twizzlers Warheads
## 2 3
## Welch's Fruit Snacks Werther's Original Caramel
## 2 3
## Whoppers
## 1
## Levels: 1 2 3
We examine the total within-cluster sum of squares.
kmeans_model$tot.withinss
## [1] 621.3659
tune_data
## K SS
## 1 1 1008.0000
## 2 2 703.9851
Comparing K = 1 and K = 2, K = 2 gives the lower total within-cluster sum of squares. To choose K more carefully, we extend the search up to 18 clusters and look for an elbow.
# Total within-cluster SS for K = 1 through 18
SS_2 <- rep(0, 18)
for(i in 1:18){
  SS_2[i] <- kmeans(scale(candy_cluster), centers = i)$tot.withinss
}
tune_data_2 <- data.frame(K = 1:18, SS_2)
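To visualize the elbow, we can plot these sums of squares against K; a minimal sketch using ggplot2 (loaded at the top):
# Elbow plot: look for where the drop in within-cluster SS levels off
ggplot(tune_data_2, aes(x = K, y = SS_2)) +
  geom_point() +
  geom_line()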
# Attach the three-cluster assignments as a new column
candy_cluster %>% mutate(as.factor(kmeans_model$cluster))
## chocolate fruity caramel peanutyalmondy nougat crispedricewafer hard
## 1 TRUE FALSE TRUE FALSE FALSE TRUE FALSE
## 2 TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 5 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 6 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 7 TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 8 FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 9 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 10 FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## 11 TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## 12 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 13 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 14 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 15 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 16 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 17 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 18 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 19 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 20 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 21 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 22 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 23 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 24 TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 25 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 26 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 27 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 28 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 29 TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 30 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 31 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 32 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 33 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 34 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 35 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 36 TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## 37 TRUE FALSE TRUE FALSE TRUE FALSE FALSE
## 38 TRUE FALSE TRUE FALSE TRUE FALSE FALSE
## 39 TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## 40 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 41 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 42 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 43 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 44 TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 45 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 46 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 47 FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## 48 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 49 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 50 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 51 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 52 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 53 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 54 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 55 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 56 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 57 TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## 58 FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## 59 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 60 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 61 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 62 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 63 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 64 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 65 TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 66 TRUE FALSE TRUE TRUE FALSE TRUE FALSE
## 67 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 68 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 69 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 70 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 71 FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 72 FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 73 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 74 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 75 TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## 76 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 77 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 78 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 79 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 80 TRUE FALSE TRUE FALSE FALSE TRUE FALSE
## 81 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 82 FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## 83 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 84 FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## 85 TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## bar pluribus sugarpercent pricepercent winpercent
## 1 TRUE FALSE 0.732 0.860 66.97173
## 2 TRUE FALSE 0.604 0.511 67.60294
## 3 FALSE FALSE 0.011 0.116 32.26109
## 4 FALSE FALSE 0.011 0.511 46.11650
## 5 FALSE FALSE 0.906 0.511 52.34146
## 6 TRUE FALSE 0.465 0.767 50.34755
## 7 TRUE FALSE 0.604 0.767 56.91455
## 8 FALSE TRUE 0.313 0.511 23.41782
## 9 FALSE TRUE 0.906 0.325 38.01096
## 10 FALSE FALSE 0.604 0.325 34.51768
## 11 TRUE FALSE 0.604 0.511 38.97504
## 12 FALSE TRUE 0.732 0.511 36.01763
## 13 FALSE TRUE 0.046 0.325 24.52499
## 14 FALSE TRUE 0.732 0.511 42.27208
## 15 FALSE FALSE 0.732 0.034 39.46056
## 16 FALSE TRUE 0.127 0.034 43.08892
## 17 FALSE FALSE 0.732 0.325 39.18550
## 18 FALSE TRUE 0.906 0.453 46.78335
## 19 FALSE TRUE 0.465 0.465 57.11974
## 20 FALSE TRUE 0.465 0.465 34.15896
## 21 FALSE TRUE 0.465 0.465 51.41243
## 22 FALSE TRUE 0.465 0.465 42.17877
## 23 FALSE TRUE 0.127 0.093 55.37545
## 24 TRUE FALSE 0.430 0.918 62.28448
## 25 TRUE FALSE 0.430 0.918 56.49050
## 26 TRUE FALSE 0.430 0.918 59.23612
## 27 FALSE TRUE 0.093 0.511 28.12744
## 28 FALSE TRUE 0.197 0.511 57.21925
## 29 TRUE FALSE 0.313 0.511 76.76860
## 30 FALSE FALSE 0.220 0.116 41.38956
## 31 FALSE FALSE 0.046 0.104 39.14106
## 32 FALSE FALSE 0.267 0.279 52.91139
## 33 FALSE TRUE 0.825 0.651 71.46505
## 34 FALSE TRUE 0.825 0.651 66.57458
## 35 FALSE TRUE 0.872 0.325 46.41172
## 36 FALSE TRUE 0.302 0.511 55.06407
## 37 TRUE FALSE 0.604 0.651 73.09956
## 38 TRUE FALSE 0.732 0.441 60.80070
## 39 TRUE FALSE 0.965 0.860 64.35334
## 40 TRUE FALSE 0.313 0.860 47.82975
## 41 TRUE FALSE 0.313 0.918 54.52645
## 42 FALSE TRUE 0.848 0.325 55.35405
## 43 TRUE FALSE 0.604 0.767 70.73564
## 44 TRUE FALSE 0.313 0.767 66.47068
## 45 FALSE TRUE 0.197 0.976 22.44534
## 46 FALSE TRUE 0.220 0.325 39.44680
## 47 TRUE FALSE 0.465 0.767 46.29660
## 48 FALSE TRUE 0.593 0.651 69.48379
## 49 FALSE TRUE 0.093 0.023 37.72234
## 50 FALSE TRUE 0.604 0.837 41.26551
## 51 FALSE TRUE 0.581 0.116 37.34852
## 52 FALSE FALSE 0.034 0.279 81.86626
## 53 FALSE FALSE 0.720 0.651 84.18029
## 54 FALSE TRUE 0.406 0.651 73.43499
## 55 FALSE FALSE 0.988 0.651 72.88790
## 56 FALSE FALSE 0.732 0.965 35.29076
## 57 FALSE TRUE 0.860 0.860 65.71629
## 58 FALSE TRUE 0.732 0.069 29.70369
## 59 FALSE TRUE 0.872 0.279 42.84914
## 60 FALSE TRUE 0.220 0.081 34.72200
## 61 FALSE TRUE 0.941 0.220 63.08514
## 62 FALSE TRUE 0.941 0.220 55.10370
## 63 FALSE TRUE 0.267 0.976 37.88719
## 64 FALSE TRUE 0.267 0.116 45.99583
## 65 TRUE FALSE 0.546 0.651 76.67378
## 66 TRUE FALSE 0.604 0.651 59.52925
## 67 FALSE TRUE 0.069 0.116 59.86400
## 68 FALSE TRUE 0.069 0.116 52.82595
## 69 FALSE TRUE 0.151 0.220 67.03763
## 70 FALSE TRUE 0.569 0.058 34.57899
## 71 FALSE TRUE 0.965 0.767 33.43755
## 72 FALSE FALSE 0.418 0.325 32.23100
## 73 FALSE FALSE 0.162 0.116 27.30386
## 74 FALSE TRUE 0.604 0.755 54.86111
## 75 FALSE FALSE 0.604 0.325 48.98265
## 76 FALSE FALSE 0.313 0.511 43.06890
## 77 FALSE TRUE 0.174 0.011 45.73675
## 78 TRUE FALSE 0.465 0.325 49.65350
## 79 FALSE TRUE 0.313 0.255 47.17323
## 80 TRUE FALSE 0.546 0.906 81.64291
## 81 FALSE FALSE 0.220 0.116 45.46628
## 82 FALSE FALSE 0.093 0.116 39.01190
## 83 FALSE TRUE 0.313 0.313 44.37552
## 84 FALSE FALSE 0.186 0.267 41.90431
## 85 FALSE TRUE 0.872 0.848 49.52411
## as.factor(kmeans_model$cluster)
## 1 1
## 2 1
## 3 2
## 4 2
## 5 2
## 6 1
## 7 1
## 8 2
## 9 2
## 10 2
## 11 1
## 12 2
## 13 2
## 14 2
## 15 3
## 16 2
## 17 3
## 18 3
## 19 2
## 20 2
## 21 2
## 22 2
## 23 2
## 24 1
## 25 1
## 26 1
## 27 3
## 28 2
## 29 1
## 30 2
## 31 3
## 32 2
## 33 1
## 34 1
## 35 2
## 36 2
## 37 1
## 38 1
## 39 1
## 40 1
## 41 1
## 42 3
## 43 1
## 44 1
## 45 2
## 46 2
## 47 1
## 48 1
## 49 2
## 50 3
## 51 2
## 52 1
## 53 1
## 54 1
## 55 1
## 56 3
## 57 1
## 58 3
## 59 3
## 60 2
## 61 2
## 62 2
## 63 2
## 64 3
## 65 1
## 66 1
## 67 2
## 68 2
## 69 2
## 70 3
## 71 2
## 72 2
## 73 2
## 74 2
## 75 3
## 76 2
## 77 2
## 78 1
## 79 2
## 80 1
## 81 2
## 82 3
## 83 2
## 84 3
## 85 1
tune_data_2
## K SS_2
## 1 1 1008.0000
## 2 2 703.9851
## 3 3 637.2731
## 4 4 580.5499
## 5 5 451.8226
## 6 6 404.0259
## 7 7 460.4268
## 8 8 357.8766
## 9 9 307.6470
## 10 10 295.8493
## 11 11 281.6885
## 12 12 258.7438
## 13 13 258.7426
## 14 14 233.8311
## 15 15 222.3764
## 16 16 225.8736
## 17 17 194.9466
## 18 18 196.0123
To get a sense for how some of the features might have played into the clustering, we calculate the mean of each feature within each cluster.
kmeans_model_2$withinss
## [1] 78.57306 35.08662 89.36640 118.75488 65.46992 34.13899
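The per-cluster feature means described above can be computed with dplyr; a minimal sketch using the six-cluster assignments from kmeans_model_2 (for the logical flags, the mean is the share of candies in the cluster with that trait):
# Mean of each feature within each of the six clusters
candy_cluster %>%
  mutate(cluster = kmeans_model_2$cluster) %>%
  group_by(cluster) %>%
  summarize_all(mean)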
head(candy_cluster)
## chocolate fruity caramel peanutyalmondy nougat
## 100 Grand TRUE FALSE TRUE FALSE FALSE
## 3 Musketeers TRUE FALSE FALSE FALSE TRUE
## One dime FALSE FALSE FALSE FALSE FALSE
## One quarter FALSE FALSE FALSE FALSE FALSE
## Air Heads FALSE TRUE FALSE FALSE FALSE
## Almond Joy TRUE FALSE FALSE TRUE FALSE
## crispedricewafer hard bar pluribus sugarpercent
## 100 Grand TRUE FALSE TRUE FALSE 0.732
## 3 Musketeers FALSE FALSE TRUE FALSE 0.604
## One dime FALSE FALSE FALSE FALSE 0.011
## One quarter FALSE FALSE FALSE FALSE 0.011
## Air Heads FALSE FALSE FALSE FALSE 0.906
## Almond Joy FALSE FALSE TRUE FALSE 0.465
## pricepercent winpercent
## 100 Grand 0.860 66.97173
## 3 Musketeers 0.511 67.60294
## One dime 0.116 32.26109
## One quarter 0.511 46.11650
## Air Heads 0.511 52.34146
## Almond Joy 0.767 50.34755
Final reflection
Our final choice is 5 clusters: after five, the sum of squared distances decreases relatively little with each additional split. A few of the clusters can be characterized as caramel, chocolate, fruity, peanutyalmondy, and nougat candies.
# Drop the outcome variable before running PCA on the features
candy_nowin <- candy_cluster %>% select(-"winpercent")
# Compute PCA
pca_results <- prcomp(candy_nowin, scale = TRUE)
# Loadings (ie. definition of PCs)
pca_results$rotation
## PC1 PC2 PC3 PC4
## chocolate -0.4108662 0.21740236 -0.07367612 0.03080335
## fruity 0.3958992 -0.26069583 -0.01334054 -0.06923952
## caramel -0.2522898 -0.39874749 -0.01962328 -0.03862553
## peanutyalmondy -0.2421395 0.23192057 0.20248013 0.45441432
## nougat -0.2505358 -0.33209030 0.52893191 0.14935275
## crispedricewafer -0.2287828 0.02336145 -0.51840794 -0.49204868
## hard 0.2172832 -0.49780524 -0.01311721 -0.11399766
## bar -0.4246558 -0.12831774 0.10346994 -0.21942543
## pluribus 0.2848606 0.21680859 -0.30355746 0.40878040
## sugarpercent -0.1038562 -0.49896801 -0.41539256 0.50519217
## pricepercent -0.3429068 -0.03011511 -0.35905239 0.19593637
## PC5 PC6 PC7 PC8
## chocolate -0.08236125 0.12184878 -0.48511279 0.02384081
## fruity -0.07628294 -0.35070344 0.41272241 -0.02092286
## caramel 0.52640570 0.59218305 0.26620561 0.08889254
## peanutyalmondy -0.44568311 0.26885853 0.46430139 -0.36312121
## nougat 0.22275924 -0.35149766 -0.17714020 -0.33313353
## crispedricewafer -0.02404426 -0.05104141 0.15578026 -0.57870373
## hard -0.55174148 0.33405346 -0.39369684 -0.08038925
## bar -0.07908975 -0.34381552 -0.03210197 -0.04688733
## pluribus 0.35421215 -0.05175213 -0.28334896 -0.33485706
## sugarpercent -0.03922886 -0.13259924 -0.02824700 -0.10908916
## pricepercent -0.15421519 -0.25095178 0.12298857 0.52978152
## PC9 PC10 PC11
## chocolate -0.14653170 0.66558792 0.23566084
## fruity 0.09592179 0.67093829 0.10668030
## caramel 0.16952920 0.18571962 -0.07543422
## peanutyalmondy 0.11721967 0.06910818 -0.05677309
## nougat 0.22040317 -0.10800392 0.38613640
## crispedricewafer 0.07878386 -0.10911849 0.23620200
## hard 0.33379917 -0.02198341 -0.04934822
## bar 0.07225016 0.12218271 -0.77233834
## pluribus 0.45502641 0.10742194 -0.27101842
## sugarpercent -0.52345689 -0.06608491 -0.04775548
## pricepercent 0.52379795 -0.11259022 0.21119596
# Scree plot: % of variance explained by each principal component
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
fviz_eig(pca_results)
# Plot cumulative % of variance explained
plot(get_eig(pca_results)$cumulative.variance.percent)
# Numerical examination
get_eig(pca_results)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.9187452 35.624956 35.62496
## Dim.2 1.2692196 11.538360 47.16332
## Dim.3 1.2118369 11.016699 58.18002
## Dim.4 1.1419752 10.381593 68.56161
## Dim.5 0.8941622 8.128748 76.69036
## Dim.6 0.6668683 6.062439 82.75279
## Dim.7 0.5557406 5.052188 87.80498
## Dim.8 0.4784604 4.349640 92.15462
## Dim.9 0.4397143 3.997402 96.15202
## Dim.10 0.2344714 2.131558 98.28358
## Dim.11 0.1888059 1.716417 100.00000
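For example, to read off how many components are needed to explain at least 80% of the variance (an illustrative threshold), a one-line sketch:
# Smallest number of PCs reaching 80% cumulative variance explained (6, per the table above)
which(get_eig(pca_results)$cumulative.variance.percent >= 80)[1]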
Chocolate peanut butter candies are in the top left, chocolate in the bottom left, more fruity candies in the top right, and hard, fruity candies in the bottom right.
# Score plot: plot PC1 scores vs PC2 scores
fviz_pca_ind(pca_results, repel = TRUE)
Chocolate and peanutyalmondy seem to be highly correlated, and caramel and nougat appear to be positively correlated.
# Loadings plot
fviz_pca_var(pca_results, repel = TRUE)
Based on the loadings plot, the variables split up very similarly to the previous plot, with the four quadrants showing the same themes as the last graph.
# Set the seed
set.seed(253)
# Run the algorithm
pcr_model <- train(
  winpercent ~ .,
  data = candy_cluster,
  method = "pcr",
  scale = TRUE,
  tuneGrid = data.frame(ncomp = 1:5), # number of PCs to keep as predictors
  trControl = trainControl("cv", number = 10, selectionFunction = "oneSE"),
  metric = "MAE"
)
# CV metrics by number of PCs
plot(pcr_model)
pcr_model$results
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 11.91983 0.3697331 9.756087 1.716672 0.1858384 1.510205
## 2 2 11.88149 0.3712643 9.826124 1.832493 0.1761292 1.701233
## 3 3 11.90058 0.3627448 9.748641 2.163314 0.1786692 1.911637
## 4 4 11.79580 0.3658765 9.548093 2.162562 0.1712402 1.985936
## 5 5 11.93489 0.3540514 9.730294 2.412922 0.1909888 2.173016
# Variation in original predictors explained by each PC
summary(pcr_model)
## Data: X dimension: 85 11
## Y dimension: 85 1
## Fit method: svdpc
## Number of components considered: 1
## TRAINING: % variance explained
## 1 comps
## X 35.62
## .outcome 33.83
head(candy_cluster)
## chocolate fruity caramel peanutyalmondy nougat
## 100 Grand TRUE FALSE TRUE FALSE FALSE
## 3 Musketeers TRUE FALSE FALSE FALSE TRUE
## One dime FALSE FALSE FALSE FALSE FALSE
## One quarter FALSE FALSE FALSE FALSE FALSE
## Air Heads FALSE TRUE FALSE FALSE FALSE
## Almond Joy TRUE FALSE FALSE TRUE FALSE
## crispedricewafer hard bar pluribus sugarpercent
## 100 Grand TRUE FALSE TRUE FALSE 0.732
## 3 Musketeers FALSE FALSE TRUE FALSE 0.604
## One dime FALSE FALSE FALSE FALSE 0.011
## One quarter FALSE FALSE FALSE FALSE 0.011
## Air Heads FALSE FALSE FALSE FALSE 0.906
## Almond Joy FALSE FALSE TRUE FALSE 0.465
## pricepercent winpercent
## 100 Grand 0.860 66.97173
## 3 Musketeers 0.511 67.60294
## One dime 0.116 32.26109
## One quarter 0.511 46.11650
## Air Heads 0.511 52.34146
## Almond Joy 0.767 50.34755
Here we identify the optimal number of PCs to use in our model.
Looking at the MAE results, the error is lowest at 4 components among the values tried. Even if a wider grid showed a slightly lower MAE at a larger number of components, we would still prefer 4 components to guard against overfitting.
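To pull the lowest-MAE row directly out of the CV results (a small sketch):
# CV results row with the smallest mean absolute error
pcr_model$results[which.min(pcr_model$results$MAE), ]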
We repeat our analysis using LASSO.
set.seed(253)
# Candidate values for the LASSO penalty parameter
lambda_grid <- seq(0, 1, length = 100)
lasso_model_2 <- train(
  winpercent ~ .,
  data = candy_cluster,
  method = "glmnet",
  tuneGrid = data.frame(alpha = 1, lambda = lambda_grid),
  trControl = trainControl(method = "cv", number = 10, selectionFunction = "best"),
  metric = "MAE",
  na.action = na.omit
)
coef(lasso_model_2$finalModel, lasso_model_2$bestTune$lambda) # coefficients at the selected lambda
## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 37.99529386
## chocolateTRUE 15.48022718
## fruityTRUE 3.30211268
## caramelTRUE 0.21153100
## peanutyalmondyTRUE 7.26386692
## nougatTRUE .
## crispedricewaferTRUE 6.15897327
## hardTRUE -3.39999268
## barTRUE .
## pluribusTRUE -0.02087903
## sugarpercent 6.22407389
## pricepercent .
lasso_model_2$bestTune$lambda
## [1] 0.6060606
plot(lasso_model_2)
In the optimally tuned LASSO model, 8 predictors have nonzero coefficients.
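That count can also be read off programmatically; a small sketch (subtracting one for the intercept):
# Number of predictors with nonzero coefficients at the selected lambda
lasso_coefs <- coef(lasso_model_2$finalModel, lasso_model_2$bestTune$lambda)
sum(lasso_coefs != 0) - 1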
lasso_model_2$resample
## RMSE Rsquared MAE Resample
## 1 9.060705 0.4652186 7.633885 Fold01
## 2 10.417977 0.5664095 8.326282 Fold10
## 3 11.405492 0.2951101 8.451525 Fold08
## 4 10.007884 0.4652922 7.727180 Fold04
## 5 10.295582 0.3405933 8.386983 Fold06
## 6 15.827480 0.2387633 13.601954 Fold03
## 7 9.052159 0.8196754 6.909666 Fold07
## 8 11.063978 0.5099477 9.419892 Fold02
## 9 10.450285 0.3901884 7.880982 Fold05
## 10 12.502720 0.6935619 10.970107 Fold09
From the results above, I would use the LASSO model because it has a lower MAE: averaged across folds, its cross-validated MAE is roughly 8.9 (dropping as low as 6.9 in the best fold), whereas the principal components model's MAE was about 9.55.
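For an apples-to-apples comparison with the PCR cross-validation metrics, the fold-level MAEs can be averaged; a minimal sketch:
# Mean cross-validated MAE across the 10 folds (roughly 8.9 from the table above)
mean(lasso_model_2$resample$MAE)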