Data
# Load the class package, which provides knn(), and work on a copy of the
# built-in iris data set (150 flowers, 4 numeric measurements, 3 species).
library(class)
df <- iris
# Inspect the distribution of each predictor and the class balance
# (50 observations per species, so the classes are perfectly balanced).
summary(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Peek at the first rows to confirm column layout (4 predictors + Species).
head(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Scaling
# Check column types: 4 numeric predictors plus the Species factor (column 5).
str(df)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# KNN is distance-based, so standardize the predictors (drop column 5, the
# factor response) to mean 0 / sd 1; scale() returns a numeric matrix.
df.scaled <- scale(df[, -5])
summary(df.scaled)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
# Sanity check: every scaled column should have standard deviation 1.
apply(df.scaled, 2, sd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 1 1 1
Partitioning (splitting)
# Reproducible 100/50 train/test split: sample 100 of the 150 row indices.
set.seed(100)
sp <- sample(1:nrow(df), 100)
# Scaled predictors for train/test; negative indexing gives the complement.
df.train <- df.scaled[sp,]
df.test <- df.scaled[-sp,]
# Class labels are taken from the unscaled data frame (Species was excluded
# from scaling), using the same index split.
df.train.y <- df$Species[sp]
df.test.y <- df$Species[-sp]
dim(df.train)
## [1] 100 4
# Fit/predict in one step with knn(); set.seed(1) fixes random tie-breaking
# among equidistant neighbors so the result is reproducible.
set.seed(1)
knn.prediction1 <-
knn(
train = df.train,
test = df.test,
cl = df.train.y,
k = 1
)
# Confusion matrix for k = 1: 4 misclassifications out of 50 (92% accuracy).
table(knn.prediction1, df.test.y)
## df.test.y
## knn.prediction1 setosa versicolor virginica
## setosa 16 0 0
## versicolor 0 15 2
## virginica 0 2 15
# Repeat with k = 3 neighbors.
knn.prediction2 <-
knn(
train = df.train,
test = df.test,
cl = df.train.y,
k = 3
)
# Confusion matrix for k = 3: 2 misclassifications out of 50 (96% accuracy).
table(knn.prediction2, df.test.y)
## df.test.y
## knn.prediction2 setosa versicolor virginica
## setosa 16 0 0
## versicolor 0 15 0
## virginica 0 2 17
Finetune K
# Tune K: compute the test error rate for K = 1..30.
# The error vector is preallocated instead of grown inside the loop, and the
# seed is reset each iteration so knn()'s random tie-breaking is reproducible.
k.max <- 30
error.rate.iris <- numeric(k.max)
for (i in seq_len(k.max)) {
  set.seed(1)
  knn.pred.iris <- knn(
    train = df.train,
    test = df.test,
    cl = df.train.y,
    k = i
  )
  # Misclassification rate on the held-out test set.
  error.rate.iris[i] <- mean(df.test.y != knn.pred.iris)
}
print(error.rate.iris)
## [1] 0.08 0.08 0.04 0.06 0.06 0.06 0.06 0.06 0.06 0.06 0.06 0.06 0.04 0.04 0.04
## [16] 0.04 0.04 0.02 0.02 0.02 0.02 0.04 0.06 0.06 0.04 0.04 0.06 0.04 0.06 0.04
# Smallest test error rate achieved across K = 1..30.
min.error.rate = min(error.rate.iris)
print(min.error.rate)
## [1] 0.02
# All K values that achieve the minimum error (ties are possible).
Kiris = which(error.rate.iris == min.error.rate)
print(Kiris)
## [1] 18 19 20 21
library(ggplot2)
# Plot test error rate (%) against K. ggplot() replaces qplot(), which has
# been deprecated since ggplot2 3.4.0 (the old call emitted a deprecation
# warning); the point + line layers reproduce the same display.
ggplot(
  data.frame(K = 1:30, error = error.rate.iris * 100),
  aes(x = K, y = error)
) +
  geom_point() +
  geom_line() +
  labs(x = "K", y = "Error Rate") +
  ylim(0, 40)
KNN with Caret
# Load caret for cross-validated hyperparameter tuning (it pulls in lattice).
library(caret)
## Loading required package: lattice
# Standardize the four predictors (drop the Species factor, column 5).
scaled.iris <- scale(iris[, -5])
# Reproducible 100/50 train/test split of the 150 row indices.
set.seed(100)
train <- sample(1:150, 100)
train
## [1] 102 112 4 55 70 98 135 7 43 140 51 25 2 68 137 48 32 85
## [19] 91 121 16 116 66 146 93 45 30 124 126 87 95 97 120 29 92 31
## [37] 54 41 105 113 24 142 143 63 65 9 150 20 14 78 88 3 36 27
## [55] 46 59 96 69 47 147 129 136 12 141 130 56 22 82 53 99 5 44
## [73] 28 52 139 42 15 57 75 37 26 110 100 149 132 107 35 58 127 111
## [91] 144 86 114 71 123 119 18 8 128 83
# Scaled predictors and (unscaled) class labels for each partition.
train.irisX <- scaled.iris[train, ]
test.irisX <- scaled.iris[-train, ]
train.irisY <- iris$Species[train]
test.irisY <- iris$Species[-train]
# Assemble the training frame with the response named "Species" directly,
# instead of binding it under a temporary name and then renaming/deleting
# columns; the resulting structure (4 predictors + Species) is identical.
iristr <- data.frame(train.irisX, Species = train.irisY)
str(iristr)
## 'data.frame': 100 obs. of 5 variables:
## $ Sepal.Length: num -0.0523 0.6722 -1.5015 0.793 -0.2939 ...
## $ Sepal.Width : num -0.8198 -0.8198 0.0979 -0.5904 -1.2787 ...
## $ Petal.Length: num 0.7602 0.8735 -1.2791 0.477 0.0804 ...
## $ Petal.Width : num 0.919 0.919 -1.311 0.394 -0.13 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 3 3 1 2 2 2 3 1 1 3 ...
# Same construction for the test frame.
iriste <- data.frame(test.irisX, Species = test.irisY)
str(iriste)
## 'data.frame': 50 obs. of 5 variables:
## $ Sepal.Length: num -0.898 -0.535 -1.139 -0.535 -1.26 ...
## $ Sepal.Width : num 1.0156 1.9333 0.0979 1.4745 -0.1315 ...
## $ Petal.Length: num -1.34 -1.17 -1.28 -1.28 -1.34 ...
## $ Petal.Width : num -1.31 -1.05 -1.44 -1.31 -1.44 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Candidate values of K for caret to evaluate.
knn.grid <- expand.grid(k = 1:10)
# 5-fold cross-validation on the training set.
knn.ctrl <- trainControl(method = "cv", number = 5)
# Seed fixed so the CV fold assignment is reproducible.
set.seed(1000)
# Train KNN over the grid, selecting the K with the highest CV accuracy.
knn.iris <-
train(
Species ~ .,
method = "knn",
tuneGrid = knn.grid,
trControl = knn.ctrl,
metric = "Accuracy",
data = iristr
)
# Printing the fit shows per-K resampling results and the chosen K.
knn.iris
## k-Nearest Neighbors
##
## 100 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 80, 79, 81, 79, 81
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9594236 0.9389533
## 2 0.9298496 0.8945485
## 3 0.9498997 0.9246676
## 4 0.9699499 0.9548523
## 5 0.9699499 0.9548523
## 6 0.9704261 0.9556609
## 7 0.9604261 0.9405666
## 8 0.9604261 0.9405666
## 9 0.9604261 0.9405666
## 10 0.9699499 0.9548523
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 6.