Data

# Load the `class` package, which supplies the knn() classifier used below.
library(class)
# Work on a copy of the built-in iris data set.
df <- iris
# Per-column overview: quartiles for the numeric columns, level counts for the
# Species factor.
summary(df)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
# Preview the first rows of the data frame.
head(df)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

Scaling

# Inspect column types before scaling: four numeric columns and one factor.
str(df)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Standardise the four numeric columns; column 5 (Species) is dropped.
# The result is a numeric matrix, not a data frame.
# NOTE(review): the centring/scaling parameters are computed from all 150
# rows, i.e. before the train/test split further down, so test rows influence
# the scaling. Consider fitting the scaling on the training rows only.
df.scaled <- scale(df[, -5])
# Check the result: every column should now have mean 0.
summary(df.scaled)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064
# Column-wise standard deviation of the scaled matrix (each should be 1).
apply(df.scaled, MARGIN = 2, FUN = sd)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            1            1            1            1

Partitioning (splitting)

# Draw a reproducible 100-row training sample; the remaining rows form the
# test set.
set.seed(100)
# seq_len() is the safe form of 1:nrow(df): it yields an empty sequence rather
# than c(1, 0) when there are no rows.
sp <- sample(seq_len(nrow(df)), 100)
# Predictors come from the scaled matrix. drop = FALSE keeps the result a
# matrix even if a split ever ends up with a single row.
df.train <- df.scaled[sp, , drop = FALSE]
df.test <- df.scaled[-sp, , drop = FALSE]
# Class labels come from the unscaled data frame, using the same indices.
df.train.y <- df$Species[sp]
df.test.y <- df$Species[-sp]
# Sanity check: expect 100 rows and 4 predictor columns.
dim(df.train)
## [1] 100   4
# 1-nearest-neighbour classification of the test rows. The seed is fixed
# because knn() breaks ties at random.
set.seed(1)
knn.prediction1 <- knn(df.train, df.test, cl = df.train.y, k = 1)
# Confusion matrix: predicted class (rows) against true class (columns).
table(knn.prediction1, df.test.y)
##                df.test.y
## knn.prediction1 setosa versicolor virginica
##      setosa         16          0         0
##      versicolor      0         15         2
##      virginica       0          2        15
# Same classifier with k = 3 neighbours. There is no reseeding here, so the
# random-number state simply continues from the preceding statements.
knn.prediction2 <- knn(df.train, df.test, cl = df.train.y, k = 3)
# Confusion matrix: predicted class (rows) against true class (columns).
table(knn.prediction2, df.test.y)
##                df.test.y
## knn.prediction2 setosa versicolor virginica
##      setosa         16          0         0
##      versicolor      0         15         0
##      virginica       0          2        17

Fine-tune K

# Misclassification rate on the test set for k = 1..30.
# The result vector is preallocated instead of being grown from NULL inside
# the loop.
# NOTE(review): the error is measured on df.test, so picking k from these
# numbers tunes on the test set; a separate validation split or
# cross-validation would give a less optimistic estimate.
error.rate.iris <- numeric(30)
knn.pred.iris <- NULL
for (i in seq_along(error.rate.iris)) {
  # Reseed on every pass so knn()'s random tie-breaking is the same for each k.
  set.seed(1)
  knn.pred.iris <- knn(
    train = df.train,
    test = df.test,
    cl = df.train.y,
    k = i
  )
  # Fraction of test rows whose predicted class differs from the true class.
  error.rate.iris[i] <- mean(df.test.y != knn.pred.iris)
}
print(error.rate.iris)
##  [1] 0.08 0.08 0.04 0.06 0.06 0.06 0.06 0.06 0.06 0.06 0.06 0.06 0.04 0.04 0.04
## [16] 0.04 0.04 0.02 0.02 0.02 0.02 0.04 0.06 0.06 0.04 0.04 0.06 0.04 0.06 0.04
# Smallest test error over the candidate values of k.
min.error.rate <- min(error.rate.iris)
print(min.error.rate)
## [1] 0.02
# Every k that attains that minimum (there can be several).
Kiris <- which(error.rate.iris == min.error.rate)
print(Kiris)
## [1] 18 19 20 21
library(ggplot2)
# Plot the test error (in percent) against k as points joined by a line.
# Built with ggplot() because qplot() is deprecated as of ggplot2 3.4.0.
ggplot(
  data.frame(k = seq_along(error.rate.iris), error = error.rate.iris * 100),
  aes(x = k, y = error)
) +
  geom_point() +
  geom_line() +
  labs(x = "K", y = "Error Rate") +
  # Fixed y-range of 0-40 so the curve is shown against an absolute scale.
  ylim(0, 40)

KNN with Caret

library(caret)
## Loading required package: lattice
# Standardise the four numeric iris columns; column 5 (Species) is dropped.
scaled.iris <- scale(iris[, -5])
# Reproducible 100-row training sample. Indices are drawn from the actual row
# count instead of a hard-coded 150.
set.seed(100)
# NOTE: this variable has the same name as caret's train() function. A call
# such as train(...) still resolves to the function, because R skips
# non-function objects when looking up a call. The name is kept because later
# code indexes with it.
train <- sample(seq_len(nrow(iris)), 100)
# Print the sampled row indices.
train
##   [1] 102 112   4  55  70  98 135   7  43 140  51  25   2  68 137  48  32  85
##  [19]  91 121  16 116  66 146  93  45  30 124 126  87  95  97 120  29  92  31
##  [37]  54  41 105 113  24 142 143  63  65   9 150  20  14  78  88   3  36  27
##  [55]  46  59  96  69  47 147 129 136  12 141 130  56  22  82  53  99   5  44
##  [73]  28  52 139  42  15  57  75  37  26 110 100 149 132 107  35  58 127 111
##  [91] 144  86 114  71 123 119  18   8 128  83
# Split the scaled predictors and the class labels with the sampled indices.
train.irisX <- scaled.iris[train, ]
train.irisY <- iris$Species[train]
test.irisX <- scaled.iris[-train, ]
test.irisY <- iris$Species[-train]
# Training data frame: the scaled predictors followed by the label column,
# named `Species`.
iristr <- data.frame(train.irisX, Species = train.irisY)
str(iristr)
## 'data.frame':    100 obs. of  5 variables:
##  $ Sepal.Length: num  -0.0523 0.6722 -1.5015 0.793 -0.2939 ...
##  $ Sepal.Width : num  -0.8198 -0.8198 0.0979 -0.5904 -1.2787 ...
##  $ Petal.Length: num  0.7602 0.8735 -1.2791 0.477 0.0804 ...
##  $ Petal.Width : num  0.919 0.919 -1.311 0.394 -0.13 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 3 3 1 2 2 2 3 1 1 3 ...
# Test data frame with the same layout: scaled predictors followed by the
# label column, named `Species`.
iriste <- data.frame(test.irisX, Species = test.irisY)
str(iriste)
## 'data.frame':    50 obs. of  5 variables:
##  $ Sepal.Length: num  -0.898 -0.535 -1.139 -0.535 -1.26 ...
##  $ Sepal.Width : num  1.0156 1.9333 0.0979 1.4745 -0.1315 ...
##  $ Petal.Length: num  -1.34 -1.17 -1.28 -1.28 -1.34 ...
##  $ Petal.Width : num  -1.31 -1.05 -1.44 -1.31 -1.44 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Resampling scheme: 5-fold cross-validation on the training frame.
knn.ctrl <- trainControl(method = "cv", number = 5)
# Candidate neighbourhood sizes k = 1..10.
knn.grid <- expand.grid(k = 1:10)
# Fix the seed so the fold assignment is reproducible.
set.seed(1000)
knn.iris <- train(
  Species ~ .,
  data = iristr,
  method = "knn",
  metric = "Accuracy",
  trControl = knn.ctrl,
  tuneGrid = knn.grid
)
# Print the resampling results and the selected k.
knn.iris
## k-Nearest Neighbors 
## 
## 100 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 80, 79, 81, 79, 81 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.9594236  0.9389533
##    2  0.9298496  0.8945485
##    3  0.9498997  0.9246676
##    4  0.9699499  0.9548523
##    5  0.9699499  0.9548523
##    6  0.9704261  0.9556609
##    7  0.9604261  0.9405666
##    8  0.9604261  0.9405666
##    9  0.9604261  0.9405666
##   10  0.9699499  0.9548523
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 6.