Formulations:
\[\begin{split} \Pr\{Y=1 \mid X\} &= p(X) \\ \text{(probability)}\quad p(X) &= \frac{e^{\hat\beta_0 + \hat\beta_1 x_1 + \cdots}}{1 + e^{\hat\beta_0 + \hat\beta_1 x_1 + \cdots}} \\ \text{(odds)}\quad \frac{p(X)}{1-p(X)} &= e^{\hat\beta_0 + \hat\beta_1 x_1 + \cdots} \\ \text{(logit)}\quad \ln\Big(\frac{p(X)}{1-p(X)}\Big) &= \hat\beta_0 + \hat\beta_1 x_1 + \cdots \end{split}\]
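A quick numeric illustration of the three scales (values made up): if \(p = 0.8\), the odds are \(0.8/0.2 = 4\) and the logit is \(\log 4 \approx 1.39\).
p=0.8
p/(1-p)        # odds: 4
log(p/(1-p))   # logit: about 1.386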
Make predictions: plug the x values and the estimated coefficients into the formula (see the hand-check after the predict() call below).
library(ISLR2)
def=Default
str(def)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
head(def)
lr1=glm(default~balance,data=def,family=binomial(link = "logit"))
\[
\log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 \cdot \text{balance}
\]
- binomial: y has only two values.
- logit: the relationship between X and Y is modeled on the logit scale, \(\log\left(\frac{p}{1-p}\right)\).
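Note: logit is the default link for the binomial family, so family=binomial alone fits the same model; a quick check (lr1b is just an illustrative name):
lr1b=glm(default~balance,data=def,family=binomial)
all.equal(coef(lr1),coef(lr1b))   # TRUE: same fit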
summary(lr1)
##
## Call:
## glm(formula = default ~ balance, family = binomial(link = "logit"),
## data = def)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.065e+01 3.612e-01 -29.49 <2e-16 ***
## balance 5.499e-03 2.204e-04 24.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1596.5 on 9998 degrees of freedom
## AIC: 1600.5
##
## Number of Fisher Scoring iterations: 8
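To read the slope on the odds scale, exponentiate it: each extra dollar of balance multiplies the odds of default by about exp(0.0055) = 1.0055.
exp(coef(lr1)["balance"])   # odds ratio per one-dollar increase in balance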
pred=predict(lr1,type="response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000237 0.0003346 0.0021888 0.0333000 0.0142324 0.9810081
table(def$default,pred>=0.5)
##
## FALSE TRUE
## No 9625 42
## Yes 233 100
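From this confusion table (cutoff 0.5), the overall training error rate is the sum of the off-diagonal counts over n:
(42+233)/10000   # misclassification rate: 0.0275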
nulldev=-2*(9667*log(9667/10000)+333*log(333/10000))
nulldev
## [1] 2920.65
\[ \text{Null Deviance} = -2 \sum_{i=1}^{n} \Big( y_i \log\hat{p} + (1 - y_i)\log(1-\hat{p}) \Big), \qquad \hat{p} = \frac{1}{n}\sum_{i=1}^{n} y_i \]
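The residual deviance can be checked the same way, replacing the common rate \(\hat p\) with each observation's fitted probability (using pred from above):
y=as.numeric(def$default=="Yes")
-2*sum(y*log(pred)+(1-y)*log(1-pred))   # should match the residual deviance, 1596.5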
predict(lr1,data.frame(balance=c(1000,2000)),type="response")
## 1 2
## 0.005752145 0.585769370
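As a hand-check, plug balance = 1000 and the fitted coefficients into the probability formula; this should reproduce the first prediction:
b=coef(lr1)
eta=b[1]+b[2]*1000        # fitted logit at balance = 1000
exp(eta)/(1+exp(eta))     # should match 0.005752145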
# Define points
x1 <- c(1, 0)
x2 <- c(0, 1)
x3 <- c(2, 1)
x4 <- c(3, 3)
x5 <- c(5, 4)
# Combine points into a matrix
X <- rbind(x1, x2, x3, x4, x5)
X
## [,1] [,2]
## x1 1 0
## x2 0 1
## x3 2 1
## x4 3 3
## x5 5 4
plot(X)
round(dist(
X,
method = "euclidean",
diag = TRUE,
upper = TRUE
), 2)
## x1 x2 x3 x4 x5
## x1 0.00 1.41 1.41 3.61 5.66
## x2 1.41 0.00 2.00 3.61 5.83
## x3 1.41 2.00 0.00 2.24 4.24
## x4 3.61 3.61 2.24 0.00 2.24
## x5 5.66 5.83 4.24 2.24 0.00
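Hand-check one entry, e.g. the Euclidean distance between x1 = (1, 0) and x2 = (0, 1):
sqrt(sum((x1-x2)^2))   # sqrt(1 + 1) = 1.41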
squared_euclidean <-
dist(X,
method = "euclidean",
diag = TRUE,
upper = TRUE) ^ 2
squared_euclidean
## x1 x2 x3 x4 x5
## x1 0 2 2 13 32
## x2 2 0 4 13 34
## x3 2 4 0 5 18
## x4 13 13 5 0 5
## x5 32 34 18 5 0
round(dist(
X,
method = "manhattan",
diag = TRUE,
upper = TRUE
), 2)
## x1 x2 x3 x4 x5
## x1 0 2 2 5 8
## x2 2 0 2 5 8
## x3 2 2 0 3 6
## x4 5 5 3 0 3
## x5 8 8 6 3 0
## k-means clustering
set.seed(581)
km.out=kmeans(x=X,centers=2,nstart=20)
km.out
## K-means clustering with 2 clusters of sizes 2, 3
##
## Cluster means:
## [,1] [,2]
## 1 4 3.5000000
## 2 1 0.6666667
##
## Clustering vector:
## x1 x2 x3 x4 x5
## 2 2 2 1 1
##
## Within cluster sum of squares by cluster:
## [1] 2.500000 2.666667
## (between_SS / total_SS = 79.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
km.out$cluster
## x1 x2 x3 x4 x5
## 2 2 2 1 1
km.out$tot.withinss
## [1] 5.166667
The total within-cluster sum of squares (tot.withinss) is a measure of the compactness of the clustering, where lower values indicate a better fit.
\[ \text{TotalWithinSS} = \sum_{i=1}^{k} \sum_{x \in C_i} \|x - m_i\|^2 \]
That is, \(TotalWithinSS\) is the sum, over clusters, of the squared distances from each point \(x\) in cluster \(C_i\) to its centroid \(m_i\).
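As a check on the formula, tot.withinss can be recomputed from the cluster assignments and centroids stored in km.out:
cent=km.out$centers[km.out$cluster,]   # the centroid assigned to each point
sum(rowSums((X-cent)^2))               # should match km.out$tot.withinss, 5.166667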
plot(X, col=(km.out$cluster+1), main="K-Means Clustering Results with K=2", xlab="", ylab="", pch=2, cex=1)
# Define points
x1 <- c(1, 3)
x2 <- c(1, 1)
x3 <- c(2.5, 1)
x4 <- c(6, 1)
x5 <- c(6, 3)
# Combine points into a matrix
X <- rbind(x1, x2, x3, x4, x5)
X
## [,1] [,2]
## x1 1.0 3
## x2 1.0 1
## x3 2.5 1
## x4 6.0 1
## x5 6.0 3
plot(X)
euc <- round(dist(
X,
method = "euclidean",
diag = TRUE,
upper = TRUE
), 2)
squared_euc <-
dist(X,
method = "euclidean",
diag = TRUE,
upper = TRUE) ^ 2
man <- round(dist(
X,
method = "manhattan",
diag = TRUE,
upper = TRUE
), 2)
euc
## x1 x2 x3 x4 x5
## x1 0.00 2.00 2.50 5.39 5.00
## x2 2.00 0.00 1.50 5.00 5.39
## x3 2.50 1.50 0.00 3.50 4.03
## x4 5.39 5.00 3.50 0.00 2.00
## x5 5.00 5.39 4.03 2.00 0.00
squared_euc
## x1 x2 x3 x4 x5
## x1 0.00 4.00 6.25 29.00 25.00
## x2 4.00 0.00 2.25 25.00 29.00
## x3 6.25 2.25 0.00 12.25 16.25
## x4 29.00 25.00 12.25 0.00 4.00
## x5 25.00 29.00 16.25 4.00 0.00
man
## x1 x2 x3 x4 x5
## x1 0.0 2.0 3.5 7.0 5.0
## x2 2.0 0.0 1.5 5.0 7.0
## x3 3.5 1.5 0.0 3.5 5.5
## x4 7.0 5.0 3.5 0.0 2.0
## x5 5.0 7.0 5.5 2.0 0.0
# Hierarchical Clustering
hc.single = hclust(d = squared_euc, method = "single")
hc.complete = hclust(d = squared_euc, method = "complete")
hc.average = hclust(d = squared_euc, method = "average")
hc.centroid = hclust(d = squared_euc, method = "centroid")
plot(hc.single,
main = "Single Linkage",
xlab = "",
cex = .9)
hc.single$merge
## [,1] [,2]
## [1,] -2 -3
## [2,] -1 1
## [3,] -4 -5
## [4,] 2 3
In the merge matrix, a negative entry refers to an original observation and a positive entry to the cluster formed at that earlier step.
plot(hc.complete,
main = "Complete Linkage",
xlab = "",
cex = .9)
hc.complete$merge
## [,1] [,2]
## [1,] -2 -3
## [2,] -4 -5
## [3,] -1 1
## [4,] 2 3
plot(hc.average,
main = "Average Linkage",
xlab = "",
cex = .9)
hc.average$merge
## [,1] [,2]
## [1,] -2 -3
## [2,] -4 -5
## [3,] -1 1
## [4,] 2 3
plot(hc.centroid,
main = "Centroid Linkage",
xlab = "",
cex = .9)
hc.centroid$merge
## [,1] [,2]
## [1,] -2 -3
## [2,] -4 -5
## [3,] -1 1
## [4,] 2 3
3 Steps:
1. calculate \(s(i)\) for each point
2. calculate \(S_j\), the average \(s(i)\), for each cluster
3. calculate \(S\) for the whole clustering
To calculate \(a\), \(b\), and \(s\) we still need the distance matrix (any distance works: Euclidean, Manhattan, or squared Euclidean).
The silhouette index \(s(i)\) of a sample \(i\) is
\[ s(i) = \frac{b(i) - a(i)}{\max\{a(i), b(i)\}} \]
where \(a(i)\) is the average distance from the \(i\)th point to the other points in the same cluster, and \(b(i)\) is the minimum average distance from the \(i\)th point to the points in a different cluster, minimized over clusters.
# get the model
iris.cl2=kmeans(iris[,-5],centers=2,nstart=10)
iris.cl3=kmeans(iris[,-5],3,nstart=10)
iris.cl4=kmeans(iris[,-5],4,nstart=10)
iris.cl4$betweenss/iris.cl4$tot.withinss
## [1] 10.90615
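The same ratio for the k = 2 and k = 3 fits, for comparison (note the ratio tends to grow with k, so it should not be used alone to pick k):
iris.cl2$betweenss/iris.cl2$tot.withinss
iris.cl3$betweenss/iris.cl3$tot.withinss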
betweenss / tot.withinss
- This ratio measures the separation of the clusters relative to their compactness. Larger values indicate better clustering: the clusters are well separated (high between-cluster variation) and tight (low within-cluster variation).
## silhouette plots
library(cluster)
dist.iris=dist(iris[,-5],method="euclidean")
sil.iris.cl2=silhouette(iris.cl2$cluster, dist.iris)
sil.iris.cl3=silhouette(iris.cl3$cluster, dist.iris)
sil.iris.cl4=silhouette(iris.cl4$cluster, dist.iris)
plot(sil.iris.cl2)
plot(sil.iris.cl3)
plot(sil.iris.cl4)
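The plots can be summarized by the average silhouette width for each k (higher is better):
mean(sil.iris.cl2[,3])
mean(sil.iris.cl3[,3])
mean(sil.iris.cl4[,3])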
plot(hc.complete,
main = "Complete Linkage",
xlab = "",
cex = .9)
cluster_assignment <- cutree(hc.complete, k=2)
cluster_assignment
## x1 x2 x3 x4 x5
## 1 1 1 2 2
sil <- silhouette(cluster_assignment, squared_euc)
sil
## cluster neighbor sil_width
## [1,] 1 2 0.8101852
## [2,] 1 2 0.8842593
## [3,] 1 2 0.7017544
## [4,] 2 1 0.8188679
## [5,] 2 1 0.8291815
## attr(,"Ordered")
## [1] FALSE
## attr(,"call")
## silhouette.default(x = cluster_assignment, dist = squared_euc)
## attr(,"class")
## [1] "silhouette"
Each row is one observation, with the silhouette width for that observation.
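Hand-check of the first row: for x1, \(a(1)\) is the average squared-Euclidean distance to the other points in its cluster (x2, x3), and \(b(1)\) the average distance to the other cluster (x4, x5):
D=as.matrix(squared_euc)
a1=mean(D["x1",c("x2","x3")])   # (4 + 6.25) / 2 = 5.125
b1=mean(D["x1",c("x4","x5")])   # (29 + 25) / 2 = 27
(b1-a1)/max(a1,b1)              # 0.8101852, matching sil_width above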
# total sil
mean(sil[, 3])
## [1] 0.8088496
plot(sil)