head(ubike)
date hour sno sarea sna lat lng
1 2014-12-08 15 2 大安區 捷運國父紀念館站(2號出口) 25.04100 121.5569
2 2014-12-08 15 3 信義區 台北市政府 25.03780 121.5652
3 2014-12-08 15 4 信義區 市民廣場 25.03604 121.5623
4 2014-12-08 15 5 信義區 興雅國中 25.03656 121.5687
5 2014-12-08 15 6 信義區 世貿二館 25.03474 121.5657
6 2014-12-08 15 7 信義區 信義廣場(台北101) 25.03304 121.5656
tot sbi bemp temp humidity pressure
1 48 24 22 16.49 89.87 1020.56
2 40 11 30 16.24 91.76 1020.15
3 60 40 21 16.31 91.62 1020.36
4 60 35 26 16.16 92.38 1020.11
5 80 32 48 16.25 92.32 1020.33
6 80 24 57 16.27 92.58 1020.43
# 製作資料集,並將含有 NA 值的觀察過濾掉
dat <- ubike %>% filter(sarea == '信義區') %>%
select(sbi, temp, humidity, pressure) %>% na.omit()
# PCA
pr <- prcomp(~ temp + humidity + pressure,
data = dat,
scale = TRUE) # 正規化
pr
Standard deviations:
[1] 1.5408655 0.7488696 0.2548097
Rotation:
PC1 PC2 PC3
temp 0.5908020 -0.5100526 0.6251394
humidity 0.5039703 0.8383670 0.2077371
pressure -0.6300531 0.1923202 0.7523603
summary(pr)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.5409 0.7489 0.25481
Proportion of Variance 0.7914 0.1869 0.02164
Cumulative Proportion 0.7914 0.9784 1.00000
str(pr)
List of 6
$ sdev : num [1:3] 1.541 0.749 0.255
$ rotation: num [1:3, 1:3] 0.591 0.504 -0.63 -0.51 0.838 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:3] "temp" "humidity" "pressure"
.. ..$ : chr [1:3] "PC1" "PC2" "PC3"
$ center : Named num [1:3] 16.7 83 1021.1
..- attr(*, "names")= chr [1:3] "temp" "humidity" "pressure"
$ scale : Named num [1:3] 2.54 10.67 3.3
..- attr(*, "names")= chr [1:3] "temp" "humidity" "pressure"
$ x : num [1:3366, 1:3] 0.496 0.466 0.514 0.491 0.488 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:3366] "1" "2" "3" "4" ...
.. ..$ : chr [1:3] "PC1" "PC2" "PC3"
$ call : language prcomp(formula = ~temp + humidity + pressure, data = dat, scale = TRUE)
- attr(*, "class")= chr "prcomp"
## 特徵向量(原變數的線性組合)
# loadings: 依照係數組成判斷
pr$rotation
PC1 PC2 PC3
temp 0.5908020 -0.5100526 0.6251394
humidity 0.5039703 0.8383670 0.2077371
pressure -0.6300531 0.1923202 0.7523603
# score
head(pr$x)
PC1 PC2 PC3
1 0.4961015 0.7244646 -0.16954624
2 0.4657366 0.7116128 -0.10719870
3 0.5143951 0.7869517 -0.18629316
4 0.4905604 0.7769493 -0.11518197
5 0.4884308 0.7991861 -0.08241585
6 0.4795237 0.7425617 -0.10758756
## 陡坡圖(Scree plot): 選擇成分
plot(pr, type="line")
# 主成分分析會將特徵值最大的因素先萃取出來,一般而言特徵值大於1,就是需選擇的因子。
# 用藍線標示出特徵值=1的地方
abline(h=1, col="blue")
# 經過主成份分析後,會轉換成新的以主成份代替的資料集
head(pr$x)
PC1 PC2 PC3
1 0.4961015 0.7244646 -0.16954624
2 0.4657366 0.7116128 -0.10719870
3 0.5143951 0.7869517 -0.18629316
4 0.4905604 0.7769493 -0.11518197
5 0.4884308 0.7991861 -0.08241585
6 0.4795237 0.7425617 -0.10758756
# https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html
library(ggfortify)
autoplot(pr, loadings = TRUE, loadings.label = TRUE)
pr$sdev^2 #因子的標準差平方就是特徵值
[1] 2.37426635 0.56080567 0.06492798
fa <- factanal(dat[,2:4], factors = 1)
fa
Call:
factanal(x = dat[, 2:4], factors = 1)
Uniquenesses:
temp humidity pressure
0.172 0.575 0.005
Loadings:
Factor1
temp 0.910
humidity 0.652
pressure -0.998
Factor1
SS loadings 2.248
Proportion Var 0.749
The degrees of freedom for the model is 0 and the fit was 0.156
summary(fa)
Length Class Mode
converged 1 -none- logical
loadings 3 loadings numeric
uniquenesses 3 -none- numeric
correlation 9 -none- numeric
criteria 3 -none- numeric
factors 1 -none- numeric
dof 1 -none- numeric
method 1 -none- character
n.obs 1 -none- numeric
call 3 -none- call
install.packages('sem')
Q: 使用分群演算法對台北市腳踏車站進行分群
dat <- ubike %>% select(sna, sarea, lat, lng) %>%
distinct(sna, .keep_all = T)
# 經度(longitude, x),緯度(latitude, y)
library(ggplot2)
dat %>% ggplot(aes(x = lng, y = lat)) +
geom_point(aes(colour = sarea)) +
theme(text = element_text(family = 'STHeiti'))
## 繪製地圖
# https://blog.gtwang.org/r/r-ggmap-package-spatial-data-visualization/
library(ggmap)
basemap <- get_map("Taipei, Taiwan", zoom = 12, maptype = "toner-lite")
ggmap(basemap)
p <- ggmap(basemap, darken = c(0.5, "white"))
p1 <- p + geom_point(data = dat, aes(x = lng, y = lat, colour = sarea)) +
theme(legend.position="none")
p1
myCluster <- kmeans(dat[, 3:4], centers = 5, nstart = 20)
myCluster
K-means clustering with 5 clusters of sizes 52, 36, 25, 31, 87
Cluster means:
lat lng
1 25.00595 121.5361
2 25.04375 121.4848
3 25.06245 121.6215
4 25.10282 121.5148
5 25.04815 121.5522
Clustering vector:
[1] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 3 5 5 3 3 5 1 1 1 5 5 5 5 1
[36] 5 1 3 5 3 5 3 3 1 3 5 3 5 5 5 5 5 1 5 5 1 5 5 5 1 1 5 2 2 1 2 2 1 5 5
[71] 5 4 5 5 5 5 4 5 2 1 2 5 2 5 5 5 5 1 2 4 2 5 1 2 5 4 1 5 5 5 1 2 1 2 2
[106] 2 4 5 5 5 4 5 4 5 4 4 4 5 2 5 5 4 4 2 1 3 5 3 1 5 1 4 4 4 4 4 5 2 2 1
[141] 1 1 1 4 4 4 4 4 5 1 4 2 4 5 3 5 3 5 5 4 5 1 4 4 1 4 1 2 5 5 5 5 5 5 4
[176] 3 1 4 5 1 3 3 3 1 1 3 2 2 2 3 3 3 2 2 2 1 4 2 2 2 3 1 1 1 1 1 3 2 2 2
[211] 1 2 1 1 1 1 1 1 1 2 5 1 1 5 5 3 3 2 1 1 2
Within cluster sum of squares by cluster:
[1] 0.03011591 0.04103994 0.01482504 0.01996608 0.04759593
(between_SS / total_SS = 76.5 %)
Available components:
[1] "cluster" "centers" "totss" "withinss"
[5] "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
myCluster$cluster <- as.factor(myCluster$cluster)
ggplot(data = dat, aes(x = lng, y = lat, color = myCluster$cluster)) + geom_point()
## 繪製分群後地圖
p2 <- p + geom_point(data = dat, aes(x = lng, y = lat, color = myCluster$cluster)) +
theme(legend.position="none")
p2
## 前後比較
#install.packages('gridExtra')
gridExtra::grid.arrange(p1, p2, nrow = 1)