3. 散點圖
目錄
3. 散點圖
3.1 繪制基本散點圖
3.2 使用點形和顏色屬性進行分組
3.3 使用不同于默認設(shè)置的點形
3.4 將連續(xù)型變量映射到點的顏色或大小屬性上
3.5 處理圖形重疊
3.6 添加回歸模型擬合線
3.7 根據(jù)已有模型向散點圖添加擬合線
3.8 添加來自多個模型的擬合線
3.9 向散點圖添加模型系數(shù)
3.10 向散點圖添加邊際地毯
3.11 向散點圖添加標簽
3.12 繪制氣泡圖
3.13 繪制散點圖矩陣往期文章參考書籍
散點圖通常用于刻畫兩個連續(xù)型變量之間的關(guān)系。繪制散點圖時,數(shù)據(jù)集中的每一個觀測值都由每個點表示俘陷。
3.1 繪制基本散點圖
library(gcookbook)
library(ggplot2)
# 列出我們用到的列
head(heightweight[, c("ageYear", "heightIn")])
> head(heightweight[, c("ageYear", "heightIn")])
ageYear heightIn
1 11.92 56.3
2 12.92 62.3
3 12.75 63.3
4 13.42 59.0
5 15.92 62.5
6 14.25 62.5
ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point()
# shape參數(shù)設(shè)置點型 size設(shè)置點的大小
ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(shape=21)
ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(size=1.5)
3.2 使用點形和顏色屬性進行分組
head(heightweight[, c("sex", "ageYear", "heightIn")])
> head(heightweight[, c("sex", "ageYear", "heightIn")])
sex ageYear heightIn
1 f 11.92 56.3
2 f 12.92 62.3
3 f 12.75 63.3
4 f 13.42 59.0
5 f 15.92 62.5
6 f 14.25 62.5
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point()
ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) +
geom_point()
# scale_shape_manual()使用其它點形狀
#scale_colour_brewer()使用其它顏色
ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) +
geom_point() +
scale_shape_manual(values=c(1,2)) +
scale_colour_brewer(palette="Set1")
3.3 使用不同于默認設(shè)置的點形
# 使用點形和填充色屬性分別表示不同變量
hw <- heightweight
# 分組 Categorize into <100 and >=100 groups
hw$weightGroup <- cut(hw$weightLb, breaks=c(-Inf, 100, Inf),
labels=c("< 100", ">= 100"))
# 使用具有顏色和填充色的點形及對應(yīng)于空值(NA)和填充色的顏色
ggplot(hw, aes(x=ageYear, y=heightIn, shape=sex, fill=weightGroup)) +
geom_point(size=2.5) +
scale_shape_manual(values=c(21, 24)) +
scale_fill_manual(values=c(NA, "black"),
guide=guide_legend(override.aes=list(shape=21)))
3.4 將連續(xù)型變量映射到點的顏色或大小屬性上
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=weightLb)) +
geom_point()
ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb)) +
geom_point()
# 默認點的大小范圍為1-6mm
# scale_size_continuous(range=c(2, 5))修改點的大小范圍
# 將色階設(shè)定為由黑至白
ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
geom_point(shape=21, size=2.5) +
scale_fill_gradient(low="black", high="white")
# 使用 guide_legend() 函數(shù)以離散的圖例代替色階
ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
geom_point(shape=21, size=2.5) +
scale_fill_gradient(low="black", high="white", breaks=12:17,
guide=guide_legend())
# 調(diào)用scale_size_area()函數(shù)使數(shù)據(jù)點的面積正比于變量值赃份。
ggplot(heightweight, aes(x=ageYear, y=heightIn, size=weightLb, colour=sex)) +
geom_point(alpha=.5) +
scale_size_area() +
scale_colour_brewer(palette="Set1")
3.5 處理圖形重疊
方法:
- 使用半透明的點
- 將數(shù)據(jù)分箱(bin),并用矩形表示
- 將數(shù)據(jù)分箱(bin)贝搁,并用六邊形表示
- 使用箱線圖
sp <- ggplot(diamonds, aes(x=carat, y=price))
sp + geom_point()
# 透明度
sp + geom_point(alpha=.1)
sp + geom_point(alpha=.01)
# stat_bin2d()函數(shù)默認分別在x軸和y軸方向上將數(shù)據(jù)分割為30各組
sp + stat_bin2d()
# bin=50設(shè)置箱數(shù),limits參數(shù)設(shè)定圖例范圍
sp + stat_bin2d(bins=50) +
scale_fill_gradient(low="lightblue", high="red", limits=c(0, 6000))
# stat_binhex()函數(shù)使用六邊形分箱
library(hexbin)
sp + stat_binhex() +
scale_fill_gradient(low="lightblue", high="red",
limits=c(0, 8000))
sp + stat_binhex() +
scale_fill_gradient(low="lightblue", high="red",
breaks=c(0, 250, 500, 1000, 2000, 4000, 6000),
limits=c(0, 6000))
sp1 <- ggplot(ChickWeight, aes(x=Time, y=weight))
sp1 + geom_point()
# 調(diào)用position_jitter()函數(shù)給數(shù)據(jù)點增加隨機擾動,通過width,height參數(shù)調(diào)節(jié)
sp1 + geom_point(position="jitter")
# 也可以調(diào)用 geom_jitter()
sp1 + geom_point(position=position_jitter(width=.5, height=0))
# 箱線圖
sp1 + geom_boxplot(aes(group=Time))
3.6 添加回歸模型擬合線
# 運行stat_smooth()函數(shù)并設(shè)定 method=lm 即可向散點圖中添加線性回歸擬合線
# 默認情況下 stat_smooth() 函數(shù)會為回歸擬合線自動添加95% 的置信域,可以設(shè)置 level 參數(shù)對置信水平進行調(diào)整。設(shè)置 se = FALSE, 則不添加置信域
library(gcookbook) # For the data set
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn))
sp + geom_point() + stat_smooth(method=lm)
# 99% 置信域
sp + geom_point() + stat_smooth(method=lm, level=0.99)
# 沒有置信域
sp + geom_point() + stat_smooth(method=lm, se=FALSE)
# 設(shè)置擬合線的顏色
sp + geom_point(colour="grey60") +
stat_smooth(method=lm, se=FALSE, colour="black")
# stat_smooth()函數(shù)默認的模型為 loess 曲線
sp + geom_point(colour="grey60") + stat_smooth()
sp + geom_point(colour="grey60") + stat_smooth(method=loess)
# 分組繪制模型擬合線
sps <- ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point() +
scale_colour_brewer(palette="Set1")
sps + geom_smooth()
sps + geom_smooth(method=lm, se=FALSE, fullrange=TRUE)
值得注意的是:loess()
函數(shù)只能根據(jù)數(shù)據(jù)對應(yīng)的x軸的范圍進行預(yù)測耐床。如果想基于數(shù)據(jù)集對擬合線進行外推,必須使用支持外推的函數(shù)楔脯,比如lm()
撩轰,并將fullrange=TRUE
參數(shù)傳遞給 stat_smooth()
函數(shù)。
3.7 根據(jù)已有模型向散點圖添加擬合線
使用 lm()
函數(shù)建立一個以 ageYear
為預(yù)測變量對 heightIn
進行預(yù)測的模型昧廷。然后堪嫂,調(diào)用 predict()
函數(shù)對 heightIn
進行預(yù)測。
model <- lm(heightIn ~ ageYear + I(ageYear^2), heightweight)
model
> model
Call:
lm(formula = heightIn ~ ageYear + I(ageYear^2), data = heightweight)
Coefficients:
(Intercept) ageYear I(ageYear^2)
-10.3136 8.6673 -0.2478
# 創(chuàng)建一個 ageYear 列木柬,并對其進行插值皆串。
xmin <- min(heightweight$ageYear)
xmax <- max(heightweight$ageYear)
predicted <- data.frame(ageYear=seq(xmin, xmax, length.out=100))
# 計算 heightIn 的預(yù)測值
predicted$heightIn <- predict(model, predicted)
head(predicted)
> head(predicted)
ageYear heightIn
1 11.58000 56.82624
2 11.63980 57.00047
3 11.69960 57.17294
4 11.75939 57.34363
5 11.81919 57.51255
6 11.87899 57.67969
# 將預(yù)測曲線繪制的數(shù)據(jù)點散點圖上
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(colour="grey40")
sp + geom_line(data=predicted, size=1)
# 應(yīng)用定義的 predictvals() 函數(shù)可以簡化向散點圖添加模型擬合線的過程
predictvals <- function(model, xvar, yvar, xrange=NULL, samples=100, ...) {
if (is.null(xrange)) {
if (any(class(model) %in% c("lm", "glm")))
xrange <- range(model$model[[xvar]])
else if (any(class(model) %in% "loess"))
xrange <- range(model$x)
}
newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
names(newdata) <- xvar
newdata[[yvar]] <- predict(model, newdata = newdata, ...)
newdata
}
# 調(diào)用lm() 函數(shù)和 loess() 函數(shù)對數(shù)據(jù)集建立線性和LOESS模型
modlinear <- lm(heightIn ~ ageYear, heightweight)
modloess <- loess(heightIn ~ ageYear, heightweight)
lm_predicted <- predictvals(modlinear, "ageYear", "heightIn")
loess_predicted <- predictvals(modloess, "ageYear", "heightIn")
ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(colour="grey40") +
geom_line(data=lm_predicted, colour="red", size=.8) +
geom_line(data=loess_predicted, colour="blue", size=.8)
3.8 添加來自多個模型的擬合線
根據(jù)變量 sex 的水平對 heightweight 數(shù)據(jù)集進行分組,調(diào)用 lm() 函數(shù)對每組數(shù)據(jù)分別建立線性模型眉枕,并將模型結(jié)果放在一個列表內(nèi)恶复。隨后怜森,通過下面定義的 make_model() 函數(shù)建立模型。
make_model <- function(data) {
lm(heightIn ~ ageYear, data)
}
# 將heighweight 數(shù)據(jù)集分別切分為男性和女性組并建立模型
ibrary(gcookbook)
library(plyr)
models <- dlply(heightweight, "sex", .fun = make_model)
# 查看兩個lm對象f和m組成的列表
models
> models
$f
Call:
lm(formula = heightIn ~ ageYear, data = data)
Coefficients:
(Intercept) ageYear
43.963 1.209
$m
Call:
lm(formula = heightIn ~ ageYear, data = data)
Coefficients:
(Intercept) ageYear
30.658 2.301
attr(,"split_type")
[1] "data.frame"
attr(,"split_labels")
sex
1 f
2 m
predvals <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn")
head(predvals)
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point() + geom_line(data=predvals)
# 設(shè)置 xrange 參數(shù)使兩組預(yù)測線對應(yīng)的xz軸范圍與整個數(shù)據(jù)集對應(yīng)的x軸范圍詳談
predvals <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn",
xrange=range(heightweight$ageYear))
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point() + geom_line(data=predvals)
3.9 向散點圖添加模型系數(shù)
調(diào)用 annotate()
函數(shù)在圖形中添加文本谤牡。
model <- lm(heightIn ~ ageYear, heightweight)
# 查看模型參數(shù)
summary(model)
> summary(model)
Call:
lm(formula = heightIn ~ ageYear, data = heightweight)
Residuals:
Min 1Q Median 3Q Max
-8.3517 -1.9006 0.1378 1.9071 8.3371
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 37.4356 1.8281 20.48 <2e-16 ***
ageYear 1.7483 0.1329 13.15 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.989 on 234 degrees of freedom
Multiple R-squared: 0.4249, Adjusted R-squared: 0.4225
F-statistic: 172.9 on 1 and 234 DF, p-value: < 2.2e-16
pred <- predictvals(model, "ageYear", "heightIn")
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point() +
geom_line(data=pred)
# x,y參數(shù)設(shè)置標簽位置
sp + annotate("text", label="r^2=0.42", x=16.5, y=52)
# parse = TRUE 調(diào)用R的數(shù)學(xué)表達式語法
sp + annotate("text", label="r^2 == 0.42", parse = TRUE, x=16.5, y=52)
# 自動生成公式
eqn <- as.character(as.expression(
substitute(italic(y) == a + b * italic(x) * "," ~~ italic(r)^2 ~ "=" ~ r2,
list(a = format(coef(model)[1], digits=3),
b = format(coef(model)[2], digits=3),
r2 = format(summary(model)$r.squared, digits=2)
))))
eqn
parse(text=eqn) # Parsing turns it into an expression
sp + annotate("text", label=eqn, parse=TRUE, x=Inf, y=-Inf, hjust=1.1, vjust=-.5)
3.10 向散點圖添加邊際地毯
# 使用 geom_rug() 函數(shù)添加邊際地毯
ggplot(faithful, aes(x=eruptions, y=waiting)) +
geom_point() +
geom_rug()
# 通過向邊際地毯線的位置坐標添加擾動并設(shè)定size減小線寬可以減輕邊際地毯線的重疊程度
ggplot(faithful, aes(x=eruptions, y=waiting)) +
geom_point() +
geom_rug(position="jitter", size=.2)
3.11 向散點圖添加標簽
library(gcookbook)
# 以countries數(shù)據(jù)集為例副硅,對各國醫(yī)療保健支出與嬰兒死亡率之間的關(guān)系進行可視化
# 選取人均支出大于2000美元的國家的數(shù)據(jù)子集進行分析
subset(countries, Year==2009 & healthexp>2000)
sp <- ggplot(subset(countries, Year==2009 & healthexp>2000),
aes(x=healthexp, y=infmortality)) +
geom_point()
# annotate()函數(shù)指定標簽坐標和標簽文本
sp + annotate("text", x=4350, y=5.4, label="Canada") +
annotate("text", x=7400, y=6.8, label="USA")
# geom_text()函數(shù)自動添加數(shù)據(jù)標簽
sp + geom_text(aes(label=Name), size=4)
調(diào)整標簽位置,大家自行嘗試翅萤。
# 對標簽的位置進行調(diào)整
sp + geom_text(aes(label=Name), size=4, vjust=0)
sp + geom_text(aes(y=infmortality+.1, label=Name), size=4, vjust=0)
sp + geom_text(aes(label=Name), size=4, hjust=0)
sp + geom_text(aes(x=healthexp+100, label=Name), size=4, hjust=0)
如何只對自己想要的數(shù)據(jù)點添加標簽恐疲。
注:有很多人在后臺問我如何在火山圖里給自己想要的基因添加注釋。這里提供了一個思路套么。
# 新建一個數(shù)據(jù)
cdat <- subset(countries, Year==2009 & healthexp>2000)
cdat$Name1 <- cdat$Name
# 用%in%運算符找出繪圖時希望抱怨的標簽
idx <- cdat$Name1 %in% c("Canada", "Ireland", "United Kingdom", "United States",
"New Zealand", "Iceland", "Japan", "Luxembourg",
"Netherlands", "Switzerland")
idx
# 根據(jù)上面的邏輯向量用 NA 重寫變量 Name1 中的其它取值
cdat$Name1[!idx] <- NA
cdat
ggplot(cdat, aes(x=healthexp, y=infmortality)) +
geom_point() +
geom_text(aes(x=healthexp+100, label=Name1), size=4, hjust=0) +
xlim(2000, 10000)
3.12 繪制氣泡圖
調(diào)用 geom_point()
和 scale_size_area()
函數(shù)即可繪制氣泡圖癞季。
# 示例數(shù)據(jù)
library(gcookbook) # For the data set
cdat <- subset(countries, Year==2009 &
Name %in% c("Canada", "Ireland", "United Kingdom", "United States",
"New Zealand", "Iceland", "Japan", "Luxembourg",
"Netherlands", "Switzerland"))
cdat
> cdat
Name Code Year GDP laborrate healthexp infmortality
1733 Canada CAN 2009 39599.04 67.8 4379.761 5.2
4436 Iceland ISL 2009 37972.24 77.5 3130.391 1.7
4691 Ireland IRL 2009 49737.93 63.6 4951.845 3.4
4946 Japan JPN 2009 39456.44 59.5 3321.466 2.4
5864 Luxembourg LUX 2009 106252.24 55.5 8182.855 2.2
7088 Netherlands NLD 2009 48068.35 66.1 5163.740 3.8
7190 New Zealand NZL 2009 29352.45 68.6 2633.625 4.9
9587 Switzerland CHE 2009 63524.65 66.9 7140.729 4.1
10454 United Kingdom GBR 2009 35163.41 62.2 3285.050 4.7
10505 United States USA 2009 45744.56 65.0 7410.163 6.6
p <- ggplot(cdat, aes(x=healthexp, y=infmortality, size=GDP)) +
geom_point(shape=21, colour="black", fill="cornsilk")
# 將GDP 映射給半徑 (scale_size_continuous)
p
# 將GDP 映射給面積
p + scale_size_area(max_size=15)
如果x軸辛燥,y軸皆是分類變量,氣泡圖可以用來表示網(wǎng)格上的變量值。
# 對男性組和女性組求和
hec <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"]
# 轉(zhuǎn)化為長格式(long format)
library(reshape2)
hec <- melt(hec, value.name="count")
ggplot(hec, aes(x=Eye, y=Hair)) +
geom_point(aes(size=count), shape=21, colour="black", fill="cornsilk") +
scale_size_area(max_size=20, guide=FALSE) +
geom_text(aes(y=as.numeric(Hair)-sqrt(count)/22, label=count), vjust=1,
colour="grey60", size=4)
3.13 繪制散點圖矩陣
散點圖矩陣是一種對多個變量兩兩之間關(guān)系進行可視化的有效方法想鹰。pairs()函數(shù)可以繪制散點圖矩陣。
注:現(xiàn)在散點圖矩陣有現(xiàn)成的R包(如
GGally_ggpairs
)塔鳍。以下內(nèi)容僅供了解汉矿。
# 示例數(shù)據(jù)
library(gcookbook) # For the data set
c2009 <- subset(countries, Year==2009,
select=c(Name, GDP, laborrate, healthexp, infmortality))
head(c2009
> head(c2009)
Name GDP laborrate healthexp infmortality
50 Afghanistan NA 59.8 50.88597 103.2
101 Albania 3772.605 59.5 264.60406 17.2
152 Algeria 4022.199 58.5 267.94653 32.0
203 American Samoa NA NA NA NA
254 Andorra NA NA 3089.63589 3.1
305 Angola 4068.576 81.3 203.80787 99.9
pairs(c2009[,2:5])
# 定義一個panel.cor函數(shù)來展示變量兩兩之間的相關(guān)系數(shù)以代替默認的散點圖
panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y, use="complete.obs"))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2)
}
# 定義 panel.hist 函數(shù)展示各個變量的直方圖
panel.hist <- function(x, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks
nB <- length(breaks)
y <- h$counts
y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="white", ...)
}
pairs(c2009[,2:5], upper.panel = panel.cor,
diag.panel = panel.hist,
lower.panel = panel.smooth)
# 線性模型替代lowess 模型
panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"),
cex = 1, col.smooth = "black", ...) {
points(x, y, pch = pch, col = col, bg = bg, cex = cex)
abline(stats::lm(y ~ x), col = col.smooth, ...)
}
pairs(c2009[,2:5], pch=".",
upper.panel = panel.cor,
diag.panel = panel.hist,
lower.panel = panel.lm)
往期文章
參考書籍
- R Graphics Cookbook, 2nd edition.