使用lm()函數(shù)建立多元線性回歸模型植影,使用glm()建立廣義線性回歸模型戴已。
1. 青少年賭博情況數(shù)據(jù)集
# 加載包
library(faraway)
library(ggcorrplot)
library(ggplot2)
# 加載數(shù)據(jù)
data(teengamb)
head(teengamb)
# 建立回歸模型
## 一元線性回歸模型
lm1 <- lm(gamble~income,data = teengamb)
summary(lm1)
ggplot(teengamb,aes(x = income,y = gamble)) +
theme_bw() +
geom_point()
## 探索各個(gè)變量之間的相關(guān)性
gamble_cor <- cor(teengamb)
ggcorrplot(gamble_cor,method = "square",lab = TRUE)+ # square表示矩形熱圖,lab=TRUE表示添加相關(guān)系數(shù)
theme(axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
## 多元線性回歸模型
lm1 <- lm(gamble~sex+income+verbal,data = teengamb)
summary(lm1)
## 剔除無關(guān)變量后的多元線性回歸模型
lm2 <- lm(gamble~sex+income,data = teengamb)
summary(lm2)
## sex和income跟gamble相關(guān)性更強(qiáng)
# 利用所有變量進(jìn)行逐步回歸
## 分為訓(xùn)練集和測(cè)試集
set.seed(12)
index <- sample(nrow(teengamb),round(nrow(teengamb)*0.7))
train <- teengamb[index,]
test <- teengamb[-index,]
teengamblm <- lm(gamble~.,data = train)
summary(teengamblm)
## 預(yù)測(cè)回歸模型
library(Metrics)
prelm <- predict(teengamblm,test)
sprintf("均方根誤差為:%f",mse(test$gamble,prelm))
# '均方根誤差為:322.534961'
## 使用step()函數(shù)進(jìn)行逐步回歸
gamblestep <- step(teengamblm,direction = "both")
summary(gamblestep)
prestep <- predict(gamblestep,test)
sprintf("均方根誤差:%f",mse(test$gamble,prestep))
## 逐步回歸后的均方根誤差反而增大了肛冶,Adjusted R-squared值有稍許增加事哭,
## 說明逐步回歸的結(jié)果相比之前的模型結(jié)果并沒有很好的提升
# Logistic回歸模型
## 在訓(xùn)練集上使用所有變量進(jìn)行邏輯回歸
teengamblm <- glm(sex~.,data = teengamb,family = "binomial")
## 對(duì)邏輯回歸模型進(jìn)行逐步回歸,來篩選變量
teengambstep <- step(teengamblm,direction = "both")
summary(teengambstep)
2. Adult收入數(shù)據(jù)集
基于美國(guó)人口adult數(shù)據(jù)集R語言分析實(shí)戰(zhàn)
# 數(shù)據(jù)處理
## 讀取數(shù)據(jù)
## 因?yàn)楹凶址妥兞克塘詾榱嘶貧w分析习蓬,需要把變量變?yōu)橐蜃幼兞?adult <- read.csv("adult.csv",header = F,stringsAsFactors = T)
head(adult)
## 添加變量名
colname<-c("age","workclass","fnlwgt","education","education.num",
"marital.status","occupation","relationship",
"race","sex","capital.gain","capital.loss","hours.per.week",
"native.country","class")
colnames(adult)<-colname
## 查看各變量類型
str(adult)
table()函數(shù)--R語言
## 缺失值識(shí)別
sum(is.na(adult))
## 判斷是否存在非NA型的缺失值情況
## table()函數(shù)統(tǒng)計(jì)每個(gè)因子的頻數(shù)
table(adult$workclass,exclude = NULL)
## 缺失值處理
## ?字符前面有一個(gè)空格
## 可以看到結(jié)果當(dāng)中還有?這一項(xiàng)措嵌,再找找有沒有更合適的處理方法
adult$occupation[adult$occupation==" ?"] <- NA
adult$native.country[adult$native.country==" ?"] <- NA
table(adult$workclass,exclude = NULL)
## 缺失值刪除
adult1 <- na.omit(adult)
nrow(adult)
nrow(adult1)
head(adult1)
# 可視化分析數(shù)據(jù)信息
## 相關(guān)系數(shù)熱力圖,只能選取數(shù)值型變量
library(dplyr)
adult_cor <- cor(adult1 %>% select(age,fnlwgt,education.num,
capital.gain,capital.loss,hours.per.week))
ggcorrplot(adult_cor,method = "square",lab = TRUE)+
theme(axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
## 不同教育程度的收入差異(直方圖)
library(patchwork)
p1 <- ggplot(adult1,aes(x = education,fill=class)) +
theme_bw(base_family = "STKaiti") +
geom_histogram(stat = "count") +
coord_flip() +
labs(x="教育程度",y="人數(shù)",title = "不同教育程度的收入差異情況") +
theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))
## geom_histogram()函數(shù)默認(rèn)position="stack",show.legend=T
## x是因子變量,這里應(yīng)該是stat="count"躲叼,如果是數(shù)值變量,這里應(yīng)該是stat="identity"
## 給了我啟示企巢,字符型變量的count不需要先計(jì)算生成一列num再繪圖枫慷,
## 轉(zhuǎn)換成因子變量可以直接繪圖
p2 <- ggplot(adult1,aes(x = education,fill=class)) +
theme_bw(base_family = "STKaiti") +
geom_histogram(stat = "count",position = "fill") +
coord_flip() +
labs(x="教育程度",y="人數(shù)",title = "不同教育程度的收入差異情況") +
theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))
options(repr.plot.width=12, repr.plot.height=6)
p1+p2+plot_layout(guides = 'collect')
## 博士和碩士的人數(shù)雖然較少,但博士和碩士學(xué)歷大多數(shù)都能拿到較高的工資
## 受教育年限不同的收入差異
p1 <- ggplot(adult1,aes(x = education.num,fill=class)) +
theme_bw(base_family = "STKaiti") +
geom_histogram(stat = "count") +
coord_flip() +
labs(x="教育年限",y="人數(shù)",title = "不同教育年限的收入差異情況") +
theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))
p2 <- ggplot(adult1,aes(x = education.num,fill=class)) +
theme_bw(base_family = "STKaiti") +
geom_histogram(stat = "count",position = "fill") +
coord_flip() +
labs(x="教育年限",y="人數(shù)",title = "不同教育年限的收入差異情況") +
theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))
p1+p2+plot_layout(guides = 'collect')
## 接受教育的年限越長(zhǎng)浪规,其收入較高的可能性越大
## 不同性別的收入差異
p1 <- ggplot(adult1,aes(x = sex,fill=class)) +
theme_bw(base_family = "STKaiti") +
geom_histogram(stat = "count") +
labs(x="性別",y="人數(shù)",title = "不同性別的收入差異情況") +
theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))
p2 <- ggplot(adult1,aes(x = sex,fill=class)) +
theme_bw(base_family = "STKaiti") +
geom_histogram(stat = "count",position = "fill") +
coord_flip() +
labs(x="性別",y="人數(shù)",title = "不同性別的收入差異情況") +
theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))
## 不同年齡的收入差異
p3 <- ggplot(adult1,aes(x = class,y = age,fill=class)) +
theme_bw(base_family = "STKaiti") +
geom_boxplot() +
labs(x="收入",y="年齡",title = "不同年齡的收入差異情況") +
theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))
## 不同工作時(shí)間的收入差異
p4 <- ggplot(adult1,aes(x = class,y = hours.per.week,fill=class)) +
theme_bw(base_family = "STKaiti") +
geom_boxplot() +
labs(x="收入",y="工作時(shí)間",title = "不同工作時(shí)間的收入差異情況") +
theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))
options(repr.plot.width=12, repr.plot.height=12)
(p1+p2)/(p3+p4)+plot_layout(guides = 'collect')
# Logistic回歸模型對(duì)年收入進(jìn)行預(yù)測(cè)
adult1$class <- factor(adult1$class,levels=c(" <=50K"," >50K"),labels=c(0,1))
## 數(shù)據(jù)集切分為訓(xùn)練集和測(cè)試集
index <- sample(nrow(adult1),round(nrow(adult1)*0.7))
train <- adult1[index,]
test <- adult1[-index,]
## 在訓(xùn)練集上使用變量進(jìn)行邏輯回歸
adult1train <- glm(class~.,data = train,family = "binomial")
## 逐步回歸篩選變量
adult1step <- step(adult1train,direction = "both")
summary(adult1step)
3. Prostate前列腺病人的數(shù)據(jù)集
# 讀取數(shù)據(jù)
data(prostate)
head(prostate)
# 多元線性回歸
## 探索各個(gè)變量之間的相關(guān)性
prostate_cor <- cor(prostate)
options(repr.plot.width=6, repr.plot.height=6)
ggcorrplot(prostate_cor,method = "square",lab = TRUE)+ # square表示矩形熱圖或听,lab=TRUE表示添加相關(guān)系數(shù)
theme(axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
## 多元線性回歸模型
lm <- lm(lpsa~.,data = prostate)
summary(lm)
# 逐步回歸
lmstep <- step(lm,direction = "both")
summary(lmstep)
## Adjusted R-squared值基本沒變化,但去掉了三個(gè)變量lcp,gleason,pgg45
# Ridge回歸
## 加載包
library(glmnet)
## 數(shù)據(jù)集拆分為訓(xùn)練集和測(cè)試集
set.seed(123)
index <- sample(nrow(prostate),round(nrow(prostate)*0.7))
train <- prostate[index,]
test <- prostate[-index,]
## 數(shù)據(jù)標(biāo)準(zhǔn)化
train_s <- scale(train,center = TRUE,scale = TRUE)
test_s <- scale(test,center = TRUE,scale = TRUE)
head(train_s)
head(test_s)
## 利用訓(xùn)練集和cv.glmnet()函數(shù)分析模型參數(shù)的影響
lambdas <- seq(0,5,length.out = 200)
x <- train_s[,1:8]
y <- train_s[,9]
set.seed(1234)
ridge_model <- cv.glmnet(x,y,alpha=0,lambda=lambdas,nfolds = 3)
summary(ridge_model)
p1 <- plot(ridge_model)
p2 <- plot(ridge_model$glmnet.fit,"lambda",label = T)
p1+p2
## 建立ridge模型
ridge_min <- ridge_model$lambda.min
ridge_min
ridge_best <- glmnet(x,y,alpha = 0,lambda = ridge_min)
coef(ridge_best)
## 驗(yàn)證預(yù)測(cè)結(jié)果
test_pre <- predict(ridge_best,as.matrix(test_s[,1:8]))
sprintf("標(biāo)準(zhǔn)化后平均絕對(duì)誤差為:%f",mae(test_s[,9],test_pre))
# '標(biāo)準(zhǔn)化后平均絕對(duì)誤差為:0.416944'
# Lasso回歸
set.seed(1234)
lasso_model <- cv.glmnet(x,y,alpha=1,lambda = lambdas,nfolds = 3)
plot(lasso_model)
plot(lasso_model$glmnet.fit,"lambda",label = T)
## 建立lasso模型
lasso_min <- lasso_model$lambda.min
lasso_best <- glmnet(x,y,alpha = 1,lambda = lasso_min)
coef(lasso_best)
## 驗(yàn)證預(yù)測(cè)結(jié)果
test_pre <- predict(lasso_best,test_s[,1:8])
sprintf("標(biāo)準(zhǔn)化后平均絕對(duì)誤差為:%f",mae(test_s[,9],test_pre))
# '標(biāo)準(zhǔn)化后平均絕對(duì)誤差為:0.421402'
日常廢話:這章內(nèi)容好多啊笋婿,包括很多公式推導(dǎo)誉裆,還得好好消化一番~~~