2022-11-28第五章回歸分析

使用lm()函數(shù)建立多元線性回歸模型植影，使用glm()建立廣義線性回歸模型戴已。

廣義線性模型之泊松回歸
(1+1/n)^n的極限為e仗哨，單調(diào)有界必有極限

1. 青少年賭博情況數(shù)據(jù)集

# 加載包
library(faraway)
library(ggcorrplot)
library(ggplot2)

# 加載數(shù)據(jù)
data(teengamb)
head(teengamb)

teengamb數(shù)據(jù)集

# 建立回歸模型
## 一元線性回歸模型
lm1 <- lm(gamble~income,data = teengamb)
summary(lm1)

ggplot(teengamb,aes(x = income,y = gamble)) +
    theme_bw() +
    geom_point()

一元線性回歸結(jié)果

## 探索各個(gè)變量之間的相關(guān)性
gamble_cor <- cor(teengamb)
ggcorrplot(gamble_cor,method = "square",lab = TRUE)+ # square表示矩形熱圖，lab=TRUE表示添加相關(guān)系數(shù)
    theme(axis.text.x = element_text(size = 10),
         axis.text.y = element_text(size = 10))

## 多元線性回歸模型
lm1 <- lm(gamble~sex+income+verbal,data = teengamb)
summary(lm1)

## 剔除無關(guān)變量后的多元線性回歸模型
lm2 <- lm(gamble~sex+income,data = teengamb)
summary(lm2)
## sex和income跟gamble相關(guān)性更強(qiáng)

變量之間的相關(guān)性熱圖

多元線性回歸結(jié)果

# 利用所有變量進(jìn)行逐步回歸
## 分為訓(xùn)練集和測(cè)試集
set.seed(12)
index <- sample(nrow(teengamb),round(nrow(teengamb)*0.7))
train <- teengamb[index,]
test <- teengamb[-index,]
teengamblm <- lm(gamble~.,data = train)
summary(teengamblm)

## 預(yù)測(cè)回歸模型
library(Metrics)
prelm <- predict(teengamblm,test)
sprintf("均方根誤差為：%f",mse(test$gamble,prelm))
# '均方根誤差為：322.534961'

## 使用step()函數(shù)進(jìn)行逐步回歸
gamblestep <- step(teengamblm,direction = "both")
summary(gamblestep)

prestep <- predict(gamblestep,test)
sprintf("均方根誤差：%f",mse(test$gamble,prestep))
## 逐步回歸后的均方根誤差反而增大了肛冶，Adjusted R-squared值有稍許增加事哭，
## 說明逐步回歸的結(jié)果相比之前的模型結(jié)果并沒有很好的提升

# Logistic回歸模型
## 在訓(xùn)練集上使用所有變量進(jìn)行邏輯回歸
teengamblm <- glm(sex~.,data = teengamb,family = "binomial")
## 對(duì)邏輯回歸模型進(jìn)行逐步回歸，來篩選變量
teengambstep <- step(teengamblm,direction = "both")
summary(teengambstep)

邏輯回歸模型結(jié)果

2. Adult收入數(shù)據(jù)集

基于美國(guó)人口adult數(shù)據(jù)集R語言分析實(shí)戰(zhàn)

# 數(shù)據(jù)處理
## 讀取數(shù)據(jù)
## 因?yàn)楹凶址妥兞克塘詾榱嘶貧w分析习蓬，需要把變量變?yōu)橐蜃幼兞?adult <- read.csv("adult.csv",header = F,stringsAsFactors = T)
head(adult)

## 添加變量名
colname<-c("age","workclass","fnlwgt","education","education.num",
           "marital.status","occupation","relationship",
           "race","sex","capital.gain","capital.loss","hours.per.week",
           "native.country","class")
colnames(adult)<-colname
## 查看各變量類型
str(adult)

數(shù)據(jù)集

table()函數(shù)--R語言

## 缺失值識(shí)別
sum(is.na(adult))
## 判斷是否存在非NA型的缺失值情況
## table()函數(shù)統(tǒng)計(jì)每個(gè)因子的頻數(shù)
table(adult$workclass,exclude = NULL)

缺失值識(shí)別

## 缺失值處理
## ?字符前面有一個(gè)空格
## 可以看到結(jié)果當(dāng)中還有？這一項(xiàng)措嵌，再找找有沒有更合適的處理方法
adult$occupation[adult$occupation==" ?"] <- NA
adult$native.country[adult$native.country==" ?"] <- NA
table(adult$workclass,exclude = NULL)

## 缺失值刪除
adult1 <- na.omit(adult)
nrow(adult)
nrow(adult1)
head(adult1)

缺失值處理結(jié)果

# 可視化分析數(shù)據(jù)信息
## 相關(guān)系數(shù)熱力圖,只能選取數(shù)值型變量
library(dplyr)
adult_cor <- cor(adult1 %>% select(age,fnlwgt,education.num,
                                   capital.gain,capital.loss,hours.per.week))
ggcorrplot(adult_cor,method = "square",lab = TRUE)+ 
    theme(axis.text.x = element_text(size = 10),
         axis.text.y = element_text(size = 10))

相關(guān)系數(shù)熱力圖

## 不同教育程度的收入差異（直方圖）
library(patchwork)
p1 <- ggplot(adult1,aes(x = education,fill=class)) +
    theme_bw(base_family = "STKaiti") +
    geom_histogram(stat = "count") +
    coord_flip() +
    labs(x="教育程度",y="人數(shù)",title = "不同教育程度的收入差異情況") +
    theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))
## geom_histogram()函數(shù)默認(rèn)position="stack",show.legend=T
## x是因子變量,這里應(yīng)該是stat="count"躲叼，如果是數(shù)值變量，這里應(yīng)該是stat="identity"
## 給了我啟示企巢，字符型變量的count不需要先計(jì)算生成一列num再繪圖枫慷，
## 轉(zhuǎn)換成因子變量可以直接繪圖

p2 <- ggplot(adult1,aes(x = education,fill=class)) +
    theme_bw(base_family = "STKaiti") +
    geom_histogram(stat = "count",position = "fill") +
    coord_flip() +
    labs(x="教育程度",y="人數(shù)",title = "不同教育程度的收入差異情況") +
    theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))

options(repr.plot.width=12, repr.plot.height=6)
p1+p2+plot_layout(guides = 'collect')

## 博士和碩士的人數(shù)雖然較少，但博士和碩士學(xué)歷大多數(shù)都能拿到較高的工資

不同教育程度的收入差距

## 受教育年限不同的收入差異
p1 <- ggplot(adult1,aes(x = education.num,fill=class)) +
    theme_bw(base_family = "STKaiti") +
    geom_histogram(stat = "count") +
    coord_flip() +
    labs(x="教育年限",y="人數(shù)",title = "不同教育年限的收入差異情況") +
    theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))

p2 <- ggplot(adult1,aes(x = education.num,fill=class)) +
    theme_bw(base_family = "STKaiti") +
    geom_histogram(stat = "count",position = "fill") +
    coord_flip() +
    labs(x="教育年限",y="人數(shù)",title = "不同教育年限的收入差異情況") +
    theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))

p1+p2+plot_layout(guides = 'collect')
## 接受教育的年限越長(zhǎng)浪规，其收入較高的可能性越大

不同教育年限的收入差異

## 不同性別的收入差異
p1 <- ggplot(adult1,aes(x = sex,fill=class)) +
    theme_bw(base_family = "STKaiti") +
    geom_histogram(stat = "count") +
    labs(x="性別",y="人數(shù)",title = "不同性別的收入差異情況") +
    theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))

p2 <- ggplot(adult1,aes(x = sex,fill=class)) +
    theme_bw(base_family = "STKaiti") +
    geom_histogram(stat = "count",position = "fill") +
    coord_flip() +
    labs(x="性別",y="人數(shù)",title = "不同性別的收入差異情況") +
    theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))

## 不同年齡的收入差異
p3 <- ggplot(adult1,aes(x = class,y = age,fill=class)) +
    theme_bw(base_family = "STKaiti") +
    geom_boxplot() +
    labs(x="收入",y="年齡",title = "不同年齡的收入差異情況") +
    theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))

## 不同工作時(shí)間的收入差異
p4 <- ggplot(adult1,aes(x = class,y = hours.per.week,fill=class)) +
    theme_bw(base_family = "STKaiti") +
    geom_boxplot() +
    labs(x="收入",y="工作時(shí)間",title = "不同工作時(shí)間的收入差異情況") +
    theme(axis.text.x = element_text(vjust = 0.5),plot.title = element_text(hjust = 0.5))

options(repr.plot.width=12, repr.plot.height=12)
(p1+p2)/(p3+p4)+plot_layout(guides = 'collect')

不同因素的收入差異

# Logistic回歸模型對(duì)年收入進(jìn)行預(yù)測(cè)
adult1$class <- factor(adult1$class,levels=c(" <=50K"," >50K"),labels=c(0,1))
## 數(shù)據(jù)集切分為訓(xùn)練集和測(cè)試集
index <- sample(nrow(adult1),round(nrow(adult1)*0.7))
train <- adult1[index,]
test <- adult1[-index,]
## 在訓(xùn)練集上使用變量進(jìn)行邏輯回歸
adult1train <- glm(class~.,data = train,family = "binomial")
## 逐步回歸篩選變量
adult1step <- step(adult1train,direction = "both")
summary(adult1step)

3. Prostate前列腺病人的數(shù)據(jù)集

# 讀取數(shù)據(jù)
data(prostate)
head(prostate)

# 多元線性回歸
## 探索各個(gè)變量之間的相關(guān)性
prostate_cor <- cor(prostate)
options(repr.plot.width=6, repr.plot.height=6)
ggcorrplot(prostate_cor,method = "square",lab = TRUE)+ # square表示矩形熱圖或听，lab=TRUE表示添加相關(guān)系數(shù)
    theme(axis.text.x = element_text(size = 10),
         axis.text.y = element_text(size = 10))

## 多元線性回歸模型
lm <- lm(lpsa~.,data = prostate)
summary(lm)

數(shù)據(jù)集

變量之間的相關(guān)系數(shù)熱力圖

# 逐步回歸
lmstep <- step(lm,direction = "both")
summary(lmstep)
## Adjusted R-squared值基本沒變化，但去掉了三個(gè)變量lcp,gleason,pgg45

# Ridge回歸
## 加載包
library(glmnet)
## 數(shù)據(jù)集拆分為訓(xùn)練集和測(cè)試集
set.seed(123)
index <- sample(nrow(prostate),round(nrow(prostate)*0.7))
train <- prostate[index,]
test <- prostate[-index,]
## 數(shù)據(jù)標(biāo)準(zhǔn)化
train_s <- scale(train,center = TRUE,scale = TRUE)
test_s <- scale(test,center = TRUE,scale = TRUE)
head(train_s)
head(test_s)

## 利用訓(xùn)練集和cv.glmnet()函數(shù)分析模型參數(shù)的影響
lambdas <- seq(0,5,length.out = 200)
x <- train_s[,1:8]
y <- train_s[,9]
set.seed(1234)
ridge_model <- cv.glmnet(x,y,alpha=0,lambda=lambdas,nfolds = 3)
summary(ridge_model)
p1 <- plot(ridge_model)
p2 <- plot(ridge_model$glmnet.fit,"lambda",label = T)

p1+p2

## 建立ridge模型
ridge_min <- ridge_model$lambda.min
ridge_min

ridge_best <- glmnet(x,y,alpha = 0,lambda = ridge_min)
coef(ridge_best)

## 驗(yàn)證預(yù)測(cè)結(jié)果
test_pre <- predict(ridge_best,as.matrix(test_s[,1:8]))
sprintf("標(biāo)準(zhǔn)化后平均絕對(duì)誤差為：%f",mae(test_s[,9],test_pre))
# '標(biāo)準(zhǔn)化后平均絕對(duì)誤差為：0.416944'

# Lasso回歸
set.seed(1234)
lasso_model <- cv.glmnet(x,y,alpha=1,lambda = lambdas,nfolds = 3)

plot(lasso_model)
plot(lasso_model$glmnet.fit,"lambda",label = T)

## 建立lasso模型
lasso_min <- lasso_model$lambda.min
lasso_best <- glmnet(x,y,alpha = 1,lambda = lasso_min)
coef(lasso_best)

## 驗(yàn)證預(yù)測(cè)結(jié)果
test_pre <- predict(lasso_best,test_s[,1:8])
sprintf("標(biāo)準(zhǔn)化后平均絕對(duì)誤差為：%f",mae(test_s[,9],test_pre))
# '標(biāo)準(zhǔn)化后平均絕對(duì)誤差為：0.421402'

日常廢話：這章內(nèi)容好多啊笋婿，包括很多公式推導(dǎo)誉裆，還得好好消化一番~~~

最后編輯于：2022.12.04 11:31:46

?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者

人面猴
序言：七十年代末，一起剝皮案震驚了整個(gè)濱河市缸濒，隨后出現(xiàn)的幾起案子足丢，更是在濱河造成了極大的恐慌粱腻，老刑警劉巖，帶你破解...
沈念sama閱讀 206,013評(píng)論 6贊 481
死咒
序言：濱河連續(xù)發(fā)生了三起死亡事件斩跌，死亡現(xiàn)場(chǎng)離奇詭異绍些，居然都是意外死亡，警方通過查閱死者的電腦和手機(jī)耀鸦，發(fā)現(xiàn)死者居然都...
沈念sama閱讀 88,205評(píng)論 2贊 382
救了他兩次的神仙讓他今天三更去死
文/潘曉璐我一進(jìn)店門柬批，熙熙樓的掌柜王于貴愁眉苦臉地迎上來，“玉大人袖订，你說我怎么就攤上這事萝快。” “怎么了著角？”我有些...
開封第一講書人閱讀 152,370評(píng)論 0贊 342
道士緝兇錄：失蹤的賣姜人
文/不壞的土叔我叫張陵，是天一觀的道長(zhǎng)旋恼。經(jīng)常有香客問我吏口，道長(zhǎng)，這世上最難降的妖魔是什么冰更？我笑而不...
開封第一講書人閱讀 55,168評(píng)論 1贊 278
?港島之戀（遺憾婚禮）
正文為了忘掉前任产徊，我火速辦了婚禮，結(jié)果婚禮上蜀细，老公的妹妹穿的比我還像新娘舟铜。我一直安慰自己，他們只是感情好奠衔，可當(dāng)我...
茶點(diǎn)故事閱讀 64,153評(píng)論 5贊 371
惡毒庶女頂嫁案：這布局不是一般人想出來的
文/花漫我一把揭開白布谆刨。她就那樣靜靜地躺著，像睡著了一般归斤。火紅的嫁衣襯著肌膚如雪痊夭。梳的紋絲不亂的頭發(fā)上，一...
開封第一講書人閱讀 48,954評(píng)論 1贊 283
城市分裂傳說
那天脏里，我揣著相機(jī)與錄音她我，去河邊找鬼。笑死迫横，一個(gè)胖子當(dāng)著我的面吹牛番舆，可吹牛的內(nèi)容都是我干的。我是一名探鬼主播矾踱，決...
沈念sama閱讀 38,271評(píng)論 3贊 399
雙鴛鴦連環(huán)套：你想象不到人心有多黑
文/蒼蘭香墨我猛地睜開眼恨狈，長(zhǎng)吁一口氣：“原來是場(chǎng)噩夢(mèng)啊……” “哼！你這毒婦竟也來了呛讲？” 一聲冷哼從身側(cè)響起拴事，我...
開封第一講書人閱讀 36,916評(píng)論 0贊 259
萬榮殺人案實(shí)錄
序言：老撾萬榮一對(duì)情侶失蹤沃斤，失蹤者是張志新（化名）和其女友劉穎，沒想到半個(gè)月后刃宵，有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體衡瓶，經(jīng)...
沈念sama閱讀 43,382評(píng)論 1贊 300
?護(hù)林員之死
正文獨(dú)居荒郊野嶺守林人離奇死亡，尸身上長(zhǎng)有42處帶血的膿包…… 初始之章·張勛以下內(nèi)容為張勛視角年9月15日...
茶點(diǎn)故事閱讀 35,877評(píng)論 2贊 323
?白月光啟示錄
正文我和宋清朗相戀三年牲证，在試婚紗的時(shí)候發(fā)現(xiàn)自己被綠了哮针。大學(xué)時(shí)的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片。...
茶點(diǎn)故事閱讀 37,989評(píng)論 1贊 333
活死人
序言：一個(gè)原本活蹦亂跳的男人離奇死亡坦袍，死狀恐怖十厢，靈堂內(nèi)的尸體忽然破棺而出，到底是詐尸還是另有隱情捂齐，我是刑警寧澤蛮放，帶...
沈念sama閱讀 33,624評(píng)論 4贊 322
?日本核電站爆炸內(nèi)幕
正文年R本政府宣布，位于F島的核電站奠宜，受9級(jí)特大地震影響包颁，放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜压真，卻給世界環(huán)境...
茶點(diǎn)故事閱讀 39,209評(píng)論 3贊 307
男人毒藥：我在死后第九天來索命
文/蒙蒙一娩嚼、第九天我趴在偏房一處隱蔽的房頂上張望。院中可真熱鬧滴肿，春花似錦岳悟、人聲如沸。這莊子的主人今日做“春日...
開封第一講書人閱讀 30,199評(píng)論 0贊 19
一樁弒父案贵少，背后竟有這般陰謀
文/蒼蘭香墨我抬頭看了看天上的太陽(yáng)。三九已至堆缘，卻和暖如春春瞬，著一層夾襖步出監(jiān)牢的瞬間，已是汗流浹背套啤。一陣腳步聲響...
開封第一講書人閱讀 31,418評(píng)論 1贊 260
情欲美人皮
我被黑心中介騙來泰國(guó)打工宽气，沒想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留，地道東北人潜沦。一個(gè)月前我還...
沈念sama閱讀 45,401評(píng)論 2贊 352
代替公主和親
正文我出身青樓萄涯，卻偏偏與公主長(zhǎng)得像，于是被迫代替她去往敵國(guó)和親唆鸡。傳聞我的和親對(duì)象是個(gè)殘疾皇子涝影，可洞房花燭夜當(dāng)晚...
茶點(diǎn)故事閱讀 42,700評(píng)論 2贊 345

2022-11-28第五章回歸分析

1. 青少年賭博情況數(shù)據(jù)集

2. Adult收入數(shù)據(jù)集

3. Prostate前列腺病人的數(shù)據(jù)集

推薦閱讀更多精彩內(nèi)容