# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
#### Exercises Section 1: Date pre-processing####
# --------------------------------------------------------
library(glmnet)
library(survival)
library(ISLR)
###############################################################
#### Exercise 1: effect of scaling ####
#scale:標(biāo)準(zhǔn)化"(x-miu)/sigma"
###############################################################
summary(iris)
#取前100個作為訓(xùn)練樣本
dat = iris[1:100,]
#去掉沒用的level
dat$Species = droplevels(dat$Species)?
##由于只取前100行數(shù)據(jù)作為訓(xùn)練樣本断医,virginica 這個species沒用到,所以用droplevels把它去掉
##所以在數(shù)據(jù)前期處理時舅柜,如果新選擇的訓(xùn)練樣本中有沒用到的level厘贼,應(yīng)該用droplevels把它刪掉
#分別取自變量x 因變量y(species)
x = dat[,1:4]
y = dat$Species
# 對數(shù)據(jù)做scale 標(biāo)準(zhǔn)化
## method 1 先scale 再自變量因變量分離
dats = dat
dats[,1:4] = apply(dats[,1:4],2,scale)
##method 2 先自變量因變量分離,再對自變量做標(biāo)準(zhǔn)化
## we can apply scaling to the x data directly:
xs = apply(x,2,scale)
# (1)
##PCA
##iris數(shù)據(jù)中有四個成分Sepal.Length Sepal.Width Petal.Length Petal.Width;方差最大的為PCA中的第一主成分
##(但是我們并不能看出來第一主成分是原數(shù)據(jù)中的哪個成分)铛楣,方差第n大的是PCA中的第n主成分
## 用prcomp()做PCA弱左,summary得到的第一行,是對應(yīng)主成分的標(biāo)準(zhǔn)差棘钞;第二行是Var(PCi)/sum(Var(PCi)); 第三行是第二行的累計和
pca.unscaled = prcomp(x)
pca.scaled = prcomp(x,scale=TRUE)
pca.scaled.2 = prcomp(xs) # should be the same as pca.scaled
summary(pca.unscaled)
summary(pca.scaled)
## Proportion of Variance越大缠借,說明該成分包含的信息越多,如果該成分信息變化宜猜,那么總體受影響更大
## summary(pca.unscaled)里泼返,Cumulative Proportion means:
## if you take the first component(PC1), you will capture the 91%, which means it contains the most information
## if you take the first two components, you will capture the 98%, the other two dimensions in the data sets are probably redundant.
## 如果你有前兩個成分的數(shù)據(jù),那你就抓住了98%的信息, 那么剩下兩個成分就是多余的姨拥,不重要的
# plot the data on its first 2 dimensions in each space:
#biplot:主成分分析散點圖(雙標(biāo)圖)
#分析biplot:
#兩環(huán)境線段之間的夾角的余弦值是它們的相關(guān)系數(shù)绅喉,夾角小于90度表示正相關(guān)渠鸽,說明兩環(huán)境對品種排序相似,大于90度表示負(fù)相關(guān)柴罐,表示兩環(huán)境對品種排序相反拱绑,等于90度說明兩環(huán)境不相關(guān)。夾角較小說明試驗點是重復(fù)設(shè)置的丽蝎,去掉一個不影響對品種的評價猎拨。
#環(huán)境線段的長度是試驗點對品種的區(qū)分能力,線段越長屠阻,區(qū)分能力越強红省。
par(mfrow=c(1,2))
plot(x[,1:2], pch=20, col=y, main="Data in original space")
biplot(pca.unscaled, main="Data in PCA space")
abline(v=0, col='orange')
# see the binary separation in orange along PC1?
# re-use this into biplot in original space:
pca.cols = c("blue","orange")[1+as.numeric(pca.unscaled$x[,1]>0)]
plot(x[,1:2], pch=20, col=pca.cols,
main="Data in original space\n colour-coded using PC1-split")
par(mfrow=c(2,2))
plot(pca.unscaled) # scree plot
biplot(pca.unscaled) # biplot
plot(pca.scaled) # scree plot
biplot(pca.scaled) # biplot
# now analyse this plot :)
# 從圖中可以看出兩點:
#1、柱狀圖中国觉,PL和PW對應(yīng)的柱子很長吧恃,說明他們兩個包含的信息多(包含主要信息)
#2、在biplot圖中麻诀,PL和PW的線方向一致痕寓,說明他們兩個的相關(guān)性很大
# Petal.Length和Petal.Width capture the same kind of direction of information, Sepal.Length and Sepal.Width have smaller contribution than those two.
# (2)logistic regression
#dat——unscaled,dats——scaled
logreg.unscaled = glm(Species~., data=dat) # make this work
#family='binomial' means do logistic regression
logreg.unscaled = glm(Species~., data=dat, family='binomial')
logreg.scaled = glm(Species~., data=dats, family='binomial')
#family='binomial' means do logistic regression
# discuss...
# (...unscaled and scaled 's coefficients are the fits different?)
cbind(coef(logreg.unscaled), coef(logreg.scaled))
#the signs are the same(兩個數(shù)據(jù)的正負(fù)符號是一致的)
#but the contributions of each of the variables has changed, sometimes relate to each other(但是系數(shù)和影響變化了,但是有時是有聯(lián)系的)
#have a swap between SP and SL, probably because there is a lot of redundancy between them.(SP和SL的大小交換了蝇闭,可能因為有太多冗雜的東西在里面)
cor(dats$Petal.Length,dats$Petal.Width)
#0.9793217 means a high correlation them, 相關(guān)性很強呻率,SP和SL可以混同(confused),他們可以相互交換(interchangeable)
# (... does this align 一致 with the PCA analysis?)
# (yes, we see a change in the role of each variable in the information space,但是PCA里PL和PW是主成分PC1 PC2呻引,他們在邏輯回歸里的系數(shù)也是很大的兩個
# (... and why this align 一致 with the PCA analysis?為什么PCA和邏輯回歸會一致)
# (They are both linear techniques)
# (3)
#lasso要先把數(shù)據(jù)變成matrix
x.m = model.matrix(Species~.+0, data=dat)#unscaled
lasso.cv = cv.glmnet(x.m, y, family="binomial")#用cv.glmnet去找到后面lasso要用的lambda
lasso.unscaled = glmnet(x.m, y, family="binomial", lambda=lasso.cv$lambda.min)
lasso.pred = predict(lasso.unscaled, newx=x.m, type="class")
#
xs.m = model.matrix(Species~.+0, data=dats)#scaled
lasso.cv = cv.glmnet(xs.m, y, family="binomial")
lasso.scaled = glmnet(xs.m, y, family="binomial", lambda=lasso.cv$lambda.min)
lasso.s.pred = predict(lasso.scaled, newx=xs.m, type="class")
#
cbind(coef(lasso.unscaled), coef(lasso.scaled))
#由此可見礼仗,scaled和unscaled的intercept很不一樣,這是因為我們在scaled里centred all the covariance,which means the data in scaled are all central zero,所以there is very little need for an intercept parameter
table(lasso.pred, lasso.s.pred)
###############################################################
#### Exercise 2: data imputation ####
###############################################################
par(mfrow=c(1,1))
summary(lung)
boxplot(lung$meal.cal~lung$sex, col=c("cyan","pink"))
# can you think of other ways of analyzing this?
# (1) lung cancer data: compare meal.cal values between male and female cohorts,
# and discuss w.r.t. gender-specific data imputation
# NB: "missing at random" vs "missing not at random"??
?lung
View(lung)
dim(lung)#228
dim(na.omit(lung))#167
#數(shù)據(jù)中有NA逻悠,處理missing value
#167/228= 0.7324561元践,有73%的數(shù)據(jù),剩下的是不全(丟失)的數(shù)據(jù)
#data imputation:fill out the missing value補全丟失數(shù)據(jù)
#replacing the missing values with the mean or media of the variable for the data set.
nas = is.na(lung$meal.cal) # track missing values 看看哪些值是NA
table(nas, lung$sex) #統(tǒng)計男女中各自NA的個數(shù)
imales = which(lung$sex==1)#將data set男性的數(shù)據(jù)位置找出來
m.all = mean(lung$meal.cal, na.rm=TRUE)#男女總體的mean
m.males = mean(lung$meal.cal[imales], na.rm=TRUE)#男性的mean
m.females = mean(lung$meal.cal[-imales], na.rm=TRUE)#女性的mean
t.test(lung$meal.cal[imales], lung$meal.cal[-imales])#預(yù)測男女包含missing value的mean是否相等
# p= 0.01989<0.05, 意味著alternative hypothesis: true difference in means is not equal to 0, 男女包含missing value的mean是不相等的童谒,所以要用不同的imputation值去填補他們
# significant difference, hence must use different imputation
# values for each gender
# (2) Run Cox PHMs on original, overall imputed and gender-specific imputed
# datsets, using the cohort sample mean for data imputation. Compare and discuss
#用不同的數(shù)據(jù)去填充missing value单旁,比較他們用COX模型擬合的差別
# model fitting output.
dat1 = dat2 = lung
# impute overall mean in dat1:
dat1$meal.cal[nas] = m.all #假如都用整體的mean來填充
# impute gender=sepcific mean in dat2:
dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==1))] = m.males#用女的均值填充女
dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==2))] = m.females#用男的均值填充男
#Fit Cox proportional hazard models: Cox比例風(fēng)險回歸模型
?coxph()
cox0 = coxph(Surv(time,status)~.,data=lung) #沒有填充missing value
cox1 = coxph(Surv(time,status)~.,data=dat1) #直接填充不分男女
cox2 = coxph(Surv(time,status)~.,data=dat2) #分男女填充
summary(cox0)
summary(cox1)
summary(cox2)
cbind(coef(coxph(Surv(time,status)~.,data=lung)),
? ? ? coef(coxph(Surv(time,status)~.,data=dat1)),coef(coxph(Surv(time,status)~.,data=dat2)))
round(cbind(coef(coxph(Surv(time,status)~.,data=lung)),
? ? ? ? ? ? coef(coxph(Surv(time,status)~.,data=dat1)),coef(coxph(Surv(time,status)~.,data=dat2))),3)
#由比較可見,沒有填充和兩種填充方法三者meal.cal coefficient的結(jié)果差不多
# - dat1 and dat2 yield increased sample size (from 167 to 209, both imputed
# datasets having 209 observations) dat1和dat2產(chǎn)生了增加的樣本大小(從167到209饥伊,都有209個觀測數(shù)據(jù)集)
# - overall coefficient effects comparable between the 2 sets
# - marginal differences in covariate effect and significance between lung and {dat1;dat2}
# - no substantial difference between dat1 and dat2 outputs
###############################################################
#### Exercise 3: data imputation ####
###############################################################
library(ISLR)
dat = Hitters
View(Hitters)
# (1) (Deletion)
?na.omit()
sdat = na.omit(dat)#返回刪除NA后的數(shù)據(jù)
sx = model.matrix(Salary~.+0,data=sdat)
sy = sdat$Salary
cv.l = cv.glmnet(sx,sy)
slo = glmnet(sx,sy,lambda=cv.l$lambda.min)
# (2) Simple imputation (of Y) using overall mean
ina = which(is.na(dat$Salary))#把NA的位置找出來
dat$Salary[ina] = mean(dat$Salary[-ina])
x = model.matrix(Salary~.+0,data=dat)
y = dat$Salary
cv.l = cv.glmnet(x,y)
lo = glmnet(x,y,lambda=cv.l$lambda.min)
# (3)
#刪除 OR 補充缺失值象浑;fit lasso模型;再用擬合好的lasso模型做出估計的y^撵渡;比較真實值與用lasso模型估計出來的估計值的標(biāo)準(zhǔn)差
slop = predict(slo,newx=sx)#用fit好的lasso模型
lop = predict(lo,newx=x)
sqrt(mean((slop-sy)^2))
sqrt(mean((lop-y)^2))#標(biāo)準(zhǔn)差差不多
#畫圖比較用delete和Simple imputation兩種方法擬合的模型做出來的每一個y^的差距融柬,如果是45度直線死嗦,說明兩種方法預(yù)測的y^是差不多的
plot(slop,lop[-ina])
abline(a=0,b=1)
abline(lm(lop[-ina]~slop), col='navy')
# What could we do instead of imputing the Y?
###############################################################
#### Exercise 4: resampling ####
###############################################################
###############################################################
#### Exercise 5: resampling (CV vs bootstrapping)####
###############################################################
# Implement this simple analysis and discuss - think about
# (sub)sample sizes!
# 跳過shuff過程
#n = nrow(trees)#都要先洗牌趋距,重新隨機排列一下原數(shù)據(jù)
#trees = trees[sample(1:n, n), ]
x = trees$Girth? # sorted in increasing order...
y = trees$Height
plot(x, y, pch=20)
summary(lm(y~x))
N = nrow(trees)
# (1) 10-fold CV on original dataset
set.seed(4060)
K = 10
slp <- numeric(K)
itc <- numeric(K)
cc <- numeric(K)
folds = cut(1:N, K, labels=FALSE)
for(k in 1:K){
i = which(folds==k)
# train:
lmo = lm(y[-i]~x[-i])
slp[k] <- lmo$coefficient[2]
itc[k] <- lmo$coefficient[1]
cc[k] = summary(lmo)$coef[2,2]
# (NB: no testing here, so not the conventional use of CV)
}
mean(cc)#0.3443734
####cc是啥,不是想儲存slop么####
# (2) 10-fold CV on randomized dataset
# shuffle it
set.seed(1)
mix = sample(1:nrow(trees), replace=FALSE)
xr = trees$Girth[mix]
yr = trees$Height[mix]
set.seed(4060)
K = 10
ccr = numeric(K)
folds = cut(1:N, K, labels=FALSE)
for(k in 1:K){
i = which(folds==k)
lmo = lm(yr[-i]~xr[-i])
ccr[k] = summary(lmo)$coef[2,2]
}
mean(ccr)#0.3408507越除,shuff后的mean更小一些
mean(cc)#0.3443734
#(3)
sd(ccr)#0.01854404节腐,shuff后的sd更小一些
sd(cc)#0.03990858
boxplot(cc,ccr)
t.test(cc,ccr)#p>0.05外盯,拒絕備擇假設(shè)alternative hypothesis,true difference in means is equal to 0
var.test(cc,ccr)#F-test (var.test()) p<0.05,接受備擇假設(shè)alternative hypothesis翼雀,true ratio of variances is not equal to 1
?var.test
#p>0.05,accept H0, rejept H1(alternative hypothesis)
# (4) Bootstrapping (additional note)用BT來做饱苟,跟CV比較
set.seed(4060)
K = 100
cb = numeric(K)
for(i in 1:K){
# bootstrapping
ib = sample(1:N,N,replace=TRUE)
lmb = lm(y[ib]~x[ib])
cb[i] = summary(lmb)$coef[2,2]
}
mean(cb)#0.3163658,不如shuff后的CV結(jié)果好
sd(cb)#0.0505108,不如shuff后的CV結(jié)果好
dev.new()
par(font=2, font.axis=2, font.lab=2)
boxplot(cbind(cc,ccr,cb), names=c("CV","CVr","Bootstrap"))
####abline(h=1.0544)???####
t.test(cc,cb)
####Explain why these are different?如何評價####
round(cc, 3)
#(5)
#we have 36.8% points out-of-BT, so using 368 points as test set, 632 points as train set
#(6)
N = 1000
x = runif(N, 2, 20)
y = 2 + 5*x + rnorm(N)
R=33
K1=3
pred.err.cv=numeric(R*K1)
###CV
for(j in 1:R){
? mix=sample(1:N,replace = F)
? xr=x[mix]
? yr=y[mix]
? folds=cut(1:N,K1,labels = F)
? data53=as.data.frame(cbind(xr,yr))
? mse = pred.err = numeric(K1)
? for(k in 1:K1){
? ? i.train = which(folds!=k)
? ? o = lm(yr~xr, data=data53, subset=i.train)
? ? i.test = which(folds==k)
? ? yp = predict(o, newdata=data53[i.test,])
? ? pred.err[k] = mean((yp-yr[i.test])^2)
? }
? pred.err.cv[c(K1*j-K1+1):c(K1*j)]=pred.err
}
mean(pred.err.cv)
###Bootstrapping command+shift+c 全選加注釋
# K2 = R*K1
# pred.err.BT = numeric(K2)
# for(j in 1:K2){
#? mix=sample(1:N,replace = T)
#? xr=x[mix]
#? yr=y[mix]
#? data53=as.data.frame(cbind(xr,yr))
#? o = lm(yr~xr, data=data53)
#? yh = o$fitted.values
#? pred.err.BT[j] = mean((yh-yr)^2)
# }
# mean(pred.err.BT)
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Exercises Section 2: Regularization ####
# --------------------------------------------------------
###############################################################
#### Exercise 1: tuning LASSO ####
###############################################################
# Have a go at this exercise yourself...
# you can refer to ST4060 material:)
library(ISLR)
library(glmnet)
Hitters = na.omit(Hitters)
x = model.matrix(Salary~., Hitters)[,-1]
y = Hitters$Salary
cv.rd <- cv.glmnet(x,y)
ridge_mod = glmnet(x, y, alpha = 0, lambda = cv.rd$lambda.min)
pred.rd <- predict(ridge_mod, newx=x)
length(pred.rd)
length(y)
sqrt(mean((pred.rd - y)^2))
summary(ridge_mod)
coef(ridge_mod)
ridge_mod$lambda
the_grid = 10^seq(10, -2, length = 100)
n <- length(the_grid)
criterion = numeric(length(the_grid))
for(i in 1:n){
? fit = glmnet(x, y, alpha = 0, lambda = the_grid[i])
? pred.fit <- predict(fit, newx=x)
? criterion[i] = sqrt(mean((pred.fit - y)^2))
}
which.min(criterion)
the_grid[which.min(criterion)]
###############################################################
#### Exercise 2: tuning LASSO + validation split ####
###############################################################
# Have a go at this exercise yourself too...
# you can refer to ST4060 material:)
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Exercises Section 3: Classification Exercises ####
# --------------------------------------------------------
# install.packages("class")
# install.packages("MASS")
# install.packages("car")
# install.packages("ISLR")
# install.packages("pROC")
# install.packages("carData")
library(class) # contains knn()
library(MASS)? # to have lda()
library(carData)
library(car)
library(ISLR) # contains the datasets
library(pROC)
###############################################################
#### Exercise 1: kNN on iris data ####
###############################################################
set.seed(1)
# shuffle dataset first:
z = iris[sample(1:nrow(iris)),]
# Here we focus on sepal information only
plot(z[,1:2], col=c(1,2,4)[z[,5]],
? ? pch=20, cex=2)
x = z[,1:2] # sepal variables only
y = z$Species
# Here's how to use the knn() classifier:
K = 1
n = nrow(x)
# split the data into train+test:
i.train = sample(1:n, 100)
x.train = x[i.train,]
x.test = x[-i.train,]
y.train = y[i.train]
y.test = y[-i.train]
ko = knn(x.train, x.test, y.train, K)
tb = table(ko, y.test)
1 - sum(diag(tb)) / sum(tb) # overall classification error rate, error rate越小越好
library(caret)
confusionMatrix(data=ko, reference=y.test)#顯示the error matrix
#Here狼渊, Accuracy : 0.7 which is sum(diag(tb)) / sum(tb); 95%Confidence interval around its accuracy matrix箱熬;還能比較3種class的Sensitivity 和 Specificity
?confusionMatrix#創(chuàng)建一個給定特定邊界的混淆矩陣
# Build a loop around that to find best k:
# (NB: assess effect of various k-values
# on the same data-split)
Kmax = 30
acc = numeric(Kmax)#acc越大越好
for(k in 1:Kmax){
? ko = knn(x.train, x.test, y.train, k)
? tb = table(ko, y.test)
? acc[k] = sum(diag(tb)) / sum(tb)
}
plot(1-acc, pch=20, t='b', xlab='k')#從圖中找到最低點,就是最好的k
#評價:if k is too small(overfitting), decision boundry is overly flexible, low bias+large variance
#? ? ? if k is too large(underfitting), decision boundry is not flexible enough, high bias+small variance
###############################################################
#### Exercise 2: GLM(logistic regression) on 2-class iris data ####
###############################################################
n = nrow(iris)
is = sample(1:n, size=n, replace=FALSE)
dat = iris[is,-c(3,4)] # shuffled version of the original set, and hold sepal variables only
# record into 2-class problem:轉(zhuǎn)化成2-class problem “是virginica”=1 OR “不是virginica ”=0
dat$is.virginica = as.numeric(dat$Species=="virginica")
dat$Species = NULL # "remove" this component
names(dat)
is = 1:100 # training set
fit = glm(is.virginica~., data=dat, subset=is, family=binomial(logit))
pred = predict(fit, newdata=dat[-is,], type="response")#返回值是estimated probabilities
hist(pred)
?predict.glm
y.test = dat$is.virginica[-is] # true test set classes
boxplot(pred~y.test, names=c("other","virginica"))#通過箱線圖可以看出“virginica”和“not virginica”通過glm分開的比較徹底
abline(h=0.5, col=3)
#roughly 70% of data would be classified correctly.
# for varying cut-off (ie threshold) values, compute corresponding
# predicted labels, and corresponding confusion matrix:
#find the best cutoff
err = NULL
for(cut.off in seq(.1, .9, by=.1)){
? pred.y = as.numeric(pred>cut.off)
? tb = table(pred.y, y.test)
? err = c(err, (1-sum(diag(tb))/sum(tb)))
}
plot(seq(.1, .9, by=.1), err, t='b')
#choose the best cutoff to classification.
cut.off=0.2
pred.y = as.numeric(pred>cut.off)
tb = table(pred.y, y.test)
err = (1-sum(diag(tb))/sum(tb))
###############################################################
#### Exercise 3: LDA assumptions ####
###############################################################
## (1) 2-class classification problem
#多類轉(zhuǎn)化成2類的三種方法
#(1)轉(zhuǎn)換成0 1
dat = iris
dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))
#(2)轉(zhuǎn)換成帶命名的(更直觀)
#用0 1 區(qū)分是virginica還是非virginica并不直觀狈邑,不如用virginica or other 來區(qū)分城须,如下:
# to recode cleanly, you could use for instance:
dat$Species = car::recode(dat$Species, "0='other'; 1='virginica'")
#(3)先轉(zhuǎn)化成0 1,再命名
# or:直接讓levels變成virginica or other
#dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))#先變成0 1
#levels(dat$Species) = c("other","virginica")#再對0 1 命名
##(2)粗略查看分類情況
par(mfrow=c(1,2))
plot(iris[,1:2], pch=20, col=c(1,2,4)[iris$Species], cex=2)
legend("topright",col=c(1,2,4),
? ? ? legend=levels(iris$Species),
? ? ? pch=20, bty='n')#3-class
plot(dat[,1:2], pch=20, col=c(1,4)[dat$Species], cex=2)
legend("topright",col=c(1,4),
? ? ? legend=levels(dat$Species),
? ? ? pch=20, bty='n')#2-class:virginica or not virginica
#由于LDA需要滿足兩個assumption:normal-distribution米苹;equal variance糕伐,否則將不能用LDA方法分類
#First assumption: all histograms are looking like normal distribution
#Second assumption: 每個predictor的方差相同
##First,we need to explore distribution of each predictors(每一列指標(biāo)):(看是否滿足正態(tài)分布)
head(dat)
#here, the first four columns are the predictors(or features/attributes/covariates/variables), the last column is class
# 1.boxplots seem relatively symmetric畫箱線圖看他是否對稱:
#look at each column of the dataset:
par(mfrow=c(2,2))
for(j in 1:4){
? boxplot(dat[,j]~dat$Species,
? ? ? ? ? ylab='predictor',
? ? ? ? ? col=c('cyan','pink'),
? ? ? ? ? main=names(dat)[j])
}
#here you can see there is a clear separation between the two classes, except the sepal.width
#so we decide to use the three other predictors to make a classification
#由于正態(tài)分布是對稱的蘸嘶,如果箱線圖的箱子是對稱(symmetric)的良瞧,那么很可能是滿足正態(tài)分布的,如果箱子不對稱训唱,那么會有skew
#看correlation
cor(dat[,1:4])
#here we can see there is 96%(超過90%就可以這么說) correlation between Petal.Width and Petal.Length, which means these two predictors can tell us the same information
#2.只看箱線圖是不夠的褥蚯,還要看是否是histogram正態(tài)分布 but we'd rather check for Normality more specifically:
par(mfrow=c(2,4), font.lab=2, cex=1.2)
for(j in 1:4){
? hist(dat[which(dat$Species=='other'),j], col='cyan',
? ? ? xlab='predictor for class other',
? ? ? main=names(dat)[j])
? hist(dat[which(dat$Species!='other'),j], col='pink',
? ? ? xlab='predictor for class virginica',
? ? ? main=names(dat)[j])
}
#PL and PW for the other class are definitely not normal,其他的圖都是approximately normal
#It means there may be a limitation of our analysis if we decide to use LDA here.
#這里其實就不適合用LDA了,但是為了了解這個方法况增,我們繼續(xù)往下做
#3.除了用柱狀圖判斷是否normal遵岩,也可以用QQ圖 could also use QQ-plots:
par(mfrow=c(2,4), cex=1.2)
for(j in 1:4){
? x.other = dat[which(dat$Species=='other'),j]
? qqnorm(x.other, pch=20, col='cyan',
? ? ? ? main=names(dat)[j])
? abline(a=mean(x.other), b=sd(x.other))
? x.virginica = dat[which(dat$Species!='other'),j]
? qqnorm(x.virginica, pch=20, col='pink',
? ? ? ? main=names(dat)[j])
? abline(a=mean(x.virginica), b=sd(x.virginica))
}
# So what do you think?
#在45度線上的是normal distribution,很明顯PL和PW在other上不是
## Check initial assumption of equality of variances:all predictors have same variance for each class
# Bartlett's test with H0: all variances are equal, p>0.05 接受H0巡通,有相同的variance
for(j in 1:4){
? print( bartlett.test(dat[,j]~dat$Species)$p.value )
}
#只有SL滿足
# Shapiro's test with H0: the distribution is Normal
for(j in 1:4){
? print( shapiro.test(dat[which(dat$Species=='virginica'),j])$p.value )
}#p都>0.05尘执,Normal
for(j in 1:4){
? print( shapiro.test(dat[which(dat$Species=='other'),j])$p.value )
}#PL和PW的p value 很小,不normal
## Fit LDA model to this dataset and check accuracy:
lda.o = lda(Species~., data=dat)
(lda.o)
# can we track some of the values in that summary?
table(dat$Species)/nrow(dat)
#Here tells us there are one third data belongs to virginica, and two thirds belongs to other class.
#It means we don't have balance in how each class represented in the data set.
rbind(
? apply(dat[which(dat$Species=='other'),1:4], 2, mean),
? apply(dat[which(dat$Species=='virginica'),1:4], 2, mean)
)#按列求每一列的均值
# what about the coefficients of linear discriminates?那么線性微分的系數(shù)呢?
x = as.matrix(dat[,1:4])
proj = x %*% lda.o$scaling #取每一列的數(shù)據(jù)宴凉,乘lda.o$scaling
plot(proj, pch=20, col=dat$Species, cex=2)
#we can see, the projections increase as we move from one class to another
# little hack to recover the fitted values quickly
predo = predict(lda.o, newdata=dat)
names(predo)
predo$class#顯示每一個測試樣本的分類
predo$posterior#有0.9990663400的概率第一個測試樣本屬于other class誊锭,有0.0009336600的概率它屬于virginica
y = predo$x
??predo
plot(proj, y)#由此我們能看出,the scores used by LDA in order to generate the predictions and probabilities are the result of proj by using lda.o$scaling directly.(y=proj)
plot(y, predo$posterior[,2])
boxplot(y ~ (predo$posterior[,2]>.5))#第二列probabilities > 0.5 是virginica(TRUE)
boxplot(proj ~ (predo$posterior[,2]>.5))#兩種畫出來的圖是一樣的弥锄,因為y和proj相等
# NB: The way these coefs is calculated follows the MANOVA approach
# popular hack to recover the fitted values:
fitted.values = predict(lda.o, newdata=dat)$class?
boxplot(y~dat$Species)
boxplot(proj~dat$Species)
(tb.2 = table(fitted.values, dat$Species))
sum(diag(tb.2)) / sum(tb.2)#accuracy
#LDA will be more efficient than LR when class are well separated.
## (2) 3-class classification problem
dat = iris
## Explore distribution of predictors:
# boxplots seem relatively symmetric:
par(mfrow=c(2,2))
# here's just a loop to save having to write 4 boxplot
# instructions with names by hand (being lazy often
# makes for nicer code):
for(j in 1:4){
? boxplot(dat[,j]~dat$Species,
? ? ? ? ? xlab = 'Species',
? ? ? ? ? ylab = 'predictor',
? ? ? ? ? col=c('cyan','pink'),
? ? ? ? ? main=names(dat)[j])
}
# but we'd rather check for Normality more specifically:
Ls = levels(dat$Species)
par(mfcol=c(3,4))
for(j in 1:4){
? hist(dat[which(dat$Species==Ls[1]),j], col='cyan',
? ? ? main=names(dat)[j])
? hist(dat[which(dat$Species==Ls[2]),j], col='pink',
? ? ? main=names(dat)[j])
? hist(dat[which(dat$Species==Ls[3]),j], col='green',
? ? ? main=names(dat)[j])
}
# could also use QQ-plots:
par(mfcol=c(3,4))
for(j in 1:4){
? x1 = dat[which(dat$Species==Ls[1]),j]
? qqnorm(x1, pch=20, col='cyan', main=names(dat)[j])
? abline(a=mean(x1), b=sd(x1))
? x2 = dat[which(dat$Species==Ls[2]),j]
? qqnorm(x2, pch=20, col='pink', main=names(dat)[j])
? abline(a=mean(x2), b=sd(x2))
? x3 = dat[which(dat$Species==Ls[3]),j]
? qqnorm(x3, pch=20, col='green', main=names(dat)[j])
? abline(a=mean(x3), b=sd(x3))
}
# So what do you think now?
## Check initial assumption of equality of variances:
# Bartlett's test with H0: all variances are equal
print( bartlett.test(dat[,1]~dat$Species)$p.value )
print( bartlett.test(dat[,2]~dat$Species)$p.value )
print( bartlett.test(dat[,3]~dat$Species)$p.value )
print( bartlett.test(dat[,4]~dat$Species)$p.value )
## or if in lazy mode:
for(j in 1:4){
? print( bartlett.test(dat[,j]~dat$Species)$p.value )
}
## Fit LDA model to this dataset and check accuracy:
lda.o = lda(Species~., data=dat)
(lda.o)
ftted.values = predict(lda.o, newdata=dat)$class
(tb.3 = table(ftted.values, dat$Species))
sum(diag(tb.3)) / sum(tb.3)
###############################################################
### Exercise 4: LDA
###############################################################
## (1) 2-class classification problem
dat = iris
dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))
levels(dat$Species) = c("other","virginica")
n = nrow(dat)
set.seed(4061)
dat = dat[sample(1:n),] # shuffle dataset
i.train = 1:100
dat.train = dat[i.train,]
dat.test = dat[-i.train,]
lda.o = lda(Species~., data=dat.train)
lda.p = predict(lda.o, newdata=dat.test)
names(lda.p)
(tb = table(lda.p$class, dat.test$Species))
sum(diag(tb))/sum(tb)
# QDA:
qda.o = qda(Species~., data=dat.train)
qda.p = predict(qda.o, newdata=dat.test)
(tb = table(qda.p$class, dat.test$Species))
sum(diag(tb))/sum(tb)
## (2) 3-class classification problem
dat = iris
n = nrow(dat)
set.seed(4061)
dat = dat[sample(1:n),]
i.train = 1:100
dat.train = dat[i.train,]
dat.test = dat[-i.train,]
# LDA:
lda.o = lda(Species~., data=dat.train)
lda.p = predict(lda.o, newdata=dat.test)
names(lda.p)
(tb = table(lda.p$class, dat.test$Species))
sum(diag(tb))/sum(tb)
# QDA:
qda.o = qda(Species~., data=dat.train)
qda.p = predict(qda.o, newdata=dat.test)
(tb = table(qda.p$class, dat.test$Species))
sum(diag(tb))/sum(tb)
###############################################################
### Exercise 5: benchmarking
###############################################################
## (1) benchmarking on unscaled data
#ROC圖像 橫縱坐標(biāo)都越接近1越好丧靡, AOC是線與橫軸包圍的面積,也是越大越好
#sensitivity:縱坐標(biāo)
#specificity:橫坐標(biāo)
#confusionMatrix:混淆矩陣
set.seed(4061)
n = nrow(Default)
dat = Default[sample(1:n, n, replace=FALSE), ]
# get a random training sample containing 70% of original sample:
i.cv = sample(1:n, round(.7*n), replace=FALSE)
dat.cv = dat[i.cv,] # use this for CV (train+test)
dat.valid = dat[-i.cv,] # save this for later (after CV) (HOLD-OUT)
# tuning of the classifiers:
K.knn = 3
# perform K-fold CV:
K = 10
N = length(i.cv)
folds = cut(1:N, K, labels=FALSE)
acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)
auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)
#
for(k in 1:K){ # 10-fold CV loop
? # split into train and test samples:
? i.train = which(folds!=k)
? dat.train = dat.cv[i.train, ]
? dat.test = dat.cv[-i.train, ]
? # adapt these sets for kNN:
? x.train = dat.train[,-1]
? y.train = dat.train[,1]
? x.test = dat.test[,-1]
? y.test = dat.test[,1]
? x.train[,1] = as.numeric(x.train[,1])
? x.test[,1] = as.numeric(x.test[,1])
? # train classifiers:
? knn.o = knn(x.train, x.test, y.train, K.knn)
? glm.o = glm(default~., data=dat.train, family=binomial(logit))
? lda.o = lda(default~., data=dat.train)
? qda.o = qda(default~., data=dat.train)
? # test classifiers:
? knn.p = knn.o
? glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )
? lda.p = predict(lda.o, newdata=dat.test)$class
? qda.p = predict(qda.o, newdata=dat.test)$class
? tb.knn = table(knn.p, y.test)
? tb.glm = table(glm.p, y.test)
? tb.lda = table(lda.p, y.test)
? tb.qda = table(qda.p, y.test)
? # store prediction accuracies:
? acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)
? acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)
? acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)
? acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)
? #
? # ROC/AUC analysis:AUC值越大越好
? # WARNING: THIS IS NOT Pr(Y=1 | X), BUT Pr(Y = Y_hat | X):
? knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob
? glm.p = predict(glm.o, newdata=dat.test, type="response")
? lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]
? qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]
? auc.knn[k] = roc(y.test, knn.p)$auc
? auc.glm[k] = roc(y.test, glm.p)$auc
? auc.lda[k] = roc(y.test, lda.p)$auc
? auc.qda[k] = roc(y.test, qda.p)$auc
}
boxplot(acc.knn, acc.glm, acc.lda, acc.qda,
? ? ? ? main="Overall CV prediction accuracy",
? ? ? ? names=c("kNN","GLM","LDA","QDA"))
boxplot(auc.knn,auc.glm, auc.lda, auc.qda,
? ? ? ? main="Overall CV AUC",
? ? ? ? names=c("KNN","GLM","LDA","QDA"))
##### Taking a closer look at performance
roc(y.test, glm.p)$auc#真實值 預(yù)測值
plot(roc(y.test, glm.p))
#取threshold為0.5籽暇,用caret建立混淆矩陣温治,計算精確度
library(caret)
(tb = table(y.test, glm.p>.5))
pred = as.factor(glm.p>.5)
pred = car::recode(pred, "FALSE='No'; TRUE='Yes'")
caret::confusionMatrix(y.test, pred)
sum(diag(tb))/sum(tb)
(683+3)/(683+3+14)#=98%
#看總體數(shù)據(jù)的真實情況纫谅,No占96.67%浇辜,與98%相差不大蔫仙,說明預(yù)測有效
table(Default$default)
table(Default$default)/nrow(Default)
##### Further exercises for you to do:
## adapt code to evaluate sensitivity and specificity
## add validation analysis...
## repeat on scaled data... 會對KNN有影響
###############################################################
### Exercise 6: benchmarking, again
###############################################################
## (1) benchmarking on unscaled data
set.seed(4061)
n = nrow(Default)
dat = Default[sample(1:n, n, replace=FALSE), ]
# get a random training sample containing 70% of original sample:
i.cv = sample(1:n, round(.7*n), replace=FALSE)
x = dat.cv = dat[i.cv,] # use this for CV (train+test)
dat.valid = dat[-i.cv,] # save this for later (after CV)
# Recover ROC curve manually from whole set:
n = nrow(x)
acc = numeric(length(thrs))
sens = spec = numeric(length(thrs))
thrs = seq(.05,.95,by=.05)
for(ithr in 1:length(thrs)){
? thr = thrs[ithr]
? glmo = glm(default~., data=x,
? ? ? ? ? ? family=binomial)
? tb = table(glmo$fitted.values>thr, x$default)
? acc[ithr] = sum(diag(tb))/sum(tb)
? #
? # calculate sensitivity for a given threshold
? sens[ithr] = tb[2,2]/sum(tb[,2])
? # calculate specificity for a given threshold
? spec[ithr] = tb[1,1]/sum(tb[,1])
? # prediction
}
plot(1-spec,sens,xlim=c(0,1),ylim=c(0,1),t='b')
abline(h=c(0,1),v=c(0,1),col=8)
abline(a=0,b=1,col=8)
plot(acc)
plot(spec, sens)#能看出來spec增加输硝,sens會下降
confusionMatrix(y.test,pred)
# Evaluate a cross-validated ROC curve manually:
# 手工評估交叉驗證的ROC曲線:
n = nrow(x)
K = 10
train.acc = test.acc = matrix(NA, nrow=K, ncol=length(thrs))
folds = cut(1:n, K, labels=FALSE)
k = 1
thrs = seq(.05,.95,by=.05)
for(ithr in 1:length(thrs)){
? thr = thrs[ithr]
? for(k in 1:K){
? ? itrain = which(folds!=k)
? ? glmo = glm(default~., data=x,
? ? ? ? ? ? ? family=binomial,
? ? ? ? ? ? ? subset=itrain)
? ? tb = table(glmo$fitted.values>thr, x$default[itrain])
? ? train.acc[k, ithr] = sum(diag(tb))/sum(tb)
? ? #
? ? # calculate sensitivity for a given threshold
? ? sens[ithr] = tb[2,2]/sum(tb[,2])
? ? # calculate specificity for a given threshold
? ? spec[ithr] = tb[1,1]/sum(tb[,1])
? ? # prediction
? ? p.test = predict(glmo, x[-itrain,], type='response')
? ? tb = table(p.test>thr, x$default[-itrain])
? ? test.acc[k,ithr] = sum(diag(tb))/sum(tb)
? }
}
boxplot(test.acc)
mean(train.acc)?
mean(test.acc)
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Exercises Section 4: Tree-based methods 用樹來分類####
# --------------------------------------------------------
###############################################################
#### Exercise 1: growing and pruning a tree ####
###############################################################
#decision tree:從所有變量中挑出一個最重要的來作為根結(jié)點唠摹,后面的每一階都是從所有變量里選最重要的一個作為下一個節(jié)點花吟,這會使得有重復(fù)的變量被選中萄喳,畫出的樹特別大洞焙,造成overfitting
#因此,我們需要對做出的tree做刪減pruning突琳,目的是找到一個最理想的tree size分級大小若债,也就是方差dev最小的那個
install.packages("tree")
library(ISLR) # contains the dataset
library(tree) # contains... tree-building methods
# Recode response variable so as to make it a classification problem
High = ifelse(Carseats$Sales<=8, "No", "Yes")
# Create a data frame that includes both the predictors and response
# (a data frame is like an Excel spreadsheet, if you like)
CS = data.frame(Carseats, High)
CS$Sales = NULL#把sale去掉
CS$High = as.factor(CS$High) # <-- this bit was missing 必須有 不然后面會warning
# Fit the tree using all predictors (except for variable Sales,
# which we have "recoded" into a cateorical response variable)
# to response variable High
tree.out = tree(High~., CS)
summary(tree.out)
# plot the tree
plot(tree.out)
text(tree.out, pretty=0)
#The tree is fully grown, there is lots of little branches, maybe have a very detailed breakdown, means there is an overfitting of the dataset
# pruning:修剪
?cv.tree
set.seed(3)
cv.CS = cv.tree(tree.out, FUN=prune.misclass)#添加prune.misclass函數(shù)以找到合適的size
names(cv.CS)
# - size:
# number of terminal nodes in each tree in the cost-complexity pruning sequence.
#two method to control the depth of the tree:
# (1)- deviance:
# total deviance of each tree in the cost-complexity pruning sequence.
# (2)- k:
# the value of the cost-complexity pruning parameter of each tree in the sequence.
cv.CS
par(mfrow=c(1,2))
plot(cv.CS$size,cv.CS$dev,t='b')#find which size? has the smallest deviation 93 (偏差:觀測值-真實值)
min(cv.CS$dev)
cv.CS$size[which.min(cv.CS$dev)]
abline(v=cv.CS$size[which.min(cv.CS$dev)])#marking the 最合適size的location
plot(cv.CS$k,cv.CS$dev,t='b')#還可以找到最小的dev對應(yīng)的最合適的k
# use pruning:
# - use which.min(cv.CS$dev) to get the location of the optimum
# - retrieve the corresponding tree size
# - pass this information on to pruning function
opt.size = cv.CS$size[which.min(cv.CS$dev)] #Find the optimal size
# see:
plot(cv.CS$size,cv.CS$dev,t='b')
abline(v=cv.CS$size[which.min(cv.CS$dev)])
ptree = prune.misclass(tree.out, best=opt.size) #using the optimal size to pruning the tree
ptree
summary(ptree)
par(mfrow=c(1,2))
plot(tree.out)#initially
text(tree.out, pretty=0)
plot(ptree)#pruned
text(ptree, pretty=0)
#Compare with the initial treem, there is fewer branches in prune tree.
###############################################################
#### Exercise 2: apply CV and ROC analysis ####
###############################################################
# Train/test:
set.seed(2)
n = nrow(CS)
itrain = sample(1:n, 200)#一共有400組數(shù)據(jù),隨機選取其中的200個作為訓(xùn)練向本
CS.test = CS[-itrain,]
High.test = High[-itrain]
# argument 'subset' makes it easy to handle training/test splits:
tree.out = tree(High~., CS, subset=itrain)#總體的樹拆融,沒有修剪過
summary(tree.out)
plot(tree.out)
text(tree.out, pretty=0)
# prediction from full tree:
tree.pred = predict(tree.out, CS.test, type="class")#總體的樹的預(yù)測值
(tb1 = table(tree.pred,High.test))#總體的樹對High的混淆矩陣
# prediction from pruned tree:
ptree.pred = predict(ptree, CS.test, type="class")#修剪過的樹的預(yù)測值
(tb2 = table(ptree.pred,High.test)) # 修剪過的樹的confusion matrix
sum(diag(tb1))/sum(tb1)#總體的樹的準(zhǔn)確度#classification rate
sum(diag(tb2))/sum(tb2)#修剪過的樹的準(zhǔn)確度
#預(yù)測錯的值老師叫做misclassified obsevations
#預(yù)測錯的概率叫misclassification rate
#準(zhǔn)確度高的老師叫做is better suited to predict unseen data
# perform ROC analysis
library(pROC)
# here we specify 'type="vector"' to retrieve continuous scores
# as opposed to predicted labels, so that we can apply varying
# threshold values to these scores to build the ROC curve:
#在ROC中蠢琳,圖像越靠近左上角越好
ptree.probs = predict(ptree, CS.test, type="vector")#在畫ROC的時候,必須用type="vector"
roc.p = roc(response=(High.test), predictor=ptree.probs[,1])
plot(roc.p)
#AUC
#AUC是曲線下面的面積镜豹,越大越好
#下面是兩種方法挪凑,都可以
#取值范圍在0.5和1之間
#越接近1.0,檢測方法真實性越高;
#等于0.5時逛艰,則真實性最低躏碳,無應(yīng)用價值
#AUC < 0.5,比隨機猜測還差
auc(roc.p)
roc.p$auc
###############################################################
#### Exercise 3: find the tree ####
###############################################################
#Bagging
#由于decision tree有許多缺點散怖,比如重復(fù)選擇菇绵,太龐大,overfitting镇眷,用一個更為優(yōu)化的分類方法叫做Bagging
#用Bootstrap咬最,每次BT里選一個新的variable做節(jié)點往下分,high variance欠动, low bias永乌,用于分類和回歸都可以
#但是每一層選出來的variable會與前面的有很強的相關(guān)性
# ... can you find it?
###############################################################
#### Exercise 4: grow a random forest ####
###############################################################
#更好的一種方法,each learner:可以針對某一種特性來分類random subset is considering feature Q<P具伍,每個tree是基于一種特性分類的翅雏,許多個特性就有許多個tree,這組成了一個森林random forest
#不容易overfit人芽,不敏感
library(tree)
library(ISLR)
#install.packages("randomForest")
library(randomForest)
# ?Carseats
High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))
CS = data.frame(Carseats, High)
CS$Sales = NULL
P = ncol(CS)-1? #?number of features(把High去掉)
# grow a single (unpruned) tree
tree.out = tree(High~., CS)
# fitted values for "training set"按照decision tree直接全部預(yù)測
tree.yhat = predict(tree.out, CS, type="class")
# grow a forest:建立一個森林
rf.out = randomForest(High~., CS)
# fitted values for "training set"按照randomForest直接全部預(yù)測
rf.yhat = predict(rf.out, CS, type="class")
#If you want to perform bagging instead of random forest
# compare to bagging: using all features,not include y 這種方法叫bagging(用了全部的variables)望几;rf可以隨便選用mtry,自定義用多少feature
bag.out = randomForest(High~., CS, mtry=P)#添加mtry= P = ncol(CS)-1? #?number of features(把High去掉)
# fitted values for "training set"
bag.yhat = predict(bag.out, CS, type="class")#按照bag直接全部預(yù)測
# confusion matrix for tree:
(tb.tree = table(tree.yhat, High))#按照decision tree直接全部預(yù)測的混淆矩陣
# confusion matrix for RF
(tb.rf = table(rf.yhat, High))#按照randomForest直接全部預(yù)測的混淆矩陣萤厅,可以看出分類全部正確橄抹,perfect
# confusion matrix for bagging
(tb.bag = table(bag.yhat, High))#按照bag直接全部預(yù)測的混淆矩陣
# Note this is different to the confusion of RF
# matrix for the OOB observations:OOB對建樹時未使用的數(shù)據(jù)(bootstrap沒用到的數(shù)據(jù))進行誤差估計
(tb.rf2 = rf.out$confusion)#rf.out是真實的數(shù)據(jù)直接放到RF里,不做預(yù)測惕味,說明真實情況下按照用RF方法應(yīng)該有部分?jǐn)?shù)據(jù)是錯誤分類的
sum(diag(tb.tree))/sum(tb.tree)#decision tree方法精確度91%
sum(diag(tb.rf))/sum(tb.rf)#RF方法精確度1(完全正確)但不是什么數(shù)據(jù)都是1楼誓,這里是巧了
sum(diag(tb.bag))/sum(tb.bag)#accurate=1
sum(diag(tb.rf2))/sum(tb.rf2)
#原本bag做出來的精確度應(yīng)該比rf低,但現(xiàn)在它比rf的精確度高了名挥,1>0.8141762, 說明bag出現(xiàn)了over fit
# train-test split
set.seed(6041)
N = nrow(CS)
itrain = sample(1:N, 200)
CS.train = CS[itrain,]
CS.test = CS[-itrain,]
tree.out = tree(High~., CS.train)
# fitted values for "train set"
tree.yhat = predict(tree.out, CS.train, type="class")
# fitted values for "test set"
tree.pred = predict(tree.out, CS.test, type="class")
rf.out = randomForest(High~., CS.train)
# fitted values for "training set"
rf.yhat = predict(rf.out, CS.train, type="class")
# fitted values for "test set"
rf.pred = predict(rf.out, CS.test, type="class")
bag.out = randomForest(High~., CS.train, mtry=(ncol(CS)-2))
# fitted values for "training set"
bag.yhat = predict(bag.out, CS.train, type="class")
# fitted values for "test set"
bag.pred = predict(bag.out, CS.test, type="class")
# confusion matrix for tree (test data):
(tb.tree = table(tree.pred, CS.test$High))
# confusion matrix for RF (test data):
(tb.rf = table(rf.pred, CS.test$High))#better performance of rf
# confusion matrix for Bagging (test data):
(tb.bag = table(bag.pred, CS.test$High))#bagging is not as good as rf
sum(diag(tb.tree))/sum(tb.tree)
sum(diag(tb.rf))/sum(tb.rf)
sum(diag(tb.bag))/sum(tb.bag)
###############################################################
#### Exercise 5: benchmarking ####
###############################################################
###### Exercise 5: benchmarking (this exercise is left as homework)######
# bring in that code from Section 2 (below) and add to it:
library(class) # contains knn()
library(ISLR) # contains the datasets
library(pROC)
library(tree)
library(randomForest)
#### (1) benchmarking on unscaled data對未縮放數(shù)據(jù)進行基準(zhǔn)測試####
#對一類測試對象的某項性能指標(biāo)進行定量的和可對比的測試
set.seed(4061)
n = nrow(Default)
dat = Default[sample(1:n, n, replace=FALSE), ]
i.cv = sample(1:n, round(.7*n), replace=FALSE)
dat.cv = dat[i.cv,] # use this for CV (train+test)
dat.valid = dat[-i.cv,] # save this for later (after CV)
# tuning of the classifiers:
K.knn = 3
K = 10
N = length(i.cv)
folds = cut(1:N, K, labels=FALSE)
acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)#prediction accuracies
auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)#AUC
acc.rf = auc.rf = numeric(K)
for(k in 1:K){ # 10-fold CV loop
? # split into train and test samples:
? #首先要劃分訓(xùn)練樣本和測試樣本
? i.train = which(folds!=k)
? dat.train = dat.cv[i.train, ]
? dat.test = dat.cv[-i.train, ]
? #作出KNN擬合
? # adapt these sets for kNN:
? x.train = dat.train[,-1]
? y.train = dat.train[,1]
? x.test = dat.test[,-1]
? y.test = dat.test[,1]
? x.train[,1] = as.numeric(x.train[,1])
? x.test[,1] = as.numeric(x.test[,1])
? # train classifiers:
? knn.o = knn(x.train, x.test, y.train, K.knn)
? #通訓(xùn)練樣本作出其他擬合
? glm.o = glm(default~., data=dat.train, family=binomial(logit))# 做logistic regression
? lda.o = lda(default~., data=dat.train)#做LDA
? qda.o = qda(default~., data=dat.train)#做QDA
? rf.o = randomForest(default~., data=dat.train)#做randomForest
? #用測試樣本進行預(yù)測
? # test classifiers:
? # (notice that predict.glm() does not have a functionality to
? # return categorical values, so we copmute them based on the
? # scores by applying a threshold of 50%)
? # 測試分類器:
? # (注意 predict.glm() 沒有功能
? # 返回分類值疟羹,因此我們根據(jù)
? # 通過應(yīng)用 50% 的閾值得分)
? knn.p = knn.o
? glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )
? lda.p = predict(lda.o, newdata=dat.test)$class
? qda.p = predict(qda.o, newdata=dat.test)$class
? rf.p = predict(rf.o, newdata=dat.test)
? #做各個方法的測試樣本的混淆矩陣
? # corresponding confusion matrices:
? tb.knn = table(knn.p, y.test)
? tb.glm = table(glm.p, y.test)
? tb.lda = table(lda.p, y.test)
? tb.qda = table(qda.p, y.test)
? tb.rf = table(rf.p, y.test)
? #每個方法的準(zhǔn)確度
? # store prediction accuracies:
? acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)
? acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)
? acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)
? acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)
? acc.rf[k] = sum(diag(tb.rf)) / sum(tb.rf)
? #
? # ROC/AUC analysis:
? #FandonForest沒有
? # WARNING: THIS IS NOT PR(Y=1 | X), BUT Pr(Y = Y_hat | X):
? knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob
? glm.p = predict(glm.o, newdata=dat.test, type="response")
? lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]
? qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]
? #auc.knn[k] = roc(y.test, knn.p)$auc
? auc.glm[k] = roc(y.test, glm.p)$auc
? auc.lda[k] = roc(y.test, lda.p)$auc
? auc.qda[k] = roc(y.test, qda.p)$auc
}
boxplot(acc.knn, acc.glm, acc.lda, acc.qda,
? ? ? ? main="Overall CV prediction accuracy",
? ? ? ? names=c("kNN","GLM","LDA","QDA"))
#下面兩個的區(qū)別在于有沒有KNN的AUC(因為KNN的AUC的圖和其他方法放不在一起)
boxplot(auc.glm, auc.lda, auc.qda,
? ? ? ? main="Overall CV AUC",
? ? ? ? names=c("GLM","LDA","QDA"))
boxplot(auc.knn, auc.glm, auc.lda, auc.qda,
? ? ? ? main="Overall CV AUC",
? ? ? ? names=c("kNN","GLM","LDA","QDA"))
###############################################################
### Exercise 6: Variable importance from RF ####
###############################################################
#由RF看哪個variable是最重要的(系數(shù)越大越重要)
library(ISLR)
library(randomForest)
# ?Carseats
High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))
CS = data.frame(Carseats, High)
CS$Sales = NULL
# grow a forest:
rf.out = randomForest(High~., CS)
# compare to bagging:
bag.out = randomForest(High~., CS, mtry=(ncol(CS)-1))
cbind(rf.out$importance, bag.out$importance)
#bag會讓原本重要的variable更重要,會讓不重要的variable更不重要,because essentially those strong predictors have a better chance to get selected every time.
#比如阁猜,CompPrice,用bag之后重要性增加了蹋艺,后面四個variable剃袍,用bag之后重要性減少了,還可以說用rf捎谨,CompPrice的重要性是US的3倍
par(mfrow=c(1,2))
varImpPlot(rf.out, pch=15, main="Ensemble method 1")#Price最重要民效,urban最不重要
varImpPlot(bag.out, pch=15, main="Ensemble method 2")
#Mean decrease Gini, 縱坐標(biāo)是減小的幅度,減小的幅度越大涛救,也就是數(shù)值越大畏邢,variable越重要。
?randomForest
###############################################################
#### Exercise 7: gradient boosting ####
###############################################################
# gradient boosting model? 梯度增強模型
# 之前的方法检吆,下一階與上一階會有相關(guān)性舒萎,為了減小correlation
# 找到真實值與預(yù)測值的殘差,下一階基于上一階的殘差分析蹭沛,每一階都彌補了上一階的殘差臂寝,可以減小偏差
# 缺點:容易over fit,比較敏感
library(ISLR) # contains the dataset
library(tree) # contains... tree-building methods
#install.packages('gbm')
library(gbm)? # contains the GB model implementation
library(pROC)
# Recode response variable so as to make it a classification problem
High = as.factor(ifelse(Carseats$Sales<=8, "No", "Yes"))
CS = data.frame(Carseats, High)
# remove Sales from data frame, just to make formulas simpler!
CS$Sales = NULL?
####(1)####
set.seed(1)
itest = sample(1:nrow(CS), round(nrow(CS)/4))#題目里面要求選100個作為測試樣本
CS.test = CS[itest,]#測試樣本
CS = CS[-itest,]
####(2)####
set.seed(4061)
# Note:
gbm(High~., data=CS, distribution="bernoulli")
#對于分類問題摊灭,選擇bernoulli或者adaboost咆贬,前者更為推薦
#對于連續(xù)因變量(回歸),選擇gaussian或者laplace
# so we must recode the levels...
CS$High = (as.numeric(CS$High=="Yes")) # 前面HIGH是factor帚呼,這里又變會numeric(yes=1,no=0), this hack could be useful
gb.out = gbm(High~., data=CS,
? ? ? ? ? ? distribution="bernoulli", # use "gaussian" instead for regression
? ? ? ? ? ? n.trees=5000, # size of the ensemble
? ? ? ? ? ? interaction.depth=1) # depth of the trees, 1 = stumps 限制每棵樹的深度
#distribution:模型計算損失函數(shù)時掏缎,需要對輸出變量的數(shù)據(jù)分布做出假設(shè)。
#對于分類問題煤杀,選擇bernoulli或者adaboost眷蜈,前者更為推薦
#對于連續(xù)因變量,選擇gaussian或者laplace
#n.trees:即number of iteration—迭代次數(shù)沈自。迭代次數(shù)的選擇與學(xué)習(xí)速率密切相關(guān)
#interaction.depth和n.minobsinnode:
#子決策樹即基礎(chǔ)學(xué)習(xí)器的深度和決策樹葉節(jié)點包含的最小觀測樹端蛆,
#若基礎(chǔ)學(xué)習(xí)器訓(xùn)練得過于復(fù)雜,將提升模型對于樣本的擬合能力而導(dǎo)致過擬合問題酥泛,
#因此子決策樹深度不宜過大今豆,
#葉節(jié)點可包含的最小觀測書不宜過小。
summary(gb.out$train.error)
# inspect output:
par(mar=c(4.5,6,1,1))
summary(gb.out, las=1)#summary函數(shù)返回自變量的相對重要性
plot(gb.out)
plot(gb.out, i="Price")# i is the index of variable or which variable you want to pick
#從圖中可以看出柔袁,隨著price的增加呆躲,預(yù)測的High數(shù)值的中位數(shù)越小
plot(gb.out, i="ShelveLoc")
#由于ShelveLoc是離散的,畫出來是點捶索,可以看出隨著ShelveLoc變好插掂,預(yù)測的High數(shù)值越大
gb.p = predict(gb.out, newdata=CS.test, n.trees=5000)
gb.p
roc.gb = roc(response=CS.test$High, predictor=gb.p)
plot(roc.gb)
roc.gb$auc
# compare AUC's with a Random Forest:
library(randomForest)
CS$High = as.factor(CS$High)
rf.out = randomForest(High~., CS, ntree=5000)
# fitted values for "training set"
rf.p = predict(rf.out, CS.test, type="prob")[,2]
roc.rf = roc(response=CS.test$High, predictor=rf.p)
plot(roc.rf, add=TRUE, col=2)
roc.gb$auc
roc.rf$auc
#AUC高的那個 great illustrate the model
###############################################################
#### Exercise 8: gradient boosting using caret...####
###############################################################
# Plug in the following snips of code within demo code
# Section4b_demo_using_caret.R:
############ (I) Classification example ############
### Gradient boosting (using caret) for classification
rm(list=ls()) #?clear the environment
library(ISLR) # contains the data
library(caret) # contains everything else
set.seed(4061) # for reproducibility
# Set up the data (take a subset of the Hitters dataset)
data(Hitters)
Hitters = na.omit(Hitters)
dat = Hitters
n = nrow(dat)
NC = ncol(dat)
# Change the response variable to a factor to make this a
# classification problem:
dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "High","Low"))
# Data partition
itrain = sample(1:n, size=round(.7*n))
dat.train = dat[itrain,]
dat.validation = dat[-itrain,] # independent validation set for later
# x = select(dat.train,-"Salary") ###?if using dplyr
# training set:
x = dat.train
x$Salary = NULL
y = as.factor(dat.train$Salary)
gb.out = train(Salary~., data=dat.train, method='gbm', distribution='bernoulli')
gb.fitted = predict(gb.out) # corresponding fitted values
gb.pred = predict(gb.out, dat.validation)
confusionMatrix(reference=dat.validation$Salary, data=gb.pred,
? ? ? ? ? ? ? ? mode="everything")
############ (II) Regression example ############
### Gradient boosting (using caret) for regression
rm(list=ls()) #?clear the environment
# Set up the data (take a subset of the Hitters dataset)
data(Hitters)
Hitters = na.omit(Hitters)
dat = Hitters
# hist(dat$Salary)
dat$Salary = log(dat$Salary)
n = nrow(dat)
NC = ncol(dat)
# Data partition
itrain = sample(1:n, size=round(.7*n))
dat.train = dat[itrain,]
dat.validation = dat[-itrain,]
x = dat.train
x$Salary = NULL
y = dat.train$Salary
ytrue = dat.validation$Salary
gb.out = train(Salary~., data=dat.train, method='gbm', distribution='gaussian')
gb.fitted = predict(gb.out) # corresponding fitted values
gb.pred = predict(gb.out, dat.validation)
mean((gb.pred-ytrue)^2)