R Notes ST4061 12/02/2022

R file download link

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

#### Exercises Section 1: Data pre-processing ####

# --------------------------------------------------------

library(glmnet)

library(survival)

library(ISLR)

###############################################################

#### Exercise 1: effect of scaling ####

# scale(): standardisation, (x - mu) / sigma

###############################################################

summary(iris)

# take the first 100 rows as the working sample

dat = iris[1:100,]

# drop the unused factor level

dat$Species = droplevels(dat$Species)

## only the first 100 rows are kept, so the species virginica never occurs; droplevels() removes that empty level

## in general, when pre-processing, drop any factor levels that do not occur in the newly selected training sample with droplevels()

# separate the predictors x and the response y (Species)

x = dat[,1:4]

y = dat$Species

# scale (standardise) the data

## method 1: scale first, then separate predictors and response

dats = dat

dats[,1:4] = apply(dats[,1:4],2,scale)

## method 2: separate predictors and response first, then scale the predictors

## we can apply scaling to the x data directly:

xs = apply(x,2,scale)

# (1)

## PCA

## the iris data has four features: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width; the direction of largest variance becomes the first principal component

## (a principal component is a linear combination of the original variables, not one of them); the n-th largest-variance direction is the n-th principal component

## prcomp() runs the PCA; in summary(), the first row gives the standard deviation of each PC, the second row gives Var(PCi)/sum(Var(PCi)), and the third row gives the cumulative sum of the second row

pca.unscaled = prcomp(x)

pca.scaled = prcomp(x,scale=TRUE)

pca.scaled.2 = prcomp(xs) # should be the same as pca.scaled

summary(pca.unscaled)

summary(pca.scaled)
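# A quick check of what summary() reports (an added minimal sketch using the objects above, not part of the original exercise):

pve = pca.scaled$sdev^2 / sum(pca.scaled$sdev^2)  # proportion of variance explained per PC

round(rbind(st.dev = pca.scaled$sdev, prop.var = pve, cum.prop = cumsum(pve)), 4)  # mirrors the three rows of summary(pca.scaled)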

## the larger the Proportion of Variance, the more information that component carries (changes along it affect the data most)

## in summary(pca.unscaled), Cumulative Proportion means:

## if you take the first component (PC1), you capture about 91% of the variance, i.e. it contains most of the information

## if you take the first two components, you capture about 98%; the other two dimensions in the data set are probably redundant

# plot the data on its first 2 dimensions in each space:

# biplot: PCA scatterplot (biplot), showing both the observations and the variable loadings

# reading a biplot:

# the cosine of the angle between two variable arrows approximates their correlation: an angle below 90 degrees indicates positive correlation (the two variables order the observations similarly), above 90 degrees negative correlation, and 90 degrees no correlation; a very small angle suggests the two variables are largely redundant and dropping one loses little

# the length of a variable arrow reflects how strongly that variable contributes to the principal component space: the longer the arrow, the stronger the contribution

par(mfrow=c(1,2))

plot(x[,1:2], pch=20, col=y, main="Data in original space")

biplot(pca.unscaled, main="Data in PCA space")

abline(v=0, col='orange')

# see the binary separation in orange along PC1?

# re-use this into biplot in original space:

pca.cols = c("blue","orange")[1+as.numeric(pca.unscaled$x[,1]>0)]

plot(x[,1:2], pch=20, col=pca.cols,

main="Data in original space\n colour-coded using PC1-split")

par(mfrow=c(2,2))

plot(pca.unscaled) # scree plot

biplot(pca.unscaled) # biplot

plot(pca.scaled) # scree plot

biplot(pca.scaled) # biplot

# now analyse this plot :)

# two things stand out from these plots:

# 1. in the scree plot the first component dominates, and in the unscaled fit it is driven mainly by Petal.Length and Petal.Width (they carry most of the information)

# 2. in the biplot, the Petal.Length and Petal.Width arrows point in the same direction, so the two are highly correlated

# Petal.Length and Petal.Width capture the same direction of information; Sepal.Length and Sepal.Width contribute less than those two.

# (2) logistic regression

# dat = unscaled, dats = scaled

logreg.unscaled = glm(Species~., data=dat) # make this work (without 'family' this is not a logistic regression)

# family='binomial' specifies logistic regression

logreg.unscaled = glm(Species~., data=dat, family='binomial')

logreg.scaled = glm(Species~., data=dats, family='binomial')

# discuss...

# (... are the unscaled and scaled fits/coefficients different?)

cbind(coef(logreg.unscaled), coef(logreg.scaled))

# the signs of the coefficients are the same

# but the contribution of each variable has changed (the coefficients sit on different scales, though they remain related)

# the relative sizes of the Petal.Length and Petal.Width coefficients swap, probably because there is a lot of redundancy between these two variables

cor(dats$Petal.Length,dats$Petal.Width)

# 0.9793217 indicates a very strong correlation: Petal.Length and Petal.Width carry nearly the same information and are essentially interchangeable

# (... does this align with the PCA analysis?)

# (yes, we see a change in the role of each variable in the information space; Petal.Length and Petal.Width dominate the leading principal components and they also get the largest logistic regression coefficients)

# (... and why does this align with the PCA analysis?)

# (They are both linear techniques)

# (3)

# the lasso (glmnet) requires the predictors as a matrix

x.m = model.matrix(Species~.+0, data=dat) # unscaled

lasso.cv = cv.glmnet(x.m, y, family="binomial") # use cv.glmnet to choose the lambda used in the lasso fit below

lasso.unscaled = glmnet(x.m, y, family="binomial", lambda=lasso.cv$lambda.min)

lasso.pred = predict(lasso.unscaled, newx=x.m, type="class")

#

xs.m = model.matrix(Species~.+0, data=dats) # scaled

lasso.cv = cv.glmnet(xs.m, y, family="binomial")

lasso.scaled = glmnet(xs.m, y, family="binomial", lambda=lasso.cv$lambda.min)

lasso.s.pred = predict(lasso.scaled, newx=xs.m, type="class")

#

cbind(coef(lasso.unscaled), coef(lasso.scaled))

# the intercepts of the scaled and unscaled fits differ a lot: scaling centres all the covariates at zero, so the scaled model has very little need for an intercept

table(lasso.pred, lasso.s.pred)

###############################################################

#### Exercise 2: data imputation ####

###############################################################

par(mfrow=c(1,1))

summary(lung)

boxplot(lung$meal.cal~lung$sex, col=c("cyan","pink"))

# can you think of other ways of analyzing this?

# (1) lung cancer data: compare meal.cal values between male and female cohorts,

# and discuss w.r.t. gender-specific data imputation

# NB: "missing at random" vs "missing not at random"??

?lung

View(lung)

dim(lung) # 228 rows

dim(na.omit(lung)) # 167 complete rows

# the data contain NAs, so the missing values need handling

# 167/228 = 0.7324561: only about 73% of the rows are complete, the rest have missing values

# data imputation: fill in the missing values,

# e.g. by replacing them with the mean or median of that variable over the data set

nas = is.na(lung$meal.cal) # track which meal.cal values are missing

table(nas, lung$sex) # count the NAs within each sex

imales = which(lung$sex==1) # indices of the male observations

m.all = mean(lung$meal.cal, na.rm=TRUE) # overall mean (both sexes)

m.males = mean(lung$meal.cal[imales], na.rm=TRUE) # male mean

m.females = mean(lung$meal.cal[-imales], na.rm=TRUE) # female mean

t.test(lung$meal.cal[imales], lung$meal.cal[-imales]) # test whether the male and female means of meal.cal are equal

# p = 0.01989 < 0.05, supporting the alternative hypothesis that the true difference in means is not 0: the male and female meal.cal means differ, hence each sex should be imputed with its own value

# significant difference, hence must use different imputation

# values for each gender

# (2) Run Cox PHMs on original, overall imputed and gender-specific imputed

# datsets, using the cohort sample mean for data imputation. Compare and discuss

# impute the missing values in different ways and compare the resulting Cox model fits

# model fitting output.

dat1 = dat2 = lung

# impute overall mean in dat1:

dat1$meal.cal[nas] = m.all # impute with the overall mean

# impute gender-specific means in dat2:

dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==1))] = m.males # impute males (sex==1) with the male mean

dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==2))] = m.females # impute females (sex==2) with the female mean

# Fit Cox proportional hazards models:

?coxph()

cox0 = coxph(Surv(time,status)~.,data=lung) # no imputation (incomplete rows dropped)

cox1 = coxph(Surv(time,status)~.,data=dat1) # overall-mean imputation

cox2 = coxph(Surv(time,status)~.,data=dat2) # sex-specific imputation

summary(cox0)

summary(cox1)

summary(cox2)

cbind(coef(coxph(Surv(time,status)~.,data=lung)),
      coef(coxph(Surv(time,status)~.,data=dat1)),coef(coxph(Surv(time,status)~.,data=dat2)))

round(cbind(coef(coxph(Surv(time,status)~.,data=lung)),
            coef(coxph(Surv(time,status)~.,data=dat1)),coef(coxph(Surv(time,status)~.,data=dat2))),3)

# the meal.cal coefficient is very similar across the non-imputed fit and the two imputed fits

# - dat1 and dat2 yield an increased effective sample size (from 167 to 209 observations used in the fit)

# - overall coefficient effects comparable between the 2 sets

# - marginal differences in covariate effect and significance between lung and {dat1;dat2}

# - no substantial difference between dat1 and dat2 outputs

###############################################################

#### Exercise 3: data imputation ####

###############################################################

library(ISLR)

dat = Hitters

View(Hitters)

# (1) (Deletion)

?na.omit()

sdat = na.omit(dat) # keep only the complete cases

sx = model.matrix(Salary~.+0,data=sdat)

sy = sdat$Salary

cv.l = cv.glmnet(sx,sy)

slo = glmnet(sx,sy,lambda=cv.l$lambda.min)

# (2) Simple imputation (of Y) using overall mean

ina = which(is.na(dat$Salary)) # indices of the missing Salary values

dat$Salary[ina] = mean(dat$Salary[-ina])

x = model.matrix(Salary~.+0,data=dat)

y = dat$Salary

cv.l = cv.glmnet(x,y)

lo = glmnet(x,y,lambda=cv.l$lambda.min)

# (3)

# delete OR impute the missing values; fit a lasso model in each case; compute the fitted values y-hat; then compare the root mean squared errors of the two approaches

slop = predict(slo,newx=sx) # predictions from the lasso fitted on the complete cases

lop = predict(lo,newx=x)

sqrt(mean((slop-sy)^2))

sqrt(mean((lop-y)^2)) # the two RMSEs are similar

# plot the fitted values from the deletion and simple-imputation models against each other; points along the 45-degree line mean the two approaches give similar predictions

plot(slop,lop[-ina])

abline(a=0,b=1)

abline(lm(lop[-ina]~slop), col='navy')

# What could we do instead of imputing the Y?
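# One hedged way to look at this question (an added illustration, not the original solution): imputing the

# response with its mean artificially shrinks its spread, which is one reason deletion (or imputing

# predictors rather than Y) is usually preferred when only the response is missing.

dat0 = Hitters # fresh copy, Salary still has NAs

sd(dat0$Salary, na.rm=TRUE)  # spread of the observed salaries

dat0$Salary[is.na(dat0$Salary)] = mean(dat0$Salary, na.rm=TRUE)

sd(dat0$Salary)              # smaller: mean-imputation deflates the variance of Y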

###############################################################

#### Exercise 4: resampling ####

###############################################################

###############################################################

#### Exercise 5: resampling (CV vs bootstrapping)####

###############################################################

# Implement this simple analysis and discuss - think about

# (sub)sample sizes!

# (the shuffling step is skipped here)

# n = nrow(trees) # normally we would shuffle (randomly re-order) the data first

# trees = trees[sample(1:n, n), ]

x = trees$Girth  # sorted in increasing order...

y = trees$Height

plot(x, y, pch=20)

summary(lm(y~x))

N = nrow(trees)

# (1) 10-fold CV on original dataset

set.seed(4060)

K = 10

slp <- numeric(K)

itc <- numeric(K)

cc <- numeric(K)

folds = cut(1:N, K, labels=FALSE)

for(k in 1:K){

i = which(folds==k)

# train:

lmo = lm(y[-i]~x[-i])

slp[k] <- lmo$coefficient[2]

itc[k] <- lmo$coefficient[1]

cc[k] = summary(lmo)$coef[2,2]

# (NB: no testing here, so not the conventional use of CV)

}

mean(cc) # 0.3443734

#### NB: cc stores the standard error of the slope estimate (summary(lmo)$coef[2,2]), not the slope itself; slp and itc store the slope and intercept ####

# (2) 10-fold CV on randomized dataset

# shuffle it

set.seed(1)

mix = sample(1:nrow(trees), replace=FALSE)

xr = trees$Girth[mix]

yr = trees$Height[mix]

set.seed(4060)

K = 10

ccr = numeric(K)

folds = cut(1:N, K, labels=FALSE)

for(k in 1:K){

i = which(folds==k)

lmo = lm(yr[-i]~xr[-i])

ccr[k] = summary(lmo)$coef[2,2]

}

mean(ccr) # 0.3408507, slightly smaller after shuffling

mean(cc) # 0.3443734

#(3)

sd(ccr) # 0.01854404, smaller after shuffling

sd(cc) # 0.03990858

boxplot(cc,ccr)

t.test(cc,ccr) # p > 0.05: no evidence against H0 that the true difference in means is 0

var.test(cc,ccr) # F-test (var.test()): p < 0.05, supporting the alternative hypothesis that the true ratio of variances is not 1

?var.test

# recall: p > 0.05 means we fail to reject H0; p < 0.05 means we reject H0 in favour of the alternative H1

# (4) Bootstrapping (additional note): estimate the same quantity via the bootstrap and compare with CV

set.seed(4060)

K = 100

cb = numeric(K)

for(i in 1:K){

# bootstrapping

ib = sample(1:N,N,replace=TRUE)

lmb = lm(y[ib]~x[ib])

cb[i] = summary(lmb)$coef[2,2]

}

mean(cb) # 0.3163658, not as good as the shuffled-CV result

sd(cb) # 0.0505108, not as good as the shuffled-CV result

dev.new()

par(font=2, font.axis=2, font.lab=2)

boxplot(cbind(cc,ccr,cb), names=c("CV","CVr","Bootstrap"))

####abline(h=1.0544)???####

t.test(cc,cb)

#### Explain why these results differ - how would you assess them? ####

round(cc, 3)

#(5)

# on average about 36.8% of observations are left out of a bootstrap sample, so with N = 1000 roughly 368 out-of-bootstrap points can act as a test set while the bootstrap sample (about 632 distinct points) acts as the training set
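# A minimal sketch checking the 63.2% rule by simulation (an added illustration, not part of the original exercise):

set.seed(4061)

B = 1000; Nb = 1000

frac.inbag = replicate(B, length(unique(sample(1:Nb, Nb, replace=TRUE))) / Nb)

mean(frac.inbag)      # close to 1 - exp(-1) = 0.632

1 - mean(frac.inbag)  # close to 0.368, the expected out-of-bootstrap fraction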

#(6)

N = 1000

x = runif(N, 2, 20)

y = 2 + 5*x + rnorm(N)

R=33

K1=3

pred.err.cv=numeric(R*K1)

###CV

for(j in 1:R){

  mix=sample(1:N,replace = F)

  xr=x[mix]

  yr=y[mix]

  folds=cut(1:N,K1,labels = F)

  data53=as.data.frame(cbind(xr,yr))

  mse = pred.err = numeric(K1)

  for(k in 1:K1){

    i.train = which(folds!=k)

    o = lm(yr~xr, data=data53, subset=i.train)

    i.test = which(folds==k)

    yp = predict(o, newdata=data53[i.test,])

    pred.err[k] = mean((yp-yr[i.test])^2)

  }

  pred.err.cv[c(K1*j-K1+1):c(K1*j)]=pred.err

}

mean(pred.err.cv)

### Bootstrapping (left commented out; in RStudio, Ctrl/Cmd+Shift+C toggles comments on a selection)

# K2 = R*K1

# pred.err.BT = numeric(K2)

# for(j in 1:K2){

#   mix=sample(1:N,replace = T)

#   xr=x[mix]

#   yr=y[mix]

#   data53=as.data.frame(cbind(xr,yr))

#   o = lm(yr~xr, data=data53)

#   yh = o$fitted.values

#   pred.err.BT[j] = mean((yh-yr)^2)

# }

# mean(pred.err.BT)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Exercises Section 2: Regularization ####

# --------------------------------------------------------

###############################################################

#### Exercise 1: tuning LASSO ####

###############################################################

# Have a go at this exercise yourself...

# you can refer to ST4060 material:)

library(ISLR)

library(glmnet)

Hitters = na.omit(Hitters)

x = model.matrix(Salary~., Hitters)[,-1]

y = Hitters$Salary

cv.rd <- cv.glmnet(x,y)

ridge_mod = glmnet(x, y, alpha = 0, lambda = cv.rd$lambda.min)

pred.rd <- predict(ridge_mod, newx=x)

length(pred.rd)

length(y)

sqrt(mean((pred.rd - y)^2))

summary(ridge_mod)

coef(ridge_mod)

ridge_mod$lambda

the_grid = 10^seq(10, -2, length = 100)

n <- length(the_grid)

criterion = numeric(length(the_grid))

for(i in 1:n){

  fit = glmnet(x, y, alpha = 0, lambda = the_grid[i])

  pred.fit <- predict(fit, newx=x)

  criterion[i] = sqrt(mean((pred.fit - y)^2))

}

which.min(criterion)

the_grid[which.min(criterion)]
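# Note (an added hedged remark): the calls above use alpha = 0, i.e. a ridge penalty; for the lasso itself

# one would set alpha = 1 (the glmnet default). Also, choosing lambda by training RMSE as above tends to

# favour the smallest lambda, so cross-validation is the safer criterion. A minimal sketch:

lasso.cv = cv.glmnet(x, y, alpha = 1)             # 10-fold CV over glmnet's own lambda grid

lasso.fit = glmnet(x, y, alpha = 1, lambda = lasso.cv$lambda.min)

sqrt(mean((predict(lasso.fit, newx = x) - y)^2))  # training RMSE of the CV-tuned lasso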

###############################################################

#### Exercise 2: tuning LASSO + validation split ####

###############################################################

# Have a go at this exercise yourself too...

# you can refer to ST4060 material:)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Exercises Section 3: Classification Exercises ####

# --------------------------------------------------------

# install.packages("class")

# install.packages("MASS")

# install.packages("car")

# install.packages("ISLR")

# install.packages("pROC")

# install.packages("carData")

library(class) # contains knn()

library(MASS)  # to have lda()

library(carData)

library(car)

library(ISLR) # contains the datasets

library(pROC)

###############################################################

#### Exercise 1: kNN on iris data ####

###############################################################

set.seed(1)

# shuffle dataset first:

z = iris[sample(1:nrow(iris)),]

# Here we focus on sepal information only

plot(z[,1:2], col=c(1,2,4)[z[,5]],
     pch=20, cex=2)

x = z[,1:2] # sepal variables only

y = z$Species

# Here's how to use the knn() classifier:

K = 1

n = nrow(x)

# split the data into train+test:

i.train = sample(1:n, 100)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

ko = knn(x.train, x.test, y.train, K)

tb = table(ko, y.test)

1 - sum(diag(tb)) / sum(tb) # overall classification error rate; the smaller the better

library(caret)

confusionMatrix(data=ko, reference=y.test) # display the confusion matrix

# here, Accuracy: 0.7, which equals sum(diag(tb))/sum(tb); a 95% confidence interval for the accuracy is also reported, along with sensitivity and specificity for each of the 3 classes

?confusionMatrix # creates a confusion matrix (and summary statistics) for a given set of predictions

# Build a loop around that to find best k:

# (NB: assess effect of various k-values

# on the same data-split)

Kmax = 30

acc = numeric(Kmax) # the larger the accuracy the better

for(k in 1:Kmax){

  ko = knn(x.train, x.test, y.train, k)

  tb = table(ko, y.test)

  acc[k] = sum(diag(tb)) / sum(tb)

}

plot(1-acc, pch=20, t='b', xlab='k') # the k at the lowest point of this error curve is the best choice

# comment: if k is too small (overfitting), the decision boundary is overly flexible: low bias + large variance

#          if k is too large (underfitting), the decision boundary is not flexible enough: high bias + small variance
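# The error curve above is computed on a single train/test split; an added hedged alternative (not the

# original exercise code) is to pick k by 10-fold cross-validation over the training data only:

set.seed(4061)

folds.k = cut(sample(1:nrow(x.train)), 10, labels=FALSE)  # random fold labels for the training set

cv.err = matrix(NA, nrow=10, ncol=Kmax)

for(f in 1:10){
  itr = which(folds.k != f)
  for(k in 1:Kmax){
    pk = knn(x.train[itr,], x.train[-itr,], y.train[itr], k)
    cv.err[f,k] = mean(pk != y.train[-itr])   # CV error for this fold and this k
  }
}

which.min(colMeans(cv.err))  # CV-selected value of k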

###############################################################

#### Exercise 2: GLM(logistic regression) on 2-class iris data ####

###############################################################

n = nrow(iris)

is = sample(1:n, size=n, replace=FALSE)

dat = iris[is,-c(3,4)] # shuffled version of the original set, keeping the sepal variables only

# recode into a 2-class problem: "is virginica" = 1, "is not virginica" = 0

dat$is.virginica = as.numeric(dat$Species=="virginica")

dat$Species = NULL # "remove" this component

names(dat)

is = 1:100 # training set

fit = glm(is.virginica~., data=dat, subset=is, family=binomial(logit))

pred = predict(fit, newdata=dat[-is,], type="response") # returns estimated probabilities

hist(pred)

?predict.glm

y.test = dat$is.virginica[-is] # true test set classes

boxplot(pred~y.test, names=c("other","virginica")) # the boxplot shows that the GLM scores separate "virginica" from "other" fairly cleanly

abline(h=0.5, col=3)

#roughly 70% of data would be classified correctly.

# for varying cut-off (ie threshold) values, compute corresponding

# predicted labels, and corresponding confusion matrix:

# find the best cut-off

err = NULL

for(cut.off in seq(.1, .9, by=.1)){

  pred.y = as.numeric(pred>cut.off)

  tb = table(pred.y, y.test)

  err = c(err, (1-sum(diag(tb))/sum(tb)))

}

plot(seq(.1, .9, by=.1), err, t='b')

# choose the best cut-off for classification

cut.off=0.2

pred.y = as.numeric(pred>cut.off)

tb = table(pred.y, y.test)

err = (1-sum(diag(tb))/sum(tb))
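# A cross-check of the cut-off choice via ROC analysis (an added sketch using pROC, which is loaded at

# the top of this section; coords(..., "best") picks the threshold maximising Youden's index):

roc.glm = roc(response=y.test, predictor=pred)

plot(roc.glm)

coords(roc.glm, "best")  # suggested threshold, with its specificity and sensitivity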

###############################################################

#### Exercise 3: LDA assumptions ####

###############################################################

## (1) 2-class classification problem

# three ways to recode a multi-class factor into 2 classes

# (1) recode to 0/1

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

# (2) recode with named levels (more readable)

# 0/1 labels are less readable than "virginica" vs "other", so:

# to recode cleanly, you could use for instance:

dat$Species = car::recode(dat$Species, "0='other'; 1='virginica'")

# (3) recode to 0/1 first, then rename the levels

# or: set the levels to "other"/"virginica" directly

# dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0)) # recode to 0/1 first

# levels(dat$Species) = c("other","virginica") # then name the 0/1 levels

## (2) quick look at the class separation

par(mfrow=c(1,2))

plot(iris[,1:2], pch=20, col=c(1,2,4)[iris$Species], cex=2)

legend("topright", col=c(1,2,4),
       legend=levels(iris$Species),
       pch=20, bty='n') # 3-class

plot(dat[,1:2], pch=20, col=c(1,4)[dat$Species], cex=2)

legend("topright", col=c(1,4),
       legend=levels(dat$Species),
       pch=20, bty='n') # 2-class: virginica or not virginica

# LDA relies on two assumptions: within-class normality and equal variances; if these fail, LDA is not an appropriate classifier

# first assumption: all within-class histograms look normally distributed

# second assumption: each predictor has the same variance in every class

## first, explore the distribution of each predictor (each column) to check for normality:

head(dat)

# here, the first four columns are the predictors (also called features/attributes/covariates/variables), and the last column is the class

# 1. boxplots: check whether they look relatively symmetric

# look at each column of the dataset:

par(mfrow=c(2,2))

for(j in 1:4){

  boxplot(dat[,j]~dat$Species,
          ylab='predictor',
          col=c('cyan','pink'),
          main=names(dat)[j])

}

# here you can see a clear separation between the two classes for every predictor except Sepal.Width

# so we decide to use the three other predictors for the classification

# since the normal distribution is symmetric, a symmetric box suggests approximate normality, while an asymmetric box indicates skew

# check the correlations

cor(dat[,1:4])

# Petal.Width and Petal.Length are about 96% correlated (anything above 90% is worth flagging), so these two predictors carry essentially the same information

# 2. boxplots alone are not enough; we'd rather check for Normality more specifically, using histograms:

par(mfrow=c(2,4), font.lab=2, cex=1.2)

for(j in 1:4){

  hist(dat[which(dat$Species=='other'),j], col='cyan',
       xlab='predictor for class other',
       main=names(dat)[j])

  hist(dat[which(dat$Species!='other'),j], col='pink',
       xlab='predictor for class virginica',
       main=names(dat)[j])

}

# Petal.Length and Petal.Width for the 'other' class are definitely not normal; the other histograms look approximately normal

# It means there may be a limitation of our analysis if we decide to use LDA here.

# strictly speaking LDA is therefore not well suited here, but we carry on in order to illustrate the method

# 3. besides histograms, QQ-plots can also be used to assess Normality:

par(mfrow=c(2,4), cex=1.2)

for(j in 1:4){

  x.other = dat[which(dat$Species=='other'),j]

  qqnorm(x.other, pch=20, col='cyan',
         main=names(dat)[j])

  abline(a=mean(x.other), b=sd(x.other))

  x.virginica = dat[which(dat$Species!='other'),j]

  qqnorm(x.virginica, pch=20, col='pink',
         main=names(dat)[j])

  abline(a=mean(x.virginica), b=sd(x.virginica))

}

# So what do you think?

# points along the reference line indicate Normality; clearly Petal.Length and Petal.Width are not normal for the 'other' class

## Check the initial assumption of equality of variances: each predictor should have the same variance in both classes

# Bartlett's test with H0: all variances are equal; p > 0.05 means no evidence against H0 (equal variances)

for(j in 1:4){

  print( bartlett.test(dat[,j]~dat$Species)$p.value )

}

# only Sepal.Length satisfies the equal-variance assumption

# Shapiro's test with H0: the distribution is Normal

for(j in 1:4){

  print( shapiro.test(dat[which(dat$Species=='virginica'),j])$p.value )

} # all p-values > 0.05: consistent with Normality

for(j in 1:4){

  print( shapiro.test(dat[which(dat$Species=='other'),j])$p.value )

} # very small p-values for Petal.Length and Petal.Width: not Normal

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

# can we track some of the values in that summary?

table(dat$Species)/nrow(dat)

# this tells us one third of the data belongs to virginica and two thirds to the other class

# so the two classes are not equally represented (the data set is unbalanced)

rbind(
  apply(dat[which(dat$Species=='other'),1:4], 2, mean),
  apply(dat[which(dat$Species=='virginica'),1:4], 2, mean)
) # column-wise means of each predictor within each class

# what about the coefficients of the linear discriminants?

x = as.matrix(dat[,1:4])

proj = x %*% lda.o$scaling # project the data onto the discriminant direction (multiply by lda.o$scaling)

plot(proj, pch=20, col=dat$Species, cex=2)

# we can see that the projections increase as we move from one class to the other

# little hack to recover the fitted values quickly

predo = predict(lda.o, newdata=dat)

names(predo)

predo$class # predicted class for each observation

predo$posterior # e.g. the first observation has posterior probability 0.9990663400 of belonging to 'other' and 0.0009336600 of belonging to 'virginica'

y = predo$x

??predo

plot(proj, y) # this shows that the scores used by LDA to generate the predictions and probabilities correspond to the projection obtained with lda.o$scaling directly (y and proj agree up to centring)

plot(y, predo$posterior[,2])

boxplot(y ~ (predo$posterior[,2]>.5)) # TRUE when the posterior probability of virginica (2nd column) exceeds 0.5

boxplot(proj ~ (predo$posterior[,2]>.5)) # same picture as above, since y and proj carry the same information

# NB: the way these coefficients are calculated follows the MANOVA approach

# popular hack to recover the fitted values:

fitted.values = predict(lda.o, newdata=dat)$class

boxplot(y~dat$Species)

boxplot(proj~dat$Species)

(tb.2 = table(fitted.values, dat$Species))

sum(diag(tb.2)) / sum(tb.2) # accuracy

# LDA will be more efficient than logistic regression when the classes are well separated.

## (2) 3-class classification problem

dat = iris

## Explore distribution of predictors:

# boxplots seem relatively symmetric:

par(mfrow=c(2,2))

# here's just a loop to save having to write 4 boxplot

# instructions with names by hand (being lazy often

# makes for nicer code):

for(j in 1:4){

  boxplot(dat[,j]~dat$Species,
          xlab = 'Species',
          ylab = 'predictor',
          col=c('cyan','pink'),
          main=names(dat)[j])

}

# but we'd rather check for Normality more specifically:

Ls = levels(dat$Species)

par(mfcol=c(3,4))

for(j in 1:4){

  hist(dat[which(dat$Species==Ls[1]),j], col='cyan',
       main=names(dat)[j])

  hist(dat[which(dat$Species==Ls[2]),j], col='pink',
       main=names(dat)[j])

  hist(dat[which(dat$Species==Ls[3]),j], col='green',
       main=names(dat)[j])

}

# could also use QQ-plots:

par(mfcol=c(3,4))

for(j in 1:4){

  x1 = dat[which(dat$Species==Ls[1]),j]

  qqnorm(x1, pch=20, col='cyan', main=names(dat)[j])

  abline(a=mean(x1), b=sd(x1))

  x2 = dat[which(dat$Species==Ls[2]),j]

  qqnorm(x2, pch=20, col='pink', main=names(dat)[j])

  abline(a=mean(x2), b=sd(x2))

  x3 = dat[which(dat$Species==Ls[3]),j]

  qqnorm(x3, pch=20, col='green', main=names(dat)[j])

  abline(a=mean(x3), b=sd(x3))

}

# So what do you think now?

## Check initial assumption of equality of variances:

# Bartlett's test with H0: all variances are equal

print( bartlett.test(dat[,1]~dat$Species)$p.value )

print( bartlett.test(dat[,2]~dat$Species)$p.value )

print( bartlett.test(dat[,3]~dat$Species)$p.value )

print( bartlett.test(dat[,4]~dat$Species)$p.value )

## or if in lazy mode:

for(j in 1:4){

  print( bartlett.test(dat[,j]~dat$Species)$p.value )

}

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

fitted.values = predict(lda.o, newdata=dat)$class

(tb.3 = table(fitted.values, dat$Species))

sum(diag(tb.3)) / sum(tb.3)

###############################################################

### Exercise 4: LDA

###############################################################

## (1) 2-class classification problem

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

levels(dat$Species) = c("other","virginica")

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),] # shuffle dataset

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

## (2) 3-class classification problem

dat = iris

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),]

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

# LDA:

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

###############################################################

### Exercise 5: benchmarking

###############################################################

## (1) benchmarking on unscaled data

# ROC curve: the closer the curve gets to the top-left corner the better; AUC is the area under the curve, also the larger the better

# sensitivity: vertical axis

# specificity: horizontal axis

# confusionMatrix(): confusion matrix (from the caret package)

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV) (HOLD-OUT)

# tuning of the classifiers:

K.knn = 3

# perform K-fold CV:

K = 10

N = length(i.cv)

folds = cut(1:N, K, labels=FALSE)

acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)

auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)

#

for(k in 1:K){ # 10-fold CV loop

  # split into train and test samples:

  i.train = which(folds!=k)

  dat.train = dat.cv[i.train, ]

  dat.test = dat.cv[-i.train, ]

  # adapt these sets for kNN:

  x.train = dat.train[,-1]

  y.train = dat.train[,1]

  x.test = dat.test[,-1]

  y.test = dat.test[,1]

  x.train[,1] = as.numeric(x.train[,1])

  x.test[,1] = as.numeric(x.test[,1])

  # train classifiers:

  knn.o = knn(x.train, x.test, y.train, K.knn)

  glm.o = glm(default~., data=dat.train, family=binomial(logit))

  lda.o = lda(default~., data=dat.train)

  qda.o = qda(default~., data=dat.train)

  # test classifiers:

  knn.p = knn.o

  glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )

  lda.p = predict(lda.o, newdata=dat.test)$class

  qda.p = predict(qda.o, newdata=dat.test)$class

  tb.knn = table(knn.p, y.test)

  tb.glm = table(glm.p, y.test)

  tb.lda = table(lda.p, y.test)

  tb.qda = table(qda.p, y.test)

  # store prediction accuracies:

  acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)

  acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)

  acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)

  acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)

  #

  # ROC/AUC analysis: the larger the AUC the better

  # WARNING: THIS IS NOT Pr(Y=1 | X), BUT Pr(Y = Y_hat | X):

  knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob

  glm.p = predict(glm.o, newdata=dat.test, type="response")

  lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]

  qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]

  auc.knn[k] = roc(y.test, knn.p)$auc

  auc.glm[k] = roc(y.test, glm.p)$auc

  auc.lda[k] = roc(y.test, lda.p)$auc

  auc.qda[k] = roc(y.test, qda.p)$auc

}

boxplot(acc.knn, acc.glm, acc.lda, acc.qda,
        main="Overall CV prediction accuracy",
        names=c("kNN","GLM","LDA","QDA"))

boxplot(auc.knn, auc.glm, auc.lda, auc.qda,
        main="Overall CV AUC",
        names=c("kNN","GLM","LDA","QDA"))

##### Taking a closer look at performance

roc(y.test, glm.p)$auc # arguments: true classes, predicted scores

plot(roc(y.test, glm.p))

# take a threshold of 0.5, build the confusion matrix with caret, and compute the accuracy

library(caret)

(tb = table(y.test, glm.p>.5))

pred = as.factor(glm.p>.5)

pred = car::recode(pred, "FALSE='No'; TRUE='Yes'")

caret::confusionMatrix(y.test, pred)

sum(diag(tb))/sum(tb)

(683+3)/(683+3+14) # = 98%

# compare with the overall class frequencies: 'No' accounts for 96.67% of all cases, so the 98% accuracy should be judged against that no-information baseline

table(Default$default)

table(Default$default)/nrow(Default)

##### Further exercises for you to do:

## adapt code to evaluate sensitivity and specificity

## add validation analysis...

## repeat on scaled data... (this will affect kNN in particular)
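# A minimal sketch for the sensitivity/specificity part (an added illustration; it reuses the last

# fold's test objects and assumes "Yes" is the positive class):

tb.glm = table(glm.p > .5, y.test)  # rows: predicted, columns: observed

sens.glm = tb.glm["TRUE","Yes"] / sum(tb.glm[,"Yes"])   # sensitivity = TP / (TP + FN)

spec.glm = tb.glm["FALSE","No"] / sum(tb.glm[,"No"])    # specificity = TN / (TN + FP)

c(sensitivity=sens.glm, specificity=spec.glm)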

###############################################################

### Exercise 6: benchmarking, again

###############################################################

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

x = dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV)

# Recover ROC curve manually from whole set:

n = nrow(x)

thrs = seq(.05,.95,by=.05)

acc = numeric(length(thrs))

sens = spec = numeric(length(thrs))

for(ithr in 1:length(thrs)){

  thr = thrs[ithr]

  glmo = glm(default~., data=x,
             family=binomial)

  tb = table(glmo$fitted.values>thr, x$default)

  acc[ithr] = sum(diag(tb))/sum(tb)

  #

  # calculate sensitivity for a given threshold

  sens[ithr] = tb[2,2]/sum(tb[,2])

  # calculate specificity for a given threshold

  spec[ithr] = tb[1,1]/sum(tb[,1])

  # prediction

}

plot(1-spec,sens,xlim=c(0,1),ylim=c(0,1),t='b')

abline(h=c(0,1),v=c(0,1),col=8)

abline(a=0,b=1,col=8)

plot(acc)

plot(spec, sens) # as specificity increases, sensitivity decreases

confusionMatrix(y.test,pred)

# Evaluate a cross-validated ROC curve manually:

n = nrow(x)

K = 10

train.acc = test.acc = matrix(NA, nrow=K, ncol=length(thrs))

folds = cut(1:n, K, labels=FALSE)

k = 1

thrs = seq(.05,.95,by=.05)

for(ithr in 1:length(thrs)){

  thr = thrs[ithr]

  for(k in 1:K){

    itrain = which(folds!=k)

    glmo = glm(default~., data=x,
               family=binomial,
               subset=itrain)

    tb = table(glmo$fitted.values>thr, x$default[itrain])

    train.acc[k, ithr] = sum(diag(tb))/sum(tb)

    #

    # calculate sensitivity for a given threshold

    sens[ithr] = tb[2,2]/sum(tb[,2])

    # calculate specificity for a given threshold

    spec[ithr] = tb[1,1]/sum(tb[,1])

    # prediction

    p.test = predict(glmo, x[-itrain,], type='response')

    tb = table(p.test>thr, x$default[-itrain])

    test.acc[k,ithr] = sum(diag(tb))/sum(tb)

  }

}

boxplot(test.acc)

mean(train.acc)

mean(test.acc)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Exercises Section 4: Tree-based methods (classification with trees) ####

# --------------------------------------------------------

###############################################################

#### Exercise 1: growing and pruning a tree ####

###############################################################

# decision tree: at each node, the most informative variable among all predictors is chosen as the next split (starting from the root); variables can be selected repeatedly, so the tree can grow very large and overfit

# the grown tree therefore needs pruning: the aim is to find the ideal tree size, i.e. the one with the smallest (cross-validated) deviance

# install.packages("tree")

library(ISLR) # contains the dataset

library(tree) # contains... tree-building methods

# Recode response variable so as to make it a classification problem

High = ifelse(Carseats$Sales<=8, "No", "Yes")

# Create a data frame that includes both the predictors and response

# (a data frame is like an Excel spreadsheet, if you like)

CS = data.frame(Carseats, High)

CS$Sales = NULL # drop Sales

CS$High = as.factor(CS$High) # <-- this bit was missing; it is needed, otherwise tree() complains later

# Fit the tree using all predictors (except for variable Sales,

# which we have "recoded" into a categorical response variable)

# to response variable High

tree.out = tree(High~., CS)

summary(tree.out)

# plot the tree

plot(tree.out)

text(tree.out, pretty=0)

# The tree is fully grown, with lots of small branches and a very detailed breakdown: a sign that it overfits the data set

# pruning:

?cv.tree

set.seed(3)

cv.CS = cv.tree(tree.out, FUN=prune.misclass) # use prune.misclass so that pruning is guided by the misclassification rate and we can pick a suitable size

names(cv.CS)

# - size:

# number of terminal nodes in each tree in the cost-complexity pruning sequence.

#two method to control the depth of the tree:

# (1)- deviance:

# total deviance of each tree in the cost-complexity pruning sequence.

# (2)- k:

# the value of the cost-complexity pruning parameter of each tree in the sequence.

cv.CS

par(mfrow=c(1,2))

plot(cv.CS$size,cv.CS$dev,t='b') # find which size has the smallest cross-validated deviance

min(cv.CS$dev)

cv.CS$size[which.min(cv.CS$dev)]

abline(v=cv.CS$size[which.min(cv.CS$dev)]) # mark the location of the optimal size

plot(cv.CS$k,cv.CS$dev,t='b') # the cost-complexity parameter k achieving the smallest dev can be read off in the same way

# use pruning:

# - use which.min(cv.CS$dev) to get the location of the optimum

# - retrieve the corresponding tree size

# - pass this information on to pruning function

opt.size = cv.CS$size[which.min(cv.CS$dev)] #Find the optimal size

# see:

plot(cv.CS$size,cv.CS$dev,t='b')

abline(v=cv.CS$size[which.min(cv.CS$dev)])

ptree = prune.misclass(tree.out, best=opt.size) # prune the tree back to the optimal size

ptree

summary(ptree)

par(mfrow=c(1,2))

plot(tree.out) # original (unpruned) tree

text(tree.out, pretty=0)

plot(ptree) # pruned tree

text(ptree, pretty=0)

# Compared with the initial tree, the pruned tree has far fewer branches.

###############################################################

#### Exercise 2: apply CV and ROC analysis ####

###############################################################

# Train/test:

set.seed(2)

n = nrow(CS)

itrain = sample(1:n, 200) # out of the 400 observations, randomly select 200 as the training sample

CS.test = CS[-itrain,]

High.test = High[-itrain]

# argument 'subset' makes it easy to handle training/test splits:

tree.out = tree(High~., CS, subset=itrain) # full tree grown on the training set, not pruned

summary(tree.out)

plot(tree.out)

text(tree.out, pretty=0)

# prediction from full tree:

tree.pred = predict(tree.out, CS.test, type="class") # predictions from the full tree

(tb1 = table(tree.pred,High.test)) # confusion matrix of the full tree for High

# prediction from pruned tree:

ptree.pred = predict(ptree, CS.test, type="class") # predictions from the pruned tree

(tb2 = table(ptree.pred,High.test)) # confusion matrix of the pruned tree

sum(diag(tb1))/sum(tb1) # classification rate (accuracy) of the full tree

sum(diag(tb2))/sum(tb2) # classification rate (accuracy) of the pruned tree

# the wrongly predicted cases are called misclassified observations

# the proportion of wrong predictions is the misclassification rate

# the model with the higher accuracy is better suited to predict unseen data

# perform ROC analysis

library(pROC)

# here we specify 'type="vector"' to retrieve continuous scores

# as opposed to predicted labels, so that we can apply varying

# threshold values to these scores to build the ROC curve:

# for a ROC curve, the closer it gets to the top-left corner the better

ptree.probs = predict(ptree, CS.test, type="vector") # type="vector" is needed here to get class scores for the ROC curve

roc.p = roc(response=(High.test), predictor=ptree.probs[,1])

plot(roc.p)

# AUC

# AUC is the area under the ROC curve: the larger the better

# the two calls below are equivalent

# AUC typically ranges between 0.5 and 1

# the closer to 1.0, the better the classifier discriminates;

# 0.5 corresponds to no discriminating power (no practical value)

# AUC < 0.5 is worse than random guessing

auc(roc.p)

roc.p$auc

###############################################################

#### Exercise 3: find the tree ####

###############################################################

# Bagging

# a single decision tree has several drawbacks (repeated variable selection, very large trees, overfitting); bagging is a more robust alternative

# bagging grows a tree on each bootstrap sample and aggregates their predictions; the individual trees are low-bias but high-variance learners, and averaging (or majority voting) reduces the variance; it can be used for both classification and regression

# however, the trees grown on the different bootstrap samples tend to be strongly correlated with one another, since the same strong variables get selected near the top of almost every tree

# ... can you find it?
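# A minimal hand-rolled sketch of the bagging idea described above (an added illustration with assumed

# settings, not the exercise solution; randomForest with mtry = P does this properly):

set.seed(4061)

B = 50

votes = matrix(NA, nrow=nrow(CS), ncol=B)

for(b in 1:B){
  ib = sample(1:nrow(CS), nrow(CS), replace=TRUE)            # bootstrap sample
  tree.b = tree(High~., CS[ib,])                             # grow a tree on it
  votes[,b] = as.integer(predict(tree.b, CS, type="class"))  # 1 = "No", 2 = "Yes"
}

bag.hat = levels(CS$High)[ifelse(rowMeans(votes) > 1.5, 2, 1)]  # majority vote over the B trees

table(bag.hat, CS$High)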

###############################################################

#### Exercise 4: grow a random forest ####

###############################################################

# an improvement on bagging: at each split, each learner only considers a random subset of Q < P of the features, so different trees rely on different features; many such trees together form a random forest

# it is less prone to overfitting and less sensitive to a few dominant predictors

library(tree)

library(ISLR)

#install.packages("randomForest")

library(randomForest)

# ?Carseats

High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))

CS = data.frame(Carseats, High)

CS$Sales = NULL

P = ncol(CS)-1  # number of features (excluding High)

# grow a single (unpruned) tree

tree.out = tree(High~., CS)

# fitted values for "training set"按照decision tree直接全部預(yù)測

tree.yhat = predict(tree.out, CS, type="class")

# grow a forest:

rf.out = randomForest(High~., CS)

# fitted values for "training set" (random forest predictions over the whole data set)

rf.yhat = predict(rf.out, CS, type="class")

# If you want to perform bagging instead of a random forest:

# compare to bagging: using all P features at each split (and not the response) is bagging; with a random forest, mtry can be set to however many features should be sampled at each split

bag.out = randomForest(High~., CS, mtry=P) # mtry = P = ncol(CS)-1 = number of features

# fitted values for "training set"

bag.yhat = predict(bag.out, CS, type="class") # bagging predictions over the whole data set

# confusion matrix for tree:

(tb.tree = table(tree.yhat, High)) # confusion matrix for the single tree (training data)

# confusion matrix for RF

(tb.rf = table(rf.yhat, High)) # confusion matrix for the random forest: every observation is classified correctly (a perfect fit on the training data)

# confusion matrix for bagging

(tb.bag = table(bag.yhat, High)) # confusion matrix for bagging (training data)

# Note this is different to the confusion

# matrix for the OOB observations: the out-of-bag data (observations not drawn into a given bootstrap sample) provide an honest error estimate

(tb.rf2 = rf.out$confusion) # based on the OOB predictions; it shows that in practice the random forest does misclassify some observations

sum(diag(tb.tree))/sum(tb.tree) # single-tree accuracy, about 91%

sum(diag(tb.rf))/sum(tb.rf) # random forest training accuracy of 1 (not guaranteed for every data set; it happens to be perfect here)

sum(diag(tb.bag))/sum(tb.bag) # accuracy = 1

sum(diag(tb.rf2))/sum(tb.rf2)

# the training accuracy of bagging (1) is higher than the OOB-based figure for the forest (1 > 0.8141762), which shows how training-set accuracy overstates performance (overfitting)

# train-test split

set.seed(6041)

N = nrow(CS)

itrain = sample(1:N, 200)

CS.train = CS[itrain,]

CS.test = CS[-itrain,]

tree.out = tree(High~., CS.train)

# fitted values for "train set"

tree.yhat = predict(tree.out, CS.train, type="class")

# fitted values for "test set"

tree.pred = predict(tree.out, CS.test, type="class")

rf.out = randomForest(High~., CS.train)

# fitted values for "training set"

rf.yhat = predict(rf.out, CS.train, type="class")

# fitted values for "test set"

rf.pred = predict(rf.out, CS.test, type="class")

bag.out = randomForest(High~., CS.train, mtry=(ncol(CS)-2))

# fitted values for "training set"

bag.yhat = predict(bag.out, CS.train, type="class")

# fitted values for "test set"

bag.pred = predict(bag.out, CS.test, type="class")

# confusion matrix for tree (test data):

(tb.tree = table(tree.pred, CS.test$High))

# confusion matrix for RF (test data):

(tb.rf = table(rf.pred, CS.test$High)) # better performance from the random forest

# confusion matrix for Bagging (test data):

(tb.bag = table(bag.pred, CS.test$High)) # bagging is not as good as the random forest here

sum(diag(tb.tree))/sum(tb.tree)

sum(diag(tb.rf))/sum(tb.rf)

sum(diag(tb.bag))/sum(tb.bag)

###############################################################

#### Exercise 5: benchmarking ####

###############################################################

###### Exercise 5: benchmarking (this exercise is left as homework)######

# bring in that code from Section 2 (below) and add to it:

library(class) # contains knn()

library(ISLR) # contains the datasets

library(pROC)

library(tree)

library(randomForest)

#### (1) benchmarking on unscaled data ####

# i.e. a quantitative, comparable assessment of the same performance measures across several classifiers

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

i.cv = sample(1:n, round(.7*n), replace=FALSE)

dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV)

# tuning of the classifiers:

K.knn = 3

K = 10

N = length(i.cv)

folds = cut(1:N, K, labels=FALSE)

acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)#prediction accuracies

auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)#AUC

acc.rf = auc.rf = numeric(K)

for(k in 1:K){ # 10-fold CV loop

  # split into train and test samples:

  i.train = which(folds!=k)

  dat.train = dat.cv[i.train, ]

  dat.test = dat.cv[-i.train, ]

  # adapt these sets for kNN:

  x.train = dat.train[,-1]

  y.train = dat.train[,1]

  x.test = dat.test[,-1]

  y.test = dat.test[,1]

  x.train[,1] = as.numeric(x.train[,1])

  x.test[,1] = as.numeric(x.test[,1])

  # train classifiers on the training sample:

  knn.o = knn(x.train, x.test, y.train, K.knn)

  glm.o = glm(default~., data=dat.train, family=binomial(logit)) # logistic regression

  lda.o = lda(default~., data=dat.train) # LDA

  qda.o = qda(default~., data=dat.train) # QDA

  rf.o = randomForest(default~., data=dat.train) # random forest

  # test classifiers on the test sample:

  # (notice that predict.glm() does not have a functionality to

  # return categorical values, so we compute them based on the

  # scores by applying a threshold of 50%)

  knn.p = knn.o

  glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )

  lda.p = predict(lda.o, newdata=dat.test)$class

  qda.p = predict(qda.o, newdata=dat.test)$class

  rf.p = predict(rf.o, newdata=dat.test)

  # corresponding confusion matrices on the test sample:

  tb.knn = table(knn.p, y.test)

  tb.glm = table(glm.p, y.test)

  tb.lda = table(lda.p, y.test)

  tb.qda = table(qda.p, y.test)

  tb.rf = table(rf.p, y.test)

  # store prediction accuracies of each method:

  acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)

  acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)

  acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)

  acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)

  acc.rf[k] = sum(diag(tb.rf)) / sum(tb.rf)

  #

  # ROC/AUC analysis:

  # (no AUC is computed for the random forest here)

  # WARNING: THIS IS NOT Pr(Y=1 | X), BUT Pr(Y = Y_hat | X):

  knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob

  glm.p = predict(glm.o, newdata=dat.test, type="response")

  lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]

  qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]

  # auc.knn[k] = roc(y.test, knn.p)$auc

  auc.glm[k] = roc(y.test, glm.p)$auc

  auc.lda[k] = roc(y.test, lda.p)$auc

  auc.qda[k] = roc(y.test, qda.p)$auc

}

boxplot(acc.knn, acc.glm, acc.lda, acc.qda,
        main="Overall CV prediction accuracy",
        names=c("kNN","GLM","LDA","QDA"))

# the two boxplots below differ only in whether the kNN AUC is included (its values do not sit well on the same scale as the other methods)

boxplot(auc.glm, auc.lda, auc.qda,
        main="Overall CV AUC",
        names=c("GLM","LDA","QDA"))

boxplot(auc.knn, auc.glm, auc.lda, auc.qda,
        main="Overall CV AUC",
        names=c("kNN","GLM","LDA","QDA"))

###############################################################

### Exercise 6: Variable importance from RF ####

###############################################################

# use the random forest to see which variables are most important (the larger the importance score, the more important the variable)

library(ISLR)

library(randomForest)

# ?Carseats

High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))

CS = data.frame(Carseats, High)

CS$Sales = NULL

# grow a forest:

rf.out = randomForest(High~., CS)

# compare to bagging:

bag.out = randomForest(High~., CS, mtry=(ncol(CS)-1))

cbind(rf.out$importance, bag.out$importance)

# bagging makes the already-important variables look even more important and the unimportant ones even less so, because essentially those strong predictors have a better chance to get selected every time.

# e.g. CompPrice gains importance under bagging, while the last four variables lose importance; under the random forest, CompPrice is roughly 3 times as important as US

par(mfrow=c(1,2))

varImpPlot(rf.out, pch=15, main="Ensemble method 1") # Price is the most important variable, Urban the least

varImpPlot(bag.out, pch=15, main="Ensemble method 2")

# Mean decrease in Gini: the axis shows the size of the decrease; the larger the decrease (i.e. the larger the value), the more important the variable.

?randomForest

###############################################################

#### Exercise 7: gradient boosting ####

###############################################################

# gradient boosting model

# unlike the previous ensemble methods, which grow trees independently (and end up with correlated trees), boosting builds the ensemble sequentially

# each stage computes the residuals between the observed values and the current predictions, and the next learner is fitted to those residuals; every stage therefore corrects part of the previous stage's error, which reduces the bias

# drawbacks: it can overfit, and it is fairly sensitive to its tuning parameters
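# A minimal conceptual sketch of the residual-fitting idea (an added toy illustration on simulated data,

# not the author's code; the weak learner here is a small linear fit rather than a tree stump):

set.seed(4061)

x1 = runif(200, 0, 10)

y1 = sin(x1) + rnorm(200, sd=.3)

nu = 0.1                        # learning rate (shrinkage)

Fhat = rep(mean(y1), 200)       # stage 0: constant prediction

for(m in 1:100){
  r = y1 - Fhat                 # residuals of the current ensemble
  h = lm(r ~ poly(x1, 2))       # weak learner fitted to those residuals
  Fhat = Fhat + nu * fitted(h)  # add a shrunken version of it
}

mean((y1 - Fhat)^2)             # training error shrinks as stages accumulate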

library(ISLR) # contains the dataset

library(tree) # contains... tree-building methods

#install.packages('gbm')

library(gbm)? # contains the GB model implementation

library(pROC)

# Recode response variable so as to make it a classification problem

High = as.factor(ifelse(Carseats$Sales<=8, "No", "Yes"))

CS = data.frame(Carseats, High)

# remove Sales from data frame, just to make formulas simpler!

CS$Sales = NULL?

####(1)####

set.seed(1)

itest = sample(1:nrow(CS), round(nrow(CS)/4)) # the exercise asks for a quarter of the data (100 observations) as the test sample

CS.test = CS[itest,] # test sample

CS = CS[-itest,]

####(2)####

set.seed(4061)

# Note:

gbm(High~., data=CS, distribution="bernoulli")

# for classification, choose distribution = "bernoulli" or "adaboost" (bernoulli is generally preferred)

# for a continuous response (regression), choose "gaussian" or "laplace"

# so we must recode the levels...

CS$High = (as.numeric(CS$High=="Yes")) # High was a factor; recode it back to numeric (Yes=1, No=0), a hack needed for the bernoulli loss

gb.out = gbm(High~., data=CS,
             distribution="bernoulli", # use "gaussian" instead for regression
             n.trees=5000, # size of the ensemble
             interaction.depth=1) # depth of the trees, 1 = stumps (limits the depth of each tree)

# distribution: the loss function requires an assumption about the distribution of the response;

# for classification use bernoulli or adaboost (bernoulli preferred),

# for a continuous response use gaussian or laplace

# n.trees: the number of iterations (ensemble size); its choice is closely tied to the learning rate

# interaction.depth and n.minobsinnode:

# the depth of each base-learner tree, and the minimum number of observations in a terminal node;

# if the base learners are too complex, the model fits the training sample too closely and overfits,

# so the tree depth should not be too large,

# and the minimum number of observations per terminal node should not be too small.
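# A hedged variant of the same call with the main tuning parameters written out explicitly (shrinkage,

# n.minobsinnode and cv.folds are standard gbm() arguments; the values below are only illustrative):

gb.out2 = gbm(High~., data=CS,
              distribution="bernoulli",
              n.trees=5000,             # ensemble size
              interaction.depth=2,      # slightly deeper base learners
              shrinkage=0.01,           # learning rate
              n.minobsinnode=10,        # minimum observations per terminal node
              cv.folds=5)               # built-in CV to pick the number of trees

gbm.perf(gb.out2, method="cv")          # CV-optimal number of trees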

summary(gb.out$train.error)

# inspect output:

par(mar=c(4.5,6,1,1))

summary(gb.out, las=1) # summary() returns the relative importance of the predictors

plot(gb.out)

plot(gb.out, i="Price") # i picks the variable whose partial effect you want to plot

# the plot shows that as Price increases, the predicted value of High decreases

plot(gb.out, i="ShelveLoc")

# ShelveLoc is categorical, so its effect is plotted as points; as ShelveLoc improves, the predicted value of High increases

gb.p = predict(gb.out, newdata=CS.test, n.trees=5000)

gb.p

roc.gb = roc(response=CS.test$High, predictor=gb.p)

plot(roc.gb)

roc.gb$auc

# compare AUC's with a Random Forest:

library(randomForest)

CS$High = as.factor(CS$High)

rf.out = randomForest(High~., CS, ntree=5000)

# fitted values for "training set"

rf.p = predict(rf.out, CS.test, type="prob")[,2]

roc.rf = roc(response=CS.test$High, predictor=rf.p)

plot(roc.rf, add=TRUE, col=2)

roc.gb$auc

roc.rf$auc

# the model with the higher AUC is the better one here

###############################################################

#### Exercise 8: gradient boosting using caret...####

###############################################################

# Plug in the following snips of code within demo code

# Section4b_demo_using_caret.R:

############ (I) Classification example ############

### Gradient boosting (using caret) for classification

rm(list=ls()) # clear the environment

library(ISLR) # contains the data

library(caret) # contains everything else

set.seed(4061) # for reproducibility

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

n = nrow(dat)

NC = ncol(dat)

# Change the response variable to a factor to make this a

# classification problem:

dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),
                              "High","Low"))

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,] # independent validation set for later

# x = select(dat.train,-"Salary") ### if using dplyr

# training set:

x = dat.train

x$Salary = NULL

y = as.factor(dat.train$Salary)

gb.out = train(Salary~., data=dat.train, method='gbm', distribution='bernoulli')

gb.fitted = predict(gb.out) # corresponding fitted values

gb.pred = predict(gb.out, dat.validation)

confusionMatrix(reference=dat.validation$Salary, data=gb.pred,
                mode="everything")

############ (II) Regression example ############

### Gradient boosting (using caret) for regression

rm(list=ls()) # clear the environment

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

# hist(dat$Salary)

dat$Salary = log(dat$Salary)

n = nrow(dat)

NC = ncol(dat)

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,]

x = dat.train

x$Salary = NULL

y = dat.train$Salary

ytrue = dat.validation$Salary

gb.out = train(Salary~., data=dat.train, method='gbm', distribution='gaussian')

gb.fitted = predict(gb.out) # corresponding fitted values

gb.pred = predict(gb.out, dat.validation)

mean((gb.pred-ytrue)^2)
