
# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

# Exercises Section 3: Classification Exercises

# --------------------------------------------------------

library(class) # contains knn()

library(MASS) # contains lda()

library(car)

library(ISLR) # contains the datasets

library(pROC)

library(caret) # contains confusionMatrix()

###############################################################

### Exercise 1: kNN on iris data

###############################################################

set.seed(1)

# shuffle dataset first:

z = iris[sample(1:nrow(iris)),]

# Here we focus on sepal information only

plot(z[,1:2], col=c(1,2,4)[z[,5]],

pch=20, cex=2)

x = z[,1:2] # sepal variables only

y = z$Species

# Here's how to use the knn() classifier:

K = 5

n = nrow(x)

# split the data into train+test:

i.train = sample(1:n, 100)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

ko = knn(x.train, x.test, y.train, K)

tb = table(ko, y.test)

1 - sum(diag(tb)) / sum(tb) # overall classification error rate

confusionMatrix(data=ko, reference=y.test)

# Build a loop around that to find best k:

# (NB: assess effect of various k-values

# on the same data-split)

Kmax = 30

acc = numeric(Kmax)

for(k in 1:Kmax){

ko = knn(x.train, x.test, y.train, k)

tb = table(ko, y.test)

acc[k] = sum(diag(tb)) / sum(tb)

}

plot(1-acc, pch=20, t='b', xlab='k')
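# On this particular split we can also read off the best k directly
# (which.max breaks ties in favour of the smallest such k):
(k.best = which.max(acc))
acc[k.best] # corresponding test-set accuracy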

###############################################################

### Exercise 2: GLM on 2-class iris data

###############################################################

n = nrow(iris)

is = sample(1:n, size=n, replace=FALSE)

dat = iris[is,-c(3,4)] # shuffled version of the original set

# recode into 2-class problem:

dat$is.virginica = as.numeric(dat$Species=="virginica")

dat$Species = NULL # "remove" this component

names(dat)

is = 1:100 # training set

fit = glm(is.virginica~., data=dat, subset=is, family=binomial(logit))

pred = predict(fit, newdata=dat[-is,], type="response")

y.test = dat$is.virginica[-is] # true test set classes

# boxplot(pred~y.test, names=c("other","virginica"))

# boxplot(pred~y.test, names=c("cancer absent","cancer present"))

abline(h=0.5, col=3)

abline(h=0.1, col=4)

# for varying cut-off (ie threshold) values, compute corresponding

# predicted labels, and corresponding confusion matrix:

err = NULL

for(cut.off in seq(.1, .9, by=.1)){

pred.y = as.numeric(pred>cut.off)

tb = table(pred.y, y.test)

err = c(err, (1-sum(diag(tb))/sum(tb)))

}

plot(seq(.1, .9, by=.1), err, t='b')
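# For comparison, pROC (loaded above) summarises all cut-offs at once; this is
# just a quick sketch, not part of the original exercise:
roc.glm = roc(y.test, pred)
plot(roc.glm)
roc.glm$auc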

###############################################################

### Exercise 3: LDA assumptions

###############################################################

## (1) 2-class classification problem

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

# to recode cleanly, you could use for instance:

dat$Species = car::recode(dat$Species, "0='other'; 1='virginica'")

# or:

# levels(dat$Species) = c("other","virginica")

#

par(mfrow=c(1,2))

plot(iris[,1:2], pch=20, col=c(1,2,4)[iris$Species], cex=2)

legend("topright",col=c(1,2,4),

legend=levels(iris$Species),

pch=20, bty='n')

plot(dat[,1:2], pch=20, col=c(1,4)[dat$Species], cex=2)

legend("topright",col=c(1,4),

legend=levels(dat$Species),

pch=20, bty='n')

## Explore distribution of predictors:

# boxplots seem relatively symmetric:

par(mfrow=c(2,2))

for(j in 1:4){

boxplot(dat[,j]~dat$Species,

        ylab='predictor',

col=c('cyan','pink'),

main=names(dat)[j])

}

# but we'd rather check for Normality more specifically:

par(mfrow=c(2,4), font.lab=2, cex=1.2)

for(j in 1:4){

hist(dat[which(dat$Species=='other'),j], col='cyan',

       xlab='predictor for class other',

main=names(dat)[j])

hist(dat[which(dat$Species!='other'),j], col='pink',

       xlab='predictor for class virginica',

       main=names(dat)[j])

}

# could also use QQ-plots:

par(mfrow=c(2,4), cex=1.2)

for(j in 1:4){

x.other = dat[which(dat$Species=='other'),j]

qqnorm(x.other, pch=20, col='cyan',

main=names(dat)[j])

abline(a=mean(x.other), b=sd(x.other))

x.virginica = dat[which(dat$Species!='other'),j]

qqnorm(x.virginica, pch=20, col='pink',

main=names(dat)[j])

abline(a=mean(x.virginica), b=sd(x.virginica))

}

# So what do you think?

## Check initial assumption of equality of variances:

# Bartlett's test with H0: all variances are equal

for(j in 1:4){

print( bartlett.test(dat[,j]~dat$Species)$p.value )

}

# Shapiro's test with H0: the distribution is Normal

for(j in 1:4){

  print( shapiro.test(dat[which(dat$Species=='virginica'),j])$p.value )

}

for(j in 1:4){

  print( shapiro.test(dat[which(dat$Species=='other'),j])$p.value )

}
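# A more compact alternative (sketch): collect the Shapiro p-values for both
# groups and all four predictors into a single matrix:
pvals = sapply(1:4, function(j) c(
  other = shapiro.test(dat[dat$Species=='other', j])$p.value,
  virginica = shapiro.test(dat[dat$Species=='virginica', j])$p.value))
colnames(pvals) = names(dat)[1:4]
round(pvals, 4)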

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

# can we track some of the values in that summary?

table(dat$Species)/nrow(dat)

rbind(

    apply(dat[which(dat$Species=='other'),1:4], 2, mean),

    apply(dat[which(dat$Species=='virginica'),1:4], 2, mean)

)

# what about the coefficients of linear discriminants?

x = as.matrix(dat[,1:4])

proj = x %*% lda.o$scaling

plot(proj, pch=20, col=dat$Species, cex=2)

# little hack to recover the fitted values quickly

predo = predict(lda.o, newdata=dat)

y = predo$x

plot(proj, y)

plot(y, predo$posterior[,2])

boxplot(y ~ (predo$posterior[,2]>.5))

boxplot(proj ~ (predo$posterior[,2]>.5))

# NB: the way these coefficients are calculated follows the MANOVA approach
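# A minimal sketch of that MANOVA-style calculation (not MASS's exact
# normalisation): the discriminant direction is the leading eigenvector of
# solve(W) %*% B, where W and B are the within- and between-class scatter
# matrices; it should match lda.o$scaling up to sign and scale.
X = as.matrix(dat[,1:4])
g = dat$Species
mu = colMeans(X)
W = B = matrix(0, 4, 4)
for(lev in levels(g)){
  Xk = X[g==lev, , drop=FALSE]
  muk = colMeans(Xk)
  W = W + crossprod(scale(Xk, center=muk, scale=FALSE)) # within-class scatter
  B = B + nrow(Xk) * outer(muk-mu, muk-mu)              # between-class scatter
}
v1 = Re(eigen(solve(W) %*% B)$vectors[,1])
cbind(manova=v1/sqrt(sum(v1^2)),
      lda=as.numeric(lda.o$scaling)/sqrt(sum(lda.o$scaling^2)))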

# popular hack to recover the predicted classes quickly:

fitted.values = predict(lda.o, newdata=dat)$class

boxplot(y~dat$Species)

boxplot(proj~dat$Species)

(tb.2 = table(fitted.values, dat$Species))

sum(diag(tb.2)) / sum(tb.2)

## (2) 3-class classification problem

dat = iris

## Explore distribution of predictors:

# boxplots seem relatively symmetric:

par(mfrow=c(2,2))

# here's just a loop to save having to write 4 boxplot

# instructions with names by hand (being lazy often

# makes for nicer code):

for(j in 1:4){

boxplot(dat[,j]~dat$Species,

xlab = 'Species',

ylab = 'predictor',

col=c('cyan','pink','green'),

main=names(dat)[j])

}

# but we'd rather check for Normality more specifically:

Ls = levels(dat$Species)

par(mfcol=c(3,4))

for(j in 1:4){

hist(dat[which(dat$Species==Ls[1]),j], col='cyan',

main=names(dat)[j])

hist(dat[which(dat$Species==Ls[2]),j], col='pink',

main=names(dat)[j])

hist(dat[which(dat$Species==Ls[3]),j], col='green',

main=names(dat)[j])

}

# could also use QQ-plots:

par(mfcol=c(3,4))

for(j in 1:4){

x1 = dat[which(dat$Species==Ls[1]),j]

qqnorm(x1, pch=20, col='cyan', main=names(dat)[j])

abline(a=mean(x1), b=sd(x1))

x2 = dat[which(dat$Species==Ls[2]),j]

qqnorm(x2, pch=20, col='pink', main=names(dat)[j])

abline(a=mean(x2), b=sd(x2))

x3 = dat[which(dat$Species==Ls[3]),j]

qqnorm(x3, pch=20, col='green', main=names(dat)[j])

abline(a=mean(x3), b=sd(x3))

}

# So what do you think now?

## Check initial assumption of equality of variances:

# Bartlett's test with H0: all variances are equal

print( bartlett.test(dat[,1]~dat$Species)$p.value )

print( bartlett.test(dat[,2]~dat$Species)$p.value )

print( bartlett.test(dat[,3]~dat$Species)$p.value )

print( bartlett.test(dat[,4]~dat$Species)$p.value )

## or if in lazy mode:

for(j in 1:4){

print( bartlett.test(dat[,j]~dat$Species)$p.value )

}

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

fitted.values = predict(lda.o, newdata=dat)$class

(tb.3 = table(fitted.values, dat$Species))

sum(diag(tb.3)) / sum(tb.3)
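# A quick visual follow-up (sketch): project the observations onto the two
# discriminant directions and colour by species:
proj3 = predict(lda.o, newdata=dat)$x
plot(proj3, pch=20, col=c(1,2,4)[dat$Species], cex=1.5)
legend("topright", legend=levels(dat$Species), col=c(1,2,4), pch=20, bty='n')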

###############################################################

### Exercise 4: LDA

###############################################################

## (1) 2-class classification problem

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

levels(dat$Species) = c("other","virginica")

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),] # shuffle dataset

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)
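# Optional extra (sketch, using pROC loaded above): compare LDA and QDA on the
# same test set via the AUC of their posterior probabilities for 'virginica':
roc(dat.test$Species, lda.p$posterior[,2])$auc
roc(dat.test$Species, qda.p$posterior[,2])$auc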

## (2) 3-class classification problem

dat = iris

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),]

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

# LDA:

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

###############################################################

### Exercise 5: benchmarking

###############################################################

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV) (HOLD-OUT)

# tuning of the classifiers:

K.knn = 3

# perform K-fold CV:

K = 10

N = length(i.cv)

folds = cut(1:N, K, labels=FALSE)

acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)

auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)

#

for(k in 1:K){ # 10-fold CV loop

# split into train and test samples:

i.train = which(folds!=k)

dat.train = dat.cv[i.train, ]

dat.test = dat.cv[-i.train, ]

# adapt these sets for kNN:

x.train = dat.train[,-1]

y.train = dat.train[,1]

x.test = dat.test[,-1]

y.test = dat.test[,1]

x.train[,1] = as.numeric(x.train[,1])

x.test[,1] = as.numeric(x.test[,1])

# train classifiers:

knn.o = knn(x.train, x.test, y.train, K.knn)

glm.o = glm(default~., data=dat.train, family=binomial(logit))

lda.o = lda(default~., data=dat.train)

qda.o = qda(default~., data=dat.train)

# test classifiers:

knn.p = knn.o

glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )

lda.p = predict(lda.o, newdata=dat.test)$class

qda.p = predict(qda.o, newdata=dat.test)$class

tb.knn = table(knn.p, y.test)

tb.glm = table(glm.p, y.test)

tb.lda = table(lda.p, y.test)

tb.qda = table(qda.p, y.test)

# store prediction accuracies:

acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)

acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)

acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)

acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)

#

# ROC/AUC analysis:

# WARNING: THIS IS NOT Pr(Y=1 | X), BUT Pr(Y = Y_hat | X):

# knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob

glm.p = predict(glm.o, newdata=dat.test, type="response")

lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]

qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]

# auc.knn[k] = roc(y.test, knn.p)$auc

auc.glm[k] = roc(y.test, glm.p)$auc

auc.lda[k] = roc(y.test, lda.p)$auc

auc.qda[k] = roc(y.test, qda.p)$auc
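# A hedged sketch for kNN: knn(..., prob=TRUE) returns the vote share of the
# *winning* class (hence the warning above), so flip it for the other class to
# approximate Pr(Y="Yes"|x) (assuming "Yes" is the positive label in Default):
knn.o2 = knn(x.train, x.test, y.train, K.knn, prob=TRUE)
vote.share = attributes(knn.o2)$prob
p.yes = ifelse(knn.o2=="Yes", vote.share, 1-vote.share)
auc.knn[k] = roc(y.test, p.yes)$auc
# (auc.knn could then be added to the AUC boxplot further below)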

}

boxplot(acc.knn, acc.glm, acc.lda, acc.qda,

main="Overall CV prediction accuracy",

names=c("kNN","GLM","LDA","QDA"))

boxplot(auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("GLM","LDA","QDA"))

##### Taking a closer look at performance (using the last CV fold's test set)

roc(y.test, glm.p)$auc

library(caret)

(tb = table(y.test, glm.p>.5))

pred = as.factor(glm.p>.5)

pred = car::recode(pred, "FALSE='No'; TRUE='Yes'")

caret::confusionMatrix(data=pred, reference=y.test)

sum(diag(tb))/sum(tb)

##### Further exercises for you to do:

## adapt code to evaluate sensitivity and specificity (a sketch is given below)

## add validation analysis...

## repeat on scaled data...
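# A minimal sketch of the first suggested exercise, assuming "Yes" is the
# positive class and reusing tb = table(y.test, glm.p>.5) from above
# (truth in rows, predicted label in columns):
sens.glm = tb["Yes","TRUE"] / sum(tb["Yes",]) # sensitivity = TP/(TP+FN)
spec.glm = tb["No","FALSE"] / sum(tb["No",])  # specificity = TN/(TN+FP)
c(sensitivity=sens.glm, specificity=spec.glm)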

###############################################################

### Exercise 6: benchmarking, again

###############################################################

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

x = dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV)

# Recover ROC curve manually from whole set:

n = nrow(x)

thrs = seq(.05,.95,by=.05)

acc = numeric(length(thrs))

sens = spec = numeric(length(thrs))

# the model fit does not depend on the cut-off, so fit it once outside the loop:

glmo = glm(default~., data=x, family=binomial)

for(ithr in 1:length(thrs)){

  thr = thrs[ithr]

  tb = table(glmo$fitted.values>thr, x$default)

  acc[ithr] = sum(diag(tb))/sum(tb)

  #

  # calculate sensitivity for a given threshold

  # sens[ithr] = ...

  # calculate specificity for a given threshold

  # spec[ithr] = ...

  # prediction

  # (a sketch of one way to fill in these blanks is given just after the loop)

}

plot(acc)
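# A minimal sketch of one way to fill in the two blanks above, assuming "Yes"
# is the positive class; plot(spec, sens) below then traces the empirical ROC
# points across the chosen cut-offs:
for(ithr in 1:length(thrs)){
  pred.lab = factor(glmo$fitted.values>thrs[ithr], levels=c(FALSE,TRUE))
  tb.roc = table(pred.lab, x$default) # rows = predicted, cols = truth
  sens[ithr] = tb.roc["TRUE","Yes"] / sum(tb.roc[,"Yes"])  # TP/(TP+FN)
  spec[ithr] = tb.roc["FALSE","No"] / sum(tb.roc[,"No"])   # TN/(TN+FP)
}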

plot(spec, sens)

# Evaluate a cross-validated ROC curve manually:

n = nrow(x)

K = 10

train.acc = test.acc = matrix(NA, nrow=K, ncol=length(thrs))

folds = cut(1:n, K, labels=FALSE)

thrs = seq(.05,.95,by=.05)

# the fit depends on the fold but not on the cut-off, so fit once per fold and
# then sweep the thresholds:

for(k in 1:K){

  itrain = which(folds!=k)

  glmo = glm(default~., data=x, family=binomial, subset=itrain)

  p.train = glmo$fitted.values

  p.test = predict(glmo, x[-itrain,], type='response')

  for(ithr in 1:length(thrs)){

    thr = thrs[ithr]

    tb = table(p.train>thr, x$default[itrain])

    train.acc[k, ithr] = sum(diag(tb))/sum(tb)

    #

    # calculate sensitivity for a given threshold

    # ...

    # calculate specificity for a given threshold

    # ...

    # prediction:

    tb = table(p.test>thr, x$default[-itrain])

    test.acc[k, ithr] = sum(diag(tb))/sum(tb)

  }

}

boxplot(test.acc)

# warnings()

mean(train.acc)

mean(test.acc)
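# A per-threshold summary (sketch): average the K folds at each cut-off and
# compare the train and test accuracy curves:
matplot(thrs, cbind(colMeans(train.acc), colMeans(test.acc)),
        type='b', pch=20, xlab='threshold', ylab='mean CV accuracy')
legend('bottomright', legend=c('train','test'), col=1:2, pch=20, bty='n')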
