
# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

# Exercises Section 3: Classification Exercises

# --------------------------------------------------------

library(class) # contains knn()

library(MASS) # contains lda()

library(car)

library(ISLR) # contains the datasets

library(pROC)

library(caret) # contains confusionMatrix()

###############################################################

### Exercise 1: kNN on iris data

###############################################################

set.seed(1)

# shuffle dataset first:

z = iris[sample(1:nrow(iris)),]

# Here we focus on sepal information only

plot(z[,1:2], col=c(1,2,4)[z[,5]],

pch=20, cex=2)

x = z[,1:2] # sepal variables only

y = z$Species

# Here's how to use the knn() classifier:

K = 5

n = nrow(x)

# split the data into train+test:

i.train = sample(1:n, 100)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

ko = knn(x.train, x.test, y.train, K)

tb = table(ko, y.test)

1 - sum(diag(tb)) / sum(tb) # overall classification error rate

confusionMatrix(data=ko, reference=y.test)

# Build a loop around that to find best k:

# (NB: assess effect of various k-values

# on the same data-split)

Kmax = 30

acc = numeric(Kmax)

for(k in 1:Kmax){

ko = knn(x.train, x.test, y.train, k)

tb = table(ko, y.test)

acc[k] = sum(diag(tb)) / sum(tb)

}

plot(1-acc, pch=20, t='b', xlab='k')
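# On this particular split we can also read off the best k directly
# (which.max breaks ties in favour of the smallest such k):
(k.best = which.max(acc))
acc[k.best] # corresponding test-set accuracy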

###############################################################

### Exercise 2: GLM on 2-class iris data

###############################################################

n = nrow(iris)

is = sample(1:n, size=n, replace=FALSE)

dat = iris[is,-c(3,4)] # shuffled version of the original set

# recode into 2-class problem:

dat$is.virginica = as.numeric(dat$Species=="virginica")

dat$Species = NULL # "remove" this component

names(dat)

is = 1:100 # training set

fit = glm(is.virginica~., data=dat, subset=is, family=binomial(logit))

pred = predict(fit, newdata=dat[-is,], type="response")

y.test = dat$is.virginica[-is] # true test set classes

# boxplot(pred~y.test, names=c("other","virginica"))

# boxplot(pred~y.test, names=c("cancer absent","cancer present"))

abline(h=0.5, col=3)

abline(h=0.1, col=4)

# for varying cut-off (ie threshold) values, compute corresponding

# predicted labels, and corresponding confusion matrix:

err = NULL

for(cut.off in seq(.1, .9, by=.1)){

pred.y = as.numeric(pred>cut.off)

tb = table(pred.y, y.test)

err = c(err, (1-sum(diag(tb))/sum(tb)))

}

plot(seq(.1, .9, by=.1), err, t='b')
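# For comparison, pROC (loaded above) summarises all cut-offs at once; this is
# just a quick sketch, not part of the original exercise:
roc.glm = roc(y.test, pred)
plot(roc.glm)
roc.glm$auc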

###############################################################

### Exercise 3: LDA assumptions

###############################################################

## (1) 2-class classification problem

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

# to recode cleanly, you could use for instance:

dat$Species = car::recode(dat$Species, "0='other'; 1='virginica'")

# or:

# levels(dat$Species) = c("other","virginica")

#

par(mfrow=c(1,2))

plot(iris[,1:2], pch=20, col=c(1,2,4)[iris$Species], cex=2)

legend("topright",col=c(1,2,4),

legend=levels(iris$Species),

pch=20, bty='n')

plot(dat[,1:2], pch=20, col=c(1,4)[dat$Species], cex=2)

legend("topright",col=c(1,4),

legend=levels(dat$Species),

pch=20, bty='n')

## Explore distribution of predictors:

# boxplots seem relatively symmetric:

par(mfrow=c(2,2))

for(j in 1:4){

boxplot(dat[,j]~dat$Species,

        ylab='predictor',

col=c('cyan','pink'),

main=names(dat)[j])

}

# but we'd rather check for Normality more specifically:

par(mfrow=c(2,4), font.lab=2, cex=1.2)

for(j in 1:4){

hist(dat[which(dat$Species=='other'),j], col='cyan',

       xlab='predictor for class other',

main=names(dat)[j])

hist(dat[which(dat$Species!='other'),j], col='pink',

       xlab='predictor for class virginica',

       main=names(dat)[j])

}

# could also use QQ-plots:

par(mfrow=c(2,4), cex=1.2)

for(j in 1:4){

x.other = dat[which(dat$Species=='other'),j]

qqnorm(x.other, pch=20, col='cyan',

main=names(dat)[j])

abline(a=mean(x.other), b=sd(x.other))

x.virginica = dat[which(dat$Species!='other'),j]

qqnorm(x.virginica, pch=20, col='pink',

main=names(dat)[j])

abline(a=mean(x.virginica), b=sd(x.virginica))

}

# So what do you think?

## Check initial assumption of equality of variances:

# Bartlett's test with H0: all variances are equal

for(j in 1:4){

print( bartlett.test(dat[,j]~dat$Species)$p.value )

}

# Shapiro's test with H0: the distribution is Normal

for(j in 1:4){

  print( shapiro.test(dat[which(dat$Species=='virginica'),j])$p.value )

}

for(j in 1:4){

  print( shapiro.test(dat[which(dat$Species=='other'),j])$p.value )

}
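# A more compact alternative (sketch): collect the Shapiro p-values for both
# groups and all four predictors into a single matrix:
pvals = sapply(1:4, function(j) c(
  other = shapiro.test(dat[dat$Species=='other', j])$p.value,
  virginica = shapiro.test(dat[dat$Species=='virginica', j])$p.value))
colnames(pvals) = names(dat)[1:4]
round(pvals, 4)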

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

# can we track some of the values in that summary?

table(dat$Species)/nrow(dat)

rbind(

    apply(dat[which(dat$Species=='other'),1:4], 2, mean),

    apply(dat[which(dat$Species=='virginica'),1:4], 2, mean)

)

# what about the coefficients of linear discriminants?

x = as.matrix(dat[,1:4])

proj = x %*% lda.o$scaling

plot(proj, pch=20, col=dat$Species, cex=2)

# little hack to recover the fitted values quickly

predo = predict(lda.o, newdata=dat)

y = predo$x

plot(proj, y)

plot(y, predo$posterior[,2])

boxplot(y ~ (predo$posterior[,2]>.5))

boxplot(proj ~ (predo$posterior[,2]>.5))

# NB: the way these coefficients are calculated follows the MANOVA approach
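# A minimal sketch of that MANOVA-style calculation (not MASS's exact
# normalisation): the discriminant direction is the leading eigenvector of
# solve(W) %*% B, where W and B are the within- and between-class scatter
# matrices; it should match lda.o$scaling up to sign and scale.
X = as.matrix(dat[,1:4])
g = dat$Species
mu = colMeans(X)
W = B = matrix(0, 4, 4)
for(lev in levels(g)){
  Xk = X[g==lev, , drop=FALSE]
  muk = colMeans(Xk)
  W = W + crossprod(scale(Xk, center=muk, scale=FALSE)) # within-class scatter
  B = B + nrow(Xk) * outer(muk-mu, muk-mu)              # between-class scatter
}
v1 = Re(eigen(solve(W) %*% B)$vectors[,1])
cbind(manova=v1/sqrt(sum(v1^2)),
      lda=as.numeric(lda.o$scaling)/sqrt(sum(lda.o$scaling^2)))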

# popular hack to recover the predicted classes quickly:

fitted.values = predict(lda.o, newdata=dat)$class

boxplot(y~dat$Species)

boxplot(proj~dat$Species)

(tb.2 = table(fitted.values, dat$Species))

sum(diag(tb.2)) / sum(tb.2)

## (2) 3-class classification problem

dat = iris

## Explore distribution of predictors:

# boxplots seem relatively symmetric:

par(mfrow=c(2,2))

# here's just a loop to save having to write 4 boxplot

# instructions with names by hand (being lazy often

# makes for nicer code):

for(j in 1:4){

boxplot(dat[,j]~dat$Species,

xlab = 'Species',

ylab = 'predictor',

col=c('cyan','pink','green'),

main=names(dat)[j])

}

# but we'd rather check for Normality more specifically:

Ls = levels(dat$Species)

par(mfcol=c(3,4))

for(j in 1:4){

hist(dat[which(dat$Species==Ls[1]),j], col='cyan',

main=names(dat)[j])

hist(dat[which(dat$Species==Ls[2]),j], col='pink',

main=names(dat)[j])

hist(dat[which(dat$Species==Ls[3]),j], col='green',

main=names(dat)[j])

}

# could also use QQ-plots:

par(mfcol=c(3,4))

for(j in 1:4){

x1 = dat[which(dat$Species==Ls[1]),j]

qqnorm(x1, pch=20, col='cyan', main=names(dat)[j])

abline(a=mean(x1), b=sd(x1))

x2 = dat[which(dat$Species==Ls[2]),j]

qqnorm(x2, pch=20, col='pink', main=names(dat)[j])

abline(a=mean(x2), b=sd(x2))

x3 = dat[which(dat$Species==Ls[3]),j]

qqnorm(x3, pch=20, col='green', main=names(dat)[j])

abline(a=mean(x3), b=sd(x3))

}

# So what do you think now?

## Check initial assumption of equality of variances:

# Bartlett's test with H0: all variances are equal

print( bartlett.test(dat[,1]~dat$Species)$p.value )

print( bartlett.test(dat[,2]~dat$Species)$p.value )

print( bartlett.test(dat[,3]~dat$Species)$p.value )

print( bartlett.test(dat[,4]~dat$Species)$p.value )

## or if in lazy mode:

for(j in 1:4){

print( bartlett.test(dat[,j]~dat$Species)$p.value )

}

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

fitted.values = predict(lda.o, newdata=dat)$class

(tb.3 = table(fitted.values, dat$Species))

sum(diag(tb.3)) / sum(tb.3)
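# A quick visual follow-up (sketch): project the observations onto the two
# discriminant directions and colour by species:
proj3 = predict(lda.o, newdata=dat)$x
plot(proj3, pch=20, col=c(1,2,4)[dat$Species], cex=1.5)
legend("topright", legend=levels(dat$Species), col=c(1,2,4), pch=20, bty='n')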

###############################################################

### Exercise 4: LDA

###############################################################

## (1) 2-class classification problem

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

levels(dat$Species) = c("other","virginica")

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),] # shuffle dataset

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)
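# Optional extra (sketch, using pROC loaded above): compare LDA and QDA on the
# same test set via the AUC of their posterior probabilities for 'virginica':
roc(dat.test$Species, lda.p$posterior[,2])$auc
roc(dat.test$Species, qda.p$posterior[,2])$auc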

## (2) 3-class classification problem

dat = iris

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),]

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

# LDA:

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

###############################################################

### Exercise 5: benchmarking

###############################################################

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV) (HOLD-OUT)

# tuning of the classifiers:

K.knn = 3

# perform K-fold CV:

K = 10

N = length(i.cv)

folds = cut(1:N, K, labels=FALSE)

acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)

auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)

#

for(k in 1:K){ # 10-fold CV loop

# split into train and test samples:

i.train = which(folds!=k)

dat.train = dat.cv[i.train, ]

dat.test = dat.cv[-i.train, ]

# adapt these sets for kNN:

x.train = dat.train[,-1]

y.train = dat.train[,1]

x.test = dat.test[,-1]

y.test = dat.test[,1]

x.train[,1] = as.numeric(x.train[,1])

x.test[,1] = as.numeric(x.test[,1])

# train classifiers:

knn.o = knn(x.train, x.test, y.train, K.knn)

glm.o = glm(default~., data=dat.train, family=binomial(logit))

lda.o = lda(default~., data=dat.train)

qda.o = qda(default~., data=dat.train)

# test classifiers:

knn.p = knn.o

glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )

lda.p = predict(lda.o, newdata=dat.test)$class

qda.p = predict(qda.o, newdata=dat.test)$class

tb.knn = table(knn.p, y.test)

tb.glm = table(glm.p, y.test)

tb.lda = table(lda.p, y.test)

tb.qda = table(qda.p, y.test)

# store prediction accuracies:

acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)

acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)

acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)

acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)

#

# ROC/AUC analysis:

# WARNING: THIS IS NOT Pr(Y=1 | X), BUT Pr(Y = Y_hat | X):

# knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob

glm.p = predict(glm.o, newdata=dat.test, type="response")

lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]

qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]

# auc.knn[k] = roc(y.test, knn.p)$auc

auc.glm[k] = roc(y.test, glm.p)$auc

auc.lda[k] = roc(y.test, lda.p)$auc

auc.qda[k] = roc(y.test, qda.p)$auc
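# A hedged sketch for kNN: knn(..., prob=TRUE) returns the vote share of the
# *winning* class (hence the warning above), so flip it for the other class to
# approximate Pr(Y="Yes"|x) (assuming "Yes" is the positive label in Default):
knn.o2 = knn(x.train, x.test, y.train, K.knn, prob=TRUE)
vote.share = attributes(knn.o2)$prob
p.yes = ifelse(knn.o2=="Yes", vote.share, 1-vote.share)
auc.knn[k] = roc(y.test, p.yes)$auc
# (auc.knn could then be added to the AUC boxplot further below)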

}

boxplot(acc.knn, acc.glm, acc.lda, acc.qda,

main="Overall CV prediction accuracy",

names=c("kNN","GLM","LDA","QDA"))

boxplot(auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("GLM","LDA","QDA"))

##### Taking a closer look at performance (using the last CV fold's test set)

roc(y.test, glm.p)$auc

library(caret)

(tb = table(y.test, glm.p>.5))

pred = as.factor(glm.p>.5)

pred = car::recode(pred, "FALSE='No'; TRUE='Yes'")

caret::confusionMatrix(data=pred, reference=y.test)

sum(diag(tb))/sum(tb)

##### Further exercises for you to do:

## adapt code to evaluate sensitivity and specificity (a sketch is given below)

## add validation analysis...

## repeat on scaled data...
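# A minimal sketch of the first suggested exercise, assuming "Yes" is the
# positive class and reusing tb = table(y.test, glm.p>.5) from above
# (truth in rows, predicted label in columns):
sens.glm = tb["Yes","TRUE"] / sum(tb["Yes",]) # sensitivity = TP/(TP+FN)
spec.glm = tb["No","FALSE"] / sum(tb["No",])  # specificity = TN/(TN+FP)
c(sensitivity=sens.glm, specificity=spec.glm)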

###############################################################

### Exercise 6: benchmarking, again

###############################################################

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

x = dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV)

# Recover ROC curve manually from whole set:

n = nrow(x)

thrs = seq(.05,.95,by=.05)

acc = numeric(length(thrs))

sens = spec = numeric(length(thrs))

# the model fit does not depend on the cut-off, so fit it once outside the loop:

glmo = glm(default~., data=x, family=binomial)

for(ithr in 1:length(thrs)){

  thr = thrs[ithr]

  tb = table(glmo$fitted.values>thr, x$default)

  acc[ithr] = sum(diag(tb))/sum(tb)

  #

  # calculate sensitivity for a given threshold

  # sens[ithr] = ...

  # calculate specificity for a given threshold

  # spec[ithr] = ...

  # prediction

  # (a sketch of one way to fill in these blanks is given just after the loop)

}

plot(acc)
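# A minimal sketch of one way to fill in the two blanks above, assuming "Yes"
# is the positive class; plot(spec, sens) below then traces the empirical ROC
# points across the chosen cut-offs:
for(ithr in 1:length(thrs)){
  pred.lab = factor(glmo$fitted.values>thrs[ithr], levels=c(FALSE,TRUE))
  tb.roc = table(pred.lab, x$default) # rows = predicted, cols = truth
  sens[ithr] = tb.roc["TRUE","Yes"] / sum(tb.roc[,"Yes"])  # TP/(TP+FN)
  spec[ithr] = tb.roc["FALSE","No"] / sum(tb.roc[,"No"])   # TN/(TN+FP)
}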

plot(spec, sens)

# Evaluate a cross-validated ROC curve manually:

n = nrow(x)

K = 10

train.acc = test.acc = matrix(NA, nrow=K, ncol=length(thrs))

folds = cut(1:n, K, labels=FALSE)

thrs = seq(.05,.95,by=.05)

# the fit depends on the fold but not on the cut-off, so fit once per fold and
# then sweep the thresholds:

for(k in 1:K){

  itrain = which(folds!=k)

  glmo = glm(default~., data=x, family=binomial, subset=itrain)

  p.train = glmo$fitted.values

  p.test = predict(glmo, x[-itrain,], type='response')

  for(ithr in 1:length(thrs)){

    thr = thrs[ithr]

    tb = table(p.train>thr, x$default[itrain])

    train.acc[k, ithr] = sum(diag(tb))/sum(tb)

    #

    # calculate sensitivity for a given threshold

    # ...

    # calculate specificity for a given threshold

    # ...

    # prediction:

    tb = table(p.test>thr, x$default[-itrain])

    test.acc[k, ithr] = sum(diag(tb))/sum(tb)

  }

}

boxplot(test.acc)

# warnings()

mean(train.acc)

mean(test.acc)
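# A per-threshold summary (sketch): average the K folds at each cut-off and
# compare the train and test accuracy curves:
matplot(thrs, cbind(colMeans(train.acc), colMeans(test.acc)),
        type='b', pch=20, xlab='threshold', ylab='mean CV accuracy')
legend('bottomright', legend=c('train','test'), col=1:2, pch=20, bty='n')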
