Auto Byte

Science AI

作者：Leihua Ye（UC Santa Barbara）　翻译：陈超　校对：冯羽、谭佳瑶　编辑：于腾凯

# R语言中K邻近算法的初学者指南：从菜鸟到大神（附代码＆链接）

Mathyas Kurmann拍摄，来自于Unsplash。“如果你住的地方离比尔·盖茨家只有5分钟路程，我敢打赌你很富有。”

“为了决定新观测样本的标签，我们就看最邻近样本。”

https://en.m.wikipedia.org/wiki/K-nearest_neighbors_algorithm，本文将不会过多讨论数学问题。

1. 将数据分成K个均匀分布的块/层

2. 选择一个块/层集作为测试集，剩下的K-1块/层作为训练集

3. 基于训练集建立ML模型

4. 仅比较测试集当中的预测值和真实值

5. 将ML模型应用到测试集，并使用每个块重复测试K次

6. 把模型的度量得分加和并求K层的平均值

Jon Tyson拍摄，来自于Unsplash

“你好邻居！快进来吧。”

R语言实现

1. 软件准备

# Package setup -------------------------------------------------------------
# Install once, then load each session. (The original text used typographic
# quotes, which are invalid R syntax; replaced with ASCII quotes.)
# install.packages("ISLR")
# install.packages("ggplot2")
# install.packages("plyr")
# install.packages("dplyr")
# install.packages("class")
# install.packages("ROCR")

# Load libraries
library(ISLR)      # example data sets
library(ggplot2)   # plotting
library(reshape2)  # melt(): wide -> long reshaping
library(plyr)      # ldply(): apply over a list, bind rows to a data.frame
library(dplyr)     # data-manipulation verbs (%>%, select, filter, ...)
library(class)     # knn() classifier
library(ROCR)      # prediction()/performance() -- used in the ROC/AUC section
                   # but never loaded in the original article (bug fix)

# Load data and clean the dataset --------------------------------------------
# NOTE(review): the original article reads the UCI bank-marketing data before
# this point, e.g.
#   banking <- read.csv("bank-additional-full.csv", sep = ";", header = TRUE)
# -- confirm the data source before running; `banking` is undefined otherwise.

# Check for missing data: show any rows that are not complete cases
banking[!complete.cases(banking), ]

# Re-code qualitative (factor) variables into numeric.
# NOTE(review): this semicolon-delimited string syntax is car::recode(),
# not dplyr::recode() -- the car package must be loaded for these calls.
# NOTE(review): the first three job levels were lost in the extracted text
# and reconstructed from the UCI bank-marketing codebook -- please confirm.
banking$job <- recode(banking$job, "'admin.'=1;'blue-collar'=2;'entrepreneur'=3;
'housemaid'=4;'management'=5;'retired'=6;'self-employed'=7;'services'=8;
'student'=9;'technician'=10;'unemployed'=11;'unknown'=12")

banking$marital <- recode(banking$marital,
  "'divorced'=1;'married'=2;'single'=3;'unknown'=4")

banking$education <- recode(banking$education,
  "'basic.4y'=1;'basic.6y'=2;'basic.9y'=3;'high.school'=4;'illiterate'=5;
'professional.course'=6;'university.degree'=7;'unknown'=8")

banking$default <- recode(banking$default, "'no'=1;'yes'=2;'unknown'=3")

banking$housing <- recode(banking$housing, "'no'=1;'yes'=2;'unknown'=3")

banking$loan <- recode(banking$loan, "'no'=1;'yes'=2;'unknown'=3")

# BUG FIX: the original recoded banking$loan into banking$contact;
# contact must be derived from banking$contact itself.
banking$contact <- recode(banking$contact, "'cellular'=1;'telephone'=2")

banking$month <- recode(banking$month,
  "'mar'=1;'apr'=2;'may'=3;'jun'=4;'jul'=5;'aug'=6;'sep'=7;'oct'=8;'nov'=9;'dec'=10")

banking$day_of_week <- recode(banking$day_of_week,
  "'mon'=1;'tue'=2;'wed'=3;'thu'=4;'fri'=5")

banking$poutcome <- recode(banking$poutcome,
  "'failure'=1;'nonexistent'=2;'success'=3")

# Remove variable "pdays": it has no variation
banking$pdays <- NULL

# Remove variable "duration": it is collinear with the DV
banking$duration <- NULL

# EDA of the DV: class distribution of the outcome y
plot(banking$y, main = "Plot 1: Distribution of Dependent Variable")

我在之前的博客（https://towardsdatascience.com/classifying-rare-events-using-five-machine-learning-techniques-fab464573233）中介绍了KNN在与其他ML方法进行比较之后表现得更好。这个可能是参数和非参数模型中潜在的数学和统计假设导致的。

2. 数据分组

# 2. Data split ---------------------------------------------------------------
# Split the dataset into training and test sets randomly, but set the seed
# so that the same split is generated each time the code is run.
set.seed(1)

# Size of the test set: 20% of the rows (rounded to an integer)
index <- round(nrow(banking) * 0.2, digits = 0)

# Sample row indices at random; these rows form the test set.
# seq_len() is used instead of 1:nrow() (safe when nrow is 0).
test.indices <- sample(seq_len(nrow(banking)), index)

# 80% training set / 20% test set
banking.train <- banking[-test.indices, ]
banking.test  <- banking[test.indices, ]

# Separate the DV (y) from the predictors in each set
YTrain <- banking.train$y
XTrain <- banking.train %>% select(-y)
YTest  <- banking.test$y
XTest  <- banking.test %>% select(-y)

3. 训练模型

# Misclassification rate: the proportion of elements where the predicted
# label differs from the true label.
#   predicted.value: vector of predicted labels
#   true.value:      vector of true labels (same length)
# Returns a single number in [0, 1].
calc_error_rate <- function(predicted.value, true.value) {
  mismatch <- true.value != predicted.value
  mean(mismatch)
}

# 10-fold cross-validation setup ----------------------------------------------
nfold <- 10
set.seed(1)

# Assign each training row to one of nfold folds:
# cut() divides the row sequence into nfold equal intervals, and sample()
# shuffles the fold labels so the folds are random.
# (In the original text, sample() and the following function definition were
# fused into "sampledo.chunk" by the extraction; they are separated here.)
folds <- seq.int(nrow(banking.train)) %>%
  cut(breaks = nfold, labels = FALSE) %>%
  sample()

# Fit knn for one fold split and return that fold's training and validation
# error rates as a one-row data.frame.
#   chunkid: id of the fold held out for validation
#   folddef: fold assignment for every training row
#   Xdat:    predictor columns
#   Ydat:    true labels
#   k:       number of neighbors for knn()
do.chunk <- function(chunkid, folddef, Xdat, Ydat, k) {
  train <- (folddef != chunkid)  # logical training index

  Xtr <- Xdat[train, ]           # training set by the index
  Ytr <- Ydat[train]             # true labels in training set
  Xvl <- Xdat[!train, ]          # validation set
  Yvl <- Ydat[!train]            # true labels in validation set

  predYtr <- knn(train = Xtr, test = Xtr, cl = Ytr, k = k)  # predict training labels
  predYvl <- knn(train = Xtr, test = Xvl, cl = Ytr, k = k)  # predict validation labels

  data.frame(fold        = chunkid,
             train.error = calc_error_rate(predYtr, Ytr),  # training error per fold
             val.error   = calc_error_rate(predYvl, Yvl))  # validation error per fold
}

# Accumulator for the per-fold errors collected across all values of k
error.folds <- NULL

# Candidate numbers of neighbors: 1, 10, 20, 30, 40, 50
kvec <- c(1, seq(10, 50, length.out = 5))

set.seed(1)
for (j in kvec) {
  # Apply do.chunk() to each of the nfold folds and stack the one-row results
  tmp <- ldply(1:nfold, do.chunk,
               folddef = folds, Xdat = XTrain, Ydat = YTrain, k = j)
  tmp$neighbors <- j                       # track this value of k
  error.folds <- rbind(error.folds, tmp)   # combine the results
}

# melt() (reshape2) turns the wide-format error table into long format:
# one row per (fold, neighbors, error-type)
errors <- melt(error.folds, id.vars = c("fold", "neighbors"),
               value.name = "error")

val.error.means <- errors %>%
  # keep only the validation-error rows
  filter(variable == "val.error") %>%
  # group the selected data by neighbors
  group_by(neighbors, variable) %>%
  # calculate the mean CV error for each k
  # (modernized: summarise_each()/funs() are deprecated in dplyr)
  summarise(error = mean(error)) %>%
  # remove the existing grouping
  ungroup() %>%
  # keep the k (or ks) achieving the smallest CV error
  filter(error == min(error))

# The best number of neighbors: the largest k among the minimizers
numneighbor <- max(val.error.means$neighbors)
numneighbor
## [1] 20

Nick Youngson

4. 一些模型的度量

# 4. Model metrics ------------------------------------------------------------

# Training error: predict the training labels with the chosen k = 20
set.seed(20)
pred.YTtrain <- knn(train = XTrain, test = XTrain, cl = YTrain, k = 20)
# NOTE(review): "traing" is a typo for "training"; the name is kept in case
# later code (outside this excerpt) references it.
knn_traing_error <- calc_error_rate(predicted.value = pred.YTtrain,
                                    true.value = YTrain)
knn_traing_error
## [1] 0.101214

# Test error: predict the held-out test labels
set.seed(20)
pred.YTest <- knn(train = XTrain, test = XTest, cl = YTrain, k = 20)
knn_test_error <- calc_error_rate(predicted.value = pred.YTest,
                                  true.value = YTest)
knn_test_error
## [1] 0.1100995

# Confusion matrix: predicted vs. true labels on the test set
conf.matrix <- table(predicted = pred.YTest, true = YTest)

Accuracy = (TP +TN)/(TP+FP+FN+TN)

TPR/Recall/Sensitivity = TP/(TP+FN)

Precision = TP/(TP+FP)

Specificity = TN/(TN+FP)

FPR = 1 - Specificity = FP/(TN+FP)

F1 Score = 2*TP/(2*TP+FP+FN) = 2*Precision*Recall/(Precision+Recall)

# Test accuracy rate: proportion of predictions on the diagonal of the
# confusion matrix
sum(diag(conf.matrix)) / sum(conf.matrix)
## [1] 0.8899005

# Test error rate = 1 - accuracy
# BUG FIX: the original called drag(), which does not exist; diag() is correct.
1 - sum(diag(conf.matrix)) / sum(conf.matrix)
## [1] 0.1100995

# ROC and AUC ------------------------------------------------------------------
# NOTE(review): prediction() and performance() come from the ROCR package;
# make sure library(ROCR) has been loaded before running this section.

# Refit knn with prob = TRUE so the winning-class vote proportion is attached
# to the result as the "prob" attribute.
knn_model <- knn(train = XTrain, test = XTrain, cl = YTrain, k = 20,
                 prob = TRUE)

# Convert the winning-class vote proportion into a signed score for the
# positive class. (The original text used an em-dash, which is invalid R;
# replaced with the minus operator.)
prob <- attr(knn_model, "prob")
prob <- 2 * ifelse(knn_model == "-1", prob, 1 - prob) - 1

pred_knn <- prediction(prob, YTrain)
performance_knn <- performance(pred_knn, "tpr", "fpr")

# AUC
auc_knn <- performance(pred_knn, "auc")@y.values
auc_knn
## [1] 0.8470583

plot(performance_knn, col = 2, lwd = 2, main = "ROC Curves for KNN")

Beginner’s Guide to K-Nearest Neighbors in R: from Zero to Hero

https://www.kdnuggets.com/2020/01/beginners-guide-nearest-neighbors-r.html

THU数据派

THU数据派"基于清华，放眼世界"，以扎实的理工功底闯荡“数据江湖”。发布全球大数据资讯，定期组织线下活动，分享前沿产业动态。了解清华大数据，敬请关注姐妹号“数据派THU”。