An analyst who doesn't write R packages is not a good full-stack developer

Applying logit, GBM, knn, and xgboost to a credit card dataset



Prepare the data


The data comes from UCI (http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening) and describes credit card applications; the variable names and their meanings are not disclosed (presumably to protect privacy). This post fits logit, GBM, knn, and xgboost classifiers to the data and compares their accuracy.


The expected ranking of accuracy is:


xgboost > GBM > logit > knn



Download the data


dataset = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",
                     sep = ",",
                     header = F,
                     na.strings = "?")
head(dataset)

  V1    V2    V3 V4 V5 V6 V7   V8 V9 V10 V11 V12 V13 V14 V15 V16
1  b 30.83 0.000  u  g  w  v 1.25  t   t   1   f   g 202   0   +
2  a 58.67 4.460  u  g  q  h 3.04  t   t   6   f   g  43 560   +
3  a 24.50 0.500  u  g  q  h 1.50  t   f   0   f   g 280 824   +
4  b 27.83 1.540  u  g  w  v 3.75  t   t   5   t   g 100   3   +
5  b 20.17 5.625  u  g  w  v 1.71  t   f   0   f   s 120   0   +
6  b 32.08 4.000  u  g  m  v 2.50  t   f   0   t   g 360   0   +

## write.csv(dataset, file = "creditCard.csv")

That is what the data looks like. Next, check whether there are missing values and what class each variable is.


sapply(dataset,function(x) sum(is.na(x)))

 V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 V15 V16
 12  12   0   6   6   9   9   0   0   0   0   0   0  13   0   0

sapply(dataset,class)

       V1        V2        V3        V4        V5        V6        V7        V8        V9       V10
 "factor" "numeric" "numeric"  "factor"  "factor"  "factor"  "factor" "numeric"  "factor"  "factor"
      V11       V12       V13       V14       V15       V16
"integer"  "factor"  "factor" "integer" "integer"  "factor"



Train and Test


Split the data into a training set and a test set. Rows with missing values are dropped with na.omit first; with set.seed(123), 70% of the remaining rows go to the training set and 30% to the test set.


set.seed(123)
dataset = na.omit(dataset)

n = dim(dataset)[1]
index = sample(n,round(0.7*n))
train = dataset[index,]
test = dataset[-index,]


dim(train)

[1] 457  16

dim(test)

[1] 196  16
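
A quick extra check (not in the original post): the class balance of the target in the two splits, using only the objects defined above.

## proportion of "+" / "-" applications in each split (sketch)
prop.table(table(train$V16))
prop.table(table(test$V16))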



Convert the variables into dummy variables


Sometimes the variables need to be converted into dummy variables, because some modeling methods cannot use factor variables directly:



  • knn

  • glmnet

  • svm

  • xgboost


Other methods can use factor variables directly, for example:



  • logistic regression

  • rpart

  • GBM

  • randomforest


dataset2 = dataset
library(plyr)

into_factor = function(x){
    if(class(x) == "factor"){
        n = length(x)
        data.fac = data.frame(x = x, y = 1:n)
        output = model.matrix(y ~ x, data.fac)[,-1]
        ## convert a factor into a dummy variable matrix
    }else{
        output = x
        ## if x is numeric, leave it unchanged
    }
    output
}

into_factor(dataset$V4)[1:5,]

  xu xy
1  1  0
2  1  0
3  1  0
4  1  0
5  1  0

dataset2 = colwise(into_factor)(dataset2)
dataset2 = do.call(cbind,dataset2)
dataset2 = as.data.frame(dataset2)
head(dataset2)

  V1    V2    V3 xu xy xgg xp xc xcc xd xe xff xi xj xk xm xq xr xw xx xdd xff xh xj xn xo xv xz
1 1 30.83 0.000 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
2 0 58.67 4.460 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
3 0 24.50 0.500 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
4 1 27.83 1.540 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
5 1 20.17 5.625 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
6 1 32.08 4.000 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
V8 V9 V10 V11 V12 xp xs V14 V15 V16
1 1.25 1 1 1 0 0 0 202 0 1
2 3.04 1 1 6 0 0 0 43 560 1
3 1.50 1 0 0 0 0 0 280 824 1
4 3.75 1 1 5 1 0 0 100 3 1
5 1.71 1 0 0 0 0 1 120 0 1
6 2.50 1 0 0 1 0 0 360 0 1

dim(dataset2)

[1] 653  38
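
As a side note (not in the original post), model.matrix can perform a similar dummy expansion for the whole data frame in one call; the column count may differ slightly from into_factor because of how the intercept and the first factor level are handled. dataset.mm is a hypothetical name used only in this sketch.

## alternative one-liner sketch; `dataset` is the na.omit-ed data from above
dataset.mm = as.data.frame(model.matrix(V16 ~ . - 1, data = dataset))
dataset.mm$V16 = ifelse(dataset$V16 == "+", 1, 0)
dim(dataset.mm)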




Logistic Regression


Fit a logistic regression for modeling and prediction, using the glm function.


logit.model = glm(V16~.,data = train,family = "binomial")
logit.response = predict(logit.model,test,type = "response")
logit.predict = ifelse(logit.response>0.5,"+","-")
table(logit.predict,test$V16)


logit.predict  -  +
            - 90 24
            + 13 69

accurancy1 = mean(logit.predict == test$V16)
accurancy1

[1] 0.81122
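
A small extra check (not in the original post): the 0.5 cut-off above is arbitrary, and the accuracy at a few other thresholds can be read off directly from logit.response.

## accuracy of the logit predictions at several probability thresholds (sketch)
sapply(c(0.4, 0.5, 0.6), function(th)
    mean(ifelse(logit.response > th, "+", "-") == test$V16))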



GBM


Use GBM for prediction, with caret and repeated cross-validation to choose the best model.


library(caret)
ctrl = trainControl(method = "repeatedcv",
                    number = 5, repeats = 5)

set.seed(300)
m_gbm = train(V16 ~ ., data = train, method = "gbm",
              metric = "Kappa", trControl = ctrl)


gbm.predict = predict(m_gbm,test)
table(gbm.predict,test$V16)

accurancy2 = mean(gbm.predict == test$V16)
accurancy2

[1] 0.85714
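
The model above uses caret's default tuning grid; for finer control an explicit tuneGrid can be passed instead. The grid below is an illustrative sketch, not the settings behind the result above.

## sketch: explicit gbm tuning grid for caret (values are assumptions)
grid_gbm = expand.grid(n.trees = c(50, 100, 150),
                       interaction.depth = 1:3,
                       shrinkage = 0.1,
                       n.minobsinnode = 10)
set.seed(300)
m_gbm2 = train(V16 ~ ., data = train, method = "gbm",
               metric = "Kappa", trControl = ctrl,
               tuneGrid = grid_gbm, verbose = FALSE)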



knn method for classification



knn with k = 5


This is a model without cross-validation


First fit a knn model with no cross-validation, no scaling, and no data type conversion. Without the dummy-variable conversion, the factor variables are dropped and only the numeric variables are used.


library(caret)
knn.model1 = knn3(V16 ~ .,data = train, k = 5)
knn.response1 = predict(knn.model1,test,type = "prob")
knn.predict1 = ifelse(knn.response1[,1]<0.5,"+","-")
table(knn.predict1,test$V16)


knn.predict1  -  +
           - 78 48
           + 25 45

mean(knn.predict1 == test$V16)

[1] 0.62755



knn after scaling


Accuracy after scaling and converting to dummy variables:


knn.dataset = cbind(
    colwise(scale)(dataset2[,-38]),
    V16 = as.factor(dataset2$V16)
)

set.seed(123)

index = sample(n,round(0.7*n))
train.knn = knn.dataset[index,]
test.knn = knn.dataset[-index,]


knn.model1 = knn3(V16 ~ .,data = train.knn, k = 5)
knn.predict1 = predict(knn.model1,test.knn,type = "class")
table(knn.predict1,test.knn$V16)


knn.predict1  0  1
           0 89 32
           1 14 61

mean(knn.predict1 == test.knn$V16)

[1] 0.76531



knn CV for k


My own attempt



Both my own CV function below and caret end up suggesting that k = 2 gives the smallest error, but that is not what actually happens on the test set.



library(caret)  ## knn3() comes from caret

cv.knn = function(data, n = 5, k){
    ## simple n-fold cross-validation for knn3 with a given k
    index = sample(1:n, nrow(data), replace = T)
    acc = numeric(n)
    for (i in 1:n){
        ind = index == i
        train = data[!ind,]
        test  = data[ind,]
        knn.model1 = knn3(V16 ~ ., data = train, k = k)
        knn.predict = predict(knn.model1, test, type = "class")
        acc[i] = mean(knn.predict == test$V16)
    }
    mean(acc)
}

cv.knn(train.knn,3,5)

[1] 0.8533

k = 2:20
set.seed(123)
acc = sapply(k,function(x) cv.knn(train.knn,3,x))

plot(k,acc,type = "b")




k.final = k[which.max(acc)]  ## map the index back to the k value (k starts at 2)



knn.model.f = knn3(V16 ~ .,data = train.knn, k = k.final)
knn.predict.f = predict(knn.model.f,test.knn,type = "class")
table(knn.predict.f,test.knn$V16)


knn.predict.f  0  1
            0 81 31
            1 22 62

mean(knn.predict.f == test.knn$V16)

[1] 0.72959

library(caret)
fitControl <- trainControl(method = "cv",
                           number = 10)

knnTune <- train(x = dataset2[, 1:37], y = as.factor(dataset2[, 38]),
                 method = "knn",
                 preProc = c("center", "scale"),
                 tuneGrid = data.frame(k = 1:20),
                 trControl = fitControl)
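
The k selected by the 10-fold CV, and the full resampling profile over k, can be read from the fitted object with the standard caret accessors:

knnTune$bestTune   ## the chosen k
plot(knnTune)      ## accuracy profile across the candidate k values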



Evaluating directly on the train/test split:


k = 5 turns out to work best.


knn_train_test = function(train, test, k = 5){
    knn.model.f = knn3(V16 ~ ., data = train, k = k)
    knn.predict.f = predict(knn.model.f, test, type = "class")
    mean(knn.predict.f == test$V16)
}

x = 1:20
result = sapply(x,
                function(x) knn_train_test(train.knn,test.knn,k = x))


plot(x,result,type = "b")




k.final = which.max(result)
accurancy3 = knn_train_test(train.knn,test.knn,k = k.final)
accurancy3

[1] 0.75




xgboost


Install:


## devtools::install_github('dmlc/xgboost', subdir = 'R-package')

require(xgboost)
require(methods)
require(plyr)

set.seed(123)

index = sample(n,round(0.7*n))
train.xg = dataset2[index,]
test.xg = dataset2[-index,]

label  <- as.matrix(train.xg[,38,drop = F])
data   <- as.matrix(train.xg[,-38,drop = F])

label2 <- as.matrix(test.xg[,38,drop = F])
data2  <- as.matrix(test.xg[,-38,drop = F])

xgmat <- xgb.DMatrix(data, label = label, missing = -10000)
param <- list("objective" = "binary:logistic",
              "bst:eta" = 1,
              "bst:max_depth" = 2,
              "eval_metric" = "logloss",
              "silent" = 1,
              "nthread" = 16,
              "min_child_weight" = 1.45)

nround =275

bst = xgb.train(param, xgmat, nround )

res1 = predict(bst,data2)
pre1 = ifelse(res1>0.5,1,0)
table(pre1,label2)

    label2
pre1  0  1
   0 91 15
   1 12 78

accurancy4 = mean(pre1 ==label2)
accurancy4

[1] 0.86224
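
The number of boosting rounds is fixed at 275 above. As a rough sketch (not in the original post), xgb.cv can be used to watch the cross-validated logloss over the rounds; the exact fields of the returned object depend on the xgboost version.

## 5-fold CV with the same parameters; recent versions expose the per-round
## results in cv.res$evaluation_log
set.seed(123)
cv.res = xgb.cv(params = param, data = xgmat, nrounds = nround, nfold = 5)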



Final Results

Method                 Accuracy
logistic regression     0.81122
GBM                     0.85714
knn                     0.75
xgboost                 0.86224
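
The same comparison can be rebuilt directly from the accuracy objects computed above:

## assemble the final comparison table from the results above
results = data.frame(Method   = c("logistic regression", "GBM", "knn", "xgboost"),
                     Accuracy = c(accurancy1, accurancy2, accurancy3, accurancy4))
results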
