###########################################################################
## May 31: Fei Yuan
## Try the tree method on the whole training set of the KDD data in order
## to find the most useful variables among the 74 explanatory variables.
###########################################################################
###########################################################################
## Results (pruned tree): 18 variables
##   x5,x11,x28,x33,x35,x38,x40,x45,x50,x53,x55,x57,x58,x59,x60,x63,x68,x73
## These are the variables that appear among the top-five primary split
## candidates in the summary output recorded at the bottom of this script.
## The splits actually made (the first candidate listed at each internal
## node) use only x38, x40, x53, x55, x63 and x68.
###########################################################################

library(rpart)

# `kdd.train.new` is not created in this script; it must already exist in
# the environment the script is run from.
train <- kdd.train.new

# Response y against the 74 explanatory variables x1, ..., x74. Built
# programmatically instead of typing out all 74 terms.
kdd.formula <- reformulate(paste0("x", seq_len(74)), response = "y")

# Grow a deliberately large classification tree (cp = 0 disables the
# complexity stopping rule) using the information (entropy) split criterion.
# Surrogate splits are switched off with maxsurrogate = 0.
kdd.rpart <- rpart(
  kdd.formula,
  data = train,
  method = "class",
  parms = list(split = "information"),
  cp = 0,
  minsplit = 10,
  minbucket = 5,
  maxsurrogate = 0
)

# Prune back to the 7-split tree: cp = 0.00657895 is the value shown in
# row 4 of the CP table recorded below.
kdd.rpart.prune <- prune(kdd.rpart, cp = 0.00657895)

summary(kdd.rpart.prune)

# -------------------------------------------------------------------------
# Recorded console output of summary(kdd.rpart.prune) from the original run.
# NOTE(review): xerror / xstd come from rpart's internal cross-validation,
# so a re-run is not expected to reproduce those two columns exactly.
# -------------------------------------------------------------------------
# Call:
# rpart(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 +
#     x11 + x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 +
#     x21 + x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + x30 +
#     x31 + x32 + x33 + x34 + x35 + x36 + x37 + x38 + x39 + x40 +
#     x41 + x42 + x43 + x44 + x45 + x46 + x47 + x48 + x49 + x50 +
#     x51 + x52 + x53 + x54 + x55 + x56 + x57 + x58 + x59 + x60 +
#     x61 + x62 + x63 + x64 + x65 + x66 + x67 + x68 + x69 + x70 +
#     x71 + x72 + x73 + x74, data = train, method = "class",
#     parms = list(split = "information"), cp = 0, minsplit = 10,
#     minbucket = 5, maxsurrogate = 0)
#   n= 145751
#
#            CP nsplit rel error    xerror       xstd
# 1 0.611882716      0 1.0000000 1.0000000 0.02765400
# 2 0.008873457      1 0.3881173 0.4004630 0.01754705
# 3 0.008487654      3 0.3703704 0.3858025 0.01722399
# 4 0.006578950      7 0.3364198 0.3873457 0.01725829
#
# Node number 1: 145751 observations,    complexity param=0.6118827
#   predicted class=0  expected loss=0.008891877
#     class counts: 144455  1296
#    probabilities: 0.991 0.009
#   left son=2 (144874 obs) right son=3 (877 obs)
#   Primary splits:
#       x53 < 4.445  to the left,  improve=4131.006, (0 missing)
#       x55 < -40.25 to the right, improve=4000.869, (0 missing)
#       x58 < 6.195  to the left,  improve=3896.459, (0 missing)
#       x59 < 36.75  to the left,  improve=3775.973, (0 missing)
#       x60 < -92.75 to the right, improve=3610.691, (0 missing)
#
# Node number 2: 144874 observations,    complexity param=0.008487654
#   predicted class=0  expected loss=0.003182075
#     class counts: 144413   461
#    probabilities: 0.997 0.003
#   left son=4 (137240 obs) right son=5 (7634 obs)
#   Primary splits:
#       x55 < 15.75  to the right, improve=349.8462, (0 missing)
#       x53 < 1.505  to the left,  improve=341.6967, (0 missing)
#       x63 < 2.005  to the left,  improve=338.4546, (0 missing)
#       x35 < -15.45 to the right, improve=271.5882, (0 missing)
#       x5  < -7.25  to the right, improve=239.4035, (0 missing)
#
# Node number 3: 877 observations,    complexity param=0.008873457
#   predicted class=1  expected loss=0.04789054
#     class counts:    42   835
#    probabilities: 0.048 0.952
#   left son=6 (57 obs) right son=7 (820 obs)
#   Primary splits:
#       x55 < -28.5  to the right, improve=80.25399, (0 missing)
#       x60 < -67.25 to the right, improve=70.32789, (0 missing)
#       x58 < 3.95   to the left,  improve=69.67247, (0 missing)
#       x59 < 16.75  to the left,  improve=68.12516, (0 missing)
#       x53 < 5.135  to the left,  improve=67.24266, (0 missing)
#
# Node number 4: 137240 observations
#   predicted class=0  expected loss=0.001741475
#     class counts: 137001   239
#    probabilities: 0.998 0.002
#
# Node number 5: 7634 observations,    complexity param=0.008487654
#   predicted class=0  expected loss=0.02908043
#     class counts:  7412   222
#    probabilities: 0.971 0.029
#   left son=10 (5971 obs) right son=11 (1663 obs)
#   Primary splits:
#       x40 < -37.55 to the right, improve=183.9589, (0 missing)
#       x45 < -59.5  to the right, improve=162.5006, (0 missing)
#       x50 < -65.5  to the right, improve=155.7119, (0 missing)
#       x68 < 3.105  to the left,  improve=152.1550, (0 missing)
#       x63 < 2.735  to the left,  improve=147.9014, (0 missing)
#
# Node number 6: 57 observations,    complexity param=0.008873457
#   predicted class=0  expected loss=0.4210526
#     class counts:    33    24
#    probabilities: 0.579 0.421
#   left son=12 (43 obs) right son=13 (14 obs)
#   Primary splits:
#       x63 < 2.01   to the left,  improve=15.47488, (0 missing)
#       x58 < 2.31   to the left,  improve=12.81102, (0 missing)
#       x57 < 1.57   to the left,  improve=12.06921, (0 missing)
#       x11 < 446.8  to the right, improve=11.40747, (0 missing)
#       x33 < 4.145  to the left,  improve=10.74221, (0 missing)
#
# Node number 7: 820 observations
#   predicted class=1  expected loss=0.01097561
#     class counts:     9   811
#    probabilities: 0.011 0.989
#
# Node number 10: 5971 observations
#   predicted class=0  expected loss=0.007033998
#     class counts:  5929    42
#    probabilities: 0.993 0.007
#
# Node number 11: 1663 observations,    complexity param=0.008487654
#   predicted class=0  expected loss=0.1082381
#     class counts:  1483   180
#    probabilities: 0.892 0.108
#   left son=22 (1454 obs) right son=23 (209 obs)
#   Primary splits:
#       x38 < 6.185  to the left,  improve=84.87885, (0 missing)
#       x68 < 3.105  to the left,  improve=83.34055, (0 missing)
#       x33 < 2.635  to the left,  improve=82.31492, (0 missing)
#       x63 < 2.48   to the left,  improve=82.20234, (0 missing)
#       x28 < 1.685  to the left,  improve=71.58172, (0 missing)
#
# Node number 12: 43 observations
#   predicted class=0  expected loss=0.2325581
#     class counts:    33    10
#    probabilities: 0.767 0.233
#
# Node number 13: 14 observations
#   predicted class=1  expected loss=0
#     class counts:     0    14
#    probabilities: 0.000 1.000
#
# Node number 22: 1454 observations
#   predicted class=0  expected loss=0.06327373
#     class counts:  1362    92
#    probabilities: 0.937 0.063
#
# Node number 23: 209 observations,    complexity param=0.008487654
#   predicted class=0  expected loss=0.4210526
#     class counts:   121    88
#    probabilities: 0.579 0.421
#   left son=46 (123 obs) right son=47 (86 obs)
#   Primary splits:
#       x68 < 2.915  to the left,  improve=35.18255, (0 missing)
#       x63 < 1.515  to the left,  improve=27.50148, (0 missing)
#       x28 < 2.975  to the left,  improve=21.78615, (0 missing)
#       x38 < 9.245  to the left,  improve=18.27168, (0 missing)
#       x73 < 0.355  to the left,  improve=17.94550, (0 missing)
#
# Node number 46: 123 observations
#   predicted class=0  expected loss=0.1869919
#     class counts:   100    23
#    probabilities: 0.813 0.187
#
# Node number 47: 86 observations
#   predicted class=1  expected loss=0.244186
#     class counts:    21    65
#    probabilities: 0.244 0.756