###########################################################################
## May 31: Fei Yuan
## Try the tree method on blocks 7 and 244 of the KDD training data,
## in order to find the most useful variables among the 74 explanatory
## variables.
###########################################################################

###########################################################################
## Conclusion:
## Block 7:   pruned and unpruned: x3, x4, x5, x8, x9
## Block 244: unpruned: x3, x21, x29, x45, x51, x52, x53, x54, x55, x58,
##                      x60, x74
##            pruned:   x3, x29, x45, x53, x54, x55, x58, x60, x74
###########################################################################

kdd.train.new <- kdd.train
## Name the columns: block ID, example ID, response y, predictors x1..x74.
dimnames(kdd.train.new)[[2]] <- c("BlockID", "ExampleID", "y",
                                  paste("x", 1:74, sep = ""))

## Pull out the rows for block 7.
blockNum <- 7
block <- kdd.train.new[kdd.train.new[, 1] == blockNum, ]
train <- block

library(rpart)
set.seed(700)

## Build the formula string "y~x1+x2+...+x74".
myString <- "y~"
for (i in 1:74) {
  if (i < 74)
    myString <- paste(myString, "x", i, "+", sep = "")
  else
    myString <- paste(myString, "x", i, sep = "")
}

## Grow a large classification tree (cp = 0) using the information split rule.
kdd.rpart <- rpart(y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 +
                     x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 +
                     x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + x30 + x31 +
                     x32 + x33 + x34 + x35 + x36 + x37 + x38 + x39 + x40 + x41 +
                     x42 + x43 + x44 + x45 + x46 + x47 + x48 + x49 + x50 + x51 +
                     x52 + x53 + x54 + x55 + x56 + x57 + x58 + x59 + x60 + x61 +
                     x62 + x63 + x64 + x65 + x66 + x67 + x68 + x69 + x70 + x71 +
                     x72 + x73 + x74,
                   data = train, method = "class",
                   parms = list(split = "information"),
                   cp = 0, minsplit = 10, minbucket = 5, maxsurrogate = 0)

## Prune the tree and write its summary to a file.
kdd.rpart.prune <- prune(kdd.rpart, cp = 0.00657895)
summary(kdd.rpart.prune,
        file = "/home/grad/fyuan/public_html/kdd/prune_summary.out")

## A smaller fit using only x1 and x2.
kdd.block.rpart <- rpart(y ~ x1 + x2, data = train, method = "class",
                         parms = list(split = "information"),
                         cp = 0, minsplit = 10, minbucket = 5, maxsurrogate = 0)
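## How might a pruning threshold like cp = 0.00657895 be chosen?  A common
## recipe is to inspect the cross-validated error in the complexity table
## (printcp) and apply the 1-SE rule.  The sketch below illustrates that
## recipe only; kdd.rpart.prune.1se is a hypothetical name, and this is not
## necessarily how the value used above was obtained.
printcp(kdd.rpart)                         # CP table with xerror and xstd

cptab  <- kdd.rpart$cptable
best   <- which.min(cptab[, "xerror"])     # row with the smallest xerror
thresh <- cptab[best, "xerror"] + cptab[best, "xstd"]
pick   <- min(which(cptab[, "xerror"] <= thresh))   # smallest tree within 1 SE
kdd.rpart.prune.1se <- prune(kdd.rpart, cp = cptab[pick, "CP"])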
===============
> summary(kdd.rpart.prune)
Call:
rpart(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 +
    x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 +
    x24 + x25 + x26 + x27 + x28 + x29 + x30 + x31 + x32 + x33 + x34 + x35 +
    x36 + x37 + x38 + x39 + x40 + x41 + x42 + x43 + x44 + x45 + x46 + x47 +
    x48 + x49 + x50 + x51 + x52 + x53 + x54 + x55 + x56 + x57 + x58 + x59 +
    x60 + x61 + x62 + x63 + x64 + x65 + x66 + x67 + x68 + x69 + x70 + x71 +
    x72 + x73 + x74, data = train, method = "class",
    parms = list(split = "information"), cp = 0, minsplit = 10,
    minbucket = 5, maxsurrogate = 0)
  n= 975

         CP nsplit  rel error    xerror       xstd
1 0.9565217      0 1.00000000 1.0000000 0.20604034
2 0.0000000      1 0.04347826 0.1304348 0.07519061

Node number 1: 975 observations,    complexity param=0.9565217
  predicted class=0  expected loss=0.02358974
    class counts:   952    23
   probabilities: 0.976 0.024
  left son=2 (953 obs) right son=3 (22 obs)
  Primary splits:
      x3 < 5.285   to the left,  improve=101.0472, (0 missing)
      x4 < 117.75  to the left,  improve=101.0472, (0 missing)
      x5 < -157.75 to the right, improve=101.0472, (0 missing)
      x8 < 6.905   to the left,  improve=101.0472, (0 missing)
      x9 < 85.25   to the left,  improve=101.0472, (0 missing)

Node number 2: 953 observations
  predicted class=0  expected loss=0.001049318
    class counts:   952     1
   probabilities: 0.999 0.001

Node number 3: 22 observations
  predicted class=1  expected loss=0
    class counts:     0    22
   probabilities: 0.000 1.000

> summary(kdd.rpart)
Call:
rpart(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 +
    x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 +
    x24 + x25 + x26 + x27 + x28 + x29 + x30 + x31 + x32 + x33 + x34 + x35 +
    x36 + x37 + x38 + x39 + x40 + x41 + x42 + x43 + x44 + x45 + x46 + x47 +
    x48 + x49 + x50 + x51 + x52 + x53 + x54 + x55 + x56 + x57 + x58 + x59 +
    x60 + x61 + x62 + x63 + x64 + x65 + x66 + x67 + x68 + x69 + x70 + x71 +
    x72 + x73 + x74, data = train, method = "class",
    parms = list(split = "information"), cp = 0, minsplit = 10,
    minbucket = 5, maxsurrogate = 0)
  n= 975

         CP nsplit  rel error    xerror       xstd
1 0.9565217      0 1.00000000 1.0000000 0.20604034
2 0.0000000      1 0.04347826 0.1304348 0.07519061

Node number 1: 975 observations,    complexity param=0.9565217
  predicted class=0  expected loss=0.02358974
    class counts:   952    23
   probabilities: 0.976 0.024
  left son=2 (953 obs) right son=3 (22 obs)
  Primary splits:
      x3 < 5.285   to the left,  improve=101.0472, (0 missing)
      x4 < 117.75  to the left,  improve=101.0472, (0 missing)
      x5 < -157.75 to the right, improve=101.0472, (0 missing)
      x8 < 6.905   to the left,  improve=101.0472, (0 missing)
      x9 < 85.25   to the left,  improve=101.0472, (0 missing)

Node number 2: 953 observations
  predicted class=0  expected loss=0.001049318
    class counts:   952     1
   probabilities: 0.999 0.001

Node number 3: 22 observations
  predicted class=1  expected loss=0
    class counts:     0    22
   probabilities: 0.000 1.000

===============================
## Repeat for block 244.
blockNum <- 244
block <- kdd.train.new[kdd.train.new[, 1] == blockNum, ]
train <- block
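## The rpart() call above is simply re-run after reassigning `train` to
## block 244 (the fit itself is not repeated in this log).  A small wrapper
## would avoid retyping it for each block; fit_block() is a hypothetical
## helper, not part of the original run, reusing the formula string and
## settings defined above.
fit_block <- function(data, blockNum) {
  train <- data[data[, 1] == blockNum, ]
  rpart(as.formula(myString), data = train, method = "class",
        parms = list(split = "information"),
        cp = 0, minsplit = 10, minbucket = 5, maxsurrogate = 0)
}
## e.g.  kdd.rpart <- fit_block(kdd.train.new, 244)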
> summary(kdd.rpart)
Call:
rpart(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 +
    x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 +
    x24 + x25 + x26 + x27 + x28 + x29 + x30 + x31 + x32 + x33 + x34 + x35 +
    x36 + x37 + x38 + x39 + x40 + x41 + x42 + x43 + x44 + x45 + x46 + x47 +
    x48 + x49 + x50 + x51 + x52 + x53 + x54 + x55 + x56 + x57 + x58 + x59 +
    x60 + x61 + x62 + x63 + x64 + x65 + x66 + x67 + x68 + x69 + x70 + x71 +
    x72 + x73 + x74, data = train, method = "class",
    parms = list(split = "information"), cp = 0, minsplit = 10,
    minbucket = 5, maxsurrogate = 0)
  n= 817

          CP nsplit rel error  xerror       xstd
1 0.87500000      0   1.00000 1.00000 0.17328015
2 0.01041667      1   0.12500 0.15625 0.06966297
3 0.00000000      4   0.09375 0.25000 0.08795454

Node number 1: 817 observations,    complexity param=0.875
  predicted class=0  expected loss=0.03916769
    class counts:   785    32
   probabilities: 0.961 0.039
  left son=2 (787 obs) right son=3 (30 obs)
  Primary splits:
      x53 < 2.765  to the left,  improve=110.95440, (0 missing)
      x55 < -17.75 to the right, improve=102.57640, (0 missing)
      x3  < 2.825  to the left,  improve= 92.58578, (0 missing)
      x58 < 3.165  to the left,  improve= 90.90784, (0 missing)
      x54 < 59     to the left,  improve= 88.02576, (0 missing)

Node number 2: 787 observations,    complexity param=0.01041667
  predicted class=0  expected loss=0.003811944
    class counts:   784     3
   probabilities: 0.996 0.004
  left son=4 (687 obs) right son=5 (100 obs)
  Primary splits:
      x29 < -2.445 to the right, improve=6.228906, (0 missing)
      x60 < -39.25 to the right, improve=5.649089, (0 missing)
      x58 < 2.305  to the left,  improve=5.361907, (0 missing)
      x53 < 0.97   to the left,  improve=5.206407, (0 missing)
      x45 < -52.5  to the right, improve=5.184824, (0 missing)

Node number 3: 30 observations
  predicted class=1  expected loss=0.03333333
    class counts:     1    29
   probabilities: 0.033 0.967

Node number 4: 687 observations
  predicted class=0  expected loss=0
    class counts:   687     0
   probabilities: 1.000 0.000

Node number 5: 100 observations,    complexity param=0.01041667
  predicted class=0  expected loss=0.03
    class counts:    97     3
   probabilities: 0.970 0.030
  left son=10 (83 obs) right son=11 (17 obs)
  Primary splits:
      x74 < 0.68   to the left,  improve=5.552229, (0 missing)
      x45 < -52.5  to the right, improve=4.861774, (0 missing)
      x58 < 1.73   to the left,  improve=4.560563, (0 missing)
      x60 < -39.25 to the right, improve=4.431733, (0 missing)
      x53 < 0.95   to the left,  improve=4.301092, (0 missing)

Node number 10: 83 observations
  predicted class=0  expected loss=0
    class counts:    83     0
   probabilities: 1.000 0.000

Node number 11: 17 observations,    complexity param=0.01041667
  predicted class=0  expected loss=0.1764706
    class counts:    14     3
   probabilities: 0.824 0.176
  left son=22 (12 obs) right son=23 (5 obs)
  Primary splits:
      x53 < 0.95   to the left,  improve=4.556929, (0 missing)
      x21 < -54.2  to the left,  improve=3.141631, (0 missing)
      x51 < 238.75 to the right, improve=3.141631, (0 missing)
      x52 < -0.085 to the left,  improve=3.141631, (0 missing)
      x55 < 35.25  to the right, improve=3.141631, (0 missing)

Node number 22: 12 observations
  predicted class=0  expected loss=0
    class counts:    12     0
   probabilities: 1.000 0.000

Node number 23: 5 observations
  predicted class=1  expected loss=0.4
    class counts:     2     3
   probabilities: 0.400 0.600

>
> summary(kdd.rpart.prune)
Call:
rpart(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 +
    x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 +
    x24 + x25 + x26 + x27 + x28 + x29 + x30 + x31 + x32 + x33 + x34 + x35 +
    x36 + x37 + x38 + x39 + x40 + x41 + x42 + x43 + x44 + x45 + x46 + x47 +
    x48 + x49 + x50 + x51 + x52 + x53 + x54 + x55 + x56 + x57 + x58 + x59 +
    x60 + x61 + x62 + x63 + x64 + x65 + x66 + x67 + x68 + x69 + x70 + x71 +
    x72 + x73 + x74, data = train, method = "class",
    parms = list(split = "information"), cp = 0, minsplit = 10,
    minbucket = 5, maxsurrogate = 0)
  n= 817

          CP nsplit rel error  xerror       xstd
1 0.87500000      0   1.00000 1.00000 0.17328015
2 0.01041667      1   0.12500 0.15625 0.06966297
3 0.00000000      4   0.09375 0.25000 0.08795454

Node number 1: 817 observations,    complexity param=0.875
  predicted class=0  expected loss=0.03916769
    class counts:   785    32
   probabilities: 0.961 0.039
  left son=2 (787 obs) right son=3 (30 obs)
  Primary splits:
      x53 < 2.765  to the left,  improve=110.95440, (0 missing)
      x55 < -17.75 to the right, improve=102.57640, (0 missing)
      x3  < 2.825  to the left,  improve= 92.58578, (0 missing)
      x58 < 3.165  to the left,  improve= 90.90784, (0 missing)
      x54 < 59     to the left,  improve= 88.02576, (0 missing)

Node number 2: 787 observations,    complexity param=0.01041667
  predicted class=0  expected loss=0.003811944
    class counts:   784     3
   probabilities: 0.996 0.004
  left son=4 (687 obs) right son=5 (100 obs)
  Primary splits:
      x29 < -2.445 to the right, improve=6.228906, (0 missing)
      x60 < -39.25 to the right, improve=5.649089, (0 missing)
      x58 < 2.305  to the left,  improve=5.361907, (0 missing)
      x53 < 0.97   to the left,  improve=5.206407, (0 missing)
      x45 < -52.5  to the right, improve=5.184824, (0 missing)

Node number 3: 30 observations
  predicted class=1  expected loss=0.03333333
    class counts:     1    29
   probabilities: 0.033 0.967

Node number 4: 687 observations
  predicted class=0  expected loss=0
    class counts:   687     0
   probabilities: 1.000 0.000

Node number 5: 100 observations,    complexity param=0.01041667
  predicted class=0  expected loss=0.03
    class counts:    97     3
   probabilities: 0.970 0.030
  left son=10 (83 obs) right son=11 (17 obs)
  Primary splits:
      x74 < 0.68   to the left,  improve=5.552229, (0 missing)
      x45 < -52.5  to the right, improve=4.861774, (0 missing)
      x58 < 1.73   to the left,  improve=4.560563, (0 missing)
      x60 < -39.25 to the right, improve=4.431733, (0 missing)
      x53 < 0.95   to the left,  improve=4.301092, (0 missing)

Node number 10: 83 observations
  predicted class=0  expected loss=0
    class counts:    83     0
   probabilities: 1.000 0.000
Node number 11: 17 observations,    complexity param=0.01041667
  predicted class=0  expected loss=0.1764706
    class counts:    14     3
   probabilities: 0.824 0.176
  left son=22 (12 obs) right son=23 (5 obs)
  Primary splits:
      x53 < 0.95   to the left,  improve=4.556929, (0 missing)
      x21 < -54.2  to the left,  improve=3.141631, (0 missing)
      x51 < 238.75 to the right, improve=3.141631, (0 missing)
      x52 < -0.085 to the left,  improve=3.141631, (0 missing)
      x55 < 35.25  to the right, improve=3.141631, (0 missing)

Node number 22: 12 observations
  predicted class=0  expected loss=0
    class counts:    12     0
   probabilities: 1.000 0.000

Node number 23: 5 observations
  predicted class=1  expected loss=0.4
    class counts:     2     3
   probabilities: 0.400 0.600

>
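## The "Conclusion" banner at the top was assembled by reading variable names
## off the summary() printouts above.  Two things can also be pulled straight
## from a fitted rpart object: the variables actually used in splits, and
## rpart's importance scores (here based on primary splits only, since
## maxsurrogate = 0).  The sketch below is illustrative and not part of the
## original run; split_vars() is a hypothetical helper.
split_vars <- function(fit) {
  ## rows of fit$frame are nodes; leaves are labelled "<leaf>"
  setdiff(as.character(fit$frame$var), "<leaf>")
}
split_vars(kdd.rpart)            # split variables of the unpruned tree
split_vars(kdd.rpart.prune)      # split variables of the pruned tree
kdd.rpart$variable.importance    # importance scores, if available in this rpart version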