##########################################################################
## May 31: Fei Yuan
## try the tree method on the whole training set of KDD data
## in order to find the most useful variables among the 74 explanatory variables
###########################################################################

###########################################################################
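## Results (pruned tree): 18 candidate variables, i.e. every variable that
## appears among the primary splits listed in the summary output below
## x5,x11,x28,x33,x35,x38,x40,x45,x50,x53,x55,x57,x58,x59,x60,x63,x68,x73
## Note: the pruned tree itself only splits on x38,x40,x53,x55,x63,x68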
###########################################################################

## rpart() and prune() come from the rpart package; load it explicitly so this
## script does not depend on whatever happens to be attached in the session.
library(rpart)

## Fit on the whole KDD training set (kdd.train.new is created elsewhere).
train <- kdd.train.new

## Response y against the predictors x1, x2, ..., x74. The formula is generated
## rather than typed out by hand so that a term cannot be dropped or duplicated
## by accident; it is identical to y ~ x1 + x2 + ... + x74.
kdd.formula <- reformulate(paste0("x", seq_len(74L)), response = "y")

## Grow a deliberately large classification tree:
##   - split = "information": use the information (entropy) splitting index
##   - cp = 0: no complexity penalty while growing, pruning is done afterwards
##   - minsplit = 10, minbucket = 5: minimum node sizes for attempting a split
##     and for a terminal node
##   - maxsurrogate = 0: do not compute surrogate splits
kdd.rpart <- rpart(
  formula = kdd.formula,
  data = train,
  method = "class",
  parms = list(split = "information"),
  cp = 0,
  minsplit = 10,
  minbucket = 5,
  maxsurrogate = 0
)

## Complexity parameter used to cut the full tree back; with this value the
## pruned tree has 7 splits (see the CP table in the summary output below).
kdd.prune.cp <- 0.00657895
kdd.rpart.prune <- prune(kdd.rpart, cp = kdd.prune.cp)

## NOTE: everything below this point is a console transcript of
## summary(kdd.rpart.prune) pasted in for reference. It is not valid R code,
## so run the statements above interactively instead of source()-ing the file.

> summary(kdd.rpart.prune)
Call:
rpart(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + 
    x10 + x11 + x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + 
    x20 + x21 + x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + 
    x30 + x31 + x32 + x33 + x34 + x35 + x36 + x37 + x38 + x39 + 
    x40 + x41 + x42 + x43 + x44 + x45 + x46 + x47 + x48 + x49 + 
    x50 + x51 + x52 + x53 + x54 + x55 + x56 + x57 + x58 + x59 + 
    x60 + x61 + x62 + x63 + x64 + x65 + x66 + x67 + x68 + x69 + 
    x70 + x71 + x72 + x73 + x74, data = train, method = "class", 
    parms = list(split = "information"), cp = 0, minsplit = 10, 
    minbucket = 5, maxsurrogate = 0)
  n= 145751 

           CP nsplit rel error    xerror       xstd
1 0.611882716      0 1.0000000 1.0000000 0.02765400
2 0.008873457      1 0.3881173 0.4004630 0.01754705
3 0.008487654      3 0.3703704 0.3858025 0.01722399
4 0.006578950      7 0.3364198 0.3873457 0.01725829

Node number 1: 145751 observations,    complexity param=0.6118827
  predicted class=0  expected loss=0.008891877
    class counts: 144455  1296
   probabilities: 0.991 0.009 
  left son=2 (144874 obs) right son=3 (877 obs)
  Primary splits:
      x53 < 4.445  to the left,  improve=4131.006, (0 missing)
      x55 < -40.25 to the right, improve=4000.869, (0 missing)
      x58 < 6.195  to the left,  improve=3896.459, (0 missing)
      x59 < 36.75  to the left,  improve=3775.973, (0 missing)
      x60 < -92.75 to the right, improve=3610.691, (0 missing)

Node number 2: 144874 observations,    complexity param=0.008487654
  predicted class=0  expected loss=0.003182075
    class counts: 144413   461
   probabilities: 0.997 0.003 
  left son=4 (137240 obs) right son=5 (7634 obs)
  Primary splits:
      x55 < 15.75  to the right, improve=349.8462, (0 missing)
      x53 < 1.505  to the left,  improve=341.6967, (0 missing)
      x63 < 2.005  to the left,  improve=338.4546, (0 missing)
      x35 < -15.45 to the right, improve=271.5882, (0 missing)
      x5  < -7.25  to the right, improve=239.4035, (0 missing)

Node number 3: 877 observations,    complexity param=0.008873457
  predicted class=1  expected loss=0.04789054
    class counts:    42   835
   probabilities: 0.048 0.952 
  left son=6 (57 obs) right son=7 (820 obs)
  Primary splits:
      x55 < -28.5  to the right, improve=80.25399, (0 missing)
      x60 < -67.25 to the right, improve=70.32789, (0 missing)
      x58 < 3.95   to the left,  improve=69.67247, (0 missing)
      x59 < 16.75  to the left,  improve=68.12516, (0 missing)
      x53 < 5.135  to the left,  improve=67.24266, (0 missing)

Node number 4: 137240 observations
  predicted class=0  expected loss=0.001741475
    class counts: 137001   239
   probabilities: 0.998 0.002 

Node number 5: 7634 observations,    complexity param=0.008487654
  predicted class=0  expected loss=0.02908043
    class counts:  7412   222
   probabilities: 0.971 0.029 
  left son=10 (5971 obs) right son=11 (1663 obs)
  Primary splits:
      x40 < -37.55 to the right, improve=183.9589, (0 missing)
      x45 < -59.5  to the right, improve=162.5006, (0 missing)
      x50 < -65.5  to the right, improve=155.7119, (0 missing)
      x68 < 3.105  to the left,  improve=152.1550, (0 missing)
      x63 < 2.735  to the left,  improve=147.9014, (0 missing)

Node number 6: 57 observations,    complexity param=0.008873457
  predicted class=0  expected loss=0.4210526
    class counts:    33    24
   probabilities: 0.579 0.421 
  left son=12 (43 obs) right son=13 (14 obs)
  Primary splits:
      x63 < 2.01   to the left,  improve=15.47488, (0 missing)
      x58 < 2.31   to the left,  improve=12.81102, (0 missing)
      x57 < 1.57   to the left,  improve=12.06921, (0 missing)
      x11 < 446.8  to the right, improve=11.40747, (0 missing)
      x33 < 4.145  to the left,  improve=10.74221, (0 missing)

Node number 7: 820 observations
  predicted class=1  expected loss=0.01097561
    class counts:     9   811
   probabilities: 0.011 0.989 

Node number 10: 5971 observations
  predicted class=0  expected loss=0.007033998
    class counts:  5929    42
   probabilities: 0.993 0.007 

Node number 11: 1663 observations,    complexity param=0.008487654
  predicted class=0  expected loss=0.1082381
    class counts:  1483   180
   probabilities: 0.892 0.108 
  left son=22 (1454 obs) right son=23 (209 obs)
  Primary splits:
      x38 < 6.185  to the left,  improve=84.87885, (0 missing)
      x68 < 3.105  to the left,  improve=83.34055, (0 missing)
      x33 < 2.635  to the left,  improve=82.31492, (0 missing)
      x63 < 2.48   to the left,  improve=82.20234, (0 missing)
      x28 < 1.685  to the left,  improve=71.58172, (0 missing)

Node number 12: 43 observations
  predicted class=0  expected loss=0.2325581
    class counts:    33    10
   probabilities: 0.767 0.233 

Node number 13: 14 observations
  predicted class=1  expected loss=0
    class counts:     0    14
   probabilities: 0.000 1.000 

Node number 22: 1454 observations
  predicted class=0  expected loss=0.06327373
    class counts:  1362    92
   probabilities: 0.937 0.063 

Node number 23: 209 observations,    complexity param=0.008487654
  predicted class=0  expected loss=0.4210526
    class counts:   121    88
   probabilities: 0.579 0.421 
  left son=46 (123 obs) right son=47 (86 obs)
  Primary splits:
      x68 < 2.915  to the left,  improve=35.18255, (0 missing)
      x63 < 1.515  to the left,  improve=27.50148, (0 missing)
      x28 < 2.975  to the left,  improve=21.78615, (0 missing)
      x38 < 9.245  to the left,  improve=18.27168, (0 missing)
      x73 < 0.355  to the left,  improve=17.94550, (0 missing)

Node number 46: 123 observations
  predicted class=0  expected loss=0.1869919
    class counts:   100    23
   probabilities: 0.813 0.187 

Node number 47: 86 observations
  predicted class=1  expected loss=0.244186
    class counts:    21    65
   probabilities: 0.244 0.756 

>