# =============================================
# August 9 2004
# Organize the kdd training data
# =============================================
#
# This file was originally a pasted interactive session. It is laid out here
# as a runnable script: commands are code, and the output recorded in the
# original session is preserved in comments next to the command that
# produced it.

#=======================
# Reorganize the data
# Separate response variable from the dataset;
#=======================
kdd.train <- read.table(
  "http://hajek.stat.ubc.ca/~ruben/data/bio_train.dat",
  header = FALSE
)

# Work on a copy so the raw table stays available as `kdd.train`.
kdd.train.new <- kdd.train
names(kdd.train.new) <- c("BlockID", "ExampleID", "y", paste0("x", 1:74))

# NOTE(review): attach() is kept only because code outside this section may
# rely on the attached columns; nothing below depends on it. The global `y`
# assigned on the next line masks the attached column of the same name.
attach(kdd.train.new)

# First three columns (BlockID, ExampleID, y) are written as the "act" file.
y <- kdd.train.new[, 1:3]
source("http://hajek.stat.ubc.ca/~fyuan/rcode/writemtx.R")
write.mtx(y, "/export/Grad/fyuan/public_html/kdd/kdd_act.txt")

# Remaining columns (x1 ... x74) are written as the training matrix.
x <- kdd.train.new[, -(1:3)]
source("http://hajek.stat.ubc.ca/~fyuan/rcode/QuickWriteMtx.R")
quick.write.mtx(x, "/export/Grad/fyuan/public_html/kdd/kdd_train.txt")

#========================================================================
# Generate two folds:
# Fold 1: blocks that were sampled during the kdd cup competition
# Fold 2: the remaining blocks that were not sampled in the kdd training
#         dataset
#========================================================================
source("http://hajek.stat.ubc.ca/~fyuan/rcode/readmtx.R")
sampleBlocks <- read.mtx(
  "http://hajek.stat.ubc.ca/~fyuan/kdd/kddSamplBlocks.mtx"
)
sampleBlocks <- sampleBlocks[, 1]

# generate index for training data: 1 iff training cases.
# A membership test replaces the per-block loop; every row whose BlockID is
# listed in sampleBlocks gets 1, all other rows get 2.
split.indx <- ifelse(y$BlockID %in% sampleBlocks, 1, 2)

#========================================================================
# Generate 153 folds: each fold is a block
# Fold k holds every row of the k-th smallest BlockID.
#========================================================================
blocks <- sort(unique(y$BlockID))
foldID <- seq_along(blocks)

# match() returns, for each row, the position of its BlockID in `blocks`,
# which is exactly the fold number the original loop assigned.
fold.indx <- match(y$BlockID, blocks)

# Sanity checks that were run by hand in the original session, now enforced:
# one fold per distinct block, and fold ids are exactly 1..length(blocks).
stopifnot(
  length(unique(fold.indx)) == length(unique(y$BlockID)),
  identical(sort(unique(fold.indx)), foldID)
)

# Recorded session output, in order of first appearance in the data:
# > unique(fold.indx)
#   [1] 143  14 133 147 107  22  70 109  25  67  51  26  38  50   3  61 116  94
#  [19]  17  72  39 108  52 100  86  31  81  32  64  34 104 122 125 149  56  11
#  [37]  63 129  23 139 123  44  49 137  85   1 113  59 144  80  91  43  37   7
#  [55]  87  54   2 131  65 128 127   4 152  71  24  42 121 148  77 141  36 103
#  [73] 102 146 140  98 111  66 112 115  21  15 110  35  45  18  96 130 142  13
#  [91]  89 136  28   9  84  90  19  68 117  47  62   5 119 105 134 138  20  88
# [109]  33  29  93  82  79  95 101 114 106   6 132  40  99  75  83  10  78  97
# [127] 120  73  41  16 145 153 118  12  76  74 151  69  55 135  48  92  53  57
# [145] 126   8 124  46  60  58 150  30  27
# > length(unique(fold.indx))==length(unique(y$BlockID))
# [1] TRUE
# > sort(unique(fold.indx))
# printed the integers 1 through 153 with no gaps.

# Combine both fold assignments, one row per training case.
folds <- cbind(split.indx, fold.indx)
colnames(folds) <- c("2Folds", "153Folds")

# Recorded session output:
# > folds[1:10,]
#       2Folds 153Folds
#  [1,]      2      143
#  [2,]      2      143
#  [3,]      2      143
#  [4,]      2      143
#  [5,]      2      143
#  [6,]      2      143
#  [7,]      2      143
#  [8,]      2      143
#  [9,]      2      143
# [10,]      2      143

write.mtx(folds, "/export/Grad/fyuan/public_html/kdd/kdd_fold.txt")