# catcode.R: The complete example code for the paper # "On the perils of categorizing responses" # The commented changes to the original code were made to allow the # reader to reproduce the analyses without excessive amounts of # printout. By uncommenting the appropriate lines, the excluded # printouts can be displayed. # This requires the R statistical language and the optional packages # coin, modeltools, mvtnorm, nortest, plotrix, prettyR, splines, # stats4 and survival # # categorize.R: categorize values in the range of “breaks” into # values in the range 1 to the number of breaks minus one. # categorization is left-closed. # this function returns the typical integer score rather than the # less easily interpreted factor returned by the “cut” function categorize<-function(x,breaks) { newx<-rep(NA,length(x)) for(bin in 1:(length(breaks)-1)) newx[x >= breaks[bin] & x < breaks[bin+1]] <- bin return(newx) } # categorize1.dat: N200 sample x1<-c( 6.133804,9.383674,4.708543,5.644544,7.369074, 5.350940,7.384859,4.830140,4.835787,5.511067, 5.630028,7.491248,4.724624,3.829655,4.801791, 4.167953,5.189111,3.701049,5.083101,3.575646, 5.494093,1.256340,6.093684,1.162320,3.135098, 7.191146,1.563400,2.902534,4.231852,5.340493, 4.944946,4.089819,5.455186,3.826344,7.611098, 3.253509,7.564146,3.947369,7.090654,6.202382, 3.808527,3.267778,4.646471,6.243717,8.578432, 6.325416,3.108514,5.812343,6.473344,2.029210, 6.342119,3.254678,8.451556,4.794868,4.496466, 3.604764,6.587344,7.319965,6.971645,4.985919, 5.487573,1.329454,2.543154,8.217647,5.328220, 8.607426,4.471738,3.449995,6.116041,3.828421, 4.419119,5.183006,5.287919,5.884128,7.592372, 5.151679,6.714666,4.105611,8.715379,6.367117, 5.362190,5.846408,4.633992,7.866846,2.513230, 3.862001,9.275949,4.348864,6.438965,4.832527, 6.785707,2.545047,5.905525,7.414918,6.007792, 5.513230,7.605423,7.272780,4.726525,5.158767) x2<-c(4.7834572,3.8721400,0.8751237,8.6570744,7.1574917, 10.1696081,4.3069968,6.5021780,8.3505078,5.4164757, 7.1785890,4.7116808, 9.1623200,6.6116362,8.5934319, 7.5485314,8.2112045,8.3204427,4.5606782,6.5048377, 6.2085066,6.2517490,5.2562843,3.3108844,5.1436923, 4.5167117,3.0061935,6.1108965,8.3721694,7.5826674, 5.0798278,4.7413580,8.2363941,3.0628398,4.3079318, 9.2948106,2.8734872,2.0612415,6.6867012,7.9309642, 7.7424061,4.4456480,8.3800068,5.1677959,9.2170538, 9.3984044,5.7383915,8.8174195,5.3375800,2.1426965, 0.5132300,6.1212886,4.1204040,4.5791155,7.2034374, 7.1575594,7.2083693,3.6905859,6.8406519,8.2499845, 4.1623200,4.7008660,6.7299867,2.3166443,4.7710031, 6.1286866,1.9612620,7.8706067,4.1528707,3.5179886, 4.1797922,7.6972420,6.5206963,5.7093282,7.3901761, 5.5821797,6.5853815,2.3103230,5.6795558,4.8922551, 7.6580343,4.6380437,4.7501116,4.4827674,7.1072821, 5.5761929,7.4821233,7.2438225,4.9916257,7.2807440, 5.5132300,5.4211121,2.2933941,3.6057153,5.9293527, 7.1715263,6.9389506,7.2270761,6.9058014,6.6297080) # source code for changing category boundaries require(prettyR) categorize<-function(x,breaks) { newx<-rep(NA,length(x)) for(bin in 1:(length(breaks)-1)) newx[x >= breaks[bin] & x < breaks[bin+1]] <- bin return(newx) } # get the adjusted normal data # not needed, source included above # source("categorize1.dat") require(nortest) print(sf.test(x1)) print(sf.test(x2)) tp2<-tp6<-tp9<-tw6<-tw9<-tp16<-rep(0,30) describe(cbind(x1+3,x2+3),xname="Normal test data") # CAT2 breaks for(i in 0:29) { breaks<-c(0,3,6,9,12,15)+i/10 newx1<-categorize(x1,breaks) newx2<-categorize(x2,breaks) # uncomment the following three lines to display all values # cat("breaks are",breaks,"\n") # cat("means are",mean(newx1,na.rm=TRUE),mean(newx2,na.rm=TRUE), # "difference is",mean(newx2,na.rm=TRUE)-mean(newx1,na.rm=TRUE),"\n") testout<-t.test(newx1,newx2) tp6[i+1]<-testout$p.value wout<-wilcox.test(newx1,newx2) tw6[i+1]<-wout$p.value # uncomment the following three lines to display all values # cat("t[",testout$parameter,"] = ",testout$statistic, # ", p = ",testout$p.value,"; W =",wout$statistic, # ", p = ",wout$p.value,"\n",sep="") } cat("minimum p(t) =",min(tp6)," maximum p(t) =",max(tp6), "\nminimum p(W) =",min(tw6)," maximum p(W) =",max(tw6),"\n\n") # output to the screen device, not a PNG file # add a pause so that the user can see the graphics par(ask=TRUE) # png("listings/cat2move.png") plot(seq(0,2.9,by=0.1),tp6,main="CAT2 + 0 to 2.9 by 0.1", ylim=range(c(tp6,tw6)),xlab="Added to breaks",ylab="p value",type="b") lines(seq(0,2.9,by=0.1),tw6,pch=2,lty=2,type="b") abline(h=0.05,lty=3) legend(0,0.25,c("t-test","Wilcoxon"),pch=1:2,lty=1:2) # dev.off() # CAT1 breaks for(i in 0:29) { breaks<-c(0,2,4,6,10,15)+i/10 newx1<-categorize(x1,breaks) newx2<-categorize(x2,breaks) # uncomment the following three lines to display all values # cat("breaks are",breaks,"\n") # cat("means are",mean(newx1,na.rm=TRUE),mean(newx2,na.rm=TRUE), # "difference is",mean(newx2,na.rm=TRUE)-mean(newx1,na.rm=TRUE),"\n") testout<-t.test(newx1,newx2) tp9[i+1]<-testout$p.value wout<-wilcox.test(newx1,newx2) tw9[i+1]<-wout$p.value # uncomment the following three lines to display all values # cat("t[",testout$parameter,"] = ",testout$statistic, # ", p = ",testout$p.value,"; W =",wout$statistic, # ", p = ",wout$p.value,"\n",sep="") } cat("minimum p(t) =",min(tp9)," maximum p(t) =",max(tp9), "\nminimum p(W) =",min(tw9)," maximum p(W) =",max(tw9),"\n\n") # png("listings/cat1move.png") plot(seq(0,2.9,by=0.1),tp9,main="CAT1 + 0-2.9 by 0.1", ylim=c(0,0.13),xlab="Added to breaks",ylab="p value",type="b") lines(seq(0,2.9,by=0.1),tw9,pch=2,lty=2,type="b") abline(h=0.05,lty=3) legend(1.5,0.132,c("t-test","Wilcoxon"),pch=1:2,lty=1:2) # dev.off() # remove the graphics pause par(ask=FALSE) # repeated measures simulation - takes some time to run set.seed(23456) dpd1<-sample(1:14,200,TRUE, prob=c(0.054,0.21,0.27,0.18,0.12,0.062,0.039,0.023,0.015,0.0077, 0.006,0.005,0.004,0.003)) dpdchngt<-sample(-3:2,100,TRUE,prob=c(0.15,0.2,0.3,0.2,0.1,0.05)) dpdchngc<-sample(-2:2,100,TRUE,prob=c(0.1,0.25,0.3,0.25,0.1)) dpd8<-dpd1+c(dpdchngt,dpdchngc) dpd8[dpd8<0]<-0 require(prettyR) require(plotrix) describe(dpd1[1:100],xname="Pre-T") describe(dpd1[101:200],xname="Pre-C") describe(dpd8[1:100],xname="Post-T") describe(dpd8[101:200],xname="Post-C") sex<-sample(c("M","F"),100,TRUE) dpd1cat1<-categorize(dpd1,c(0,2,4,6,10,15)) dpd8cat1<-categorize(dpd8,c(0,2,4,6,10,15)) dpd1cat2<-categorize(dpd1,c(0,3,6,9,12,15)) dpd8cat2<-categorize(dpd8,c(0,3,6,9,12,15)) R100raw<-data.frame(ID=c(1:200,1:200), ndrinks=c(dpd1,dpd8), sex=c(sex,sex), group=c(rep("T",100),rep("C",100),rep("T",100),rep("C",100)), occ=c(rep("pre",200),rep("post",200))) cat("\nRepeated measures ANOVA for raw scores\n") print(summary(aov(ndrinks~group*occ+Error(ID),R100raw))) R100cat1<-data.frame(ID=c(1:200,1:200),ndrinks=c(dpd1cat1,dpd8cat1), sex,group=c(rep("T",100),rep("C",100),rep("T",100),rep("C",100)), occ=c(rep("pre",200),rep("post",200))) cat("\nRepeated measures ANOVA for CAT1 scores\n") print(summary(aov(ndrinks~group*occ+Error(ID),R100cat1))) R100cat2<-data.frame(ID=c(1:100,1:100),ndrinks=c(dpd1cat2,dpd8cat2), sex,group=c(rep("T",50),rep("C",50),rep("T",50),rep("C",50)), occ=c(rep("pre",200),rep("post",200))) cat("\nRepeated measures ANOVA for CAT2 scores\n") print(summary(aov(ndrinks~group*occ+Error(ID),R100cat2))) # output to the screen device, not a PNG file # add a pause so that the user can view the graphics par(ask=TRUE) # png("dpdraw.png",width=600,height=600) par(mfrow=c(2,2)) barp(freq(dpd1[1:100])[[1]],main="Pre drinking - treatment", ylim=c(0,30),xlab="Drinks per day",ylab="Frequency") barp(freq(dpd1[101:200])[[1]],main="Pre drinking - control", ylim=c(0,30),xlab="Drinks per day",ylab="Frequency") barp(freq(dpd8[1:100])[[1]],main="Post drinking - treatment", ylim=c(0,30),xlab="Drinks per day",ylab="Frequency") barp(freq(dpd8[101:200])[[1]],main="Post drinking - control", ylim=c(0,30),xlab="Drinks per day",ylab="Frequency") # dev.off() # png("dpdcat1.png",width=600,height=600) par(mfrow=c(2,2)) hist(dpd1cat1[1:100],main="CAT1 Pre - treatment", breaks=seq(0,12,by=2)) hist(dpd1cat1[101:200],main="CAT1 Pre - control", breaks=seq(0,12,by=2)) hist(dpd8cat1[1:100],main="CAT1 Post - treatment", breaks=seq(0,12,by=2)) hist(dpd8cat1[101:200],main="CAT1 Post - control", breaks=seq(0,12,by=2)) # dev.off() # png("dpdcat2.png",width=600,height=600) par(mfrow=c(2,2)) hist(dpd1cat2[1:100],main="CAT2 Pre - treatment", breaks=seq(0,12,by=2)) hist(dpd1cat2[101:200],main="CAT2 Pre - control", breaks=seq(0,12,by=2)) hist(dpd8cat2[1:100],main="CAT2 Post - treatment", breaks=seq(0,12,by=2)) hist(dpd8cat2[101:200],main="CAT2 Post - control", breaks=seq(0,12,by=2)) # dev.off() ddpd1<-density(dpd1) ddpd8<-density(dpd8) # return to 1 plot per page par(mfrow=c(1,1)) # png("dpddist.png",width=600,height=600) plot(ddpd1$x,ddpd1$y,ylab="Density",xlab="Number of drinks", main="Distribution of drinking") points(ddpd8$x,ddpd8$y,col="red") legend(8,0.2,c("Pre","Post"),pch=1,col=c("black","red")) # dev.off() # remove the graphics pause par(ask=FALSE) praw<-pcat1<-pcat2<-rep(NA,1000) for(repsamp in 1:1000) { dpdchngt<-sample(-3:2,100,TRUE,prob=c(0.15,0.2,0.3,0.2,0.1,0.05)) dpdchngc<-sample(-2:2,100,TRUE,prob=c(0.1,0.25,0.3,0.25,0.1)) dpd8<-dpd1+c(dpdchngt,dpdchngc) dpd8[dpd8<0]<-0 dpd1cat1<-categorize(dpd1,c(0,2,4,6,10,15)) dpd8cat1<-categorize(dpd8,c(0,2,4,6,10,15)) dpd1cat2<-categorize(dpd1,c(0,3,6,9,12,15)) dpd8cat2<-categorize(dpd8,c(0,3,6,9,12,15)) R100raw<-data.frame(ID=c(1:200,1:200), ndrinks=c(dpd1,dpd8), sex=c(sex,sex), group=c(rep("T",100),rep("C",100),rep("T",100),rep("C",100)), occ=c(rep("pre",200),rep("post",200))) praw[repsamp]<- summary(aov(ndrinks~group*occ+Error(ID), R100raw))$'Error: Within'[[1]]$'Pr(>F)'[3] R100cat1<-data.frame(ID=c(1:200,1:200),ndrinks=c(dpd1cat1,dpd8cat1), sex,group=c(rep("T",100),rep("C",100),rep("T",100),rep("C",100)), occ=c(rep("pre",200),rep("post",200))) pcat1[repsamp]<- summary(aov(ndrinks~group*occ+Error(ID), R100cat1))$'Error: Within'[[1]]$'Pr(>F)'[3] R100cat2<-data.frame(ID=c(1:200,1:200),ndrinks=c(dpd1cat2,dpd8cat2), sex,group=c(rep("T",100),rep("C",100),rep("T",100),rep("C",100)), occ=c(rep("pre",200),rep("post",200))) pcat2[repsamp]<- summary(aov(ndrinks~group*occ+Error(ID), R100cat2))$'Error: Within'[[1]]$'Pr(>F)'[3] } nsigraw<-sum(praw <= 0.05) cat("\n",nsigraw,"raw score comparisons p < 0.05\n\n") nsigcat1<-sum(pcat1 <= 0.05) cat(nsigcat1,"CAT1 score comparisons p < 0.05\n\n") nsigcat2<-sum(pcat2 <= 0.05) cat(nsigcat2,"CAT2 score comparisons p < 0.05\n\n") cat("Raw vs CAT1\n") print(prop.test(c(nsigraw,nsigcat1),c(1000,1000))) cat("Raw vs CAT2\n") print(prop.test(c(nsigraw,nsigcat2),c(1000,1000))) cat("CAT1 vs CAT2\n") print(prop.test(c(nsigcat1,nsigcat2),c(1000,1000))) # rtriang.R: triangular distribution sample generator # lowerlim and upperlim are the zero density limits rtriang<-function(triangmode,lowerlim,upperlim,peakn) { ntriang<-rep(0,1+upperlim-lowerlim) ntriang[1:(triangmode-lowerlim)]<- peakn*(lowerlim:(triangmode-1)-lowerlim)/(triangmode-lowerlim) ntriang[1+triangmode:upperlim-lowerlim]<- peakn*(upperlim-triangmode:upperlim)/(upperlim-triangmode) return(rep(lowerlim:upperlim,round(ntriang))) } # triangular distribution example # not needed - source included above # source("rtriang.R") # source("categorize.R") cat("\nTriangular distribution example\n\n") require(coin) cat1<-c(0,2,4,6,10,15) cat2<-c(0,3,6,9,12,15) raw1<-rtriang(5,2,9,30) raw2<-rtriang(6,2,9,30) rawdf<-data.frame(ndrinks=c(raw1,raw2),group=rep(c(-1,1),each=105)) cat("Raw score analysis\n\n") print(anova(lm(ndrinks~group,rawdf))) print(wilcox.test(ndrinks~group,rawdf)) cat1df<-data.frame( ndrinks=c(categorize(raw1,cat1),categorize(raw2,cat1)), group=rep(c(-1,1),each=105)) cat("\nCAT1 analysis\n\n") print(anova(lm(ndrinks~group,cat1df))) print(wilcox.test(ndrinks~group,cat1df)) cat2df<-data.frame( ndrinks=c(categorize(raw1,cat2),categorize(raw2,cat2)), group=rep(c(-1,1),each=105)) cat("\nCAT2 analysis\n\n") print(anova(lm(ndrinks~group,cat2df))) print(wilcox.test(ndrinks~group,cat2df))