# brkdn is a function that attempts to calculate and display means, # variances and valid ns for the variable that appears on the left # side of the formula. # It expects the variables on the right side of the formula to be # factors, or at worst integers with a very small range. # It returns a list of "dstat" objects (or a list of lists of "dstat" # objects if there are two breakdown variables, and so on.) # A "dstat" object is a matrix that looks like: # vname1 vname2 ... # Mean mean1 mean2 ... # Variance var1 var2 ... # Valid n n1 n2 ... brkdn<-function(formula,dataframe,maxlevels=10) { if(!missing(dataframe) && !missing(formula)) { bn<-as.character(attr(terms(formula),"variables")[-1]) nbn<-length(bn) cat("\nBreakdown of",bn[1],"by",bn[nbn],"\n") if(nbn > 2) { # get the factor for this level by.factor<-as.factor(dataframe[[bn[nbn]]]) factor.levels<-levels(by.factor) nlevels<-length(factor.levels) if(nlevels > maxlevels) { nlevels<-maxlevels cat("Too many levels - only using first",maxlevels,"\n") } brkstats<-as.list(rep(0,nlevels)) names(brkstats)<-factor.levels # calculate the mean for this level for(i in 1:nlevels) { currentdata<-subset(dataframe,by.factor == factor.levels[i]) cat(paste("\n",bn[1],sep="",collapse=""),"for",bn[nbn],"- level",factor.levels[i],"\n\n") gstat<-desc.stat(currentdata[bn[1]],na.rm=TRUE) cat("Mean ",gstat[1],"\n") cat("Variance ",gstat[2],"\n") cat("n ",gstat[3],"\n") next.formula<-as.formula(paste(paste(bn[1],"~"),paste(bn[2:(nbn-1)],collapse="+"))) # and call yourself for the next level down brkstats[[i]]<-brkdn(next.formula,currentdata) } class(brkstats)<-"dstat" invisible(brkstats) } else { by.factor<-as.factor(dataframe[[bn[2]]]) factor.levels<-levels(by.factor) nlevels<-length(factor.levels) if(nlevels > maxlevels) { nlevels<-maxlevels cat("Too many levels - only using first",maxlevels,"\n") } gstats<-matrix(NA,ncol=nlevels,nrow=3) colnames(gstats)<-factor.levels[1:nlevels] rownames(gstats)<-c("Mean","Variance","n") # calculate the basic descriptive stats if(is.numeric(dataframe[[bn[1]]])) { for(i in 1:nlevels) { currentdata<-subset(dataframe[[bn[1]]],by.factor == factor.levels[i]) if(length(currentdata)) gstats[,i]<-desc.stat(currentdata,na.rm=TRUE) } class(gstats)<-"dstat" print(gstats) } invisible(gstats) } } else cat("Usage: brkdn(formula, dataframe, maxlevels=10)\n") } # desc.stat calculates the mean, variance and valid n for a numeric vector # needed by brkdn() desc.stat<-function(datavec,na.rm=TRUE) { dstat<-c(0,0,0) dstat[1]<-mean(datavec,na.rm=na.rm) dstat[2]<-var(datavec,na.rm=na.rm) dstat[3]<-sum(!is.na(datavec)) return(dstat) }