% New Report: Thu Sep 15 19:27:10 2005 % New Report: Mon Sep 12 17:03:31 2005 % New Report: Mon Sep 12 10:35:36 2005 % New Report: Sun Sep 11 20:09:28 2005 % New Report: Sun Sep 11 08:52:43 2005 % New Report: Tue Sep 6 08:02:12 2005 % New Report: Wed Aug 24 16:51:01 2005 \documentclass{article} \usepackage{a4,graphicx,mathpazo,courier,alltt,amssymb} % \usepackage{/usr/lib/R/library/relax/lib/noweb}\noweboptions{webnumbering,smallcode} \usepackage[scaled=.95]{helvet} \usepackage[T1]{fontenc} \parindent0mm \title{A rough R Impementation of the Bagplot} \author{File: \jobname.rev\\in: /home/wiwi/pwolf/R/work/bagplot} \begin{document} \maketitle @ \begin{center}\includegraphics[height=8cm]{p2005-Sep16-114503.ps}\end{center} \small \tableofcontents @ \section{Arguments of [[bagplot]]} <>= args(bagplot) @ \begin{verbatim} Mon Sep 12 14:31:14 2005 function (x, y, factor = 3, approx.limit = 300, show.outlier = TRUE, show.whiskers = TRUE, show.looppoints = TRUE, show.bagpoints = TRUE, show.loophull = TRUE, show.baghull = TRUE, create.plot = TRUE, add = FALSE, pch = 16, cex = 0.4, ..., verbose = FALSE, debug.plots = "") \end{verbatim} @ \section{The definition of bagplot} <>= bagplot<-function(x,y,factor=3,approx.limit=300, show.outlier=TRUE,show.whiskers=TRUE, show.looppoints=TRUE,show.bagpoints=TRUE, show.loophull=TRUE,show.baghull=TRUE, create.plot=TRUE,add=FALSE,pch=16,cex=.4,..., dkmethod=1,resolution.factor=1, verbose=FALSE,debug.plots=""){ <> } @ <>= #pw 050912 <> <> <> <> <> <> if(dkmethod==1){ <> }else{ <> } <> <> <> <> <> <> <> @ Output of [[bagplot]] is a list of essential components of the computation. To identify singular points, use [[identify()]]. <>= assign(".Random.seed",save.seed,env=.GlobalEnv) return(invisible(list( center=center,bag=hull.bag,loop=hull.loop, outlier=if(length(pkt.outlier)>0) pkt.outlier else NULL, hdepths=hdepth ))) @ Points with identical coordinates may result in numerical problem. Therefore, some noise is added to the data. <>= # define some functions <> <> <> <> <> <> <> <> # check input xydata<-if(missing(y)) x else cbind(x,y) if(is.data.frame(xydata)) xydata<-as.matrix(xydata) # select sample in case of large data set very.large.data.set<-nrow(xydata)>approx.limit if(!exists(".Random.seed")) set.seed(13) save.seed<-.Random.seed if(very.large.data.set){ ind<-sample(seq(nrow(xydata)),size=approx.limit) xy<-xydata[ind,] } else xy<-xydata n<-nrow(xy) points.in.bag<-floor(n/2) # if jittering is needed # the following two lines can be activated #xy<-xy+cbind(rnorm(n,0,.0001*sd(xy[,1])), # rnorm(n,0,.0001*sd(xy[,2]))) assign(".Random.seed",save.seed,env=.GlobalEnv) if(verbose) cat("end of initialization") @ after a lot of experiments the function [[atan2]] is found to do the job best <>= win<-function(dx,dy){ atan2(y=dy,x=dx) } @ [[out.of.polygon]] checks if the points of [[xy]] are within the polygon [[pg]] (return value TRUE) or not (return value FALSE). <>= out.of.polygon<-function(xy,pg){ if(nrow(pg)==1) return(pg) pgcenter<-apply(pg,2,mean) pg<-cbind(pg[,1]-pgcenter[1],pg[,2]-pgcenter[2]) xy<-cbind(xy[,1]-pgcenter[1],xy[,2]-pgcenter[2]) extr<-rep(FALSE,nrow(xy)) for(i in seq(nrow(xy))){ alpha<-sort((win(xy[i,1]-pg[,1],xy[i,2]-pg[,2]))%%(2*pi)) extr[i]<-pi>= cut.z.pg<-function(zx,zy,p1x,p1y,p2x,p2y){ a2<-(p2y-p1y)/(p2x-p1x); a1<-zy/zx sx<-(p1y-a2*p1x)/(a1-a2); sy<-a1*sx sxy<-cbind(sx,sy) h<-any(is.nan(sxy))||any(is.na(sxy))||any(Inf==abs(sxy)) if(h){ if(verbose) cat("special") # points on line defined by line segment h<-0==(a1-a2) & sign(zx)==sign(p1x) sx<-ifelse(h,p1x,sx); sy<-ifelse(h,p1y,sy) h<-0==(a1-a2) & sign(zx)!=sign(p1x) sx<-ifelse(h,p2x,sx); sy<-ifelse(h,p2y,sy) # line segment vertical # & center NOT ON line segment h<-p1x==p2x & zx!=p1x & p1x!=0 sx<-ifelse(h,p1x,sx); sy<-ifelse(h,zy*p1x/zx,sy) # & center ON line segment h<-p1x==p2x & zx!=p1x & p1x==0 sx<-ifelse(h,p1x,sx); sy<-ifelse(h,0,sy) # & center ON line segment & point on line h<-p1x==p2x & zx==p1x & p1x==0 & sign(zy)==sign(p1y) sx<-ifelse(h,p1x,sx); sy<-ifelse(h,p1y,sy) h<-p1x==p2x & zx==p1x & p1x==0 & sign(zy)!=sign(p1y) sx<-ifelse(h,p1x,sx); sy<-ifelse(h,p2y,sy) # points identical to end points of line segment h<-zx==p1x & zy==p1y; sx<-ifelse(h,p1x,sx); sy<-ifelse(h,p1y,sy) h<-zx==p2x & zy==p2y; sx<-ifelse(h,p2x,sx); sy<-ifelse(h,p2y,sy) # point of z is center h<-zx==0 & zy==0; sx<-ifelse(h,0,sx); sy<-ifelse(h,0,sy) sxy<-cbind(sx,sy) } # end of special cases #if(verbose){ print(rbind(a1,a2));print(cbind(zx,zy,p1x,p1y,p2x,p2y,sxy))} if(debug.plots=="all"){ segments(sxy[,1],sxy[,2],zx,zy,col="red") segments(0,0,sxy[,1],sxy[,2],type="l",col="green",lty=2) points(sxy,col="red") } return(sxy) } @ [[find.cut.z.pg]] finds the cut points of the lines defined by [[z]] and center [[center]] and polygon [[pg]]. <>= find.cut.z.pg<-function(z,pg,center=c(0,0),debug.plots="no"){ if(!is.matrix(z)) z<-rbind(z) if(1==nrow(pg)) return(matrix(center,nrow(z),2,TRUE)) n.pg<-nrow(pg); n.z<-nrow(z) # center z and pg z<-cbind(z[,1]-center[1],z[,2]-center[2]) pgo<-pg; pg<-cbind(pg[,1]-center[1],pg[,2]-center[2]) if(debug.plots=="all"){plot(rbind(z,pg,0),bty="n"); points(z,pch="p") lines(c(pg[,1],pg[1,1]),c(pg[,2],pg[1,2]))} # find angles of pg und z apg<-win(pg[,1],pg[,2]) apg[is.nan(apg)]<-0; a<-order(apg); apg<-apg[a]; pg<-pg[a,] az<-win(z[,1],z[,2]) # find line segments segm.no<-apply((outer(apg,az,"<")),2,sum) segm.no<-ifelse(segm.no==0,n.pg,segm.no) next.no<-1+(segm.no %% length(apg)) # compute cut points cuts<-cut.z.pg(z[,1],z[,2],pg[segm.no,1],pg[segm.no,2], pg[next.no,1],pg[next.no,2]) # rescale cuts<-cbind(cuts[,1]+center[1],cuts[,2]+center[2]) return(cuts) } @ [[hdepth.of.points]] computes the hdepths of test points [[tp]]. <>= hdepth.of.points<-function(tp,n){ n.tp<-nrow(tp) tphdepth<-rep(0,n.tp); dpi<-2*pi-0.000001 minusplus<-c(rep(-1,n),rep(1,n)) for(j in 1:n.tp) { dx<-tp[j,1]-xy[,1]; dy<-tp[j,2]-xy[,2] a<-win(dx,dy)+pi; a<-a[a<10]; init<-sum(a < pi) a.shift<-(a+pi) %% dpi h<-cumsum(minusplus[order(c(a,a.shift))]) tphdepth[j]<-init+min(h)+1 } tphdepth } @ [[expand.hull]] expands polygon [[pk]] without changing the depth of its points. [[k]] is the depth and [[resolution]] the number of points to be checked during expandation. <>= expand.hull<-function(pg,k){ <> <> <> <> } @ At first we search the cut points of the hull of the data set with the lines beginning in the center and running through the points of [[pg]]. Then test points on the segments defined by these cut points and the points of [[pg]] will be generated by using a vector [[lam]]. <>= resolution<-20*resolution.factor pg0<-xy[hdepth==1,] pg0<-pg0[chull(pg0[,1],pg0[,2]),] end.points<-find.cut.z.pg(pg,pg0,center=center,debug.plots=debug.plots) lam<-((0:resolution)^1)/resolution^1 @ The test is performed in two stages. In the interval form start point to end point [[resolution]] test points are tested concerning their h-depth. The critical interval is divided again to find a better limit. <>= pg.new<-pg for(i in 1:nrow(pg)){ tp<-cbind(pg[i,1]+lam*(end.points[i,1]-pg[i,1]), pg[i,2]+lam*(end.points[i,2]-pg[i,2])) hd.tp<-hdepth.of.points(tp,nrow(xy)) ind<-max(sum(hd.tp>=k),1) if(indk && tp<-cbind(tp[ind,1]+lam*(tp[ind+1,1]-tp[ind,1]), tp[ind,2]+lam*(tp[ind+1,2]-tp[ind,2])) hd.tp<-hdepth.of.points(tp,nrow(xy)) ind<-max(sum(hd.tp>=k),1) } pg.new[i,]<-tp[ind,] } pg.new<-pg.new[chull(pg.new[,1],pg.new[,2]),] # cat("depth pg.new", hdepth.of.points(pg.new,n)) @ Between the spurts we interpolated additional directions and find additional limits by the same procedure. <>= pg.add<-0.5*(pg.new+rbind(pg.new[-1,],pg.new[1,])) end.points<-find.cut.z.pg(pg,pg0,center=center) for(i in 1:nrow(pg.add)){ tp<-cbind(pg.add[i,1]+lam*(end.points[i,1]-pg.add[i,1]), pg.add[i,2]+lam*(end.points[i,2]-pg.add[i,2])) hd.tp<-hdepth.of.points(tp,nrow(xy)) ind<-max(sum(hd.tp>=k),1) if(indk && tp<-cbind(tp[ind,1]+lam*(tp[ind+1,1]-tp[ind,1]), tp[ind,2]+lam*(tp[ind+1,2]-tp[ind,2])) hd.tp<-hdepth.of.points(tp,nrow(xy)) ind<-max(sum(hd.tp>=k),1) } pg.add[i,]<-tp[ind,] } # cat("depth pg.add", hdepth.of.points(pg.add,n)) @ Finally the hull of the limits is computed and our numerical solution of hull($d_k$). [[pg.new]] is the output of [[expand.hull]]. <>= pg.new<-rbind(pg.new,pg.add) pg.new<-pg.new[chull(pg.new[,1],pg.new[,2]),] @ [[cut.p.sl.p.sl]] finds the cut of two lines. Both of them are described by a point and its slope. Remember: \[ y=y_1+m_1(x-x_1) \] <>= cut.p.sl.p.sl<-function(xy1,m1,xy2,m2){ sx<-(xy2[2]-m2*xy2[1]-xy1[2]+m1*xy1[1])/(m1-m2) sy<-xy1[2]-m1*xy1[1]+m1*sx if(!is.nan(sy)) return( c(sx,sy) ) if(abs(m1)==Inf) return( c(xy1[1],xy2[2]+m2*(xy1[1]-xy2[1])) ) if(abs(m2)==Inf) return( c(xy2[1],xy1[2]+m1*(xy2[1]-xy1[1])) ) } @ [[pos.to.pg]] finds the position of points [[z]] relative to a polygon [[pg]] If a point is below the polygon [["lower"]] is returned otherwise [["upper"]]. <>= pos.to.pg<-function(z,pg,reverse=FALSE){ if(reverse){ int.no<-apply(outer(pg[,1],z[,1],">="),2,sum) zy.on.pg<-pg[int.no,2]+pg[int.no,3]*(z[,1]-pg[int.no,1]) }else{ int.no<-apply(outer(pg[,1],z[,1],"<="),2,sum) zy.on.pg<-pg[int.no,2]+pg[int.no,3]*(z[,1]-pg[int.no,1]) } ifelse(z[,2]>= prdata<-prcomp(xydata) is.one.dim<-(min(prdata[[1]])/max(prdata[[1]]))<0.0001 if(is.one.dim){ if(verbose) cat("data set one dimensional") prdata<-prdata[[2]]; trdata<-xydata%*%prdata; ytr<-mean(trdata[,2]) boxplotres<-boxplot(trdata[,1],plot=FALSE) dy<-0.1*diff(range(stats<-boxplotres$stats)) dy<-0.05*mean(c(diff(range(xydata[,1])), diff(range(xydata[,2])))) segtr<-rbind(cbind(stats[2:4],ytr-dy,stats[2:4],ytr+dy), cbind(stats[c(2,2)],ytr+c(dy,-dy), stats[c(4,4)],ytr+c(dy,-dy)), cbind(stats[c(2,4)],ytr,stats[c(1,5)],ytr)) segm<-cbind(segtr[,1:2]%*%t(prdata), segtr[,3:4]%*%t(prdata)) if(!add) plot(xydata,type="n",bty="n",pch=16,cex=.2,...) extr<-c(min(segm[6,3],segm[7,3]),max(segm[6,3],segm[7,3])) extr<-extr+c(-1,1)*0.000001*diff(extr) xydata<-xydata[xydata[,1]extr[2],,drop=FALSE] if(0>= dx<-(outer(xy[,1],xy[,1],"-")) dy<-(outer(xy[,2],xy[,2],"-")) alpha<-atan2(y=dy,x=dx); diag(alpha)<-1200 for(j in 1:n) alpha[,j]<-sort(alpha[,j]) alpha<-alpha[-n,] ; m<-n-1 ## quick look inside, just for check if(debug.plots=="all"){ plot(xy,bty="n"); xdelta<-abs(diff(range(xy[,1]))); dx<-xdelta*.3 for(j in 1:n) { p<-xy[j,]; dy<-dx*tan(alpha[,j]) segments(p[1]-dx,p[2]-dy,p[1]+dx,p[2]+dy,col=j) text(p[1]-xdelta*.02,p[2],j,col=j) } } if(verbose) print("end of computation of angles") @ We compute the h-depths in $o(n^2 \log(n))$. The [[NaN]] angles are extracted because they indicate points with identical coordinates. For every point we find the hdeep by the following algorithm: At first we count the number of angles of the actual point within interval $[0, \pi)$. This is equivalent to the number of points above the actual point. Then we rotate the $y=0$-line counterclockwise and increment the initial counter if an additional point emerges and we decrement the counter if a point / angle leaves the halve plain. @ The median is defined as the gravity center of all points with maximal hdeep. <>= hdepth<-rep(0,n); dpi<-2*pi-0.000001 minusplus<-c(rep(-1,m),rep(1,m)) for(j in 1:n) { a<-alpha[,j]+pi; a<-a[a<10]; init<-sum(a < pi) a.shift<-(a+pi) %% dpi h<-cumsum(minusplus[order(c(a,a.shift))]) hdepth[j]<-init+min(h)+1 } if(verbose){print("end of computation of hdepth:"); print(hdepth)} ## quick look inside, just for a check if(debug.plots=="all"){ plot(xy,bty="n") xdelta<-abs(diff(range(xy[,1]))); dx<-xdelta*.1 for(j in 1:n) { a<-alpha[,j]+pi; a<-a[a<10]; init<-sum(a < pi) a.shift<-(a+pi) %% dpi h<-cumsum(minusplus[ao<-(order(c(a,a.shift)))]) no<-which((init+min(h)) == (init+h))[1] p<-xy[j,]; dy<-dx*tan(alpha[,j]) segments(p[1]-dx,p[2]-dy,p[1]+dx,p[2]+dy,col=j,lty=3) dy<-dx*tan(c(sort(a),sort(a))[no]) segments(p[1]-5*dx,p[2]-5*dy,p[1]+5*dx,p[2]+5*dy,col="black") text(p[1]-xdelta*.02,p[2],hdepth[j],col=1,cex=2.5) } } @ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% We compute the depth $k$ with \#$D_k \le $ [[points.in.bag]] $<$ \#$D_{k-1}$ <>= hd.table<-table(sort(hdepth)) d.k<-cbind(dk=rev(cumsum(rev(hd.table))), k =as.numeric(names(hd.table))) k.1<-sum(points.in.bag1){ k<-d.k[k.1+1,2] } else { k<-d.k[k.1,2] } if(verbose){cat("counts of members of dk:"); print(hd.table)} if(verbose){cat("end of computation of k, k=",k)} @ The two dimensional median is the center of gravity of the points (not data points) with maximal h-depths. We extract some data points with maximal depths and define [[tp]] as random linear combinations of them. Then we compute their h-depths. <>= center<-apply(xy[which(hdepth==max(hdepth)),,drop=FALSE],2,mean) if(155){ n.p<-100*resolution.factor cands<-xy[rev(order(hdepth))[1:7],] cands<-cands[chull(cands[,1],cands[,2]),]; n.c<-nrow(cands) set.seed(13); lam<-matrix(runif(n.c*n.p),n.p,n.c) lam<-lam/matrix(apply(lam,1,sum),n.p,n.c,F) tp<-cbind( lam%*%cands[,1],lam%*%cands[,2]) tphdepth<-hdepth.of.points(tp,n) center.region<-tp[which(tphdepth==max(tphdepth)),,drop=FALSE] center<-apply(center.region,2,mean) center.region<-center.region[chull(center.region[,1],center.region[,2]),] if(verbose){cat("center.region",center.region); print(table(tphdepth)) } } if(verbose){print("end of computation of center"); print(center)} @ We compute the convex hull of $D_k$: polygon [[pdk]] and the hull of $D_{k-1}$: polygon [[pdk.1]]. % For simplification both of them are centered in (0,0). % [[ai]] will be assigned the angles of points of the inner polygon: % angle( point, $(0,0)$, $(1,0)$ ). % [[ao]] is assigned the angles of points of the outer polygon. %To get both polygons counter clockwise they are sorted %twice. (This is a little bit time consuming but not critical.) [[pdk]] represents inner polygon and [[pdk.1]] outer one. Then polygon [[pdk]] and [[pdk.1]] are enlarged without changing its h-depth: [[exp.dk]], [[exp.dk.1]]- <>= # inner hull of bag xyi<-xy[hdepth>=k,,drop=FALSE] pdk<-xyi[chull(xyi[,1],xyi[,2]),,drop=FALSE] # outer hull of bag xyo<-xy[hdepth>=(k-1),,drop=FALSE] pdk.1<-xyo[chull(xyo[,1],xyo[,2]),,drop=FALSE] if(verbose)cat("hull computed:") #; if(verbose){print(pdk); print(pdk.1) } if(debug.plots=="all"){ plot(xy,bty="n") h<-rbind(pdk,pdk[1,]); lines(h,col="red",lty=2) h<-rbind(pdk.1,pdk.1[1,]);lines(h,col="blue",lty=3) points(center[1],center[2],pch=8,col="red") } exp.dk<-expand.hull(pdk,k) exp.dk.1<-expand.hull(exp.dk,k-1) # pdk.1,k-1,20) @ The new approach to find the hull works as follows: For a given $k$ we move lines with different slopes from outside of the cloud to the center and stop if $k$ points are crossed. To keep things simple we rotate the data points so that we have only move a vertical line. \begin{enumerate} \item define directions / angles for hdepth search \item standardize data set to get appropiate directions \item computation of $D_k$ polygon and restandardization \item computation of $D_{k-1}$ polygon and restandardization \end{enumerate} <>= # define direction for hdepth search num<-c(351,171,85)[c(n>200,n<200,n<50)][1]*resolution.factor num.h<-floor(num/2); angles<-seq(0,pi,length=num.h) ang<-tan(pi/2-angles) # standardization of data set xym<-apply(xy,2,mean); xysd<-apply(xy,2,sd) xyxy<-cbind((xy[,1]-xym[1])/xysd[1],(xy[,2]-xym[2])/xysd[2]) kkk<-k <> exp.dk<-cbind(pg[,1]*xysd[1]+xym[1],pg[,2]*xysd[2]+xym[2]) if(kkk>1) kkk<-kkk-1 <> exp.dk.1<-cbind(pg[,1]*xysd[1]+xym[1],pg[,2]*xysd[2]+xym[2]) @ The polygon for h-depth $kkk$ is constructed in a loop. In each step we consider one direction / angle. <>= <> for(ia in seq(angles)[-1]){ <> } <> @ At first we search the limiting points for every direction by rotating the data set and then we determine the quantiles $x_{k/n}$ and $x_{(n+1-k)/n}$. With this points we construct a upper [[pg]] and a lower polygon [[pgl]] that limits the hull we are looking for. To update a polygon we have to find the line segments of the polygon that are cut by the lines of slope [[a]] through the limiting points as well as the cut points. <>= # determine critical points pnew and pnewl of direction a ### cat("ia",ia) a<-angles[ia]; angtan<-ang[ia]; xyt<-xyxy%*%c(cos(a),-sin(a)); xyto<-order(xyt) ind.k <-xyto[kkk]; ind.kk<-xyto[n+1-kkk]; pnew<-xyxy[ind.k,]; pnewl<-xyxy[ind.kk,] if(debug.plots=="all") points(pnew[1],pnew[2],col="red") # new limiting lines are defined by pnew / pnewl and slope a # find segment of polygon that is cut by new limiting line and cut if(abs(angtan)>1e10){ ### cat("y=c case") pg.no<-sum(pg[,1]=pnewl[1]) cutpl<-c(pnewl[1],pgl[pg.nol,2]+pgl[pg.nol,3]*(pnewl[1]-pgl[pg.nol,1])) }else{ ### cat("normal case") pg.inter<-pg[,2]-angtan*pg[,1]; pnew.inter<-pnew[2]-angtan*pnew[1] pg.no<-sum(pg.interpnew.interl) cutpl<-cut.p.sl.p.sl(pnewl,angtan,pgl[pg.nol,1:2],pgl[pg.nol,3]) } # update pg, pgl pg<-rbind(pg[1:pg.no,],c(cutp,angtan),c(cutp[1]+dxy, cutp[2]+angtan*dxy,NA)) pgl<-rbind(pgl[1:pg.nol,],c(cutpl,angtan),c(cutpl[1]-dxy, cutpl[2]-angtan*dxy,NA)) <> @ To initialize the loop we construct the first polygons (upper one: [[pg]], lower one: [[pgl]]) by vertical lines. [[dxdy]] is a step that is larger than the range of the standardized data set. <>= ia<-1; a<-angles[ia]; xyt<-xyxy%*%c(cos(a),-sin(a)); xyto<-order(xyt) # initial for upper part ind.k <-xyto[kkk]; cutp<-c(xyxy[ind.k,1],-10) dxy<-diff(range(xyxy)) pg<-rbind(c(cutp[1],-dxy,Inf),c(cutp[1],dxy,NA)) # initial for lower part ind.kk<-xyto[n+1-kkk]; cutpl<-c(xyxy[ind.kk,1],10) pgl<-rbind(c(cutpl[1],dxy,Inf),c(cutpl[1],-dxy,NA)) <> @ The combination of the is a little bit complicated because sometimes at the right and at left margin an additional cut point has to be computed and integrated. [[l]] in front of a variable name indicates the left margin whereas the right one is coded by [[r]]. Letter [[l]] ([[u]]) at the end of a name is short for lower and upper. <>= pg<-pg[-nrow(pg),][-1,,drop=F]; pgl<-pgl[-nrow(pgl),][-1,,drop=FALSE] indl<-pos.to.pg(pgl,pg); indu<-pos.to.pg(pg,pgl,TRUE) npg<-nrow(pg); npgl<-nrow(pgl) rnuml<-rnumu<-lnuml<-lnumu<-0; sl<-pg[1,1:2]; sr<-pgl[1,1:2] # right region if(indl[1]=="higher"&indu[npg]=="lower"){ rnuml<-which(indl=="lower")[1]-1; xyl<-pgl[rnuml,] # rnumu<-which(rev(indu=="higher"))[1]; xyu<-pg[npg+1-rnumu,] # sr<-cut.p.sl.p.sl(xyl[1:2],xyl[3],xyu[1:2],xyu[3]) } # left region if(indl[npgl]=="higher"&indu[1]=="lower"){ lnuml<-which(rev(indl=="lower"))[1]; xyl<-pgl[npgl+1-lnuml,] # lnumu<-which(indu=="higher")[1]-1; xyu<-pg[lnumu,] #? sl<-cut.p.sl.p.sl(xyl[1:2],xyl[3],xyu[1:2],xyu[3]) } pgl<-pgl[(rnuml+1):(npgl-lnuml),1:2,drop=FALSE] pg <-pg [(lnumu+1):(npg -rnumu),1:2,drop=FALSE] pg<-rbind(pg,sr,pgl,sl) pg<-pg[chull(pg[,1],pg[,2]),] if(debug.plots=="all") lines(rbind(pg,pg[1,]),col="red") @ <>= ######################################### #### cat("angtan",angtan,"pg.no",pg.no,"pkt:",pnew) # if(ia==stopp) lines(pg,type="b",col="green") if(debug.plots=="all"){ points(pnew[1],pnew[2],col="red") hx<-xyxy[ind.k,c(1,1)]; hy<-xyxy[ind.k,c(2,2)] segments(hx,hy,c(10,-10),hy+ang[ia]*(c(10,-10)-hx),lty=2) # text(hx+rnorm(1,,.1),hy+rnorm(1,,.1),ia) #print(pg) # if(ia==stopp) lines(pgl,type="b",col="green") points(cutpl[1],cutpl[2],col="red") hx<-xyxy[ind.kk,c(1,1)]; hy<-xyxy[ind.kk,c(2,2)] segments(hx,hy,c(10,-10),hy+ang[ia]*(c(10,-10)-hx),lty=2) # text(hx+rnorm(1,,.1),hy+rnorm(1,,.1),ia) #print(pgl) } @ <>= if(debug.plots=="all"){ plot(xyxy,type="p",bty="n") # text(xy,,1:n,col="blue") # hx<-xy[ind.k,c(1,1)]; hy<-xy[ind.k,c(2,2)] # segments(hx,hy,c(10,-10),hy+ang[ia]*(c(10,-10)-hx),lty=2) # text(hx+rnorm(1,,.1),hy+rnorm(1,,.1),ia) } @ On the way finding the bag the function [[expand.hull]] computes not an exact solution but a numerical approximation. [[k.1]] indicates the polygon with h-depth $k-1$. [[k.1+1]] will usually points to h-depth $k$, to the inner polygon. In computing $\lambda$ we follow Miller et al. (1999). They define $\lambda$ as the relative distance from the bag to the inner contour and they compute it \emph{by $\lambda=(50-J)/(L-J)$, where $D_k$ contains $J$\% of the original points and $D_{k-1}$ contains $L$\% of the original points:} \[ \lambda=\frac{50-J}{L-J}=\frac{n/2-\#D_k}{\#D_{k-1}-\#D_k}= \frac{\mbox{number in bag}-\mbox{number in inner contour}} {\mbox{number in outer contour}-\mbox{number in inner contour}} \] $\lambda==0$ happens if bag and inner contour is identical. <>= lambda<-if(nrow(d.k)==1)0.5 else (n/2-d.k[k.1+1,1])/(d.k[k.1,1]-d.k[k.1+1,1]) if(verbose) cat("lambda",lambda) @ $\lambda==0$ happens if bag and inner contour is identical. The bag is constructed by lambda * outer polygon + (1-lambda)* inner polygon <>= cut.on.pdk.1<-find.cut.z.pg(exp.dk,exp.dk.1,center=center) cut.on.pdk <-find.cut.z.pg(exp.dk.1,exp.dk,center=center) # expand inner polgon h1<-(1-lambda)*exp.dk+lambda*cut.on.pdk.1 # shrink outer polygon h2<-(1-lambda)*cut.on.pdk+lambda*exp.dk.1 h<-rbind(h1,h2); hull.bag<-h[chull(h[,1],h[,2]),] if(verbose)cat("bag completed:") #if(verbose)print(hull.bag) if(debug.plots=="all"){ lines(hull.bag,col="red") } @ The loop is found by expand [[hull.bag]] by factor [[factor]]. <>= hull.loop<-cbind(hull.bag[,1]-center[1],hull.bag[,2]-center[2]) hull.loop<-factor*hull.loop hull.loop<-cbind(hull.loop[,1]+center[1],hull.loop[,2]+center[2]) if(verbose) cat("loop computed") @ Now we look for the loop ... <>= if(!very.large.data.set){ pkt.bag <-xydata[hdepth>= k ,,drop=FALSE] pkt.cand <-xydata[hdepth==(k-1),,drop=FALSE] pkt.not.bag<-xydata[hdepth< (k-1),,drop=FALSE] if(length(pkt.cand)>0){ outside<-out.of.polygon(pkt.cand,hull.bag) if(sum(!outside)>0) pkt.bag <-rbind(pkt.bag, pkt.cand[!outside,]) if(sum( outside)>0) pkt.not.bag<-rbind(pkt.not.bag, pkt.cand[ outside,]) } }else { extr<-out.of.polygon(xydata,hull.bag) pkt.bag <-xydata[!extr,] pkt.not.bag<-xydata[extr,,drop=FALSE] } if(length(pkt.not.bag)>0){ extr<-out.of.polygon(pkt.not.bag,hull.loop) pkt.outlier<-pkt.not.bag[extr,,drop=FALSE] if(0==length(pkt.outlier)) pkt.outlier<-NULL pkt.outer<-pkt.not.bag[!extr,,drop=FALSE] }else{ pkt.outer<-pkt.outlier<-NULL } if(verbose) cat("points of bag, outer points and outlier identified") @ and compute the hull of the loop points. <>= hull.loop<-rbind(pkt.outer,hull.bag) hull.loop<-hull.loop[chull(hull.loop[,1],hull.loop[,2]),] if(verbose) cat("end of computation of loop") @ Finally the plot is constructed. <>= if(create.plot){ if(!add) plot(xydata,type="n",pch=pch,cex=cex,bty="n",...) if(verbose) text(xy[,1],xy[,2],paste(as.character(hdepth)),cex=2) # loop: ------------------------------------------------------ if(show.loophull){ # fill loop h<-rbind(hull.loop,hull.loop[1,]); lines(h[,1],h[,2],lty=1) polygon(hull.loop[,1],hull.loop[,2],col="#aaccff") } if(show.looppoints && length(pkt.outer)>0){ # points in loop points(pkt.outer[,1],pkt.outer[,2],col="#3355ff",pch=pch,cex=cex) } # bag: ------------------------------------------------------- if(show.baghull){ # fill bag h<-rbind(hull.bag,hull.bag[1,]); lines(h[,1],h[,2],lty=1) polygon(hull.bag[,1],hull.bag[,2],col="#7799ff") } if(show.bagpoints && length(pkt.bag)>0){ # points in bag points(pkt.bag[,1],pkt.bag[,2],col="#000088",pch=pch,cex=cex) } # whiskers if(show.whiskers && length(pkt.outer)>0){ debug.plots<-"not" pkt.cut<-find.cut.z.pg(pkt.outer,hull.bag,center=center) segments(pkt.outer[,1],pkt.outer[,2],pkt.cut[,1],pkt.cut[,2],col="red") } # outlier: -------------------------------------------------- if(show.outlier && length(pkt.outlier)>0){ # points in loop points(pkt.outlier[,1],pkt.outlier[,2],col="red",pch=pch,cex=cex) } # center: if(exists("center.region")&&length(center.region)>2){ h<-rbind(center.region,center.region[1,]); lines(h[,1],h[,2],lty=1) polygon(center.region[,1],center.region[,2],col="orange") } points(center[1],center[2],pch=8,col="red") } if(verbose){ h<-rbind(exp.dk,exp.dk[1,]); lines(h,col="blue",lty=2) h<-rbind(exp.dk.1,exp.dk.1[1,]); lines(h,col="black",lty=2) } if(verbose&&exists("tp"))text(tp[,1],tp[,2],as.character(tphdepth),col="green") if(verbose) text(xy[,1],xy[,2],paste(as.character(hdepth)),cex=2) @ In case of problems some additional plottings may be helpful. <>= # cat("tp");print(tp) # points(exp.dk[,1],exp.dk[,2],type="b",col="red") # points(exp.dk[,1],exp.dk[,2],type="b",col="green") # points(exp.dk.1[,1],exp.dk.1[,2],type="b",col="blue") @ \newpage \section{TEST} \subsection{Example: car data (Chambers / Hastie 1992)} <>= <> library(rpart); cardata<-car.test.frame[,6:7]; par(mfrow=c(1,1)) bagplot(cardata,verbose=F,factor=3,show.baghull=T,dkmethod=2, show.loophull=T) title("car data Chambers/Hastie 1992") @ \begin{center}\includegraphics[height=13cm]{p2005-Sep16-114503.ps}\end{center} %

@ \newpage \subsection{The normal case} A bagplot of an rnorm sample plus one heavy outlier <>= lll<-221 <> datan<-rbind(data,c(106,294)); par(mfrow=c(1,1)) datan[,2]<-datan[,2]*100 bagplot(datan,factor=3,create.plot=T,approx.limit=300, show.outlier=T,show.looppoints=T,show.bagpoints=T, show.whiskers=T,show.loophull=T,show.baghull=T,verbose=F) title(paste("seed: ",lll)) @ \begin{center}\includegraphics[height=13cm]{p2005-Sep16-114536.ps}\end{center} %

@ @ @ \newpage \subsection{Large data sets} What about large data sets? <>= lll<-173 if(!exists("lll")) lll<-75 set.seed(lll<-lll+1); print(lll) n<-3000;datan<-cbind(rnorm(n)+100,rnorm(n)+300) print(lll) datan<-rbind(datan,c(105,295)) par(mfrow=c(1,1)) #; par(mfrow=2:3) bagplot(datan,factor=2.5,create.plot=T,approx.limit=1000,cex=0.2, show.outlier=T,show.looppoints=T,show.bagpoints=T,dkmethod=2, show.loophull=T,show.baghull=T,verbose=F,debug.plots="no") title(paste("seed",lll)) @ \begin{center}\includegraphics[height=13cm]{p2005-Sep16-114606.ps}\end{center} %

@ @ \newpage \subsection{Depth one data sets} What happens if all points are of depth 1? <>= bagplot(x=1:30,y=(1:30)^2,verbose=F,dkmethod=2) @ \begin{center}\includegraphics[height=8cm]{p2005-Sep16-114625.ps}\end{center} %

@ @ \subsection{Degenerated data sets} What happens if the data are in a one dim subspace? <>= bagplot(x=10+c(1:100,200),y=30-c(1:100,200),verbose=F) @ \begin{center}\includegraphics[height=6cm]{p2005-Sep16-114641.ps}\end{center} %

@ @ Here is a second one dim data set. <>= bagplot(x=(1:100),y=(1:100),verbose=F) @ \subsection{Data set from the mail of M. Maechler} The data set of M. Maechler is discussed within R-help. <>= x0<-c(1,5, 6,6, 6, 6,6,7,7,8, 11, 13) # x0 <- c(x0, 8) y0<-c(2,3.5,4,4.5,4.5,5,5,5,5,5.5,5.5, 7) # y0 <- c(y0, 7) bagplot(x0,y0,show.baghull=T,show.loophull=T,create.plot=T, show.whiskers=T,factor=3,debug.plots="noall", dkmethod=2,verbose=F) @ \begin{center}\includegraphics[height=10cm]{p2005-Sep16-114709.ps}\end{center} %

\newpage @ \subsection{Bagplot with additional graphical supplements} %hallo Verbose bagplot of a sample of 100 rnorm points and an outlier <>= lll<-221 <> datan<-rbind(data,c(105,295)) bagplot(datan,factor=2.5,create.plot=T,approx.limit=300, show.outlier=T,show.looppoints=T,show.bagpoints=T,dkmethod=2, show.whiskers=T,show.loophull=T,show.baghull=T,verbose=T) title(paste("seed",lll)) @ \begin{center}\includegraphics[height=13cm]{p2005-Sep16-114739.ps}\end{center} %

\newpage @ \subsection{Debugging plots with additional elements} Here is an example of plots generated with option [[debug.plots="all"]]. <>= <> datan<-rbind(data,c(120,280)) datan<-data[1:10,] #datan<-cbind(c(1:100,200),c(1:100,200)) par(mfrow=c(2,3)) bagplot(datan,factor=2.5,create.plot=T,approx.limit=300, show.outlier=T,show.looppoints=T,show.bagpoints=T, show.whiskers=T,show.loophull=F,show.baghull=T,dkmethod=2, debug.plots="all",verbose=T) @ \begin{center}\includegraphics[height=13cm]{p2005-Sep16-114806.ps}\end{center} %

@ @ \newpage \section{Random data set} <>= if(!exists("lll")) lll<-75 #lll<-75 # 267 81 115 set.seed(lll<-lll+1); print(lll) #data<-matrix(sample(1:10000,size=2000),1000,2) #data<-matrix(sample(1:10000,size=1000),500,2) #data<-matrix(sample(1:10000,size=300),50,2) n<-100;data<-cbind(rnorm(n)+100,rnorm(n)+300) par(mfrow=c(1,1)) @ \section{Definitionn of [[bagplot]] on start} <>= <> @ \section{Extracting of function [[bagplot]]} <>= tangleR("hdeep.rev",expand.roots="define [[bagplot]]") @ \section{Some old code chunks for documentation} \tiny <>= critical.angles.of.points<-function(tp){ n.tp<-nrow(tp) tphdepth<-rep(0,n.tp); dpi<-2*pi-0.000001 minusplus<-c(rep(-1,n),rep(1,n)) result<-matrix(0,n.tp,4) for(j in 1:n.tp) { dx<-tp[j,1]-xy[,1]; dy<-tp[j,2]-xy[,2] a<-win(dx,dy)+pi; a<-a[a<10]; a<-sort(a) a.shift<-(a+pi) %% dpi h<-cumsum(minusplus[order(c(a,a.shift))]) no<-which(min(h)==h); no<-c(no[1],no[length(no)]) no<-c(no,1+((no-2)%%n)) #cat("HALLO"); print(no) result[j,]<-c(a,a)[no] } if(debug.plots=="all"){ plot(xy,type="n") # points(xy); text(xy,as.character(hdepth)) h<-rbind(tp,tp[1,]); lines(h) points(tp[1,,drop=F],col="red") dx<-3; ro<-1 dy<-dx*tan(result[ro,1]) # segments(tp[ro,1]-dx,tp[ro,2]-dy,tp[ro,1]+dx,tp[ro,2]+dy,col="orange") dy<-dx*tan(result[ro,3]) segments(tp[ro,1]-dx,tp[ro,2]-dy,tp[ro,1]+dx,tp[ro,2]+dy,col="red") dy<-dx*tan(result[ro,2]) # segments(tp[ro,1]-dx,tp[ro,2]-dy,tp[ro,1]+dx,tp[ro,2]+dy,col="green") dy<-dx*tan(result[ro,4]) # segments(tp[ro,1]-dx,tp[ro,2]-dy,tp[ro,1]+dx,tp[ro,2]+dy,col="blue") } result } a.pdk<-critical.angles.of.points(pdk) @ <>= # old version based on polygon of data points if(nrow(d.k)>1){ lambda<-1-(points.in.bag-d.k[k.1+1,1])/(d.k[k.1,1]-d.k[k.1+1,1]) } else { lambda<-0.5 } @ \vspace*{-3cm} <>= pdk.1<-pdk.1-matrix(pcenter,nrow(pdk.1),2,byrow=TRUE) pcenter<-apply(pdk,2,mean) pdk<-pdk-matrix(pcenter,nrow(pdk),2,byrow=TRUE) ai<-win(pdk[,1],pdk[,2]) a<-order(ai); ai<-ai[a]; pdk<-pdk[a,,drop=FALSE] ai<-win(pdk[,1],pdk[,2]) ao<-win(pdk.1[,1],pdk.1[,2]) a<-order(ao); ao<-ao[a]; pdk.1<-pdk.1[a,,drop=FALSE] ao<-win(pdk.1[,1],pdk.1[,2]) # for display the two polygons in verbose mode we store them @ <>= <> <> <> @ Some points of the two polygon will be identical, so the subsets of points has to be moved. <>= h1<-match(pdk.1[,1], pdk[,1]); h2<-match(pdk.1[,2], pdk[,2]) ind.pdk.1<-seq(along=h1); found<-!is.na(h1) union.points<-pdk.1[found,2]==pdk[h1[found],2] union.points<-ind.pdk.1[found][union.points] ind.o.points.to.shift<-ind.pdk.1[-union.points] outer.shift.points<-pdk.1[ind.o.points.to.shift,,drop=FALSE] if(length(ai)==1){ # inner polygon and center center identical / ai==NaN outer.shift.points<- lambda *outer.shift.points } else { for(i in seq(along=outer.shift.points[,1])){ # get point xy0<-outer.shift.points[i,] # get segment of inner polygon ind1<-sum(ailength(ai)) ind2<-1 xy1<-pdk[ind1,]; xy2<-pdk[ind2,] # determinate cut of inner segment and line (0,0) -> point lam<-solve(matrix(c(xy0,xy1-xy2),2,2))%*%xy1 xy.cut<-lam[1]*xy0 # determinate new position of point of outer polygon outer.shift.points[i,]<- lambda *outer.shift.points[i,]+ (1-lambda)*xy.cut } # end of for } # end of if if(verbose) {cat("outer polygon points have been shifted:") } @ <>= h1<-match(pdk[,1], pdk.1[,1]); h2<-match(pdk[,2], pdk.1[,2]) ind.i.points.to.shift<-is.na(h1)&is.na(h2) inner.shift.points<-pdk[ind.i.points.to.shift,,drop=FALSE] for(i in seq(along=inner.shift.points[,1])){ # get point xy0<-inner.shift.points[i,] # get segment of outer polygon ind1<-sum(aolength(ao)) ind2<-1 xy1<-pdk.1[ind1,]; xy2<-pdk.1[ind2,] # determinate cut of outer segment and line (0,0) -> point lam<-solve(matrix(c(xy0,xy1-xy2),2,2))%*%xy1 xy.cut<-lam[1]*xy0 # determinate new position of point of inner polygon inner.shift.points[i,]<-(1-lambda)*inner.shift.points[i,]+ lambda *xy.cut } if(verbose) {cat("inner polygon points have been shifted:") } @ <>= pdk[ind.i.points.to.shift,]<-inner.shift.points pdk.1[ind.o.points.to.shift,]<-outer.shift.points hull.bag<-rbind(pdk.1,pdk) hull.bag<-hull.bag[chull(hull.bag[,1],hull.bag[,2]),,drop=FALSE] if(verbose){cat("bag completed:"); print(hull.bag) } @ <>= if(nrow(d.k)>1){ lambda<-1-(points.in.bag-d.k[k.1+1,1])/(d.k[k.1,1]-d.k[k.1+1,1]) } else { lambda<-0.5 } vt<-find.cut.z.pg(xy,pdk,center=center) vt.1<-find.cut.z.pg(xy,pdk.1,center=center) h<-cbind(xy[,1]-center[1],xy[,2]-center[2]); lz<-apply(h*h,1,sum)^0.5 h<-cbind(vt[,1]-center[1],vt[,2]-center[2]); lv<-apply(h*h,1,sum)^0.5 h<-cbind(vt.1[,1]-center[1],vt.1[,2]-center[2]); lv.1<-apply(h*h,1,sum)^0.5 lambda.i<-(lz-lv)/(lv.1-lv) lambda.i<-(lambda.i[!is.na(lambda.i) & ! is.nan(lambda.i)]) # lambda<-median(lambda.i) # cat("median? lambda",median(lambda.i)) if(lambda<0|lambda>1) lambda<-0.5 if(verbose) cat("lambda",lambda) # segm.no[is.na(segm.no)]<-1 # cut.pkt[is.nan(cut.pkt)]<-z[is.nan(cut.pkt)] # cut.pkt<-cbind(cut.pkt[,1]+center[1],cut.pkt[,2]+center[2]) # h<-is.na(cuts[,1]) # if(any(h)){cut.pkt[h,1]<-pgo[1,1];cut.pkt[h,2]<-pgo[1,2]} @ [[# car data: lambda==0.6918136]] In this definition it follows: [[lambda==1]] iff bag is identical with inner polygon [[exp.dk]]. <>= vt<-find.cut.z.pg(xy,exp.dk,center=center) vt.1<-find.cut.z.pg(xy,exp.dk.1,center=center) h<-cbind(xy[,1]-center[1],xy[,2]-center[2]); lz<-apply(h*h,1,sum)^0.5 h<-cbind(vt[,1]-center[1],vt[,2]-center[2]); lv<-apply(h*h,1,sum)^0.5 h<-cbind(vt.1[,1]-center[1],vt.1[,2]-center[2]); lv.1<-apply(h*h,1,sum)^0.5 lambda.i<-(lz-lv)/(lv.1-lv) lambda.i<-(lambda.i[!is.na(lambda.i) & ! is.nan(lambda.i)]) lambda<-median(lambda.i) if(verbose) cat("\nmedian lambda",lambda) if(lambda<0|lambda>1) lambda<-lambda<-min(1,max(0,median(lambda.i))) @ \end{document}