#This document contains codes that allow to estimate the decomposition proposed in the #paper "Decomposition of differences in distribution using quantile regression" #Work in progress: comments and suggestions are very welcome! #The main command is #rqdeco(depref,covref,dep,cov,probs=seq(0.01,0.99,0.01),weightsref=rep(1,length(depref)),weights=rep(1,length(dep)),nrq=0,mn=10000000,method=1) #Arguments: depref dependent variable from the reference sector/year/gender... # covref covariates from the reference sector/year/gender... (matrix with a line by observation) # dep dependent variable from the other sector/year/gender... # cov covariates from the other sector/year/gender... (matrix with a line by observation) # probs quantiles at which the decomposition will be estimated # weightsref observation weights for the reference sector/year/gender... # weights observation weights for the other reference sector/year/gender... # nrq number of regression quantile in the first step. If nrq=0 the whole conditional quantile process is estimated # mn technical number, should not be changed (see details below) # method method used in the second step, possible values: 1 and 2, (see details below) #Values: a matrix of size length(prob) X 9 # column 1 gives the quantiles at which the decomposition will be estimated # column 2 gives the estimated unconditional quantile function for the other sector using quantile regression in the first step # column 3 gives the raw unconditional quantile for the other sector # column 4 gives the estimated unconditional quantile function for the reference sector using quantile regression in the first step # column 5 gives the raw unconditional quantile for the reference sector sector # column 6 gives the total difference between both sector (column 2 - column 4) # column 7 gives the difference that is explained by residuals # column 8 gives the difference that is explained by coefficients # column 9 gives the difference that is explained by characteristics #Details: we must distinguish between the first and the second step # first step: for small samples (number of observations < 2000 or 3000 depending on the # power of the computer) it is preferable to estimate the whole process and to set nrq=0. If # it is not possible, you should estimate an important number of quantile regression (100, 200 or more). # second step: the second step is the estimation of weighted quantiles which is principally simple # but the number of "observations" is very high: number of observations*number of quantile # regression. If the computer cannot perform the second step because of limited memory, we # can first try to decrease mn. If it is not sufficient, we can set method=2 and decrease mn # but this increase the computation time # #Internal functions called by rqdeco: # rqp(y,x,w): estimate the whole quantile regression process # rql(y,x,w,nrq): estimate nrq quantile regression on a regular grid between 0 and 1 # wq(y,w,prob): estimate the prob weighted quantile of y using wheights w # evalu(x,beta,wq,w=rep(1,dim(x)[1]),prob=seq(0.01,0.99,0.01),ns=1): function called by # rqdeco in the second step if method=1 that estimates the prob quantiles using # characteristics x and coefficients beta. # evalu1(y,x,beta,wq,w=rep(1,length(y)),prob=seq(0.01,0.99,0.01),ns): function called by # rqdeco in the second step if method=2 that estimates the prob quantiles using # characteristics x and coefficients beta. library(quantreg) library(MASS) #needed only for the examples below rqp<-function(y,x,w){ rq<-rq(y~x,weights=w,tau=-1)$sol rq[1,1]<-0 beta<-rq[4:dim(rq)[1],1:(dim(rq)[2]-1),drop=FALSE] wq<-rq[1,2:dim(rq)[2]]-rq[1,1:(dim(rq)[2]-1)] return(beta,wq) } rql<-function(y,x,w,nrq){ beta<-rq(y~x,tau=0.5/nrq,weights=w,method="fn")$coef for(i in seq(2,nrq,1)) beta<-cbind(beta,rq(y~x,tau=(i/nrq-0.5/nrq),weights=w,method="fn")$coef) wq<-rep(1/nrq,nrq) return(beta,wq) } wq<-function(y,w,prob){ o<-order(y) y<-y[o]; w<-w[o] a<-0; i<-0; s<-sum(w) res<-c() for(q in prob){ while(a1) for(k in 2:ns){ p<-t(t(beta[floor((k-1)*dim(beta)[1]/ns+1):floor(k*dim(beta)[1]/ns),,drop=FALSE])%*%t(x[,floor((k-1)*dim(beta)[1]/ns+1):floor(k*dim(beta)[1]/ns)])) pred<-pred+p } nw<-c(c(w)%*%t(c(wq))) o<-order(pred) pred<-pred[o]; nw<-nw[o] a<-0; i<-0; s<-sum(nw) res<-c() for(q in prob){ while(a1) for(k in 2:ns){ p<-t(t(beta[floor((k-1)*dim(beta)[1]/ns+1):floor(k*dim(beta)[1]/ns),,drop=FALSE])%*%t(x[,floor((k-1)*dim(beta)[1]/ns+1):floor(k*dim(beta)[1]/ns)])) pred<-pred+p } o<-order(pred) pred1<-pred[o] a<-0; i<-0 res<-c() for(q in prob){ while(aq[j] & pre[,1]<=q[j+1],]) } } sum<-c(0) for(j in 1:ns){ o<-order(prej[[j]][,1]) prej[[j]]<-prej[[j]][o,] sum<-c(sum,sum[j]+sum(prej[[j]][,2])) } s<-sum[ns+1]; sum<-sum/s res<-c(); a<-0; j<-1; i<-0; t<-prej[[j]]; a<-sum[j] for(q in prob){ while(q>sum[j+1]){ j<-j+1 i<-0 t<-prej[[j]] a<-sum[j] } while(a