/* Table_5_7_process_model_results.sas
Revised May 29, 2012

run gauss program base97.prg first.

this program processes the output of the gauss program which simulates the model, both for 1997 and 2007 (with the China_surge experiment)

Note: T segment corresponds to the "Primary" segment in the paper (T for "tradable")
      N segment corresponds to the "Speciality" segment in the paper (N for "nontradable")

Note: model 1 is the pure BEJK model
      model 2 is the general model with the speciality segment.
Hence, in the notation, the digit 1 or 2 in a variable name indicates which model produced it.

For example:
est_fitted_base1 fitted values of establishment counts at location, under pure BEJK model (model1)
est_fitted_base2 fitted values of establishment counts at location, under general model (model2)

est_fitted_china1 predicted with China, using model 1
est_fitted_china2 predicted with China, using model 2 and original pop
est_fitted_China2_newpop with China and model 2, using 2007 population 

*/

/* Library that holds the permanent data sets read by this program. */
libname public 'd:\a_data\proj\size_fun\exports\revision\wrap\public';

/* Send printed output to an HTML file; the destination is closed at the end of the program. */
/* NOTE(review): the header comment names this program Table_5_7_... while the HTML file
   below is named Table_6_7_... - confirm which table numbering is intended. */
ods html
    file="d:\a_data\proj\size_fun\exports\revision\wrap\public\Table_6_7_process_model_results.html"
    style=minimal;


* %%%%%%% Step 1  set up naics_level and loc_level data %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%;

/* In the location-level file, S_TNboth is the sales share with the T and N
   segments together, and S is the share for the T segment only. */

/* Filerefs for the three ASCII files read in this step. */
filename nu_iter 'd:\a_data\proj\size_fun\exports\revision\wrap\public\base97_nu_bsize.asc';
filename n_lev   'd:\a_data\proj\size_fun\exports\revision\wrap\public\base97_naics_level.asc';
filename l_lev   'd:\a_data\proj\size_fun\exports\revision\wrap\public\base97_loc_level.asc';

/* Coefficient file, one row per industry and iteration.
   beta1 is the constant and is not used here; the regression is rerun in
   Step 3, which recomputes nuN and nuT. */
data nu_iter;
    infile nu_iter;
    input naicsindex naics iter_num boutiquesize beta1 beta2 beta3;
    nuN = beta2;
    nuT = beta3;
    drop naics;   /* read as numeric here; the character naics is merged in below */
run;

/* Industry-level file: two observations per industry, one for iter=1 and
   the other for iter=10. */
data naics_level;
    infile n_lev;
    input naicsindex naics iter_num sal_growth_chinaUS est_modelUS est_chinaUS
          alowerbar eta1 eta2 gam1-gam177;
    drop naics;   /* read as numeric here; the character naics is merged in below */
run;

/* Industry lookup: naicsindex is the row position in public.mandat_naics2. */
data naicstext(keep=naicsindex naicstext naics);
    set public.mandat_naics2;
    naicsindex = _n_;
run;

/* Attach naics and naicstext to both industry-level data sets,
   keeping only rows that came from the model output. */
data nu_iter;
    merge nu_iter(in=from_model) naicstext;
    by naicsindex;
    if from_model;
run;

data naics_level;
    merge naics_level(in=from_model) naicstext;
    by naicsindex;
    if from_model;
run;

/* Location-level file, one row per industry, iteration and location (ea_index). */
data loc_level;
    infile l_lev;
    input naicsindex iter_num ea_index sal_dat salT salN est_model estNhat
          S_TNboth S S_China est_China;
    S_justT = S;
    /* For iter 1, S_justT and S_TNboth are the same. Even where S_justT<S_TNboth,
       this program only needs complete sales, so S is set to the combined share. */
    S = S_TNboth;
run;

/* Note: est_model has not been rescaled; it is in model units. */

/* Copy the industry-level total est_modelUS onto every location row. */
data est_modelUS;
    set naics_level(keep=naicsindex iter_num est_modelUS);
run;

proc sort data=loc_level;
    by naicsindex iter_num;
run;

data loc_level;
    merge loc_level est_modelUS;
    by naicsindex iter_num;
run;

/* Sorted working copy of public.ea_cm97, which is expected to supply
   pop97, pop97_US and est_LM used below. */
proc sort data=public.ea_cm97 out=loc_level2;
    by naicsindex ea_index;
run;

proc sort data=loc_level;
    by naicsindex ea_index;
run;

/* Keep model rows only and add population-share based variables. */
data loc_level;
    merge loc_level(in=from_model) loc_level2;
    by naicsindex ea_index;
    if from_model;
    pop_share          = pop97/pop97_US;
    est_mod_pop_share  = est_model/pop_share;
    est_dat_pop_share  = est_LM/pop_share;
    inv_sqrt_pop_share = 1/sqrt(pop_share);   /* regression weight in Step 3 */
    format est_mod_pop_share est_dat_pop_share 7.3;
run;

/* Rebuild the industry lookup from public.ea_cm97, one row per naics.
   BY naics requires public.ea_cm97 to be in naics order. */
data naicstext;
    set public.ea_cm97;
    by naics;
    if first.naics;
    keep naics naicsindex naicstext;
run;

data naics_level;
    merge naicstext naics_level(in=from_model);
    by naicsindex;
    if from_model;
run;

data loc_level;
    merge loc_level(in=from_model) naicstext;
    by naicsindex;
    if from_model;
run;

/* Attach 2007 population (pop2007, pop2007_US) to each location row.
   The lookup is sorted explicitly so the match-merge does not depend on the
   stored order of public.ea_pop07, and only rows that already exist in
   loc_level are kept. Without the IN= filter, any ea_index present only in
   the population file would add a row with missing naics and iter_num. */
proc sort data=public.ea_pop07(keep=ea_index pop2007 pop2007_US) out=pop2007;
    by ea_index;
run;

proc sort data=loc_level;
    by ea_index;
run;

data loc_level;
    merge loc_level(in=in_loc) pop2007;
    by ea_index;
    if in_loc;
run;

* %%%%%%% Step 2  estimate sales and estab share of plants with 1-19 employees %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%;

/* Industry totals of all establishments (est_LM) and of the 13 group (est13_LM).
   BY naics requires public.ea_cm97 to be in naics order. */
proc means data=public.ea_cm97 noprint;
    by naics;
    var est_LM est13_LM;
    output out=tsum sum=estUS_LM est13US_LM;
run;

/* Share of the 13 group in the industry establishment count. */
data est13_shr(keep=naics est13_shr);
    set tsum;
    est13_shr = est13US_LM/estUS_LM;
run;

data naics_level;
    merge naics_level(in=have_industry) est13_shr;
    by naics;
    if have_industry;
run;

/* Next get the sales totals for the T and N segments by industry and iteration. */
proc sort data=loc_level;
    by naics naicstext iter_num;
run;

proc means data=loc_level noprint;
    by naics iter_num;
    var sal_dat salT salN;
    output out=sal_T_plus_N_US sum=salT_plus_N_US salT_US salN_US;
run;


* %%%%%%% Step 3  estimate est count parameters for model 2 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%;

/* Weighted regression of est_LM on pop_share and est_model, run separately
   for each industry and iteration. Nothing is printed; estimates, standard
   errors and R-square are written to the data set est. */
proc reg data=loc_level outest=est tableout noprint RSQUARE;
    by naics naicstext iter_num;
    weight inv_sqrt_pop_share;
    model est_LM = pop_share est_model;
run;

/* Point estimates: the pop_share coefficient is nuN, the est_model coefficient is nuT. */
data k1(keep=naics iter_num intercept nuN nuT _RSQ_);
    set est(where=(_type_='PARMS'));
    nuN = pop_share;
    nuT = est_model;
run;

/* Standard errors of the same three coefficients. */
data se_k1(keep=naics iter_num se_intercept se_nuN se_nuT);
    set est(where=(_type_='STDERR'));
    se_intercept = intercept;
    se_nuN       = pop_share;
    se_nuT       = est_model;
run;

/* One row per industry and iteration with estimates, standard errors and
   the industry-level variables. */
data k2;
    merge k1 se_k1 naics_level;
    by naics iter_num;
    est13_shr = 100*est13_shr;   /* express as percent */
    format _RSQ_ 7.2;
    drop gam1-gam177;
    length naics3 $ 3;
    naics3 = substr(naics,1,3);
    /* reclass_dum=1 flags the seven listed industries; all others get 2. */
    if naics in ('311330','311340','314121','315999','337110','337121','337122')
        then reclass_dum=1;
    else reclass_dum=2;
    format est13_shr 7.1;
run;

proc sort data=k2;
    by naics iter_num;
run;

/* Add in the diffuse variable. */
data diffuse;
    set public.Diffuse_naics(keep=naics diffuse);
run;

data k2;
    merge k2(in=have_k2) diffuse;
    by naics;
    if have_k2;
run;

/* Add US sales totals and the N-segment share of sales.
   NOTE(review): in1 is set here but never tested, unlike the other merges
   in this program - confirm whether a subsetting IF on in1 was intended. */
data k2;
    merge k2(in=in1) sal_T_plus_N_US;
    by naics iter_num;
    drop _freq_ _type_;
    salN_share = salN_US/(salT_US+salN_US);
run;



* %%%%%%% Step 4  a look at the fitted values of establishments %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%;

/* Now examine fitted values: pull the coefficients needed per industry and iteration. */
data nu;
    set k2(keep=iter_num naics nuT nuN reclass_dum);
run;

proc sort data=loc_level;
    by naics iter_num;
run;

proc sort data=nu;
    by naics iter_num;
run;

/* May 18, 2012: added est_speciality, est_primary and est_total. These are in
   model units, so they need to be rescaled. They are needed to calculate
   industry shares of the specialty segment; there is overlap with
   est_fitted_base1, est_fitted_base2, etc. */
data fitted;
    merge loc_level nu;
    by naics iter_num;

    /* Model 1: model counts rescaled to data units. */
    est_fitted_base1  = est_model*(estUS_LM/est_modelUS);
    est_fitted_china1 = est_china*(estUS_LM/est_modelUS);

    /* Model 2: the three cases below are mutually exclusive. When both nuT and
       nuN are non-positive, none applies and the model 2 variables stay missing. */
    if nuT>0 and nuN>0 then do;
        est_speciality = pop_share*nuN;
        est_primary    = est_model*nuT;
        est_total      = est_speciality+est_primary;
        /* Just use the explained part and leave out the constant. OK because
           these are summed and only totals are used. */
        est_fitted_base2         = est_model*nuT + pop_share*nuN;
        est_fitted_china2        = est_china*nuT + (pop97/pop97_US)*nuN;
        est_fitted_china2_newpop = est_china*nuT + (pop2007_US/pop97_US)*(pop2007/pop2007_US)*nuN;
    end;
    else if nuT<=0 and nuN>0 then do;   /* set nuT to zero if negative, 0 cases */
        est_speciality = pop_share*nuN;
        est_primary    = 0;
        est_total      = est_speciality+est_primary;
        est_fitted_base2         = pop_share*nuN;
        est_fitted_china2        = (pop97/pop97_US)*nuN;
        est_fitted_china2_newpop = (pop2007_US/pop97_US)*(pop2007/pop2007_US)*nuN;
    end;
    else if nuT>0 and nuN<=0 then do;   /* set nuN to zero if negative, 6 cases */
        est_speciality = 0;
        est_primary    = est_model*nuT;
        est_total      = est_speciality+est_primary;
        est_fitted_base2         = est_fitted_base1;
        est_fitted_china2        = est_fitted_china1;
        est_fitted_china2_newpop = est_fitted_china1;
    end;
run;

/* Note: the newpop term above starts with the 2007 population share and then
   rescales by population growth (pop2007_US/pop97_US). */
/* NOTE(review): the original comment here referred to popsum07, simpop07 and a
   samepop option that must be set manually. None of these names appears in this
   program - confirm whether the remark belongs to the GAUSS program instead. */


/* US totals of the fitted counts by industry and iteration. */
proc means data=fitted noprint;
    by naics iter_num;
    var est_fitted_base1 est_fitted_base2 est_fitted_china1 est_fitted_china2
        est_fitted_china2_newpop s_china est_speciality est_primary est_total;
    output out=tsum(drop=_freq_ _type_)
           sum=est_fitted_baseUS1 est_fitted_baseUS2 est_fitted_chinaUS1 est_fitted_chinaUS2
               est_fitted_china_newpopUS2 s_china_US est_specialityUS est_primaryUS est_totalUS;
run;

/* Segment totals kept aside for the industry-level report. */
data est_totalUS;
    set tsum(keep=naics iter_num est_specialityUS est_primaryUS est_totalUS);
run;

/* Shares and location quotients (LQ). Every LQ below divides a share by the
   1997 population share pop97/pop97_US, including the newpop scenario. */
data fitted;
    merge fitted tsum;
    by naics iter_num;

    /* Size quotients: sales share per establishment relative to 1/US count.
       Computed only where the establishment count is positive. */
    if est_fitted_base1>0 then LQsize1=(s/est_fitted_base1)/(1/est_fitted_baseUS1);
    if est_fitted_base2>0 then LQsize2=(s/est_fitted_base2)/(1/est_fitted_baseUS2);
    if est_LM>0 then LQsize97=(s/est_LM)/(1/estUS_LM);

    /* Location shares of fitted establishment counts. */
    s_est_base1=est_fitted_base1/est_fitted_baseUS1;
    s_est_base2=est_fitted_base2/est_fitted_baseUS2;
    s_est_China1=est_fitted_China1/est_fitted_ChinaUS1;
    s_est_China2=est_fitted_China2/est_fitted_ChinaUS2;
    s_est_China2_newpop=est_fitted_China2_newpop/est_fitted_China_newpopUS2;

    /* Count quotients for the fitted values. */
    LQest_base1=s_est_base1/(pop97/pop97_US);
    LQest_base2=s_est_base2/(pop97/pop97_US);
    LQest_China1=s_est_China1/(pop97/pop97_US);
    LQest_China2=s_est_China2/(pop97/pop97_US);
    LQest_China2_newpop=s_est_China2_newpop/(pop97/pop97_US);

    /* Sales share and sales quotient under the China scenario. */
    s_sal_china1=s_china/s_china_US;
    LQsal_china1=s_sal_china1/(pop97/pop97_US);

    /* 1997 data counterparts and the change in count quotients by model. */
    s_est97= est_LM/estUS_LM;
    LQest97=(est_LM/estUS_LM)/(pop97/pop97_US);
    LQsal97=s/(pop97/pop97_US);
    difLQ1=LQest_china1-LQest_base1;
    difLQ2=LQest_china2-LQest_base2;

    format est_fitted_base1 est_fitted_base2
           est_fitted_baseUS1 est_fitted_baseUS2 7.1;
    format LQsize1 LQsize2 LQest97 LQsal97 LQsal_china1 7.3;
run;

/* Now merge in 2007 information from public.ea_cbp07. */
data info07;
    set public.ea_cbp07;
    s07     = snorm_cbp/snormUS_cbp;   /* location share of snorm_cbp */
    s_est07 = est_cbp/estUS_cbp;       /* location share of establishments */
    keep naics ea estUS_cbp emphatUS_cbp s07 s_est07 est_CBP;
run;

proc sort data=info07;
    by naics ea;
run;

proc sort data=fitted;
    by naics ea;
run;

/* Keep all fitted rows and add the 2007 quotients and US growth rates.
   The 2007 quotients also use the 1997 population share as denominator. */
data fitted;
    merge fitted(in=have_fitted) info07;
    by naics ea;
    if have_fitted;
    LQest07=s_est07/(pop97/pop97_US);
    LQsal07=s07/(pop97/pop97_US);
    difLQest_data=LQest07-LQest97;
    /* Percent change of the US totals between the LM and cbp figures. */
    g_est=100*(estUS_cbp-estUS_LM)/estUS_LM;
    g_emp=100*(emphatUS_cbp-emphatUS_LM)/emphatUS_LM;
run;


/* Subset to the reclass_dum=1 industries and order locations within each
   iteration and industry from highest to lowest LQsal97. */
proc sort data=fitted(where=(reclass_dum=1)) out=t;
    by iter_num naics descending LQsal97;
run;

/* rank counts locations within each iteration and industry in that order,
   starting at 1. */
data t;
    set t;
    by iter_num naics;
    if first.naics then rank = 0;
    rank + 1;
run;


/* Now print out industry level information. */

/* countNshr: specialty segment as a percent of the total fitted count
   (est_specialityUS over est_totalUS). */
data k2;
    merge k2 est_totalUS;
    by naics iter_num;
    countNshr = 100*est_specialityUS/est_totalUS;
run;

/* Construct Second Stage Estimates of the Plant Count Parameters and Related Model and Data Statistics */
proc print data=k2;
    where reclass_dum=1 and iter_num=10;
    var iter_num naics naicstext intercept se_intercept nuN se_nuN nuT se_nuT _RSQ_ countNshr;
    format intercept se_intercept nuN se_nuN nuT se_nuT 7.1 salN_share 7.2;
    title 'iter=10, regression results ';
run;

proc sort data=k2;
    by iter_num reclass_dum;
run;

/* Means of the estimates for all industries and by reclass_dum, iter=10 only. */
proc tabulate data=k2;
    where iter_num=10;
    class diffuse reclass_dum iter_num;
    var intercept se_intercept nuN se_nuN nuT se_nuT _RSQ_ countNshr est13_shr;
    table iter_num,
          all reclass_dum,
          N*f=7.0
          mean*(intercept se_intercept nuN se_nuN nuT se_nuT)*f=7.1
          mean*(_RSQ_ countNshr est13_shr)*f=7.2;
    title 'proc tab';
run;

/* Distribution of the estimates for all industries and by diffuse, iter=10 only. */
proc tabulate data=k2;
    where iter_num=10;
    class diffuse iter_num;
    var intercept se_intercept nuN se_nuN nuT se_nuT _RSQ_ countNshr est13_shr;
    table iter_num*(all diffuse),
          N mean min p10 p25 median p75 p90 max,
          (intercept se_intercept nuN se_nuN nuT se_nuT)*f=7.1
          (_RSQ_ countNshr est13_shr)*f=7.2;
    title 'proc tab';
run;


/* Construct Table Estimated Specialty Count Share By Quartiles of all Industries */
proc sort data=k2;
    by iter_num reclass_dum;
run;

/* Assign each iter=10 industry to a quartile of countNshr using fixed cut points.
   NOTE(review): the cut points are hard-coded; presumably they are the p25,
   median and p75 of countNshr from an earlier run - confirm and update them
   if the input data change.
   A missing countNshr compares below every number in SAS, so without the
   explicit check it would fall into quartile 1. It is given a missing quartile
   instead, which PROC TABULATE excludes from the table because quartile is a
   CLASS variable and the MISSING option is not used. */
data k3;
    set k2;
    if iter_num=10;
    if missing(countNshr) then quartile=.;
    else if countNshr<54.0164 then quartile=1;
    else if countNshr<68.0395 then quartile=2;
    else if countNshr<78.9763 then quartile=3;
    else quartile=4;
run;

proc tabulate data=k3;
    class quartile;
    var countNshr est13_shr;
    table quartile,
          N*f=comma12.0 (min max mean)*countNshr*f=7.1 mean*est13_shr*f=7.1;
    title 'break down into quartile by estimates share of speciality';
run;

/* Unweighted regression of the specialty count share on est13_shr. */
proc reg data=k3;
    model countNshr=est13_shr;
    title 'regression of countNshr onr est13_shr, unweighted';
run;


/* List the iter=10 industries whose estimated nuN is not positive.
   A missing nuN also satisfies nuN<=0 and is listed. */
proc print data=k2;
    where iter_num=10 and nuN<=0;
    var iter_num naics naicstext intercept se_intercept nuN se_nuN nuT se_nuT
        _RSQ_ countNshr est13_shr;
    format intercept se_intercept nuN se_nuN nuT se_nuT 7.1 salN_share 7.2;
    title 'iter=10, regression results, where nuN<=0 ';
run;


/* Construct Table: Sales, Count and Size Quotients in Data, Size Quotients for Both Models In High Concentration Industry Locations */

/* Data quotients and the model 1 size quotient, from the iter=1 rows. */
proc print data=t;
    by iter_num;
    where s>=.05 and LQsal97>=2 and iter_num=1;
    var naicstext eatext s LQsal97 LQest97 LQsize97 LQsize1;
    format s 7.2 LQsal97 LQest97 LQsize97 LQsize1 7.1;
    title 'proc print reclass_dum=1 industries, s>=.05, LQsal97>=2, use iter=1 for primary only model'; 
run;

/* Model 2 size quotient for the same selection, from the iter=10 rows. */
proc print data=t;
    by iter_num;
    where s>=.05 and LQsal97>=2 and iter_num=10;
    var naicstext eatext LQsize2;
    format s 7.2 LQsize2 7.1;
    title 'proc print reclass_dum=1 industries, s>=.05, LQsal97>=2, use iter=10 for general model with speciality segment'; 
run;


/* Now add summary statistics */

/* Attach the diffuse variable to every fitted row. */
proc sort data=fitted;
    by naics;
run;

data fitted;
    merge fitted(in=have_fitted) diffuse;
    by naics;
    if have_fitted;
run;

/* Means and medians of the data quotients and the model 1 size quotient
   for the high concentration selection, iter=1 rows. */
proc tabulate data=fitted;
    where s>=.05 and LQsal97>=2 and iter_num=1;
    class iter_num reclass_dum diffuse;
    var s LQsal97 LQest97 LQsize97 LQsize1;
    table iter_num, (all reclass_dum diffuse),
          N*f=7.0 mean*(s*f=7.2 (LQsal97 LQest97 LQsize97 LQsize1)*f=7.1);
    table iter_num, (all reclass_dum diffuse),
          N*f=7.0 median*(s*f=7.2 (LQsal97 LQest97 LQsize97 LQsize1)*f=7.1);
    title 'proc tabulate s>=.05, LQsal97>=2, use iter_num=1 for constrained model with primary only'; 
run;

/* Mean and median of the model 2 size quotient for the same selection, iter=10 rows. */
proc tabulate data=fitted;
    where s>=.05 and LQsal97>=2 and iter_num=10;
    class iter_num reclass_dum diffuse;
    var LQsize2;
    table iter_num, (all reclass_dum diffuse), N*f=7.0 mean*((LQsize2)*f=7.1);
    table iter_num, (all reclass_dum diffuse), N*f=7.0 median*((LQsize2)*f=7.1);
    title 'proc tabulate s>=.05, LQsal97>=2, use iter_num=10 for general model with speciality segment'; 
run;

/* Close the HTML destination opened at the top of the program. */
ods html close;