clear
set mem 500m
log using "c:\zunaid\bgd\mes2009\dofiles\BGD_MES2009_generate.log", text replace

* Stata datafile (employments.dta) obtained from Faiz'vi on 01 September 2010.
* The working folder is c:\zunaid\bgd\mes2009\data

*recoding the variables in each component
use c:\zunaid\bgd\mes2009\data\dataorig\employments.dta
use c:\zunaid\bgd\mes2009\data\dataorig\members_wms.dta
use c:\zunaid\bgd\mes2009\data\dataorig\WelfareWB.dta
*WelfareWB.dta in not included into the processed LM-MD dataset

* creating unique person identification number
use c:\zunaid\bgd\mes2009\data\dataorig\members_wms.dta
tab MPSU
tab MQSL
tab MMSL
gen psu2 = 1000+MPSU
gen hhnum = psu2*100+MQSL
gen indnum = hhnum*100+MMSL
format indnum %12.0g
drop psu2 hhnum

sort indnum
save c:\zunaid\bgd\mes2009\data\datawaste\IND.dta, replace

use c:\zunaid\bgd\mes2009\data\dataorig\employments.dta
tab wPSU
tab wQSL
tab MMSL
gen psu2 = 1000+wPSU
gen hhnum = psu2*100+wQSL
gen indnum = hhnum*100+MMSL
format indnum %12.0g
drop psu2 hhnum

sort indnum
save c:\zunaid\bgd\mes2009\data\datawaste\EMP.dta, replace

*merging the component files
use c:\zunaid\bgd\mes2009\data\datawaste\IND.dta
merge indnum using "c:\zunaid\bgd\mes2009\data\datawaste\EMP.dta"
tab _merge
save "c:\zunaid\bgd\mes2009\data\dataproc\BGD_MES_2009_2009.dta", replace
* NOTE: merge command notes "variable indnum does not uniquely identify observations in the master data; variable indnum does not uniquely identify observations in c:\zunaid\bgd\mes2009\data\datawaste\EMP.dta" but could not find any dplicate individual id

* dropping variables of the Health Module
drop Q4_2 Q4_3 Q4_4 Q4_5 Q4_6 Q4_7 Q4_8 Q4_9 Q4_10 Q4_11 Q4_12 Q4_13 Q4_14


* creating LMMD variables

generate _SAR_VAR = .

generate COUNTRY = "BGD"
tab  COUNTRY

generate YEAR = 2009
tab  YEAR

generate HID = houseid
*tab HID

generate INDID = indnum
*tab INDID

tab DivCode
generate REGION = DivCode
label define region 1 "Chittagong" 2 "Dhaka" 3 "Khulna" 4 "Rajshahi" 5 "Barisal" 6 "Sylhet"
label values REGION region
tab REGION

tab RMO
generate URBAN = .
replace URBAN = 1 if RMO == 2 | RMO == 4
replace URBAN = 0 if RMO == 1 | RMO == 3
label define urban 0 "Rural" 1 "Urban"
label values URBAN urban
tab URBAN
* checked with variable UR in the EMP dataset, matched

tab Q3_5
generate MALE = Q3_5
recode MALE 1=1 2=0
label define male 0 "Female" 1 "Male"
label values MALE male
tab MALE

tab Q3_4
generate AGEY = Q3_4
recode  AGEY 99=98
tab AGEY
tab AGEY Age_group
* checked with the created variable in EMP file

tab Q3_3
* this variable has extra codes outside the listed ones
sort HID INDID
by HID, sort: egen min_relation = min(Q3_3)
gen one = 1
by HID, sort: egen num_heads = count(one) if Q3_3 == 1
recode num_heads . = 0
by HID, sort: egen min_id = min(INDID)
gen HEAD = .
replace HEAD = 1 if Q3_3 == 1
replace HEAD = 0 if Q3_3 != 1
replace HEAD = 0 if min_relation >= 2 | num_heads > 1
drop  min_id num_heads one min_relation

tab HEAD Q3_3
generate HEAD_UNIQUE=.
* this dataset doesn't have multiple head in same HH

gen HHSIZE = HHSize
tab HHSIZE
* household size variable already clreated - unsual values 45 and 101 WILL BE CHECKED

generate ETHNICITY = .
* ethnicity info not collected 

generate LANGUAGE=.
* language info is not collected in this survey

generate RELIGION = .
* religion variable collected in the intro section, not available in the datasets

tab Q3_6
* Info on marital status was collected from all the members in the household

* Married polygamy and living together info not collected; the "others" category needs to be checked
generate MARSTAT = Q3_6
recode MARSTAT 1=1 2=2 3=5 4=6 5=7
label define marstat 1 "Never Married" 2 "Married Monog" 5 "Divorced/Separated" 6 "Widowed" 7 "Others"
label values MARSTAT marstat
tab MARSTAT
tab AGEY MARSTAT
tab MALE MARSTAT
* Others category has 31 females and 9 males - could be separated - WILL CHECK

gen _EDUCATION_ = .

tab Q3_8
* Education level info collected for ages 5 years and above
generate EDLEVEL = .
replace EDLEVEL = 0 if Q3_8 == 0 & AGEY < 5
replace EDLEVEL = 1 if Q3_8 == 1
* matching with age indicates that grade passed = primary possibly equals primary incomplete
replace EDLEVEL = 2 if Q3_8 == 2 
replace EDLEVEL = 3 if Q3_8 == 3 | Q3_8 == 4
replace EDLEVEL = 4 if Q3_8 == 5 | Q3_8 == 6
* NOTE: edu level has an additional code (7), which falls outside the categories
* NOTE: Observations of age <5 or coded as 7 are recoded as missing
label define edlevel 0 "No education" 1 "Primary" 2 "Junior Secondary" 3 "Senior Secondary" 4 "Pre-University and  Above"
label values EDLEVEL edlevel
tab EDLEVEL

gen EDYEARS = .
* data not collected on completed years of education, only the education level completed

gen EDLEVEL_DAVID = .
* MES does not provide info for separating "Some Primary, but not completed" and "Completed Primary" groups

gen CONEDLEVEL = .
* not enough info for creating this variable 

gen CONEDYEARS = .
* not enough info for creating this variable 

gen _EMPLOYMENT_ = .

tab Q3_13
tab Q3_14
tab LF1
tab Q3_13 LF1 
* Q3_13 and LF1 collects information on employment, but doesn't match fully - LF1 has recorded a number of employed under 15 yrs
* Q3.13  has 5 observations falling outside listed categories
* Q3.13 collected information on the activity in last 7 days
* Q3.13 did not collect information to construct discouraged category
* Q3.13 have two responses for Unemploed category - unemployed and looking for job
generate EMP_STAT = 4 if AGEY >= 15
replace EMP_STAT = 1 if Q3_13 == 1 | LF1 == 1
replace EMP_STAT = 2 if Q3_13 == 2 | Q3_13 == 3
tab  AGEY EMP_STAT
* A number of observations are under 15 years of age - recoded to missing
replace EMP_STAT = . if AGEY < 15
label define emp_stat 1 "Employed" 2 "Unemployed" 3 "Discouraged" 4 "Inactive"
label values EMP_STAT emp_stat
tab  EMP_STAT

tab LF3
tab Q3_13
tab LF3 EMP_STAT
generate WHYINACTIVE = Q3_13 if EMP_STAT==4
recode WHYINACTIVE 4 = 4 5 = 5 6 = 8 15 = . 25 = .
* here housewife recoded as "doing housework" and disabled recoded as "sick/incapable"
label define whyinactive 1 "Waiting for re-appointment" 2 "Waiting for joining" 3 "Discouraged" 4 "Studying" 5 "Doing housework" 6 "Retired" 7 "No need to work" 8 "Sick/incapable" 9 "Other"
label values WHYINACTIVE whyinactive
tab  WHYINACTIVE

generate EMPLOYED=(EMP_STAT==1)
replace EMPLOYED=. if EMP_STAT==.
tab  EMPLOYED

generate UNEMPLYD=(EMP_STAT==2)
replace UNEMPLYD=. if EMP_STAT==.
tab UNEMPLYD

generate DISCRGD=.

generate INACTIVE=(EMP_STAT==4)
replace INACTIVE=. if EMP_STAT==.
tab INACTIVE

* Bangladesh Standard Industrial Classifications (BSIC, Rev 3), which is developed in conformity with the International Standard Industrial Classification (ISIC) Rev.-3, are used from http://unstats.un.org/unsd/cr/registry/regcst.asp?Cl=17 and http://unstats.un.org/unsd/class/intercop/training/escap08/escap08-20.PDF

tab Q3_14
tab LF4
tab LF5
* Q3_14 collects information with a created industry/occupation classification
* as per the questionnaire, LF4 was supposed to contain 2 digit level codes as per BSIC 2001 (BSIC2001.pdf)
* but the variable has only values ranging from 0 to 5 (and 65, 93) and LF5 appears top contain industry info
* for SECTOR_MAIN, variable LF5 was used 

generate SECTOR_MAIN = .
replace SECTOR_MAIN=1 if Q3_14 >= 1 & Q3_14 <= 5
replace SECTOR_MAIN=2 if Q3_14 >= 10 & Q3_14 <= 14
replace SECTOR_MAIN=3 if Q3_14 >= 15 & Q3_14 <= 37
replace SECTOR_MAIN=4 if Q3_14 >= 40 & Q3_14 <= 41
replace SECTOR_MAIN=5 if Q3_14 == 45
replace SECTOR_MAIN=6 if Q3_14 >= 50 & Q3_14 <= 55
replace SECTOR_MAIN=7 if Q3_14 >= 60 & Q3_14 <= 64
replace SECTOR_MAIN=8 if Q3_14 >= 65 & Q3_14 <= 74
replace SECTOR_MAIN=9 if Q3_14 == 75
replace SECTOR_MAIN=10 if Q3_14 >= 80 & Q3_14 <= 99
label define sector_main 1 "Agriculture & fishing" 2 "Mining" 3 "Manufacturing" 4 "Electricity & utilities" 5  "Construction" 6 "Commerce" 7"Transportation, storage & communication" 8 "Financial, insurance & real estate" 9  "Public administration" 10 "Other services"
label values SECTOR_MAIN sector_main
tab  SECTOR_MAIN

tab LF6
* 2-digit ISCO-88 codes (Meeting with Ainul Kabir on 04 Nov 2010) are used from ISCO.pdf
generate OCC_MAIN=0
replace OCC_MAIN = . if EMPLOYED == 0 | EMPLOYED == .
replace OCC_MAIN=1 if LF6 >= 11 & LF6 <= 13
replace OCC_MAIN=2 if LF6 >= 2111 & LF6 <= 2460
replace OCC_MAIN=3 if LF6 >= 3111 & LF6 <= 3480
replace OCC_MAIN=4 if LF6 >= 4111 & LF6 <= 4223
replace OCC_MAIN=5 if LF6 >= 5111 & LF6 <= 5230
replace OCC_MAIN=6 if LF6 >= 6111 & LF6 <= 6210
replace OCC_MAIN=7 if LF6 >= 7111 & LF6 <= 7442
replace OCC_MAIN=8 if LF6 >= 8111 & LF6 <= 8340
replace OCC_MAIN=9 if LF6 >= 9111 & LF6 <= 9333
replace OCC_MAIN=10 if LF6 == 110
label define occ_main 1 "Legislators, senior officials & managers" 2 "Professionals" 3 "Technicians & associated  professionals" 4 "Clerks" 5 "Service workers & shop & market sales" 6 "Skilled agricultural and fishery workers"  7"Craft & related trades" 8 "Plant & machine operators & assemblers" 9 "Elementary occupations" 10 "Armed forces" 0 "Other/unspecified"
label values OCC_MAIN occ_main
tab OCC_MAIN EMPLOYED
tab OCC_MAIN

tab Q3_14
tab LF11
generate PUBLIC = LF11
recode PUBLIC 0=. 1=1 2=0 3=0 4=0
tab PUBLIC
replace PUBLIC = . if EMPLOYED == 0
tab PUBLIC

tab LF10
generate EMPTYPE_MAIN = .
replace EMPTYPE_MAIN=1 if LF10==1 | LF10==5 | LF10==6 
replace EMPTYPE_MAIN=2 if LF10==2
replace EMPTYPE_MAIN=3 if LF10==3
replace EMPTYPE_MAIN=4 if LF10==4
replace EMPTYPE_MAIN=. if LF10==10
label define emptype_main 1 "Wage & salaried worker" 2 "Employer" 3 "Individual self-employed worker" 4 "Household  enterprise worker"
label values EMPTYPE_MAIN emptype_main
tab EMPTYPE_MAIN

tab LF8
generate SECONDJOB = LF8
recode  SECONDJOB 0=0 1=1 2=0 
replace SECONDJOB = . if EMPLOYED == .
tab  SECONDJOB

generate NUMJOBS12MO = .
* BGD MES 2009 did not collect info on this

generate AG_WRK_MAIN=(SECTOR_MAIN==1)
replace  AG_WRK_MAIN=. if  SECTOR_MAIN==.
tab  AG_WRK_MAIN

tab LF7
tab LF9

* 0.10% and 0.01% respondents respectively stated the working hours for main and secondary jobs to exceed 96 per week
generate main_wrkhr = LF7
generate tot_wrkhr = LF7 + LF9
generate wrkhr_prop_m=main_wrkhr/tot_wrkhr
* tab wrkhr_prop_m
generate HOURWRKMAIN= main_wrkhr*52
replace HOURWRKMAIN = 96* wrkhr_prop_m*52 if  tot_wrkhr>= 96
* tab  HOURWRKMAIN

generate HOURWRKTOT=tot_wrkhr*52
replace HOURWRKTOT = 4992 if tot_wrkhr >= 96 
* tab  HOURWRKTOT

generate HOURWRKTOT_mon = tot_wrkhr*4.33
replace HOURWRKTOT_mon = 384 if tot_wrkhr >= 96
* tab  HOURWRKTOT_mon

generate HOURWRKTYPE = 1
replace  HOURWRKTYPE=. if  HOURWRKTOT==.
tab  HOURWRKTYPE
drop  tot_wrkhr main_wrkhr wrkhr_prop_m

gen _INCOME_ = .

gen reg_deflator = 1
replace reg_deflator = 1.03 if URBAN == 1
tab reg_deflator
* regional price deflator was created using 2007/08 CPI of urban and rural areas (file: CPI_January-09.pdf)

generate INCOME_MAIN_def = .
* info on individual income for main or secondary job is only available for day labour and paid worker
* original question LF11 (how do you get the salary/wages?) is not recorded
* reference period is one week
tab LF12
tab LF13
generate inc_tot = LF12 + LF13 
replace INCOME_MAIN_def = (inc_tot*52)/reg_deflator if inc_tot != 0

foreach v in INCOME_MAIN_def {
qui summ `v', d
  replace `v' = r(p50) if !inrange(`v', r(p50) - 3 * r(sd), r(p50) + 3*r(sd)) & !mi(`v')
}


generate INCOME_TOT_def = INCOME_MAIN_def

generate NONLBRINC_def = .
* info on non-labour income not collected

egen HHINCOME_TOT_def = sum(inc_tot), by(HID)
tab HHINCOME_TOT_def
drop  inc_tot

*outlier correction
foreach v in HHINCOME_TOT_def {
qui summ `v', d
  replace `v' = r(p50) if !inrange(`v', r(p50) - 3 * r(sd), r(p50) + 3*r(sd)) & !mi(`v')
}

generate XINCOME_MAIN_def = .
generate XINCOME_TOT_def = .
generate XNONLBRINC_def = .
generate IMP_FM_RENT_def = .
* above variables are not relevant for BGD, as info on implicit land rental cost not collected

gen _CONSUMPTION_ = .
* No consumption information collected

generate TOTCONS_def = .

generate CONS_PC_def = .

generate adeq = 1 if HEAD==1
replace adeq = 0.5 if AGEY>=15 & mi(adeq)
replace adeq = 0.3 if AGEY<15 & mi(adeq)
egen ADEQ = sum(adeq), by(HID)
tab ADEQ
drop adeq
* reversed the sequence of ADEQ and CONS_PEQA_def

generate CONS_PEQA_def = .

generate REGPLINE_ann = .
replace REGPLINE_ann = 832*12 if URBAN == 0
replace REGPLINE_ann = 949*12 if URBAN == 1

generate EXTPLINE_ann = .
replace EXTPLINE_ann = 707*12 if URBAN == 0
replace EXTPLINE_ann = 747*12 if URBAN == 1
* poverty lines are calculated as average for urban and rural from HIES 2005 report (page 160, poverty lines.pdf)

generate ppp05deflator = 25.49
* please check this value (source: Scoring_Poverty_Bangladesh_2005_EN.pdf)

generate INTPLINE_ann = 1.25*ppp05deflator*365
tab INTPLINE_ann

generate STRATA = .
* stratification done in two stages (p.13)

generate PSU = MPSU

generate WEIGHT = WeightedVal
* weights are only available for working-age population

generate SPATIALDEF = reg_deflator
drop reg_deflator

generate YEAR_def = 2005

generate PPP05DEFLATOR = ppp05deflator
drop ppp05deflator
tab PPP05DEFLATOR

foreach v in INCOME_MAIN INCOME_TOT NONLBRINC TOTCONS CONS_PC CONS_PEQA HHINCOME_TOT XINCOME_MAIN XINCOME_TOT  XNONLBRINC{
gen `v'_PPP05 = `v'_def/PPP05DEFLATOR
}

foreach v in REGPLINE_ann EXTPLINE_ann{
gen `v'_PPP05 = `v'/PPP05DEFLATOR
}

generate _SKILLS_=.

generate TRAINING = .
* no info collected on training

generate EDLEVEL_VT = .
* no info collected on vocational training

generate VT_CATEG = .
* no info collected on vocational training

generate SCHOOL_LEAVE = .
* no info collected on vocational training

generate CURRENT_ATTEND = .
* no info collected on vocational training

generate DURATION_UNEMP = .
* info not available

generate CASUAL_OR_WAGE = .

egen NO_ADULT = sum(AGEY >= 15 & AGEY <= 64), by(HID)
tab NO_ADULT

egen NO_CHILDREN = sum(AGEY >= 0 & AGEY <= 14), by(HID)
tab NO_CHILDREN

egen NO_ELDERLY = sum(AGEY >= 65 & AGEY < .), by(HID)
tab NO_ELDERLY 

generate ENROL_CHILDREN = .

gen SCHOOL_DIST = .

generate PENSION = . 

generate PENSION_INCOME = .
*info not available

generate CONTRIBUTORY_HEALTH = .
* no info on health insurance

xtile HHINCOME_DECILE =  HHINCOME_TOT_def, nq(10)
tab HHINCOME_DECILE

gen PCHHINCOME_TOT_def = HHINCOME_TOT_def / HHSIZE
*tab PCHHINCOME_TOT_def

xtile DEC_PCHHINCOME_TOT_def =  PCHHINCOME_TOT_def, nq(10)
tab  DEC_PCHHINCOME_TOT_def 

generate PC_CONSUMPTION = .
* no info collected on consumption

generate DECILES_PC_CONSUMPTION =  .

gen TRANSFER_TYPE = .

gen TENURE = .

**************** labelling the created variables
label var COUNTRY 		"Country Name / Code"
label var YEAR 			"Survey Year"
label var HID 			"Household Identifier"
label var INDID			"Individual Identifier"
label var REGION                "Region / Province"
label var URBAN                 "Urban or rural location"
label var MALE                  "Gender"
label var AGEY                  "Age in Completed years"
label var HEAD                  "Household head"
label var HEAD_UNIQUE   	"Household head unique"
label var HHSIZE                "Household size"
label var ETHNICITY             "Ethnicity"
label var LANGUAGE              "Language"
label var RELIGION              "Religion"
label var MARSTAT               "Marital Status"
label var EDLEVEL               "Level of education"
label var EDYEARS               "Years of education"
label var CONEDLEVEL     	"Constructed level of education"
label var CONEDYEARS		"Constructed years of education"
label var EMP_STAT              "Employment Status"
label var WHYINACTIVE 	 	"Reasons for inactivity"
label var EMPLOYED              "Employed"
label var UNEMPLYD              "Unemployed"
label var DISCRGD               "Discouraged"
label var INACTIVE              "Inactive"
label var SECTOR_MAIN   	"Int Stand Ind Class-Main Job-expanded"
label var OCC_MAIN              "Int Stand Class Occ-Main Job-ISCO88"
label var PUBLIC                "Public employee"
label var EMPTYPE_MAIN          "Work category for main job"
label var SECONDJOB             "Second job"
label var NUMJOBS12MO   	"Num of jobs worked last 12 mon"
label var AG_WRK_MAIN           "Agricultural job"
label var HOURWRKMAIN           "Hrs wrk by ind main job (annual)"
label var HOURWRKTOT            "Hrs wrk by ind in all jobs (ann)"
label var HOURWRKTYPE           "Type of hours worked"
label var INCOME_MAIN_def       "Def annual income fr main job"
label var INCOME_TOT_def        "Def annual income fr all jobs"
label var NONLBRINC_def         "Def annual non labor income-HH"
label var HHINCOME_TOT_def      "Def annual tot income of HH in year"
label var XINCOME_MAIN_def      "Def totai income at HH in year"
label var XINCOME_TOT_def       "Adjust Def annual income fr ind main job"
label var XNONLBRINC_def        "Adjust Def annual non labor income HH"
label var IMP_FM_RENT_def       "Implicit farm rental value"
label var TOTCONS_def           "Def annual tot consumption HH"
label var CONS_PC_def           "Def annual tot consumption pc"
label var CONS_PEQA_def         "Def annual tot consumptin per adult equ"
label var ADEQ                  "HH sixe in adult equi"
label var REGPLINE_ann          "Regular poverty line, annual"
label var EXTPLINE_ann          "Extreme poverty line, annual"
label var INTPLINE_ann          "Int poverty line"
label var STRATA                "Sampling strata ID"
label var PSU                   "Primary sampling unit"
label var WEIGHT                "Household weights"
label var SPATIALDEF            "Regional deflator"
label var YEAR_def              "Deflator base year"
label var PPP05DEFLATOR         "PPP Def for consum in 2009"
label var INCOME_MAIN_PPP05     "PPP05 Def ann income from Main job"
label var INCOME_TOT_PPP05      "PPP05 Def ann income from all jobs"
label var NONLBRINC_PPP05       "PPP05 Def ann non labor income HH"
label var TOTCONS_PPP05 	"PPP05 Def ann tot consum for HH"
label var CONS_PC_PPP05 	"PPP05 Def ann tot consum per cap"
label var CONS_PEQA_PPP05       "PPP05 Def ann tot consum per adul equi"
label var HHINCOME_TOT_PPP05    "PPP05 Def tot income at HH in year"
label var XINCOME_MAIN_PPP05    "PPP05 Adjs Def ann income from main job"
label var XINCOME_TOT_PPP05     "PPP05 Adjs Def ann income from all jobs"
label var XNONLBRINC_PPP05      "PPP05 Adjs Def ann non labor income HH"
label var REGPLINE_ann_PPP05    "PPP05 Def regular poverty line, annual"
label var EXTPLINE_ann_PPP05    "PPP05 Def ann extreme poverty line"
label var TRAINING              "If vocational/tech training"
label var EDLEVEL_VT            "Education categories"
label var VT_CATEG              "Vocational/Tech edu"
label var SCHOOL_LEAVE          "School leaving age"
label var CURRENT_ATTEND        "Attendance status" 
label var DURATION_UNEMP        "Duration of unemployment"
label var CASUAL_OR_WAGE        "Casual or wage work"
label var NO_ADULT              "Number of adults"
label var NO_CHILDREN           "Number of children"
label var NO_ELDERLY            "Number of elderly"
label var ENROL_CHILDREN        "Children present Currently attending to school"
label var SCHOOL_DIST		"Distance of nearest school"
label var PENSION               "Whether enrolled in a formal pension sys"
label var PENSION_INCOME        "Money received from pension"
label var CONTRIBUTORY_HEALTH   "Whether enrolled in a contri health insurance"
label var HHINCOME_DECILE       "Income deciles"
label var PCHHINCOME_TOT_def    "Per capita hh total income Def"
label var DEC_PCHHINCOME_TOT_def "Per capita hh income deciles"
label var PC_CONSUMPTION        "Per capita consumption"
label var DECILES_PC_CONSUMPTION "Deciles of per capita consumption"
label var TRANSFER_TYPE         "Transfer type"
label var TENURE                "Tenure"

save "c:\zunaid\bgd\mes2009\data\dataproc\BGD_MES_2009_2009.dta", replace
log close

set more 0
