--- title: "Reall - Data Processing and Analysis" output: word_document: default html_document: default date: "2024-06-18" editor_options: chunk_output_type: console --- ```{r setup, include=FALSE} # Load necessary libraries and set options library(foreign) library(dplyr) library(haven) library(tidyverse) library(readxl) library(ggplot2) library(xml2) library(haven) library(tidyr) library(stringr) library(purrr) knitr::opts_chunk$set(echo = TRUE, results = "hide", message = FALSE, warning = FALSE) ``` ```{r} # Function to calculate group averages based on quantiles calculate_group_averages <- function(data, quantile_column, group_column) { data %>% arrange(!!sym(quantile_column)) %>% mutate( quantile_group = cut_number(!!sym(quantile_column), n = 5, labels = c("20", "40", "60", "80", "95")) ) %>% group_by(Location, quantile_group) %>% summarise( avg_HH_size = mean(HH_size, na.rm = TRUE), avg_NoEarners = mean(NoEarners, na.rm = TRUE) ) %>% pivot_wider(names_from = quantile_group, values_from = c(avg_HH_size, avg_NoEarners), names_prefix = "percentile_") } # Function to calculate quantiles and related statistics calculate_quantiles <- function(data, quantiles) { do.call(rbind, lapply(quantiles, function(q) { data.frame( Quantile = q * 100, # Convert quantile probability to percentage HH_exp = quantile(data$HH_exp, probs = q, na.rm = TRUE), no_of_earners = median(data$no_of_earners[data$HH_exp <= quantile(data$HH_exp, probs = q, na.rm = TRUE)], na.rm = TRUE), HH_size = median(data$HH_size[data$HH_exp <= quantile(data$HH_exp, probs = q, na.rm = TRUE)], na.rm = TRUE), Percent_Income_Spent_on_Housing = median(data$Percent_Income_Spent_on_Housing[data$HH_exp <= quantile(data$HH_exp, probs = q, na.rm = TRUE)], na.rm = TRUE) ) })) } ``` India 2018 ```{r} # Load datasets for India LO1_dataset <- read_dta('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India/R76120L01-Identification of sample household.dta') LO2_dataset <- read_dta('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India/R76120L02-Demographic and other particulars of household members.dta') LO3_dataset <- read_dta('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India/R76120L03-Household characteristics.dta') india_codes <- read_xlsx('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India/India codes.xlsx') Appendix_II <- read_xlsx('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India/Appendix II.xlsx') LO7_dataset <- read_dta('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India/R76120L07-Particulars of the dwelling of the households living in houses.dta') # Read the XML file (replace 'your_file.xml' with your actual file path) xml_data <- read_xml("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India/NSS76CH1dot2DRINKING.xml") # Convert necessary columns to integers data <- LO3_dataset %>% mutate( Sector = as.integer(Sector), NSS_Region = as.integer(NSS_Region), District = as.integer(District), HH_size = as.integer(HH_size), norm_weight = as.integer(Multiplier/100) ) ``` India Data Processing ```{r} # Calculate the number of earners per household LO2_dataset <- LO2_dataset %>% mutate(is_earner = ifelse(nco2digitcode > 0, 1,0)) %>% #ifelse(is.na(nco2digitcode), 0, 1)) %>% group_by(HHID) %>% mutate(no_of_earners = sum(is_earner)) %>% ungroup() # Filter and merge datasets LO7_dataset <- LO7_dataset %>% filter(montly_rent != 0) data <- data %>% mutate(urban_rural = ifelse(Sector == 2, "Urban", "Rural")) %>% left_join(LO2_dataset[c('HHID', 'no_of_earners')], by = "HHID") %>% left_join(LO7_dataset[c("HHID", "montly_rent")], by = "HHID") %>% #left_join(india_codes[c('state name', 'regional office name', 'NSS_Region')], by = c('NSS_Region')) %>% left_join(Appendix_II[c('state_name', 'district_name', 'district_code','nss_code')], by = c('NSS_Region'='nss_code', 'District' = 'district_code'), ) %>% mutate(Percent_Income_Spent_on_Housing = (montly_rent / Total_Monthly_expenditure) * 100) %>% distinct() %>% drop_na(HHID) # Rename columns for consistency data <- data %>% rename(HH_exp = Total_Monthly_expenditure, HH_size = HH_size, state = 'state_name', City = 'district_name') sample_size_india <- data %>% group_by(City, state, urban_rural, Year = 2018) %>% summarise(sample_size = n()) sample_size_india_u_r <- data %>% group_by(City, state, urban_rural = "All", Year = 2018) %>% summarise (sample_size = n()) all_data_india <- data %>% uncount(weights = norm_weight) ``` India Quantile Calculation ```{r} # Calculate quantiles for each location quantile_probs <- seq(0.01, 0.99, by = 0.01) india_location_quantiles <- all_data_india %>% group_by(Country = "India", urban_rural, Location = state, City, Year = 2018) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() # Additional mixed urban/rural quantiles for each location quantile_probs <- seq(0.01, 0.99, by = 0.01) india_location_quantiles_u_r <- all_data_india %>% group_by(Country = "India", urban_rural = "All", Location = state, City, Year = 2018) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() ``` India 2022 ```{r} #HCES_22_lvl14 <- read.delim('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India//HCES 22-23/hces22_lvl_14.txt') HCES_22_lvl15 <- read.delim('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India/HCES 22-23/hces22_lvl_15.txt') Appendix_II <- read_xlsx('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/India/Appendix II.xlsx') Header <- data.frame ( Data = c("Datas")) #HCES_22_lvl14 <- rbind (Header, setNames(HCES_22_lvl14,names(Header))) %>% # filter(Data != "Datas") %>% # transform (HCES_22_lvl14, Common_ID = substr(Data,1,38), Questionnaire_No = substr(Data,39,39), Level = substr(Data,40,41), Section = substr(Data,42,46), Item_Code = substr(Data,47,49), Value = substr(Data,50,59), Multiplier = substr (Data,60,74)) HCES_22_lvl15 <- rbind (Header, setNames(HCES_22_lvl15,names(Header))) %>% filter(Data != "Datas") %>% transform (HCES_22_lvl15, Survey_Name = substr(Data,1,4), Year = substr (Data,5,8), FSU_Serial = substr (Data,9,13), Sector = substr (Data,14,14), State = substr (Data,15,16), NSS_Region = as.integer(substr (Data,17,19)), District = as.integer(substr (Data,20,21)), Stratum = substr (Data,22,23), Sub_stratum = substr (Data,24,25), Panel = substr (Data,26,27), Sub_sample = substr (Data,28,28), FOD_Sub_Region = substr (Data,29,32), Sample_SU_No = substr (Data,33,34), Sample_Sub_No = substr (Data,35,35), Second_stage_No = substr (Data,36,36), Sample_HH_No = substr (Data,37,38), Questionnaire_No = substr(Data,39,39), Level = substr(Data,40,41), Section = substr(Data,42,43), Time_Taken = substr(Data,44,46), HH_Consumption = as.integer(substr(Data,47,54)), Total_Online = substr (Data,55,62), Informant_Code = substr (Data,63,64), Response_Code = substr (Data,65,65), Household_Size = as.integer(substr (Data,66,68)), Multiplier = as.double(substr (Data, 69, 83))) %>% mutate(HHID = paste (Sector,NSS_Region,District,Stratum,Sub_stratum,Sample_HH_No))%>% filter (HH_Consumption > 0) #%>% HCES_22_lvl15_mean <- HCES_22_lvl15 %>% group_by (HHID) %>% summarise (Mean_HH_Size = mean(as.double(Household_Size)), Mean_HH_Consumption = mean(HH_Consumption)) %>% ungroup() HCES_22 = select(HCES_22_lvl15,-1:-5,-10:-18,-20:-27)%>% filter(Questionnaire_No == 'F') %>% mutate(urban_rural = ifelse(Sector == 2, "Urban", "Rural"), no_of_earners = NA, Percent_Income_Spent_on_Housing = NA) %>% left_join(HCES_22_lvl15_mean[c('HHID','Mean_HH_Size', 'Mean_HH_Consumption')], by = 'HHID') %>% left_join(Appendix_II[c('state_name', 'district_name', 'district_code','nss_code')], by = c('NSS_Region'='nss_code', 'District' = 'district_code'), ) # Rename columns for consistency HCES_22 <- HCES_22 %>% rename(HH_exp = Mean_HH_Consumption, HH_size = Mean_HH_Size, state = 'state_name', City = 'district_name') sample_size_india_2022 <- HCES_22 %>% group_by(City, state, urban_rural, Year = 2022) %>% summarise(sample_size = n()) sample_size_india_u_r_2022 <- HCES_22 %>% group_by(City, state, urban_rural = "All", Year = 2022) %>% summarise (sample_size = n()) all_data_india_2022 <- select(HCES_22,-1:-5) %>% uncount(weights = as.integer(Multiplier/1000)) # Calculate quantiles for each location india_location_quantiles_2022 <- all_data_india_2022 %>% group_by(Country = "India", urban_rural, Location = state, City, Year = 2022) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() quantile_probs <- seq(0.01, 0.99, by = 0.01) india_location_quantiles_u_r_2022 <- all_data_india_2022 %>% group_by(Country = "India", urban_rural = "All", Location = state, City, Year = 2022) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() ``` Uganda ```{r} # Load datasets for Uganda gsec1_dataset <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Uganda/gsec1.csv") gsec8_dataset <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Uganda/gsec8.csv") gsec15c_dataset <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Uganda/gsec15c.csv") pov19_20_dataset <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Uganda/pov2019_20.csv") sub_region_codes <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Uganda/Uganda_regions_categories.csv") ``` Uganda Data Processing ```{r} # Data processing and merging gsec1_dataset <- gsec1_dataset %>% mutate(urban_rural = ifelse(urban == 0, "Rural", "Urban"), region_value = case_when( region == 1 ~ "Central", region == 2 ~ "Eastern", region == 3 ~ "Northern", region == 4 ~ "Western", TRUE ~ "Unknown" )) housing_expenses <- gsec15c_dataset %>% filter(CEC02 %in% c(301)) %>% group_by(hhid) %>% summarise(Housing_Expenditure = sum(CEC05, na.rm = TRUE)) merged_dataset <- pov19_20_dataset %>% left_join(gsec1_dataset[c('hhid', 'district', 'wgt', "urban_rural", "region_value")], by = "hhid") %>% left_join(sub_region_codes, by = "subreg") %>% left_join(housing_expenses, by = "hhid") %>% drop_na(Housing_Expenditure) %>% mutate( Percent_Income_Spent_on_Housing = ifelse(urban_rural == "Urban", 20.3, 15.5) ) # Calculate sample size for Uganda sample_size_uganda <- merged_dataset %>% group_by(region_value, urban_rural, subreg_value, Year = 2020) %>% summarise(sample_size = n()) sample_size_uganda_u_r <- merged_dataset %>% group_by(region_value, subreg_value, urban_rural = "All", Year = 2020) %>% summarise(sample_size = n()) # Calculate the number of earners per household gsec8_dataset <- gsec8_dataset %>% mutate(is_earner = ifelse(s8q14 == 1, 1, 0)) %>% group_by(hhid) %>% mutate(no_of_earners = sum(is_earner)) %>% ungroup() merged_dataset <- merged_dataset %>% left_join(gsec8_dataset[c('hhid', 'no_of_earners')], by = "hhid") %>% distinct() %>% mutate(wgt = ifelse(is.na(wgt), 1, wgt)) %>% uncount(weights = as.integer(wgt/100)) all_data_uganda <- merged_dataset %>% rename(HH_exp = nrrexp30, HH_size = hsize) ``` Uganda Quantile Calculation ```{r} # Calculate quantiles for each location uganda_location_quantiles <- all_data_uganda %>% group_by(Country = "Uganda", urban_rural, Location = region_value,City = subreg_value, Year = 2020) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() # Additional mixed urban/rural quantiles for each location quantile_probs <- seq(0.01, 0.99, by = 0.01) uganda_location_quantiles_u_r <- all_data_uganda %>% group_by(Country = "Uganda", urban_rural = "All", Location = region_value, City = subreg_value, Year = 2020) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() ``` Nigeria ```{r} # Load datasets for Nigeria sec4a_dataset <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Nigeria/sect4a1_labour.csv") totcons_dataset <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Nigeria/totcons.csv") state_codes <- read_xlsx("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Nigeria/Nigeria states.xlsx", sheet = "States") lga_codes <- read_xlsx("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Nigeria/Nigeria states.xlsx", sheet = "Lga") ``` Nigeria Data Processing ```{r} # Calculate the number of earners per household sec4a_dataset <- sec4a_dataset %>% mutate( s04aq04 = replace_na(s04aq04, 2), s04aq48 = replace_na(s04aq48, 2), s04aq06 = replace_na(s04aq06, 2), s04aq09 = replace_na(s04aq09, 2), s04aq11 = replace_na(s04aq11, 3) ) %>% mutate(is_earner = ifelse(s04aq04 == 1 | s04aq06 == 1 | s04aq09 == 1 | s04aq11 == 1 | s04aq11 == 2 | s04aq48 == 1, 1, 0)) %>% group_by(hhid) %>% mutate(no_of_earners = sum(is_earner)) %>% ungroup() # Prepare total consumption data totcons_summary <- totcons_dataset %>% mutate( Household_monthly = (totcons_pc / 12) * hhsize, Percent_Income_Spent_on_Housing = (rent33 / totcons_pc) * 100, urban_rural = ifelse(sector == 1, "Urban", "Rural") ) %>% left_join(sec4a_dataset[c('hhid', 'no_of_earners')], by = "hhid") %>% left_join(state_codes, by = "state") %>% left_join(lga_codes, by = "lga") %>% distinct() sample_size_nigeria <- totcons_summary %>% group_by(state_value, urban_rural,lga_value, Year = 2019) %>% summarise(sample_size = n()) sample_size_nigeria_u_r <- totcons_summary %>% group_by(state_value, urban_rural = "All",lga_value, Year = 2019) %>% summarise(sample_size = n()) # Calculate quantiles for each location all_data_nigeria <- totcons_summary %>% uncount(weights = as.integer(wt_final / 100)) %>% rename(HH_exp = Household_monthly, HH_size = hhsize) ``` Nigeria Quantile Calculation ```{r} nigeria_location_quantiles <- all_data_nigeria %>% group_by(Country = "Nigeria", urban_rural, Location = state_value,City = lga_value ,Year = 2019) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() # Additional mixed urban/rural quantiles for each location quantile_probs <- seq(0.01, 0.99, by = 0.01) nigeria_location_quantiles_u_r <- all_data_nigeria %>% group_by(Country = "Nigeria", urban_rural = "All", Location = state_value, City = lga_value, Year = 2019) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() ``` Pakistan ```{r} # Load datasets for Pakistan sec00_dataset <- read.spss("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Pakistan/sec 00.sav", to.data.frame = TRUE) sec1a_dataset <- read.spss("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Pakistan/sec 1a.sav", to.data.frame = TRUE) sec12A_dataset <- read.spss("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Pakistan/sec 12A.sav", to.data.frame = TRUE) sec12CE_dataset <- read.spss("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Pakistan/sec 12CE.sav", to.data.frame = TRUE) sec6a_dataset <- read.spss("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Pakistan/sec 6a.sav", to.data.frame = TRUE) weight_dataset <- read.spss("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Pakistan/weight.sav", to.data.frame = TRUE) codes <- read_xlsx("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Pakistan/Pakistan Codes.xlsx") perc_income <- read_xlsx("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Pakistan/pakistan_income_data.xlsx") ``` Pakistan Data Processing ```{r} # Data processing for income and earners sec1a_dataset <- sec1a_dataset %>% group_by(hhcode) %>% summarise(HHsize = n()) %>% #changed from mutate ungroup() weight_dataset <- weight_dataset %>% rename( PSU = psu ) sec12A_dataset <- sec12A_dataset %>% filter(idc != 99) %>% mutate(is_earner = ifelse(s12aq08 != 0, 1, 0)) %>% group_by(hhcode) %>% summarise(no_of_earners = sum(is_earner)) %>% #changed from mutate ungroup() sec12CE_dataset <- sec12CE_dataset %>% mutate(monthly_income = ifelse(ratio_lrg == "Yes", t_income / 12, t_income1 / 12)) # Prepare housing data and merge with total expenditure house_rent <- sec6a_dataset %>% filter(itc == "House Rent ") %>% select(hhcode, PSU, House_Rent = v1) total_expenditure <- sec6a_dataset %>% filter(itc == "5000") %>% select(hhcode, PSU, Total_Expenditure = v1) result <- left_join(house_rent, total_expenditure, by = "hhcode") %>% drop_na(House_Rent) # Merge all datasets for Pakistan merged_dataset <- left_join(sec12CE_dataset, weight_dataset, by = "PSU") %>% left_join(sec1a_dataset[c('hhcode', 'HHsize')], by = "hhcode") %>% left_join(sec12A_dataset[c('hhcode', 'no_of_earners')], by = "hhcode") %>% left_join(sec00_dataset[c('hhcode', 'Q05')], by = "hhcode") %>% mutate( province = as.integer(substr(PSU, 1, 1)), division = as.integer(substr(PSU, 2, 3)), region = as.integer(substr(PSU, 4, 4)) ) %>% left_join(codes, by = c('province', "division", "region")) %>% left_join(perc_income, by = c('province_name', "Urban_rural")) %>% group_by(hhcode) %>% # added these 2 lines filter (row_number() == 1) #distinct() #replaced with filter() above merged_dataset <- merged_dataset %>% mutate(Urban_rural = case_when( Urban_rural == "urban" ~ "Urban", Urban_rural == "rural" ~ "Rural", TRUE ~ Urban_rural # Leave other values unchanged )) sample_size_pakistan <- merged_dataset %>% group_by(province_name, Urban_rural, district_name, Year = 2019) %>% summarise(sample_size = n()) #%>% #filter(province_name != "ICT") sample_size_pakistan_u_r <- merged_dataset %>% group_by(province_name, Urban_rural ="All", district_name, Year = 2019) %>% summarise(sample_size = n()) #%>% #filter(province_name != "ICT") all_data_pakistan <- merged_dataset %>% uncount(weights = as.integer(weight / 100)) %>% rename(HH_exp = monthly_income, HH_size = HHsize) ``` Pakistan Quantile Calculation ```{r} # Calculate quantiles for each location pakistan_location_quantiles <- all_data_pakistan %>% group_by(Country = "Pakistan", urban_rural = Urban_rural, Location = province_name,City =district_name, Year = 2019) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() # Additional mixed urban/rural quantiles for each location quantile_probs <- seq(0.01, 0.99, by = 0.01) pakistan_location_quantiles_u_r <- all_data_pakistan %>% group_by(Country = "Pakistan", urban_rural = "All", Location = province_name, City = district_name, Year = 2019) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() ``` Kenya 2016 ```{r} # Load datasets for Kenya 2016 hh_dataset <- read_dta('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Kenya/2016/HH_Members_Information.dta') consumption_dataset <- read_dta('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Kenya/2016/Consumption_aggregate.dta') counties_code <- read.csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Kenya/kenya_counties.csv") ``` Kenya 2016 Data Processing ```{r} # Data processing and merging hh_dataset <- hh_dataset %>% #filter(d08 > 0) %>% # mutate (hhcode = paste (clid,hhid)) %>% mutate(hhcode = paste (clid,hhid) # d04_1 = replace_na(d04_1, 'G'), # d04_2 = replace_na(d04_2, 'G'), # d04_3 = replace_na(d04_3, 'G') ) %>% mutate ( d04_1 = ifelse(d04_1 == 'A', 1, ifelse(d04_1 == 'B', 1, ifelse(d04_1 == 'C', 1, ifelse(d04_1 == 'D', 1, ifelse(d04_1 == 'E', 1, ifelse(d04_1 == 'F', 1, 0)))))), d04_2 = ifelse(d04_2 == 'A', 1, ifelse(d04_2 == 'B', 1, ifelse(d04_2 == 'C', 1, ifelse(d04_2 == 'D', 1, ifelse(d04_2 == 'E', 1, ifelse(d04_2 == 'F', 1, 0)))))), d04_3 = ifelse(d04_3 == 'A', 1, ifelse(d04_3 == 'B', 1, ifelse(d04_3 == 'C', 1, ifelse(d04_3 == 'D', 1, ifelse(d04_3 == 'E', 1, ifelse(d04_3 == 'F', 1, 0)))))) ) %>% mutate(is_working = ifelse(d02_1 == 1,1, ifelse(d02_2 == 1,1, ifelse(d02_3 == 1,1, ifelse(d02_4 == 1,1,ifelse(d02_5 == 1 ,1, ifelse(d02_6 == 1, 1, 0))))))) %>% #mutate(wont_work = ifelse(d04_1 == 'G' | d04_2 == 'G' | d04_3 == 'G',1,0)) %>% mutate (d04 = d04_1 + d04_2 + d04_3) %>% mutate (is_earner = ifelse(d04 + is_working > 0,1,0)) %>% #mutate (is_earner = ifelse(is_working == 1 | wont_work == 0,1,0)) %>% group_by(hhcode) %>% #summarise(d08 = n()) %>% summarise(no_of_earners = sum(is_earner)) %>% ungroup() consumption_dataset <- consumption_dataset %>% mutate( HH_exp = padqexp * ctry_adq, Percent_Income_Spent_on_Housing = (padqrent / HH_exp) * 100, #no_of_earners = round(ctry_adq, digits = 0), removed this on 5/11/24 urban_rural = if_else(resid == 1, "Rural", "Urban"), hhcode = paste (clid,hhid) ) %>% #drop_na(padqrent) %>% left_join(counties_code, by = "county") %>% left_join(hh_dataset, by = c("hhcode")) %>% #rename(no_of_earners = is_earner) %>% #changed from d08 11/11/24 distinct() sample_size_kenya <- consumption_dataset %>% group_by(county_value, urban_rural, Year = 2016) %>% summarise(sample_size = n()) sample_size_kenya_u_r <- consumption_dataset %>% group_by(county_value, urban_rural ="All", Year = 2016) %>% summarise(sample_size = n()) all_data_kenya <- consumption_dataset %>% uncount(weights = as.integer(weight / 100)) %>% rename(HH_size = hhsize) ``` Kenya 2016 Quantile Calculation ```{r} # Calculate quantiles for each location kenya_location_quantiles <- all_data_kenya %>% group_by(Country = "Kenya", urban_rural, Location = county_value, Year = 2016) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() # Additional mixed urban/rural quantiles for each location quantile_probs <- seq(0.01, 0.99, by = 0.01) kenya_location_quantiles_u_r <- all_data_kenya %>% group_by(Country = "Kenya", urban_rural = "All", Location = county_value, Year = 2016) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() kenya_location_quantiles["City"] <- "All" kenya_location_quantiles_u_r ["City"] <- "All" ``` ```{r} # inflation_data <- read_csv("/Users/vicky/Desktop/Reall/inflation_pivot_data.csv") ``` ```{r} #median_inflation <- inflation_data %>% # group_by(Country) %>% # summarise(median_inflation_rate = median(Inflation_Rate, na.rm = TRUE)) #inflation_data_with_median <- inflation_data %>% # left_join(median_inflation, by = "Country") ``` Kenya 2020 ```{r} #Load Datasets for Kenya 2020 y2020_kenya_consumption <- read_dta('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Kenya/2020/consagg_microdata.dta') y2020_kenya_hh <- read_dta('/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Kenya/2020/households_microdata.dta') y2020_kenya_hh <- y2020_kenya_hh %>% #filter(d08 > 0) %>% # mutate (hhcode = paste (clid,hhid)) %>% mutate(hhcode = paste (clid,hhid) # d04_1 = replace_na(d04_1, 'G'), # d04_2 = replace_na(d04_2, 'G'), # d04_3 = replace_na(d04_3, 'G') ) y2020_kenya_consumption <- y2020_kenya_consumption %>% mutate( HH_exp = padqexp * adq_scale, Percent_Income_Spent_on_Housing = (padqrent / HH_exp) * 100, #no_of_earners = round(adq_scale, digits = 0), urban_rural = if_else(resid == 1, "Rural", "Urban"), hhcode = paste (clid,hhid), no_of_earners = NA )%>% left_join(counties_code, by = "county") %>% #left_join(y2020_kenya_hh, by = c("hhcode")) %>% distinct() sample_size_kenya_2020 <- y2020_kenya_consumption %>% group_by(county_value, urban_rural, Year = 2020) %>% summarise(sample_size = n()) sample_size_kenya_2020_u_r <- y2020_kenya_consumption %>% group_by(county_value, urban_rural ="All", Year = 2020) %>% summarise(sample_size = n()) all_data_kenya_2020 <- y2020_kenya_consumption %>% uncount(weights = as.integer(weight_hh / 100)) %>% rename(HH_size = hhsize) ``` Kenya 2020 Quantile Calculation ```{r} # Calculate quantiles for each location kenya_location_quantiles_2020 <- all_data_kenya_2020 %>% group_by(Country = "Kenya", urban_rural, Location = county_value, Year = 2020) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() # Additional mixed urban/rural quantiles for each location quantile_probs <- seq(0.01, 0.99, by = 0.01) kenya_location_quantiles_2020_u_r <- all_data_kenya_2020 %>% group_by(Country = "Kenya", urban_rural = "All", Location = county_value, Year = 2020) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() kenya_location_quantiles_2020["City"] <- "All" kenya_location_quantiles_2020_u_r ["City"] <- "All" ``` Final Summary and Export ```{r} # Define common columns for final summary common_columns <- c("Country", "urban_rural", "Location","City", "Year", "Quantile", "HH_exp", "no_of_earners", "HH_size", "Percent_Income_Spent_on_Housing") # Function to align columns across datasets align_columns <- function(df, common_cols) { df %>% mutate(across(setdiff(common_cols, colnames(df)), ~ NA)) %>% select(all_of(common_cols)) } # Align columns and combine all datasets india_location_quantiles <- align_columns(india_location_quantiles, common_columns) india_location_quantiles_u_r <- align_columns(india_location_quantiles_u_r, common_columns) uganda_location_quantiles <- align_columns(uganda_location_quantiles, common_columns) uganda_location_quantiles_u_r <- align_columns(uganda_location_quantiles_u_r, common_columns) nigeria_location_quantiles <- align_columns(nigeria_location_quantiles, common_columns) nigeria_location_quantiles_u_r <- align_columns(nigeria_location_quantiles_u_r, common_columns) pakistan_location_quantiles <- align_columns(pakistan_location_quantiles, common_columns) pakistan_location_quantiles_u_r <- align_columns(pakistan_location_quantiles_u_r, common_columns) kenya_location_quantiles <- align_columns(kenya_location_quantiles, common_columns) kenya_location_quantiles_u_r <- align_columns(kenya_location_quantiles_u_r, common_columns) kenya_location_quantiles_2020 <- align_columns(kenya_location_quantiles_2020, common_columns) kenya_location_quantiles_2020_u_r <- align_columns(kenya_location_quantiles_2020_u_r, common_columns) india_location_quantiles_2022 <- align_columns(india_location_quantiles_2022, common_columns) india_location_quantiles_u_r_2022 <- align_columns(india_location_quantiles_u_r_2022, common_columns) # Combine all country datasets into a single summary dataset summary_dataset <- bind_rows(india_location_quantiles, india_location_quantiles_u_r, india_location_quantiles_2022, india_location_quantiles_u_r_2022, pakistan_location_quantiles, pakistan_location_quantiles_u_r, uganda_location_quantiles, uganda_location_quantiles_u_r, nigeria_location_quantiles, nigeria_location_quantiles_u_r, kenya_location_quantiles, kenya_location_quantiles_u_r, kenya_location_quantiles_2020, kenya_location_quantiles_2020_u_r) ``` ```{r} sample_size_india <- sample_size_india %>% rename(Location = state, sample_size = sample_size) %>% mutate(Country = "India") sample_size_india_u_r <- sample_size_india_u_r %>% rename(Location = state, sample_size = sample_size) %>% mutate(Country = "India") sample_size_uganda <- sample_size_uganda %>% rename(Location = region_value, City = subreg_value, sample_size = sample_size) %>% mutate(Country = "Uganda") sample_size_uganda_u_r <- sample_size_uganda_u_r %>% rename(Location = region_value, City = subreg_value, sample_size = sample_size) %>% mutate(Country = "Uganda") sample_size_nigeria <- sample_size_nigeria %>% rename(Location = state_value,City = lga_value, sample_size = sample_size) %>% mutate(Country = "Nigeria") sample_size_nigeria_u_r <- sample_size_nigeria_u_r %>% rename(Location = state_value,City = lga_value, sample_size = sample_size) %>% mutate(Country = "Nigeria") sample_size_pakistan <- sample_size_pakistan %>% rename(Location = province_name,City=district_name, sample_size = sample_size, urban_rural = Urban_rural ) %>% mutate(Country = "Pakistan") sample_size_pakistan_u_r <- sample_size_pakistan_u_r %>% rename(Location = province_name,City=district_name, sample_size = sample_size, urban_rural = Urban_rural ) %>% mutate(Country = "Pakistan") sample_size_kenya <- sample_size_kenya %>% rename(Location = county_value, sample_size = sample_size) %>% mutate(Country = "Kenya") sample_size_kenya_u_r <- sample_size_kenya_u_r %>% rename(Location = county_value, sample_size = sample_size) %>% mutate(Country = "Kenya") sample_size_kenya_2020 <- sample_size_kenya_2020 %>% rename(Location = county_value, sample_size = sample_size) %>% mutate(Country = "Kenya") sample_size_kenya_2020_u_r <- sample_size_kenya_2020_u_r %>% rename(Location = county_value, sample_size = sample_size) %>% mutate(Country = "Kenya") sample_size_india_2022 <- sample_size_india_2022 %>% rename(Location = state, sample_size = sample_size) %>% mutate(Country = "India") sample_size_india_u_r_2022 <- sample_size_india_u_r_2022 %>% rename(Location = state, sample_size = sample_size) %>% mutate(Country = "India") sample_size_kenya["City"] <- "All" sample_size_kenya_u_r["City"] <- "All" sample_size_kenya_2020["City"] <- "All" sample_size_kenya_2020_u_r["City"] <- "All" # Combine the sample size tables into one ``` ```{r} # === Philippines 2023 Data Integration === # Load raw files ph_2023 <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Philippines/2023/FIES PUF 2023 Volume2 Household Summary.csv") region_lookup <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Philippines/2023/region_code_mapping_phil2023.xlsx") %>% mutate(W_REGN = as.numeric(W_REGN_VS1)) province_lookup <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Philippines/2023/province_code_mapping_phil2023.xlsx") %>% mutate(W_PROV = as.numeric(W_PROV_VS1)) # Clean dataset ph_2023_clean <- ph_2023 %>% mutate( W_REGN = as.numeric(W_REGN), W_PROV = as.numeric(W_PROV), HH_exp = as.numeric(TOTEX)/12, HH_size = as.numeric(FSIZE), no_of_earners = NA_real_, monthly_rent = as.numeric(ACTRENT), urban_rural = ifelse(URB == 1, "Urban", "Rural"), norm_weight = round(as.numeric(RFACT) / 100) ) %>% left_join(region_lookup, by = "W_REGN") %>% left_join(province_lookup, by = "W_PROV") %>% mutate( Region = case_when( Province %in% c("Batangas", "Cavite", "Laguna", "Quezon", "Rizal") ~ "Region IV-A - CALABARZON", Province %in% c("Marinduque", "Occidental Mindoro", "Oriental Mindoro", "Palawan", "Romblon") ~ "Region IV-B - MIMAROPA", TRUE ~ Region ), Location = Region, City = Province, Percent_Income_Spent_on_Housing = (monthly_rent / HH_exp) * 100 ) %>% drop_na(HH_exp, HH_size, monthly_rent) # Expand based on weight all_data_ph_2023 <- ph_2023_clean %>% uncount(weights = norm_weight) # Sample size sample_size_ph_2023 <- ph_2023_clean %>% group_by(Location, City, urban_rural, Year = 2023) %>% summarise(sample_size = n(), .groups = "drop") %>% mutate(Country = "Philippines") # Quantile computation quantile_probs <- seq(0.01, 0.99, by = 0.01) calculate_quantiles_simple <- function(data, quantiles) { do.call(rbind, lapply(quantiles, function(q) { data.frame( Quantile = q * 100, HH_exp = quantile(data$HH_exp, probs = q, na.rm = TRUE), no_of_earners = NA_real_, HH_size = median(data$HH_size[data$HH_exp <= quantile(data$HH_exp, probs = q, na.rm = TRUE)], na.rm = TRUE), Percent_Income_Spent_on_Housing = median(data$Percent_Income_Spent_on_Housing[data$HH_exp <= quantile(data$HH_exp, probs = q, na.rm = TRUE)], na.rm = TRUE) ) })) } ph_quantiles_2023 <- all_data_ph_2023 %>% group_by(Location, City, urban_rural) %>% group_modify(~ calculate_quantiles_simple(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Philippines", Year = 2023) # Merge with sample size ph_final_2023 <- ph_quantiles_2023 %>% left_join(sample_size_ph_2023, by = c("Country", "Location", "City", "urban_rural", "Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) # Urban + Rural Combined ph_quantiles_u_r_2023 <- all_data_ph_2023 %>% group_by(Location, City, urban_rural = "All") %>% group_modify(~ calculate_quantiles_simple(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Philippines", Year = 2023) sample_size_ph_u_r_2023 <- ph_2023_clean %>% group_by(Location, City, urban_rural = "All", Year = 2023) %>% summarise(sample_size = n(), .groups = "drop") %>% mutate(Country = "Philippines") # Final Combine ph_final_combined_2023 <- bind_rows( ph_final_2023, ph_quantiles_u_r_2023 %>% left_join(sample_size_ph_u_r_2023, by = c("Country", "Location", "City", "urban_rural", "Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) ) # Align & Merge ph_final_combined_2023 <- align_columns(ph_final_combined_2023, common_columns) summary_dataset <- bind_rows(summary_dataset, ph_final_combined_2023) # combined_sample_size <- bind_rows(combined_sample_size, sample_size_ph_2023, sample_size_ph_u_r_2023) ``` ```{r} # === Philippines 2018 Data Integration === # Load raw files ph_2018 <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Philippines/2018/2018_FIES_vol2.csv") region_lookup_2018 <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Philippines/2018/region_code_mapping_Philippines.xlsx") %>% mutate(W_REGN = as.numeric(W_REGN)) province_lookup_2018 <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Philippines/2018/province_code_mapping_Philippines.xlsx") %>% mutate(W_PROV = as.numeric(W_PROV)) # Clean and process ph_2018_clean <- ph_2018 %>% mutate( W_REGN = as.numeric(W_REGN), W_PROV = as.numeric(W_PROV), HH_exp = as.numeric(TTOTEX)/12, HH_size = as.numeric(FSIZE), no_of_earners = as.numeric(EMPLOYED_PAY)+as.numeric(EMPLOYED_PROF), monthly_rent = as.numeric(RENT), urban_rural = ifelse(URB == 1, "Urban", "Rural"), Percent_Income_Spent_on_Housing = (monthly_rent / HH_exp) * 100, norm_weight = round(as.numeric(RFACT) / 100) ) %>% left_join(region_lookup_2018, by = "W_REGN") %>% left_join(province_lookup_2018, by = "W_PROV") %>% mutate( Region = case_when( ProvinceName %in% c("Batangas", "Cavite", "Laguna", "Quezon", "Rizal") ~ "Region IV-A - CALABARZON", ProvinceName %in% c("Marinduque", "Occidental Mindoro", "Oriental Mindoro", "Palawan", "Romblon") ~ "Region IV-B - MIMAROPA", TRUE ~ RegionName ), Location = Region, City = ProvinceName ) %>% drop_na(HH_exp, HH_size, monthly_rent) # Expand based on weights all_data_ph_2018 <- ph_2018_clean %>% uncount(weights = norm_weight) # Sample size sample_size_ph_2018 <- ph_2018_clean %>% group_by(Location, City, urban_rural, Year = 2018) %>% summarise(sample_size = n(), .groups = "drop") %>% mutate(Country = "Philippines") # Urban + Rural Combined sample_size_ph_u_r_2018 <- ph_2018_clean %>% group_by(Location, City, urban_rural = "All", Year = 2018) %>% summarise(sample_size = n(), .groups = "drop") %>% mutate(Country = "Philippines") # Quantile Calculation ph_quantiles_2018 <- all_data_ph_2018 %>% group_by(Location, City, urban_rural) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Philippines", Year = 2018) ph_quantiles_u_r_2018 <- all_data_ph_2018 %>% group_by(Location, City, urban_rural = "All") %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Philippines", Year = 2018) # Combine all ph_final_combined_2018 <- bind_rows( ph_quantiles_2018 %>% left_join(sample_size_ph_2018, by = c("Country", "Location", "City", "urban_rural", "Year")), ph_quantiles_u_r_2018 %>% left_join(sample_size_ph_u_r_2018, by = c("Country", "Location", "City", "urban_rural", "Year")) ) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) # Align columns ph_final_combined_2018 <- align_columns(ph_final_combined_2018, common_columns) # Add to final dataset summary_dataset <- bind_rows(summary_dataset, ph_final_combined_2018) # Add to combined sample size # combined_sample_size <- bind_rows(combined_sample_size, sample_size_ph_2018, sample_size_ph_u_r_2018) ``` ```{r} # === Tanzania 2018/19 Data Processing === # Step 1: Load datasets (replace filenames with your actual extracted .dta names) hh_tz <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Tanzania/TZA_2020_NPS-R5_v02_M_STATA14/hh_sec_a.dta") cons_tz <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Tanzania/TZA_2020_NPS-R5_v02_M_STATA14/consumption_real_y5.dta") # Step 2: Load Region Lookup (Excel we created earlier) region_lookup_tz <- readxl::read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Tanzania/tanzania_region_lookup.xlsx") %>% dplyr::mutate(RegionCode = as.integer(RegionCode)) # Step 3: Clean location dataset hh_tz_clean <- hh_tz %>% select( y5_hhid, hh_a01_1, hh_a02_1, y5_rural, y5_crossweight ) %>% rename( RegionCode = hh_a01_1, DistrictCode = hh_a02_1, Weight = y5_crossweight, urban_rural = y5_rural ) %>% mutate( urban_rural = ifelse(urban_rural == 1, "Urban", "Rural") ) # Step 4: Clean consumption dataset tz_cons_clean <- cons_tz %>% transmute( y5_hhid, HH_size = hhsize, HH_exp = expm / 12, # annual → monthly Percent_Income_Spent_on_Housing = NA_real_ ) # Step 5: Merge datasets + region lookup tz_merged <- hh_tz_clean %>% left_join(tz_cons_clean, by = "y5_hhid") %>% left_join(region_lookup_tz, by = "RegionCode") %>% mutate( norm_weight = round(Weight / 100), Country = "Tanzania", Year = 2021, Location = stringr::str_to_title(RegionName), # use region names City = "All", no_of_earners = NA_real_ ) %>% drop_na(HH_exp, HH_size, norm_weight) # Step 6: Sample size summaries sample_size_tz <- tz_merged %>% group_by(Location, City, urban_rural, Year) %>% summarise(sample_size = n(), .groups = "drop") %>% mutate(Country = "Tanzania") sample_size_tz_u_r <- tz_merged %>% group_by(Location, City, urban_rural = "All", Year) %>% summarise(sample_size = n(), .groups = "drop") %>% mutate(Country = "Tanzania") # Step 7: Expand data by survey weights tz_expanded <- tz_merged %>% uncount(weights = norm_weight) # Step 8: Quantile calculations quantile_probs <- seq(0.01, 0.99, by = 0.01) tz_quantiles <- tz_expanded %>% group_by(Location, City, urban_rural) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Tanzania", Year = 2021) tz_quantiles_u_r <- tz_expanded %>% group_by(Location, City, urban_rural = "All") %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Tanzania", Year = 2021) # Step 9: Combine quantiles with sample sizes tz_final <- tz_quantiles %>% left_join(sample_size_tz, by = c("Country","Location","City","urban_rural","Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) tz_final_combined <- bind_rows( tz_final, tz_quantiles_u_r %>% left_join(sample_size_tz_u_r, by = c("Country","Location","City","urban_rural","Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) ) # Step 10: Align with global schema & integrate into master datasets tz_final_aligned <- align_columns(tz_final_combined, common_columns) summary_dataset <- bind_rows(summary_dataset, tz_final_aligned) #combined_sample_size <- bind_rows(combined_sample_size, sample_size_tz, sample_size_tz_u_r) ``` ```{r} # === Ghana 2016 Data Processing === # Step 1: Load datasets # Set your folder path path <- "/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Ghana/Data Used In Code/" # Read the three Stata files gh_hh <- read_dta(file.path(path, "00_GHA_BASICINFO.dta")) exphous <- read_dta(file.path(path, "04_GHA_EXPHOUS.dta")) gh_cons <- read_dta(file.path(path, "15_GHA_2017_E_final.dta")) region_lookup_gh <- readxl::read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Ghana/Data Used In Code/ghana_region_code_mapping.xlsx") %>% dplyr::mutate(region = as.integer(region)) # Step 2: Clean location dataset gh_hh_clean <- gh_hh %>% select(hid, region, rururb, hhsize, WTA_S ) %>% rename( HH_size = hhsize, hhweight= WTA_S, urban_rural = rururb ) %>% mutate( urban_rural = ifelse(urban_rural == 1, "Urban", "Rural") ) # Step 3: Clean consumption dataset gh_cons_clean <- gh_cons %>% transmute( hid, HH_exp = HHEXP_N / 12, # annual → monthly ) # Step 4: Merge datasets + region lookup gh_merged <- gh_hh_clean %>% left_join(gh_cons_clean, by = "hid") %>% left_join(region_lookup_gh, by = "region") %>% mutate( norm_weight = round(hhweight / 100), Country = "Ghana", Year = 2016, Location = stringr::str_to_title(RegionName), City = "All", no_of_earners = NA_real_, Percent_Income_Spent_on_Housing = NA_real_ ) %>% drop_na(HH_exp, HH_size, norm_weight) # Step 5: Sample size summaries sample_size_gh <- gh_merged %>% group_by(Location, City, urban_rural, Year) %>% summarise(sample_size = n(), .groups = "drop") %>% mutate(Country = "Ghana") sample_size_gh_u_r <- gh_merged %>% group_by(Location, City, urban_rural = "All", Year) %>% summarise(sample_size = n(), .groups = "drop") %>% mutate(Country = "Ghana") # Step 6: Expand data by survey weights gh_expanded <- gh_merged %>% uncount(weights = norm_weight) # Step 7: Quantile calculations quantile_probs <- seq(0.01, 0.99, by = 0.01) gh_quantiles <- gh_expanded %>% group_by(Location, City, urban_rural) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Ghana", Year = 2016) gh_quantiles_u_r <- gh_expanded %>% group_by(Location, City, urban_rural = "All") %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Ghana", Year = 2016) # Step 8: Combine quantiles with sample sizes gh_final <- gh_quantiles %>% left_join(sample_size_gh, by = c("Country","Location","City","urban_rural","Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) gh_final_combined <- bind_rows( gh_final, gh_quantiles_u_r %>% left_join(sample_size_gh_u_r, by = c("Country","Location","City","urban_rural","Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) ) # Step 9: Align with global schema & integrate into master datasets gh_final_aligned <- align_columns(gh_final_combined, common_columns) summary_dataset <- bind_rows(summary_dataset, gh_final_aligned) #combined_sample_size <- bind_rows(combined_sample_size, sample_size_gh, sample_size_gh_u_r) ``` ```{r} # === Rwanda 2016 Data Integration === # Load household-level data hh <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Rwanda/cs_S0_S5_Household.dta") %>% select(hhid, region, province, district, ur, weight) %>% mutate( region = as.numeric(region), province = as.numeric(province), district = as.numeric(district), urban_rural = ifelse(ur == 1, "Urban", "Rural") ) # Load mapping files region_map <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Rwanda/Rwanda_region_mapping.xlsx") province_map <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Rwanda/Rwanda_province_mapping.xlsx") district_map <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Rwanda/Rwanda_district_mapping.xlsx") # Join mappings to household data hh <- hh %>% left_join(region_map, by = "region") %>% left_join(province_map, by = "province") %>% left_join(district_map, by = "district") # Calculate number of earners per household persons <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Rwanda/cs_S6B_Employement_6C_Salaried_S6D_Business.dta") earners <- persons %>% mutate(is_earner = ifelse(s6bq6 == 1, 1, 0)) %>% group_by(hhid) %>% summarise(no_of_earners = sum(is_earner, na.rm = TRUE)) # Load and process household expenditure exp1 <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Rwanda/cs_S8A1_expenditure.dta") %>% select(hhid, amount = s8a1q3) %>% group_by(hhid) %>% summarise(exp_a1 = sum(amount, na.rm = TRUE) / 12) # annual → monthly exp2 <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Rwanda/cs_S8A2_expenditure.dta") %>% select(hhid, amount = s8a2q3) %>% group_by(hhid) %>% summarise(exp_a2 = sum(amount, na.rm = TRUE)) # 4 weeks → monthly exp3 <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Rwanda/cs_S8A3_expenditure.dta") %>% select(hhid, amount = s8a3q3) %>% group_by(hhid) %>% summarise(exp_a3 = sum(amount, na.rm = TRUE) * 4.33) # 7 days → monthly exp4 <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Rwanda/cs_S8B_expenditure.dta") %>% select(hhid, amount = s8bq3) %>% group_by(hhid) %>% summarise(exp_b = sum(amount, na.rm = TRUE) * 4.33) # 7 days → monthly exp_all <- list(exp1, exp2, exp3, exp4) %>% reduce(full_join, by = "hhid") %>% mutate(HH_exp = rowSums(across(starts_with("exp_")), na.rm = TRUE)) # Merge household + earners + expenditure merged_data <- hh %>% left_join(earners, by = "hhid") %>% left_join(exp_all, by = "hhid") %>% mutate( norm_weight = round(weight / 100), Country = "Rwanda", Year = 2016, City = DistrictName, Location = ProvinceName, Percent_Income_Spent_on_Housing = NA_real_ ) %>% drop_na(HH_exp, norm_weight) # Expand based on weight expanded_data <- merged_data %>% uncount(weights = norm_weight) # Sample size summary sample_size_rw <- merged_data %>% group_by(Location, City, urban_rural, Year) %>% summarise(sample_size = n(), .groups = 'drop') %>% mutate(Country = "Rwanda") sample_size_rw_u_r <- merged_data %>% group_by(Location, City, urban_rural = "All", Year) %>% summarise(sample_size = n(), .groups = 'drop') %>% mutate(Country = "Rwanda") # Quantile calculation function quantile_probs <- seq(0.01, 0.99, by = 0.01) calculate_quantiles <- function(data, quantiles) { do.call(rbind, lapply(quantiles, function(q) { data.frame( Quantile = q * 100, HH_exp = quantile(data$HH_exp, probs = q, na.rm = TRUE), no_of_earners = median(data$no_of_earners[data$HH_exp <= quantile(data$HH_exp, probs = q, na.rm = TRUE)], na.rm = TRUE), HH_size = NA_real_, Percent_Income_Spent_on_Housing = NA_real_ ) })) } # Compute quantiles rw_quantiles <- expanded_data %>% group_by(Location, City, urban_rural) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Rwanda", Year = 2016) rw_quantiles_u_r <- expanded_data %>% group_by(Location, City, urban_rural = "All") %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Rwanda", Year = 2016) # Merge and finalize rw_final <- rw_quantiles %>% left_join(sample_size_rw, by = c("Country", "Location", "City", "urban_rural", "Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) rw_final_combined <- bind_rows( rw_final, rw_quantiles_u_r %>% left_join(sample_size_rw_u_r, by = c("Country", "Location", "City", "urban_rural", "Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) ) # Align and add to summary dataset rw_final_combined_aligned <- align_columns(rw_final_combined, common_columns) summary_dataset <- bind_rows(summary_dataset, rw_final_combined_aligned) # combined_sample_size <- bind_rows(combined_sample_size, sample_size_rw, sample_size_rw_u_r) ``` ```{R} # === Malawi 2019 Data Processing === # Step 1: Load consumption data welfare <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Malawi/ihs5_consumption_aggregate.dta") %>% select(HHID, hhsize, rexpagg, rexp_cat041, hh_wgt, area, region, district) %>% rename(hhid = HHID) # Step 2: Load individual-level data and compute earners indiv <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Malawi/HH_MOD_E.dta") %>% select(HHID, hh_e06_1a, hh_e06_1b, hh_e06_1c, hh_e06_2, hh_e06_3, hh_e06_4, hh_e06_5, hh_e06_6) %>% mutate(earner = if_else( hh_e06_1a == 1 | hh_e06_1b == 1 | hh_e06_1c == 1 | hh_e06_2 == 1 | hh_e06_3 == 1 | hh_e06_4 == 1 | hh_e06_6 == 1, 1, 0, missing = 0)) %>% group_by(HHID) %>% summarise(no_of_earners = sum(earner, na.rm = TRUE), .groups = "drop") %>% rename(hhid = HHID) # Step 3: Load region and district mapping region_map <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Malawi/region_mapping_mw.xlsx") %>% rename(region = region, RegionName = RegionName) district_map <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Malawi/district_mapping_mw.xlsx") %>% rename(district = district, DistrictName = DistrictName) # Step 4: Merge and transform data merged_data <- welfare %>% left_join(indiv, by = "hhid") %>% left_join(region_map, by = "region") %>% left_join(district_map, by = "district") %>% mutate( HH_exp = rexpagg / 12, # Convert annual to monthly norm_weight = round(hh_wgt / 100), Country = "Malawi", Year = 2019, City = DistrictName, Location = RegionName, urban_rural = ifelse(area == 1, "Urban", "Rural"), Percent_Income_Spent_on_Housing = (rexp_cat041 / 12) / HH_exp * 100 ) %>% drop_na(HH_exp, hhsize, norm_weight) # Step 5: Expand based on survey weights expanded_data <- merged_data %>% uncount(weights = norm_weight) # Step 6: Compute sample size by area sample_size_mw <- merged_data %>% group_by(Location, City, urban_rural, Year) %>% summarise(sample_size = n(), .groups = 'drop') %>% mutate(Country = "Malawi") sample_size_mw_u_r <- merged_data %>% group_by(Location, City, urban_rural = "All", Year) %>% summarise(sample_size = n(), .groups = 'drop') %>% mutate(Country = "Malawi") # Step 7: Define quantile function quantile_probs <- seq(0.01, 0.99, by = 0.01) calculate_quantiles <- function(data, quantiles) { do.call(rbind, lapply(quantiles, function(q) { q_val <- quantile(data$HH_exp, probs = q, na.rm = TRUE) data.frame( Quantile = q * 100, HH_exp = q_val, no_of_earners = median(data$no_of_earners[data$HH_exp <= q_val], na.rm = TRUE), HH_size = median(data$hhsize[data$HH_exp <= q_val], na.rm = TRUE), Percent_Income_Spent_on_Housing = median(data$Percent_Income_Spent_on_Housing[data$HH_exp <= q_val], na.rm = TRUE) ) })) } # Step 8: Compute quantiles mw_quantiles <- expanded_data %>% group_by(Location, City, urban_rural) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Malawi", Year = 2019) mw_quantiles_u_r <- expanded_data %>% group_by(Location, City, urban_rural = "All") %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Malawi", Year = 2019) # Step 9: Merge quantiles and sample size mw_final <- mw_quantiles %>% left_join(sample_size_mw, by = c("Country", "Location", "City", "urban_rural", "Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) mw_final_combined <- bind_rows( mw_final, mw_quantiles_u_r %>% left_join(sample_size_mw_u_r, by = c("Country", "Location", "City", "urban_rural", "Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) ) # === Malawi 2019 Data Integration into Final Dataset === # Align and bind Malawi data to summary dataset mw_final_combined_aligned <- align_columns(mw_final_combined, common_columns) summary_dataset <- bind_rows(summary_dataset, mw_final_combined_aligned) # combined_sample_size <- bind_rows(combined_sample_size, sample_size_mw, sample_size_mw_u_r) ``` ```{r} # === Côte d'Ivoire 2021 Data Processing === # Step 1: Load welfare data welfare <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Côte d'Ivoire/ehcvm_welfare_civ2021.dta") %>% select(hhid, hhsize, dtot, hhweight) # Step 2: Load individual-level data for earners + location mapping indiv <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Côte d'Ivoire/ehcvm_individu_civ2021.dta") %>% select(hhid, activ12m, milieu, region, zae) # Step 3: Number of earners per household earners <- indiv %>% filter(activ12m %in% c(1,2)) %>% group_by(hhid) %>% summarise(no_of_earners = n(), .groups = 'drop') # Step 4: Extract urban/rural and region info (one row per household) location_data <- indiv %>% group_by(hhid) %>% summarise( urban_rural = ifelse(first(milieu) == 1, "Urban", "Rural"), RegionCode = first(region), AgroZone = first(zae), .groups = "drop" ) # Step 5: Load region and agro zone mapping files region_map <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Côte d'Ivoire/region_mapping_cote.xlsx") %>% rename(RegionCode = region, RegionName = region_name) zone_map <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Côte d'Ivoire/zone_mapping_cote.xlsx") %>% rename(AgroZone = zae, ZoneName = zae_name) # Step 6: Merge all datasets merged_data <- welfare %>% left_join(earners, by = "hhid") %>% left_join(location_data, by = "hhid") %>% left_join(region_map, by = "RegionCode") %>% left_join(zone_map, by = "AgroZone") %>% mutate( HH_exp = dtot / 12, # Convert annual to monthly expenditure norm_weight = round(hhweight / 100), Country = "Cote d Ivoire", Year = 2021, Location = str_to_title(ZoneName), City = str_to_title(RegionName), Percent_Income_Spent_on_Housing = NA_real_ ) %>% drop_na(HH_exp, hhsize, norm_weight) # Step 7: Expand based on survey weights expanded_data <- merged_data %>% uncount(weights = norm_weight) # Step 8: Sample size summary sample_size_ci <- merged_data %>% group_by(Location, City, urban_rural, Year) %>% summarise(sample_size = n(), .groups = 'drop') %>% mutate(Country = "Cote d Ivoire") sample_size_ci_u_r <- merged_data %>% group_by(Location, City, urban_rural = "All", Year) %>% summarise(sample_size = n(), .groups = 'drop') %>% mutate(Country = "Cote d Ivoire") # Step 9: Define quantile function quantile_probs <- seq(0.01, 0.99, by = 0.01) calculate_quantiles <- function(data, quantiles) { do.call(rbind, lapply(quantiles, function(q) { data.frame( Quantile = q * 100, HH_exp = quantile(data$HH_exp, probs = q, na.rm = TRUE), no_of_earners = median(data$no_of_earners[data$HH_exp <= quantile(data$HH_exp, probs = q, na.rm = TRUE)], na.rm = TRUE), HH_size = median(data$hhsize[data$HH_exp <= quantile(data$HH_exp, probs = q, na.rm = TRUE)], na.rm = TRUE), Percent_Income_Spent_on_Housing = NA_real_ ) })) } # Step 10: Compute quantiles ci_quantiles <- expanded_data %>% group_by(Location, City, urban_rural) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Cote d Ivoire", Year = 2021) ci_quantiles_u_r <- expanded_data %>% group_by(Location, City, urban_rural = "All") %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() %>% mutate(Country = "Cote d Ivoire", Year = 2021) # Step 11: Merge quantiles with sample size ci_final <- ci_quantiles %>% left_join(sample_size_ci, by = c("Country", "Location", "City", "urban_rural", "Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) ci_final_combined <- bind_rows( ci_final, ci_quantiles_u_r %>% left_join(sample_size_ci_u_r, by = c("Country", "Location", "City", "urban_rural", "Year")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) ) # === Côte d'Ivoire 2021 Integration into Final Dataset === # Align columns ci_final_combined_aligned <- align_columns(ci_final_combined, common_columns) # Append to summary dataset and sample size table summary_dataset <- bind_rows(summary_dataset, ci_final_combined_aligned) # combined_sample_size <- bind_rows(combined_sample_size, sample_size_ci, sample_size_ci_u_r) ``` ```{r} # === Burkina Faso 2021 Data Processing === # Step 1: Load datasets welfare <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Burkina Faso/ehcvm_welfare_2b_bfa2021.dta") individual <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Burkina Faso/ehcvm_individu_bfa2021.dta") region_mapping <- read_excel("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Burkina Faso/region_id_name_mapping_BurkinFaso.xlsx") %>% rename(Region = "Region ID", RegionName = "Region Name") %>% mutate(Region = as.character(Region)) # Step 2: Extract household size and total monthly expenditure from welfare welfare_summary <- welfare %>% select(hhid, hhsize, dtot, hhweight) %>% rename(HH_size = hhsize, HH_exp = dtot, norm_weight = hhweight) %>% mutate( norm_weight = round(norm_weight / 100), HH_exp = HH_exp / 12 # Convert annual to monthly expenditure ) # Step 3: Extract Region/Province/Commune and Urban/Rural region_info <- individual %>% select(hhid, region, province, commune, milieu) %>% distinct() %>% mutate( Region = as.character(region), Province = as.character(province), City = as.character(commune), urban_rural = ifelse(milieu == 1, "Urban", "Rural") ) %>% # Replace all 12 Arrondissements in Centre (Region ID == "3") with "OUAGADOUGOU" mutate( City = ifelse(Region == "3" & grepl("^Arrondissement", City), "Ouagadougou", City), # Combine arrondissements under Hauts-Bassins (region ID 9) into BOBO-DIOULASSO City = ifelse(Region == "9" & grepl("^Arrondissement", City), "Bobo-dioulasso", City) ) %>% left_join(region_mapping, by = "Region") # Step 4: Merge components merged_data <- welfare_summary %>% left_join(region_info, by = "hhid") %>% mutate( Percent_Income_Spent_on_Housing = NA_real_, no_of_earners = NA_real_, Country = "Burkina Faso", Year = 2021, Location = RegionName ) %>% drop_na(HH_exp, HH_size) # Step 5: Compute TRUE sample size BEFORE expansion sample_size_bf <- merged_data %>% group_by(Country, Year, Location, City, urban_rural) %>% summarise(sample_size = n(), .groups = "drop") sample_size_bf_u_r <- merged_data %>% group_by(Country, Year, Location, City, urban_rural = "All") %>% summarise(sample_size = n(), .groups = "drop") # Step 6: Expand based on weights expanded_data <- merged_data %>% uncount(weights = norm_weight) # Step 7: Define quantile calculation function quantile_probs <- seq(0.01, 0.99, by = 0.01) calculate_quantiles <- function(data, quantiles) { do.call(rbind, lapply(quantiles, function(q) { data.frame( Quantile = q * 100, HH_exp = quantile(data$HH_exp, probs = q, na.rm = TRUE), no_of_earners = NA_real_, HH_size = median(data$HH_size[data$HH_exp <= quantile(data$HH_exp, probs = q, na.rm = TRUE)], na.rm = TRUE), Percent_Income_Spent_on_Housing = NA_real_ ) })) } # Step 8: Compute quantiles bf_quantiles <- expanded_data %>% group_by(Country, Year, Location, City, urban_rural) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() bf_quantiles_u_r <- expanded_data %>% group_by(Country, Year, Location, City, urban_rural = "All") %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() # Step 9: Combine with sample sizes bf_final <- bf_quantiles %>% left_join(sample_size_bf, by = c("Country", "Year", "Location", "City", "urban_rural")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) bf_final_combined <- bind_rows( bf_final, bf_quantiles_u_r %>% left_join(sample_size_bf_u_r, by = c("Country", "Year", "Location", "City", "urban_rural")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) ) # Align with common structure bf_final_combined_aligned <- align_columns(bf_final_combined, common_columns) # Add to final dataset summary_dataset <- bind_rows(summary_dataset, bf_final_combined_aligned) # combined_sample_size <- bind_rows(combined_sample_size, sample_size_bf, sample_size_bf_u_r) ``` ```{r} #Kenya 2021 Data Processing # === Step 0: Load datasets === consump_kenya_2021 <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Kenya/2021/consagg_microdata.dta") hh_kenya_2021 <- read_dta("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Kenya/2021/households_microdata.dta") county_kenya2021_map <- read_csv("/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Datasets/Kenya/kenya_counties.csv") # === Step 1: Merge county names into household data === hh_kenya_2021 <- hh_kenya_2021 %>% left_join(county_kenya2021_map, by = c("county" = "county")) %>% mutate(hhcode = paste0(clid, hhid)) # === Step 2: Process consumption and join === kenya2021 <- consump_kenya_2021 %>% mutate( hhcode = paste0(clid, hhid), HH_exp = padqexp * adq_scale, Percent_Income_Spent_on_Housing = (padqrent / HH_exp) * 100, no_of_earners = NA_real_, urban_rural = if_else(resid == 1, "Rural", "Urban") ) %>% left_join(hh_kenya_2021 %>% select(hhcode, county_value), by = "hhcode") %>% rename(HH_size = hhsize, weight_hh = weight_hh, Location = county_value) %>% mutate(Country = "Kenya", Year = 2021, City = "All") %>% distinct() # === Step 3: Sample sizes === sample_size_kenya_2021 <- kenya2021 %>% group_by(Country, Year, Location, City, urban_rural) %>% summarise(sample_size = n(), .groups = "drop") sample_size_all_kenya_2021 <- kenya2021 %>% group_by(Country, Year, Location, City, urban_rural = "All") %>% summarise(sample_size = n(), .groups = "drop") # === Step 4: Expand by weights === expanded_kenya2021 <- kenya2021 %>% uncount(weights = as.integer(weight_hh / 100)) # === Step 5: Define quantile calculator === quantile_probs <- seq(0.01, 0.99, by = 0.01) calculate_quantiles <- function(data, quantiles) { do.call(rbind, lapply(quantiles, function(q) { q_val <- quantile(data$HH_exp, probs = q, na.rm = TRUE) data.frame( Quantile = q * 100, HH_exp = q_val, no_of_earners = median(data$no_of_earners[data$HH_exp <= q_val], na.rm = TRUE), HH_size = median(data$HH_size[data$HH_exp <= q_val], na.rm = TRUE), Percent_Income_Spent_on_Housing = median(data$Percent_Income_Spent_on_Housing[data$HH_exp <= q_val], na.rm = TRUE) ) })) } # === Step 6: Compute quantiles === kenya_quantiles_2021 <- expanded_kenya2021 %>% group_by(Country, Location, City, urban_rural, Year) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() kenya_quantiles_2021_all <- expanded_kenya2021 %>% group_by(Country, Location, City, urban_rural = "All", Year) %>% group_modify(~ calculate_quantiles(.x, quantile_probs)) %>% ungroup() # === Step 7: Combine and export === kenya_final_2021 <- bind_rows(kenya_quantiles_2021, kenya_quantiles_2021_all) %>% left_join(bind_rows(sample_size_kenya_2021, sample_size_all_kenya_2021), by = c("Country", "Year", "Location", "City", "urban_rural")) %>% select(Country, Location, Year, Quantile, urban_rural, sample_size, HH_exp, no_of_earners, HH_size, Percent_Income_Spent_on_Housing, City) # write_csv(kenya_final, "kenya_quantile_summary_2021.csv") # === Step 8: Integrate into Combined Dataset === kenya_final_combined_aligned_2021 <- align_columns(kenya_final_2021, common_columns) summary_dataset <- bind_rows(summary_dataset, kenya_final_combined_aligned_2021) # combined_sample_size <- bind_rows( # combined_sample_size, # sample_size_kenya_2021, # sample_size_all_kenya_2021 # ) ``` ```{r} combined_sample_size <- bind_rows( sample_size_india, sample_size_india_u_r, sample_size_uganda, sample_size_uganda_u_r, sample_size_nigeria, sample_size_nigeria_u_r, sample_size_pakistan, sample_size_pakistan_u_r, sample_size_kenya, sample_size_kenya_u_r, sample_size_kenya_2020, sample_size_kenya_2020_u_r, sample_size_india_2022, sample_size_india_u_r_2022,sample_size_ph_2023, sample_size_ph_u_r_2023, sample_size_ph_2018, sample_size_ph_u_r_2018, sample_size_rw, sample_size_rw_u_r, sample_size_mw, sample_size_mw_u_r, sample_size_ci, sample_size_ci_u_r,sample_size_bf, sample_size_bf_u_r, sample_size_kenya_2021, sample_size_all_kenya_2021,sample_size_tz, sample_size_tz_u_r, sample_size_gh, sample_size_gh_u_r ) summary_dataset <- summary_dataset %>% left_join(combined_sample_size, by = c("Country","urban_rural","Location","City","Year")) ``` ```{r} state_aggregated <- summary_dataset %>% group_by(Country, Location, Year, Quantile,urban_rural) %>% summarise( sample_size = sum(sample_size), HH_exp = mean(HH_exp, na.rm = TRUE), no_of_earners = mean(no_of_earners, na.rm = TRUE), HH_size = mean(HH_size, na.rm = TRUE), Percent_Income_Spent_on_Housing = mean(Percent_Income_Spent_on_Housing, na.rm = TRUE) ) %>% filter (Country != "Kenya") %>% ungroup()%>% mutate(City = "All") # ---------------------------------------- # National-Level Aggregation # ---------------------------------------- # Aggregate data for the national level by combining all states within each country national_aggregated <- summary_dataset %>% group_by(Country, Year, Quantile,urban_rural) %>% summarise( sample_size = sum(sample_size), HH_exp = mean(HH_exp, na.rm = TRUE), no_of_earners = mean(no_of_earners, na.rm = TRUE), HH_size = mean(HH_size, na.rm = TRUE) #Percent_Income_Spent_on_Housing = mean(Percent_Income_Spent_on_Housing, na.rm = TRUE) ) %>% ungroup() %>% mutate(Location = "All", City = "All") # ---------------------------------------- # Combine State and National Data # ---------------------------------------- # Combine both the state-level and national-level data final_dataset <- bind_rows(state_aggregated, national_aggregated,summary_dataset) # Write the final dataset to a CSV file write.csv(final_dataset, file = "/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/Summary_Dataset_v10_4.csv", row.names = FALSE) write.csv(combined_sample_size, file = "/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/sample_size_dataset_v10_4.csv", row.names = FALSE) #write.csv(inflation_data_with_median, file = "/Users/Ben.Atkinson/Reall/CompanyData - Documents/Evidence/Market Shaping Indicators/Understanding B40/External Income and Affordability Calculator/inflation_dataset_v9.csv", row.names = FALSE) ```