Maternal Mortality Rate - Gapminder & WHO
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
install.packages('janitor',repos = "http://cran.us.r-project.org")
Load libraries
library(readxl)
library(dplyr)
library(janitor)
library(tidyr)
library(ggplot2)
date <- Sys.Date()
Download data from Gapminder and save it locally
dir.create("data/input", recursive = TRUE)
download.file('https://www.gapminder.org/documentation/documentation/gapdata010.xls', paste0('data/input/',date,'_gapminder_data.xls'))
Read in the Gapminder data
df <- read_xls(paste0('data/input/',date,'_gapminder_data.xls'), col_names = TRUE)
df <- df[-(1:16),]
The first three year ranges seem to be wrong as in another source (https://docs.google.com/spreadsheets/u/0/d/14ZtQy9kd0pMRKWg_zKsTg3qKHoGtflj-Ekal9gIPZ4A/pub?gid=1#) they cover only 10 years rather than 110. I will replace them manually here with the middle of each decade, as is used in the other data from the same source.
There are also two cases where the year for Finland has been entered wrongly three times, there are two 1772s, 1775s & 1967s, the second of each should be 1872, 1875 and 1967 respectively.
There are also two errors for New Zealand, and one error for both Swecen and the US.
df$year[1:3] <- c(1875, 1885,1895)
df['year'][df$year == '1772' & df$Country == 'Finland' & df$`Maternal deaths`== 487,] <- '1872'
df['year'][df$year == '1775' & df$Country == 'Finland' & df$`Maternal deaths` == 629,] <- '1875'
df['year'][df$year == '1967' & df$Country == 'Finland' & df$`Maternal deaths` == 77,] <- '1957'
df['year'][df$year == '1967' & df$Country == 'Sweden' & df$`Maternal deaths` == 39,] <- '1957'
df['year'][df$year == '1967' & df$Country == 'United States' & df$`Maternal deaths` == 1766.28,] <- '1957'
df['year'][df$year =='1989-02'& df$Country == 'New Zealand',] <- '1889-02'
#Dropping an erroneous row for NZ in 1950 - there are two values but the first is incorrect. You can using the maternal deaths and live births columns.
df <- df[-which(df$year == '1950' & df$Country == 'New Zealand' & df$MMR == 90),]
For some rows a range of years is given for a particular maternal mortality rate. We use this function to find the mid-value of years given.
mid_year <- function(year) {
if(grepl("-", year)){
year_split <- unlist(strsplit(year, split = "-"))
year_begin <- year_split[1]
year_end_len <- nchar(year_split[2])
year_end_pref <- 4 - year_end_len
if (year_end_pref > 0) {
year_pref <- substr(year_begin, 1, year_end_pref)
year_end <- as.numeric(paste(c(year_pref, year_split[2]),collapse = ""))
year_out <- round(mean(c(as.numeric(year_begin), year_end)))
}else{
year_out <- round(mean(as.numeric(c(year_begin,year_split[2]))))
}
}else{
year_out <- year
}
return(year_out)
}
year_out_all <- unlist(lapply(df$year, mid_year))
df$mid_year <- as.numeric(year_out_all)
Let's check the mid-year estimates have worked as expected:
df %>% select(year, mid_year) %>% filter(grepl("-",year)) %>% distinct()
Keeping only the columns we want to take forward.
df <- df %>%
select(entity = Country, year = mid_year, maternal_mortality_rate = MMR)
Read in data from WDI - in future iterations we will read this in from ETL so it is auto updated. The WDI sources this variable from the WHO so we will henceforth refer to it as the WHO data.
download.file('https://api.worldbank.org/v2/en/indicator/SH.STA.MMRT?downloadformat=csv', paste0('data/input/',date,'_wdi_maternal_mortality.zip'))
unzip(paste0('data/input/',date,'_wdi_maternal_mortality.zip'), exdir = "data/input")
who <- clean_names(read.csv('data/input/API_SH.STA.MMRT_DS2_en_csv_v2_4029775.csv', skip = 4))
Pivot and clean the WHO data to long format
who_clean <- who %>%
select(country_name, starts_with("X")) %>%
select(-x) %>%
pivot_longer(starts_with("x")) %>%
mutate(year = as.numeric(gsub("x", "", name))) %>%
select(entity = country_name, year, maternal_mortality_rate = value)
Check the Gapminder countries are in the WHO dataset
unique(df$entity) %in% who_clean$entity
Combine the two datasets and add a source column so we can keep track of where each datapoin comes from.
df$source <- 'GapMinder'
who_clean$source <- 'WHO_GHO'
df_all <- rbind(df, who_clean)
df_all$maternal_mortality_rate <- as.numeric(df_all$maternal_mortality_rate)
Dealing with duplicate years - there are two identical values for NZ in 1950, we can drop one with distinct(). We remove NAs where there is a non-NA value in either Gapminder or WHO. If there are values for both Gapminder and WHO GHO we have a preference for WHO GHO.
df_check <-df_all %>%
group_by(entity, year) %>%
add_count() %>%
ungroup()
df_sel <- df_check[-which(df_check$n >1 & is.na(df_check$maternal_mortality_rate)),]
df_sel <-df_sel %>%
group_by(entity, year) %>%
add_count() %>%
ungroup() %>%
distinct()
df_sel <- df_sel[-which(df_sel$nn >1 & df_sel$source == 'WHO_GHO'),]
df_sel %>%
group_by(entity, year) %>%
add_count() %>%
filter(nnn >1) %>%
ungroup()
df_sel <- df_sel %>% select(-c(n, nn))
stand_countries <- read.csv("data/input/countries_to_standardise_country_standardized.csv")
df_fin <- df_sel %>% left_join(.,stand_countries, by = c("entity" = "Country")) %>%
mutate(entity = Our.World.In.Data.Name) %>%
select(-c(Our.World.In.Data.Name, source))
df_fin$maternal_mortality_rate[is.na(df_fin$maternal_mortality_rate)] <- ""
write.csv(df_fin, "data/output/maternal_mortality_rate_upload.csv", row.names = FALSE)
Plot to see what it looks like:
ggplot(data = df_sel %>% filter(entity== "Sweden"), aes(x = year, y = maternal_mortality_rate, group = entity, colour = entity))+
geom_line()