diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb index bea665f..a4c5e25 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_bednets_computation.ipynb @@ -1,1032 +1,1204 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "8ba79c20-9f47-4d61-93ab-19d3802125ec", - "metadata": {}, - "source": [ - "# Insecticide-teated net (ITN) access and use, DHS data" - ] - }, - { - "cell_type": "markdown", - "id": "d5d9b645-2094-4b60-a9b2-b89ba33ac4dc", - "metadata": {}, - "source": [ - "## Resources" - ] - }, - { - "cell_type": "markdown", - "id": "90ddd8fd-ec6b-4a29-b78d-8010cdc4d40e", - "metadata": {}, - "source": [ - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/Access_to_an_Insecticide-Treated_Net_ITN.htm\n", - "\n", - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Use_of_Mosquito_Nets_by_Persons_in_the_Household.htm%23Percentage_of_the1bc-1&rhtocid=_15_3_0\n", - "\n", - "https://dhsprogram.com/publications/publication-dhsg4-dhs-questionnaires-and-manuals.cfm" - ] - }, - { - "cell_type": "markdown", - "id": "53012a66-e2c7-4ecb-9233-27581e349368", - "metadata": {}, - "source": [ - "### Access" - ] - }, - { - "cell_type": "markdown", - "id": "64d4e878-f065-47cc-be00-2b0cd3baeacc", - "metadata": {}, - "source": [ - "Percentage of the de facto household population with access to an ITN in the household, defined as the proportion of the de facto household population who slept under an ITN if each ITN in the household were used by up to two people.\n", - "\n", - "Numerator: Number of de facto persons (hv103 = 1) who could sleep under an ITN if each ITN in the household is used by up to 2 people, calculated for each household as the minimum of:\n", - "\n", - "1. number of de facto persons in the household (hv013), and\n", - "2. 
twice the number of ITNs per household (2 * sum of hml10_1 – hml10_7 = 1) <- assumed that maximum two people can sleep under a bednet\n", - " \n", - "Denominator: Number of persons who stayed in the household the night before the survey (hv103 = 1)\n", - "\n", - "Variables: hhid (household identification), hml10_1 – _7 (Insecticide-Treated Net (ITN)), hv013 (Number of de facto members) hv103, (Slept last night), hv005 (Household sample)" - ] - }, - { - "cell_type": "markdown", - "id": "5bd2650b-e952-45d1-b46a-b08b777a5961", - "metadata": {}, - "source": [ - "### Use" - ] - }, - { - "cell_type": "markdown", - "id": "327fe0a1-1be5-4d35-a6ff-8913ad56b6c3", - "metadata": {}, - "source": [ - "1) Percentage of the de facto household population who slept the night before the survey under a mosquito net (treated or untreated).\n", - "\n", - "2) Percentage of the de facto household population who slept the night before the survey under an insecticide-treated net (ITN).\n", - "\n", - "3) Among the de facto household population in households with at least one ITN, the percentage who slept under an ITN the night before the survey.\n", - "\n", - "Coverage:\n", - "Population base: De facto household members (PR file, HR file)\n", - "Time period: Night before the survey\n", - "\n", - "Numerators:\n", - "1) Number of de facto persons who reported sleeping under any mosquito net the night before the survey (hv103 = 1 & hml12 in 1:3)\n", - "2) Number of de facto persons who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2)\n", - "3) Number of de facto persons in households with at least one ITN who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2 & any hml10_1 – hml10_7 = 1)\n", - "\n", - "Denominators:\n", - "a) Number of persons in the de facto household population (hv103 = 1)\n", - "b) Number of persons in the de facto household population (hv103 = 1)\n", - "c) Number of persons in the de facto household 
population in households owning at least one ITN (hv103 = 1 & any hml10_1 – hml10_7 = 1)\n", - "\n", - "Variables: HR file, PR file.\n", - "\n", - "\n", - "**Project uses numerator 2) Number of de facto persons who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2)**\n", - "\n", - "**Project uses denominator b) Number of persons in the de facto household population (hv103 = 1)**" - ] - }, - { - "cell_type": "markdown", - "id": "3b050280-b234-45f1-bac5-8e6910079118", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3632c310-6a58-4825-8b80-ce3612b6caca", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb98532b-56c9-42c7-9bbd-a6f7869bfc76", - "metadata": {}, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebf120a6-1559-4295-93c4-cfbfd141a67b", - "metadata": {}, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'bednets')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc24cdd4-2ccb-4511-8a63-8ee4b0c29bde", - "metadata": {}, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", - 
"\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7acc95aa-eb5e-421e-b23e-0efa602d1cc1", - "metadata": {}, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "markdown", - "id": "93898419-b98d-4a53-8fc9-a1bb9bff01a4", - "metadata": {}, - "source": [ - "## Geo and admin data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "145fe721-f42a-45ff-a3cb-060886fe7a9e", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4938a133-569e-4aec-bf55-7a633a142bc2", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset \n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, 
spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31c87079-f8b6-417a-a206-804d5c3208e8", - "metadata": {}, - "outputs": [], - "source": [ - "spatial_data <- st_as_sf(spatial_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98c0e39a-5433-4dc8-a109-f279f7be0271", - "metadata": {}, - "outputs": [], - "source": [ - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "6399c2eb-9509-4b4f-839a-6c8c83004510", - "metadata": {}, - "source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0fb15b9d-3cc5-4c6f-be25-124169388c25", - "metadata": {}, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "indicator_access <- 'PCT_ITN_ACCESS'\n", - "indicator_use <- 'PCT_ITN_USE'" - ] - }, - { - "cell_type": "markdown", - "id": "0de3a133-4873-43ad-8a33-ed6afa42330b", - "metadata": {}, - "source": [ - "### Unzip data for the analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b88b59b-383c-49a6-b476-9319042243e2", - "metadata": {}, - "outputs": [], - "source": [ - "household_recode <- 'HR'\n", - "person_recode <- 'PR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"bf5e0839-612a-430b-ad22-14bc62d6cad5", - "metadata": {}, - "outputs": [], - "source": [ - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_pr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, person_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_pr_zip_filename), exdir=DHS_DATA_PATH)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca6e3da7-916c-47e1-b475-2bd5cf551bcd", - "metadata": {}, - "outputs": [], - "source": [ - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('_ITN_', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)" - ] - }, - { - "cell_type": "markdown", - "id": "02d577f1-007e-40b2-a3f6-e5b41089ee4a", - "metadata": {}, - "source": [ - "### Import data files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "930ef55c-a590-4d76-bcad-902528f8815a", - "metadata": {}, - "outputs": [], - "source": [ - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_pr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", person_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_pr_filename)){\n", - " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60283f74-b656-4596-82ff-aefa487ddd28", - "metadata": {}, - "outputs": [], - "source": [ - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename)) # person recode\n", - "dhs_pr_dt <- setDT(dhs_pr_dt)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9c47b679-8735-4f15-bac5-6efcf912df86", - "metadata": {}, - "outputs": [], - "source": [ - "# Make admin codes and names dataframe (for future merging)\n", - "\n", - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2727c88e-45fb-4011-9fe8-99b55b8313e8", - "metadata": {}, - "outputs": [], - "source": [ - "# Check that all regions can be matched with DHIS2 pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "659a2a79-a563-4062-b7b4-25652e140c5c", - "metadata": {}, - "source": [ - "### Set relevant columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fe61ebf-11b1-41ee-beb5-a553fffc3015", - "metadata": {}, - "outputs": [], - "source": [ - "household_id_cols <- c(\"HHID\", \"HV000\", \"HV001\", \"HV002\")\n", - "original_household_ITN_cols <- grep('HML10', names(dhs_hr_dt), value = TRUE)\n", - "household_sampling_cols <- c(\"HV005\", \"HV021\", \"HV022\", \"HV023\", \"HV024\")\n", - "household_inhabitants_col <- \"HV013\"\n", - "person_slept_col <- \"HV103\"\n", - "person_id_col <- \"HVIDX\"\n", - "person_bednet_col <- \"HML12\"" - ] - }, - { - "cell_type": "markdown", - "id": "8e71933c-df9f-413a-93eb-38f6e457ce7c", - "metadata": {}, - "source": [ - "## Preprocess Household recode data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72d8d33a-6163-493b-87b9-11e82eae5bf7", - "metadata": {}, - "outputs": [], - "source": [ - "# filter columns\n", - "hr_dt <- dhs_hr_dt[, .SD, .SDcols=c(household_id_cols, household_sampling_cols, household_inhabitants_col, original_household_ITN_cols)]\n", - "\n", - "# check i didn't omit any crucial variable\n", - "nrow(hr_dt[duplicated(hr_dt)])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58bec3c3-9c58-4ae9-8939-af545160239d", - "metadata": {}, - "outputs": [], - "source": [ - "sapply(original_household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "436a4f7b-b3a3-4849-8b4a-7205ffb64efd", - "metadata": {}, - "outputs": [], - "source": [ - "# make syntactically valid names\n", - "setnames(hr_dt, old = names(hr_dt), new = make.names(names(hr_dt)))\n", - "household_ITN_cols <- grep('HML10', names(hr_dt), value = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "id": "6508361d-f908-41fc-9a6b-ef709fcf24cd", - "metadata": {}, - "outputs": [], - "source": [ - "sapply(household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b89fdf1e-059a-4a39-be91-f7722c18824d", - "metadata": {}, - "outputs": [], - "source": [ - "# add admin name column\n", - "hr_dt <- merge.data.table(dhs_admin_dt, hr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"HV024\", all = TRUE)\n", - "\n", - "# sapply(household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))\n", - "\n", - "hr_dt[, (household_ITN_cols) := lapply(.SD, function(x) {\n", - " x <- as.integer(as.character(x)) # convert factors/characters to numeric\n", - " ifelse(is.na(x), 0, x)\n", - "}), .SDcols = household_ITN_cols]\n", - "\n", - "# compute the maximum potential users, given the number of ITNs present in the household\n", - "hr_dt[, max_users := 2 * rowSums(.SD, na.rm = TRUE), .SDcols = household_ITN_cols] # maximum 2 times the number of ITNs in the household\n", - "\n", - "# compute real potential users\n", - "hr_dt[, potential_users := pmin(max_users, HV013, na.rm = TRUE)]\n", - "\n", - "# compute weights\n", - "hr_dt[, wt := HV005/1000000]" - ] - }, - { - "cell_type": "markdown", - "id": "a17d5a0b-de6c-4e06-8567-e03a37936d65", - "metadata": {}, - "source": [ - "## Access to ITN" - ] - }, - { - "cell_type": "markdown", - "id": "c8d3241b-3f42-44fd-9cbc-1c5d3f7d877c", - "metadata": {}, - "source": [ - "### Preprocess person file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6aa78b5e-f5a3-492e-8859-8757fabc6e78", - "metadata": {}, - "outputs": [], - "source": [ - "# filter relevant columns\n", - "access_pr_dt <- dhs_pr_dt[, .SD, .SDcols = c(\n", - " household_id_cols,\n", - " person_id_col,\n", - " person_slept_col\n", - ")]\n", - "\n", - "# # check no necessary column was omitted\n", - "# nrow(access_pr_dt[duplicated(access_pr_dt)])" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "9eb38a46-4c62-4ed8-b33b-e4cb9b7508b3", - "metadata": {}, - "outputs": [], - "source": [ - "# make denominator: group and sum, removing NAs\n", - "access_pr_dt <- access_pr_dt[, .(total_slept = sum(get(person_slept_col), na.rm = TRUE)), by = household_id_cols]" - ] - }, - { - "cell_type": "markdown", - "id": "149e766d-0863-4c74-b12e-26727e940a8c", - "metadata": {}, - "source": [ - "### Join with household file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af6a7e45-18b1-4fbf-af8b-c57886920780", - "metadata": {}, - "outputs": [], - "source": [ - "# check merge with household file\n", - "check_perfect_match(hr_dt, 'HHID', access_pr_dt, 'HHID')\n", - "\n", - "# lapply(household_id_cols, function(i) check_perfect_match(hr_dt, i, access_pr_dt, i))\n", - "if(!all(unlist((lapply(household_id_cols, function(i) check_perfect_match(hr_dt, i, access_pr_dt, i)))))){\n", - " print('Person and Household data does not match')\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02c1997c-f5c6-4eee-b889-a8c8cc81dd0f", - "metadata": {}, - "outputs": [], - "source": [ - "access_dt <- merge.data.table(hr_dt, access_pr_dt, by = household_id_cols, all = TRUE)\n", - "\n", - "# filter rows\n", - "access_dt <- access_dt[total_slept > 0] # to not divide by 0 (only households where someone slept last night)" - ] - }, - { - "cell_type": "markdown", - "id": "50e7c498-1141-46a4-b4fc-b5c4617fb956", - "metadata": {}, - "source": [ - "DHS guidelines for the calculation of “potential users”: \"In households which have more than 1 ITN for every 2 people, the product of this calculation will be greater than the number of individuals who spent the previous night. 
In this case, the “potential users” variable in that household should be modified to reflect the number of individuals who spent the previous night in the household because the number of potential users in a household cannot exceed the number of individuals who spent the previous night in that household.\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b20ff38-d658-43c8-83d5-5357bc7a293a", - "metadata": {}, - "outputs": [], - "source": [ - "access_dt[, foo := fifelse(\n", - " potential_users > total_slept,\n", - " total_slept,\n", - " potential_users\n", - ")]" - ] - }, - { - "cell_type": "markdown", - "id": "fddd66b0-c351-4aa4-bdaa-47542fbdee9d", - "metadata": {}, - "source": [ - "### Compute ITN access indicator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "caa18abf-6589-4846-a705-897ce3943692", - "metadata": {}, - "outputs": [], - "source": [ - "access_dt[, (indicator_access) := potential_users / total_slept]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ff9c48c-1c35-4e14-a4bf-417040e9e4f7", - "metadata": {}, - "outputs": [], - "source": [ - "summary(access_dt[[indicator_access]])" - ] - }, - { - "cell_type": "markdown", - "id": "9c53f039-299a-4c02-a728-77a77d088472", - "metadata": {}, - "source": [ - "#### Account for the sampling strategy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9df627ce-0424-4341-9fcd-b1f13cd769b6", - "metadata": {}, - "outputs": [], - "source": [ - "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", - "access_design_sampling = svydesign(\n", - " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = access_dt, # dataset\n", - " strata = ~ HV023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " num_p=1, # ? 
dunno what this is\n", - " nest = T # the primary sampling units are nested within the strata\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f699992f-e931-4657-abf3-7a2e13a89dbf", - "metadata": {}, - "outputs": [], - "source": [ - "bednet_access_table <- svyby(formula = as.formula(paste(\"~\", indicator_access)),\n", - " # by = ~ ADM1,\n", - " by = reformulate(admin_name_col),\n", - " FUN = svymean,\n", - " design = access_design_sampling,\n", - " level = 0.95,\n", - " vartype = \"ci\",\n", - " na.rm = TRUE,\n", - " influence = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bfc79978-0f59-444b-a8f1-3420864489b1", - "metadata": {}, - "outputs": [], - "source": [ - "setDT(bednet_access_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e71c083-0b67-43db-8f0b-eda14d8a2be3", - "metadata": {}, - "outputs": [], - "source": [ - "lower_bound_col <- glue(\"{toupper(indicator_access)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- glue(\"{toupper(indicator_access)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_access)}_SAMPLE_AVERAGE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c92b8b6-d343-4a69-b508-ae23ebbae48b", - "metadata": {}, - "outputs": [], - "source": [ - "# names(bednet_access_table) <- toupper(names(bednet_access_table))\n", - "names(bednet_access_table)[names(bednet_access_table) == 'ci_l'] <- lower_bound_col\n", - "names(bednet_access_table)[names(bednet_access_table) == 'ci_u'] <- upper_bound_col\n", - "names(bednet_access_table)[names(bednet_access_table) == indicator_access] <- sample_avg_col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98542095-2e06-4860-b038-c7eeceeb6265", - "metadata": {}, - "outputs": [], - "source": [ - "# Cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - "bednet_access_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - 
"bednet_access_table[get(upper_bound_col) > 1, (upper_bound_col) := 1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e920e5-3715-4971-bc6c-62291dc59fc5", - "metadata": {}, - "outputs": [], - "source": [ - "# Convert to percentages\n", - "bednet_access_table[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - "bednet_access_table[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - "bednet_access_table[, (sample_avg_col) := get(sample_avg_col) * 100]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67cb29d2-ee18-4d28-9c72-e253f8dcb5b6", - "metadata": {}, - "outputs": [], - "source": [ - "bednet_access_table <- merge.data.table(admin_data, bednet_access_table, by = admin_name_col, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54935f07-b254-458d-b1fb-8c7a921b5bd9", - "metadata": {}, - "outputs": [], - "source": [ - "head(bednet_access_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85c1bf55-39d6-4c70-88bf-46542f158b0c", - "metadata": {}, - "outputs": [], - "source": [ - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}\")\n", - "write.csv(bednet_access_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(bednet_access_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "markdown", - "id": "484e0a38-8c37-4213-a750-babfaf9107bc", - "metadata": {}, - "source": [ - "## ITN use" - ] - }, - { - "cell_type": "markdown", - "id": "1ebdff14-edbd-4fd5-85f5-230b1e27adb7", - "metadata": {}, - "source": [ - "### Preprocess person file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a3536b2-6fc4-412f-91bc-21466b688e59", - "metadata": {}, - "outputs": [], - "source": [ - "# filter columns\n", - "use_pr_dt <- dhs_pr_dt[, .SD, 
.SDcols=c(household_id_cols, person_id_col, person_slept_col, person_bednet_col)]\n", - "\n", - "# check no necessary column was omitted\n", - "nrow(use_pr_dt[duplicated(use_pr_dt)])\n", - "\n", - "# # for(i in person_slept_col){print(table(access_pr_dt[[i]]))}\n", - "# sapply(person_bednet_col, function(i) table(use_pr_dt[[i]], useNA = 'always'))" - ] - }, - { - "cell_type": "markdown", - "id": "a303e6ec-a3ac-4a20-8f3b-40a06d8067e3", - "metadata": {}, - "source": [ - "The DHS guide ( https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Use_of_Mosquito_Nets_by_Persons_in_the_Household.htm ) suggests to use both 1 & 2 as possible values for HML12; but 2 is \"Both treated (ITN) and untreated nets\"; using as specified in the guide, but to be kept in mind" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35211f4e-e84d-4fbc-be01-577912300d29", - "metadata": {}, - "outputs": [], - "source": [ - "# # group and sum, removing NAs and keeping only 1 as valid value\n", - "# use_pr_dt <- use_pr_dt[, slept_itn := as.integer(\n", - "# get(person_slept_col) == 1 & (get(person_bednet_col) == 1)\n", - "# )]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "132e025d-a699-4c0c-a20d-d4f7503811a6", - "metadata": {}, - "outputs": [], - "source": [ - "# group and sum, removing NAs\n", - "use_pr_dt <- use_pr_dt[, slept_itn := as.integer(\n", - " get(person_slept_col) == 1 & (get(person_bednet_col) %in% c(1, 2))\n", - ")]\n", - "\n", - "# check recodings are correct\n", - "xtabs(~ get(person_slept_col) + get(person_bednet_col) + slept_itn, data = use_pr_dt, addNA = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c883b64-605c-4825-b16f-5d4b2f6febf9", - "metadata": {}, - "outputs": [], - "source": [ - "use_pr_dt <- use_pr_dt[, .(\n", - " total_slept = sum(get(person_slept_col), na.rm = TRUE),\n", - " total_slept_itn = sum(get(\"slept_itn\"), na.rm = TRUE)\n", - "), by = household_id_cols\n", - "]\n", - 
"\n", - "use_pr_dt[, (indicator_use) := total_slept_itn / total_slept]" - ] - }, - { - "cell_type": "markdown", - "id": "ab6ac712-21d4-478b-9f48-fe9d39c4ffc5", - "metadata": {}, - "source": [ - "### Join with household file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e164f7b2-26c4-4357-8b1d-5acad2de6e54", - "metadata": {}, - "outputs": [], - "source": [ - "use_dt <- merge.data.table(hr_dt, use_pr_dt, by = household_id_cols)" - ] - }, - { - "cell_type": "markdown", - "id": "16738a93-d779-4671-9d4a-790e08860f6d", - "metadata": {}, - "source": [ - "### Compute ITN use indicator" - ] - }, - { - "cell_type": "markdown", - "id": "500726f9-5091-45f7-b164-1ab50c362587", - "metadata": {}, - "source": [ - "#### Account for sampling strategy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12f19a36-722c-47a1-8183-7022e9674ade", - "metadata": {}, - "outputs": [], - "source": [ - "use_design_sampling = svydesign(\n", - " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = use_dt, # dataset\n", - " strata = ~ HV023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " num_p=1, # ? 
dunno what this is\n", - " nest = T # the primary sampling units are nested within the strata\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95ca0478-3d3a-42b6-85df-41e6e1fe19fa", - "metadata": {}, - "outputs": [], - "source": [ - "bednet_use_table <- svyby(formula = as.formula(paste(\"~\", indicator_use)),\n", - " # by = ~ ADM1,\n", - " by = reformulate(admin_name_col),\n", - " FUN = svymean,\n", - " design = use_design_sampling,\n", - " level = 0.95,\n", - " vartype = \"ci\",\n", - " na.rm = TRUE,\n", - " influence = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4a571c0d-2a47-423c-b716-a46589f7f41f", - "metadata": {}, - "outputs": [], - "source": [ - "setDT(bednet_use_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79b8ae99-1fb9-4611-b17d-4f5c98e3d9e3", - "metadata": {}, - "outputs": [], - "source": [ - "lower_bound_col <- glue(\"{toupper(indicator_use)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- glue(\"{toupper(indicator_use)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_use)}_SAMPLE_AVERAGE\")\n", - "\n", - "names(bednet_use_table)[names(bednet_use_table) == 'ci_l'] <- lower_bound_col\n", - "names(bednet_use_table)[names(bednet_use_table) == 'ci_u'] <- upper_bound_col\n", - "names(bednet_use_table)[names(bednet_use_table) == indicator_use] <- sample_avg_col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bb41a0b-a67f-4710-9749-498d433ee270", - "metadata": {}, - "outputs": [], - "source": [ - "# Cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - "bednet_use_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - "bednet_use_table[get(upper_bound_col) > 1, (upper_bound_col) := 1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a6d7558-378a-4d85-a2d2-6916e227ff19", - "metadata": {}, - "outputs": [], - "source": [ - "# Convert to percentages\n", - 
"bednet_use_table[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - "bednet_use_table[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - "bednet_use_table[, (sample_avg_col) := get(sample_avg_col) * 100]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b886265-bca7-492c-b216-15209da1d515", - "metadata": {}, - "outputs": [], - "source": [ - "bednet_use_table <- merge.data.table(admin_data, bednet_use_table, by = admin_name_col, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47e8ec4b-b90d-434f-b6a5-5962d43451e5", - "metadata": {}, - "outputs": [], - "source": [ - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_use}\")\n", - "write.csv(bednet_use_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(bednet_use_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0aec24e-9feb-4a7a-a491-be7f44617ac0", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Insecticide-treated net (ITN) access and use, DHS data" + ], + "id": "8ba79c20-9f47-4d61-93ab-19d3802125ec" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources" + ], + "id": "d5d9b645-2094-4b60-a9b2-b89ba33ac4dc" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://dhsprogram.com/data/Guide-to-DHS-Statistics/Access_to_an_Insecticide-Treated_Net_ITN.htm\n", + "\n", +
"https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Use_of_Mosquito_Nets_by_Persons_in_the_Household.htm%23Percentage_of_the1bc-1&rhtocid=_15_3_0\n", + "\n", + "https://dhsprogram.com/publications/publication-dhsg4-dhs-questionnaires-and-manuals.cfm" + ], + "id": "90ddd8fd-ec6b-4a29-b78d-8010cdc4d40e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Access" + ], + "id": "53012a66-e2c7-4ecb-9233-27581e349368" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Percentage of the de facto household population with access to an ITN in the household, defined as the proportion of the de facto household population who could sleep under an ITN if each ITN in the household were used by up to two people.\n", + "\n", + "Numerator: Number of de facto persons (hv103 = 1) who could sleep under an ITN if each ITN in the household is used by up to 2 people, calculated for each household as the minimum of:\n", + "\n", + "1. number of de facto persons in the household (hv013), and\n", + "2. 
twice the number of ITNs per household (2 * sum of hml10_1 – hml10_7 = 1) <- assumed that maximum two people can sleep under a bednet\n", + " \n", + "Denominator: Number of persons who stayed in the household the night before the survey (hv103 = 1)\n", + "\n", + "Variables: hhid (household identification), hml10_1 – _7 (Insecticide-Treated Net (ITN)), hv013 (Number of de facto members), hv103 (Slept last night), hv005 (Household sample weight)" + ], + "id": "64d4e878-f065-47cc-be00-2b0cd3baeacc" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use" + ], + "id": "5bd2650b-e952-45d1-b46a-b08b777a5961" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1) Percentage of the de facto household population who slept the night before the survey under a mosquito net (treated or untreated).\n", + "\n", + "2) Percentage of the de facto household population who slept the night before the survey under an insecticide-treated net (ITN).\n", + "\n", + "3) Among the de facto household population in households with at least one ITN, the percentage who slept under an ITN the night before the survey.\n", + "\n", + "Coverage:\n", + "Population base: De facto household members (PR file, HR file)\n", + "Time period: Night before the survey\n", + "\n", + "Numerators:\n", + "1) Number of de facto persons who reported sleeping under any mosquito net the night before the survey (hv103 = 1 & hml12 in 1:3)\n", + "2) Number of de facto persons who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2)\n", + "3) Number of de facto persons in households with at least one ITN who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2 & any hml10_1 – hml10_7 = 1)\n", + "\n", + "Denominators:\n", + "a) Number of persons in the de facto household population (hv103 = 1)\n", + "b) Number of persons in the de facto household population (hv103 = 1)\n", + "c) Number of persons in the de facto household 
population in households owning at least one ITN (hv103 = 1 & any hml10_1 – hml10_7 = 1)\n", + "\n", + "Variables: HR file, PR file.\n", + "\n", + "\n", + "**Project uses numerator 2) Number of de facto persons who reported sleeping under an ITN the night before the survey (hv103 = 1 & hml12 in 1:2)**\n", + "\n", + "**Project uses denominator b) Number of persons in the de facto household population (hv103 = 1)**" + ], + "id": "327fe0a1-1be5-4d35-a6ff-8913ad56b6c3" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary steps" + ], + "id": "3b050280-b234-45f1-bac5-8e6910079118" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ], + "execution_count": null, + "outputs": [], + "id": "3632c310-6a58-4825-8b80-ce3612b6caca" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ], + "execution_count": null, + "outputs": [], + "id": "cb98532b-56c9-42c7-9bbd-a6f7869bfc76" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')" + ], + "execution_count": null, + "outputs": [], + "id": "ebf120a6-1559-4295-93c4-cfbfd141a67b" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load notebook-specific utils\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_bednets_computation.r\"))\n", + "\n", + "setup_ctx <- bootstrap_dhs_indicators_context(root_path = ROOT_PATH)\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- 
setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'bednets')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)" + ], + "execution_count": null, + "outputs": [], + "id": "cc24cdd4-2ccb-4511-8a63-8ee4b0c29bde" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "reticulate::py_config()$python" + ], + "execution_count": null, + "outputs": [], + "id": "7acc95aa-eb5e-421e-b23e-0efa602d1cc1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Geo and admin data" + ], + "id": "93898419-b98d-4a53-8fc9-a1bb9bff01a4" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "145fe721-f42a-45ff-a3cb-060886fe7a9e" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load spatial file from dataset \n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "4938a133-569e-4aec-bf55-7a633a142bc2" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "spatial_data <- st_as_sf(spatial_data)" + ], + "execution_count": null, + "outputs": [], + "id": "31c87079-f8b6-417a-a206-804d5c3208e8" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + 
" admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ], + "execution_count": null, + "outputs": [], + "id": "98c0e39a-5433-4dc8-a109-f279f7be0271" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import DHS data" + ], + "id": "6399c2eb-9509-4b4f-839a-6c8c83004510" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "data_source <- 'DHS'\n", + "indicator_access <- 'PCT_ITN_ACCESS'\n", + "indicator_use <- 'PCT_ITN_USE'" + ], + "execution_count": null, + "outputs": [], + "id": "0fb15b9d-3cc5-4c6f-be25-124169388c25" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unzip data for the analysis" + ], + "id": "0de3a133-4873-43ad-8a33-ed6afa42330b" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "household_recode <- 'HR'\n", + "person_recode <- 'PR'\n", + "target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")" + ], + "execution_count": null, + "outputs": [], + "id": "1b88b59b-383c-49a6-b476-9319042243e2" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_pr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, person_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_pr_zip_filename), exdir=DHS_DATA_PATH)" + ], + "execution_count": 
null, + "outputs": [], + "id": "bf5e0839-612a-430b-ad22-14bc62d6cad5" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "# files_to_delete <- files[grepl('_ITN_', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)" + ], + "execution_count": null, + "outputs": [], + "id": "ca6e3da7-916c-47e1-b475-2bd5cf551bcd" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import data files" + ], + "id": "02d577f1-007e-40b2-a3f6-e5b41089ee4a" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "data_extension <- '.SAV'\n", + "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_pr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", person_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_pr_filename)){\n", + " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "930ef55c-a590-4d76-bcad-902528f8815a" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename)) # person recode\n", + "dhs_pr_dt <- setDT(dhs_pr_dt)" + ], + "execution_count": null, + "outputs": [], + "id": "60283f74-b656-4596-82ff-aefa487ddd28" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Make admin codes and names dataframe (for future merging)\n", + "\n", + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", + "\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]" + ], + "execution_count": null, + "outputs": [], + "id": "9c47b679-8735-4f15-bac5-6efcf912df86" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "2727c88e-45fb-4011-9fe8-99b55b8313e8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set relevant columns" + ], + "id": "659a2a79-a563-4062-b7b4-25652e140c5c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "household_id_cols <- c(\"HHID\", \"HV000\", \"HV001\", \"HV002\")\n", + "original_household_ITN_cols <- grep('HML10', names(dhs_hr_dt), value = TRUE)\n", + "household_sampling_cols <- c(\"HV005\", \"HV021\", \"HV022\", \"HV023\", \"HV024\")\n", + "household_inhabitants_col <- \"HV013\"\n", + "person_slept_col <- \"HV103\"\n", + "person_id_col <- \"HVIDX\"\n", + "person_bednet_col <- \"HML12\"" + ], + "execution_count": null, + "outputs": [], + "id": "1fe61ebf-11b1-41ee-beb5-a553fffc3015" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocess Household recode data" + ], + "id": "8e71933c-df9f-413a-93eb-38f6e457ce7c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# filter columns\n", + "hr_dt <- dhs_hr_dt[, .SD, .SDcols=c(household_id_cols, household_sampling_cols, household_inhabitants_col, original_household_ITN_cols)]\n", + "\n", + "# check i didn't omit any crucial variable\n", + "nrow(hr_dt[duplicated(hr_dt)])\n" + ], + "execution_count": null, + "outputs": [], + "id": "72d8d33a-6163-493b-87b9-11e82eae5bf7" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "sapply(original_household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))" + ], + "execution_count": null, + "outputs": [], + "id": "58bec3c3-9c58-4ae9-8939-af545160239d" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# make syntactically valid names\n", + "setnames(hr_dt, old = names(hr_dt), 
new = make.names(names(hr_dt)))\n", + "household_ITN_cols <- grep('HML10', names(hr_dt), value = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "436a4f7b-b3a3-4849-8b4a-7205ffb64efd" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "sapply(household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))" + ], + "execution_count": null, + "outputs": [], + "id": "6508361d-f908-41fc-9a6b-ef709fcf24cd" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# add admin name column\n", + "hr_dt <- merge.data.table(dhs_admin_dt, hr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"HV024\", all = TRUE)\n", + "\n", + "# sapply(household_ITN_cols, function(i) table(hr_dt[[i]], useNA = 'always'))\n", + "\n", + "hr_dt[, (household_ITN_cols) := lapply(.SD, function(x) {\n", + " x <- as.integer(as.character(x)) # convert factors/characters to numeric\n", + " ifelse(is.na(x), 0, x)\n", + "}), .SDcols = household_ITN_cols]\n", + "\n", + "# compute the maximum potential users, given the number of ITNs present in the household\n", + "hr_dt[, max_users := 2 * rowSums(.SD, na.rm = TRUE), .SDcols = household_ITN_cols] # maximum 2 times the number of ITNs in the household\n", + "\n", + "# compute real potential users\n", + "hr_dt[, potential_users := pmin(max_users, HV013, na.rm = TRUE)]\n", + "\n", + "# compute weights\n", + "hr_dt[, wt := HV005/1000000]" + ], + "execution_count": null, + "outputs": [], + "id": "b89fdf1e-059a-4a39-be91-f7722c18824d" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access to ITN" + ], + "id": "a17d5a0b-de6c-4e06-8567-e03a37936d65" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocess person file" + ], + "id": "c8d3241b-3f42-44fd-9cbc-1c5d3f7d877c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# filter relevant 
columns\n", + "access_pr_dt <- dhs_pr_dt[, .SD, .SDcols = c(\n", + " household_id_cols,\n", + " person_id_col,\n", + " person_slept_col\n", + ")]\n", + "\n", + "# # check no necessary column was omitted\n", + "# nrow(access_pr_dt[duplicated(access_pr_dt)])" + ], + "execution_count": null, + "outputs": [], + "id": "6aa78b5e-f5a3-492e-8859-8757fabc6e78" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# make denominator: group and sum, removing NAs\n", + "access_pr_dt <- access_pr_dt[, .(total_slept = sum(get(person_slept_col), na.rm = TRUE)), by = household_id_cols]" + ], + "execution_count": null, + "outputs": [], + "id": "9eb38a46-4c62-4ed8-b33b-e4cb9b7508b3" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Join with household file" + ], + "id": "149e766d-0863-4c74-b12e-26727e940a8c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# check merge with household file\n", + "check_perfect_match(hr_dt, 'HHID', access_pr_dt, 'HHID')\n", + "\n", + "# lapply(household_id_cols, function(i) check_perfect_match(hr_dt, i, access_pr_dt, i))\n", + "if(!all(unlist((lapply(household_id_cols, function(i) check_perfect_match(hr_dt, i, access_pr_dt, i)))))){\n", + " print('Person and Household data does not match')\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "af6a7e45-18b1-4fbf-af8b-c57886920780" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "access_dt <- merge.data.table(hr_dt, access_pr_dt, by = household_id_cols, all = TRUE)\n", + "\n", + "# filter rows\n", + "access_dt <- access_dt[total_slept > 0] # to not divide by 0 (only households where someone slept last night)" + ], + "execution_count": null, + "outputs": [], + "id": "02c1997c-f5c6-4eee-b889-a8c8cc81dd0f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DHS guidelines for the 
calculation of “potential users”: \"In households which have more than 1 ITN for every 2 people, the product of this calculation will be greater than the number of individuals who spent the previous night. In this case, the “potential users” variable in that household should be modified to reflect the number of individuals who spent the previous night in the household because the number of potential users in a household cannot exceed the number of individuals who spent the previous night in that household.\"" + ], + "id": "50e7c498-1141-46a4-b4fc-b5c4617fb956" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "access_dt[, potential_users := fifelse( # cap at the number of people who slept in the household (DHS guideline quoted above)\n", + " potential_users > total_slept,\n", + " total_slept,\n", + " potential_users\n", + ")]" + ], + "execution_count": null, + "outputs": [], + "id": "9b20ff38-d658-43c8-83d5-5357bc7a293a" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compute ITN access indicator" + ], + "id": "fddd66b0-c351-4aa4-bdaa-47542fbdee9d" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "access_dt[, (indicator_access) := potential_users / total_slept]" + ], + "execution_count": null, + "outputs": [], + "id": "caa18abf-6589-4846-a705-897ce3943692" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "summary(access_dt[[indicator_access]])" + ], + "execution_count": null, + "outputs": [], + "id": "0ff9c48c-1c35-4e14-a4bf-417040e9e4f7" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Account for the sampling strategy" + ], + "id": "9c53f039-299a-4c02-a728-77a77d088472" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", + "access_design_sampling = svydesign(\n", + " ids = ~ HV021, # primary 
sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = access_dt, # dataset\n", + " strata = ~ HV023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " num_p=1, # ? dunno what this is\n", + " nest = T # the primary sampling units are nested within the strata\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "9df627ce-0424-4341-9fcd-b1f13cd769b6" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}\")\n", + "bednet_access_table <- compute_and_export_indicator_table(\n", + " design_obj = access_design_sampling,\n", + " indicator_name = indicator_access,\n", + " admin_name_col = admin_name_col,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " filename_without_extension = filename_without_extension\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "f699992f-e931-4657-abf3-7a2e13a89dbf" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Access indicator table computed and exported in previous cell." 
+ ], + "execution_count": null, + "outputs": [], + "id": "bfc79978-0f59-444b-a8f1-3420864489b1" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "4e71c083-0b67-43db-8f0b-eda14d8a2be3" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "0c92b8b6-d343-4a69-b508-ae23ebbae48b" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "98542095-2e06-4860-b038-c7eeceeb6265" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "75e920e5-3715-4971-bc6c-62291dc59fc5" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "67cb29d2-ee18-4d28-9c72-e253f8dcb5b6" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "head(bednet_access_table)" + ], + "execution_count": null, + "outputs": [], + "id": "54935f07-b254-458d-b1fb-8c7a921b5bd9" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# already exported by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "85c1bf55-39d6-4c70-88bf-46542f158b0c" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ITN use" + ], + "id": 
"484e0a38-8c37-4213-a750-babfaf9107bc" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocess person file" + ], + "id": "1ebdff14-edbd-4fd5-85f5-230b1e27adb7" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# filter columns\n", + "use_pr_dt <- dhs_pr_dt[, .SD, .SDcols=c(household_id_cols, person_id_col, person_slept_col, person_bednet_col)]\n", + "\n", + "# check no necessary column was omitted\n", + "nrow(use_pr_dt[duplicated(use_pr_dt)])\n", + "\n", + "# # for(i in person_slept_col){print(table(access_pr_dt[[i]]))}\n", + "# sapply(person_bednet_col, function(i) table(use_pr_dt[[i]], useNA = 'always'))" + ], + "execution_count": null, + "outputs": [], + "id": "7a3536b2-6fc4-412f-91bc-21466b688e59" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The DHS guide ( https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Use_of_Mosquito_Nets_by_Persons_in_the_Household.htm ) suggests to use both 1 & 2 as possible values for HML12; but 2 is \"Both treated (ITN) and untreated nets\"; using as specified in the guide, but to be kept in mind" + ], + "id": "a303e6ec-a3ac-4a20-8f3b-40a06d8067e3" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# # group and sum, removing NAs and keeping only 1 as valid value\n", + "# use_pr_dt <- use_pr_dt[, slept_itn := as.integer(\n", + "# get(person_slept_col) == 1 & (get(person_bednet_col) == 1)\n", + "# )]" + ], + "execution_count": null, + "outputs": [], + "id": "35211f4e-e84d-4fbc-be01-577912300d29" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# group and sum, removing NAs\n", + "use_pr_dt <- use_pr_dt[, slept_itn := as.integer(\n", + " get(person_slept_col) == 1 & (get(person_bednet_col) %in% c(1, 2))\n", + ")]\n", + "\n", + "# check recodings are correct\n", + "xtabs(~ get(person_slept_col) + 
get(person_bednet_col) + slept_itn, data = use_pr_dt, addNA = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "132e025d-a699-4c0c-a20d-d4f7503811a6" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "use_pr_dt <- use_pr_dt[, .(\n", + " total_slept = sum(get(person_slept_col), na.rm = TRUE),\n", + " total_slept_itn = sum(get(\"slept_itn\"), na.rm = TRUE)\n", + "), by = household_id_cols\n", + "]\n", + "\n", + "use_pr_dt[, (indicator_use) := total_slept_itn / total_slept]" + ], + "execution_count": null, + "outputs": [], + "id": "3c883b64-605c-4825-b16f-5d4b2f6febf9" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Join with household file" + ], + "id": "ab6ac712-21d4-478b-9f48-fe9d39c4ffc5" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "use_dt <- merge.data.table(hr_dt, use_pr_dt, by = household_id_cols)" + ], + "execution_count": null, + "outputs": [], + "id": "e164f7b2-26c4-4357-8b1d-5acad2de6e54" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compute ITN use indicator" + ], + "id": "16738a93-d779-4671-9d4a-790e08860f6d" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Account for sampling strategy" + ], + "id": "500726f9-5091-45f7-b164-1ab50c362587" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "use_design_sampling = svydesign(\n", + " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = use_dt, # dataset\n", + " strata = ~ HV023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " num_p=1, # ? 
dunno what this is\n", + " nest = T # the primary sampling units are nested within the strata\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "12f19a36-722c-47a1-8183-7022e9674ade" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_use}\")\n", + "bednet_use_table <- compute_and_export_indicator_table(\n", + " design_obj = use_design_sampling,\n", + " indicator_name = indicator_use,\n", + " admin_name_col = admin_name_col,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " filename_without_extension = filename_without_extension\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "95ca0478-3d3a-42b6-85df-41e6e1fe19fa" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Use indicator table computed and exported in previous cell." 
+ ], + "execution_count": null, + "outputs": [], + "id": "4a571c0d-2a47-423c-b716-a46589f7f41f" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "79b8ae99-1fb9-4611-b17d-4f5c98e3d9e3" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "3bb41a0b-a67f-4710-9749-498d433ee270" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "8a6d7558-378a-4d85-a2d2-6916e227ff19" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "9b886265-bca7-492c-b216-15209da1d515" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# already exported by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "47e8ec4b-b90d-434f-b6a5-5962d43451e5" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "b0aec24e-9feb-4a7a-a491-be7f44617ac0" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git 
a/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb index 39e5171..da8d105 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_careseeking_computation.ipynb @@ -1,605 +1,531 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "c9aadda5-521c-4345-916c-a60f9802a853", - "metadata": {}, - "source": [ - "# Careseeking behavior upon child fever (DHS data)" - ] - }, - { - "cell_type": "markdown", - "id": "4ce067b3-f707-4df6-b7de-30389f125534", - "metadata": {}, - "source": [ - "## Resources" - ] - }, - { - "cell_type": "markdown", - "id": "a71afb11-ea04-40e9-8566-4b80a0857e25", - "metadata": {}, - "source": [ - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/Fever_and_Careseeking.htm?rhtocid=_13_3_0#Percentage_of_children4\n", - "\n", - "1) Percentage of children under age 5 years with fever in the 2 weeks preceding the survey.\n", - "2) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought.\n", - "3) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought the same or next day following the onset of fever.\n", - "4) Among children under age 5 with fever in the 2 weeks preceding the survey, percentage who took antibiotic drugs.\n", - "\n", - "Coverage:\n", - "\n", - "Population base: Living children under age 5 years (KR file)\n", - "\n", - "Time period: Two weeks preceding the survey\n", - "\n", - "Numerators:\n", - "1) Number of living children under age 5 years with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 & h22 = 1)\n", - "2) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview for whom advice or treatment was sought (b5 = 1 & b19 < 60 & h22 
= 1 & any of h32a – x = 1 except traditional practitioner (usually h32t))\n", - "3) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview for whom advice or treatment was sought the same day or next day following the onset of fever (b5 = 1 & b19 < 60 & h22 = 1 & any of h32a – x = 1 excluding advice or treatment from a traditional practitioner (usually h32t) & h46b in 0:1)\n", - "4) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview who took antibiotic drugs (b5 = 1 & h22 = 1 & (h37i = 1 or h37j = 1 or h37n= 1 or h37o = 1) [or ml13i = 1 or ml13j = 1 or ml13n = 1 or ml13o = 1])\n", - "\n", - "Denominators:\n", - "- Numerator 1: Number of living children under age 5 (b5 = 1 & b19 < 60)\n", - "\n", - "- Numerators 2, 3, and 4: Number of living children under age 5 with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 and h22 = 1).\n", - "\n", - "Project uses (split by \"private/public\"): \"2) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought.\"" - ] - }, - { - "cell_type": "markdown", - "id": "570901a3-7312-4583-bb4f-753e7e7c0ca2", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58c2d4af-aa4e-45ad-9ef7-6dba584f12ca", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "690423e8-6e7d-49fc-8f01-4a1f445fa537", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- 
file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "687e8bab-120e-4367-b4fe-43c3dee11185", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "markdown", - "id": "e07d8a08-5bae-4189-80bf-57d6ba653e83", - "metadata": {}, - "source": [ - "## Spatial/admin data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18fd6f07-2e0f-4e26-90d4-ac07e052379a", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Careseeking behavior upon child fever (DHS data)" + ], + "id": 
"c9aadda5-521c-4345-916c-a60f9802a853" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources" + ], + "id": "4ce067b3-f707-4df6-b7de-30389f125534" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://dhsprogram.com/data/Guide-to-DHS-Statistics/Fever_and_Careseeking.htm?rhtocid=_13_3_0#Percentage_of_children4\n", + "\n", + "1) Percentage of children under age 5 years with fever in the 2 weeks preceding the survey.\n", + "2) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought.\n", + "3) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought the same or next day following the onset of fever.\n", + "4) Among children under age 5 with fever in the 2 weeks preceding the survey, percentage who took antibiotic drugs.\n", + "\n", + "Coverage:\n", + "\n", + "Population base: Living children under age 5 years (KR file)\n", + "\n", + "Time period: Two weeks preceding the survey\n", + "\n", + "Numerators:\n", + "1) Number of living children under age 5 years with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 & h22 = 1)\n", + "2) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview for whom advice or treatment was sought (b5 = 1 & b19 < 60 & h22 = 1 & any of h32a – x = 1 except traditional practitioner (usually h32t))\n", + "3) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview for whom advice or treatment was sought the same day or next day following the onset of fever (b5 = 1 & b19 < 60 & h22 = 1 & any of h32a – x = 1 excluding advice or treatment from a traditional practitioner (usually h32t) & h46b in 0:1)\n", + "4) Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the 
interview who took antibiotic drugs (b5 = 1 & h22 = 1 & (h37i = 1 or h37j = 1 or h37n= 1 or h37o = 1) [or ml13i = 1 or ml13j = 1 or ml13n = 1 or ml13o = 1])\n", + "\n", + "Denominators:\n", + "- Numerator 1: Number of living children under age 5 (b5 = 1 & b19 < 60)\n", + "\n", + "- Numerators 2, 3, and 4: Number of living children under age 5 with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 and h22 = 1).\n", + "\n", + "Project uses (split by \"private/public\"): \"2) Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought.\"" + ], + "id": "a71afb11-ea04-40e9-8566-4b80a0857e25" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary steps" + ], + "id": "570901a3-7312-4583-bb4f-753e7e7c0ca2" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ], + "execution_count": null, + "outputs": [], + "id": "58c2d4af-aa4e-45ad-9ef7-6dba584f12ca" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "\n", + "# Load notebook-specific utils\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_careseeking_computation.r\"))\n", + "\n", + "setup_ctx <- bootstrap_dhs_indicators_context(root_path = ROOT_PATH)\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)" + ], + "execution_count": null, + "outputs": [], + "id": "690423e8-6e7d-49fc-8f01-4a1f445fa537" + }, + { + 
"cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "reticulate::py_config()$python" + ], + "execution_count": null, + "outputs": [], + "id": "687e8bab-120e-4367-b4fe-43c3dee11185" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spatial/admin data" + ], + "id": "e07d8a08-5bae-4189-80bf-57d6ba653e83" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "18fd6f07-2e0f-4e26-90d4-ac07e052379a" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ], + "execution_count": null, + "outputs": [], + "id": "9e0ebcf5-45d4-4c40-be95-cf27b0a3ae75" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import DHS data" + ], + "id": "51c988f2-1fb5-4e35-8920-c5d341a543f3" + }, + { + "cell_type": "code", + 
"metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "data_source <- 'DHS'\n", + "\n", + "indicator_public_care <- 'PCT_PUBLIC_CARE'\n", + "indicator_private_care <- 'PCT_PRIVATE_CARE'\n", + "indicator_no_care <- 'PCT_NO_CARE'" + ], + "execution_count": null, + "outputs": [], + "id": "599d0f6e-d0f5-42fd-8e78-6a87bdbd12a6" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "household_recode <- 'HR'\n", + "kid_recode <- 'KR'\n", + "target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", + "\n", + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)\n", + "\n", + "data_extension <- '.SAV'\n", + "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_kr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", kid_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)){\n", + " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", + "}\n", + "\n", + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_kr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename)) # kid recode\n", + "dhs_kr_dt <- setDT(dhs_kr_dt)" + ], + "execution_count": null, + "outputs": [], + "id": "9589f2b2-c0ad-4a46-8778-faa4f427eb37" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Make admin codes and names dataframe (for future merging)" + ], + "id": "c26e24f7-0a59-4dbf-b493-a760c9d30e39" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", + "\n", + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", + "\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", + "\n", + "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "f244f017-b532-4106-8beb-4bf7c0ed4d02" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "rm(dhs_hr_dt) # will not be used further" + ], + "execution_count": null, + "outputs": [], + "id": "52af343a-bbee-4601-9e3f-adceb60d809f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Relevant columns" + ], + "id": "c35b457b-e6ac-42c6-bad5-4ee328e94177" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "kid_id_cols <- c(\"CASEID\", \"V000\", \"V001\", \"V002\")\n", + "household_sampling_cols <- c(\"V005\", \"V021\", \"V022\", \"V023\", \"V024\")\n", + "kid_alive_col <- \"B5\"\n", + "kid_age_col <- \"B19\"\n", + "kid_fever_col <- \"H22\"\n", + "# grep(\"^H32\", names(dhs_kr_dt), value = TRUE)\n", + "kid_public_care_cols <- c(\"H32A\", \"H32B\", \"H32C\", \"H32D\", \"H32E\", \"H32F\", \"H32G\", \"H32H\", \"H32I\")\n", + "kid_private_care_cols <- c(\"H32J\", \"H32K\", \"H32L\", \"H32M\", \"H32N\", \"H32O\", \"H32P\", \"H32Q\", \"H32R\",\n", + " \"H32NA\", \"H32NB\", \"H32NC\", \"H32ND\", \"H32NE\")\n", + "kid_other_care_cols <- c(\"H32S\", \"H32W\", \"H32X\")" + ], + "execution_count": null, + "outputs": [], + "id": "4ea43f49-f9dd-4403-a3b4-885ff31da8bb" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocess kid file" + ], + "id": "6b0924e4-aef6-4e1c-be29-4f818fb14d8e" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# filter rows and columns\n", + "kr_dt <- dhs_kr_dt[(\n", + " !(is.na(get(kid_age_col))) & # no missing age\n", + " get(kid_age_col) < 60 & # younger than 5\n", + " get(kid_alive_col) == 1 & # alive\n", + " get(kid_fever_col) == 1 # had fever\n", + " ),\n", + " .SD, .SDcols = c(\n", + " kid_id_cols,\n", + " 
household_sampling_cols,\n", + " kid_alive_col,\n", + " kid_age_col,\n", + " kid_fever_col,\n", + " kid_other_care_cols,\n", + " kid_public_care_cols,\n", + " kid_private_care_cols\n", + " )]\n", + "\n", + "kr_dt[, wt := V005/1000000]\n", + "\n", + "kr_dt <- merge.data.table(dhs_admin_dt, kr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"V024\", all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "5184585f-bb68-4e99-89e2-ff52ffbf2eed" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Create the numerators\n", + "\n", + "kr_dt[, (indicator_public_care) := as.integer(rowSums(.SD == 1, na.rm = TRUE) > 0), .SDcols = kid_public_care_cols]\n", + "kr_dt[, (indicator_private_care) := as.integer(rowSums(.SD == 1, na.rm = TRUE) > 0), .SDcols = kid_private_care_cols]\n", + "kr_dt[, (indicator_no_care) := as.integer(rowSums(.SD != 0, na.rm = TRUE) == 0), .SDcols = c(kid_public_care_cols, kid_private_care_cols)]\n", + "\n", + "# check\n", + "xtabs(~ kr_dt[[indicator_public_care]] + kr_dt[[indicator_private_care]] + kr_dt[[indicator_no_care]])" + ], + "execution_count": null, + "outputs": [], + "id": "3c3f8447-842f-414c-b47f-869684c05c37" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sampling design" + ], + "id": "ead4ca71-3a5c-445f-ae61-3359a7cc7e31" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", + "kr_design_sampling = svydesign(\n", + " ids = ~ V021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = kr_dt, # dataset\n", + " strata = ~ V023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " num_p=1, # ? 
TODO(review): num_p does not appear to be a documented svydesign() argument (presumably absorbed by ...) - confirm, then remove\n", + " nest = T # the primary sampling units are nested within the strata\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "81f99ad0-b0fa-4be8-b6df-9b726a0eccad" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute indicators" + ], + "id": "88f9844d-3f29-4cc9-91e7-351bc81b6d69" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make the indicator tables and save them, add the sample estimation of the average proportion to a summary table" + ], + "id": "0bec1af7-62e7-4da6-8531-1ed6f9933dc0" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "indicator_results <- compute_careseeking_indicators(\n", + " kr_design_sampling = kr_design_sampling,\n", + " indicator_names = c(indicator_public_care, indicator_private_care, indicator_no_care),\n", + " admin_name_col = admin_name_col,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level\n", + ")\n", + "\n", + "summary_table <- indicator_results$summary_table\n", + "pct_public_care_table <- indicator_results$indicator_tables[[indicator_public_care]]\n", + "pct_private_care_table <- indicator_results$indicator_tables[[indicator_private_care]]\n", + "pct_no_care_table <- indicator_results$indicator_tables[[indicator_no_care]]" + ], + "execution_count": null, + "outputs": [], + "id": "357ecff3-538e-4265-b024-51f678bcf29f" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Summary table already prepared and renamed in compute_careseeking_indicators()\n", + "head(summary_table)" + ], + "execution_count": null, + "outputs": [], + "id": "f40da01a-4558-43af-9d1a-136b7c6920db" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Summary output already 
exported in compute_careseeking_indicators()" + ], + "execution_count": null, + "outputs": [], + "id": "3896b18f-ad5f-49c2-a2cc-2ebf40208aca" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "2449bdb3-61cf-4ddd-a272-2172da6cba2e" } - }, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e0ebcf5-45d4-4c40-be95-cf27b0a3ae75", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "51c988f2-1fb5-4e35-8920-c5d341a543f3", - "metadata": {}, - 
"source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "599d0f6e-d0f5-42fd-8e78-6a87bdbd12a6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "\n", - "indicator_public_care <- 'PCT_PUBLIC_CARE'\n", - "indicator_private_care <- 'PCT_PRIVATE_CARE'\n", - "indicator_no_care <- 'PCT_NO_CARE'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9589f2b2-c0ad-4a46-8778-faa4f427eb37", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "household_recode <- 'HR'\n", - "kid_recode <- 'KR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", - "\n", - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)\n", - "\n", - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_kr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", kid_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)){\n", - " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", - "}\n", - "\n", - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_kr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename)) # kid recode\n", - "dhs_kr_dt <- setDT(dhs_kr_dt)" - ] - }, - { - "cell_type": "markdown", - "id": "c26e24f7-0a59-4dbf-b493-a760c9d30e39", - "metadata": {}, - "source": [ - "### Make admin codes and names dataframe (for future merging)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f244f017-b532-4106-8beb-4bf7c0ed4d02", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", - "\n", - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", - "\n", - "# Check that all regions can be matched with DHIS2 pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52af343a-bbee-4601-9e3f-adceb60d809f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "rm(dhs_hr_dt) # will not be used further" - ] - }, - { - "cell_type": "markdown", - "id": "c35b457b-e6ac-42c6-bad5-4ee328e94177", - "metadata": {}, - "source": [ - "### Relevant columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ea43f49-f9dd-4403-a3b4-885ff31da8bb", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "kid_id_cols <- c(\"CASEID\", \"V000\", \"V001\", \"V002\")\n", - "household_sampling_cols <- c(\"V005\", \"V021\", \"V022\", \"V023\", \"V024\")\n", - "kid_alive_col <- \"B5\"\n", - "kid_age_col <- \"B19\"\n", - "kid_fever_col <- \"H22\"\n", - "# grep(\"^H32\", names(dhs_kr_dt), value = TRUE)\n", - "kid_public_care_cols <- c(\"H32A\", \"H32B\", \"H32C\", \"H32D\", \"H32E\", \"H32F\", \"H32G\", \"H32H\", \"H32I\")\n", - "kid_private_care_cols <- c(\"H32J\", \"H32K\", \"H32L\", \"H32M\", \"H32N\", \"H32O\", \"H32P\", \"H32Q\", \"H32R\",\n", - " \"H32NA\", \"H32NB\", \"H32NC\", \"H32ND\", \"H32NE\")\n", - "kid_other_care_cols <- c(\"H32S\", \"H32W\", \"H32X\")" - ] - }, - { - "cell_type": "markdown", - "id": "6b0924e4-aef6-4e1c-be29-4f818fb14d8e", - "metadata": {}, - "source": [ - "## Preprocess kid file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5184585f-bb68-4e99-89e2-ff52ffbf2eed", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# filter rows and columns\n", - "kr_dt <- dhs_kr_dt[(\n", - " !(is.na(get(kid_age_col))) & # no missing age\n", - " get(kid_age_col) < 60 & # younger than 5\n", - " get(kid_alive_col) == 1 & # alive\n", - " get(kid_fever_col) == 1 # had fever\n", - " ),\n", - " .SD, .SDcols = c(\n", - " kid_id_cols,\n", - " 
household_sampling_cols,\n", - " kid_alive_col,\n", - " kid_age_col,\n", - " kid_fever_col,\n", - " kid_other_care_cols,\n", - " kid_public_care_cols,\n", - " kid_private_care_cols\n", - " )]\n", - "\n", - "kr_dt[, wt := V005/1000000]\n", - "\n", - "kr_dt <- merge.data.table(dhs_admin_dt, kr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"V024\", all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c3f8447-842f-414c-b47f-869684c05c37", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create the numerators\n", - "\n", - "kr_dt[, (indicator_public_care) := as.integer(rowSums(.SD == 1, na.rm = TRUE) > 0), .SDcols = kid_public_care_cols]\n", - "kr_dt[, (indicator_private_care) := as.integer(rowSums(.SD == 1, na.rm = TRUE) > 0), .SDcols = kid_private_care_cols]\n", - "kr_dt[, (indicator_no_care) := as.integer(rowSums(.SD != 0, na.rm = TRUE) == 0), .SDcols = c(kid_public_care_cols, kid_private_care_cols)]\n", - "\n", - "# check\n", - "xtabs(~ kr_dt[[indicator_public_care]] + kr_dt[[indicator_private_care]] + kr_dt[[indicator_no_care]])" - ] - }, - { - "cell_type": "markdown", - "id": "ead4ca71-3a5c-445f-ae61-3359a7cc7e31", - "metadata": {}, - "source": [ - "### Sampling design" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81f99ad0-b0fa-4be8-b6df-9b726a0eccad", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", - "kr_design_sampling = svydesign(\n", - " ids = ~ V021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = kr_dt, # dataset\n", - " strata = ~ V023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " num_p=1, # ? 
dunno what this is\n", - " nest = T # the primary sampling units are nested within the strata\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "88f9844d-3f29-4cc9-91e7-351bc81b6d69", - "metadata": {}, - "source": [ - "## Compute indicators" - ] - }, - { - "cell_type": "markdown", - "id": "0bec1af7-62e7-4da6-8531-1ed6f9933dc0", - "metadata": {}, - "source": [ - "Make the indicator tables and save them, add the sample estimation of the average proportion to a summary table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "357ecff3-538e-4265-b024-51f678bcf29f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "summary_table <- copy(admin_data)\n", - "\n", - "for (indicator_name in c(indicator_public_care, indicator_private_care, indicator_no_care)){\n", - " \n", - " # make the table name\n", - " table_name <- glue(tolower(indicator_name), 'table', .sep = '_')\n", - " \n", - " # create the content for the table\n", - " table_content <- svyby(\n", - " formula = as.formula(paste(\"~\", indicator_name)),\n", - " by = reformulate(admin_name_col),\n", - " FUN = svymean,\n", - " design = kr_design_sampling,\n", - " level = 0.95,\n", - " vartype = \"ci\",\n", - " na.rm = TRUE,\n", - " influence = TRUE # which observations have a substantial change in the results of the analysis\n", - " )\n", - " \n", - " # make it into data.table\n", - " setDT(table_content)\n", - "\n", - " lower_bound_col <- glue(\"{toupper(indicator_name)}_CI_LOWER_BOUND\")\n", - " upper_bound_col <- glue(\"{toupper(indicator_name)}_CI_UPPER_BOUND\")\n", - " sample_avg_col <- glue(\"{toupper(indicator_name)}_SAMPLE_AVERAGE\")\n", - " \n", - " # names(table_content) <- toupper(names(table_content))\n", - " names(table_content)[names(table_content) == 'ci_l'] <- lower_bound_col\n", - " names(table_content)[names(table_content) == 'ci_u'] <- upper_bound_col\n", - " names(table_content)[names(table_content) == indicator_name] <- 
sample_avg_col\n", - " \n", - " # cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - " table_content[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - " table_content[get(upper_bound_col) > 1, (upper_bound_col) := 1]\n", - "\n", - " # convert to percentages\n", - " table_content[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - " table_content[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - " table_content[, (sample_avg_col) := get(sample_avg_col) * 100]\n", - " \n", - " # add the sample average column to the summary table\n", - " indicator_estimation_table <- table_content[\n", - " ,\n", - " .SD,\n", - " .SDcols = c(\n", - " admin_name_col,\n", - " grep('SAMPLE_AVERAGE', names(table_content), value = TRUE)\n", - " )\n", - " ]\n", - " \n", - " # add the admin id column to the indicator output table\n", - " table_content <- merge.data.table(admin_data, table_content, by = admin_name_col)\n", - " \n", - " # add the admin id column to the summary point estimates table\n", - " summary_table <- merge.data.table(summary_table, indicator_estimation_table, by = admin_name_col)\n", - " \n", - " # write it to .csv and .parquet files\n", - " filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_name)}\")\n", - " write.csv(table_content, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - " write_parquet(table_content, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))\n", - " \n", - " # assign the content to its variable name\n", - " assign(table_name, table_content)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f40da01a-4558-43af-9d1a-136b7c6920db", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# changing names for the summary table\n", - "names(summary_table) <- gsub('_SAMPLE_AVERAGE', '', names(summary_table))" - ] - }, - { 
- "cell_type": "code", - "execution_count": null, - "id": "3896b18f-ad5f-49c2-a2cc-2ebf40208aca", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "summary_filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_PCT_CARESEEKING_SAMPLE_AVERAGE\")\n", - "write.csv(summary_table, file = file.path(OUTPUT_DATA_PATH, paste0(summary_filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(summary_table, file.path(OUTPUT_DATA_PATH, paste0(summary_filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2449bdb3-61cf-4ddd-a272-2172da6cba2e", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb index 9c5a000..1625e34 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_mortality_computation.ipynb @@ -1,368 +1,375 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "5ce00012-556c-45dc-a572-a38b2205e5a8", - "metadata": {}, - "source": [ - "# Under-five mortality (DHS data)" - ] + "cells": [ + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# Under-five mortality (DHS data)" + ], + "id": "5ce00012-556c-45dc-a572-a38b2205e5a8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources" + ], + "id": "e455838e-0ba0-475f-8860-07fd0705fdae" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Early_Childhood_Mortality.htm\n", + "\n", + "Under-5 Mortality Rate (U5MR)\n", + "The under-5 mortality rate is the probability (expressed as a rate per 1,000 live births) of a child exposed in a specific period dying before reaching the age of five years.\n", + "\n", + "\n", + "Coverage:\n", + "Population base: Live births to respondents (BR file)\n", + "\n", + "Time period: Five-year or ten-year periods of time preceding the survey (v008-1 to v008-60 or v008-120 months), excluding the month of interview\n", + "\n", + "Numerators:\n", + "Number of deaths to live-born children during specified age range and specified time period\n", + "Under-5 mortality: Deaths at ages 0 to 4 years, including deaths reported at ages 0 to 59 months and 0 to 99 days\n", + "\n", + "Denominator: Number of surviving children at beginning of specified age range during the specified time period\n", + "\n", + "Variables: BR file.\n", + "\n", + "b3 Date of birth of child (CMC)\n", + "\n", + "b5 Child is alive (1 = Yes, 0 = No)\n", + "\n", + "b7 Age at death in months (imputed)\n", + "\n", + "v008 Date of interview (CMC)\n", + "\n", + "v005 Woman’s individual sample weight" + ], + "id": "ca3f5f16-0df0-432d-9363-e4fcb4ab195b" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary steps" + ], + "id": "a2f57835-fe83-44fe-b9a3-a0e21381c21b" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)\n", + "\n", + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + 
"PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_mortality_computation.r\"))\n", + "setup_ctx <- bootstrap_dhs_indicators_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\", \"DHS.rates\")\n", + ")\n", + "\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'mortality')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "reticulate::py_config()$python" + ], + "execution_count": null, + "outputs": [], + "id": "fc050b96-7cd0-4eba-aa9b-0744e531dd4c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "91cd32a3-7ec7-467d-917c-2a2fbd0f13a1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Geo/admin data" + ], + "id": "1f0a0363-d31a-4667-bad2-f3aca94c57ed" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " 
sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ], + "execution_count": null, + "outputs": [], + "id": "ca05f908-4c8a-4fd7-832c-6d52a7da83e8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import DHS data" + ], + "id": "1e7a06e8-fe84-4a75-aac8-316abc36e7ce" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "indicator_u5mr <- 'U5MR_PERMIL' # to be computed\n", + "\n", + "data_source <- 'DHS'\n", + "household_recode <- 'HR'\n", + "births_recode <- 'BR'\n", + "target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", + "\n", + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_br_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, births_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_br_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "# files_to_delete <- files[grepl('U5_MORT', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)\n", + "\n", + "data_extension <- '.SAV'\n", + "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_br_filename <- dir(path = DHS_DATA_PATH, pattern = 
paste0(\".*\", births_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_br_filename)){\n", + " stop(\"The input DHS data do not have the same version/issue. Check available data before rerunning.\")\n", + "}\n", + "\n", + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_br_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_br_filename)) # births recode\n", + "dhs_br_dt <- setDT(dhs_br_dt)\n", + "\n", + "# Make admin codes and names dataframe (for future merging)\n", + "\n", + "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", + "\n", + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", + "\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", + "\n", + "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", + "}\n", + "\n", + "rm(dhs_hr_dt) # free up resources" + ], + "execution_count": null, + "outputs": [], + "id": "5ec3b998-25a8-455b-8b61-4201d300888e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocess DHS data" + ], + "id": "71af8f28-3a2d-4ae2-812c-7cec9cd89288" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Relevant columns\n", + "household_id_cols <- c(\"CASEID\", \"V000\", \"V001\", \"V002\")\n", + "household_sampling_cols <- c(\"V005\", \"V021\", \"V022\", \"V023\", \"V024\")\n", + "birth_date_col <- \"B3\" # Date of birth of child (CMC)\n", + "alive_col <- \"B5\" # Child is alive (1 = Yes, 0 = No)\n", + "death_age_col <-\"B7\" # Age at death in months (imputed)\n", + "end_date_col <- \"V008\" # Date of interview (CMC)\n", + "\n", + "dhs_br_dt[, (birth_date_col) := as.integer(get(birth_date_col))]\n", + "dhs_br_dt[, (death_age_col) := as.integer(get(death_age_col))]\n", + "dhs_br_dt[, (end_date_col) := as.integer(get(end_date_col))]\n", + "\n", + "dhs_br_dt <- dhs_br_dt[\n", + " ,\n", + " .SD,\n", + " .SDcols = c(\n", + " household_id_cols,\n", + " household_sampling_cols,\n", + " birth_date_col,\n", + " alive_col,\n", + " death_age_col,\n", + " end_date_col\n", + ")\n", + "]" + ], + "execution_count": null, + "outputs": [], + "id": "e560e51b-369e-4365-868b-1285a5522f7c" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute indicator" + ], + "id": "d48831a5-2b1f-4201-9438-e7fca9b8accb" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "region_dt_list <- split(dhs_br_dt, by = \"V024\")\n", + "\n", + "u5mort_table <- rbindlist(\n", + " lapply(region_dt_list, make_dhs_adm1_u5mort_dt) \n", + ")\n", + "\n", + "lower_bound_col <- glue(\"{toupper(indicator_u5mr)}_CI_LOWER_BOUND\")\n", + "upper_bound_col <- 
glue(\"{toupper(indicator_u5mr)}_CI_UPPER_BOUND\")\n", + "sample_avg_col <- glue(\"{toupper(indicator_u5mr)}_SAMPLE_AVERAGE\")\n", + "\n", + "# add necessary missing columns and remove non-necessary present columns\n", + "u5mort_table <- merge.data.table(dhs_admin_dt, u5mort_table, by = 'DHS_ADM1_CODE', all = TRUE)\n", + "setnames(u5mort_table,\n", + " old=c(\"R\", \"LCI\", \"UCI\"),\n", + " new=c(\n", + " sample_avg_col,\n", + " lower_bound_col,\n", + " upper_bound_col\n", + " ),\n", + " skip_absent=TRUE # not changing all names\n", + " )\n", + "u5mort_table <- merge.data.table(admin_data, u5mort_table, by = admin_name_col)\n", + "u5mort_table <- u5mort_table[\n", + " ,\n", + " .SD,\n", + " .SDcols = c(\n", + " admin_cols,\n", + " sample_avg_col,\n", + " lower_bound_col,\n", + " upper_bound_col\n", + " )\n", + " ]\n", + "\n", + "# Cap the CI's at 0 (in case of small numbers)\n", + "u5mort_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", + "\n", + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}\")\n", + "write.csv(u5mort_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", + "write_parquet(u5mort_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" + ], + "execution_count": null, + "outputs": [], + "id": "bf8c11a9-e69c-47e2-87df-a693b9e68cd3" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "25bf0a56-a8bf-45a9-b63f-d0bdbd606fde" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "e455838e-0ba0-475f-8860-07fd0705fdae", - "metadata": {}, - 
"source": [ - "## Resources" - ] - }, - { - "cell_type": "markdown", - "id": "ca3f5f16-0df0-432d-9363-e4fcb4ab195b", - "metadata": {}, - "source": [ - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Early_Childhood_Mortality.htm\n", - "\n", - "Under-5 Mortality Rate (U5MR)\n", - "The under-5 mortality rate is the probability (expressed as a rate per 1,000 live births) of a child exposed in a specific period dying before reaching the age of five years.\n", - "\n", - "\n", - "Coverage:\n", - "Population base: Live births to respondents (BR file)\n", - "\n", - "Time period: Five-year or ten-year periods of time preceding the survey (v008-1 to v008-60 or v008-120 months), excluding the month of interview\n", - "\n", - "Numerators:\n", - "Number of deaths to live-born children during specified age range and specified time period\n", - "Under-5 mortality: Deaths at ages 0 to 4 years, including deaths reported at ages 0 to 59 months and 0 to 99 days\n", - "\n", - "Denominator: Number of surviving children at beginning of specified age range during the specified time period\n", - "\n", - "Variables: BR file.\n", - "\n", - "b3 Date of birth of child (CMC)\n", - "\n", - "b5 Child is alive (1 = Yes, 0 = No)\n", - "\n", - "b7 Age at death in months (imputed)\n", - "\n", - "v008 Date of interview (CMC)\n", - "\n", - "v005 Woman’s individual sample weight" - ] - }, - { - "cell_type": "markdown", - "id": "a2f57835-fe83-44fe-b9a3-a0e21381c21b", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc050b96-7cd0-4eba-aa9b-0744e531dd4c", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)\n", - "\n", - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 
'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'mortality')\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\", \"DHS.rates\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91cd32a3-7ec7-467d-917c-2a2fbd0f13a1", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "markdown", - "id": "1f0a0363-d31a-4667-bad2-f3aca94c57ed", - "metadata": {}, - "source": [ - "## Geo/admin data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca05f908-4c8a-4fd7-832c-6d52a7da83e8", - "metadata": {}, - "outputs": [], - "source": [ - "# Load 
spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "1e7a06e8-fe84-4a75-aac8-316abc36e7ce", - "metadata": {}, - "source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ec3b998-25a8-455b-8b61-4201d300888e", - "metadata": {}, - "outputs": [], - "source": [ - "indicator_u5mr <- 'U5MR_PERMIL' # to be computed\n", - "\n", - "data_source <- 'DHS'\n", - "household_recode <- 'HR'\n", - "births_recode <- 'BR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", - "\n", - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_br_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, births_recode, 
target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_br_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('U5_MORT', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)\n", - "\n", - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_br_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", births_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_br_filename)){\n", - " stop(\"The input DHS data do not have the same version/issue. Check available data before rerunning.\")\n", - "}\n", - "\n", - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_br_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_br_filename)) # births recode\n", - "dhs_br_dt <- setDT(dhs_br_dt)\n", - "\n", - "# Make admin codes and names dataframe (for future merging)\n", - "\n", - "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", - "\n", - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", - "\n", - "# Check that all regions can be matched with DHIS2 
pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. Please check input data before retrying.\")\n", - "}\n", - "\n", - "rm(dhs_hr_dt) # free up resources" - ] - }, - { - "cell_type": "markdown", - "id": "71af8f28-3a2d-4ae2-812c-7cec9cd89288", - "metadata": {}, - "source": [ - "## Preprocess DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e560e51b-369e-4365-868b-1285a5522f7c", - "metadata": {}, - "outputs": [], - "source": [ - "# Relevant columns\n", - "household_id_cols <- c(\"CASEID\", \"V000\", \"V001\", \"V002\")\n", - "household_sampling_cols <- c(\"V005\", \"V021\", \"V022\", \"V023\", \"V024\")\n", - "birth_date_col <- \"B3\" # Date of birth of child (CMC)\n", - "alive_col <- \"B5\" # Child is alive (1 = Yes, 0 = No)\n", - "death_age_col <-\"B7\" # Age at death in months (imputed)\n", - "end_date_col <- \"V008\" # Date of interview (CMC)\n", - "\n", - "dhs_br_dt[, (birth_date_col) := as.integer(get(birth_date_col))]\n", - "dhs_br_dt[, (death_age_col) := as.integer(get(death_age_col))]\n", - "dhs_br_dt[, (end_date_col) := as.integer(get(end_date_col))]\n", - "\n", - "dhs_br_dt <- dhs_br_dt[\n", - " ,\n", - " .SD,\n", - " .SDcols = c(\n", - " household_id_cols,\n", - " household_sampling_cols,\n", - " birth_date_col,\n", - " alive_col,\n", - " death_age_col,\n", - " end_date_col\n", - ")\n", - "]" - ] - }, - { - "cell_type": "markdown", - "id": "d48831a5-2b1f-4201-9438-e7fca9b8accb", - "metadata": {}, - "source": [ - "## Compute indicator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf8c11a9-e69c-47e2-87df-a693b9e68cd3", - "metadata": {}, - "outputs": [], - "source": [ - "region_dt_list <- split(dhs_br_dt, by = \"V024\")\n", - "\n", - "u5mort_table <- rbindlist(\n", - " lapply(region_dt_list, make_dhs_adm1_u5mort_dt) \n", - ")\n", - "\n", - "lower_bound_col <- 
glue(\"{toupper(indicator_u5mr)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- glue(\"{toupper(indicator_u5mr)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_u5mr)}_SAMPLE_AVERAGE\")\n", - "\n", - "# add necessary missing columns and remove non-necessary present columns\n", - "u5mort_table <- merge.data.table(dhs_admin_dt, u5mort_table, by = 'DHS_ADM1_CODE', all = TRUE)\n", - "setnames(u5mort_table,\n", - " old=c(\"R\", \"LCI\", \"UCI\"),\n", - " new=c(\n", - " sample_avg_col,\n", - " lower_bound_col,\n", - " upper_bound_col\n", - " ),\n", - " skip_absent=TRUE # not changing all names\n", - " )\n", - "u5mort_table <- merge.data.table(admin_data, u5mort_table, by = admin_name_col)\n", - "u5mort_table <- u5mort_table[\n", - " ,\n", - " .SD,\n", - " .SDcols = c(\n", - " admin_cols,\n", - " sample_avg_col,\n", - " lower_bound_col,\n", - " upper_bound_col\n", - " )\n", - " ]\n", - "\n", - "# Cap the CI's at 0 (in case of small numbers)\n", - "u5mort_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - "\n", - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}\")\n", - "write.csv(u5mort_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(u5mort_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25bf0a56-a8bf-45a9-b63f-d0bdbd606fde", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git 
a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb index ff19854..72f83b6 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb +++ b/pipelines/snt_dhs_indicators/code/snt_dhs_prevalence_computation.ipynb @@ -1,442 +1,458 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "1a337757-f2fa-467e-8241-ea174c7ea790", - "metadata": {}, - "source": [ - "# Under-5 Prevalence of Malaria (DHS data)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Under-5 Prevalence of Malaria (DHS data)" + ], + "id": "1a337757-f2fa-467e-8241-ea174c7ea790" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources" + ], + "id": "fc27d9c1-0c0c-46df-9508-57add133acaf" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Prevalence_of_Malaria_in_Children.htm%23Percentage_of_children22bc-1&rhtocid=_15_13_0\n", + "\n", + "Numerators:\n", + "1) Number of de facto children tested using RDT who are positive for malaria (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 = 1)\n", + "2) Number of de facto children tested using microscopy who are positive for malaria (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml32 = 1)\n", + " \n", + "Denominators:\n", + "a) Number of de facto children tested using RDT (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 in 0,1)\n", + "b) Number of de facto children tested using microscopy (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml32 in 0,1,6)" + ], + "id": "9cf1724b-b01c-4b2e-93f7-d0f54cfa4850" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Project uses RDT**" + ], + "id": "7c61058a-361f-4992-9f2b-99a82f798fd8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary steps" + ], + "id": "d0e715b0-9a8d-4d15-b7ef-486f16fec73b" + }, + { + "cell_type": "code", + 
"metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)\n", + "\n", + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "\n", + "# Load notebook-specific utils\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_prevalence_computation.r\"))\n", + "\n", + "setup_ctx <- bootstrap_dhs_indicators_context(root_path = ROOT_PATH)\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'prevalence')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "reticulate::py_config()$python" + ], + "execution_count": null, + "outputs": [], + "id": "a3cc2680-f9cc-46cd-9565-2b8af14fc29a" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get data" + ], + "id": "43135a7a-f1b8-4a89-9db1-9012c3369f3d" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "7e83a6c1-cd47-4eeb-b949-7bef7d1f36c6" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data 
<- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ], + "execution_count": null, + "outputs": [], + "id": "9a623578-170f-42e0-a012-78f0ddbcce87" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import " + ], + "id": "11aa09f3-ecc8-46c0-b9f3-95c0520202ef" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "indicator_u5prev <- 'PCT_U5_PREV_RDT' # to be computed\n", + "\n", + "data_source <- 'DHS'\n", + "household_recode <- 'HR'\n", + "person_recode <- 'PR'\n", + "target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", + "\n", + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_pr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, person_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_pr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)\n", + "\n", + "data_extension <- '.SAV'\n", + "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_pr_filename <- dir(path = 
DHS_DATA_PATH, pattern = paste0(\".*\", person_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_pr_filename)){\n", + " stop(\"The input DHS data do not have the same version/issue. Check available data before rerunning.\")\n", + "}\n", + "\n", + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename)) # person recode\n", + "dhs_pr_dt <- setDT(dhs_pr_dt)" + ], + "execution_count": null, + "outputs": [], + "id": "c015490b-a8fc-471d-a83f-0dce8e010cef" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Make admin dataframe (for future merging)" + ], + "id": "8b191b87-3694-4883-99c9-66ade3477f8e" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", + "\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", + "\n", + "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", + "}\n", + "\n", + "rm(dhs_hr_dt) # free up resources" + ], + "execution_count": null, + "outputs": [], + "id": "e0767159-1660-48c3-bdaf-f8be3643e039" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocess" + ], + "id": "8a47d5f6-f8d8-4373-9cc8-a776d85b8a75" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Relevant columns\n", + "household_id_cols <- c(\"HHID\", \"HV000\", \"HV001\", \"HV002\")\n", + "household_sampling_cols <- c(\"HV005\", \"HV021\", \"HV022\", \"HV023\", \"HV024\")\n", + "hemoglobin_selection_col = \"HV042\"\n", + "person_slept_col <- grep(\"^HV103\", names(dhs_pr_dt), value = TRUE)\n", + "kid_age_col <- \"HC1\"\n", + "smear_result_col <- \"HML32\" # smear test (GE)\n", + "rdt_result_col <- \"HML35\" # rapid diagnostic test (RDT / TDR)" + ], + "execution_count": null, + "outputs": [], + "id": "f49ecb55-da48-4cf5-aa2b-7691eed04b77" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# sapply(kid_age_cols, function(i) table(dhs_pr_dt[[i]], useNA = 'always'))\n", + "\n", + "# table(dhs_pr_dt$HC1, useNA = 'ifany')\n", + "# table(dhs_pr_dt$HV103, useNA = 'ifany')\n", + "# table(dhs_pr_dt$HV042, useNA = 'ifany')\n", + "\n", + "# filter rows and columns\n", + "pr_dt <- dhs_pr_dt[(\n", + " !(is.na(get(kid_age_col))) & # no missing age\n", + " get(kid_age_col) >= 6 & # 6 months or older\n", + " get(kid_age_col) <= 59 & # younger than 5\n", + " get(person_slept_col) == 1 & # slept last night in household\n", + " get(hemoglobin_selection_col) == 1 # household selected for hemoglobin test\n", + " ),\n", + " .SD, .SDcols = c(\n", + " household_id_cols,\n", + " household_sampling_cols,\n", + " hemoglobin_selection_col,\n", + " person_slept_col,\n", + " kid_age_col,\n", + " smear_result_col,\n", + " rdt_result_col)\n", + " ]\n", + "\n", + "pr_dt[, wt := 
HV005/1000000]\n", + "\n", + "pr_dt <- merge.data.table(dhs_admin_dt, pr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"HV024\", all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "a5fe638a-7418-4049-822e-6c3d715fdded" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rapid Diagnostic Test Indicator" + ], + "id": "7fa09c3c-d462-42f0-830a-444c79addf86" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "xtabs( ~ get(rdt_result_col), data = pr_dt, addNA = TRUE)\n", + "\n", + "# filter rows\n", + "rdt_dt <- pr_dt[\n", + " get(rdt_result_col) %in% c(0, 1), # tested and had either positive (1) or negative (0) result\n", + " ]\n", + "\n", + "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", + "rdt_design_sampling = svydesign(\n", + " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = rdt_dt, # dataset\n", + " strata = ~ HV023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " num_p=1, # ? 
`num_p` is not a documented svydesign() argument; it appears to be absorbed by `...` and ignored - confirm and remove\n", + " nest = T # the primary sampling units are nested within the strata\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "af0796b0-6f8e-40a0-8a41-e837ce0b02ee" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}\")\n", + "malaria_rdt_table <- compute_and_export_indicator_table(\n", + " design_obj = rdt_design_sampling,\n", + " indicator_name = rdt_result_col,\n", + " output_indicator_name = indicator_u5prev,\n", + " admin_name_col = admin_name_col,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " filename_without_extension = filename_without_extension\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "028130f5-8522-4a2f-81d9-25630affdab3" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# prevalence table computed and exported in previous cell."
+ ], + "execution_count": null, + "outputs": [], + "id": "bcef2f47-d0d8-4cea-8477-a192b92bb9a8" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "e975d45b-1e32-442c-a859-d65ad3db904c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# handled by compute_and_export_indicator_table()" + ], + "execution_count": null, + "outputs": [], + "id": "706e8de0-643f-4b68-a9e7-8c9dcd7bfba7" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# already merged and exported above" + ], + "execution_count": null, + "outputs": [], + "id": "be99a596-2647-4399-979f-4fd5855bd7cf" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "d8d7236b-bf86-4da4-bc15-ca97b57b5ba4" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "fc27d9c1-0c0c-46df-9508-57add133acaf", - "metadata": {}, - "source": [ - "## Resources" - ] - }, - { - "cell_type": "markdown", - "id": "9cf1724b-b01c-4b2e-93f7-d0f54cfa4850", - "metadata": {}, - "source": [ - "https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Prevalence_of_Malaria_in_Children.htm%23Percentage_of_children22bc-1&rhtocid=_15_13_0\n", - "\n", - "Numerators:\n", - "1) Number of de facto children tested using RDT who are positive for malaria (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 = 1)\n", - "2) Number of de facto children tested using microscopy who are positive for malaria (hv042 = 1 & hv103 = 1 
& hc1 in 6:59 & hml32 = 1)\n", - " \n", - "Denominators:\n", - "a) Number of de facto children tested using RDT (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 in 0,1)\n", - "b) Number of de facto children tested using microscopy (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml32 in 0,1,6)" - ] - }, - { - "cell_type": "markdown", - "id": "7c61058a-361f-4992-9f2b-99a82f798fd8", - "metadata": {}, - "source": [ - "**Project uses RDT**" - ] - }, - { - "cell_type": "markdown", - "id": "d0e715b0-9a8d-4d15-b7ef-486f16fec73b", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a3cc2680-f9cc-46cd-9565-2b8af14fc29a", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)\n", - "\n", - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'prevalence')\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"sf\", \"glue\", \"survey\", \"data.table\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- 
paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n" - ] - }, - { - "cell_type": "markdown", - "id": "43135a7a-f1b8-4a89-9db1-9012c3369f3d", - "metadata": {}, - "source": [ - "## Get data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e83a6c1-cd47-4eeb-b949-7bef7d1f36c6", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a623578-170f-42e0-a012-78f0ddbcce87", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- 
clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "11aa09f3-ecc8-46c0-b9f3-95c0520202ef", - "metadata": {}, - "source": [ - "### Import " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c015490b-a8fc-471d-a83f-0dce8e010cef", - "metadata": {}, - "outputs": [], - "source": [ - "indicator_u5prev <- 'PCT_U5_PREV_RDT' # to be computed\n", - "\n", - "data_source <- 'DHS'\n", - "household_recode <- 'HR'\n", - "person_recode <- 'PR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")\n", - "\n", - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_pr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, person_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_pr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)\n", - "\n", - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_pr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", person_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_pr_filename)){\n", - " stop(\"The input DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", - "}\n", - "\n", - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename)) # person recode\n", - "dhs_pr_dt <- setDT(dhs_pr_dt)" - ] - }, - { - "cell_type": "markdown", - "id": "8b191b87-3694-4883-99c9-66ade3477f8e", - "metadata": {}, - "source": [ - "### Make admin dataframe (for future merging)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0767159-1660-48c3-bdaf-f8be3643e039", - "metadata": {}, - "outputs": [], - "source": [ - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", - "\n", - "# Check that all regions can be matched with DHIS2 pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", - "}\n", - "\n", - "rm(dhs_hr_dt) # free up resources" - ] - }, - { - "cell_type": "markdown", - "id": "8a47d5f6-f8d8-4373-9cc8-a776d85b8a75", - "metadata": {}, - "source": [ - "### Preprocess" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f49ecb55-da48-4cf5-aa2b-7691eed04b77", - "metadata": {}, - "outputs": [], - "source": [ - "# Relevant columns\n", - "household_id_cols <- c(\"HHID\", \"HV000\", \"HV001\", \"HV002\")\n", - "household_sampling_cols <- c(\"HV005\", \"HV021\", \"HV022\", \"HV023\", \"HV024\")\n", - "hemoglobin_selection_col = \"HV042\"\n", - "person_slept_col <- grep(\"^HV103\", names(dhs_pr_dt), value = TRUE)\n", - "kid_age_col <- \"HC1\"\n", - "smear_result_col <- \"HML32\" # smear test (GE)\n", - "rdt_result_col <- \"HML35\" # rapid diagnostic test (RDT / TDR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5fe638a-7418-4049-822e-6c3d715fdded", - "metadata": {}, - "outputs": [], - "source": [ - "# sapply(kid_age_cols, function(i) table(dhs_pr_dt[[i]], useNA = 'always'))\n", - "\n", - "# table(dhs_pr_dt$HC1, useNA = 'ifany')\n", - "# table(dhs_pr_dt$HV103, useNA = 'ifany')\n", - "# table(dhs_pr_dt$HV042, useNA = 'ifany')\n", - "\n", - "# filter rows and columns\n", - "pr_dt <- dhs_pr_dt[(\n", - " !(is.na(get(kid_age_col))) & # no missing age\n", - " get(kid_age_col) >= 6 & # 6 months or older\n", - " get(kid_age_col) <= 59 & # younger than 5\n", - " get(person_slept_col) == 1 & # slept last night in household\n", - " get(hemoglobin_selection_col) == 1 # household selected for hemoglobin test\n", - " ),\n", - " .SD, .SDcols = c(\n", - " household_id_cols,\n", - " household_sampling_cols,\n", - " hemoglobin_selection_col,\n", - " person_slept_col,\n", - " kid_age_col,\n", - " smear_result_col,\n", - " rdt_result_col)\n", - " ]\n", - "\n", - "pr_dt[, wt := HV005/1000000]\n", - "\n", - "pr_dt <- merge.data.table(dhs_admin_dt, pr_dt, by.x = 
\"DHS_ADM1_CODE\", by.y = \"HV024\", all = TRUE)" - ] - }, - { - "cell_type": "markdown", - "id": "7fa09c3c-d462-42f0-830a-444c79addf86", - "metadata": {}, - "source": [ - "## Rapid Diagnostic Test Indicator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af0796b0-6f8e-40a0-8a41-e837ce0b02ee", - "metadata": {}, - "outputs": [], - "source": [ - "xtabs( ~ get(rdt_result_col), data = pr_dt, addNA = TRUE)\n", - "\n", - "# filter rows\n", - "rdt_dt <- pr_dt[\n", - " get(rdt_result_col) %in% c(0, 1), # tested and had either positive (1) or negative (0) result\n", - " ]\n", - "\n", - "# clustering, stratification, weights (for means, proportions, regression models, etc.)\n", - "rdt_design_sampling = svydesign(\n", - " ids = ~ HV021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = rdt_dt, # dataset\n", - " strata = ~ HV023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " num_p=1, # ? 
dunno what this is\n", - " nest = T # the primary sampling units are nested within the strata\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "028130f5-8522-4a2f-81d9-25630affdab3", - "metadata": {}, - "outputs": [], - "source": [ - "malaria_rdt_table <- svyby(formula = as.formula(paste(\"~\", rdt_result_col)), # to dynamically set the target colname\n", - " # by = ~ ADM1,\n", - " by = reformulate(admin_name_col), # to dynamically define the grouping colname\n", - " FUN = svymean, # compute survey mean\n", - " design = rdt_design_sampling, # the weights, strata, clusters\n", - " level = 0.95, # the level for CI's\n", - " vartype = \"ci\", # for variance, use the CI's\n", - " na.rm = TRUE, # remove the NA's in the calculation\n", - " influence = TRUE) # which observations have a substantial change in the results of the analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bcef2f47-d0d8-4cea-8477-a192b92bb9a8", - "metadata": {}, - "outputs": [], - "source": [ - "setDT(malaria_rdt_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e975d45b-1e32-442c-a859-d65ad3db904c", - "metadata": {}, - "outputs": [], - "source": [ - "lower_bound_col <- glue(\"{toupper(indicator_u5prev)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- glue(\"{toupper(indicator_u5prev)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_u5prev)}_SAMPLE_AVERAGE\")\n", - "\n", - "# names(malaria_rdt_table) <- toupper(names(malaria_rdt_table))\n", - "names(malaria_rdt_table)[names(malaria_rdt_table) == 'ci_l'] <- lower_bound_col\n", - "names(malaria_rdt_table)[names(malaria_rdt_table) == 'ci_u'] <- upper_bound_col\n", - "names(malaria_rdt_table)[names(malaria_rdt_table) == rdt_result_col] <- sample_avg_col\n", - "\n", - "# Cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - "malaria_rdt_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - 
"malaria_rdt_table[get(upper_bound_col) > 1, (upper_bound_col) := 1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "706e8de0-643f-4b68-a9e7-8c9dcd7bfba7", - "metadata": {}, - "outputs": [], - "source": [ - "# Convert to percentages\n", - "malaria_rdt_table[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - "malaria_rdt_table[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - "malaria_rdt_table[, (sample_avg_col) := get(sample_avg_col) * 100]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be99a596-2647-4399-979f-4fd5855bd7cf", - "metadata": {}, - "outputs": [], - "source": [ - "malaria_rdt_table <- merge.data.table(admin_data, malaria_rdt_table, by = admin_name_col, all = TRUE)\n", - "\n", - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}\")\n", - "write.csv(malaria_rdt_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)\n", - "write_parquet(malaria_rdt_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8d7236b-bf86-4da4-bc15-ca97b57b5ba4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb b/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb index 8813975..40bcc63 100644 --- a/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb +++ 
b/pipelines/snt_dhs_indicators/code/snt_dhs_vaccination_computation.ipynb @@ -1,604 +1,637 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "4a75d418-3144-427f-8fa3-2cb7f727e66d", - "metadata": {}, - "source": [ - "# DTP Vaccination rates and attrition using DHS data" - ] - }, - { - "cell_type": "markdown", - "id": "6b666c7a-f105-4fad-aea0-d23a38fa0153", - "metadata": {}, - "source": [ - "## Preliminaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dbcf1df6-c264-48b3-bf53-f13a4b036487", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "449ea786-ac70-4513-aaf5-24ffe69aec5c", - "metadata": {}, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9037624-3ac0-403c-9ea0-fc78891c2393", - "metadata": {}, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'vaccination')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f13522e-a0e5-44b7-b8f5-b66dc7ca37c0", - "metadata": {}, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"readr\", \"haven\", \"glue\", \"survey\", \"data.table\", \"sf\", \"ggplot2\", \"stringi\", \"reticulate\", \"jsonlite\", \"httr\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "e32d9589-1a34-4a62-ac93-ae04e5939eb1", - "metadata": {}, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d0221bb-9d59-42ff-a6c9-09c735083135", - "metadata": {}, - "outputs": [], - "source": [ - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d9196629-0d55-416c-9171-4ebc630cc93b", - "metadata": {}, - "outputs": [], - "source": [ - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "markdown", - "id": "ff080767-9fdb-4790-988c-e2b4c4f7226f", - "metadata": {}, - "source": [ - "## Geo data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5249483e-2b07-425d-b385-bc32ac601ced", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1308ee0c-4856-445f-b518-ee0f4497c9b5", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - 
"# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - " )\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# DRC provinces need to be cleaned\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}\n", - "\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)" - ] - }, - { - "cell_type": "markdown", - "id": "ba6c6b6b-e0fd-434a-8e44-6c45bea47d97", - "metadata": {}, - "source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29d4866a-5f98-404e-aaa8-6fe1e6dc9e2d", - "metadata": {}, - "outputs": [], - "source": [ - "vaccination_doses <- c(1, 2, 3)\n", - "indicator_access <- 'PCT_DTP'\n", - "indicator_attrition <- 'PCT_DROPOUT_DTP'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6bd9ec2-cdcb-41d9-8cd7-54096bbfa2a3", - "metadata": {}, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "household_recode <- 'HR'\n", - "kid_recode <- 'KR'\n", - "target_file_type <- 'SV'\n", - "\n", - "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5591f0f-fe5f-4c05-9368-2d1c7e388782", - "metadata": {}, - "outputs": [], - "source": [ - "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", - 
"unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)\n", - "unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir=DHS_DATA_PATH)\n", - "\n", - "# # Remove existing output files\n", - "# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "# files_to_delete <- files[grepl('DTP', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", - "# file.remove(files_to_delete)\n", - "\n", - "data_extension <- '.SAV'\n", - "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "dhs_kr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", kid_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", - "\n", - "if(!check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)){\n", - " stop(\"The necessary DHS data do not have the same version/issue. 
Check available data before rerunning.\")\n", - "}\n", - "\n", - "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", - "dhs_hr_dt <- setDT(dhs_hr_dt)\n", - "\n", - "dhs_kr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename)) # kid recode\n", - "dhs_kr_dt <- setDT(dhs_kr_dt)" - ] - }, - { - "cell_type": "markdown", - "id": "51ccd171-5056-42bc-a45f-ea70bf2c3bbe", - "metadata": {}, - "source": [ - "## Preprocess DHS data" - ] - }, - { - "cell_type": "markdown", - "id": "d6607265-c852-48b0-852b-e8fecaed804b", - "metadata": {}, - "source": [ - "### Extract DHS admin data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6fb209c6-8090-40f0-a101-fbfbeb17de41", - "metadata": {}, - "outputs": [], - "source": [ - "# Make admin codes and names dataframe (for future merging)\n", - "\n", - "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", - "\n", - "dhs_admin_dt <- make_dhs_admin_df(\n", - " input_dhs_df=dhs_hr_dt,\n", - " original_admin_column=\"HV024\",\n", - " new_admin_name_colname=admin_name_col,\n", - " new_admin_code_colname='DHS_ADM1_CODE'\n", - ")\n", - "\n", - "# format the names to be like DHIS2 names\n", - "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", - "\n", - "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", - "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", - "\n", - "# Check that all regions can be matched with DHIS2 pyramid\n", - "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", - " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. 
Please check input data before retrying.\")\n", - "}\n", - "\n", - "rm(dhs_hr_dt) # free up resources" - ] - }, - { - "cell_type": "markdown", - "id": "ae09cc05-a942-4e56-be52-7cc5876f62a9", - "metadata": {}, - "source": [ - "### Filter rows and columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b852bc5-fc91-45d6-a668-0efb25b00092", - "metadata": {}, - "outputs": [], - "source": [ - "# remove dead children from the dataset, keep only children aged 1 or more (avoid left censoring for vaccination) and respect the base for the 'h' variables\n", - "kr_dt <- dhs_kr_dt[B5 == 1 & B8 >= 1 & B19 < 36,]\n", - "\n", - "household_id_cols <- c('V000', 'V001', 'V002')\n", - "kid_id_cols <- c('CASEID', 'BIDX')\n", - "kid_dpt1_cols <- c('H3', 'H3D', 'H3M', 'H3Y')\n", - "kid_dpt2_cols <- c('H5', 'H5D', 'H5M', 'H5Y')\n", - "kid_dpt3_cols <- c('H7', 'H7D', 'H7M', 'H7Y')\n", - "kid_sampling_cols <- c('V005', 'V021', 'V023', 'V024')\n", - "\n", - "kr_dt <- kr_dt[, .SD, .SDcols = c(household_id_cols, kid_id_cols, kid_sampling_cols, kid_dpt1_cols, kid_dpt2_cols, kid_dpt3_cols)]\n", - "\n", - "# # check i didn't omit any crucial variable\n", - "# stopifnot(nrow(kr_dt[duplicated(kr_dt)]) == 0)" - ] - }, - { - "cell_type": "markdown", - "id": "a0194b86-ac85-44b5-bdc0-d7dd46b2170a", - "metadata": {}, - "source": [ - "### New features" - ] - }, - { - "cell_type": "markdown", - "id": "fb4ad225-93bc-44f7-bc87-d16f9eb2f065", - "metadata": {}, - "source": [ - "Add the region labels, to subsequently match DHIS2 data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e63a49a0-273a-4cef-ac91-2f95acf141dd", - "metadata": {}, - "outputs": [], - "source": [ - "kr_dt <- merge.data.table(dhs_admin_dt, kr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"V024\", all = TRUE)" - ] - }, - { - "cell_type": "markdown", - "id": "f578eda9-0ae6-4366-9aa3-a3a110e19d30", - "metadata": {}, - "source": [ - "Create the target features (whether or not the kid was vaccinated, 
for each dose)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e6d586f-636d-4b5c-9ccb-31520e4ca62e", - "metadata": {}, - "outputs": [], - "source": [ - "# Create dummy variables for the various DTP vaccine doses\n", - "kr_dt[, `:=`(\n", - " DTP1 = fcase(\n", - " H3 == 0L, 0L,\n", - " H3 %in% c(1L, 2L, 3L), 1L,\n", - " default = NA\n", - " ),\n", - " DTP2 = fcase(\n", - " H5 == 0L, 0L,\n", - " H5 %in% c(1L, 2L, 3L), 1L,\n", - " default = NA\n", - " ),\n", - " DTP3 = fcase(\n", - " H7 == 0L, 0L,\n", - " H7 %in% c(1L, 2L, 3L), 1L,\n", - " default = NA\n", - " )\n", - ")]\n", - "\n", - "# Correct external consistency issues: children who got the third dose also had the second, and so on:\n", - "kr_dt[DTP2 == 1, DTP1 := 1]\n", - "kr_dt[DTP3 == 1, DTP1 := 1]\n", - "kr_dt[DTP3 == 1, DTP2 := 1]" - ] - }, - { - "cell_type": "markdown", - "id": "f11d02d8-1c0f-4cc0-9bc6-736a84d73a23", - "metadata": {}, - "source": [ - "### Create the survey design" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dad05a65-9967-4c4f-9522-ba6cd4cfef7f", - "metadata": {}, - "outputs": [], - "source": [ - "# compute the household/kid weights\n", - "kr_dt[, wt := V005/1000000]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db28fe66-7e99-4d98-8ac1-0ea7ba30c677", - "metadata": {}, - "outputs": [], - "source": [ - "# account for the sampling strategy (clustering, stratification, weights) for means, proportions, regression models, etc.\n", - "dtp_design = svydesign(\n", - " ids = ~ V021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", - " data = kr_dt, # dataset\n", - " strata = ~ V023, # groupings of primary sampling units\n", - " weights = ~ wt, # the sampling weights variable\n", - " nest = T # the primary sampling units are nested within the strata\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "44ece144-6678-4de2-a1d4-f924b6549345", - "metadata": {}, - "source": [ - "## 
Vaccination proportion indicator" - ] - }, - { - "cell_type": "markdown", - "id": "a2034bea-e0fd-4268-b383-4d39d9cd7e75", - "metadata": {}, - "source": [ - "For each vaccine dose:\n", - "- compute the proportions of vaccinated per region\n", - "- compute the CIs\n", - "- add the admin units and save to .csv and parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1c19f83-a1e1-4b1d-bc26-605e4ed4fa06", - "metadata": {}, - "outputs": [], - "source": [ - "# create the empty dropout table to add each proportion and compute attrition rates\n", - "DTP_DROPOUT <- copy(admin_data)\n", - "\n", - "for (dose_number in vaccination_doses){\n", - " table_name <- glue(\"{toupper(indicator_access)}{dose_number}\")\n", - " vaccine_colname <- glue(\"DTP{dose_number}\")\n", - " \n", - " computed_proportions <- svyby(\n", - " # formula = ~ get(vaccine_colname),\n", - " as.formula(paste(\"~\", vaccine_colname)),\n", - " # by = ~ ADM1,\n", - " by = reformulate(admin_name_col),\n", - " FUN = svymean,\n", - " design = dtp_design,\n", - " level = 0.95,\n", - " vartype = \"ci\",\n", - " na.rm = TRUE,\n", - " influence = TRUE\n", - " )\n", - " \n", - " # change the name of the target column (to avoid the 'get' in the name)\n", - " names(computed_proportions)[2] <- vaccine_colname\n", - " \n", - " # assign the table value to the table name\n", - " assign(table_name, computed_proportions)\n", - " \n", - " # change the names of the columns\n", - " lower_bound_col <- glue(\"{toupper(indicator_access)}{dose_number}_CI_LOWER_BOUND\")\n", - " upper_bound_col <- glue(\"{toupper(indicator_access)}{dose_number}_CI_UPPER_BOUND\")\n", - " sample_avg_col <- glue(\"{toupper(indicator_access)}{dose_number}_SAMPLE_AVERAGE\") \n", - " \n", - " # retrieve data, modify colnames, and reassign\n", - " df <- get(table_name)\n", - " names(df)[names(df) == 'ci_l'] <- lower_bound_col\n", - " names(df)[names(df) == 'ci_u'] <- upper_bound_col\n", - " names(df)[names(df) == vaccine_colname] 
<- sample_avg_col\n", - " setDT(df)\n", - "\n", - " # Cap the CI's between 0 and 1 (in case of small sample => large CI's)\n", - " df[get(lower_bound_col) < 0, (lower_bound_col) := 0]\n", - " df[get(upper_bound_col) > 1, (upper_bound_col) := 1]\n", - "\n", - " # Convert to percentages\n", - " df[, (lower_bound_col) := get(lower_bound_col) * 100]\n", - " df[, (upper_bound_col) := get(upper_bound_col) * 100]\n", - " df[, (sample_avg_col) := get(sample_avg_col) * 100]\n", - " \n", - " # add the admin units\n", - " df <- merge.data.table(admin_data, df, by = admin_name_col, all.x = TRUE)\n", - " \n", - " # write to file\n", - " filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{table_name}\")\n", - " fwrite(df, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", - " write_parquet(df, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))\n", - " \n", - " # add current dose table to the summary table (for future computation of dropout rates)\n", - " DTP_DROPOUT <- merge.data.table(DTP_DROPOUT, df, by = admin_cols)\n", - "}\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "d4e56da8-3ed5-4791-913c-58b76fbba125", - "metadata": {}, - "source": [ - "## Dropout rate indicator" - ] - }, - { - "cell_type": "markdown", - "id": "375952b4-4a25-435c-aae1-c53c54e9382c", - "metadata": {}, - "source": [ - "Add dropout rates plots: for each vaccine dose:\n", - "- make the dropout rates\n", - "- add them to the summary file and save it as .csv and parquet\n", - "- make plots and save them" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d21baeab-f1b9-441d-b073-e7dc193a7535", - "metadata": {}, - "outputs": [], - "source": [ - "# remove the CI columns (as requested)\n", - "DTP_DROPOUT[, grep(\"BOUND\", names(DTP_DROPOUT), value = TRUE) := NULL]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed9b8475-bc2e-4170-b5b5-034909840269", - "metadata": {}, - 
"outputs": [], - "source": [ - "for(current_dose in vaccination_doses){\n", - " for (reference_dose in 1:(current_dose - 1)){\n", - " if((reference_dose >= 1) & (reference_dose < current_dose)){\n", - " attrition_col <- glue(\"{toupper(indicator_attrition)}_{reference_dose}_{current_dose}\")\n", - " print(glue('Computing attrition for {attrition_col}'))\n", - " numerator_colname <- glue(\"{toupper(indicator_access)}{current_dose}_SAMPLE_AVERAGE\")\n", - " denominator_colname <- glue(\"{toupper(indicator_access)}{reference_dose}_SAMPLE_AVERAGE\")\n", - " DTP_DROPOUT[, (attrition_col) := (1 - get(numerator_colname) / get(denominator_colname))*100] # percentages instead of rates, as requested\n", - " }\n", - " }\n", - "}\n", - "\n", - "# remove the unnecessary columns\n", - "DTP_DROPOUT[, grep(\"SAMPLE_AVERAGE\", names(DTP_DROPOUT), value = TRUE) := NULL]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2369423c-8752-4c92-ad84-44d2120be6a9", - "metadata": {}, - "outputs": [], - "source": [ - "dtp_dropout_filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_attrition}\")\n", - "fwrite(DTP_DROPOUT, file = file.path(OUTPUT_DATA_PATH, paste0(dtp_dropout_filename_without_extension, \".csv\")))\n", - "write_parquet(DTP_DROPOUT, file.path(OUTPUT_DATA_PATH, paste0(dtp_dropout_filename_without_extension, \".parquet\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ecc63839-393c-4487-bdfb-e9e96626cc71", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DTP Vaccination rates and attrition using 
DHS data" + ], + "id": "4a75d418-3144-427f-8fa3-2cb7f727e66d" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminaries" + ], + "id": "6b666c7a-f105-4fad-aea0-d23a38fa0153" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ], + "execution_count": null, + "outputs": [], + "id": "dbcf1df6-c264-48b3-bf53-f13a4b036487" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ], + "execution_count": null, + "outputs": [], + "id": "449ea786-ac70-4513-aaf5-24ffe69aec5c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')" + ], + "execution_count": null, + "outputs": [], + "id": "c9037624-3ac0-403c-9ea0-fc78891c2393" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load notebook-specific utils\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_vaccination_computation.r\"))\n", + "\n", + "setup_ctx <- bootstrap_dhs_indicators_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\"readr\", \"haven\", \"glue\", \"survey\", \"data.table\", \"sf\", \"ggplot2\", \"stringi\", \"reticulate\", \"jsonlite\", \"httr\", \"arrow\")\n", + ")\n", + "\n", + "DATA_PATH <- setup_ctx$DATA_PATH\n", + "DHS_DATA_PATH <- setup_ctx$DHS_DATA_PATH\n", + "config_json <- setup_ctx$config_json\n", + "COUNTRY_CODE <- setup_ctx$COUNTRY_CODE\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'vaccination')\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)" + ], + "execution_count": 
null, + "outputs": [], + "id": "1f13522e-a0e5-44b7-b8f5-b66dc7ca37c0" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "reticulate::py_config()$python" + ], + "execution_count": null, + "outputs": [], + "id": "e32d9589-1a34-4a62-ac93-ae04e5939eb1" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Configuration already loaded by bootstrap_dhs_indicators_context()." + ], + "execution_count": null, + "outputs": [], + "id": "9d0221bb-9d59-42ff-a6c9-09c735083135" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + ], + "execution_count": null, + "outputs": [], + "id": "d9196629-0d55-416c-9171-4ebc630cc93b" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Geo data" + ], + "id": "ff080767-9fdb-4790-988c-e2b4c4f7226f" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "5249483e-2b07-425d-b385-bc32ac601ced" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data <- load_dhs_spatial_data(\n", + " dhis2_dataset = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " 
admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + " )\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# DRC provinces need to be cleaned\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}\n", + "\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)" + ], + "execution_count": null, + "outputs": [], + "id": "1308ee0c-4856-445f-b518-ee0f4497c9b5" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import DHS data" + ], + "id": "ba6c6b6b-e0fd-434a-8e44-6c45bea47d97" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "vaccination_doses <- c(1, 2, 3)\n", + "indicator_access <- 'PCT_DTP'\n", + "indicator_attrition <- 'PCT_DROPOUT_DTP'" + ], + "execution_count": null, + "outputs": [], + "id": "29d4866a-5f98-404e-aaa8-6fe1e6dc9e2d" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "data_source <- 'DHS'\n", + "household_recode <- 'HR'\n", + "kid_recode <- 'KR'\n", + "target_file_type <- 'SV'\n", + "\n", + "delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=\".zip\")" + ], + "execution_count": null, + "outputs": [], + "id": "b6bd9ec2-cdcb-41d9-8cd7-54096bbfa2a3" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)\n", + "unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir=DHS_DATA_PATH)\n", + "\n", + "# # Remove existing output files\n", + "# files <- list.files(OUTPUT_DATA_PATH, full.names = 
TRUE)\n", + "# files_to_delete <- files[grepl('DTP', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]\n", + "# file.remove(files_to_delete)\n", + "\n", + "data_extension <- '.SAV'\n", + "dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(\".*\", household_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "dhs_kr_filename <- dir(path = DHS_DATA_PATH, pattern = paste0(\".*\", kid_recode, \".*\\\\\", data_extension, \"$\"), ignore.case=TRUE)\n", + "\n", + "if(!check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)){\n", + " stop(\"The necessary DHS data do not have the same version/issue. Check available data before rerunning.\")\n", + "}\n", + "\n", + "dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode\n", + "dhs_hr_dt <- setDT(dhs_hr_dt)\n", + "\n", + "dhs_kr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename)) # kid recode\n", + "dhs_kr_dt <- setDT(dhs_kr_dt)" + ], + "execution_count": null, + "outputs": [], + "id": "e5591f0f-fe5f-4c05-9368-2d1c7e388782" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocess DHS data" + ], + "id": "51ccd171-5056-42bc-a45f-ea70bf2c3bbe" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extract DHS admin data" + ], + "id": "d6607265-c852-48b0-852b-e8fecaed804b" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Make admin codes and names dataframe (for future merging)\n", + "\n", + "dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])\n", + "\n", + "dhs_admin_dt <- make_dhs_admin_df(\n", + " input_dhs_df=dhs_hr_dt,\n", + " original_admin_column=\"HV024\",\n", + " new_admin_name_colname=admin_name_col,\n", + " new_admin_code_colname='DHS_ADM1_CODE'\n", + ")\n", + "\n", + "# format the names to be like DHIS2 names\n", + "dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]\n", + 
"\n", + "# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space\n", + "dhs_admin_dt[get(admin_name_col) == \"MAI NDOMBE\", (admin_name_col) := \"MAINDOMBE\"]\n", + "\n", + "# Check that all regions can be matched with DHIS2 pyramid\n", + "if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){\n", + " stop(\"The DHS data provided does not fully match DHIS2 pyramid data. Please check input data before retrying.\")\n", + "}\n", + "\n", + "rm(dhs_hr_dt) # free up resources" + ], + "execution_count": null, + "outputs": [], + "id": "6fb209c6-8090-40f0-a101-fbfbeb17de41" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filter rows and columns" + ], + "id": "ae09cc05-a942-4e56-be52-7cc5876f62a9" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# remove dead children from the dataset, keep only children aged 1 or more (avoid left censoring for vaccination) and respect the base for the 'h' variables\n", + "kr_dt <- dhs_kr_dt[B5 == 1 & B8 >= 1 & B19 < 36,]\n", + "\n", + "household_id_cols <- c('V000', 'V001', 'V002')\n", + "kid_id_cols <- c('CASEID', 'BIDX')\n", + "kid_dpt1_cols <- c('H3', 'H3D', 'H3M', 'H3Y')\n", + "kid_dpt2_cols <- c('H5', 'H5D', 'H5M', 'H5Y')\n", + "kid_dpt3_cols <- c('H7', 'H7D', 'H7M', 'H7Y')\n", + "kid_sampling_cols <- c('V005', 'V021', 'V023', 'V024')\n", + "\n", + "kr_dt <- kr_dt[, .SD, .SDcols = c(household_id_cols, kid_id_cols, kid_sampling_cols, kid_dpt1_cols, kid_dpt2_cols, kid_dpt3_cols)]\n", + "\n", + "# # check i didn't omit any crucial variable\n", + "# stopifnot(nrow(kr_dt[duplicated(kr_dt)]) == 0)" + ], + "execution_count": null, + "outputs": [], + "id": "5b852bc5-fc91-45d6-a668-0efb25b00092" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### New features" + ], + "id": "a0194b86-ac85-44b5-bdc0-d7dd46b2170a" + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "Add the region labels, to subsequently match DHIS2 data" + ], + "id": "fb4ad225-93bc-44f7-bc87-d16f9eb2f065" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "kr_dt <- merge.data.table(dhs_admin_dt, kr_dt, by.x = \"DHS_ADM1_CODE\", by.y = \"V024\", all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "e63a49a0-273a-4cef-ac91-2f95acf141dd" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the target features (whether or not the kid was vaccinated, for each dose)" + ], + "id": "f578eda9-0ae6-4366-9aa3-a3a110e19d30" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Create dummy variables for the various DTP vaccine doses\n", + "kr_dt[, `:=`(\n", + " DTP1 = fcase(\n", + " H3 == 0L, 0L,\n", + " H3 %in% c(1L, 2L, 3L), 1L,\n", + " default = NA\n", + " ),\n", + " DTP2 = fcase(\n", + " H5 == 0L, 0L,\n", + " H5 %in% c(1L, 2L, 3L), 1L,\n", + " default = NA\n", + " ),\n", + " DTP3 = fcase(\n", + " H7 == 0L, 0L,\n", + " H7 %in% c(1L, 2L, 3L), 1L,\n", + " default = NA\n", + " )\n", + ")]\n", + "\n", + "# Correct external consistency issues: children who got the third dose also had the second, and so on:\n", + "kr_dt[DTP2 == 1, DTP1 := 1]\n", + "kr_dt[DTP3 == 1, DTP1 := 1]\n", + "kr_dt[DTP3 == 1, DTP2 := 1]" + ], + "execution_count": null, + "outputs": [], + "id": "3e6d586f-636d-4b5c-9ccb-31520e4ca62e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the survey design" + ], + "id": "f11d02d8-1c0f-4cc0-9bc6-736a84d73a23" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# compute the household/kid weights\n", + "kr_dt[, wt := V005/1000000]" + ], + "execution_count": null, + "outputs": [], + "id": "dad05a65-9967-4c4f-9522-ba6cd4cfef7f" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": 
"r" + } + }, + "source": [ + "# account for the sampling strategy (clustering, stratification, weights) for means, proportions, regression models, etc.\n", + "dtp_design = svydesign(\n", + " ids = ~ V021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)\n", + " data = kr_dt, # dataset\n", + " strata = ~ V023, # groupings of primary sampling units\n", + " weights = ~ wt, # the sampling weights variable\n", + " nest = T # the primary sampling units are nested within the strata\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "db28fe66-7e99-4d98-8ac1-0ea7ba30c677" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Vaccination proportion indicator" + ], + "id": "44ece144-6678-4de2-a1d4-f924b6549345" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For each vaccine dose:\n", + "- compute the proportions of vaccinated per region\n", + "- compute the CIs\n", + "- add the admin units and save to .csv and parquet" + ], + "id": "a2034bea-e0fd-4268-b383-4d39d9cd7e75" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "vaccination_results <- compute_dtp_indicator_tables(\n", + " dtp_design = dtp_design,\n", + " vaccination_doses = vaccination_doses,\n", + " indicator_access = indicator_access,\n", + " admin_name_col = admin_name_col,\n", + " admin_cols = admin_cols,\n", + " admin_data = admin_data,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level\n", + ")\n", + "\n", + "DTP_DROPOUT <- vaccination_results$dtp_dropout\n", + "PCT_DTP1 <- vaccination_results$dose_tables[[\"PCT_DTP1\"]]\n", + "PCT_DTP2 <- vaccination_results$dose_tables[[\"PCT_DTP2\"]]\n", + "PCT_DTP3 <- vaccination_results$dose_tables[[\"PCT_DTP3\"]]" + ], + "execution_count": null, + "outputs": [], + "id": "a1c19f83-a1e1-4b1d-bc26-605e4ed4fa06" + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dropout rate indicator" + ], + "id": "d4e56da8-3ed5-4791-913c-58b76fbba125" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add dropout rates plots: for each vaccine dose:\n", + "- make the dropout rates\n", + "- add them to the summary file and save it as .csv and parquet\n", + "- make plots and save them" + ], + "id": "375952b4-4a25-435c-aae1-c53c54e9382c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# dropout computed and exported in next cell using helper" + ], + "execution_count": null, + "outputs": [], + "id": "d21baeab-f1b9-441d-b073-e7dc193a7535" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "DTP_DROPOUT <- compute_and_export_dtp_dropout(\n", + " dtp_dropout = DTP_DROPOUT,\n", + " vaccination_doses = vaccination_doses,\n", + " indicator_access = indicator_access,\n", + " indicator_attrition = indicator_attrition,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "ed9b8475-bc2e-4170-b5b5-034909840269" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# already exported by compute_and_export_dtp_dropout()" + ], + "execution_count": null, + "outputs": [], + "id": "2369423c-8752-4c92-ad84-44d2120be6a9" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "ecc63839-393c-4487-bdfb-e9e96626cc71" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + 
"pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhs_indicators/reporting/snt_dhs_bednets_report.ipynb b/pipelines/snt_dhs_indicators/reporting/snt_dhs_bednets_report.ipynb index c8a62f2..a1c8be6 100644 --- a/pipelines/snt_dhs_indicators/reporting/snt_dhs_bednets_report.ipynb +++ b/pipelines/snt_dhs_indicators/reporting/snt_dhs_bednets_report.ipynb @@ -1,480 +1,481 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "8ba79c20-9f47-4d61-93ab-19d3802125ec", - "metadata": {}, - "source": [ - "# ITN Plots, DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41cd8bf1-fd71-49da-b4a5-1254421ad8ec", - "metadata": {}, - "outputs": [], - "source": [ - "# TODO These could be one large for-loop, through 'access' and 'use'" - ] - }, - { - "cell_type": "markdown", - "id": "5bd2650b-e952-45d1-b46a-b08b777a5961", - "metadata": {}, - "source": [ - "### Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3632c310-6a58-4825-8b80-ce3612b6caca", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb98532b-56c9-42c7-9bbd-a6f7869bfc76", - "metadata": {}, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebf120a6-1559-4295-93c4-cfbfd141a67b", - "metadata": {}, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 
'indicators', 'bednets')\n", - "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc24cdd4-2ccb-4511-8a63-8ee4b0c29bde", - "metadata": {}, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3cff50d0-b6cd-42ca-86fb-eccc82eb2236", - "metadata": {}, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cbd66ad-1659-4db9-a1ad-4b329080782d", - "metadata": {}, - "outputs": [], - "source": [ - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d7bc112-ab91-421a-b03b-510be814361f", - "metadata": {}, - "outputs": [], - "source": [ - "# COUNTRY_CODE <- \"BFA\"\n", - "print(paste(\"Country code: \", COUNTRY_CODE))" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "145fe721-f42a-45ff-a3cb-060886fe7a9e", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "markdown", - "id": "93898419-b98d-4a53-8fc9-a1bb9bff01a4", - "metadata": {}, - "source": [ - "## Geo data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02d322a1-11c9-4899-ba0c-07a91045f10c", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "# log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4075e2b4-e665-4442-b28e-ec75a7ba6ff1", - "metadata": {}, - "outputs": [], - "source": [ - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "6399c2eb-9509-4b4f-839a-6c8c83004510", - "metadata": {}, - "source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0fb15b9d-3cc5-4c6f-be25-124169388c25", - "metadata": {}, - "outputs": [], - "source": [ - 
"data_source <- 'DHS'\n", - "indicator_access <- 'PCT_ITN_ACCESS'\n", - "indicator_use <- 'PCT_ITN_USE'" - ] - }, - { - "cell_type": "markdown", - "id": "2573c887-0fa9-4004-9a30-0d6ffb90de07", - "metadata": {}, - "source": [ - "## ITN ACCCESS PLOTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d813a167-92a0-47ab-a796-709fc6fa55ab", - "metadata": {}, - "outputs": [], - "source": [ - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}\")\n", - "bednet_access_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b630f927-d4a4-4c5b-a5a6-f97c3de8ecf7", - "metadata": {}, - "outputs": [], - "source": [ - "access_plot_data = merge(spatial_data, bednet_access_table, by = admin_cols, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2db72c84-f3ca-4962-bc63-9fddc97a113a", - "metadata": {}, - "outputs": [], - "source": [ - "access_lower_bound_col <- glue(\"{toupper(indicator_access)}_CI_LOWER_BOUND\")\n", - "access_upper_bound_col <- glue(\"{toupper(indicator_access)}_CI_UPPER_BOUND\")\n", - "access_sample_avg_col <- glue(\"{toupper(indicator_access)}_SAMPLE_AVERAGE\")" - ] - }, - { - "cell_type": "markdown", - "id": "88a625c0-2222-4dfd-9b28-b40b5749ddc7", - "metadata": {}, - "source": [ - "### ITN Access Map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b76d619f-004c-4ea3-910c-a62011abe823", - "metadata": {}, - "outputs": [], - "source": [ - "ITN_access_plot <- make_dhs_map(\n", - " plot_dt = access_plot_data,\n", - " plot_colname = access_sample_avg_col,\n", - " title_name = \"ITN Access (%)\",\n", - " legend_title = \"%\",\n", - " scale_limits = c(0, 100)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "373b6b2b-f667-4619-b226-2c21ddfa5b36", - "metadata": {}, - "outputs": [], - "source": [ - 
"ITN_access_plot_filename <- glue('{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}_plot.png')\n", - "ggsave(ITN_access_plot, file = file.path(OUTPUT_PLOTS_PATH, ITN_access_plot_filename), dpi = 500)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3d272c9-0456-43a2-bbc8-84aaa3f79dd3", - "metadata": {}, - "outputs": [], - "source": [ - "# confidence interval plot\n", - "access_ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} {toupper(indicator_access)} CI\")\n", - "access_ci_plot_xlab <- admin_name_col\n", - "access_ci_plot_ylab <- glue(\"ITN access (%)\")\n", - "access_ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}_CI_plot.png\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "73afeb71-c365-494e-b60a-4dbeb2c0e853", - "metadata": {}, - "outputs": [], - "source": [ - "access_ci_plot <- make_ci_plot(\n", - " df_to_plot=access_plot_data,\n", - " admin_colname=admin_name_col,\n", - " point_estimation_colname=access_sample_avg_col,\n", - " ci_lower_colname=access_lower_bound_col,\n", - " ci_upper_colname=access_upper_bound_col,\n", - " title_name=access_ci_plot_title,\n", - " x_title=access_ci_plot_xlab,\n", - " y_title=access_ci_plot_ylab\n", - ")\n", - "\n", - "# ci_access_plot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb99fb66-183f-48f8-9df1-ea9475d80123", - "metadata": {}, - "outputs": [], - "source": [ - "ggsave(filename=file.path(OUTPUT_PLOTS_PATH, access_ci_plot_filename), plot=access_ci_plot, width = 8, height = 6, dpi = 300)" - ] - }, - { - "cell_type": "markdown", - "id": "c2e1283b-e287-4858-b70d-c1764ad89731", - "metadata": {}, - "source": [ - "## ITN USE PLOTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6dcc961e-712a-468c-8b66-3e8a936b695a", - "metadata": {}, - "outputs": [], - "source": [ - "filename_without_extension <- 
glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_use)}\")\n", - "bednet_use_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6ad59821-7504-45e0-a6d7-b0a0bcda5f97", - "metadata": {}, - "outputs": [], - "source": [ - "use_plot_data = merge(spatial_data, bednet_use_table, by = admin_cols, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b34245bf-9ae0-467c-a07b-ff7c50a85282", - "metadata": {}, - "outputs": [], - "source": [ - "use_lower_bound_col <- glue(\"{toupper(indicator_use)}_CI_LOWER_BOUND\")\n", - "use_upper_bound_col <- glue(\"{toupper(indicator_use)}_CI_UPPER_BOUND\")\n", - "use_sample_avg_col <- glue(\"{toupper(indicator_use)}_SAMPLE_AVERAGE\")" - ] - }, - { - "cell_type": "markdown", - "id": "6f260543-35b6-4d8b-a970-f4f9e2065ee7", - "metadata": {}, - "source": [ - "### ITN use plot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f718b10e-ca51-41bc-9989-17d2d0374458", - "metadata": {}, - "outputs": [], - "source": [ - "ITN_use_plot <- make_dhs_map(\n", - " plot_dt = use_plot_data,\n", - " plot_colname = use_sample_avg_col,\n", - " title_name = \"ITN Use (%)\",\n", - " legend_title = \"%\",\n", - " scale_limits = c(0, 100)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d751ec3-22e4-40a5-8378-037de76236df", - "metadata": {}, - "outputs": [], - "source": [ - "ITN_use_plot_filename <- glue('{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_use)}_plot.png')\n", - "ggsave(ITN_use_plot, file = file.path(OUTPUT_PLOTS_PATH, ITN_use_plot_filename), dpi = 500)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a8ba05c-4e90-4acb-8bf9-88631ae43cf2", - "metadata": {}, - "outputs": [], - "source": [ - "# confidence interval plot\n", - "use_ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} 
{toupper(indicator_use)} CI\")\n", - "use_ci_plot_xlab <- admin_name_col\n", - "use_ci_plot_ylab <- glue(\"ITN Use (%)\")\n", - "use_ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_use)}_CI_plot.png\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13b3fe6e-7aeb-43ca-ad31-add612f722ee", - "metadata": {}, - "outputs": [], - "source": [ - "use_ci_plot <- make_ci_plot(\n", - " df_to_plot=use_plot_data,\n", - " admin_colname=admin_name_col,\n", - " point_estimation_colname=use_sample_avg_col,\n", - " ci_lower_colname=use_lower_bound_col,\n", - " ci_upper_colname=use_upper_bound_col,\n", - " title_name=use_ci_plot_title,\n", - " x_title=use_ci_plot_xlab,\n", - " y_title=use_ci_plot_ylab\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "06e60a12-e8b1-435f-adb4-b676aedf67fe", - "metadata": {}, - "outputs": [], - "source": [ - "ggsave(filename=file.path(OUTPUT_PLOTS_PATH, use_ci_plot_filename), plot=use_ci_plot, width = 8, height = 6, dpi = 300)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7427ae0-be6b-4dfa-b0e2-e9d9847dded3", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ITN Plots, DHS data" + ], + "id": "8ba79c20-9f47-4d61-93ab-19d3802125ec" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# TODO These could be one large for-loop, through 'access' and 'use'" + ], + "execution_count": null, + "outputs": [], + "id": "41cd8bf1-fd71-49da-b4a5-1254421ad8ec" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
Preliminary steps" + ], + "id": "5bd2650b-e952-45d1-b46a-b08b777a5961" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ], + "execution_count": null, + "outputs": [], + "id": "3632c310-6a58-4825-8b80-ce3612b6caca" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ], + "execution_count": null, + "outputs": [], + "id": "cb98532b-56c9-42c7-9bbd-a6f7869bfc76" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'bednets')\n", + "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" + ], + "execution_count": null, + "outputs": [], + "id": "ebf120a6-1559-4295-93c4-cfbfd141a67b" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Load notebook-specific utilities\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_bednets_report.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ], + "execution_count": null, + "outputs": [], + "id": "cc24cdd4-2ccb-4511-8a63-8ee4b0c29bde" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- 
import(\"openhexa.sdk\")" + ], + "execution_count": null, + "outputs": [], + "id": "3cff50d0-b6cd-42ca-86fb-eccc82eb2236" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED" + ], + "execution_count": null, + "outputs": [], + "id": "8cbd66ad-1659-4db9-a1ad-4b329080782d" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# COUNTRY_CODE <- \"BFA\"\n", + "print(paste(\"Country code: \", COUNTRY_CODE))" + ], + "execution_count": null, + "outputs": [], + "id": "7d7bc112-ab91-421a-b03b-510be814361f" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "145fe721-f42a-45ff-a3cb-060886fe7a9e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Geo data" + ], + "id": "93898419-b98d-4a53-8fc9-a1bb9bff01a4" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Load spatial file from dataset\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- 
get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "# log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" + ], + "execution_count": null, + "outputs": [], + "id": "02d322a1-11c9-4899-ba0c-07a91045f10c" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "4075e2b4-e665-4442-b28e-ec75a7ba6ff1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import DHS data" + ], + "id": "6399c2eb-9509-4b4f-839a-6c8c83004510" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "data_source <- 'DHS'\n", + "indicator_access <- 'PCT_ITN_ACCESS'\n", + "indicator_use <- 'PCT_ITN_USE'" + ], + "execution_count": null, + "outputs": [], + "id": "0fb15b9d-3cc5-4c6f-be25-124169388c25" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ITN ACCESS PLOTS" + ], + "id": "2573c887-0fa9-4004-9a30-0d6ffb90de07" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}\")\n", + "bednet_access_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))" + ], + "execution_count": null, + "outputs": [], + "id": "d813a167-92a0-47ab-a796-709fc6fa55ab" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "access_plot_data = merge(spatial_data, bednet_access_table, by = 
admin_cols, all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "b630f927-d4a4-4c5b-a5a6-f97c3de8ecf7" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "access_lower_bound_col <- glue(\"{toupper(indicator_access)}_CI_LOWER_BOUND\")\n", + "access_upper_bound_col <- glue(\"{toupper(indicator_access)}_CI_UPPER_BOUND\")\n", + "access_sample_avg_col <- glue(\"{toupper(indicator_access)}_SAMPLE_AVERAGE\")" + ], + "execution_count": null, + "outputs": [], + "id": "2db72c84-f3ca-4962-bc63-9fddc97a113a" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ITN Access Map" + ], + "id": "88a625c0-2222-4dfd-9b28-b40b5749ddc7" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "ITN_access_plot <- make_dhs_map(\n", + " plot_dt = access_plot_data,\n", + " plot_colname = access_sample_avg_col,\n", + " title_name = \"ITN Access (%)\",\n", + " legend_title = \"%\",\n", + " scale_limits = c(0, 100)\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "b76d619f-004c-4ea3-910c-a62011abe823" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "ITN_access_plot_filename <- glue('{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}_plot.png')\n", + "ggsave(ITN_access_plot, file = file.path(OUTPUT_PLOTS_PATH, ITN_access_plot_filename), dpi = 500)" + ], + "execution_count": null, + "outputs": [], + "id": "373b6b2b-f667-4619-b226-2c21ddfa5b36" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# confidence interval plot\n", + "access_ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} {toupper(indicator_access)} CI\")\n", + "access_ci_plot_xlab <- admin_name_col\n", + "access_ci_plot_ylab <- glue(\"ITN access (%)\")\n", + "access_ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}_CI_plot.png\")" + ], + "execution_count": null, + "outputs": [], + "id": "f3d272c9-0456-43a2-bbc8-84aaa3f79dd3" + }, + { + "cell_type": 
"code", + "metadata": {}, + "source": [ + "access_ci_plot <- make_ci_plot(\n", + " df_to_plot=access_plot_data,\n", + " admin_colname=admin_name_col,\n", + " point_estimation_colname=access_sample_avg_col,\n", + " ci_lower_colname=access_lower_bound_col,\n", + " ci_upper_colname=access_upper_bound_col,\n", + " title_name=access_ci_plot_title,\n", + " x_title=access_ci_plot_xlab,\n", + " y_title=access_ci_plot_ylab\n", + ")\n", + "\n", + "# ci_access_plot" + ], + "execution_count": null, + "outputs": [], + "id": "73afeb71-c365-494e-b60a-4dbeb2c0e853" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "ggsave(filename=file.path(OUTPUT_PLOTS_PATH, access_ci_plot_filename), plot=access_ci_plot, width = 8, height = 6, dpi = 300)" + ], + "execution_count": null, + "outputs": [], + "id": "cb99fb66-183f-48f8-9df1-ea9475d80123" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ITN USE PLOTS" + ], + "id": "c2e1283b-e287-4858-b70d-c1764ad89731" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_use)}\")\n", + "bednet_use_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))" + ], + "execution_count": null, + "outputs": [], + "id": "6dcc961e-712a-468c-8b66-3e8a936b695a" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "use_plot_data = merge(spatial_data, bednet_use_table, by = admin_cols, all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "6ad59821-7504-45e0-a6d7-b0a0bcda5f97" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "use_lower_bound_col <- glue(\"{toupper(indicator_use)}_CI_LOWER_BOUND\")\n", + "use_upper_bound_col <- glue(\"{toupper(indicator_use)}_CI_UPPER_BOUND\")\n", + "use_sample_avg_col <- glue(\"{toupper(indicator_use)}_SAMPLE_AVERAGE\")" + ], + "execution_count": null, + "outputs": [], + "id": 
"b34245bf-9ae0-467c-a07b-ff7c50a85282" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ITN use plot" + ], + "id": "6f260543-35b6-4d8b-a970-f4f9e2065ee7" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "ITN_use_plot <- make_dhs_map(\n", + " plot_dt = use_plot_data,\n", + " plot_colname = use_sample_avg_col,\n", + " title_name = \"ITN Use (%)\",\n", + " legend_title = \"%\",\n", + " scale_limits = c(0, 100)\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "f718b10e-ca51-41bc-9989-17d2d0374458" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "ITN_use_plot_filename <- glue('{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_use)}_plot.png')\n", + "ggsave(ITN_use_plot, file = file.path(OUTPUT_PLOTS_PATH, ITN_use_plot_filename), dpi = 500)" + ], + "execution_count": null, + "outputs": [], + "id": "5d751ec3-22e4-40a5-8378-037de76236df" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# confidence interval plot\n", + "use_ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} {toupper(indicator_use)} CI\")\n", + "use_ci_plot_xlab <- admin_name_col\n", + "use_ci_plot_ylab <- glue(\"ITN Use (%)\")\n", + "use_ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_use)}_CI_plot.png\")" + ], + "execution_count": null, + "outputs": [], + "id": "7a8ba05c-4e90-4acb-8bf9-88631ae43cf2" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "use_ci_plot <- make_ci_plot(\n", + " df_to_plot=use_plot_data,\n", + " admin_colname=admin_name_col,\n", + " point_estimation_colname=use_sample_avg_col,\n", + " ci_lower_colname=use_lower_bound_col,\n", + " ci_upper_colname=use_upper_bound_col,\n", + " title_name=use_ci_plot_title,\n", + " x_title=use_ci_plot_xlab,\n", + " y_title=use_ci_plot_ylab\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "13b3fe6e-7aeb-43ca-ad31-add612f722ee" + }, + { + "cell_type": "code", + 
"metadata": {}, + "source": [ + "ggsave(filename=file.path(OUTPUT_PLOTS_PATH, use_ci_plot_filename), plot=use_ci_plot, width = 8, height = 6, dpi = 300)" + ], + "execution_count": null, + "outputs": [], + "id": "06e60a12-e8b1-435f-adb4-b676aedf67fe" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [], + "execution_count": null, + "outputs": [], + "id": "e7427ae0-be6b-4dfa-b0e2-e9d9847dded3" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhs_indicators/reporting/snt_dhs_careseeking_report.ipynb b/pipelines/snt_dhs_indicators/reporting/snt_dhs_careseeking_report.ipynb index 55f03b1..15753ce 100644 --- a/pipelines/snt_dhs_indicators/reporting/snt_dhs_careseeking_report.ipynb +++ b/pipelines/snt_dhs_indicators/reporting/snt_dhs_careseeking_report.ipynb @@ -1,315 +1,322 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "33d7a9b4-8e3f-4ff5-8369-88dccd3f6d8c", - "metadata": {}, - "source": [ - "# Plots for careseeking behavior upon child fever (DHS data)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plots for careseeking behavior upon child fever (DHS data)" + ], + "id": "33d7a9b4-8e3f-4ff5-8369-88dccd3f6d8c" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary steps" + ], + "id": "0ef48ace-d77e-49bf-9b21-1cece3d48161" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ], + "execution_count": null, + "outputs": [], + "id": "f4156fe8-631a-4012-8c66-08dc8a721851" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + 
}, + "source": [ + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "\n", + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", + "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" + ], + "execution_count": null, + "outputs": [], + "id": "20a367b8-844b-41a7-8725-4bd37bda0352" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load notebook-specific utilities\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_careseeking_report.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ], + "execution_count": null, + "outputs": [], + "id": "d81fb691-698d-4832-9137-f8c6f0c5938c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration 
loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "print(paste(\"Country code: \", COUNTRY_CODE))" + ], + "execution_count": null, + "outputs": [], + "id": "d2ab76ca-5e05-4867-af1f-ade9d540a1b9" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Geo data" + ], + "id": "484aed3b-0c40-40f4-8f81-0c4ed16a5d49" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "7d326034-33ff-40d6-a860-81d0bd5a1c34" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}" + ], + 
"execution_count": null, + "outputs": [], + "id": "f0b405fa-f9aa-40b4-b266-54d1de4b2317" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DHS tables/names" + ], + "id": "260e3c90-a72c-4683-acc9-d3ed3d7ac516" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "data_source <- 'DHS'\n", + "# indicator_public_care <- 'PUBLIC_CARE'\n", + "# indicator_private_care <- 'PRIVATE_CARE'\n", + "# indicator_no_care <- 'NO_CARE'" + ], + "execution_count": null, + "outputs": [], + "id": "b95b6df7-2781-4ec5-9c5b-fe79227268de" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_PCT_CARESEEKING_SAMPLE_AVERAGE\")\n", + "careseeking_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", + "\n", + "# all columns which are not admin columns, are indicator columns\n", + "all_indicators <- setdiff(names(careseeking_table), admin_cols)" + ], + "execution_count": null, + "outputs": [], + "id": "7d5b1eae-4933-45e1-9cb8-981163cd1369" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Maps" + ], + "id": "530e1567-2820-4a91-bd97-68af0df7aa4c" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "plot_data = merge(spatial_data, careseeking_table, by = admin_cols, all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "0120c63f-27ab-421b-b635-1e538295c466" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "export_careseeking_reporting_plots(\n", + " plot_data = plot_data,\n", + " all_indicators = all_indicators,\n", + " output_plots_path = OUTPUT_PLOTS_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level\n", + ")" + ], + "execution_count": 
null, + "outputs": [], + "id": "071dd126-b29d-4f67-b1a2-89b5c09a43e7" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Confidence interval plots" + ], + "id": "f42cc870-2d67-450d-9fd7-616cfb099eb2" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "export_careseeking_reporting_ci_plots(\n", + " all_indicators = all_indicators,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " output_plots_path = OUTPUT_PLOTS_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_source = data_source,\n", + " admin_level = admin_level,\n", + " admin_name_col = admin_name_col\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "ff3157da-2816-40cc-9d73-f53849947fe9" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "5eefeb57-1a36-45a9-a462-a4fe098e3bda" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "0ef48ace-d77e-49bf-9b21-1cece3d48161", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4156fe8-631a-4012-8c66-08dc8a721851", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20a367b8-844b-41a7-8725-4bd37bda0352", - "metadata": {}, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - 
"CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')\n", - "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d81fb691-698d-4832-9137-f8c6f0c5938c", - "metadata": {}, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2ab76ca-5e05-4867-af1f-ade9d540a1b9", - "metadata": {}, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "print(paste(\"Country code: \", COUNTRY_CODE))" - ] - }, - { - "cell_type": "markdown", - "id": "484aed3b-0c40-40f4-8f81-0c4ed16a5d49", - "metadata": {}, - "source": [ - "## Geo data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d326034-33ff-40d6-a860-81d0bd5a1c34", - "metadata": {}, - 
"outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0b405fa-f9aa-40b4-b266-54d1de4b2317", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "260e3c90-a72c-4683-acc9-d3ed3d7ac516", - "metadata": {}, - "source": [ - "## DHS tables/names" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b95b6df7-2781-4ec5-9c5b-fe79227268de", - "metadata": {}, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "# indicator_public_care <- 'PUBLIC_CARE'\n", - "# indicator_private_care <- 'PRIVATE_CARE'\n", - "# indicator_no_care <- 'NO_CARE'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d5b1eae-4933-45e1-9cb8-981163cd1369", - 
"metadata": {}, - "outputs": [], - "source": [ - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_PCT_CARESEEKING_SAMPLE_AVERAGE\")\n", - "careseeking_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", - "\n", - "# all columns which are not admin columns, are indicator columns\n", - "all_indicators <- setdiff(names(careseeking_table), admin_cols)" - ] - }, - { - "cell_type": "markdown", - "id": "530e1567-2820-4a91-bd97-68af0df7aa4c", - "metadata": {}, - "source": [ - "## Maps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0120c63f-27ab-421b-b635-1e538295c466", - "metadata": {}, - "outputs": [], - "source": [ - "plot_data = merge(spatial_data, careseeking_table, by = admin_cols, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "071dd126-b29d-4f67-b1a2-89b5c09a43e7", - "metadata": {}, - "outputs": [], - "source": [ - "for (indicator_name in all_indicators){\n", - " \n", - " plot_label = gsub('PCT ', '', gsub('_', ' ', indicator_name))\n", - "\n", - " indicator_plot <- make_dhs_map(\n", - " plot_dt = plot_data,\n", - " plot_colname = indicator_name,\n", - " title_name = glue(\"Percentage children: {plot_label}\"),\n", - " legend_title = glue(\"%\"),\n", - " scale_limits = c(0, 100)\n", - " )\n", - " # indicator_plot <- ggplot(plot_data) +\n", - " # geom_sf(aes(fill = get(indicator_name))) +\n", - " # # geom_sf(aes(fill = U5_PREV_RDT_SAMPLE_AVERAGE)) +\n", - " # scale_fill_gradient(\n", - " # limits = c(0,1),\n", - " # low = \"white\",\n", - " # high = \"navy\",\n", - " # na.value = \"grey90\"\n", - " # ) +\n", - " # coord_sf() + # map projection\n", - " # theme_classic() +\n", - " # theme(plot.title = element_text(face = \"bold\", hjust = 0.5),\n", - " # legend.position = \"bottom\", legend.key.width = unit(2,\"cm\"), legend.text=element_text(size=10)) +\n", - " # labs(fill = glue(\"Percentage {plot_label}\"))\n", - " \n", - " # 
print(indicator_plot)\n", - " ggsave(indicator_plot, file = file.path(OUTPUT_PLOTS_PATH, glue('{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_name)}_plot.png')), dpi = 500)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "f42cc870-2d67-450d-9fd7-616cfb099eb2", - "metadata": {}, - "source": [ - "## Confidence interval plots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff3157da-2816-40cc-9d73-f53849947fe9", - "metadata": {}, - "outputs": [], - "source": [ - "for (indicator_name in all_indicators){\n", - " \n", - " indicator_label <- gsub('_', ' ', indicator_name)\n", - " \n", - " ci_data <- fread(file.path(\n", - " OUTPUT_DATA_PATH,\n", - " glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_name}.csv\")\n", - " ))\n", - " \n", - " sample_avg_col <- glue(\"{indicator_name}_SAMPLE_AVERAGE\")\n", - " lower_bound_col <- glue(\"{indicator_name}_CI_LOWER_BOUND\")\n", - " upper_bound_col <- glue(\"{indicator_name}_CI_UPPER_BOUND\")\n", - " ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} {indicator_label} CI\")\n", - " ci_plot_xlab <- admin_name_col\n", - " ci_plot_ylab <- glue(\"{indicator_label} (%)\")\n", - " ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_name)}_CI_plot.png\")\n", - " \n", - " ci_plot <- make_ci_plot(\n", - " df_to_plot=ci_data,\n", - " admin_colname=admin_name_col,\n", - " point_estimation_colname=sample_avg_col,\n", - " ci_lower_colname=lower_bound_col,\n", - " ci_upper_colname=upper_bound_col,\n", - " title_name=ci_plot_title,\n", - " x_title=ci_plot_xlab,\n", - " y_title=ci_plot_ylab\n", - " )\n", - " \n", - " ggsave(plot=ci_plot, filename=file.path(OUTPUT_PLOTS_PATH, ci_plot_filename), width = 8, height = 6, dpi = 300)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5eefeb57-1a36-45a9-a462-a4fe098e3bda", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - 
"display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhs_indicators/reporting/snt_dhs_mortality_report.ipynb b/pipelines/snt_dhs_indicators/reporting/snt_dhs_mortality_report.ipynb index 36fa1b0..f57ed9f 100644 --- a/pipelines/snt_dhs_indicators/reporting/snt_dhs_mortality_report.ipynb +++ b/pipelines/snt_dhs_indicators/reporting/snt_dhs_mortality_report.ipynb @@ -1,239 +1,238 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "2464d129-cfb5-4771-8491-ebf35fc9d532", - "metadata": {}, - "source": [ - "# Plots for under-5 mortality (DHS data)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plots for under-5 mortality (DHS data)" + ], + "id": "2464d129-cfb5-4771-8491-ebf35fc9d532" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary steps" + ], + "id": "56a45126-27ea-4fc8-8714-5dcff26dd97c" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)\n", + "\n", + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "\n", + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'mortality')\n", + "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 
'outputs')\n", + "\n", + "# Load notebook-specific utilities\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_mortality_report.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)\n", + "\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME))\n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "\n", + "print(paste(\"Country code: \", COUNTRY_CODE))\n", + "\n", + "data_source <- 'DHS'" + ], + "execution_count": null, + "outputs": [], + "id": "5d6705d9-1730-49fd-804c-5754c8d35ed3" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Geo data ----------------------------------------------------------------\n", + "\n", + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)\n", + "\n", + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + 
"spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "520b0296-3941-4276-942e-0165f7974a61" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Map" + ], + "id": "02caf8c4-124b-4f84-9768-4a5137f9458c" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "indicator_u5mr <- 'U5MR_PERMIL'\n", + "lower_bound_col <- glue(\"{toupper(indicator_u5mr)}_CI_LOWER_BOUND\")\n", + "upper_bound_col <- glue(\"{toupper(indicator_u5mr)}_CI_UPPER_BOUND\")\n", + "sample_avg_col <- glue(\"{toupper(indicator_u5mr)}_SAMPLE_AVERAGE\")\n", + "\n", + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}\")\n", + "u5mort_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))" + ], + "execution_count": null, + "outputs": [], + "id": "66cd9b28-1ee4-4547-8e39-7c13f5d3c360" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "plot_data = merge(spatial_data, u5mort_table, by = admin_cols, all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "3bc549c8-52f2-4fc9-bc4a-cd0060527524" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "u5_mort_permil_plot <- make_dhs_map(\n", + " plot_dt = plot_data,\n", + " plot_colname = sample_avg_col,\n", + " 
title_name = \"Under-5 mortality (\\u2030)\",\n", + " legend_title = \"\\u2030\",\n", + " scale_limits = c(0, 200)\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "f993b56a-cd3b-4c23-a627-2001db173f93" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# u5_mort_permil_plot\n", + "plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}_plot.png\")\n", + "ggsave(u5_mort_permil_plot, file = file.path(OUTPUT_PLOTS_PATH, plot_filename), dpi = 500)" + ], + "execution_count": null, + "outputs": [], + "id": "192e947d-710d-4d90-be25-dbd83262394f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Confidence interval plot" + ], + "id": "bef1a6d0-4163-45f1-be3d-877d4c747bfe" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} {indicator_u5mr} (95% Confidence Intervals)\")\n", + "ci_plot_xlab <- admin_name_col\n", + "ci_plot_ylab <- glue(\"Under-5 mortality (\\u2030)\")\n", + "ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}_CI_plot.png\")\n", + "ci_u5_mort_permil_plot <- make_ci_plot(\n", + " df_to_plot=plot_data,\n", + " admin_colname=admin_name_col,\n", + " point_estimation_colname=sample_avg_col,\n", + " ci_lower_colname=lower_bound_col,\n", + " ci_upper_colname=upper_bound_col,\n", + " title_name=ci_plot_title,\n", + " x_title=ci_plot_xlab,\n", + " y_title=ci_plot_ylab\n", + ")\n", + "\n", + "ggsave(filename=file.path(OUTPUT_PLOTS_PATH, ci_plot_filename), plot=ci_u5_mort_permil_plot, width = 8, height = 6, dpi = 300)" + ], + "execution_count": null, + "outputs": [], + "id": "2ccc88b9-31a9-4972-9f31-690ee891d4de" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [], + "execution_count": null, + "outputs": [], + "id": "ec398b6f-554f-45f4-bae3-155490e40fbb" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" 
+ }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "56a45126-27ea-4fc8-8714-5dcff26dd97c", - "metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d6705d9-1730-49fd-804c-5754c8d35ed3", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)\n", - "\n", - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "\n", - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'mortality')\n", - "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded 
from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "\n", - "# COUNTRY_CODE <- \"BFA\"\n", - "print(paste(\"Country code: \", COUNTRY_CODE))\n", - "\n", - "data_source <- 'DHS'\n", - "# dhs_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHS_DATASET" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "520b0296-3941-4276-942e-0165f7974a61", - "metadata": {}, - "outputs": [], - "source": [ - "# Geo data ----------------------------------------------------------------\n", - "\n", - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)\n", - "\n", - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "02caf8c4-124b-4f84-9768-4a5137f9458c", - "metadata": {}, - "source": [ - "## Map" - ] - }, 
- { - "cell_type": "code", - "execution_count": null, - "id": "66cd9b28-1ee4-4547-8e39-7c13f5d3c360", - "metadata": {}, - "outputs": [], - "source": [ - "indicator_u5mr <- 'U5MR_PERMIL'\n", - "lower_bound_col <- glue(\"{toupper(indicator_u5mr)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- glue(\"{toupper(indicator_u5mr)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_u5mr)}_SAMPLE_AVERAGE\")\n", - "\n", - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}\")\n", - "u5mort_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bc549c8-52f2-4fc9-bc4a-cd0060527524", - "metadata": {}, - "outputs": [], - "source": [ - "plot_data = merge(spatial_data, u5mort_table, by = admin_cols, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f993b56a-cd3b-4c23-a627-2001db173f93", - "metadata": {}, - "outputs": [], - "source": [ - "u5_mort_permil_plot <- make_dhs_map(\n", - " plot_dt = plot_data,\n", - " plot_colname = sample_avg_col,\n", - " title_name = \"Under-5 mortality (\\u2030)\",\n", - " legend_title = \"\\u2030\",\n", - " scale_limits = c(0, 200)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "192e947d-710d-4d90-be25-dbd83262394f", - "metadata": {}, - "outputs": [], - "source": [ - "# u5_mort_permil_plot\n", - "plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}_plot.png\")\n", - "ggsave(u5_mort_permil_plot, file = file.path(OUTPUT_PLOTS_PATH, plot_filename), dpi = 500)" - ] - }, - { - "cell_type": "markdown", - "id": "bef1a6d0-4163-45f1-be3d-877d4c747bfe", - "metadata": {}, - "source": [ - "## Confidence interval plot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ccc88b9-31a9-4972-9f31-690ee891d4de", - "metadata": {}, - "outputs": [], - "source": [ - 
"ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} {indicator_u5mr} (95% Confidence Intervals)\")\n", - "ci_plot_xlab <- admin_name_col\n", - "ci_plot_ylab <- glue(\"Under-5 mortality (\\u2030)\")\n", - "ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}_CI_plot.png\")\n", - "ci_u5_mort_permil_plot <- make_ci_plot(\n", - " df_to_plot=plot_data,\n", - " admin_colname=admin_name_col,\n", - " point_estimation_colname=sample_avg_col,\n", - " ci_lower_colname=lower_bound_col,\n", - " ci_upper_colname=upper_bound_col,\n", - " title_name=ci_plot_title,\n", - " x_title=ci_plot_xlab,\n", - " y_title=ci_plot_ylab\n", - ")\n", - "\n", - "ggsave(filename=file.path(OUTPUT_PLOTS_PATH, ci_plot_filename), plot=ci_u5_mort_permil_plot, width = 8, height = 6, dpi = 300)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ec398b6f-554f-45f4-bae3-155490e40fbb", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhs_indicators/reporting/snt_dhs_prevalence_report.ipynb b/pipelines/snt_dhs_indicators/reporting/snt_dhs_prevalence_report.ipynb index f6e177e..e1eeb3b 100644 --- a/pipelines/snt_dhs_indicators/reporting/snt_dhs_prevalence_report.ipynb +++ b/pipelines/snt_dhs_indicators/reporting/snt_dhs_prevalence_report.ipynb @@ -1,274 +1,275 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "b94b23b4-408e-4a70-81c0-ea34a575defb", - "metadata": {}, - "source": [ - "# Plots for under-5 Prevalence of Malaria (DHS data)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plots 
for under-5 Prevalence of Malaria (DHS data)" + ], + "id": "b94b23b4-408e-4a70-81c0-ea34a575defb" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary steps" + ], + "id": "00d4b58b-598b-40c4-8911-f4beb622e987" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)\n", + "\n", + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ], + "execution_count": null, + "outputs": [], + "id": "3d8bbf28-cc3d-40f7-acd0-a6e8a0fa51ff" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'prevalence')\n", + "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" + ], + "execution_count": null, + "outputs": [], + "id": "08d440aa-ac71-4c1d-988a-3ae62f6170a2" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Load notebook-specific utilities\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhs_prevalence_report.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"arrow\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)\n", + "\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- 
tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME))\n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "\n", + "print(paste(\"Country code: \", COUNTRY_CODE))" + ], + "execution_count": null, + "outputs": [], + "id": "9c59c670-1a5c-4b17-8188-24aaa2ecc614" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Geo data" + ], + "id": "56be88f6-6b71-4cbb-b3d8-81dd4ec8e7cd" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "62fe94e7-3b59-469c-b29e-ee7c6c735c39" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + ")\n", + "\n", + "# keep 
class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "2a2b9aa0-223e-4688-ad21-ca1a0203380e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Map" + ], + "id": "36611c8f-41db-4c2b-8fc0-5879944e611d" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "data_source <- 'DHS'\n", + "indicator_u5prev <- 'PCT_U5_PREV_RDT'" + ], + "execution_count": null, + "outputs": [], + "id": "82e7f761-5d2a-4a07-8d68-c8a7d3f51f29" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}\")\n", + "u5_rdt_prevalence_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", + "\n", + "plot_data = merge(spatial_data, u5_rdt_prevalence_table, by = admin_cols, all = TRUE)" + ], + "execution_count": null, + "outputs": [], + "id": "b9c97a19-dc70-4f4e-a812-816449b24bea" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "lower_bound_col <- glue(\"{toupper(indicator_u5prev)}_CI_LOWER_BOUND\")\n", + "upper_bound_col <- glue(\"{toupper(indicator_u5prev)}_CI_UPPER_BOUND\")\n", + "sample_avg_col <- glue(\"{toupper(indicator_u5prev)}_SAMPLE_AVERAGE\")" + ], + "execution_count": null, + "outputs": [], + "id": "d4eb221f-c86d-46c0-b0c7-93644322724d" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "u5_rdt_prevalence_plot <- make_dhs_map(\n", + " plot_dt = plot_data,\n", + " plot_colname = sample_avg_col,\n", + " title_name = \"Under-5 malaria prevalence (RDT, %)\",\n", + " legend_title = '%',\n", + " scale_limits = c(0, 100)\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "b6f3d885-e41f-4d6d-9ace-d574c28c9466" + }, + { + "cell_type": "code", + "metadata": {}, + 
"source": [ + "plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}_plot.png\")\n", + "ggsave(u5_rdt_prevalence_plot, file = file.path(OUTPUT_PLOTS_PATH, plot_filename), dpi = 500)" + ], + "execution_count": null, + "outputs": [], + "id": "14def9f0-23b6-4008-a39c-d344516be71a" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Confidence interval plot" + ], + "id": "b8c9ac5b-5000-45e7-a358-7226846a55b0" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} {toupper(indicator_u5prev)} CI\")\n", + "ci_plot_xlab <- admin_name_col\n", + "ci_plot_ylab <- glue(\"% children with positive malaria RDT\")\n", + "ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}_CI_plot.png\")\n", + "ci_u5_rdt_prevalence_plot <- make_ci_plot(\n", + " df_to_plot=plot_data,\n", + " admin_colname=admin_name_col,\n", + " point_estimation_colname=sample_avg_col,\n", + " ci_lower_colname=lower_bound_col,\n", + " ci_upper_colname=upper_bound_col,\n", + " title_name=ci_plot_title,\n", + " x_title=ci_plot_xlab,\n", + " y_title=ci_plot_ylab\n", + ")\n", + "# ci_u5_rdt_prevalence_plot\n", + "ggsave(filename=file.path(OUTPUT_PLOTS_PATH, ci_plot_filename), plot=ci_u5_rdt_prevalence_plot, width = 8, height = 6, dpi = 300)" + ], + "execution_count": null, + "outputs": [], + "id": "58a07481-81a5-45e2-93f3-3ebf3cfca649" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [], + "execution_count": null, + "outputs": [], + "id": "61bb2341-376f-47f2-9517-e4c6ab952885" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "00d4b58b-598b-40c4-8911-f4beb622e987", - 
"metadata": {}, - "source": [ - "## Preliminary steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3d8bbf28-cc3d-40f7-acd0-a6e8a0fa51ff", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)\n", - "\n", - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08d440aa-ac71-4c1d-988a-3ae62f6170a2", - "metadata": {}, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'prevalence')\n", - "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9c59c670-1a5c-4b17-8188-24aaa2ecc614", - "metadata": {}, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"sf\", \"glue\", \"data.table\", \"ggplot2\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - 
"msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "\n", - "print(paste(\"Country code: \", COUNTRY_CODE))" - ] - }, - { - "cell_type": "markdown", - "id": "56be88f6-6b71-4cbb-b3d8-81dd4ec8e7cd", - "metadata": {}, - "source": [ - "## Geo data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62fe94e7-3b59-469c-b29e-ee7c6c735c39", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - "admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a2b9aa0-223e-4688-ad21-ca1a0203380e", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - ")\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}" - ] - }, - { - "cell_type": "markdown", - 
"id": "36611c8f-41db-4c2b-8fc0-5879944e611d", - "metadata": {}, - "source": [ - "## Map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82e7f761-5d2a-4a07-8d68-c8a7d3f51f29", - "metadata": {}, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "indicator_u5prev <- 'PCT_U5_PREV_RDT'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9c97a19-dc70-4f4e-a812-816449b24bea", - "metadata": {}, - "outputs": [], - "source": [ - "filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}\")\n", - "u5_rdt_prevalence_table <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", - "\n", - "plot_data = merge(spatial_data, u5_rdt_prevalence_table, by = admin_cols, all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4eb221f-c86d-46c0-b0c7-93644322724d", - "metadata": {}, - "outputs": [], - "source": [ - "lower_bound_col <- glue(\"{toupper(indicator_u5prev)}_CI_LOWER_BOUND\")\n", - "upper_bound_col <- glue(\"{toupper(indicator_u5prev)}_CI_UPPER_BOUND\")\n", - "sample_avg_col <- glue(\"{toupper(indicator_u5prev)}_SAMPLE_AVERAGE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6f3d885-e41f-4d6d-9ace-d574c28c9466", - "metadata": {}, - "outputs": [], - "source": [ - "u5_rdt_prevalence_plot <- make_dhs_map(\n", - " plot_dt = plot_data,\n", - " plot_colname = sample_avg_col,\n", - " title_name = \"Under-5 malaria prevalence (RDT, %)\",\n", - " legend_title = '%',\n", - " scale_limits = c(0, 100)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14def9f0-23b6-4008-a39c-d344516be71a", - "metadata": {}, - "outputs": [], - "source": [ - "plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}_plot.png\")\n", - "ggsave(u5_rdt_prevalence_plot, file = file.path(OUTPUT_PLOTS_PATH, plot_filename), dpi = 500)" - ] - }, - { - 
"cell_type": "markdown", - "id": "b8c9ac5b-5000-45e7-a358-7226846a55b0", - "metadata": {}, - "source": [ - "## Confidence interval plot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58a07481-81a5-45e2-93f3-3ebf3cfca649", - "metadata": {}, - "outputs": [], - "source": [ - "ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} {toupper(indicator_u5prev)} CI\")\n", - "ci_plot_xlab <- admin_name_col\n", - "ci_plot_ylab <- glue(\"% children with positive malaria RDT\")\n", - "ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}_CI_plot.png\")\n", - "ci_u5_rdt_prevalence_plot <- make_ci_plot(\n", - " df_to_plot=plot_data,\n", - " admin_colname=admin_name_col,\n", - " point_estimation_colname=sample_avg_col,\n", - " ci_lower_colname=lower_bound_col,\n", - " ci_upper_colname=upper_bound_col,\n", - " title_name=ci_plot_title,\n", - " x_title=ci_plot_xlab,\n", - " y_title=ci_plot_ylab\n", - ")\n", - "# ci_u5_rdt_prevalence_plot\n", - "ggsave(filename=file.path(OUTPUT_PLOTS_PATH, ci_plot_filename), plot=ci_u5_rdt_prevalence_plot, width = 8, height = 6, dpi = 300)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61bb2341-376f-47f2-9517-e4c6ab952885", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhs_indicators/reporting/snt_dhs_vaccination_report.ipynb b/pipelines/snt_dhs_indicators/reporting/snt_dhs_vaccination_report.ipynb index f95bcb9..f47101c 100644 --- a/pipelines/snt_dhs_indicators/reporting/snt_dhs_vaccination_report.ipynb +++ 
b/pipelines/snt_dhs_indicators/reporting/snt_dhs_vaccination_report.ipynb @@ -1,333 +1,334 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "4a75d418-3144-427f-8fa3-2cb7f727e66d", - "metadata": {}, - "source": [ - "# Plots for DTP Vaccination and attrition rates (DHS data)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plots for DTP Vaccination and attrition rates (DHS data)" + ], + "id": "4a75d418-3144-427f-8fa3-2cb7f727e66d" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminaries" + ], + "id": "6b666c7a-f105-4fad-aea0-d23a38fa0153" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "rm(list = ls())\n", + "\n", + "options(scipen=999)" + ], + "execution_count": null, + "outputs": [], + "id": "dbcf1df6-c264-48b3-bf53-f13a4b036487" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Global paths\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ], + "execution_count": null, + "outputs": [], + "id": "449ea786-ac70-4513-aaf5-24ffe69aec5c" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators')\n", + "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", + "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", + "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", + "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'vaccination')\n", + "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" + ], + "execution_count": null, + "outputs": [], + "id": "c9037624-3ac0-403c-9ea0-fc78891c2393" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Load notebook-specific utilities\n", + "source(file.path(PIPELINE_PATH, \"utils\", 
\"snt_dhs_vaccination_report.r\"))\n", + "\n", + "# List required pcks\n", + "required_packages <- c(\"haven\", \"glue\", \"survey\", \"data.table\", \"sf\", \"ggplot2\", \"stringi\", \"reticulate\", \"jsonlite\", \"arrow\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ], + "execution_count": null, + "outputs": [], + "id": "1f13522e-a0e5-44b7-b8f5-b66dc7ca37c0" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "print(paste(\"Country code: \", COUNTRY_CODE))" + ], + "execution_count": null, + "outputs": [], + "id": "50d8b838-dd97-4646-b141-a3aa9c2ab681" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Geo data" + ], + "id": "ff080767-9fdb-4790-988c-e2b4c4f7226f" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "admin_level <- 'ADM1'\n", + "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", + "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", + "admin_cols <- c(admin_id_col, admin_name_col)" + ], + "execution_count": null, + "outputs": [], + "id": "56270983-dbb7-4b6f-aa0f-9682c5cf194f" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Load spatial file from dataset\n", + "\n", + "dhis2_dataset <- 
config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", + "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", + "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", + "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", + "\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# aggregate geometries by the admin columns\n", + "spatial_data <- aggregate_geometry(\n", + " sf_data=spatial_data,\n", + " admin_id_colname=admin_id_col,\n", + " admin_name_colname=admin_name_col\n", + " )\n", + "\n", + "# keep class\n", + "spatial_data <- st_as_sf(spatial_data)\n", + "\n", + "# DRC provinces need to be cleaned\n", + "if(COUNTRY_CODE == \"COD\"){\n", + " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "1308ee0c-4856-445f-b518-ee0f4497c9b5" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import DHS data" + ], + "id": "ba6c6b6b-e0fd-434a-8e44-6c45bea47d97" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "data_source <- 'DHS'\n", + "vaccination_doses <- c(1, 2, 3)\n", + "indicator_access <- 'PCT_DTP'\n", + "indicator_attrition <- 'PCT_DROPOUT_DTP'" + ], + "execution_count": null, + "outputs": [], + "id": "cb1a56ee-5191-4930-a8d6-fa0075534725" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For each vaccine dose, do everything :D\n", + "- add the admin units and save to .csv and parquet\n", + "- add the spatial data\n", + "- make percentage maps (sample average) and save them\n", + "- make confidence interval plots for the regions and save them" + ], + "id": "a2034bea-e0fd-4268-b383-4d39d9cd7e75" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "## Maps and CI plots" + ], + "id": "0b8996bc-8f6a-4eea-821a-9f9013bbc8c2" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "for (dose_number in vaccination_doses){\n", + " table_name <- glue(\"{toupper(indicator_access)}{dose_number}\")\n", + " filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{table_name}\")\n", + " df <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", + " \n", + " vaccine_colname <- glue(\"{toupper(indicator_access)}{dose_number}\")\n", + " \n", + " # change the names of the columns\n", + " sample_avg_col <- paste(vaccine_colname, 'SAMPLE_AVERAGE', sep = '_')\n", + " lower_bound_col <- paste(vaccine_colname, 'CI_LOWER_BOUND', sep = '_')\n", + " upper_bound_col <- paste(vaccine_colname, 'CI_UPPER_BOUND', sep = '_')\n", + " \n", + " # add spatial data\n", + " plot_data <- merge(spatial_data, df, by = admin_cols, all = TRUE)\n", + " \n", + " print(glue('Processing data for', vaccine_colname, .sep = ' '))\n", + "\n", + " plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}{dose_number}_plot.png\")\n", + " save_path = file.path(OUTPUT_PLOTS_PATH, plot_filename)\n", + " # print(save_path)\n", + "\n", + " map_title = glue(\"{COUNTRY_CODE} DTP{dose_number} vaccine coverage (%)\")\n", + " # make, show and save the plot\n", + " dose_plot <- make_dhs_map(\n", + " plot_dt = plot_data,\n", + " plot_colname = sample_avg_col,\n", + " title_name = glue(\"{COUNTRY_CODE} DTP{dose_number} vaccine coverage (%)\"),\n", + " legend_title = \"%\",\n", + " scale_limits = c(0, 100)\n", + " )\n", + " ggsave(filename = save_path, plot = dose_plot, width = 8, height = 6, dpi = 300)\n", + " \n", + " # make the confidence interval plot\n", + " ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} DTP{dose_number} CI\")\n", + " ci_plot_xlab <- admin_name_col\n", + " ci_plot_ylab <- glue(\"DTP{dose_number} vaccinated (%)\")\n", + " ci_plot_filename <- 
glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_access}{dose_number}_CI_plot.png\")\n", + " ci_dtp_plot <- make_ci_plot(\n", + " df_to_plot=plot_data,\n", + " admin_colname=admin_name_col,\n", + " point_estimation_colname=sample_avg_col,\n", + " ci_lower_colname=lower_bound_col,\n", + " ci_upper_colname=upper_bound_col,\n", + " title_name=ci_plot_title,\n", + " x_title=ci_plot_xlab,\n", + " y_title=ci_plot_ylab\n", + " )\n", + " ggsave(filename=file.path(OUTPUT_PLOTS_PATH, ci_plot_filename), plot=ci_dtp_plot, width = 8, height = 6, dpi = 300)\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "a1c19f83-a1e1-4b1d-bc26-605e4ed4fa06" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dropout rates plots, for each vaccine dose" + ], + "id": "375952b4-4a25-435c-aae1-c53c54e9382c" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "dtp_dropout_filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_attrition}\")\n", + "DTP_DROPOUT <- fread(file.path(OUTPUT_DATA_PATH, paste0(dtp_dropout_filename_without_extension, \".csv\")))" + ], + "execution_count": null, + "outputs": [], + "id": "d817600e-4be1-42d5-b06a-95b7f5080afb" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "for(current_dose in vaccination_doses){\n", + " for (reference_dose in 1:(current_dose - 1)){\n", + " if((reference_dose >= 1) & (reference_dose < current_dose)){\n", + " dropout_colname <- glue(\"{indicator_attrition}_{reference_dose}_{current_dose}\")\n", + " print(glue('Plotting attrition for {dropout_colname}'))\n", + " dropout_plot_title = glue(\"{COUNTRY_CODE} DTP vaccine dropout doses {reference_dose} to {current_dose} (%)\")\n", + " dropout_plot_data <- merge(spatial_data, DTP_DROPOUT, by = admin_cols)\n", + " dropout_plot <- make_dhs_map(\n", + " plot_dt = dropout_plot_data,\n", + " plot_colname = dropout_colname,\n", + " title_name = dropout_plot_title,\n", + " legend_title = 
'%',\n", + " scale_limits = c(0, 100)\n", + " )\n", + " dropout_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(dropout_colname)}_plot.png\")\n", + " ggsave(filename = file.path(OUTPUT_PLOTS_PATH, dropout_plot_filename), plot = dropout_plot, width = 8, height = 6, dpi = 300)\n", + "\n", + " }\n", + " }\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "13b6240e-755c-49bb-bae4-5f2175159211" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [], + "execution_count": null, + "outputs": [], + "id": "67676f86-0e5d-42db-8189-b7569d640b54" + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "6b666c7a-f105-4fad-aea0-d23a38fa0153", - "metadata": {}, - "source": [ - "## Preliminaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dbcf1df6-c264-48b3-bf53-f13a4b036487", - "metadata": {}, - "outputs": [], - "source": [ - "rm(list = ls())\n", - "\n", - "options(scipen=999)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "449ea786-ac70-4513-aaf5-24ffe69aec5c", - "metadata": {}, - "outputs": [], - "source": [ - "# Global paths\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9037624-3ac0-403c-9ea0-fc78891c2393", - "metadata": {}, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 
'indicators', 'vaccination')\n", - "OUTPUT_PLOTS_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_dhs_indicators', 'reporting', 'outputs')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f13522e-a0e5-44b7-b8f5-b66dc7ca37c0", - "metadata": {}, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required pcks\n", - "required_packages <- c(\"haven\", \"glue\", \"survey\", \"data.table\", \"sf\", \"ggplot2\", \"stringi\", \"reticulate\", \"jsonlite\", \"arrow\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50d8b838-dd97-4646-b141-a3aa9c2ab681", - "metadata": {}, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "print(paste(\"Country code: \", COUNTRY_CODE))" - ] - }, - { - "cell_type": "markdown", - "id": "ff080767-9fdb-4790-988c-e2b4c4f7226f", - "metadata": {}, - "source": [ - "## Geo data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56270983-dbb7-4b6f-aa0f-9682c5cf194f", - "metadata": {}, - "outputs": [], - "source": [ - "admin_level <- 'ADM1'\n", - "admin_id_col <- glue(admin_level, 'ID', .sep='_')\n", - "admin_name_col <- glue(admin_level, 'NAME', .sep='_')\n", - 
"admin_cols <- c(admin_id_col, admin_name_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1308ee0c-4856-445f-b518-ee0f4497c9b5", - "metadata": {}, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))\n", - "\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# aggregate geometries by the admin columns\n", - "spatial_data <- aggregate_geometry(\n", - " sf_data=spatial_data,\n", - " admin_id_colname=admin_id_col,\n", - " admin_name_colname=admin_name_col\n", - " )\n", - "\n", - "# keep class\n", - "spatial_data <- st_as_sf(spatial_data)\n", - "\n", - "# DRC provinces need to be cleaned\n", - "if(COUNTRY_CODE == \"COD\"){\n", - " spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "ba6c6b6b-e0fd-434a-8e44-6c45bea47d97", - "metadata": {}, - "source": [ - "## Import DHS data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb1a56ee-5191-4930-a8d6-fa0075534725", - "metadata": {}, - "outputs": [], - "source": [ - "data_source <- 'DHS'\n", - "vaccination_doses <- c(1, 2, 3)\n", - "indicator_access <- 'PCT_DTP'\n", - "indicator_attrition <- 'PCT_DROPOUT_DTP'" - ] - }, - { - "cell_type": "markdown", - "id": "a2034bea-e0fd-4268-b383-4d39d9cd7e75", - "metadata": {}, - "source": [ - "For each vaccine dose, do everything :D\n", - "- add the admin units and save to .csv and parquet\n", - "- add the spatial data\n", - "- make 
percentage maps (sample average) and save them\n", - "- make confidence interval plots for the regions and save them" - ] - }, - { - "cell_type": "markdown", - "id": "0b8996bc-8f6a-4eea-821a-9f9013bbc8c2", - "metadata": {}, - "source": [ - "## Maps and CI plots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1c19f83-a1e1-4b1d-bc26-605e4ed4fa06", - "metadata": {}, - "outputs": [], - "source": [ - "for (dose_number in vaccination_doses){\n", - " table_name <- glue(\"{toupper(indicator_access)}{dose_number}\")\n", - " filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{table_name}\")\n", - " df <- fread(file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')))\n", - " \n", - " vaccine_colname <- glue(\"{toupper(indicator_access)}{dose_number}\")\n", - " \n", - " # change the names of the columns\n", - " sample_avg_col <- paste(vaccine_colname, 'SAMPLE_AVERAGE', sep = '_')\n", - " lower_bound_col <- paste(vaccine_colname, 'CI_LOWER_BOUND', sep = '_')\n", - " upper_bound_col <- paste(vaccine_colname, 'CI_UPPER_BOUND', sep = '_')\n", - " \n", - " # add spatial data\n", - " plot_data <- merge(spatial_data, df, by = admin_cols, all = TRUE)\n", - " \n", - " print(glue('Processing data for', vaccine_colname, .sep = ' '))\n", - "\n", - " plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_access)}{dose_number}_plot.png\")\n", - " save_path = file.path(OUTPUT_PLOTS_PATH, plot_filename)\n", - " # print(save_path)\n", - "\n", - " map_title = glue(\"{COUNTRY_CODE} DTP{dose_number} vaccine coverage (%)\")\n", - " # make, show and save the plot\n", - " dose_plot <- make_dhs_map(\n", - " plot_dt = plot_data,\n", - " plot_colname = sample_avg_col,\n", - " title_name = glue(\"{COUNTRY_CODE} DTP{dose_number} vaccine coverage (%)\"),\n", - " legend_title = \"%\",\n", - " scale_limits = c(0, 100)\n", - " )\n", - " ggsave(filename = save_path, plot = dose_plot, width = 8, height = 6, 
dpi = 300)\n", - " \n", - " # make the confidence interval plot\n", - " ci_plot_title <- glue(\"{COUNTRY_CODE} {data_source} DTP{dose_number} CI\")\n", - " ci_plot_xlab <- admin_name_col\n", - " ci_plot_ylab <- glue(\"DTP{dose_number} vaccinated (%)\")\n", - " ci_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_access}{dose_number}_CI_plot.png\")\n", - " ci_dtp_plot <- make_ci_plot(\n", - " df_to_plot=plot_data,\n", - " admin_colname=admin_name_col,\n", - " point_estimation_colname=sample_avg_col,\n", - " ci_lower_colname=lower_bound_col,\n", - " ci_upper_colname=upper_bound_col,\n", - " title_name=ci_plot_title,\n", - " x_title=ci_plot_xlab,\n", - " y_title=ci_plot_ylab\n", - " )\n", - " ggsave(filename=file.path(OUTPUT_PLOTS_PATH, ci_plot_filename), plot=ci_dtp_plot, width = 8, height = 6, dpi = 300)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "375952b4-4a25-435c-aae1-c53c54e9382c", - "metadata": {}, - "source": [ - "## Dropout rates plots, for each vaccine dose" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d817600e-4be1-42d5-b06a-95b7f5080afb", - "metadata": {}, - "outputs": [], - "source": [ - "dtp_dropout_filename_without_extension <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_attrition}\")\n", - "DTP_DROPOUT <- fread(file.path(OUTPUT_DATA_PATH, paste0(dtp_dropout_filename_without_extension, \".csv\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13b6240e-755c-49bb-bae4-5f2175159211", - "metadata": {}, - "outputs": [], - "source": [ - "for(current_dose in vaccination_doses){\n", - " for (reference_dose in 1:(current_dose - 1)){\n", - " if((reference_dose >= 1) & (reference_dose < current_dose)){\n", - " dropout_colname <- glue(\"{indicator_attrition}_{reference_dose}_{current_dose}\")\n", - " print(glue('Plotting attrition for {dropout_colname}'))\n", - " dropout_plot_title = glue(\"{COUNTRY_CODE} DTP vaccine dropout doses {reference_dose} to 
{current_dose} (%)\")\n", - " dropout_plot_data <- merge(spatial_data, DTP_DROPOUT, by = admin_cols)\n", - " dropout_plot <- make_dhs_map(\n", - " plot_dt = dropout_plot_data,\n", - " plot_colname = dropout_colname,\n", - " title_name = dropout_plot_title,\n", - " legend_title = '%',\n", - " scale_limits = c(0, 100)\n", - " )\n", - " dropout_plot_filename <- glue(\"{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(dropout_colname)}_plot.png\")\n", - " ggsave(filename = file.path(OUTPUT_PLOTS_PATH, dropout_plot_filename), plot = dropout_plot, width = 8, height = 6, dpi = 300)\n", - "\n", - " }\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67676f86-0e5d-42db-8189-b7569d640b54", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_bednets_computation.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_bednets_computation.r new file mode 100644 index 0000000..a2303de --- /dev/null +++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_bednets_computation.r @@ -0,0 +1,4 @@ +# Utils entrypoint for snt_dhs_bednets_computation.ipynb +if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace" +if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators") +source(file.path(PIPELINE_PATH, "utils", "snt_dhs_indicator_tables.r")) diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_bednets_report.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_bednets_report.r new file mode 100644 index 0000000..17712d0 --- /dev/null +++ 
b/pipelines/snt_dhs_indicators/utils/snt_dhs_bednets_report.r
@@ -0,0 +1,6 @@
+# Utils entrypoint for snt_dhs_bednets_report.ipynb
+# Path variables that already exist (e.g. defined by the calling notebook) are
+# kept; the defaults below are assigned only when a variable is missing.
+if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace"
+if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators")
+if (!exists("CODE_PATH", inherits = TRUE)) CODE_PATH <- file.path(ROOT_PATH, "code")
+# snt_utils.r is sourced only when the file exists under CODE_PATH.
+if (file.exists(file.path(CODE_PATH, "snt_utils.r"))) source(file.path(CODE_PATH, "snt_utils.r"))
+# Shared DHS indicator helpers of this pipeline.
+source(file.path(PIPELINE_PATH, "utils", "snt_dhs_indicator_tables.r"))
diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_careseeking_computation.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_careseeking_computation.r
new file mode 100644
index 0000000..737e738
--- /dev/null
+++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_careseeking_computation.r
@@ -0,0 +1,91 @@
+# Utils entrypoint for snt_dhs_careseeking_computation.ipynb
+# Path variables that already exist are kept; defaults are assigned otherwise.
+if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace"
+if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators")
+# Shared DHS indicator helpers of this pipeline.
+source(file.path(PIPELINE_PATH, "utils", "snt_dhs_indicator_tables.r"))
+
+#' Compute and export careseeking indicators with confidence intervals.
+#'
+#' For each indicator, runs survey-weighted estimation by admin unit, formats
+#' CI/sample-average columns as percentages, exports per-indicator tables, and
+#' builds a consolidated sample-average summary table.
+#'
+#' CI bounds are clipped to the [0, 1] range before the three estimate columns
+#' are multiplied by 100. Both merges with `admin_data` are called without an
+#' `all`/`all.x` argument, so (with the `merge.data.table` default of an inner
+#' join) rows without a match on `admin_name_col` are not kept. Every
+#' per-indicator table and the summary table are written as CSV and Parquet;
+#' the summary table has the `_SAMPLE_AVERAGE` suffix removed from its column
+#' names before it is written.
+#'
+#' @param kr_design_sampling Survey design object.
+#' @param indicator_names Character vector of careseeking indicator names.
+#' @param admin_name_col Admin grouping column name.
+#' @param admin_data Admin reference table for merges.
+#' @param output_data_path Directory to write output files.
+#' @param country_code Country code used in filenames.
+#' @param data_source DHS source code used in filenames.
+#' @param admin_level Admin level label used in filenames.
+#' @return List with `summary_table` and per-indicator `indicator_tables`.
+compute_careseeking_indicators <- function(
+    kr_design_sampling,
+    indicator_names,
+    admin_name_col,
+    admin_data,
+    output_data_path,
+    country_code,
+    data_source,
+    admin_level
+) {
+    # Write one table under `output_data_path` as both CSV and Parquet.
+    export_table <- function(tbl, file_stem) {
+        utils::write.csv(
+            tbl,
+            file = file.path(output_data_path, paste0(file_stem, ".csv")),
+            row.names = FALSE
+        )
+        arrow::write_parquet(tbl, file.path(output_data_path, paste0(file_stem, ".parquet")))
+    }
+
+    indicator_tables <- list()
+    summary_table <- data.table::copy(admin_data)
+
+    for (indicator_name in indicator_names) {
+        # Survey-weighted mean of the indicator per admin unit, with a 95% CI.
+        estimates <- survey::svyby(
+            formula = as.formula(paste("~", indicator_name)),
+            by = reformulate(admin_name_col),
+            design = kr_design_sampling,
+            FUN = survey::svymean,
+            vartype = "ci",
+            level = 0.95,
+            na.rm = TRUE,
+            influence = TRUE
+        )
+        data.table::setDT(estimates)
+
+        lower_bound_col <- as.character(glue::glue("{toupper(indicator_name)}_CI_LOWER_BOUND"))
+        upper_bound_col <- as.character(glue::glue("{toupper(indicator_name)}_CI_UPPER_BOUND"))
+        sample_avg_col <- as.character(glue::glue("{toupper(indicator_name)}_SAMPLE_AVERAGE"))
+
+        # Rename whichever of the svyby output columns are present.
+        data.table::setnames(
+            estimates,
+            old = c("ci_l", "ci_u", indicator_name),
+            new = c(lower_bound_col, upper_bound_col, sample_avg_col),
+            skip_absent = TRUE
+        )
+
+        # Clip the interval to [0, 1] (NA comparisons leave the value as is),
+        # then express all three estimates as percentages.
+        lower_values <- estimates[[lower_bound_col]]
+        upper_values <- estimates[[upper_bound_col]]
+        lower_values[which(lower_values < 0)] <- 0
+        upper_values[which(upper_values > 1)] <- 1
+        data.table::set(estimates, j = lower_bound_col, value = lower_values * 100)
+        data.table::set(estimates, j = upper_bound_col, value = upper_values * 100)
+        data.table::set(estimates, j = sample_avg_col, value = estimates[[sample_avg_col]] * 100)
+
+        # Admin name plus the sample-average column(s), taken before the admin merge.
+        sample_average_table <- estimates[
+            ,
+            c(admin_name_col, grep("SAMPLE_AVERAGE", names(estimates), value = TRUE)),
+            with = FALSE
+        ]
+
+        indicator_table <- data.table::merge.data.table(admin_data, estimates, by = admin_name_col)
+        summary_table <- data.table::merge.data.table(summary_table, sample_average_table, by = admin_name_col)
+
+        filename_without_extension <- glue::glue("{country_code}_{data_source}_{admin_level}_{toupper(indicator_name)}")
+        export_table(indicator_table, filename_without_extension)
+
+        indicator_tables[[indicator_name]] <- indicator_table
+    }
+
+    # The summary keeps one column per indicator, named without the suffix.
+    names(summary_table) <- gsub("_SAMPLE_AVERAGE", "", names(summary_table))
+    summary_filename_without_extension <- glue::glue("{country_code}_{data_source}_{admin_level}_PCT_CARESEEKING_SAMPLE_AVERAGE")
+    export_table(summary_table, summary_filename_without_extension)
+
+    list(summary_table = summary_table, indicator_tables = indicator_tables)
+}
diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_careseeking_report.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_careseeking_report.r
new file mode 100644
index 0000000..639896b
--- /dev/null
+++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_careseeking_report.r
@@ -0,0 +1,6 @@
+# Utils entrypoint for snt_dhs_careseeking_report.ipynb
+if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace"
+if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators")
+if (!exists("CODE_PATH", inherits = TRUE)) CODE_PATH <- file.path(ROOT_PATH, "code")
+if (file.exists(file.path(CODE_PATH, "snt_utils.r"))) source(file.path(CODE_PATH, "snt_utils.r"))
+source(file.path(PIPELINE_PATH, "utils", "snt_dhs_indicator_tables.r"))
diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r
new file mode 100644
index 0000000..ad4076c
--- /dev/null
+++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_indicator_tables.r
@@ -0,0 +1,328 @@
+#' Bootstrap runtime context for DHS indicators pipelines.
+#'
+#' Loads shared utilities/packages, initializes OpenHEXA SDK, parses
+#' configuration, and returns common paths/metadata used across DHS notebooks.
+#'
+#' Side effects: sources `snt_utils.r` from `<root_path>/code`, sets the
+#' `PROJ_LIB`, `GDAL_DATA` and `RETICULATE_PYTHON` environment variables, and
+#' assigns `openhexa` (the imported SDK module, or `NULL` when
+#' `load_openhexa` is `FALSE`) in the global environment. Stops with an error
+#' when `SNT_config.json` cannot be read.
+#'
+#' @param root_path Root workspace path.
+#' @param required_packages Character vector of required R packages.
+#' @param load_openhexa Whether to import OpenHEXA SDK.
+#' @return Named list with paths, config, country code and SDK handle.
+bootstrap_dhs_indicators_context <- function(
+    root_path = "~/workspace",
+    required_packages = c(
+        "haven", "sf", "glue", "survey", "data.table", "stringi",
+        "jsonlite", "httr", "reticulate", "arrow"
+    ),
+    load_openhexa = TRUE
+) {
+    code_path <- file.path(root_path, "code")
+    config_path <- file.path(root_path, "configuration")
+    data_path <- file.path(root_path, "data")
+    dhs_data_path <- file.path(data_path, "dhs", "raw")
+
+    # Export the library locations before any package is loaded, so that
+    # packages reading them while they initialise already see these values.
+    Sys.setenv(PROJ_LIB = "/opt/conda/share/proj")
+    Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal")
+    Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
+
+    # install_and_load() and log_msg() are presumably provided by snt_utils.r
+    # (TODO confirm); they are not defined in this file.
+    source(file.path(code_path, "snt_utils.r"))
+    install_and_load(required_packages)
+
+    openhexa <- NULL
+    if (load_openhexa) {
+        openhexa <- reticulate::import("openhexa.sdk")
+    }
+    # Also published globally; NOTE(review): presumably because shared helpers
+    # look up `openhexa` in the global environment - confirm against snt_utils.r.
+    assign("openhexa", openhexa, envir = .GlobalEnv)
+
+    config_file_name <- "SNT_config.json"
+    config_json <- tryCatch(
+        {
+            jsonlite::fromJSON(file.path(config_path, config_file_name))
+        },
+        error = function(e) {
+            # Separate the prefix from the underlying condition message.
+            msg <- paste0("Error while loading configuration: ", conditionMessage(e))
+            cat(msg)
+            stop(msg)
+        }
+    )
+
+    log_msg(paste0("SNT configuration loaded from : ", file.path(config_path, config_file_name)))
+
+    list(
+        ROOT_PATH = root_path,
+        CODE_PATH = code_path,
+        CONFIG_PATH = config_path,
+        DATA_PATH = data_path,
+        DHS_DATA_PATH = dhs_data_path,
+        config_json = config_json,
+        COUNTRY_CODE = config_json$SNT_CONFIG$COUNTRY_CODE,
+        openhexa = openhexa
+    )
+}
+
+#' Load DHIS2 shapes used as spatial reference for DHS outputs.
+#'
+#' Fetches `<country_code>_shapes.geojson` from the given dataset through
+#' `get_latest_dataset_file_in_memory()`. A failure is logged through
+#' `log_msg(..., "error")` and re-raised with `stop()`; a success is logged
+#' before the loaded object is returned.
+#'
+#' @param dhis2_dataset Dataset identifier containing shapes.
+#' @param country_code Country code used in filename prefix.
+#' @return Spatial data object loaded from `*_shapes.geojson`.
+load_dhs_spatial_data <- function(dhis2_dataset, country_code) {
+    spatial_data_filename <- paste(country_code, "shapes.geojson", sep = "_")
+
+    # Log the failure, then abort with the same message.
+    report_and_stop <- function(e) {
+        msg <- glue::glue("[ERROR] Error while loading DHIS2 shapes data for {country_code}: {conditionMessage(e)}")
+        log_msg(msg, "error")
+        stop(msg)
+    }
+
+    shapes <- tryCatch(
+        get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename),
+        error = report_and_stop
+    )
+
+    log_msg(glue::glue("File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}"))
+    shapes
+}
+
+#' Compute and export a DHS indicator table with confidence intervals.
+#'
+#' Runs survey-weighted estimation by admin unit, formats confidence interval
+#' and sample-average columns, converts proportions to percentages, merges with
+#' admin reference table, and exports CSV/Parquet outputs.
+#'
+#' @param design_obj Survey design object.
+#' @param indicator_name Indicator variable name in design data.
+#' @param output_indicator_name Indicator label used in output columns/files.
+#' @param admin_name_col Admin grouping column name.
+#' @param admin_data Admin reference table for full joins.
+#' @param output_data_path Directory to write output files.
+#' @param filename_without_extension Output file stem.
+#' @return Formatted indicator table.
+compute_and_export_indicator_table <- function(
+    design_obj,
+    indicator_name,
+    output_indicator_name = indicator_name,
+    admin_name_col,
+    admin_data,
+    output_data_path,
+    filename_without_extension
+) {
+    # Survey-weighted mean of the indicator per admin unit, with a 95% CI.
+    estimates <- survey::svyby(
+        formula = as.formula(paste("~", indicator_name)),
+        by = reformulate(admin_name_col),
+        design = design_obj,
+        FUN = survey::svymean,
+        vartype = "ci",
+        level = 0.95,
+        na.rm = TRUE,
+        influence = TRUE
+    )
+    data.table::setDT(estimates)
+
+    # Output column names are built from the upper-cased output label.
+    lower_bound_col <- as.character(glue::glue("{toupper(output_indicator_name)}_CI_LOWER_BOUND"))
+    upper_bound_col <- as.character(glue::glue("{toupper(output_indicator_name)}_CI_UPPER_BOUND"))
+    sample_avg_col <- as.character(glue::glue("{toupper(output_indicator_name)}_SAMPLE_AVERAGE"))
+
+    # Rename whichever of the svyby output columns are present.
+    data.table::setnames(
+        estimates,
+        old = c("ci_l", "ci_u", indicator_name),
+        new = c(lower_bound_col, upper_bound_col, sample_avg_col),
+        skip_absent = TRUE
+    )
+
+    # Clip the interval to [0, 1] (NA comparisons leave the value as is),
+    # then express all three estimates as percentages.
+    lower_values <- estimates[[lower_bound_col]]
+    upper_values <- estimates[[upper_bound_col]]
+    lower_values[which(lower_values < 0)] <- 0
+    upper_values[which(upper_values > 1)] <- 1
+    data.table::set(estimates, j = lower_bound_col, value = lower_values * 100)
+    data.table::set(estimates, j = upper_bound_col, value = upper_values * 100)
+    data.table::set(estimates, j = sample_avg_col, value = estimates[[sample_avg_col]] * 100)
+
+    # `all.x = TRUE` keeps every row of the admin reference table.
+    indicator_table <- data.table::merge.data.table(admin_data, estimates, by = admin_name_col, all.x = TRUE)
+
+    csv_path <- file.path(output_data_path, paste0(filename_without_extension, ".csv"))
+    parquet_path <- file.path(output_data_path, paste0(filename_without_extension, ".parquet"))
+    utils::write.csv(indicator_table, file = csv_path, row.names = FALSE)
+    arrow::write_parquet(indicator_table, parquet_path)
+
+    indicator_table
+}
+
+
+#' Compute DTP dose indicators and prepare dropout base table.
+#'
+#' Iterates over configured DTP doses, computes/export per-dose indicators, and
+#' aggregates them into a single table used later to derive dropout metrics.
+#'
+#' @param dtp_design Survey design object for vaccination analysis.
+#' @param vaccination_doses Integer vector of dose numbers (e.g., 1:3).
+#' @param indicator_access Prefix used for access indicator naming.
+#' @param admin_name_col Admin grouping column name.
+#' @param admin_cols Admin key columns used for table merges.
+#' @param admin_data Admin reference table.
+#' @param output_data_path Directory to write output files.
+#' @param country_code Country code used in filenames.
+#' @param data_source DHS source code used in filenames.
+#' @param admin_level Admin level label used in filenames.
+#' @return List with `dtp_dropout` table and `dose_tables`.
+compute_dtp_indicator_tables <- function(
+    dtp_design,
+    vaccination_doses,
+    indicator_access,
+    admin_name_col,
+    admin_cols,
+    admin_data,
+    output_data_path,
+    country_code,
+    data_source,
+    admin_level
+) {
+    # Compute and export the table of a single dose; the survey variable is
+    # `DTP<dose>` and the output label is `table_name`.
+    export_dose_table <- function(dose_number, table_name) {
+        filename_without_extension <- glue::glue("{country_code}_{data_source}_{admin_level}_{table_name}")
+        compute_and_export_indicator_table(
+            design_obj = dtp_design,
+            indicator_name = glue::glue("DTP{dose_number}"),
+            output_indicator_name = table_name,
+            admin_name_col = admin_name_col,
+            admin_data = admin_data,
+            output_data_path = output_data_path,
+            filename_without_extension = filename_without_extension
+        )
+    }
+
+    dose_tables <- list()
+    merged_doses <- data.table::copy(admin_data)
+
+    for (dose_number in vaccination_doses) {
+        table_name <- glue::glue("{toupper(indicator_access)}{dose_number}")
+        dose_table <- export_dose_table(dose_number, table_name)
+
+        dose_tables[[table_name]] <- dose_table
+        # Accumulate every dose next to the admin columns.
+        merged_doses <- data.table::merge.data.table(merged_doses, dose_table, by = admin_cols)
+    }
+
+    list(dtp_dropout = merged_doses, dose_tables = dose_tables)
+}
+
+
+#' Compute and export DTP dropout indicators.
+#'
+#' Derives dropout percentages between dose pairs from dose-level sample
+#' averages, removes intermediate CI/sample columns, and exports final table.
+#'
+#' @param dtp_dropout Base table containing per-dose sample averages.
+#' @param vaccination_doses Integer vector of dose numbers.
+#' @param indicator_access Prefix used for dose access columns.
+#' @param indicator_attrition Prefix used for dropout columns.
+#' @param output_data_path Directory to write output files.
+#' @param country_code Country code used in filenames.
+#' @param data_source DHS source code used in filenames.
+#' @param admin_level Admin level label used in filenames.
+#' @details A dropout column `<indicator_attrition>_<ref>_<cur>` is created for
+#'   every pair of doses listed in `vaccination_doses` with `1 <= ref < cur`,
+#'   computed as `(1 - cur / ref) * 100` from the `_SAMPLE_AVERAGE` columns.
+#'   `dtp_dropout` is updated with `:=`, i.e. by reference, so the table passed
+#'   in by the caller is modified as well.
+#' @return Dropout table with derived attrition columns.
+compute_and_export_dtp_dropout <- function(
+    dtp_dropout,
+    vaccination_doses,
+    indicator_access,
+    indicator_attrition,
+    output_data_path,
+    country_code,
+    data_source,
+    admin_level
+) {
+    # Only the sample averages are needed to derive dropout.
+    dtp_dropout[, grep("BOUND", names(dtp_dropout), value = TRUE) := NULL]
+
+    # Reference doses are taken from the configured doses themselves, so a
+    # non-contiguous list never points at a column that was not computed, and
+    # the first dose simply has no reference (no descending `1:0` sequence).
+    available_doses <- sort(unique(vaccination_doses))
+
+    for (current_dose in vaccination_doses) {
+        reference_doses <- available_doses[available_doses >= 1 & available_doses < current_dose]
+        for (reference_dose in reference_doses) {
+            attrition_col <- glue::glue("{toupper(indicator_attrition)}_{reference_dose}_{current_dose}")
+            numerator_colname <- glue::glue("{toupper(indicator_access)}{current_dose}_SAMPLE_AVERAGE")
+            denominator_colname <- glue::glue("{toupper(indicator_access)}{reference_dose}_SAMPLE_AVERAGE")
+            # Share of the reference-dose coverage that is lost by the current dose.
+            dtp_dropout[, (attrition_col) := (1 - get(numerator_colname) / get(denominator_colname)) * 100]
+        }
+    }
+
+    # Drop the per-dose averages once every dropout column has been derived.
+    dtp_dropout[, grep("SAMPLE_AVERAGE", names(dtp_dropout), value = TRUE) := NULL]
+    filename <- glue::glue("{country_code}_{data_source}_{admin_level}_{indicator_attrition}")
+    data.table::fwrite(dtp_dropout, file = file.path(output_data_path, paste0(filename, ".csv")))
+    arrow::write_parquet(dtp_dropout, file.path(output_data_path, paste0(filename, ".parquet")))
+    dtp_dropout
+}
+
+
+#' Export careseeking choropleth plots for each indicator.
+#'
+#' Generates and saves map plots for all requested indicators using shared DHS
+#' map styling helper.
+#'
+#' @param plot_data Spatial table with indicator values.
+#' @param all_indicators Character vector of indicator names.
+#' @param output_plots_path Directory to write plot images.
#' @param country_code Country code used in filenames.
#' @param data_source DHS source code used in filenames.
#' @param admin_level Admin level label used in filenames.
#' @return `NULL`, invisibly; called for its side effect of saving one PNG per
#'   indicator.
export_careseeking_reporting_plots <- function(
    plot_data,
    all_indicators,
    output_plots_path,
    country_code,
    data_source,
    admin_level
) {
  for (indicator_name in all_indicators) {
    # Title label: underscores become spaces, then every "PCT " is removed.
    plot_label <- gsub("PCT ", "", gsub("_", " ", indicator_name))

    indicator_plot <- make_dhs_map(
      plot_dt = plot_data,
      plot_colname = indicator_name,
      title_name = glue::glue("Percentage children: {plot_label}"),
      legend_title = "%",
      scale_limits = c(0, 100)
    )

    plot_file <- file.path(
      output_plots_path,
      glue::glue("{country_code}_{data_source}_{admin_level}_{toupper(indicator_name)}_plot.png")
    )

    # Name the arguments explicitly instead of relying on `file` partially
    # matching `filename` and the plot falling through to `plot`.
    ggplot2::ggsave(
      filename = plot_file,
      plot = indicator_plot,
      dpi = 500
    )
  }

  invisible(NULL)
}


#' Export careseeking confidence-interval bar plots.
#'
#' Reads per-indicator summary tables, builds CI plots, and exports one image
#' per indicator for reporting.
#'
#' @param all_indicators Character vector of indicator names.
#' @param output_data_path Directory containing per-indicator CSV files.
#' @param output_plots_path Directory to write plot images.
#' @param country_code Country code used in filenames.
#' @param data_source DHS source code used in filenames.
#' @param admin_level Admin level label used in filenames.
#' @param admin_name_col Admin label column used on plot axis.
#' @return Invisibly saves CI plot files for each indicator.
export_careseeking_reporting_ci_plots <- function(
    all_indicators,
    output_data_path,
    output_plots_path,
    country_code,
    data_source,
    admin_level,
    admin_name_col
) {
  for (indicator_name in all_indicators) {
    # Label used in the title and the y axis: underscores become spaces.
    indicator_label <- gsub("_", " ", indicator_name)

    # Load the summary table that holds this indicator's estimate and CI bounds.
    summary_csv <- file.path(
      output_data_path,
      glue::glue("{country_code}_{data_source}_{admin_level}_{indicator_name}.csv")
    )
    indicator_summary <- data.table::fread(summary_csv)

    # Column names follow the `<indicator>_<suffix>` pattern.
    interval_plot <- make_ci_plot(
      df_to_plot = indicator_summary,
      admin_colname = admin_name_col,
      point_estimation_colname = glue::glue("{indicator_name}_SAMPLE_AVERAGE"),
      ci_lower_colname = glue::glue("{indicator_name}_CI_LOWER_BOUND"),
      ci_upper_colname = glue::glue("{indicator_name}_CI_UPPER_BOUND"),
      title_name = glue::glue("{country_code} {data_source} {indicator_label} CI"),
      x_title = admin_name_col,
      y_title = glue::glue("{indicator_label} (%)")
    )

    plot_png <- file.path(
      output_plots_path,
      glue::glue("{country_code}_{data_source}_{admin_level}_{toupper(indicator_name)}_CI_plot.png")
    )
    ggplot2::ggsave(filename = plot_png, plot = interval_plot, width = 8, height = 6, dpi = 300)
  }
}
diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_mortality_computation.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_mortality_computation.r new file mode 100644 index 0000000..62111f3 --- /dev/null +++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_mortality_computation.r @@ -0,0 +1,4 @@ +# Utils entrypoint for snt_dhs_mortality_computation.ipynb +if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace" +if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators") +source(file.path(PIPELINE_PATH, "utils", "snt_dhs_indicator_tables.r")) diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_mortality_report.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_mortality_report.r new file mode 100644 index
0000000..81a9c9a --- /dev/null +++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_mortality_report.r @@ -0,0 +1,6 @@ +# Utils entrypoint for snt_dhs_mortality_report.ipynb +if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace" +if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators") +if (!exists("CODE_PATH", inherits = TRUE)) CODE_PATH <- file.path(ROOT_PATH, "code") +if (file.exists(file.path(CODE_PATH, "snt_utils.r"))) source(file.path(CODE_PATH, "snt_utils.r")) +source(file.path(PIPELINE_PATH, "utils", "snt_dhs_indicator_tables.r")) diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_prevalence_computation.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_prevalence_computation.r new file mode 100644 index 0000000..7f00ef6 --- /dev/null +++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_prevalence_computation.r @@ -0,0 +1,4 @@ +# Utils entrypoint for snt_dhs_prevalence_computation.ipynb +if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace" +if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators") +source(file.path(PIPELINE_PATH, "utils", "snt_dhs_indicator_tables.r")) diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_prevalence_report.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_prevalence_report.r new file mode 100644 index 0000000..c6cc534 --- /dev/null +++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_prevalence_report.r @@ -0,0 +1,6 @@ +# Utils entrypoint for snt_dhs_prevalence_report.ipynb +if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace" +if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators") +if (!exists("CODE_PATH", inherits = TRUE)) CODE_PATH <- file.path(ROOT_PATH, "code") +if (file.exists(file.path(CODE_PATH, "snt_utils.r"))) source(file.path(CODE_PATH, "snt_utils.r")) +source(file.path(PIPELINE_PATH, 
"utils", "snt_dhs_indicator_tables.r")) diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_vaccination_computation.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_vaccination_computation.r new file mode 100644 index 0000000..c599d25 --- /dev/null +++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_vaccination_computation.r @@ -0,0 +1,4 @@ +# Utils entrypoint for snt_dhs_vaccination_computation.ipynb +if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace" +if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators") +source(file.path(PIPELINE_PATH, "utils", "snt_dhs_indicator_tables.r")) diff --git a/pipelines/snt_dhs_indicators/utils/snt_dhs_vaccination_report.r b/pipelines/snt_dhs_indicators/utils/snt_dhs_vaccination_report.r new file mode 100644 index 0000000..61aacaa --- /dev/null +++ b/pipelines/snt_dhs_indicators/utils/snt_dhs_vaccination_report.r @@ -0,0 +1,6 @@ +# Utils entrypoint for snt_dhs_vaccination_report.ipynb +if (!exists("ROOT_PATH", inherits = TRUE)) ROOT_PATH <- "~/workspace" +if (!exists("PIPELINE_PATH", inherits = TRUE)) PIPELINE_PATH <- file.path(ROOT_PATH, "pipelines", "snt_dhs_indicators") +if (!exists("CODE_PATH", inherits = TRUE)) CODE_PATH <- file.path(ROOT_PATH, "code") +if (file.exists(file.path(CODE_PATH, "snt_utils.r"))) source(file.path(CODE_PATH, "snt_utils.r")) +source(file.path(PIPELINE_PATH, "utils", "snt_dhs_indicator_tables.r")) diff --git a/snt_dhs_indicators/pipeline.py b/snt_dhs_indicators/pipeline.py index ba6bbec..27e0d46 100644 --- a/snt_dhs_indicators/pipeline.py +++ b/snt_dhs_indicators/pipeline.py @@ -55,6 +55,17 @@ def dhs_indicators(run_reports_only: bool, pull_scripts: bool) -> None: "snt_dhs_mortality_computation.ipynb", "snt_dhs_prevalence_computation.ipynb", "snt_dhs_vaccination_computation.ipynb", + "utils/snt_dhs_indicator_tables.r", + "utils/snt_dhs_careseeking_computation.r", + "utils/snt_dhs_bednets_computation.r", + 
"utils/snt_dhs_prevalence_computation.r", + "utils/snt_dhs_mortality_computation.r", + "utils/snt_dhs_vaccination_computation.r", + "utils/snt_dhs_bednets_report.r", + "utils/snt_dhs_careseeking_report.r", + "utils/snt_dhs_prevalence_report.r", + "utils/snt_dhs_mortality_report.r", + "utils/snt_dhs_vaccination_report.r", ], ) @@ -223,6 +234,7 @@ def run_dhs_indicator_notebooks( computation_notebook_name (str): Filename of the computation notebook. reporting_notebook_name (str): Filename of the reporting notebook. run_report_only (bool): If True, only the reporting notebook will be executed. + country_code (str | None): Country code used for notebook execution context. """ computation_notebook_path = pipeline_root_path / "code" / computation_notebook_name