diff --git a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb index 3d650ea..3e3d55c 100644 --- a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb @@ -7,7 +7,7 @@ "source": [ "## Quality of Care Indicators\n", "\n", - "Compute district-year quality-of-care indicators from DHIS2 outliers-imputed routine data.\n", + "Compute district-year quality-of-care indicators from DHIS2 routine data produced by outliers pipelines (`imputed` or `removed`).\n", "\n", "Indicators:\n", "- testing_rate = TEST / SUSP\n", @@ -33,29 +33,24 @@ "outputs": [], "source": [ "# Preliminaries\n", - "options(scipen=999)\n", + "options(scipen = 999)\n", "\n", "ROOT_PATH <- \"~/workspace\"\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, \"dhis2\", \"quality_of_care\")\n", - "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", + "source(file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"utils\", \"snt_dhis2_quality_of_care.r\"))\n", "\n", - "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "required_packages <- c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"dplyr\")\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", - "\n", - "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, 
\"SNT_config.json\"))\n", + "snt_environment <- get_setup_variables(SNT_ROOT_PATH = ROOT_PATH, packages = c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"dplyr\", \"knitr\", \"scales\", \"gridExtra\"))\n", + "config_json <- load_snt_config(snt_environment$CONFIG_PATH, \"SNT_config.json\")\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION" + "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", + "\n", + "PIPELINE_PATH <- file.path(snt_environment$PIPELINES_PATH, \"snt_dhis2_quality_of_care\")\n", + "OUTPUT_DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"quality_of_care\")\n", + "REPORT_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", + "FIGURES_PATH <- file.path(REPORT_OUTPUTS_PATH, \"figures\")\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)" ] }, { @@ -69,163 +64,40 @@ }, "outputs": [], "source": [ - "# Validate data_action parameter\n", - "if (!exists(\"data_action\")) {\n", - " data_action <- \"imputed\"\n", - "}\n", - "\n", - "allowed_actions <- c(\"imputed\", \"removed\")\n", - "if (!(data_action %in% allowed_actions)) {\n", - " stop(glue::glue(\"Invalid data_action: {data_action}. 
Allowed: {paste(allowed_actions, collapse=', ')}\"))\n", - "}\n", - "\n", - "# Automatically find the latest routine outliers-imputed file in the dataset\n", - "# Pattern: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", - "log_msg(glue::glue(\"Searching for latest routine outliers-imputed file in dataset (data_action: {data_action})...\"))\n", - "\n", - "dataset_last_version <- openhexa$workspace$get_dataset(OUTLIERS_DATASET)$latest_version\n", - "if (is.null(dataset_last_version)) {\n", - " stop(glue::glue(\"[ERROR] No version available in dataset `{OUTLIERS_DATASET}`. Process stopped.\"))\n", - "}\n", - "\n", - "# Pattern to match: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", - "pattern_prefix <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-\")\n", - "pattern_suffix <- glue::glue(\"_{data_action}.parquet\")\n", - "routine_filename <- NULL\n", - "files_list <- reticulate::iterate(dataset_last_version$files)\n", + "if (!exists(\"data_action\")) data_action <- \"imputed\"\n", + "data_action <- validate_quality_of_care_action(data_action)\n", "\n", - "# Find all matching files and select the latest one\n", - "matching_files <- c()\n", - "for (file in files_list) {\n", - " filename <- file$filename\n", - " if (startsWith(filename, pattern_prefix) && endsWith(filename, pattern_suffix)) {\n", - " matching_files <- c(matching_files, filename)\n", - " }\n", - "}\n", - "\n", - "if (length(matching_files) == 0) {\n", - " stop(glue::glue(\"[ERROR] No file matching pattern `{pattern_prefix}*{pattern_suffix}` found in dataset `{OUTLIERS_DATASET}`. 
\",\n", - " \"Please run an outlier imputation pipeline first (e.g., snt_dhis2_outliers_imputation_mean) with `data_action=\\\"{data_action}\\\"`.\"))\n", - "}\n", - "\n", - "# Select the latest file (alphabetically sorted, which should correspond to most recent method)\n", - "routine_filename <- sort(matching_files, decreasing = TRUE)[1]\n", - "\n", - "log_msg(glue::glue(\"Found {length(matching_files)} matching file(s). Using latest: {routine_filename}\"))\n", - "\n", - "# Load the routine file\n", - "routine <- tryCatch({\n", - " get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", - "}, error = function(e) {\n", - " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file `\", routine_filename, \n", - " \"` from `\", OUTLIERS_DATASET, \"`. [ERROR DETAILS] \", conditionMessage(e))\n", - " stop(msg)\n", - "})\n", - "\n", - "shapes <- get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", - "\n", - "setDT(routine)\n", - "\n", - "# Core required columns (must exist)\n", - "core_cols <- c(\"ADM2_ID\", \"YEAR\")\n", - "core_missing <- setdiff(core_cols, names(routine))\n", - "if (length(core_missing) > 0) {\n", - " stop(glue::glue(\"Missing core required columns in routine data: {paste(core_missing, collapse=', ')}\"))\n", - "}\n", - "\n", - "# Optional indicator columns (will be checked and handled gracefully)\n", "indicator_cols <- c(\"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", - "available_cols <- intersect(indicator_cols, names(routine))\n", - "missing_cols <- setdiff(indicator_cols, names(routine))\n", - "\n", - "if (length(missing_cols) > 0) {\n", - " log_msg(glue::glue(\"[WARNING] Some indicator columns are missing: {paste(missing_cols, collapse=', ')}. 
These indicators will not be calculated.\"), level = \"warning\")\n", - "}\n", - "\n", - "# Convert available numeric columns\n", - "# Handle \"-\" and other non-numeric values by converting them to NA first\n", - "num_cols <- intersect(available_cols, names(routine))\n", - "if (length(num_cols) > 0) {\n", - " for (col in num_cols) {\n", - " # First convert to character to handle \"-\" strings, then replace with NA, then convert to numeric\n", - " col_vals <- as.character(routine[[col]])\n", - " col_vals[is.na(col_vals) | col_vals == \"\" | col_vals == \"-\"] <- NA_character_\n", - " routine[, (col) := as.numeric(col_vals)]\n", - " }\n", - "}\n", - "routine[, YEAR := as.integer(YEAR)]\n", - "routine[, ADM2_ID := as.character(ADM2_ID)]\n", "\n", - "# Aggregate available columns only using lapply\n", - "if (length(available_cols) > 0) {\n", - " qoc <- routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), \n", - " .SDcols = available_cols, \n", - " by = .(ADM2_ID, YEAR)]\n", - "} else {\n", - " # If no indicator columns available, create empty structure\n", - " qoc <- routine[, .(ADM2_ID, YEAR)]\n", - " qoc <- unique(qoc)\n", - "}\n", - "\n", - "# Calculate indicators only if required columns are available\n", - "if (\"TEST\" %in% names(qoc) && \"SUSP\" %in% names(qoc)) {\n", - " qoc[, testing_rate := fifelse(SUSP > 0, TEST / SUSP, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate testing_rate: missing TEST or SUSP columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALTREAT\" %in% names(qoc) && \"CONF\" %in% names(qoc)) {\n", - " qoc[, treatment_rate := fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate treatment_rate: missing MALTREAT or CONF columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALDTH\" %in% names(qoc) && \"MALADM\" %in% names(qoc)) {\n", - " qoc[, case_fatality_rate := fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)]\n", - "} else {\n", - " 
log_msg(\"[WARNING] Cannot calculate case_fatality_rate: missing MALDTH or MALADM columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALADM\" %in% names(qoc) && \"ALLADM\" %in% names(qoc)) {\n", - " qoc[, prop_adm_malaria := fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate prop_adm_malaria: missing MALADM or ALLADM columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALDTH\" %in% names(qoc) && \"ALLDTH\" %in% names(qoc)) {\n", - " qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", - " # Compatibility alias to match historical notebook export naming\n", - " qoc[, prop_deaths_malaria := prop_malaria_deaths]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate prop_malaria_deaths: missing MALDTH or ALLDTH columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"ALLOUT\" %in% names(qoc)) {\n", - " qoc[, non_malaria_all_cause_outpatients := ALLOUT]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate non_malaria_all_cause_outpatients: missing ALLOUT column\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"PRES\" %in% names(qoc)) {\n", - " qoc[, presumed_cases := PRES]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate presumed_cases: missing PRES column\", level = \"warning\")\n", - "}\n", - "\n", - "shapes_dt <- as.data.table(sf::st_drop_geometry(shapes))\n", - "if (\"ADM2_ID\" %in% names(shapes_dt) && \"ADM2_NAME\" %in% names(shapes_dt)) {\n", - " shapes_dt[, ADM2_ID := as.character(ADM2_ID)]\n", - " qoc <- merge(qoc, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = \"ADM2_ID\", all.x = TRUE)\n", - "}\n", - "\n", - "# Persist only district-year outputs (requested)\n", - "out_district_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.parquet\"))\n", - "out_district_csv <- file.path(OUTPUT_DATA_PATH, 
glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.csv\"))\n", - "\n", - "arrow::write_parquet(qoc, out_district_parquet)\n", - "data.table::fwrite(qoc, out_district_csv)\n", - "\n", - "log_msg(glue::glue(\"Saved outputs: {out_district_parquet}, {out_district_csv}\"))" + "routine <- load_dataset_file(\n", + " dataset_id = OUTLIERS_DATASET,\n", + " filename = glue::glue(\"{COUNTRY_CODE}_routine_outliers_{data_action}.parquet\")\n", + ")\n", + "shapes <- load_dataset_file(\n", + " dataset_id = DHIS2_FORMATTED_DATASET,\n", + " filename = glue::glue(\"{COUNTRY_CODE}_shapes.geojson\")\n", + ")\n", + "\n", + "routine <- normalize_qoc_routine_types(routine, indicator_cols = indicator_cols)\n", + "qoc <- aggregate_qoc_district_year(routine, indicator_cols = indicator_cols)\n", + "\n", + "# Derived indicators — edit here to add / remove / modify\n", + "if (\"TEST\" %in% names(qoc) && \"SUSP\" %in% names(qoc)) qoc[, testing_rate := fifelse(SUSP > 0, TEST / SUSP, NA_real_)]\n", + "if (\"MALTREAT\" %in% names(qoc) && \"CONF\" %in% names(qoc)) qoc[, treatment_rate := fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]\n", + "if (\"MALDTH\" %in% names(qoc) && \"MALADM\" %in% names(qoc)) qoc[, case_fatality_rate := fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)]\n", + "if (\"MALADM\" %in% names(qoc) && \"ALLADM\" %in% names(qoc)) qoc[, prop_adm_malaria := fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]\n", + "if (\"MALDTH\" %in% names(qoc) && \"ALLDTH\" %in% names(qoc)) qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", + "if (\"ALLOUT\" %in% names(qoc)) qoc[, non_malaria_all_cause_outpatients := ALLOUT]\n", + "if (\"PRES\" %in% names(qoc)) qoc[, presumed_cases := PRES]\n", + "\n", + "qoc <- attach_quality_of_care_shapes(qoc, shapes)\n", + "\n", + "save_quality_of_care_outputs(\n", + " qoc_dt = qoc,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_action = data_action\n", + ")" ] }, { @@ 
-239,174 +111,12 @@ }, "outputs": [], "source": [ - "# Yearly maps by ADM2\n", - "# Ensure ADM2_ID is character in both objects (do this once before the function)\n", - "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", - "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", - "\n", - "plot_yearly_map <- function(df, sf_shapes, value_col, title_prefix, filename_prefix, is_rate = TRUE) {\n", - " # Check if value_col exists in df\n", - " if (!(value_col %in% names(df))) {\n", - " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found in data. Skipping map generation.\"), level = \"warning\")\n", - " return(invisible(NULL))\n", - " }\n", - " \n", - " # Create a local copy of sf_shapes to avoid modifying the original\n", - " sf_shapes_local <- sf_shapes\n", - " if (!is.character(sf_shapes_local$ADM2_ID)) {\n", - " sf_shapes_local$ADM2_ID <- as.character(sf_shapes_local$ADM2_ID)\n", - " }\n", - " \n", - " years <- sort(unique(df$YEAR))\n", - " for (yr in years) {\n", - " df_y <- df[YEAR == yr]\n", - " \n", - " # Check if df_y has any rows\n", - " if (nrow(df_y) == 0) {\n", - " log_msg(glue::glue(\"[WARNING] No data for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Ensure ADM2_ID is character in df_y\n", - " df_y$ADM2_ID <- as.character(df_y$ADM2_ID)\n", - " \n", - " # Use dplyr::left_join for sf objects to preserve geometry (use local copy)\n", - " map_df <- dplyr::left_join(sf_shapes_local, df_y, by = \"ADM2_ID\")\n", - "\n", - " # Check if value_col exists in map_df after merge\n", - " if (!(value_col %in% names(map_df))) {\n", - " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found after merge for year {yr}. 
Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - "\n", - " vals <- map_df[[value_col]]\n", - " finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n", - " \n", - " # If no valid values, skip this map\n", - " if (length(finite_vals) == 0) {\n", - " log_msg(glue::glue(\"[WARNING] No valid values for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - "\n", - " # Create cat column BEFORE creating the plot\n", - " cat_vals <- NULL\n", - " fill_palette <- NULL\n", - " \n", - " if (is_rate) {\n", - " # Create cat column with proper handling of NA values\n", - " cat_result <- tryCatch({\n", - " cat_vals <- cut(\n", - " vals,\n", - " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", - " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", - " include.lowest = TRUE\n", - " )\n", - " fill_palette <- \"YlOrRd\"\n", - " TRUE # Success\n", - " }, error = function(e) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", - " FALSE # Failure\n", - " })\n", - " if (!cat_result) {\n", - " next\n", - " }\n", - " } else {\n", - " cat_result <- tryCatch({\n", - " if (length(finite_vals) > 4) {\n", - " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE)))\n", - " if (length(br) < 2) {\n", - " cat_vals <- as.factor(rep(\"all\", nrow(map_df)))\n", - " } else {\n", - " cat_vals <- cut(vals, breaks = br, include.lowest = TRUE)\n", - " }\n", - " } else {\n", - " cat_vals <- as.factor(vals)\n", - " }\n", - " fill_palette <- \"Blues\"\n", - " TRUE # Success\n", - " }, error = function(e) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", - " FALSE # Failure\n", - " })\n", - " if (!cat_result) {\n", - " next\n", - " }\n", - " }\n", - " \n", - " # Check if cat_vals 
was created successfully\n", - " if (is.null(cat_vals) || length(cat_vals) != nrow(map_df)) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to create 'cat' column for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Check if all values are NA (cut failed) - but allow some NA values\n", - " if (all(is.na(cat_vals))) {\n", - " log_msg(glue::glue(\"[WARNING] All 'cat' values are NA for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Add cat column using dplyr::mutate to ensure it's properly added to sf object\n", - " map_df <- dplyr::mutate(map_df, cat = as.factor(cat_vals))\n", - " \n", - " # Verify cat column exists before creating plot\n", - " if (!(\"cat\" %in% names(map_df))) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to add 'cat' column to map_df for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Create plot AFTER cat column is added\n", - " p <- ggplot(map_df) +\n", - " geom_sf(aes(fill = cat), color = \"grey60\", size = 0.1) +\n", - " scale_fill_brewer(palette = fill_palette, na.value = \"white\", drop = FALSE)\n", - "\n", - " p <- p +\n", - " theme_void() +\n", - " labs(\n", - " title = paste0(title_prefix, \" - \", yr),\n", - " fill = value_col,\n", - " caption = \"Source: SNT DHIS2 outliers-imputed routine data\"\n", - " ) +\n", - " theme(\n", - " legend.position = \"bottom\",\n", - " plot.title = element_text(face = \"bold\", size = 12)\n", - " )\n", - "\n", - " out_png <- file.path(FIGURES_PATH, glue::glue(\"{filename_prefix}_{yr}.png\"))\n", - " \n", - " # Try to save the plot, catch any errors\n", - " tryCatch({\n", - " ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = \"white\")\n", - " log_msg(glue::glue(\"Saved map: {out_png}\"))\n", - " }, error = function(e) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to save map for '{value_col}' year {yr}: 
{conditionMessage(e)}\"), level = \"warning\")\n", - " })\n", - " }\n", - "}\n", - "\n", - "# Plot only indicators that were calculated (columns exist)\n", - "if (\"testing_rate\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"testing_rate\", \"Testing rate (TEST / SUSP)\", \"testing_rate\", TRUE)\n", - "}\n", - "if (\"treatment_rate\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", \"treatment_rate\", TRUE)\n", - "}\n", - "if (\"case_fatality_rate\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", \"case_fatality_rate\", TRUE)\n", - "}\n", - "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", \"prop_adm_malaria\", TRUE)\n", - "}\n", - "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"prop_malaria_deaths\", \"Proportion of malaria deaths (MALDTH / ALLDTH)\", \"prop_malaria_deaths\", TRUE)\n", - "}\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", \"allout\", FALSE)\n", - "}\n", - "if (\"presumed_cases\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"presumed_cases\", \"Presumed cases (PRES)\", \"presumed_cases\", FALSE)\n", - "}\n", - "\n", - "log_msg(glue::glue(\"Saved yearly maps in: {FIGURES_PATH}\"))" + "# Build yearly maps (saved as PNG)\n", + "save_quality_of_care_maps(\n", + " qoc_dt = qoc,\n", + " shapes_sf = shapes,\n", + " figures_path = FIGURES_PATH\n", + ")" ] } ], diff --git a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb index 045eb65..696fa7b 100644 --- 
a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb @@ -22,24 +22,19 @@ "outputs": [], "source": [ "ROOT_PATH <- \"~/workspace\"\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\", \"dhis2\", \"quality_of_care\")\n", - "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", + "source(file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"utils\", \"snt_dhis2_quality_of_care_report.r\"))\n", "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "install_and_load(c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\", \"writexl\", \"ggplot2\", \"scales\", \"gridExtra\", \"sf\"))\n", - "\n", - "# Create output directories\n", - "REPORT_OUTPUTS_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\")\n", + "snt_environment <- get_setup_variables(SNT_ROOT_PATH = ROOT_PATH, packages = c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"dplyr\", \"knitr\", \"scales\", \"gridExtra\", \"IRdisplay\"))\n", + "config_json <- load_snt_config(snt_environment$CONFIG_PATH, \"SNT_config.json\")\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "PIPELINE_PATH <- file.path(snt_environment$PIPELINES_PATH, \"snt_dhis2_quality_of_care\")\n", + "OUTPUT_DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"quality_of_care\")\n", + "REPORT_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", + "FIGURES_PATH <- file.path(REPORT_OUTPUTS_PATH, \"figures\")\n", + "dir.create(OUTPUT_DATA_PATH, 
recursive = TRUE, showWarnings = FALSE)\n", "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", - "\n", - "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)" ] }, { @@ -53,96 +48,23 @@ }, "outputs": [], "source": [ - "# Use district-year output file (latest action)\n", - "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_district_year_(imputed|removed)\\\\.parquet$\"), full.names = TRUE)\n", - "if (length(files) == 0) {\n", - " stop(glue::glue(\"No quality_of_care parquet found in {DATA_PATH}\"))\n", - "}\n", - "\n", - "latest_file <- files[which.max(file.info(files)$mtime)]\n", - "qoc <- as.data.table(arrow::read_parquet(latest_file))\n", - "\n", - "# Build summary table with only available columns\n", - "# Start with unique YEAR values\n", - "summary_tbl <- unique(qoc[, .(YEAR)])\n", - "\n", - "# Add rate indicators (mean) - merge one by one\n", - "if (\"testing_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(testing_rate = mean(testing_rate, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"treatment_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(treatment_rate = mean(treatment_rate, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"case_fatality_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"prop_adm_malaria\" %in% 
names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "\n", - "# Add absolute indicators (sum)\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"presumed_cases\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(presumed_cases = sum(presumed_cases, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "\n", - "summary_tbl <- summary_tbl[order(YEAR)]\n", - "\n", - "# Explicitly list missing indicators so report is self-explanatory\n", - "expected_indicators <- c(\n", - " \"testing_rate\",\n", - " \"treatment_rate\",\n", - " \"case_fatality_rate\",\n", - " \"prop_adm_malaria\",\n", - " \"prop_malaria_deaths\",\n", - " \"non_malaria_all_cause_outpatients\",\n", - " \"presumed_cases\"\n", + "# Load latest district-year output and build summary\n", + "qoc_ctx <- load_latest_quality_of_care_output(OUTPUT_DATA_PATH, COUNTRY_CODE)\n", + "qoc <- qoc_ctx$qoc\n", + "latest_file <- qoc_ctx$latest_file\n", + "\n", + "summary_tbl <- build_quality_of_care_summary(qoc)\n", + "summary_paths <- save_quality_of_care_summary_outputs(\n", + " summary_tbl = summary_tbl,\n", + " report_outputs_path = REPORT_OUTPUTS_PATH,\n", + " country_code = COUNTRY_CODE\n", ")\n", - "missing_indicators <- setdiff(expected_indicators, names(qoc))\n", - "if (length(missing_indicators) > 0) {\n", - " log_msg(glue::glue(\"[WARNING] 
Missing indicators in input file: {paste(missing_indicators, collapse=', ')}\"), level = \"warning\")\n", - " cat(glue::glue(\"\\nMissing indicators in this run: {paste(missing_indicators, collapse=', ')}\\n\"))\n", - " cat(\"Reason: required source columns are absent in the selected outliers file.\\n\")\n", - "}\n", - "\n", - "# Save summary data (parquet, csv, xlsx) - following other pipelines pattern\n", - "summary_parquet <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.parquet\"))\n", - "summary_csv <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.csv\"))\n", - "summary_xlsx <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.xlsx\"))\n", - "\n", - "# Save as parquet (primary format, like other pipelines)\n", - "arrow::write_parquet(summary_tbl, summary_parquet)\n", - "\n", - "# Save as csv and xlsx for compatibility\n", - "data.table::fwrite(summary_tbl, summary_csv)\n", - "writexl::write_xlsx(list(summary = as.data.frame(summary_tbl)), summary_xlsx)\n", - "\n", - "log_msg(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\"))\n", "\n", "knitr::kable(summary_tbl, caption = \"Quality of Care - Year-level summary\")\n", "\n", "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))\n", - "cat(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\\n\"))" + "cat(glue::glue(\"Summary data saved to: {summary_paths$summary_parquet}, {summary_paths$summary_csv}\\n\"))" ] }, { @@ -164,202 +86,14 @@ }, "outputs": [], "source": [ - "# Create bar charts by year (same as original notebook - 4x2 grid layout)\n", - "# Prepare data - convert rates to percentages\n", - "plot_data <- copy(summary_tbl)\n", - "\n", - "# Create the same 4x2 subplot layout as original notebook\n", - "if (nrow(plot_data) > 0) {\n", - " # Create a list to store 
individual plots (in order: 4x2 grid)\n", - " plots_list <- list()\n", - " \n", - " # Row 0, Col 0: Testing rate\n", - " if (\"testing_rate\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = testing_rate * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(testing_rate * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Testing rate (TEST / SUSP)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"testing_rate\"]] <- p\n", - " }\n", - " \n", - " # Row 0, Col 1: Treatment rate\n", - " if (\"treatment_rate\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = treatment_rate * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(treatment_rate * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Treatment rate (MALTREAT / CONF)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = 
element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"treatment_rate\"]] <- p\n", - " }\n", - " \n", - " # Row 1, Col 0: Case fatality rate\n", - " if (\"case_fatality_rate\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = case_fatality_rate * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(case_fatality_rate * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Case fatality rate (MALDTH / MALADM)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"case_fatality_rate\"]] <- p\n", - " }\n", - " \n", - " # Row 1, Col 1: Proportion admissions malaria\n", - " if (\"prop_adm_malaria\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = prop_adm_malaria * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(prop_adm_malaria * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Prop. 
admissions paludisme (MALADM / ALLADM)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"prop_adm_malaria\"]] <- p\n", - " }\n", - " \n", - " # Row 2, Col 0: Proportion deaths malaria\n", - " if (\"prop_malaria_deaths\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = prop_malaria_deaths * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(prop_malaria_deaths * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Prop. 
décès paludisme (MALDTH / ALLDTH)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"prop_malaria_deaths\"]] <- p\n", - " }\n", - " \n", - " # Row 2, Col 1: Presumed cases (absolute)\n", - " if (\"presumed_cases\" %in% names(plot_data)) {\n", - " format_label <- function(v) {\n", - " ifelse(is.na(v) | v == 0, \"0\",\n", - " ifelse(v >= 1e6, paste0(round(v/1e6, 2), \"M\"),\n", - " format(round(v), big.mark = \" \", scientific = FALSE)\n", - " )\n", - " )\n", - " }\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = presumed_cases)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = format_label(presumed_cases)), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Cas présumés (PRES)\", x = \"Année\", y = \"Nombre\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"presumed_cases\"]] <- p\n", - " }\n", - " \n", - " # Row 
3, Col 0: Non-malaria all-cause outpatients (absolute)\n", - " if (\"non_malaria_all_cause_outpatients\" %in% names(plot_data)) {\n", - " format_label <- function(v) {\n", - " ifelse(is.na(v) | v == 0, \"0\",\n", - " ifelse(v >= 1e6, paste0(round(v/1e6, 2), \"M\"),\n", - " format(round(v), big.mark = \" \", scientific = FALSE)\n", - " )\n", - " )\n", - " }\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = non_malaria_all_cause_outpatients)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = format_label(non_malaria_all_cause_outpatients)), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Consultations externes non-paludisme (ALLOUT)\", x = \"Année\", y = \"Nombre\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"non_malaria_all_cause_outpatients\"]] <- p\n", - " }\n", - " \n", - " # Create and display combined plot (dynamic grid for readability)\n", - " if (length(plots_list) > 0) {\n", - " # Order plots as in original\n", - " plot_order <- c(\"testing_rate\", \"treatment_rate\", \"case_fatality_rate\", \"prop_adm_malaria\", \n", - " \"prop_malaria_deaths\", \"presumed_cases\", \"non_malaria_all_cause_outpatients\")\n", - " available_plots <- plots_list[intersect(plot_order, names(plots_list))]\n", - "\n", - " if (length(available_plots) > 0) {\n", - " n_plots <- length(available_plots)\n", - " ncol_layout <- 2\n", - " nrow_layout <- 
ceiling(n_plots / ncol_layout)\n", - "\n", - " # Bigger display in report so labels are readable\n", - " options(repr.plot.width = 14, repr.plot.height = max(7, 4.8 * nrow_layout))\n", - "\n", - " combined_plot <- do.call(grid.arrange, c(available_plots, ncol = ncol_layout, nrow = nrow_layout))\n", - " print(combined_plot)\n", + "# Build, save, and display year-level indicator chart\n", + "charts_file <- save_quality_of_care_summary_charts(\n", + " summary_tbl = summary_tbl,\n", + " figures_path = FIGURES_PATH,\n", + " country_code = COUNTRY_CODE\n", + ")\n", "\n", - " # Save at larger size for presentation readability\n", - " combined_file <- file.path(FIGURES_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_by_year.png\"))\n", - " ggsave(\n", - " combined_file,\n", - " plot = combined_plot,\n", - " width = 18,\n", - " height = max(8, 5.2 * nrow_layout),\n", - " dpi = 300,\n", - " bg = \"white\",\n", - " units = \"in\"\n", - " )\n", - " log_msg(glue::glue(\"Combined bar charts saved: {combined_file}\"))\n", - " }\n", - " }\n", - "}" + "IRdisplay::display_png(file = normalizePath(path.expand(charts_file)))" ] }, { @@ -383,157 +117,30 @@ }, "outputs": [], "source": [ - "# Load shapes geojson from dataset (like seasonality pipeline)\n", - "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "shapes <- tryCatch({\n", - " get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", - "}, error = function(e) {\n", - " msg <- paste0(\"Error while loading DHIS2 Shapes data for: \", COUNTRY_CODE, \". 
\", conditionMessage(e))\n", - " log_msg(msg, level = \"error\")\n", - " stop(msg)\n", - "})\n", - "\n", - "# Ensure ADM2_ID is character in both datasets\n", - "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", - "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", - "\n", - "# Merge shapes with quality-of-care data\n", - "qoc_sf <- shapes %>%\n", - " dplyr::left_join(qoc, by = \"ADM2_ID\")\n", - "\n", - "# Helper to build readable interval labels for legends\n", - "format_interval_labels <- function(breaks_vec) {\n", - " labels <- c()\n", - " for (i in seq_len(length(breaks_vec) - 1)) {\n", - " a <- breaks_vec[i]\n", - " b <- breaks_vec[i + 1]\n", - " labels <- c(labels, paste0(scales::comma(round(a)), \" - \", scales::comma(round(b))))\n", - " }\n", - " labels\n", - "}\n", - "\n", - "# Function to plot yearly maps (similar to code notebook but inline in report)\n", - "plot_yearly_map_report <- function(sf_data, value_col, title_prefix, is_rate = TRUE) {\n", - " if (!(value_col %in% names(sf_data))) {\n", - " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found. Skipping map generation.\"), level = \"warning\")\n", - " return(invisible(NULL))\n", - " }\n", - " \n", - " years <- sort(unique(sf_data$YEAR[!is.na(sf_data$YEAR)]))\n", - " if (length(years) == 0) {\n", - " log_msg(glue::glue(\"[WARNING] No valid years for '{value_col}'. 
Skipping map.\"), level = \"warning\")\n", - " return(invisible(NULL))\n", - " }\n", - " \n", - " # Create plots for each year\n", - " plot_list <- list()\n", - " base_shapes <- sf_data %>% dplyr::select(ADM2_ID, geometry) %>% dplyr::distinct()\n", - "\n", - " for (yr in years) {\n", - " # Keep all districts on map, then join year values\n", - " year_vals <- sf_data[sf_data$YEAR == yr, c(\"ADM2_ID\", value_col), drop = FALSE]\n", - " year_vals <- sf::st_drop_geometry(year_vals)\n", - " year_vals <- year_vals[!duplicated(year_vals$ADM2_ID), , drop = FALSE]\n", - " sf_y <- dplyr::left_join(base_shapes, year_vals, by = \"ADM2_ID\")\n", - "\n", - " vals <- sf_y[[value_col]]\n", - " finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n", - "\n", - " if (length(finite_vals) == 0) {\n", - " next\n", - " }\n", - "\n", - " # Create categories\n", - " if (is_rate) {\n", - " cat_vals <- cut(\n", - " vals,\n", - " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", - " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", - " include.lowest = TRUE\n", - " )\n", - " fill_palette <- \"YlOrRd\"\n", - " } else {\n", - " # Use readable fixed-count classes for absolute values\n", - " n_classes <- 5\n", - " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, length.out = n_classes + 1), na.rm = TRUE)))\n", - " br <- sort(br)\n", - " if (length(br) < 2) {\n", - " br <- c(min(finite_vals, na.rm = TRUE), max(finite_vals, na.rm = TRUE) + 1)\n", - " }\n", - " if (length(unique(br)) < 2) {\n", - " cat_vals <- as.factor(rep(\"single value\", nrow(sf_y)))\n", - " } else {\n", - " labels_abs <- format_interval_labels(br)\n", - " cat_vals <- cut(vals, breaks = br, include.lowest = TRUE, labels = labels_abs)\n", - " }\n", - " fill_palette <- \"Blues\"\n", - " }\n", - "\n", - " sf_y$cat <- as.factor(cat_vals)\n", - "\n", - " p <- ggplot(sf_y) +\n", - " geom_sf(aes(fill = cat), color = \"grey60\", size = 0.12) +\n", - " 
scale_fill_brewer(palette = fill_palette, na.value = \"#f3f4f6\", drop = FALSE) +\n", - " theme_void() +\n", - " labs(\n", - " title = paste0(title_prefix, \" - \", yr),\n", - " fill = ifelse(is_rate, \"Rate class\", \"Value class\")\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 2, byrow = TRUE)) +\n", - " theme(\n", - " legend.position = \"bottom\",\n", - " legend.text = element_text(size = 9),\n", - " legend.title = element_text(size = 10, face = \"bold\"),\n", - " plot.title = element_text(face = \"bold\", size = 13)\n", - " )\n", - "\n", - " plot_list[[as.character(yr)]] <- p\n", - " }\n", - " \n", - " # Display all plots\n", - " if (length(plot_list) > 0) {\n", - " options(repr.plot.width = 10, repr.plot.height = 8)\n", - " for (yr_name in names(plot_list)) {\n", - " print(plot_list[[yr_name]])\n", - " }\n", - " }\n", - "}\n", - "\n", - "# Generate maps for each available indicator\n", - "cat(\"### Testing Rate\\n\")\n", - "if (\"testing_rate\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"testing_rate\", \"Testing rate (TEST / SUSP)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Treatment Rate\\n\")\n", - "if (\"treatment_rate\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Case Fatality Rate\\n\")\n", - "if (\"case_fatality_rate\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Proportion Admissions Malaria\\n\")\n", - "if (\"prop_adm_malaria\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", TRUE)\n", - "}\n", + "# Load shapes, regenerate yearly maps, and display them\n", + "shapes_filename <- glue::glue(\"{COUNTRY_CODE}_shapes.geojson\")\n", + "shapes <- load_dataset_file(\n", + " dataset_id = 
DHIS2_FORMATTED_DATASET,\n", + " filename = shapes_filename\n", + ")\n", "\n", - "cat(\"\\n### Proportion Malaria Deaths\\n\")\n", - "if (\"prop_malaria_deaths\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"prop_malaria_deaths\", \"Proportion of malaria deaths (MALDTH / ALLDTH)\", TRUE)\n", - "}\n", + "save_quality_of_care_maps(\n", + " qoc_dt = qoc,\n", + " shapes_sf = shapes,\n", + " figures_path = FIGURES_PATH\n", + ")\n", "\n", - "cat(\"\\n### Non-malaria All-cause Outpatients\\n\")\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", FALSE)\n", - "}\n", + "years <- sort(unique(qoc$YEAR))\n", + "years_regex <- paste(years, collapse = \"|\")\n", + "map_files <- list.files(\n", + " FIGURES_PATH,\n", + " pattern = glue::glue(\"^(testing_rate|treatment_rate|case_fatality_rate|prop_adm_malaria|prop_malaria_deaths|allout|presumed_cases)_({years_regex})[.]png$\"),\n", + " full.names = TRUE\n", + ")\n", + "map_files <- sort(map_files)\n", "\n", - "cat(\"\\n### Presumed Cases\\n\")\n", - "if (\"presumed_cases\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"presumed_cases\", \"Presumed cases (PRES)\", FALSE)\n", + "for (map_file in map_files) {\n", + " IRdisplay::display_png(file = normalizePath(path.expand(map_file)))\n", "}" ] }, diff --git a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r new file mode 100644 index 0000000..4d40380 --- /dev/null +++ b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r @@ -0,0 +1,223 @@ +# Load shared SNT helpers. +source(file.path("~/workspace", "code", "snt_utils.r")) + + +#' Load packages, OpenHEXA, and return base workspace paths (one list, four names). +#' @param SNT_ROOT_PATH Workspace root. Default `~/workspace`. 
+#' @param packages R packages to install/load. +#' @return Named list: `CONFIG_PATH`, `UPLOADS_PATH`, `DATA_PATH`, `PIPELINES_PATH`. +get_setup_variables <- function( + SNT_ROOT_PATH = "~/workspace", + packages = c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate") +) { + base_paths <- list( + CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"), + UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"), + DATA_PATH = file.path(SNT_ROOT_PATH, "data"), + PIPELINES_PATH = file.path(SNT_ROOT_PATH, "pipelines") + ) + + for (p in base_paths) { + if (!dir.exists(p)) { + dir.create(p, recursive = TRUE, showWarnings = FALSE) + } + } + + install_and_load(packages) + + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + reticulate::py_config()$python + assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv) + + return(base_paths) +} + +#' Load dataset file from OpenHEXA. +#' +#' @param dataset_id Character. OpenHEXA dataset identifier. +#' @param filename Character. Name of file to load. +#' @param verbose Logical. If TRUE, log dataframe dimensions after a successful load. +#' @return Dataframe containing the loaded data. +load_dataset_file <- function(dataset_id, filename, verbose = TRUE) { + if (!exists("openhexa", inherits = TRUE) || is.null(get("openhexa", inherits = TRUE))) { + stop("[ERROR] OpenHEXA SDK is not available. Run `get_setup_variables()` before loading dataset files.") + } + + data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_id, filename) + }, + error = function(e) { + stop(glue::glue("[ERROR] Error while loading {filename} file from dataset: {dataset_id}")) + } + ) + + if (verbose) { + log_msg(glue::glue( + "{filename} data loaded from dataset : {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]" + )) + } + + return(data) +} + +#' Validate quality-of-care action parameter. +#' +#' @param data_action Action string expected to be `imputed` or `removed`. 
+#' @return Validated action string. +validate_quality_of_care_action <- function(data_action) { + if (is.null(data_action) || !nzchar(data_action)) { + return("imputed") + } + allowed_actions <- c("imputed", "removed") + if (!(data_action %in% allowed_actions)) { + stop(glue::glue("[ERROR] Invalid data_action `{data_action}`. Allowed: {paste(allowed_actions, collapse = ', ')}")) + } + data_action +} + +#' Compute district-year Quality of Care indicators. +#' +#' @param routine Routine dataframe loaded from outliers dataset. +#' @param indicator_cols Character vector of routine indicator column names to coerce to numeric +#' (define in the notebook or config, not hardcoded here). +#' @return Data table with district-year indicators. +normalize_qoc_routine_types <- function(routine, indicator_cols) { + data.table::setDT(routine) + available_cols <- intersect(indicator_cols, names(routine)) + + for (col in available_cols) { + col_vals <- as.character(routine[[col]]) + col_vals[is.na(col_vals) | col_vals == "" | col_vals == "-"] <- NA_character_ + routine[, (col) := as.numeric(col_vals)] + } + + routine[, YEAR := as.integer(YEAR)] + routine[, ADM2_ID := as.character(ADM2_ID)] + routine +} + +#' Aggregate QoC routine indicators by district and year. +#' +#' @param routine Routine data table with normalized types. +#' @param indicator_cols Character vector of column names to sum (must match the vector used +#' in [normalize_qoc_routine_types()]). +#' @return Aggregated district-year data table. +aggregate_qoc_district_year <- function(routine, indicator_cols) { + available_cols <- intersect(indicator_cols, names(routine)) + + if (length(available_cols) > 0) { + routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), .SDcols = available_cols, by = .(ADM2_ID, YEAR)] + } else { + unique(routine[, .(ADM2_ID, YEAR)]) + } +} + +#' Merge ADM2 labels into Quality of Care outputs. +#' +#' @param qoc_dt Quality-of-care data table. +#' @param shapes_sf Shapes sf table. 
+#' @return Data table with optional ADM2_NAME. +attach_quality_of_care_shapes <- function(qoc_dt, shapes_sf) { + shapes_dt <- data.table::as.data.table(sf::st_drop_geometry(shapes_sf)) + if ("ADM2_ID" %in% names(shapes_dt) && "ADM2_NAME" %in% names(shapes_dt)) { + shapes_dt[, ADM2_ID := as.character(ADM2_ID)] + qoc_dt <- merge(qoc_dt, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = "ADM2_ID", all.x = TRUE) + } + qoc_dt +} + +#' Save district-year Quality of Care outputs. +#' +#' @param qoc_dt Computed quality-of-care data table. +#' @param output_data_path Output directory path. +#' @param country_code Country code. +#' @param data_action Action suffix for output naming. +#' @return Named list with `parquet` and `csv` output file paths. +save_quality_of_care_outputs <- function(qoc_dt, output_data_path, country_code, data_action) { + out_district_parquet <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.parquet")) + out_district_csv <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.csv")) + + arrow::write_parquet(qoc_dt, out_district_parquet) + data.table::fwrite(qoc_dt, out_district_csv) + log_msg(glue::glue("Saved outputs: {out_district_parquet}, {out_district_csv}")) + + list(parquet = out_district_parquet, csv = out_district_csv) +} + +#' Generate and save yearly district maps for QoC indicators. +#' +#' @param qoc_dt Quality-of-care data table. +#' @param shapes_sf District shapes sf. +#' @param figures_path Folder where PNG maps are written. +#' @return Invisibly returns `TRUE`. 
+save_quality_of_care_maps <- function(qoc_dt, shapes_sf, figures_path) { + shapes_sf$ADM2_ID <- as.character(shapes_sf$ADM2_ID) + qoc_dt$ADM2_ID <- as.character(qoc_dt$ADM2_ID) + + plot_yearly_map <- function(df, sf_shapes, value_col, title_prefix, filename_prefix, is_rate = TRUE) { + if (!(value_col %in% names(df))) return(invisible(NULL)) + sf_shapes_local <- sf_shapes + years <- sort(unique(df$YEAR)) + + for (yr in years) { + tryCatch( + { + df_y <- df[YEAR == yr] + if (nrow(df_y) == 0) next + df_y$ADM2_ID <- as.character(df_y$ADM2_ID) + map_df <- dplyr::left_join(sf_shapes_local, df_y, by = "ADM2_ID") + if (!(value_col %in% names(map_df))) next + + vals <- map_df[[value_col]] + finite_vals <- vals[is.finite(vals) & !is.na(vals)] + if (length(finite_vals) == 0) next + + if (is_rate) { + cat_vals <- cut(vals, breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf), labels = c("<0", "0-0.2", "0.2-0.4", "0.4-0.6", "0.6-0.8", "0.8-1.0", ">1.0"), include.lowest = TRUE) + fill_palette <- "YlOrRd" + } else { + if (length(finite_vals) > 4) { + br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE))) + if (length(br) < 2) { + cat_vals <- as.factor(rep("all", nrow(map_df))) + } else { + cat_vals <- cut(vals, breaks = br, include.lowest = TRUE) + } + } else { + cat_vals <- as.factor(vals) + } + fill_palette <- "Blues" + } + + map_df <- dplyr::mutate(map_df, cat = as.factor(cat_vals)) + p <- ggplot2::ggplot(map_df) + + ggplot2::geom_sf(ggplot2::aes(fill = cat), color = "grey60", size = 0.1) + + ggplot2::scale_fill_brewer(palette = fill_palette, na.value = "white", drop = FALSE) + + ggplot2::theme_void() + + ggplot2::labs(title = paste0(title_prefix, " - ", yr), fill = value_col, caption = "Source: SNT DHIS2 outliers-imputed routine data") + + ggplot2::theme(legend.position = "bottom", plot.title = ggplot2::element_text(face = "bold", size = 12)) + + out_png <- file.path(figures_path, glue::glue("{filename_prefix}_{yr}.png")) + 
ggplot2::ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = "white") + log_msg(glue::glue("Saved map: {out_png}")) + }, + error = function(e) { + log_msg(glue::glue("[WARNING] Failed to build/save map for `{value_col}` year `{yr}`: {conditionMessage(e)}"), level = "warning") + } + ) + } + } + + plot_yearly_map(qoc_dt, shapes_sf, "testing_rate","Testing rate (TEST / SUSP)","testing_rate",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "treatment_rate","Treatment rate (MALTREAT / CONF)","treatment_rate",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "case_fatality_rate","In-hospital case fatality rate (MALDTH / MALADM)","case_fatality_rate",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "prop_adm_malaria","Proportion admitted for malaria (MALADM / ALLADM)","prop_adm_malaria",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "prop_malaria_deaths","Proportion of malaria deaths (MALDTH / ALLDTH)","prop_malaria_deaths",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "non_malaria_all_cause_outpatients","Non-malaria all-cause outpatients (ALLOUT)","allout",FALSE) + plot_yearly_map(qoc_dt, shapes_sf, "presumed_cases","Presumed cases (PRES)","presumed_cases",FALSE) + + log_msg(glue::glue("Saved yearly maps in: {figures_path}")) + invisible(TRUE) +} + diff --git a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r new file mode 100644 index 0000000..dfb20ec --- /dev/null +++ b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r @@ -0,0 +1,140 @@ +# Load pipeline helpers (common + code-specific functions). +source(file.path("~/workspace", "pipelines", "snt_dhis2_quality_of_care", "utils", "snt_dhis2_quality_of_care.r")) + + +#' Load latest Quality of Care district-year output. +#' +#' @param output_data_path Path to quality-of-care data outputs. +#' @param country_code Country code. +#' @return Named list with `qoc` (data table) and `latest_file` (path). 
+load_latest_quality_of_care_output <- function(output_data_path, country_code) { + files <- list.files( + output_data_path, + pattern = paste0("^", country_code, "_quality_of_care_district_year_(imputed|removed)\\.parquet$"), + full.names = TRUE + ) + if (length(files) == 0) { + stop(glue::glue("[ERROR] No quality_of_care parquet found in {output_data_path}")) + } + latest_file <- files[which.max(file.info(files)$mtime)] + qoc <- data.table::as.data.table(arrow::read_parquet(latest_file)) + list(qoc = qoc, latest_file = latest_file) +} + + +#' Build year-level Quality of Care summary table. +#' +#' @param qoc_dt Quality-of-care district-year data table. +#' @return Year-level summary table ordered by YEAR. +build_quality_of_care_summary <- function(qoc_dt) { + mean_cols <- c("testing_rate", "treatment_rate", "case_fatality_rate", "prop_adm_malaria", "prop_malaria_deaths") + sum_cols <- c("non_malaria_all_cause_outpatients", "presumed_cases") + + summary_tbl <- unique(qoc_dt[, .(YEAR)]) + + for (col in intersect(mean_cols, names(qoc_dt))) { + agg <- qoc_dt[, setNames(list(mean(get(col), na.rm = TRUE)), col), by = .(YEAR)] + summary_tbl <- merge(summary_tbl, agg, by = "YEAR", all.x = TRUE) + } + + for (col in intersect(sum_cols, names(qoc_dt))) { + agg <- qoc_dt[, setNames(list(sum(get(col), na.rm = TRUE)), col), by = .(YEAR)] + summary_tbl <- merge(summary_tbl, agg, by = "YEAR", all.x = TRUE) + } + + summary_tbl[order(YEAR)] +} + + +#' Save year-level summary outputs (parquet and csv only; no Excel — avoids extra deps). +#' +#' @param summary_tbl Summary table. +#' @param report_outputs_path Reporting outputs folder. +#' @param country_code Country code. +#' @return Named list with `summary_parquet` and `summary_csv` paths. 
+save_quality_of_care_summary_outputs <- function(summary_tbl, report_outputs_path, country_code) { + summary_parquet <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.parquet")) + summary_csv <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.csv")) + + arrow::write_parquet(summary_tbl, summary_parquet) + data.table::fwrite(summary_tbl, summary_csv) + + log_msg(glue::glue("Summary data saved to: {summary_parquet}, {summary_csv}")) + list(summary_parquet = summary_parquet, summary_csv = summary_csv) +} + + +#' Build and save year-level bar chart panel for QoC indicators. +#' +#' @param summary_tbl Year-level summary table. +#' @param figures_path Folder where the combined chart is saved. +#' @param country_code Country code used in output file name. +#' @return Path to saved chart, or NULL if no indicator columns are available. +save_quality_of_care_summary_charts <- function(summary_tbl, figures_path, country_code) { + plot_data <- data.table::copy(summary_tbl) + if (nrow(plot_data) == 0) return(NULL) + + make_pct_plot <- function(col_name, title_name) { + ggplot2::ggplot(plot_data, ggplot2::aes(x = factor(YEAR), y = .data[[col_name]] * 100)) + + ggplot2::geom_bar(stat = "identity", fill = "#2563eb", color = "#1e40af", width = 0.7) + + ggplot2::geom_text(ggplot2::aes(label = paste0(round(.data[[col_name]] * 100, 1), "%")), vjust = -0.5, size = 2.5) + + ggplot2::labs(title = title_name, x = "Annee", y = "%") + + ggplot2::theme_minimal() + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", size = 10), + axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 9), + panel.grid.major.y = ggplot2::element_line(linetype = "dashed", color = scales::alpha("grey", 0.7)), + plot.background = ggplot2::element_rect(fill = "#fafafa", color = NA), + panel.background = ggplot2::element_rect(fill = "#fafafa", color = NA), + plot.margin = ggplot2::margin(5, 5, 5, 5) + ) + + 
ggplot2::scale_y_continuous(expand = ggplot2::expansion(mult = c(0, 0.1))) + } + + make_abs_plot <- function(col_name, title_name) { + format_label <- function(v) { + ifelse( + is.na(v) | v == 0, + "0", + ifelse(v >= 1e6, paste0(round(v / 1e6, 2), "M"), format(round(v), big.mark = " ", scientific = FALSE)) + ) + } + ggplot2::ggplot(plot_data, ggplot2::aes(x = factor(YEAR), y = .data[[col_name]])) + + ggplot2::geom_bar(stat = "identity", fill = "#2563eb", color = "#1e40af", width = 0.7) + + ggplot2::geom_text(ggplot2::aes(label = format_label(.data[[col_name]])), vjust = -0.5, size = 2.5) + + ggplot2::labs(title = title_name, x = "Annee", y = "Nombre") + + ggplot2::theme_minimal() + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", size = 10), + axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 9), + panel.grid.major.y = ggplot2::element_line(linetype = "dashed", color = scales::alpha("grey", 0.7)), + plot.background = ggplot2::element_rect(fill = "#fafafa", color = NA), + panel.background = ggplot2::element_rect(fill = "#fafafa", color = NA), + plot.margin = ggplot2::margin(5, 5, 5, 5) + ) + + ggplot2::scale_y_continuous(labels = scales::comma, expand = ggplot2::expansion(mult = c(0, 0.1))) + } + + plots_list <- list() + if ("testing_rate" %in% names(plot_data)) plots_list[["testing_rate"]] <- make_pct_plot("testing_rate", "Testing rate (TEST / SUSP)") + if ("treatment_rate" %in% names(plot_data)) plots_list[["treatment_rate"]] <- make_pct_plot("treatment_rate", "Treatment rate (MALTREAT / CONF)") + if ("case_fatality_rate" %in% names(plot_data)) plots_list[["case_fatality_rate"]] <- make_pct_plot("case_fatality_rate", "Case fatality rate (MALDTH / MALADM)") + if ("prop_adm_malaria" %in% names(plot_data)) plots_list[["prop_adm_malaria"]] <- make_pct_plot("prop_adm_malaria", "Prop. 
admissions paludisme (MALADM / ALLADM)") + if ("prop_malaria_deaths" %in% names(plot_data)) plots_list[["prop_malaria_deaths"]] <- make_pct_plot("prop_malaria_deaths", "Prop. deces paludisme (MALDTH / ALLDTH)") + if ("presumed_cases" %in% names(plot_data)) plots_list[["presumed_cases"]] <- make_abs_plot("presumed_cases", "Cas presumes (PRES)") + if ("non_malaria_all_cause_outpatients" %in% names(plot_data)) plots_list[["non_malaria_all_cause_outpatients"]] <- make_abs_plot("non_malaria_all_cause_outpatients", "Consultations externes non-paludisme (ALLOUT)") + + if (length(plots_list) == 0) return(NULL) + + plot_order <- c("testing_rate", "treatment_rate", "case_fatality_rate", "prop_adm_malaria", "prop_malaria_deaths", "presumed_cases", "non_malaria_all_cause_outpatients") + available_plots <- plots_list[intersect(plot_order, names(plots_list))] + n_plots <- length(available_plots) + ncol_layout <- 2 + nrow_layout <- ceiling(n_plots / ncol_layout) + + combined_plot <- do.call(gridExtra::grid.arrange, c(available_plots, ncol = ncol_layout, nrow = nrow_layout)) + out_file <- file.path(figures_path, glue::glue("{country_code}_quality_of_care_by_year.png")) + ggplot2::ggsave(out_file, plot = combined_plot, width = 18, height = max(8, 5.2 * nrow_layout), dpi = 300, bg = "white", units = "in") + log_msg(glue::glue("Combined bar charts saved: {out_file}")) + out_file +} diff --git a/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb b/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb index 81eded2..4b8cf71 100644 --- a/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb +++ b/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb @@ -1,2355 +1,2438 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "f5827740-2917-4504-9017-9ec7d408e5f4", - "metadata": {}, - "source": [ - "Script structure:\n", - "\n", - " 0. 
Parameters: set back-up values for parameters, for when the notebook is run manually (_noy_ via pipeline)\n", - " 1. Setup:\n", - " * Paths\n", - " * Utils functions\n", - " 2. Load Data\n", - " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", - " * **Reporting** (DHIS2) pre-computed, already formatted & aggregated (output of pipeline ???)\n", - " * **Shapes** (DHIS2) for plotting (this could be removed if we move the plots to \"report/EDA\" nb)\n", - " 3. Calculate **Reportng Rate (RR)**\n", - " * \"**Dataset**\": using pre-computed reportings from DHIS2/SNIS (was: \"DHIS2\")\n", - " * \"**Data Element**\": using calculated expected nr of report (nr of active facilities) (was: \"CONF\")\n", - " 4. **Export** reporting rate data to `.../data/dhis2/reporting_rate/` as .parquet (and .csv) files for **either**:\n", - " * data**set**: \"XXX_reporting_rate_**dataset**.parquet\" **or**\n", - " * data**element**: \"XXX_reporting_rate_**dataelement**.parquet\"" - ] - }, - { - "cell_type": "markdown", - "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", - "metadata": {}, - "source": [ - "--------------------" - ] - }, - { - "cell_type": "markdown", - "id": "e962c5a4-6b09-4485-8d71-d842159118d3", - "metadata": {}, - "source": [ - "### To Do:\n", - "* For `DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\"`: **add code** to count OPEN facilities () for **countries with \"normal\" pyramids** (i.e., when no mixing of facilities and admin levels ... !). Atm only code for Niger, which runs only if `COUNTRY_CODE == NER`. Should add similar (but simpler) code for the rest of the countries (i.e, `COUNTRY_CODE != NER`)\n", - "* Check why Data Element **Denominator** `routine_active_facilities` is **calculated at `YEAR` (aggregated) instead of `MONTH`** ... 
possibly fix this to match granularity of other alternatives for denominator (which are calculated at MONTH level)\n", - "* Modify **report notebook** and/or pipeline.py code so that it does not make the **pipeline FAIL** if `reporting_rate_dataset` or `reporting_rate_dataelement` is **not found** (which is now always the case since we only output 1 file at each run!!)" - ] - }, - { - "cell_type": "markdown", - "id": "0cdfdc73-bb9a-48a8-a26b-84ecbab2e0aa", - "metadata": {}, - "source": [ - "----------------" - ] - }, - { - "cell_type": "markdown", - "id": "339f6d58-0965-40ef-b718-96195d2463f8", - "metadata": {}, - "source": [ - "## Parameters" - ] - }, - { - "cell_type": "markdown", - "id": "dd6cd6f8-b91b-4902-8801-a60e11776f98", - "metadata": {}, - "source": [ - "Set Default values **if _not_ provided by pipeline**
\n", - "This makes the execution flexible and \"safe\": nb can be run manually from here or be executed via pipeline, without having to change anything in the code!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93aac683-8828-4a42-b841-f16c7e8fbb07", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set BACKUP VALUE: root path - NEVER CHANGE THIS!\n", - "if (!exists(\"SNT_ROOT_PATH\")) {\n", - " SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "}\n", - "\n", - "\n", - "# Choose to run either DataSet OR DataElement method\n", - "if (!exists(\"REPORTING_RATE_METHOD\")) {\n", - " # REPORTING_RATE_METHOD <- \"DATASET\" \n", - " REPORTING_RATE_METHOD <- \"DATAELEMENT\"\n", - "}\n", - "\n", - "\n", - "# Data Elemenet method: Choice of which INDICATORS to use to count the nr of reporting facilities \n", - "# CONF\n", - "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_CONF\")) {\n", - " DATAELEMENT_METHOD_NUMERATOR_CONF <- TRUE # FALSE\n", - "}\n", - "\n", - "# SUSP\n", - "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_SUSP\")) {\n", - " DATAELEMENT_METHOD_NUMERATOR_SUSP <- TRUE # FALSE\n", - "}\n", - "\n", - "# TEST\n", - "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_TEST\")) {\n", - " DATAELEMENT_METHOD_NUMERATOR_TEST <- TRUE # FALSE\n", - "}\n", - "\n", - "\n", - "\n", - "# Data Elemenet RR. Choice: which df to use for nr of `EXPECTED_REPORTS` (DENOMINATOR) \n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {\n", - " # DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\" \n", - " DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\" \n", - " # DATAELEMENT_METHOD_DENOMINATOR <- \"DHIS2_EXPECTED_REPORTS\" # ⚠️ only if `REPORTING_RATE_METHOD == \"DATASET\"` && DataSet is available!! ⚠️\n", - "} \n" - ] - }, - { - "cell_type": "markdown", - "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", - "metadata": {}, - "source": [ - "## 1. 
Setup" - ] - }, - { - "cell_type": "markdown", - "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", - "metadata": {}, - "source": [ - "### 1.1. Paths" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# PROJECT PATHS\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') " - ] - }, - { - "cell_type": "markdown", - "id": "22971de0-1431-4cbd-b8c1-3bd3e1609e0d", - "metadata": {}, - "source": [ - "### 1.2. Utils functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1784fd43-03f3-478b-8148-4b478317ea21", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "source(file.path(CODE_PATH, \"snt_utils.r\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3bbcbd39-54e8-4ece-9244-30d7d30291d2", - "metadata": {}, - "source": [ - "### 1.3. Packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "426ecff6-0b4c-474d-a48d-826002205b89", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# List required pcks ----------------> check what are the really required libraries\n", - "required_packages <- c(\"arrow\", # for .parquet\n", - " \"tidyverse\",\n", - " \"stringi\", \n", - " \"jsonlite\", \n", - " \"httr\", \n", - " \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "markdown", - "id": "18a8e0c1-ac09-4435-b6f4-5f91fd916396", - "metadata": {}, - "source": [ - "### 1.3.1. 
OpenHEXA-specific settings" - ] - }, - { - "cell_type": "markdown", - "id": "ebb8c7d5-7c2c-4dbe-a1ba-238419fbedf3", - "metadata": {}, - "source": [ - "#### For 📦{sf}, tell OH where to find stuff ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91a66fb7-dd5e-43fd-a6a2-d8bb9f0315d6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "markdown", - "id": "ac9ee427-020e-47c5-b2c9-5ca24e1f2779", - "metadata": {}, - "source": [ - "#### Set environment to load openhexa.sdk from the right path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa331278-573d-4a22-ab16-da6972d7b0be", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set environment to load openhexa.sdk from the right path\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", - "metadata": {}, - "source": [ - "### 1.4. 
Load and check `config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "\n", - "config_file_name <- \"SNT_config.json\" \n", - "config_json <- tryCatch({\n", - " jsonlite::fromJSON(file.path(CONFIG_PATH, config_file_name)) \n", - " },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, config_file_name))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "markdown", - "id": "29182f25-b0cf-46aa-9818-49616cd3f353", - "metadata": {}, - "source": [ - "**Save config fields as variables**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c52654c8-8a19-4e0c-a83b-1bc2eecae6bc", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Generic\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", - "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", - "\n", - "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"Ousmane\"\n", - "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "\n", - "# Which reporting rate PRODUCT_UID to use (not that this is a dataset in COD, but 2 dataElements in BFA!)\n", - "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "412572bc-fb96-4f61-ac49-be7f449219b6", - "metadata": 
{}, - "outputs": [], - "source": [ - "# DHIS2_INDICATORS\n", - "log_msg(paste(\"Expecting the following DHIS2 (aggregated) indicators : \", paste(DHIS2_INDICATORS, collapse=\", \")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a0a8562-4a70-455c-9ccf-aa39f4cf4e31", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Fixed cols for routine data formatting \n", - "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') # (OU_NAME has homonimous values!)\n", - "# print(paste(\"Fixed routine data (`dhis2_routine`) columns (always expected): \", paste(fixed_cols, collapse=\", \")))\n", - "log_msg(paste(\"Expecting the following columns from routine data (`dhis2_routine`) : \", paste(fixed_cols, collapse=\", \")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86e82d54-2b00-4c25-9b34-3497d4c88c52", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Fixed cols for exporting RR tables: to export output tables with consistent structure\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') " - ] - }, - { - "cell_type": "markdown", - "id": "dadc7351-e67e-450b-a046-bc64660a7dde", - "metadata": {}, - "source": [ - "### 1.5. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Alternatively, `CONF` could be made mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cf6e2a4-0822-4a0c-852e-143da5473d20", - "metadata": {}, - "outputs": [], - "source": [ - "nr_of_indicators_selected <- sum(DATAELEMENT_METHOD_NUMERATOR_CONF, DATAELEMENT_METHOD_NUMERATOR_SUSP, DATAELEMENT_METHOD_NUMERATOR_TEST)\n", - "\n", - "if (nr_of_indicators_selected == 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method 'Data Element'! Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", - "metadata": {}, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", - "metadata": {}, - "source": [ - "### 2.1. **Routine** data (DHIS2) \n", - "already formatted & aggregated (output of pipeline XXX)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "586e8da8-4e1c-431a-9b8d-1169167e1c09", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# DHIS2 Dataset extract identifier\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 routine data loaded from dataset : \", dataset_name, \" dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "a2454183-44f7-4e2e-a0cf-ca112aa183bb", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Ensure correct data type for numerical columns \n", - "dhis2_routine <- dhis2_routine %>%\n", - " mutate(across(c(PERIOD, YEAR, MONTH), as.numeric))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edb2fcdc-ce0a-4c78-b06a-9f4610ab4714", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(dhis2_routine, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "821e1ebf-b2fa-4469-974e-2e4d27d58854", - "metadata": {}, - "source": [ - "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", - "Only when: `DATAELEMENT_METHOD_NUMERATOR == \"CONF|SUSP|TEST\"`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3f8b89e-a04e-4e0b-9892-95ce2150e7da", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(dhis2_routine, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "adec5412", - "metadata": {}, - "source": [ - "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", - "Based on which indicator(s) are selected (if any)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0bbcdf8c-873a-4b41-980a-f18d1863ab8f", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize empty vector\n", - "indicators_selected = c()\n", - "\n", - "# Add elements based on user selection(s)\n", - "if (DATAELEMENT_METHOD_NUMERATOR_CONF) {\n", - " indicators_selected = append(indicators_selected, \"CONF\")\n", - "}\n", - "\n", - "if (DATAELEMENT_METHOD_NUMERATOR_SUSP) {\n", - " indicators_selected = append(indicators_selected, \"SUSP\")\n", - "}\n", - "\n", - "if (DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", - " indicators_selected = append(indicators_selected, \"TEST\")\n", - "}\n", - "\n", - 
"print(paste0(\"Selected indicators: \", paste(indicators_selected, collapse = \", \")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b84753f8-aa9c-4563-beae-5e29b3f1e773", - "metadata": {}, - "outputs": [], - "source": [ - "# This is kinda useless now but KEEP in case we ADD MORE CHOICES OF INDICATORS!! \n", - "if(REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " if (DATAELEMENT_METHOD_NUMERATOR_CONF | DATAELEMENT_METHOD_NUMERATOR_SUSP | DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", - " log_msg(paste0(\"Indicator(s) \", paste(indicators_selected, collapse = \", \") , \" selected for calculation of numerator for method `Data Element`.\" ))\n", - " \n", - " if ( length(which(indicators_selected %in% names(dhis2_routine))) < length(indicators_selected) ) {\n", - " log_msg(paste0(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: \", paste(expected_col, collapse = \", \"), \".\"), \"warning\")\n", - " } \n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "c832da26-fe0c-43fe-8300-2fff5c4cbf34", - "metadata": {}, - "source": [ - "### 2.2. 
**Reporting** pre-computed from DHIS2 \n", - "Data granularity:\n", - "* **ADM2**\n", - "* **MONTH** (PERIOD)\n", - "\n", - "Note: data comes from different dataset (`DS_NAME`): `A SERVICES DE BASE`, `B SERVICES SECONDAIRES`,`D SERVICE HOPITAL` \n", - "\n", - "The col `DS_METRIC` indicates whether the `VALUE` is `EXPECTED_REPORTS` or `ACTUAL_REPORTS`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ce295b9-9898-4e12-8a91-92bb25b9e0a2", - "metadata": {}, - "outputs": [], - "source": [ - "# REPORTING_RATE_METHOD <- \"DATAELEMENT\" # \"DATASET\"\n", - "REPORTING_RATE_METHOD" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a32fc96-5b8e-4108-a224-c0d843df9b47", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " # DHIS2 Dataset extract identifier\n", - " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - " file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", - " \n", - " # Load file from dataset\n", - " dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pre-computed REPORTING data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - " \n", - " msg <- paste0(\"DHIS2 pre-computed REPORTING data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). 
Dataframe dimensions: \", \n", - " paste(dim(dhis2_reporting), collapse=\", \"))\n", - " log_msg(msg)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e131d9ee-0e88-4bb6-982b-53b1229fba5f", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " # Convert VALUE col to - should not be needed but keep as safety measure \n", - " dhis2_reporting <- dhis2_reporting |>\n", - " mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", - "\n", - " head(dhis2_reporting, 3)\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46e3dba8-d46b-457e-ba90-c663e30c42d2", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# # Convert VALUE col to - should not be needed but keep as safety measure \n", - "# dhis2_reporting <- dhis2_reporting |>\n", - "# mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5149befe-b6ad-46a9-9879-7637ce5b02be", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# head(dhis2_reporting, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "7a967af3-f6e5-428a-8769-72808f21a125", - "metadata": {}, - "source": [ - "#### 2.2.1. 
**Filter** to keep only values for `PRODUCT_UID` defined in config.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1948c2f7-7a2c-47a2-9dc6-ba29da6d030c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "REPORTING_RATE_PRODUCT_ID" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4258098-e24c-4520-914d-0f73354bb3ab", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - "\n", - " # Handle problems with incorrect configuration - to be improved 🚧\n", - " if (is.null(REPORTING_RATE_PRODUCT_ID)) {\n", - " log_msg(\"🛑 Problem with definition of REPORTING_RATE_PRODUCT_ID, check `SNT_config.json` file!\")\n", - " } else \n", - " product_name <- dhis2_reporting |> filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |> pull(PRODUCT_NAME) |> unique()\n", - " log_msg(glue::glue(\"Using REPORTING_RATE_PRODUCT_ID == `{REPORTING_RATE_PRODUCT_ID}`, corresponding to DHIS2 Product name : `{product_name}`.\"))\n", - "\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c22c6ada-7cb1-4fca-b65e-b51e5eca35a2", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - "\n", - " dhis2_reporting_filtered <- dhis2_reporting |>\n", - " filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |>\n", - " select(-PRODUCT_UID, -PRODUCT_NAME) # useless cols now\n", - " \n", - " print(dim(dhis2_reporting_filtered))\n", - " head(dhis2_reporting_filtered)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9da035cf-5d3f-4df0-a063-2d2497616c82", - "metadata": {}, - "source": [ - "#### 2.2.2. 
Format to produce `dhis2_reporting_expected`\n", - "🚨 Note: Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denominator for REPORTING_RATE calculations (methods dataset and dataelement)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e970f9d-258e-4050-ae69-185b88c79fc3", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " \n", - " dhis2_reporting_wide <- dhis2_reporting_filtered |> \n", - " pivot_wider(\n", - " names_from = PRODUCT_METRIC, \n", - " values_from = VALUE\n", - " )\n", - " \n", - " print(dim(dhis2_reporting_wide))\n", - " head(dhis2_reporting_wide)\n", - " \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eab31756-ae6b-4152-8ec3-8195236d8732", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denomitor for RR calculations (methods ANY and CONF)\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " \n", - " dhis2_reporting_expected <- dhis2_reporting_wide |> \n", - " select(-ACTUAL_REPORTS)\n", - " \n", - " print(dim(dhis2_reporting_expected))\n", - " head(dhis2_reporting_expected)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3c3d0f35-889d-4c16-9741-d6e75e2ef096", - "metadata": {}, - "source": [ - "#### 2.2.3. **Checks** on data completeness: _do **periods match** with routine data?_\n", - "Lack of perfect overlap in periods between routine data and reporting rate data might create headhaches downstream!
\n", - "Specifically, **incidence** calculations will show **N2 smaller than N1** due to **aggregation by YEAR when NA** values are present!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ea57600-418b-45bc-805a-f829e237b4c4", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " \n", - " # --- Check Year Compatibility ---\n", - " routine_years <- sort(unique(as.integer(dhis2_routine$YEAR))) # as.integer\n", - " expected_years <- sort(unique(as.integer(dhis2_reporting_expected$YEAR))) # as.integer\n", - " \n", - " if (!setequal(routine_years, expected_years)) {\n", - " missing_in_routine <- setdiff(expected_years, routine_years)\n", - " missing_in_expected <- setdiff(routine_years, expected_years)\n", - " \n", - " if (length(missing_in_routine) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_reporting_expected' but not in 'dhis2_routine': \",\n", - " paste(missing_in_routine, collapse = \", \")))\n", - " }\n", - " if (length(missing_in_expected) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_routine' but not in 'dhis2_reporting_expected': \",\n", - " paste(missing_in_expected, collapse = \", \")))\n", - " }\n", - " } else {\n", - " log_msg(\"✅ YEAR values are consistent across 'dhis2_routine' and 'dhis2_reporting_expected'.\")\n", - " \n", - " # --- Check Month Compatibility (if years are consistent) ---\n", - " all_years <- unique(routine_years) # Or expected_years, they are the same now\n", - " \n", - " for (year_val in all_years) {\n", - " routine_months_for_year <- dhis2_routine %>%\n", - " filter(YEAR == year_val) %>%\n", - " pull(MONTH) %>%\n", - " unique() %>%\n", - " sort()\n", - " \n", - " expected_months_for_year <- dhis2_reporting_expected %>%\n", - " filter(YEAR == year_val) %>%\n", - " pull(MONTH) %>%\n", - " unique() %>%\n", - " sort()\n", - " \n", - " if 
(!setequal(routine_months_for_year, expected_months_for_year)) {\n", - " missing_in_routine_months <- setdiff(expected_months_for_year, routine_months_for_year)\n", - " missing_in_expected_months <- setdiff(routine_months_for_year, expected_months_for_year)\n", - " \n", - " if (length(missing_in_routine_months) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_routine_months, collapse = \", \"),\n", - " \"' present in 'dhis2_reporting_expected' but not in 'dhis2_routine'!\"\n", - " ))\n", - " }\n", - " if (length(missing_in_expected_months) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_expected_months, collapse = \", \"), \n", - " \"' present in 'dhis2_routine' but not in 'dhis2_reporting_expected'!\"\n", - " ))\n", - " }\n", - " } else {\n", - " log_msg(paste0(\"✅ For year \", year_val, \", months are consistent across both data frames.\"))\n", - " }\n", - " }\n", - " }\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "5e711191-995b-4f89-b10c-fc2214cdd8b2", - "metadata": {}, - "source": [ - "### 2.3. **Pyramid** to count OPEN facilities (denominator)\n", - "Table (and column) needed for denominator of \"Data Element\" reporting rate if choice == `PYRAMID_OPEN_FACILITIES`\n", - "\n", - "**Important**: the pyramid must contain the `OPENING_DATE` and `CLOSING_DATE` columns (this was implemented in the new extraction pipeline from 2025-09).
\n", - "Then, **depending on the Country** (well, theire pyramid structure) **import** either:\n", - "* **Raw** pyramid for 🇳🇪 Niger: because first need to \"manually\" correctly aggregate the VALUEs for the HF (separate them from admin levels and sum up HD units)\n", - "* **Formatted** pyramid for all other countries encountered so far: 🇨🇩 DRC, 🇧🇫 Burkina Faso ... bevcause their pyramid is already usable right away" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ad3f83b6-2fdd-45da-8a4d-fb06513b6be2", - "metadata": {}, - "outputs": [], - "source": [ - "# DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\"\n", - "DATAELEMENT_METHOD_DENOMINATOR" - ] - }, - { - "cell_type": "markdown", - "id": "e7b80b6e-9e34-4e71-93e8-7e16a110e17c", - "metadata": {}, - "source": [ - "#### **Raw** pyramid for 🇳🇪 **Niger**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "652cf1a7-c9a2-48db-b44d-8fabfd0e072f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - " # DHIS2 Dataset extract identifier\n", - " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", - " \n", - " # Load file from dataset\n", - " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", - " dhis2_pyramid_raw <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pyramid RAW data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - " \n", - " msg <- paste0(\"DHIS2 RAW pyramid data loaded from dataset : `\", dataset_name, \"`. 
Dataframe dimensions: \", paste(dim(dhis2_pyramid_raw), collapse=\", \"))\n", - " log_msg(msg)\n", - " \n", - " head(dhis2_pyramid_raw)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "f1716ac3-ce8f-4223-9729-6ed826e743bc", - "metadata": {}, - "source": [ - "#### **Formatted** pyramid for all other countries (normal pyramid) 🇨🇩 🇧🇫" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc16ae54-4915-4333-b458-2b611e2b1792", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", - " \n", - " # DHIS2 Dataset extract identifier\n", - " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - " \n", - " # Load file from dataset\n", - " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", - " dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - " \n", - " msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset : `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", - " log_msg(msg)\n", - " \n", - " head(dhis2_pyramid_formatted)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "eb4c5c63-d140-46b8-b686-886e612a31dc", - "metadata": {}, - "source": [ - "## 3. Calculate **Reporting Rate** (RR)\n", - "We compute it using 2 approaches, user can decided later on which one to use for incidence adjustment." - ] - }, - { - "cell_type": "markdown", - "id": "cb724aa8-5f06-4e99-aeca-640d0c1b049e", - "metadata": {}, - "source": [ - "## 3.1. 
\"**Dataset**\" reporting rate: pre-computed, from **DHIS2**\n", - "Exrtacted from DHIS2 and formatted. \n", - "\n", - "Straightforward: `ACTUAL_REPORTS` / `EXPECTED_REPORTS` (just pivot `DS_METRIC` and divide)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10b2f52b-0217-43f1-88a3-cd01d98869b1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - "\n", - " reporting_rate_dataset <- dhis2_reporting_wide |> \n", - " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", - " \n", - " print(dim(reporting_rate_dataset))\n", - " head(reporting_rate_dataset, 3)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3d49eda8-b4fd-437a-8938-17bf0806f281", - "metadata": {}, - "source": [ - "#### Quick data quality check 🔍" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cff33416-ea66-4eeb-9d33-1597c2f05b0c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- Define function ---------------------------\n", - "inspect_reporting_rate <- function(data_tibble) {\n", - "\n", - " # Dynamically get the name of the tibble passed to the function\n", - " # Extract the litteral name of the variable passed (e.g., \"reporting_rate_dhis2_month\")\n", - " tibble_name_full <- deparse(substitute(data_tibble))\n", - "\n", - " # Extract the 'method' part from the tibble name\n", - " method <- stringr::str_extract(tibble_name_full, \"(?<=reporting_rate_).*\") # \"(?<=reporting_rate_).*?(?=_month)\"\n", - "\n", - " # Calculations for proportion of values > 1\n", - " values_greater_than_1 <- sum(data_tibble$REPORTING_RATE > 1, na.rm = TRUE)\n", - " total_values <- length(data_tibble$REPORTING_RATE)\n", - "\n", - " if (total_values > 0) {\n", - " proportion <- values_greater_than_1 / total_values * 100\n", - " min_rate <- 
min(data_tibble$REPORTING_RATE, na.rm = TRUE)\n", - " max_rate <- max(data_tibble$REPORTING_RATE, na.rm = TRUE)\n", - " } else {\n", - " proportion <- 0\n", - " min_rate <- NA # Set to NA if no values to calculate min/max\n", - " max_rate <- NA # Set to NA if no values to calculate min/max\n", - " }\n", - "\n", - " if (proportion == 0) {\n", - " clarification = NULL\n", - " } else {\n", - " clarification = \" (there are more reports than expected)\"\n", - " }\n", - "\n", - " # Print the formatted result\n", - " log_msg(\n", - " paste0(\n", - " \"🔍 For reporting rate method : `\", method, \"`, the values of REPORTING_RATE range from \", round(min_rate, 2),\n", - " \" to \", round(max_rate, 2),\n", - " \", and \", round(proportion, 2), \" % of values are >1\", clarification, \".\"\n", - " )\n", - " )\n", - "\n", - " # Histogram\n", - " hist(data_tibble$REPORTING_RATE, \n", - " breaks = 50)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2f4c11c-c683-4204-ab91-9d41cab4826c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " inspect_reporting_rate(reporting_rate_dataset)\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "04870e93-5385-425b-89fd-b815a87cfa21", - "metadata": {}, - "source": [ - "#### Subset cols" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d90671b0-36f8-4c6e-8736-4ea807079f83", - "metadata": { - "scrolled": true, - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " reporting_rate_dataset <- reporting_rate_dataset |> \n", - " select(all_of(fixed_cols_rr))\n", - " \n", - " dim(reporting_rate_dataset)\n", - " head(reporting_rate_dataset, 3)\n", - "}" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "62e6cb16-0196-447f-b142-aaec2120eecb", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "2dc27c07-80cd-465e-891f-9fb70111dbb0", - "metadata": {}, - "source": [ - "#### Plot by MONTH (heatmap)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9ce56fc-f86a-4a2b-95b7-fb6ec5b89087", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", - " \n", - " # Plot reporting rate heatmap\n", - " options(repr.plot.width = 20, repr.plot.height = 10) \n", - " \n", - " # reporting_rate_conf_month %>%\n", - " reporting_rate_dataset %>%\n", - " mutate(\n", - " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", - " ) %>%\n", - " ggplot(., aes(x = DATE, \n", - " y = factor(ADM2_ID), \n", - " fill = REPORTING_RATE * 100)\n", - " ) + \n", - " geom_tile() +\n", - " scale_fill_viridis_c(\n", - " option = \"C\",\n", - " direction = 1, # blue = low, yellow = high\n", - " limits = c(0, 100),\n", - " name = \"Reporting rate (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Monthly Reporting Rate by Health District - Method 'DataSet'\",\n", - " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", - " x = \"Month\",\n", - " y = \"Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 12),\n", - " legend.position = \"right\",\n", - " panel.grid = element_blank()\n", - " )\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00bf15b7-baa7-4734-8133-8d4a9cc843a3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "40b21c65-1b75-42f7-821a-24d31e436c73", - 
"metadata": {}, - "source": [ - "----------------------------" - ] - }, - { - "cell_type": "markdown", - "id": "17ffece4-9420-4004-993b-b5692cc1d2de", - "metadata": {}, - "source": [ - "## 3.2. **Data Element** reporting rate: based on reporting of one or more indicators\n", - "**_Partially_ following methods by WHO and as per Diallo (2025) paper**\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single indicator (i.e., **confirmed** malaria case as `CONF`) or for _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "\n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "\n", - "For this method the user is allowed to **chose** how to calculate both the **numerator** and the **denominator**.
\n", - "Specifically:\n", - "* **Numerator**: is the number of **facilities that _actually reported_** data, and it is estimated based on whether a facility (FoSa, or HF, or `OU_ID`) **submitted data** for **_any_** of the following **indicators**:\n", - " * `CONF`: confirmed malaria cases and/or\n", - " * `SUSP`: suspected malaria cases and/or\n", - " * `TEST`: tested malaria cases
\n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "\n", - "
\n", - " \n", - "* **Denominator**: is the number of **facilities _expected_ to report**. This number can be obtained in two different ways:\n", - " * `\"DHIS2_EXPECTED_REPORTS\"`: uses the col `EXPECTED_REPORTS` from the df `dhis2_reporting_expected`.
\n", - " This is obtained directly from DHIS2, and is the same denominator used to calculate the \"Dataset\" reporting rate.\n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (`OU_ID`), defined as those that submitted _any_ data **at least once in a given year**, across ***all*** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - "\n", - "
\n", - "\n", - "This method improves over simple binary completeness flags by accounting for both spatial (facility coverage) and temporal (monthly timeliness) dimensions.
" - ] - }, - { - "cell_type": "markdown", - "id": "f5dcd3b9-6f02-4fc5-9e5f-2253c015a3d4", - "metadata": {}, - "source": [ - "### Calculate the **numerator**" - ] - }, - { - "cell_type": "markdown", - "id": "a90d9f4a-a058-4ad5-8ef2-f827987b5def", - "metadata": {}, - "source": [ - "**Note**: the col `REPORTED` keeps the same name regardless of the value of `DATAELEMENT_METHOD_NUMERATOR` because \n", - "in this way the code needs to be parametrized only once (here).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8076609c-46e8-478a-8283-bc63a70102f8", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "dhis2_routine_active <- dhis2_routine %>%\n", - " mutate(\n", - " # if_any() returns TRUE if the condition is met for any of the selected columns\n", - " ACTIVE = if_else(if_any(all_of(indicators_selected), ~ !is.na(.x)), 1, 0)\n", - " )\n", - "\n", - "log_msg(paste0(\"Evaluating reporting facilities based on indicators: \", paste(indicators_selected, collapse = \", \"), \".\"))\n", - "\n", - "dim(dhis2_routine_active)\n", - "head(dhis2_routine_active, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "325faf35-ed25-4b8e-b421-934a2852f27e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1773313-17e5-478d-b60d-c1193233204d", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# --- 1. 
Calculate `SUBMITTED_REPORTS` as the nr of ACTIVE facilities (that REPORTED, each month) ------------------------\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "dhis2_routine_submitted <- dhis2_routine_active %>% # OLD: dhis2_routine_reporting_month <- dhis2_routine_reporting %>%\n", - " group_by(ADM2_ID, YEAR, MONTH) %>% \n", - " summarise(\n", - " SUBMITTED_REPORTS = sum(ACTIVE, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " ungroup() %>% \n", - " mutate(YEAR = as.integer(YEAR),\n", - " MONTH = as.integer(MONTH)\n", - " ) \n", - "\n", - "print(dim(dhis2_routine_submitted))\n", - "head(dhis2_routine_submitted, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a25647e3-5674-44e0-855e-c3a48483310d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "15f4c12f", - "metadata": {}, - "source": [ - "### Calculate the **denominator**" - ] - }, - { - "cell_type": "markdown", - "id": "06b2070d-c672-425f-a78f-b94a8d16a017", - "metadata": {}, - "source": [ - "#### Option: `ROUTINE_ACTIVE_FACILITIES`\n", - "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`ROUTINE_ACTIVE_FACILITIES`** " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08f03ed1-5831-4fe5-8bde-674a513e8110", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Calculate the tot nr of facilities (distinct OU_ID) based on all HF that appear in the routine data (each YEAR)\n", - "# meaning: regardless of what indicators they submit data for, as long as they have submitted something\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", - " routine_active_facilities <- dhis2_routine %>%\n", - " # Keep only rows where at least one indicator has non-NA value\n", - " filter(if_any(any_of(DHIS2_INDICATORS), ~ !is.na(.))) %>%\n", - " 
group_by(YEAR, ADM2_ID) %>%\n", - " summarize(\n", - " EXPECTED_REPORTS = n_distinct(OU_ID),\n", - " .groups = \"drop\" # remove grouping \n", - " )\n", - "\n", - " nr_of_rows <- nrow(routine_active_facilities)\n", - " log_msg(glue::glue(\"Produced df `routine_active_facilities`, with column `EXPECTED_REPORTS` calculated from DHIS2 routine data. Dataframe `routine_active_facilities` has {nr_of_rows} rows.\"))\n", - "\n", - " head(routine_active_facilities, 3)\n", - " \n", - "} \n" - ] - }, - { - "cell_type": "markdown", - "id": "6629dccb-97b0-4b0e-b23f-15b98704323d", - "metadata": {}, - "source": [ - "#### Option: `PYRAMID_OPEN_FACILITIES`\n", - "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`PYRAMID_OPEN_FACILITIES`** " - ] - }, - { - "cell_type": "markdown", - "id": "0972ffca-c14a-4b93-85ff-027d056c3759", - "metadata": {}, - "source": [ - "------------------" - ] - }, - { - "cell_type": "markdown", - "id": "d49219b7-5932-4062-a10d-e1f3a4a81449", - "metadata": {}, - "source": [ - "#### TEMPORARY! 🇳🇪 **Niger-specific method**\n", - "🚨 Specific to **Niger EnDoP**: Pre-processing needed to separate facilities from adm levels!! 🚨
\n", - "\n", - "⚠️⚠️⚠️ **TEMPORARY: This will be moved to a dedicated pipeline!** ⚠️⚠️⚠️
\n", - "\n", - "Specifically:\n", - "* **Hospital**s (HD a Hopital District): at **level 4** together with Aires de Sante\n", - "* All other **FoSa**s: at **level 6**, also mixed with the hospital units\n", - "\n", - "Therefore, to assigned closed/open status, it is necessary to attach to each individual facility the closng and opening data column. \n", - "To do this: \n", - "1) extract list of facilities and id across the 2 levels (4 and 6) and\n", - "2) calculate the nr of open facilities per MONTH (PERIOD) per ADM2, ending up with a df with cols: `ADM2_ID`, `YEAR`, `MONTH`, `OPEN_FACILITIES_COUNT` = `EXPECTED_REPORTS`\n", - "3) add this to the df with the **numerator** (`dhis2_routine_submitted`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6296329-9bcd-4d2c-afb3-520c6a159cdb", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "# names(dhis2_pyramid_raw)\n", - "dim(dhis2_pyramid_raw)\n", - "head(dhis2_pyramid_raw, 3)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "6f651b18-2d85-4e26-8952-45dae9020c40", - "metadata": {}, - "source": [ - "#### 1. 
Create df with list of all **facilities** with their `DATE_OPENED` and `DATE_CLOSED`: `facility_master`\n", - "Separate \"facilities\" (of any type, such as hospitals to CSI, Infermieres etc) from admin levels and hospital units (wards, depts...)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25dea3c7-44ea-42a7-b467-f470892fcfef", - "metadata": {}, - "outputs": [], - "source": [ - "# Helpers to detect Aires and Hospitals:\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "is_aire_l5 <- function(x) str_detect(x, regex(\"^\\\\s*aire[^a-zA-Z]?\", ignore_case = TRUE))\n", - "is_hospital_l4 <- function(x) str_detect(x, regex(\"^(hd|chr|chu|hgr)\", ignore_case = TRUE))\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55760b89-f4d0-40c5-9905-f7c7c4fee5c0", - "metadata": {}, - "outputs": [], - "source": [ - "# List of all FoSa (from Aires → Level 6)\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "fosa_master <- dhis2_pyramid_raw %>%\n", - " filter(is_aire_l5(LEVEL_5_NAME)) %>%\n", - " distinct(\n", - " OU_ID = LEVEL_6_ID,\n", - " OU_NAME = LEVEL_6_NAME,\n", - " region = LEVEL_2_NAME,\n", - " district = LEVEL_3_NAME,\n", - " ADM2_ID = LEVEL_3_ID,\n", - " DATE_OPENED = OPENING_DATE, \n", - " DATE_CLOSED = CLOSED_DATE\n", - " ) %>%\n", - " mutate(OU_TYPE = \"FoSa\")\n", - "\n", - "dim(fosa_master)\n", - "head(fosa_master)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e9c9f59-9c6c-44e4-bbd9-13c3917f5117", - "metadata": {}, - "outputs": [], - "source": [ - "# List of all Hospitals (from Level 4, aggregate dates across children)\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == 
\"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "hosp_master <- dhis2_pyramid_raw %>%\n", - "filter(is_hospital_l4(LEVEL_4_NAME)) %>%\n", - "group_by(LEVEL_4_ID, LEVEL_4_NAME, LEVEL_2_NAME, LEVEL_3_NAME, LEVEL_3_ID) %>%\n", - "summarise(\n", - " OPENING_DATE = suppressWarnings(min(OPENING_DATE, na.rm = TRUE)),\n", - " CLOSED_DATE = suppressWarnings(max(CLOSED_DATE, na.rm = TRUE)),\n", - " .groups = \"drop\"\n", - ") %>%\n", - "mutate(\n", - " DATE_OPENED = ifelse(is.infinite(OPENING_DATE), NA, OPENING_DATE) |> as_datetime(),\n", - " DATE_CLOSED = ifelse(is.infinite(CLOSED_DATE), NA, CLOSED_DATE) |> as_datetime()\n", - " ) %>%\n", - "distinct(\n", - " OU_ID = LEVEL_4_ID, \n", - " OU_NAME = LEVEL_4_NAME,\n", - " region=LEVEL_2_NAME,\n", - " district=LEVEL_3_NAME,\n", - " ADM2_ID=LEVEL_3_ID,\n", - " DATE_OPENED,\n", - " DATE_CLOSED\n", - ") %>%\n", - "mutate(\n", - " OU_TYPE = \"Hospital\"\n", - " )\n", - "\n", - "dim(hosp_master)\n", - "head(hosp_master)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5859e393-bfac-46e6-b103-cb8177100860", - "metadata": {}, - "outputs": [], - "source": [ - "# Merge both\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "facility_master <- bind_rows(fosa_master, hosp_master) %>% \n", - " select(ADM2_ID, \n", - " OU_ID, \n", - " DATE_OPENED, \n", - " DATE_CLOSED)\n", - "\n", - "dim(facility_master)\n", - "head(facility_master, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "191c3b01-1645-410e-be99-b247bf5f9cfb", - "metadata": {}, - "source": [ - "---------------------" - ] - }, - { - "cell_type": "markdown", - "id": "1ee48156-5ed5-43a5-b927-caa53c10d98e", - "metadata": {}, - "source": [ - "#### **Generic** part: applies to **all countries**" - ] - }, - { - "cell_type": "markdown", - "id": "3aa057a5-6f68-493e-83e2-81bafce42c9e", - 
"metadata": {}, - "source": [ - "#### 2. Calculate nr of **OPEN facilities** for each `MONTH` per `ADM2`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91f972f1-bcc9-458f-9662-5574efc7ac9d", - "metadata": {}, - "outputs": [], - "source": [ - "# Define start and end period based on routine data \n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "PERIOD_START <- dhis2_routine$PERIOD |> min()\n", - "PERIOD_END <- dhis2_routine$PERIOD |> max()\n", - "\n", - "print(paste0(\"Start period: \", PERIOD_START))\n", - "print(paste0(\"End period :\", PERIOD_END))\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c6ed56f-b2a6-460c-a3dc-6c588c40b54c", - "metadata": {}, - "outputs": [], - "source": [ - "## Create a \"complete\" grid of every month and year for the period range ---------------------------------------\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "months_grid <- tibble(\n", - " month_date = seq(\n", - " ymd(paste0(PERIOD_START, \"01\")), # Converts 202201 to \"20220101\" and then to a date\n", - " ymd(paste0(PERIOD_END, \"01\")), # same\n", - " by = \"months\"\n", - " )\n", - ") %>%\n", - " mutate(\n", - " YEAR = year(month_date),\n", - " MONTH = month(month_date)\n", - " )\n", - "\n", - "dim(months_grid) \n", - "head(months_grid, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb7d9a47-053e-43a5-9e0d-c7b717236f3e", - "metadata": {}, - "outputs": [], - "source": [ - "## Create 
`facility_master` for any (🚨 non-NER) countries\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", - "\n", - " # Programmatically define `ADM2_ID`\n", - " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", - " # Programmatically define `OU_ID`\n", - " HF_LEVEL <- glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\")\n", - "\n", - " facility_master <- dhis2_pyramid_formatted |>\n", - " mutate(\n", - " DATE_OPENED = with_tz(OPENING_DATE, \"UTC\"),\n", - " DATE_CLOSED = with_tz(CLOSED_DATE, \"UTC\")\n", - " ) |>\n", - " select(\n", - " ADM2_ID = all_of(ADMIN_2_LEVEL), \n", - " OU_ID = all_of(HF_LEVEL),\n", - " DATE_OPENED, #= OPENING_DATE,\n", - " DATE_CLOSED #= CLOSED_DATE\n", - ")\n", - "\n", - "head(facility_master)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f283cc96-ed69-43a3-964e-57ccb0180a4a", - "metadata": {}, - "outputs": [], - "source": [ - "## Create a \"complete\" grid of every ADM2_ID for every month ---------------------------------------\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "# This ensures that even if an ADM2_ID has zero open facilities in a month,\n", - "# it will still appear in the final result with a count of 0.\n", - "complete_grid <- expand_grid(\n", - " ADM2_ID = unique(facility_master$ADM2_ID),\n", - " month_date = months_grid$month_date\n", - ") %>%\n", - " mutate(\n", - " YEAR = year(month_date),\n", - " MONTH = month(month_date),\n", - " month_date = with_tz(as_datetime(month_date), \"UTC\") # GP added 0809\n", - " )\n", - "\n", - "head(complete_grid, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "6e905c46-036a-4aa9-85ad-216f846f9e1b", - "metadata": {}, - "outputs": [], - "source": [ - "## Calculate the number of open facilities ---------------------------------------\n", - "\n", - "# # The facility must have opened on or before the last day of the current month. \n", - "# # To calculate the last day: add one month and subtract one day from the first day.\n", - "# complete_grid$month_date[1] # \"2022-01-01\"\n", - "# complete_grid$month_date[1] + months(1) - days(1) # \"2022-01-31\"\n", - "# # The facility must either still be open (DATE_CLOSED is NA) OR it must have closed on or after the first day of that month.\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "open_facilities_count <- facility_master %>%\n", - " # Create a row for every possible combination of facility and month\n", - " crossing(months_grid) %>%\n", - " # A facility is \"open\" if it opened BEFORE the end of the month\n", - " # AND it either never closed (NA) or closed AFTER the start of the month.\n", - " filter(\n", - " DATE_OPENED <= month_date + months(1) - days(1) & # opened on or before the last day of the current month\n", - " (is.na(DATE_CLOSED) | DATE_CLOSED >= month_date) # \n", - " ) %>%\n", - " # Count the number of open facilities for each area and month\n", - " count(ADM2_ID, YEAR, MONTH, name = \"OPEN_FACILITIES_COUNT\")\n", - "\n", - "head(open_facilities_count, 3)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ba854f2-5925-4154-b86e-3e4e7bb6c363", - "metadata": {}, - "outputs": [], - "source": [ - "## Join the counts back to the complete grid to include zeros --------------------------------------\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && 
REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "pyramid_open_facilities <- complete_grid %>%\n", - " left_join(open_facilities_count, by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>%\n", - " # If a month had no open facilities, the count will be NA. Change it to 0.\n", - " # Also rename `OPEN_FACILITIES_COUNT` to `EXPECTED_REPORTS` to use same col name as other methods\n", - " mutate(OPEN_FACILITIES_COUNT = replace_na(OPEN_FACILITIES_COUNT, 0)) %>% # DENOMINATOR: consistent col name across all methods \n", - " select(ADM2_ID, YEAR, MONTH, \n", - " EXPECTED_REPORTS = OPEN_FACILITIES_COUNT) %>%\n", - " arrange(ADM2_ID, YEAR, MONTH)\n", - "\n", - "print(dim(pyramid_open_facilities))\n", - "head(pyramid_open_facilities, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff1a8537-d093-4d5c-8a44-4b729090cced", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "811310a9-df85-4fa3-af9a-b931eaffd7e5", - "metadata": {}, - "source": [ - "### Calculate **Reporting Rate** " - ] - }, - { - "cell_type": "markdown", - "id": "8827cfd6-479b-4025-a379-d20bf20fcfb4", - "metadata": {}, - "source": [ - "**Join df for Denominator**\n", - "\n", - "**Note**
\n", - "in both df's (`dhis2_reporting_expected` OR `routine_active_facilities`) the col `EXPECTED_REPORTS` has the same name to simplify parametrization: only difference between the 2 options is the df to be joined (right element in `left_join()`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "670508a0-3075-4f82-aa2c-d26cf867f13d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- 2. Join `dhis2_reporting_expected` OR `dhis2_calculated_expected` to add `EXPECTED_REPORTS` ------------------------------------------------\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "# Parametrized based on DATAELEMENT_METHOD_DENOMINATOR: left_join() the respective df\n", - "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " # Add df of rep rate extracted directly from DHIS2\n", - " dhis2_routine_submitted_expected <- left_join(\n", - " dhis2_routine_submitted, \n", - " dhis2_reporting_expected |> select(ADM2_ID, YEAR, MONTH, EXPECTED_REPORTS), # `dhis2_reporting_expected`\n", - " by = join_by(ADM2_ID, YEAR, MONTH)\n", - " ) \n", - " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` extracted directly from DHIS2.\")\n", - " \n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", - " # Add df of rep rate CALCULATED based on submissiosn in dhis2 routine data \"active\" facilities\n", - " dhis2_routine_submitted_expected <- left_join(\n", - " dhis2_routine_submitted, \n", - " routine_active_facilities, # has only cols: `YEAR`, `ADM2_ID`, `EXPECTED_REPORTS`\n", - " by = join_by(ADM2_ID, YEAR) #, MONTH)\n", - " ) \n", - " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 routine data. 
Here, ACTIVE facilities \n", - " are defined as facilities that reported on any of the extracted indicators at least once per year.\")\n", - " \n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " # Add df of rep rate CALCULATED based on OPEN facilities as per PYRAMID RAW\n", - " dhis2_routine_submitted_expected <- left_join(\n", - " dhis2_routine_submitted, \n", - " pyramid_open_facilities, \n", - " by = join_by(ADM2_ID, YEAR, MONTH)\n", - " ) \n", - " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 pyramid. \n", - " This method counts the number of OPEN facilities for each ADM2 per MONTH.\")\n", - "}\n", - "\n", - "# Safety measures ...\n", - "dhis2_routine_submitted_expected <- dhis2_routine_submitted_expected |>\n", - " # ungroup() %>% \n", - " mutate(YEAR = as.integer(YEAR),\n", - " MONTH = as.integer(MONTH)\n", - " ) \n", - "\n", - "\n", - "print(dim(dhis2_routine_submitted_expected))\n", - "head(dhis2_routine_submitted_expected, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6fad303c-b239-4cf9-93a8-fe3ce5c33c37", - "metadata": {}, - "outputs": [], - "source": [ - "# --- 3. 
Calculate `REPORTING_RATE` ------------------------------------------------\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " \n", - "reporting_rate_dataelement <- dhis2_routine_submitted_expected |>\n", - "mutate(\n", - " REPORTING_RATE = SUBMITTED_REPORTS / EXPECTED_REPORTS\n", - " ) \n", - "\n", - "dim(reporting_rate_dataelement)\n", - "head(reporting_rate_dataelement, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68023e8e-f7f6-4201-b097-1996bee57671", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# head(hf_active, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "ae3aa127-c20c-4ca5-af0c-4a4260883cac", - "metadata": {}, - "source": [ - "`#### 🚨 Here 👇 swap denominator: join `dhis2_reporting_expected` to replace `TOTAL_HF` with `EXPECTED_REPORTS``" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a97c7d75-3317-48bc-a2f1-770bf38d141a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " inspect_reporting_rate(reporting_rate_dataelement)\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92651472-26e2-4131-ac02-288122138b0b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# # --- 1. create intermediate df `hf_active_month`: summarize nr of \"active\" (reporting) HF by month ------------------------\n", - "# hf_active_month <- hf_active %>% \n", - "# # filter(ADM1_ID == \"rWrCdr321Qu\") |> # ⚠️⚠️⚠️ TEMP subset just for CODE development ... ! 
⚠️⚠️⚠️\n", - "# dplyr::group_by(ADM2_ID, YEAR, MONTH) %>%\n", - "# dplyr::summarize(\n", - "# SUBMITTED_REPORTS = length(which(ACTIVE == TRUE)), # 🚨 GP changed to BOOLEAN to save space\n", - "# .groups = \"drop\") |>\n", - "# mutate(YEAR = as.integer(YEAR), \n", - "# MONTH = as.integer(MONTH)\n", - "# )\n", - "\n", - "# print(dim(hf_active_month))\n", - "# head(hf_active_month)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db5ad094-0601-4a18-9435-db60c1f4e8ff", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - " reporting_rate_dataelement <- reporting_rate_dataelement |> \n", - " select(all_of(fixed_cols_rr))\n", - " \n", - " head(reporting_rate_dataelement, 3)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05f94483-1524-426e-9fe3-4b9bf572c05e", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "f5827740-2917-4504-9017-9ec7d408e5f4", + "metadata": {}, + "source": [ + "Script structure:\n", + "\n", + " 0. Parameters: set back-up values for parameters, for when the notebook is run manually (_not_ via pipeline)\n", + " 1. Setup:\n", + " * Paths\n", + " * Utils functions\n", + " 2. Load Data\n", + " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", + " * **Reporting** (DHIS2) pre-computed, already formatted & aggregated (output of pipeline ???)\n", + " * **Shapes** (DHIS2) for plotting (this could be removed if we move the plots to \"report/EDA\" nb)\n", + " 3. Calculate **Reporting Rate (RR)**\n", + " * \"**Dataset**\": using pre-computed reportings from DHIS2/SNIS (was: \"DHIS2\")\n", + " * \"**Data Element**\": using calculated expected nr of report (nr of active facilities) (was: \"CONF\")\n", + " 4. 
**Export** reporting rate data to `.../data/dhis2/reporting_rate/` as .parquet (and .csv) files for **either**:\n", + " * data**set**: \"XXX_reporting_rate_**dataset**.parquet\" **or**\n", + " * data**element**: \"XXX_reporting_rate_**dataelement**.parquet\"" + ] + }, + { + "cell_type": "markdown", + "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", + "metadata": {}, + "source": [ + "--------------------" + ] + }, + { + "cell_type": "markdown", + "id": "e962c5a4-6b09-4485-8d71-d842159118d3", + "metadata": {}, + "source": [ + "### To Do:\n", + "* For `DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\"`: **add code** to count OPEN facilities () for **countries with \"normal\" pyramids** (i.e., when no mixing of facilities and admin levels ... !). Atm only code for Niger, which runs only if `COUNTRY_CODE == NER`. Should add similar (but simpler) code for the rest of the countries (i.e, `COUNTRY_CODE != NER`)\n", + "* Check why Data Element **Denominator** `routine_active_facilities` is **calculated at `YEAR` (aggregated) instead of `MONTH`** ... possibly fix this to match granularity of other alternatives for denominator (which are calculated at MONTH level)\n", + "* Modify **report notebook** and/or pipeline.py code so that it does not make the **pipeline FAIL** if `reporting_rate_dataset` or `reporting_rate_dataelement` is **not found** (which is now always the case since we only output 1 file at each run!!)" + ] + }, + { + "cell_type": "markdown", + "id": "0cdfdc73-bb9a-48a8-a26b-84ecbab2e0aa", + "metadata": {}, + "source": [ + "----------------" + ] + }, + { + "cell_type": "markdown", + "id": "339f6d58-0965-40ef-b718-96195d2463f8", + "metadata": {}, + "source": [ + "## Parameters" + ] + }, + { + "cell_type": "markdown", + "id": "dd6cd6f8-b91b-4902-8801-a60e11776f98", + "metadata": {}, + "source": [ + "Set Default values **if _not_ provided by pipeline**
\n", + "This makes the execution flexible and \"safe\": nb can be run manually from here or be executed via pipeline, without having to change anything in the code!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93aac683-8828-4a42-b841-f16c7e8fbb07", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set BACKUP VALUE: root path - NEVER CHANGE THIS!\n", + "if (!exists(\"SNT_ROOT_PATH\")) {\n", + " SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "}\n", + "\n", + "\n", + "# Choose to run either DataSet OR DataElement method\n", + "if (!exists(\"REPORTING_RATE_METHOD\")) {\n", + " # REPORTING_RATE_METHOD <- \"DATASET\" \n", + " REPORTING_RATE_METHOD <- \"DATAELEMENT\"\n", + "}\n", + "\n", + "\n", + "# Data Element method: Choice of which INDICATORS to use to count the nr of reporting facilities \n", + "# CONF\n", + "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_CONF\")) {\n", + " DATAELEMENT_METHOD_NUMERATOR_CONF <- TRUE # FALSE\n", + "}\n", + "\n", + "# SUSP\n", + "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_SUSP\")) {\n", + " DATAELEMENT_METHOD_NUMERATOR_SUSP <- TRUE # FALSE\n", + "}\n", + "\n", + "# TEST\n", + "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_TEST\")) {\n", + " DATAELEMENT_METHOD_NUMERATOR_TEST <- TRUE # FALSE\n", + "}\n", + "\n", + "\n", + "\n", + "# Data Element RR. Choice: which df to use for nr of `EXPECTED_REPORTS` (DENOMINATOR) \n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {\n", + " # DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\" \n", + " DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\" \n", + " # DATAELEMENT_METHOD_DENOMINATOR <- \"DHIS2_EXPECTED_REPORTS\" # ⚠️ only if `REPORTING_RATE_METHOD == \"DATASET\"` && DataSet is available!! ⚠️\n", + "} \n" + ] + }, + { + "cell_type": "markdown", + "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", + "metadata": {}, + "source": [ + "## 1. 
Setup" + ] + }, + { + "cell_type": "markdown", + "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", + "metadata": {}, + "source": [ + "### 1.1. Paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# PROJECT PATHS\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') " + ] + }, + { + "cell_type": "markdown", + "id": "22971de0-1431-4cbd-b8c1-3bd3e1609e0d", + "metadata": {}, + "source": [ + "### 1.2. Utils functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1784fd43-03f3-478b-8148-4b478317ea21", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate.r\"))" + ] + }, + { + "cell_type": "markdown", + "id": "3bbcbd39-54e8-4ece-9244-30d7d30291d2", + "metadata": {}, + "source": [ + "### 1.3. Packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "426ecff6-0b4c-474d-a48d-826002205b89", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List required pcks ----------------> check what are the really required libraries\n", + "required_packages <- c(\"arrow\", # for .parquet\n", + " \"tidyverse\",\n", + " \"stringi\", \n", + " \"jsonlite\", \n", + " \"httr\", \n", + " \"reticulate\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "markdown", + "id": "18a8e0c1-ac09-4435-b6f4-5f91fd916396", + "metadata": {}, + "source": [ + "### 1.3.1. 
OpenHEXA-specific settings" + ] + }, + { + "cell_type": "markdown", + "id": "ebb8c7d5-7c2c-4dbe-a1ba-238419fbedf3", + "metadata": {}, + "source": [ + "#### For 📦{sf}, tell OH where to find stuff ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91a66fb7-dd5e-43fd-a6a2-d8bb9f0315d6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "markdown", + "id": "ac9ee427-020e-47c5-b2c9-5ca24e1f2779", + "metadata": {}, + "source": [ + "#### Set environment to load openhexa.sdk from the right path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa331278-573d-4a22-ab16-da6972d7b0be", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set environment to load openhexa.sdk from the right path\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "markdown", + "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", + "metadata": {}, + "source": [ + "### 1.4. 
Load and check `config` file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config (abort with a readable message if the file cannot be read or parsed)\n", + "\n", + "config_file_name <- \"SNT_config.json\" \n", + "config_json <- tryCatch({\n", + " jsonlite::fromJSON(file.path(CONFIG_PATH, config_file_name)) \n", + " },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration: \", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, config_file_name))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "markdown", + "id": "29182f25-b0cf-46aa-9818-49616cd3f353", + "metadata": {}, + "source": [ + "**Save config fields as variables**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c52654c8-8a19-4e0c-a83b-1bc2eecae6bc", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Generic\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", + "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", + "\n", + "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"Ousmane\"\n", + "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", + "\n", + "# Which reporting rate PRODUCT_UID to use (note that this is a dataset in COD, but 2 dataElements in BFA!)\n", + "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "412572bc-fb96-4f61-ac49-be7f449219b6", + "metadata": { 
+ "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DHIS2_INDICATORS\n", + "log_msg(paste(\"Expecting the following DHIS2 (aggregated) indicators : \", paste(DHIS2_INDICATORS, collapse=\", \")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a0a8562-4a70-455c-9ccf-aa39f4cf4e31", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Fixed cols for routine data formatting \n", + "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') # (OU_NAME has homonymous values!)\n", + "# print(paste(\"Fixed routine data (`dhis2_routine`) columns (always expected): \", paste(fixed_cols, collapse=\", \")))\n", + "log_msg(paste(\"Expecting the following columns from routine data (`dhis2_routine`) : \", paste(fixed_cols, collapse=\", \")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86e82d54-2b00-4c25-9b34-3497d4c88c52", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Fixed cols for exporting RR tables: to export output tables with consistent structure\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') " + ] + }, + { + "cell_type": "markdown", + "id": "dadc7351-e67e-450b-a046-bc64660a7dde", + "metadata": {}, + "source": [ + "### 1.5. 🔍 Check: at least 1 indicator must be selected\n", + "The user can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", + "Alternatively, `CONF` could be made mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cf6e2a4-0822-4a0c-852e-143da5473d20", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "nr_of_indicators_selected <- sum(DATAELEMENT_METHOD_NUMERATOR_CONF, DATAELEMENT_METHOD_NUMERATOR_SUSP, DATAELEMENT_METHOD_NUMERATOR_TEST)\n", + "\n", + "if (nr_of_indicators_selected == 0) {\n", + " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method 'Data Element'! Select at least one (e.g., `CONF`).\"\n", + " cat(msg) \n", + " stop(msg)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", + "metadata": {}, + "source": [ + "## 2. Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", + "metadata": {}, + "source": [ + "### 2.1. 
**Routine** data (DHIS2) \n", + "already formatted & aggregated (output of pipeline XXX)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586e8da8-4e1c-431a-9b8d-1169167e1c09", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DHIS2 Dataset extract identifier\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "# Load file from dataset\n", + "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "msg <- paste0(\"DHIS2 routine data loaded from dataset : \", dataset_name, \" dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2454183-44f7-4e2e-a0cf-ca112aa183bb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Ensure correct data type for numerical columns \n", + "dhis2_routine <- dhis2_routine %>%\n", + " mutate(across(c(PERIOD, YEAR, MONTH), as.numeric))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edb2fcdc-ce0a-4c78-b06a-9f4610ab4714", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(dhis2_routine, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "821e1ebf-b2fa-4469-974e-2e4d27d58854", + "metadata": {}, + "source": [ + "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", + "Only when: `DATAELEMENT_METHOD_NUMERATOR == \"CONF|SUSP|TEST\"`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3f8b89e-a04e-4e0b-9892-95ce2150e7da", + "metadata": { + "vscode": { + 
"languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(dhis2_routine, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "adec5412", + "metadata": {}, + "source": [ + "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", + "Based on which indicator(s) are selected (if any)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bbcdf8c-873a-4b41-980a-f18d1863ab8f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Initialize empty vector\n", + "indicators_selected = c()\n", + "\n", + "# Add elements based on user selection(s)\n", + "if (DATAELEMENT_METHOD_NUMERATOR_CONF) {\n", + " indicators_selected = append(indicators_selected, \"CONF\")\n", + "}\n", + "\n", + "if (DATAELEMENT_METHOD_NUMERATOR_SUSP) {\n", + " indicators_selected = append(indicators_selected, \"SUSP\")\n", + "}\n", + "\n", + "if (DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", + " indicators_selected = append(indicators_selected, \"TEST\")\n", + "}\n", + "\n", + "print(paste0(\"Selected indicators: \", paste(indicators_selected, collapse = \", \")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b84753f8-aa9c-4563-beae-5e29b3f1e773", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# This is kinda useless now but KEEP in case we ADD MORE CHOICES OF INDICATORS!! 
\n", + "if(REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " if (DATAELEMENT_METHOD_NUMERATOR_CONF | DATAELEMENT_METHOD_NUMERATOR_SUSP | DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", + " log_msg(paste0(\"Indicator(s) \", paste(indicators_selected, collapse = \", \") , \" selected for calculation of numerator for method `Data Element`.\" ))\n", + " \n", + " if ( length(which(indicators_selected %in% names(dhis2_routine))) < length(indicators_selected) ) { # at least one selected indicator has no column in `dhis2_routine`\n", + " log_msg(paste0(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: \", paste(indicators_selected, collapse = \", \"), \".\"), \"warning\")\n", + " } \n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "c832da26-fe0c-43fe-8300-2fff5c4cbf34", + "metadata": {}, + "source": [ + "### 2.2. **Reporting** pre-computed from DHIS2 \n", + "Data granularity:\n", + "* **ADM2**\n", + "* **MONTH** (PERIOD)\n", + "\n", + "Note: data comes from different dataset (`PRODUCT_NAME`): `A SERVICES DE BASE`, `B SERVICES SECONDAIRES`,`D SERVICE HOPITAL` \n", + "\n", + "The col `PRODUCT_METRIC` indicates whether the `VALUE` is `EXPECTED_REPORTS` or `ACTUAL_REPORTS`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ce295b9-9898-4e12-8a91-92bb25b9e0a2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# REPORTING_RATE_METHOD <- \"DATAELEMENT\" # \"DATASET\"\n", + "REPORTING_RATE_METHOD" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a32fc96-5b8e-4108-a224-c0d843df9b47", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " # DHIS2 Dataset extract identifier\n", + " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + " file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", + " \n", + " # Load file from dataset\n", + " 
dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pre-computed REPORTING data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + " \n", + " msg <- paste0(\"DHIS2 pre-computed REPORTING data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). Dataframe dimensions: \", \n", + " paste(dim(dhis2_reporting), collapse=\", \"))\n", + " log_msg(msg)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e131d9ee-0e88-4bb6-982b-53b1229fba5f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " # Convert VALUE col to - should not be needed but keep as safety measure \n", + " dhis2_reporting <- dhis2_reporting |>\n", + " mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", + "\n", + " head(dhis2_reporting, 3)\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46e3dba8-d46b-457e-ba90-c663e30c42d2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # Convert VALUE col to - should not be needed but keep as safety measure \n", + "# dhis2_reporting <- dhis2_reporting |>\n", + "# mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5149befe-b6ad-46a9-9879-7637ce5b02be", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# head(dhis2_reporting, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "7a967af3-f6e5-428a-8769-72808f21a125", + "metadata": {}, + "source": [ + "#### 2.2.1. 
**Filter** to keep only values for `PRODUCT_UID` defined in config.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1948c2f7-7a2c-47a2-9dc6-ba29da6d030c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "REPORTING_RATE_PRODUCT_ID" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4258098-e24c-4520-914d-0f73354bb3ab", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + "\n", + " # Missing REPORTING_RATE_PRODUCT_ID: only log the problem; otherwise look up and log the product name - to be improved 🚧\n", + " if (is.null(REPORTING_RATE_PRODUCT_ID)) {\n", + " log_msg(\"🛑 Problem with definition of REPORTING_RATE_PRODUCT_ID, check `SNT_config.json` file!\")\n", + " } else {\n", + " product_name <- dhis2_reporting |> filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |> pull(PRODUCT_NAME) |> unique()\n", + " log_msg(glue::glue(\"Using REPORTING_RATE_PRODUCT_ID == `{REPORTING_RATE_PRODUCT_ID}`, corresponding to DHIS2 Product name : `{product_name}`.\"))\n", + " }\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c22c6ada-7cb1-4fca-b65e-b51e5eca35a2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + "\n", + " dhis2_reporting_filtered <- dhis2_reporting |>\n", + " filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |>\n", + " select(-PRODUCT_UID, -PRODUCT_NAME) # useless cols now\n", + " \n", + " print(dim(dhis2_reporting_filtered))\n", + " head(dhis2_reporting_filtered)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9da035cf-5d3f-4df0-a063-2d2497616c82", + "metadata": {}, + "source": [ + "#### 2.2.2. 
Format to produce `dhis2_reporting_expected`\n", + "🚨 Note: Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denominator for REPORTING_RATE calculations (methods dataset and dataelement)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e970f9d-258e-4050-ae69-185b88c79fc3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " \n", + " dhis2_reporting_wide <- dhis2_reporting_filtered |> \n", + " pivot_wider(\n", + " names_from = PRODUCT_METRIC, \n", + " values_from = VALUE\n", + " )\n", + " \n", + " print(dim(dhis2_reporting_wide))\n", + " head(dhis2_reporting_wide)\n", + " \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eab31756-ae6b-4152-8ec3-8195236d8732", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denominator for RR calculations (methods ANY and CONF)\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " \n", + " dhis2_reporting_expected <- dhis2_reporting_wide |> \n", + " select(-ACTUAL_REPORTS)\n", + " \n", + " print(dim(dhis2_reporting_expected))\n", + " head(dhis2_reporting_expected)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3c3d0f35-889d-4c16-9741-d6e75e2ef096", + "metadata": {}, + "source": [ + "#### 2.2.3. **Checks** on data completeness: _do **periods match** with routine data?_\n", + "Lack of perfect overlap in periods between routine data and reporting rate data might create headaches downstream!
\n", + "Specifically, **incidence** calculations will show **N2 smaller than N1** due to **aggregation by YEAR when NA** values are present!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ea57600-418b-45bc-805a-f829e237b4c4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " \n", + " # --- Check Year Compatibility ---\n", + " routine_years <- sort(unique(as.integer(dhis2_routine$YEAR))) # as.integer\n", + " expected_years <- sort(unique(as.integer(dhis2_reporting_expected$YEAR))) # as.integer\n", + " \n", + " if (!setequal(routine_years, expected_years)) {\n", + " missing_in_routine <- setdiff(expected_years, routine_years)\n", + " missing_in_expected <- setdiff(routine_years, expected_years)\n", + " \n", + " if (length(missing_in_routine) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_reporting_expected' but not in 'dhis2_routine': \",\n", + " paste(missing_in_routine, collapse = \", \")))\n", + " }\n", + " if (length(missing_in_expected) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_routine' but not in 'dhis2_reporting_expected': \",\n", + " paste(missing_in_expected, collapse = \", \")))\n", + " }\n", + " } else {\n", + " log_msg(\"✅ YEAR values are consistent across 'dhis2_routine' and 'dhis2_reporting_expected'.\")\n", + " \n", + " # --- Check Month Compatibility (if years are consistent) ---\n", + " all_years <- unique(routine_years) # Or expected_years, they are the same now\n", + " \n", + " for (year_val in all_years) {\n", + " routine_months_for_year <- dhis2_routine %>%\n", + " filter(YEAR == year_val) %>%\n", + " pull(MONTH) %>%\n", + " unique() %>%\n", + " sort()\n", + " \n", + " expected_months_for_year <- dhis2_reporting_expected %>%\n", + " filter(YEAR == year_val) %>%\n", + " pull(MONTH) %>%\n", + " unique() 
%>%\n", + " sort()\n", + " \n", + " if (!setequal(routine_months_for_year, expected_months_for_year)) {\n", + " missing_in_routine_months <- setdiff(expected_months_for_year, routine_months_for_year)\n", + " missing_in_expected_months <- setdiff(routine_months_for_year, expected_months_for_year)\n", + " \n", + " if (length(missing_in_routine_months) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_routine_months, collapse = \", \"),\n", + " \"' present in 'dhis2_reporting_expected' but not in 'dhis2_routine'!\"\n", + " ))\n", + " }\n", + " if (length(missing_in_expected_months) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_expected_months, collapse = \", \"), \n", + " \"' present in 'dhis2_routine' but not in 'dhis2_reporting_expected'!\"\n", + " ))\n", + " }\n", + " } else {\n", + " log_msg(paste0(\"✅ For year \", year_val, \", months are consistent across both data frames.\"))\n", + " }\n", + " }\n", + " }\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "5e711191-995b-4f89-b10c-fc2214cdd8b2", + "metadata": {}, + "source": [ + "### 2.3. **Pyramid** to count OPEN facilities (denominator)\n", + "Table (and column) needed for denominator of \"Data Element\" reporting rate if choice == `PYRAMID_OPEN_FACILITIES`\n", + "\n", + "**Important**: the pyramid must contain the `OPENING_DATE` and `CLOSING_DATE` columns (this was implemented in the new extraction pipeline from 2025-09).
\n", + "Then, **depending on the Country** (well, their pyramid structure) **import** either:\n", + "* **Raw** pyramid for 🇳🇪 Niger: because first need to \"manually\" correctly aggregate the VALUEs for the HF (separate them from admin levels and sum up HD units)\n", + "* **Formatted** pyramid for all other countries encountered so far: 🇨🇩 DRC, 🇧🇫 Burkina Faso ... because their pyramid is already usable right away" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad3f83b6-2fdd-45da-8a4d-fb06513b6be2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\"\n", + "DATAELEMENT_METHOD_DENOMINATOR" + ] + }, + { + "cell_type": "markdown", + "id": "e7b80b6e-9e34-4e71-93e8-7e16a110e17c", + "metadata": {}, + "source": [ + "#### **Raw** pyramid for 🇳🇪 **Niger**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "652cf1a7-c9a2-48db-b44d-8fabfd0e072f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + " # DHIS2 Dataset extract identifier\n", + " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", + " \n", + " # Load file from dataset\n", + " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", + " dhis2_pyramid_raw <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pyramid RAW data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + " \n", + " msg <- paste0(\"DHIS2 RAW pyramid data loaded from dataset : `\", dataset_name, \"`. 
Dataframe dimensions: \", paste(dim(dhis2_pyramid_raw), collapse=\", \"))\n", + " log_msg(msg)\n", + " \n", + " head(dhis2_pyramid_raw)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "f1716ac3-ce8f-4223-9729-6ed826e743bc", + "metadata": {}, + "source": [ + "#### **Formatted** pyramid for all other countries (normal pyramid) 🇨🇩 🇧🇫" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc16ae54-4915-4333-b458-2b611e2b1792", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", + " \n", + " # DHIS2 Dataset extract identifier\n", + " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + " \n", + " # Load file from dataset\n", + " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", + " dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + " \n", + " msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset : `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", + " log_msg(msg)\n", + " \n", + " head(dhis2_pyramid_formatted)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "eb4c5c63-d140-46b8-b686-886e612a31dc", + "metadata": {}, + "source": [ + "## 3. Calculate **Reporting Rate** (RR)\n", + "We compute it using 2 approaches, the user can decide later on which one to use for incidence adjustment." + ] + }, + { + "cell_type": "markdown", + "id": "cb724aa8-5f06-4e99-aeca-640d0c1b049e", + "metadata": {}, + "source": [ + "## 3.1. 
\"**Dataset**\" reporting rate: pre-computed, from **DHIS2**\n", + "Extracted from DHIS2 and formatted. \n", + "\n", + "Straightforward: `ACTUAL_REPORTS` / `EXPECTED_REPORTS` (just pivot `PRODUCT_METRIC` and divide)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10b2f52b-0217-43f1-88a3-cd01d98869b1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + "\n", + " reporting_rate_dataset <- dhis2_reporting_wide |> \n", + " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", + " \n", + " print(dim(reporting_rate_dataset))\n", + " head(reporting_rate_dataset, 3)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3d49eda8-b4fd-437a-8938-17bf0806f281", + "metadata": {}, + "source": [ + "#### Quick data quality check 🔍" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cff33416-ea66-4eeb-9d33-1597c2f05b0c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# inspect_reporting_rate() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2f4c11c-c683-4204-ab91-9d41cab4826c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " inspect_reporting_rate(reporting_rate_dataset)\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "04870e93-5385-425b-89fd-b815a87cfa21", + "metadata": {}, + "source": [ + "#### Subset cols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d90671b0-36f8-4c6e-8736-4ea807079f83", + "metadata": { + "scrolled": true, + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | 
DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " reporting_rate_dataset <- reporting_rate_dataset |> \n", + " select(all_of(fixed_cols_rr))\n", + " \n", + " dim(reporting_rate_dataset)\n", + " head(reporting_rate_dataset, 3)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62e6cb16-0196-447f-b142-aaec2120eecb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "2dc27c07-80cd-465e-891f-9fb70111dbb0", + "metadata": {}, + "source": [ + "#### Plot by MONTH (heatmap)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9ce56fc-f86a-4a2b-95b7-fb6ec5b89087", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", + " \n", + " # Plot reporting rate heatmap\n", + " options(repr.plot.width = 20, repr.plot.height = 10) \n", + " \n", + " # reporting_rate_conf_month %>%\n", + " reporting_rate_dataset %>%\n", + " mutate(\n", + " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", + " ) %>%\n", + " ggplot(., aes(x = DATE, \n", + " y = factor(ADM2_ID), \n", + " fill = REPORTING_RATE * 100)\n", + " ) + \n", + " geom_tile() +\n", + " scale_fill_viridis_c(\n", + " option = \"C\",\n", + " direction = 1, # blue = low, yellow = high\n", + " limits = c(0, 100),\n", + " name = \"Reporting rate (%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Monthly Reporting Rate by Health District - Method 'DataSet'\",\n", + " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", + " x = \"Month\",\n", + " y = \"Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 13) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", + " plot.subtitle = 
element_text(hjust = 0.5, size = 12),\n", + " legend.position = \"right\",\n", + " panel.grid = element_blank()\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00bf15b7-baa7-4734-8133-8d4a9cc843a3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "40b21c65-1b75-42f7-821a-24d31e436c73", + "metadata": {}, + "source": [ + "----------------------------" + ] + }, + { + "cell_type": "markdown", + "id": "17ffece4-9420-4004-993b-b5692cc1d2de", + "metadata": {}, + "source": [ + "## 3.2. **Data Element** reporting rate: based on reporting of one or more indicators\n", + "**_Partially_ following methods by WHO and as per Diallo (2025) paper**\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single indicator (i.e., **confirmed** malaria case as `CONF`) or for _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "\n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", + "\n", + "For this method the user is allowed to **choose** how to calculate both the **numerator** and the **denominator**.
\n", + "Specifically:\n", + "* **Numerator**: is the number of **facilities that _actually reported_** data, and it is estimated based on whether a facility (FoSa, or HF, or `OU_ID`) **submitted data** for **_any_** of the following **indicators**:\n", + " * `CONF`: confirmed malaria cases and/or\n", + " * `SUSP`: suspected malaria cases and/or\n", + " * `TEST`: tested malaria cases
\n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "\n", + "
\n", + " \n", + "* **Denominator**: is the number of **facilities _expected_ to report**. This number can be obtained in two different ways:\n", + " * `\"DHIS2_EXPECTED_REPORTS\"`: uses the col `EXPECTED_REPORTS` from the df `dhis2_reporting_expected`.
\n", + " This is obtained directly from DHIS2, and is the same denominator used to calculate the \"Dataset\" reporting rate.\n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `routine_active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (`OU_ID`), defined as those that submitted _any_ data **at least once in a given year**, across ***all*** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + "\n", + "
\n", + "\n", + "This method improves over simple binary completeness flags by accounting for both spatial (facility coverage) and temporal (monthly timeliness) dimensions.
" + ] + }, + { + "cell_type": "markdown", + "id": "f5dcd3b9-6f02-4fc5-9e5f-2253c015a3d4", + "metadata": {}, + "source": [ + "### Calculate the **numerator**" + ] + }, + { + "cell_type": "markdown", + "id": "a90d9f4a-a058-4ad5-8ef2-f827987b5def", + "metadata": {}, + "source": [ + "**Note**: the col `REPORTED` keeps the same name regardless of the value of `DATAELEMENT_METHOD_NUMERATOR` because \n", + "in this way the code needs to be parametrized only once (here).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8076609c-46e8-478a-8283-bc63a70102f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "dhis2_routine_active <- dhis2_routine %>%\n", + " mutate(\n", + " # if_any() returns TRUE if the condition is met for any of the selected columns\n", + " ACTIVE = if_else(if_any(all_of(indicators_selected), ~ !is.na(.x)), 1, 0)\n", + " )\n", + "\n", + "log_msg(paste0(\"Evaluating reporting facilities based on indicators: \", paste(indicators_selected, collapse = \", \"), \".\"))\n", + "\n", + "dim(dhis2_routine_active)\n", + "head(dhis2_routine_active, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "325faf35-ed25-4b8e-b421-934a2852f27e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1773313-17e5-478d-b60d-c1193233204d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# --- 1. 
Calculate `SUBMITTED_REPORTS` as the nr of ACTIVE facilities (that REPORTED, each month) ------------------------\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "dhis2_routine_submitted <- dhis2_routine_active %>% # OLD: dhis2_routine_reporting_month <- dhis2_routine_reporting %>%\n", + " group_by(ADM2_ID, YEAR, MONTH) %>% \n", + " summarise(\n", + " SUBMITTED_REPORTS = sum(ACTIVE, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " ungroup() %>% \n", + " mutate(YEAR = as.integer(YEAR),\n", + " MONTH = as.integer(MONTH)\n", + " ) \n", + "\n", + "print(dim(dhis2_routine_submitted))\n", + "head(dhis2_routine_submitted, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a25647e3-5674-44e0-855e-c3a48483310d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "15f4c12f", + "metadata": {}, + "source": [ + "### Calculate the **denominator**" + ] + }, + { + "cell_type": "markdown", + "id": "06b2070d-c672-425f-a78f-b94a8d16a017", + "metadata": {}, + "source": [ + "#### Option: `ROUTINE_ACTIVE_FACILITIES`\n", + "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`ROUTINE_ACTIVE_FACILITIES`** " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08f03ed1-5831-4fe5-8bde-674a513e8110", + "metadata": { + "scrolled": true, + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate the tot nr of facilities (distinct OU_ID) based on all HF that appear in the routine data (each YEAR)\n", + "# meaning: regardless of what indicators they submit data for, as long as they have submitted something\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", + " routine_active_facilities <- dhis2_routine %>%\n", + " # Keep only rows where at least one indicator has non-NA value\n", + 
" filter(if_any(any_of(DHIS2_INDICATORS), ~ !is.na(.))) %>%\n", + " group_by(YEAR, ADM2_ID) %>%\n", + " summarize(\n", + " EXPECTED_REPORTS = n_distinct(OU_ID),\n", + " .groups = \"drop\" # remove grouping \n", + " )\n", + "\n", + " nr_of_rows <- nrow(routine_active_facilities)\n", + " log_msg(glue::glue(\"Produced df `routine_active_facilities`, with column `EXPECTED_REPORTS` calculated from DHIS2 routine data. Dataframe `routine_active_facilities` has {nr_of_rows} rows.\"))\n", + "\n", + " head(routine_active_facilities, 3)\n", + " \n", + "} \n" + ] + }, + { + "cell_type": "markdown", + "id": "6629dccb-97b0-4b0e-b23f-15b98704323d", + "metadata": {}, + "source": [ + "#### Option: `PYRAMID_OPEN_FACILITIES`\n", + "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`PYRAMID_OPEN_FACILITIES`** " + ] + }, + { + "cell_type": "markdown", + "id": "0972ffca-c14a-4b93-85ff-027d056c3759", + "metadata": {}, + "source": [ + "------------------" + ] + }, + { + "cell_type": "markdown", + "id": "d49219b7-5932-4062-a10d-e1f3a4a81449", + "metadata": {}, + "source": [ + "#### TEMPORARY! 🇳🇪 **Niger-specific method**\n", + "🚨 Specific to **Niger EnDoP**: Pre-processing needed to separate facilities from adm levels!! 🚨
\n", + "\n", + "⚠️⚠️⚠️ **TEMPORARY: This will be moved to a dedicated pipeline!** ⚠️⚠️⚠️
\n", + "\n", + "Specifically:\n", + "* **Hospital**s (HD a Hopital District): at **level 4** together with Aires de Sante\n", + "* All other **FoSa**s: at **level 6**, also mixed with the hospital units\n", + "\n", + "Therefore, to assign closed/open status, it is necessary to attach to each individual facility the closing and opening date columns. \n", + "To do this: \n", + "1) extract list of facilities and id across the 2 levels (4 and 6) and\n", + "2) calculate the nr of open facilities per MONTH (PERIOD) per ADM2, ending up with a df with cols: `ADM2_ID`, `YEAR`, `MONTH`, `OPEN_FACILITIES_COUNT` = `EXPECTED_REPORTS`\n", + "3) add this to the df with the **numerator** (`dhis2_routine_submitted`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6296329-9bcd-4d2c-afb3-520c6a159cdb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "# names(dhis2_pyramid_raw)\n", + "dim(dhis2_pyramid_raw)\n", + "head(dhis2_pyramid_raw, 3)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "6f651b18-2d85-4e26-8952-45dae9020c40", + "metadata": {}, + "source": [ + "#### 1. 
Create df with list of all **facilities** with their `DATE_OPENED` and `DATE_CLOSED`: `facility_master`\n", + "Separate \"facilities\" (of any type, such as hospitals to CSI, Infermieres etc) from admin levels and hospital units (wards, depts...)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25dea3c7-44ea-42a7-b467-f470892fcfef", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# is_aire_l5() and is_hospital_l4() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55760b89-f4d0-40c5-9905-f7c7c4fee5c0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List of all FoSa (from Aires → Level 6)\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "fosa_master <- dhis2_pyramid_raw %>%\n", + " filter(is_aire_l5(LEVEL_5_NAME)) %>%\n", + " distinct(\n", + " OU_ID = LEVEL_6_ID,\n", + " OU_NAME = LEVEL_6_NAME,\n", + " region = LEVEL_2_NAME,\n", + " district = LEVEL_3_NAME,\n", + " ADM2_ID = LEVEL_3_ID,\n", + " DATE_OPENED = OPENING_DATE, \n", + " DATE_CLOSED = CLOSED_DATE\n", + " ) %>%\n", + " mutate(OU_TYPE = \"FoSa\")\n", + "\n", + "dim(fosa_master)\n", + "head(fosa_master)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e9c9f59-9c6c-44e4-bbd9-13c3917f5117", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List of all Hospitals (from Level 4, aggregate dates across children)\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "hosp_master <- dhis2_pyramid_raw %>%\n", + "filter(is_hospital_l4(LEVEL_4_NAME)) %>%\n", + "group_by(LEVEL_4_ID, LEVEL_4_NAME, LEVEL_2_NAME, LEVEL_3_NAME, 
LEVEL_3_ID) %>%\n", + "summarise(\n", + " OPENING_DATE = suppressWarnings(min(OPENING_DATE, na.rm = TRUE)),\n", + " CLOSED_DATE = suppressWarnings(max(CLOSED_DATE, na.rm = TRUE)),\n", + " .groups = \"drop\"\n", + ") %>%\n", + "mutate(\n", + " DATE_OPENED = ifelse(is.infinite(OPENING_DATE), NA, OPENING_DATE) |> as_datetime(),\n", + " DATE_CLOSED = ifelse(is.infinite(CLOSED_DATE), NA, CLOSED_DATE) |> as_datetime()\n", + " ) %>%\n", + "distinct(\n", + " OU_ID = LEVEL_4_ID, \n", + " OU_NAME = LEVEL_4_NAME,\n", + " region=LEVEL_2_NAME,\n", + " district=LEVEL_3_NAME,\n", + " ADM2_ID=LEVEL_3_ID,\n", + " DATE_OPENED,\n", + " DATE_CLOSED\n", + ") %>%\n", + "mutate(\n", + " OU_TYPE = \"Hospital\"\n", + " )\n", + "\n", + "dim(hosp_master)\n", + "head(hosp_master)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5859e393-bfac-46e6-b103-cb8177100860", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Merge both\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "facility_master <- bind_rows(fosa_master, hosp_master) %>% \n", + " select(ADM2_ID, \n", + " OU_ID, \n", + " DATE_OPENED, \n", + " DATE_CLOSED)\n", + "\n", + "dim(facility_master)\n", + "head(facility_master, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "191c3b01-1645-410e-be99-b247bf5f9cfb", + "metadata": {}, + "source": [ + "---------------------" + ] + }, + { + "cell_type": "markdown", + "id": "1ee48156-5ed5-43a5-b927-caa53c10d98e", + "metadata": {}, + "source": [ + "#### **Generic** part: applies to **all countries**" + ] + }, + { + "cell_type": "markdown", + "id": "3aa057a5-6f68-493e-83e2-81bafce42c9e", + "metadata": {}, + "source": [ + "#### 2. 
Calculate nr of **OPEN facilities** for each `MONTH` per `ADM2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91f972f1-bcc9-458f-9662-5574efc7ac9d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Define start and end period based on routine data \n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "PERIOD_START <- dhis2_routine$PERIOD |> min()\n", + "PERIOD_END <- dhis2_routine$PERIOD |> max()\n", + "\n", + "print(paste0(\"Start period: \", PERIOD_START))\n", + "print(paste0(\"End period :\", PERIOD_END))\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c6ed56f-b2a6-460c-a3dc-6c588c40b54c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create a \"complete\" grid of every month and year for the period range ---------------------------------------\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "months_grid <- tibble(\n", + " month_date = seq(\n", + " ymd(paste0(PERIOD_START, \"01\")), # Converts 202201 to \"20220101\" and then to a date\n", + " ymd(paste0(PERIOD_END, \"01\")), # same\n", + " by = \"months\"\n", + " )\n", + ") %>%\n", + " mutate(\n", + " YEAR = year(month_date),\n", + " MONTH = month(month_date)\n", + " )\n", + "\n", + "dim(months_grid) \n", + "head(months_grid, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb7d9a47-053e-43a5-9e0d-c7b717236f3e", + "metadata": { + 
"vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create `facility_master` for any (🚨 non-NER) countries\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", + "\n", + " # Programmatically define `ADM2_ID`\n", + " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", + " # Programmatically define `OU_ID`\n", + " HF_LEVEL <- glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\")\n", + "\n", + " facility_master <- dhis2_pyramid_formatted |>\n", + " mutate(\n", + " DATE_OPENED = with_tz(OPENING_DATE, \"UTC\"),\n", + " DATE_CLOSED = with_tz(CLOSED_DATE, \"UTC\")\n", + " ) |>\n", + " select(\n", + " ADM2_ID = all_of(ADMIN_2_LEVEL), \n", + " OU_ID = all_of(HF_LEVEL),\n", + " DATE_OPENED, #= OPENING_DATE,\n", + " DATE_CLOSED #= CLOSED_DATE\n", + ")\n", + "\n", + "head(facility_master)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f283cc96-ed69-43a3-964e-57ccb0180a4a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create a \"complete\" grid of every ADM2_ID for every month ---------------------------------------\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "# This ensures that even if an ADM2_ID has zero open facilities in a month,\n", + "# it will still appear in the final result with a count of 0.\n", + "complete_grid <- expand_grid(\n", + " ADM2_ID = unique(facility_master$ADM2_ID),\n", + " month_date = months_grid$month_date\n", + ") %>%\n", + " mutate(\n", + " YEAR = year(month_date),\n", + " MONTH = month(month_date),\n", + " month_date = with_tz(as_datetime(month_date), 
\"UTC\") # GP added 0809\n", + " )\n", + "\n", + "head(complete_grid, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e905c46-036a-4aa9-85ad-216f846f9e1b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Calculate the number of open facilities ---------------------------------------\n", + "\n", + "# # The facility must have opened on or before the last day of the current month. \n", + "# # To calculate the last day: add one month and subtract one day from the first day.\n", + "# complete_grid$month_date[1] # \"2022-01-01\"\n", + "# complete_grid$month_date[1] + months(1) - days(1) # \"2022-01-31\"\n", + "# # The facility must either still be open (DATE_CLOSED is NA) OR it must have closed on or after the first day of that month.\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "open_facilities_count <- facility_master %>%\n", + " # Create a row for every possible combination of facility and month\n", + " crossing(months_grid) %>%\n", + " # A facility is \"open\" if it opened BEFORE the end of the month\n", + " # AND it either never closed (NA) or closed AFTER the start of the month.\n", + " filter(\n", + " DATE_OPENED <= month_date + months(1) - days(1) & # opened on or before the last day of the current month\n", + " (is.na(DATE_CLOSED) | DATE_CLOSED >= month_date) # \n", + " ) %>%\n", + " # Count the number of open facilities for each area and month\n", + " count(ADM2_ID, YEAR, MONTH, name = \"OPEN_FACILITIES_COUNT\")\n", + "\n", + "head(open_facilities_count, 3)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ba854f2-5925-4154-b86e-3e4e7bb6c363", + "metadata": { + "vscode": { + "languageId": "r" + 
} + }, + "outputs": [], + "source": [ + "## Join the counts back to the complete grid to include zeros --------------------------------------\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "pyramid_open_facilities <- complete_grid %>%\n", + " left_join(open_facilities_count, by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>%\n", + " # If a month had no open facilities, the count will be NA. Change it to 0.\n", + " # Also rename `OPEN_FACILITIES_COUNT` to `EXPECTED_REPORTS` to use same col name as other methods\n", + " mutate(OPEN_FACILITIES_COUNT = replace_na(OPEN_FACILITIES_COUNT, 0)) %>% # DENOMINATOR: consistent col name across all methods \n", + " select(ADM2_ID, YEAR, MONTH, \n", + " EXPECTED_REPORTS = OPEN_FACILITIES_COUNT) %>%\n", + " arrange(ADM2_ID, YEAR, MONTH)\n", + "\n", + "print(dim(pyramid_open_facilities))\n", + "head(pyramid_open_facilities, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff1a8537-d093-4d5c-8a44-4b729090cced", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "811310a9-df85-4fa3-af9a-b931eaffd7e5", + "metadata": {}, + "source": [ + "### Calculate **Reporting Rate** " + ] + }, + { + "cell_type": "markdown", + "id": "8827cfd6-479b-4025-a379-d20bf20fcfb4", + "metadata": {}, + "source": [ + "**Join df for Denominator**\n", + "\n", + "**Note**
\n", + "in both df's (`dhis2_reporting_expected` OR `routine_active_facilities`) the col `EXPECTED_REPORTS` has the same name to simplify parametrization: only difference between the 2 options is the df to be joined (right element in `left_join()`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "670508a0-3075-4f82-aa2c-d26cf867f13d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# --- 2. Join `dhis2_reporting_expected` OR `dhis2_calculated_expected` to add `EXPECTED_REPORTS` ------------------------------------------------\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "# Parametrized based on DATAELEMENT_METHOD_DENOMINATOR: left_join() the respective df\n", + "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " # Add df of rep rate extracted directly from DHIS2\n", + " dhis2_routine_submitted_expected <- left_join(\n", + " dhis2_routine_submitted, \n", + " dhis2_reporting_expected |> select(ADM2_ID, YEAR, MONTH, EXPECTED_REPORTS), # `dhis2_reporting_expected`\n", + " by = join_by(ADM2_ID, YEAR, MONTH)\n", + " ) \n", + " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` extracted directly from DHIS2.\")\n", + " \n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", + " # Add df of rep rate CALCULATED based on submissiosn in dhis2 routine data \"active\" facilities\n", + " dhis2_routine_submitted_expected <- left_join(\n", + " dhis2_routine_submitted, \n", + " routine_active_facilities, # has only cols: `YEAR`, `ADM2_ID`, `EXPECTED_REPORTS`\n", + " by = join_by(ADM2_ID, YEAR) #, MONTH)\n", + " ) \n", + " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 routine data. 
Here, ACTIVE facilities \n", + " are defined as facilities that reported on any of the extracted indicators at least once per year.\")\n", + " \n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " # Add df of rep rate CALCULATED based on OPEN facilities as per PYRAMID RAW\n", + " dhis2_routine_submitted_expected <- left_join(\n", + " dhis2_routine_submitted, \n", + " pyramid_open_facilities, \n", + " by = join_by(ADM2_ID, YEAR, MONTH)\n", + " ) \n", + " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 pyramid. \n", + " This method counts the number of OPEN facilities for each ADM2 per MONTH.\")\n", + "}\n", + "\n", + "# Safety measures ...\n", + "dhis2_routine_submitted_expected <- dhis2_routine_submitted_expected |>\n", + " # ungroup() %>% \n", + " mutate(YEAR = as.integer(YEAR),\n", + " MONTH = as.integer(MONTH)\n", + " ) \n", + "\n", + "\n", + "print(dim(dhis2_routine_submitted_expected))\n", + "head(dhis2_routine_submitted_expected, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fad303c-b239-4cf9-93a8-fe3ce5c33c37", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# --- 3. 
Calculate `REPORTING_RATE` ------------------------------------------------\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " \n", + "reporting_rate_dataelement <- dhis2_routine_submitted_expected |>\n", + "mutate(\n", + " REPORTING_RATE = SUBMITTED_REPORTS / EXPECTED_REPORTS\n", + " ) \n", + "\n", + "dim(reporting_rate_dataelement)\n", + "head(reporting_rate_dataelement, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68023e8e-f7f6-4201-b097-1996bee57671", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# head(hf_active, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "ae3aa127-c20c-4ca5-af0c-4a4260883cac", + "metadata": {}, + "source": [ + "`#### 🚨 Here 👇 swap denominator: join `dhis2_reporting_expected` to replace `TOTAL_HF` with `EXPECTED_REPORTS``" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a97c7d75-3317-48bc-a2f1-770bf38d141a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " inspect_reporting_rate(reporting_rate_dataelement)\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92651472-26e2-4131-ac02-288122138b0b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # --- 1. create intermediate df `hf_active_month`: summarize nr of \"active\" (reporting) HF by month ------------------------\n", + "# hf_active_month <- hf_active %>% \n", + "# # filter(ADM1_ID == \"rWrCdr321Qu\") |> # ⚠️⚠️⚠️ TEMP subset just for CODE development ... ! 
⚠️⚠️⚠️\n", + "# dplyr::group_by(ADM2_ID, YEAR, MONTH) %>%\n", + "# dplyr::summarize(\n", + "# SUBMITTED_REPORTS = length(which(ACTIVE == TRUE)), # 🚨 GP changed to BOOLEAN to save space\n", + "# .groups = \"drop\") |>\n", + "# mutate(YEAR = as.integer(YEAR), \n", + "# MONTH = as.integer(MONTH)\n", + "# )\n", + "\n", + "# print(dim(hf_active_month))\n", + "# head(hf_active_month)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db5ad094-0601-4a18-9435-db60c1f4e8ff", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + " reporting_rate_dataelement <- reporting_rate_dataelement |> \n", + " select(all_of(fixed_cols_rr))\n", + " \n", + " head(reporting_rate_dataelement, 3)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05f94483-1524-426e-9fe3-4b9bf572c05e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "73ed8e24-1aab-47af-9d91-5bc4899a40e9", + "metadata": {}, + "source": [ + "`#### Quick data quality check 🔍`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "713a5ed3-2aeb-4949-8ecc-6ee3f787a719", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "# Plot reporting rate heatmap\n", + "options(repr.plot.width = 20, repr.plot.height = 10) \n", + "\n", + "# reporting_rate_conf_month %>%\n", + "reporting_rate_dataelement %>%\n", + "mutate(\n", + " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", + " ) %>%\n", + "ggplot(., aes(x = DATE, \n", + " y = factor(ADM2_ID), \n", + " fill = REPORTING_RATE * 100)\n", + " ) + \n", + " geom_tile() +\n", + " scale_fill_viridis_c(\n", + " option = \"C\",\n", + " direction = 1, # blue = low, yellow = high\n", + " limits = c(0, 100),\n", + " name = \"Reporting rate 
(%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Monthly Reporting Rate by Health District - Method 'DataElement'\",\n", + " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", + " x = \"Month\",\n", + " y = \"Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 13) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", + " plot.subtitle = element_text(hjust = 0.5, size = 12),\n", + " legend.position = \"right\",\n", + " panel.grid = element_blank()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93f5b7f0-bf5e-4567-9d16-da2091125988", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "6729e183-5395-4fb7-a535-978c61124710", + "metadata": {}, + "source": [ + "# 4. Export 📁 /data/ folder" + ] + }, + { + "cell_type": "markdown", + "id": "ef68ae41-a0a9-4b45-8b7d-3d1c9b535ad9", + "metadata": {}, + "source": [ + "### 🧹 Clear output directory\n", + "This is needed to ensure that only 2 files are written to the new version of the Dataset:\n", + "* **Data Set** reporting rate (only one way to calculate it, not parametrized as nothing to \"decide\" here)\n", + "* **Data Element** reporting rate: here there are 7 possible combinations of numerator times 3 possible combinations of denominator.
\n", + " These are too many options to give to the incidence pipeline (the step that ingests this data), where these would need to be hardcoded in the pipeline module. When running the incidence pipeline, the user simply chooses whether to use `\"dataset\"` or `\"dataelement\"`, and therefore there must be only one file for each option.
\n", + " However, we want to **preserve the info** on the choice of **numerator** and **denominator** in the **filename**. The import function used in incidence therefore only looks for the fixed pattern in the filename, and ignores the tags for numerator and denominator (e.g., \"n-conf-susp-test\", \"d-dexrep\")." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eb882f3-a443-4363-bcad-be5b4ebc7d8f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Cleanup\n", + "path_to_clear <- file.path(DATA_PATH, \"reporting_rate\")\n", + "files_to_delete <- list.files(path_to_clear, full.names = TRUE, recursive = TRUE)\n", + "unlink(files_to_delete, recursive = TRUE)\n", + "log_msg(glue::glue(\"🧹 Deleting all existing files from `{path_to_clear}`. Output of current pipeline run will replace output of previous run.\"))" + ] + }, + { + "cell_type": "markdown", + "id": "1372184e-a1a9-472a-87d4-69e38a1b139d", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### CSV" + ] + }, + { + "cell_type": "markdown", + "id": "c266c99e-a08e-471b-93dd-dbedb4841483", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Build up file name for **data Element** method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b6532c9-292e-4e8c-8e9d-987867920a6d", + "metadata": { + "papermill": { + "duration": 0.198788, + "end_time": "2025-08-26T09:50:02.770154", + "exception": false, + "start_time": "2025-08-26T09:50:02.571366", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 🚨 
Currently not in use! Keeping for future update to method 🚨 (GP 2025-08-29)\n", + "\n", + "# Abbreviation for Data Elememnt chosen NUMERATOR\n", + "method_num = tolower(paste0(\"n-\", paste(indicators_selected, collapse = \"-\")))\n", + "method_num\n", + "\n", + "\n", + "# Abbreviation for Data Elememnt chosen DENOMINATOR\n", + "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " method_den = \"d-dexrep\" # \"d1\"\n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", + " method_den = \"d-actfac\" # \"d2\"\n", + " } else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " method_den = \"d-opnfcl\" # \"d2\"\n", + " }\n", + "\n", + "method_den" + ] + }, + { + "cell_type": "markdown", + "id": "cf5bcd47-dba1-4a7a-81cf-d036fd0ee4db", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Write function to assemble path based on method - for .**csv**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b01c022-aa52-4a1b-a2fe-7bcb145a2049", + "metadata": { + "papermill": { + "duration": 0.108587, + "end_time": "2025-08-26T09:50:02.884462", + "exception": false, + "start_time": "2025-08-26T09:50:02.775875", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# snt_write_csv() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "markdown", + "id": "512ba94e-b7fc-4e45-bdda-8f5533e4e665", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Use function to export .csv files" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "8689ebc3-d975-45be-92fd-1fedfc733f49", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Dataset\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", + " snt_write_csv(x = reporting_rate_dataset, \n", + " output_data_path = DATA_PATH, \n", + " method = \"dataset\",\n", + " country_code = COUNTRY_CODE) \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b16e74ae-64f2-4267-a6ae-8413c8463af6", + "metadata": { + "papermill": { + "duration": 2.659797, + "end_time": "2025-08-26T09:50:05.545618", + "exception": false, + "start_time": "2025-08-26T09:50:02.885821", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Data Element\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " snt_write_csv(x = reporting_rate_dataelement,\n", + " output_data_path = DATA_PATH, \n", + " method = \"dataelement\",\n", + " country_code = COUNTRY_CODE)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "cfd1679e-dc0e-4805-9420-0788884a7713", + "metadata": { + "papermill": { + "duration": 0.000345, + "end_time": "2025-08-26T09:50:05.546427", + "exception": false, + "start_time": "2025-08-26T09:50:05.546082", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### parquet" + ] + }, + { + "cell_type": "markdown", + "id": "bed7679d-c392-4e3a-9fc7-4d6ae6982517", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Write function to assemble path based on method - for .**parquet**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26e68499-ef55-46f6-a017-8d03fdbff1b4", + "metadata": { + "papermill": { + "duration": 0.100077, + "end_time": 
"2025-08-26T09:50:05.647079", + "exception": false, + "start_time": "2025-08-26T09:50:05.547002", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# snt_write_parquet() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "markdown", + "id": "8250b998-2669-4590-a4fe-770e42b2d43f", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Use function to export .csv files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52b5720c-864e-49ac-bf40-6b5551214eaa", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Dataset\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", + " snt_write_parquet(x = reporting_rate_dataset,\n", + " output_data_path = DATA_PATH,\n", + " method = \"dataset\",\n", + " country_code = COUNTRY_CODE\n", + " ) \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95bf17f5-6015-464c-9388-df2397d1609c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Data Element\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " snt_write_parquet(x = reporting_rate_dataelement,\n", + " output_data_path = DATA_PATH,\n", + " method = \"dataelement\",\n", + " country_code = COUNTRY_CODE\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65b3e4b8-1a62-47c8-877b-1dae4511e4f0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "73ed8e24-1aab-47af-9d91-5bc4899a40e9", - "metadata": {}, - "source": [ - "`#### Quick data quality check 🔍`" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "713a5ed3-2aeb-4949-8ecc-6ee3f787a719", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "# Plot reporting rate heatmap\n", - "options(repr.plot.width = 20, repr.plot.height = 10) \n", - "\n", - "# reporting_rate_conf_month %>%\n", - "reporting_rate_dataelement %>%\n", - "mutate(\n", - " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", - " ) %>%\n", - "ggplot(., aes(x = DATE, \n", - " y = factor(ADM2_ID), \n", - " fill = REPORTING_RATE * 100)\n", - " ) + \n", - " geom_tile() +\n", - " scale_fill_viridis_c(\n", - " option = \"C\",\n", - " direction = 1, # blue = low, yellow = high\n", - " limits = c(0, 100),\n", - " name = \"Reporting rate (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Monthly Reporting Rate by Health District - Method 'DataElement'\",\n", - " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", - " x = \"Month\",\n", - " y = \"Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 12),\n", - " legend.position = \"right\",\n", - " panel.grid = element_blank()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93f5b7f0-bf5e-4567-9d16-da2091125988", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": 
"6729e183-5395-4fb7-a535-978c61124710", - "metadata": {}, - "source": [ - "# 4. Export 📁 /data/ folder" - ] - }, - { - "cell_type": "markdown", - "id": "ef68ae41-a0a9-4b45-8b7d-3d1c9b535ad9", - "metadata": {}, - "source": [ - "### 🧹 Clear output directory\n", - "This is needed to ensure that only 2 files are written to the new version of the Dataset:\n", - "* **Data Set** reporting rate (only one way to calculate it, not parametrized as nothing to \"decide\" here)\n", - "* **Data Element** reporting rate: here there are 7 possible combinations of numerator times 3 possible combinatiosn of denominator.
\n", - " These are too many optiosn to give to the incidence pipeline (the step that ingests this data), where these would need to be hardcoded in the pipeline module. When running the incidence pipeline, the user simply choses whether to use `\"dataset\"` or `\"dataelement\"`, and therefore there must be only one file for each option.
\n", - " However, we want to **preserve the info** on the choice of **numerator** and **denominator** in the **filename**. The import function used in incidence therefore only looks for the fixed pattern in the filename, and ignores the tags for numerator and denominator (e.g., \"n-conf-susp-test\", \"d-dexrep\")." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7eb882f3-a443-4363-bcad-be5b4ebc7d8f", - "metadata": {}, - "outputs": [], - "source": [ - "# Cleanup\n", - "path_to_clear <- file.path(DATA_PATH, \"reporting_rate\")\n", - "files_to_delete <- list.files(path_to_clear, full.names = TRUE, recursive = TRUE)\n", - "unlink(files_to_delete, recursive = TRUE)\n", - "log_msg(glue::glue(\"🧹 Deleting all existing files from `{path_to_clear}`. Output of current pipeline run will replace output of previous run.\"))" - ] - }, - { - "cell_type": "markdown", - "id": "1372184e-a1a9-472a-87d4-69e38a1b139d", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### CSV" - ] - }, - { - "cell_type": "markdown", - "id": "c266c99e-a08e-471b-93dd-dbedb4841483", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Build up file name for **data Element** method" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b6532c9-292e-4e8c-8e9d-987867920a6d", - "metadata": { - "papermill": { - "duration": 0.198788, - "end_time": "2025-08-26T09:50:02.770154", - "exception": false, - "start_time": "2025-08-26T09:50:02.571366", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# 🚨 Currently not in use! 
Keeping for future update to method 🚨 (GP 2025-08-29)\n", - "\n", - "# Abbreviation for Data Elememnt chosen NUMERATOR\n", - "method_num = tolower(paste0(\"n-\", paste(indicators_selected, collapse = \"-\")))\n", - "method_num\n", - "\n", - "\n", - "# Abbreviation for Data Elememnt chosen DENOMINATOR\n", - "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " method_den = \"d-dexrep\" # \"d1\"\n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", - " method_den = \"d-actfac\" # \"d2\"\n", - " } else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " method_den = \"d-opnfcl\" # \"d2\"\n", - " }\n", - "\n", - "method_den" - ] - }, - { - "cell_type": "markdown", - "id": "cf5bcd47-dba1-4a7a-81cf-d036fd0ee4db", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Write function to assemble path based on method - for .**csv**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b01c022-aa52-4a1b-a2fe-7bcb145a2049", - "metadata": { - "papermill": { - "duration": 0.108587, - "end_time": "2025-08-26T09:50:02.884462", - "exception": false, - "start_time": "2025-08-26T09:50:02.775875", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# write function\n", - "snt_write_csv <- function(x, output_data_path, method) {\n", - " \n", - " full_directory_path <- file.path(output_data_path, \"reporting_rate\")\n", - " \n", - " if (!dir.exists(full_directory_path)) {\n", - " dir.create(full_directory_path, recursive = TRUE)\n", - " }\n", - "\n", - " file_path <- file.path(full_directory_path, paste0(COUNTRY_CODE, \"_reporting_rate_\", method, \".csv\")) \n", - " \n", - " write_csv(x, file_path)\n", - "\n", - " log_msg(paste0(\"Exported : \", file_path))\n", - "}" - ] - }, 
- { - "cell_type": "markdown", - "id": "512ba94e-b7fc-4e45-bdda-8f5533e4e665", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Use function to export .csv files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8689ebc3-d975-45be-92fd-1fedfc733f49", - "metadata": {}, - "outputs": [], - "source": [ - "# Method \"Dataset\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", - " snt_write_csv(x = reporting_rate_dataset, \n", - " output_data_path = DATA_PATH, \n", - " method = \"dataset\") \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b16e74ae-64f2-4267-a6ae-8413c8463af6", - "metadata": { - "papermill": { - "duration": 2.659797, - "end_time": "2025-08-26T09:50:05.545618", - "exception": false, - "start_time": "2025-08-26T09:50:02.885821", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Method \"Data Element\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " snt_write_csv(x = reporting_rate_dataelement,\n", - " output_data_path = DATA_PATH, \n", - " method = \"dataelement\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "cfd1679e-dc0e-4805-9420-0788884a7713", - "metadata": { - "papermill": { - "duration": 0.000345, - "end_time": "2025-08-26T09:50:05.546427", - "exception": false, - "start_time": "2025-08-26T09:50:05.546082", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### parquet" - ] - }, - { - "cell_type": "markdown", - "id": "bed7679d-c392-4e3a-9fc7-4d6ae6982517", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Write function to assemble 
path based on method - for .**parquet**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26e68499-ef55-46f6-a017-8d03fdbff1b4", - "metadata": { - "papermill": { - "duration": 0.100077, - "end_time": "2025-08-26T09:50:05.647079", - "exception": false, - "start_time": "2025-08-26T09:50:05.547002", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# write function\n", - "snt_write_parquet <- function(x, output_data_path, method) {\n", - " \n", - " full_directory_path <- file.path(output_data_path, \"reporting_rate\")\n", - " \n", - " if (!dir.exists(full_directory_path)) {\n", - " dir.create(full_directory_path, recursive = TRUE)\n", - " }\n", - "\n", - " file_path <- file.path(full_directory_path, paste0(COUNTRY_CODE, \"_reporting_rate_\", method, \".parquet\")) \n", - " \n", - " arrow::write_parquet(x, file_path)\n", - "\n", - " log_msg(paste0(\"Exported : \", file_path))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "8250b998-2669-4590-a4fe-770e42b2d43f", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Use function to export .csv files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52b5720c-864e-49ac-bf40-6b5551214eaa", - "metadata": {}, - "outputs": [], - "source": [ - "# Method \"Dataset\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", - " snt_write_parquet(x = reporting_rate_dataset,\n", - " output_data_path = DATA_PATH,\n", - " method = \"dataset\"\n", - " ) \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95bf17f5-6015-464c-9388-df2397d1609c", - "metadata": {}, - "outputs": [], - "source": [ - "# Method \"Data Element\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " snt_write_parquet(x = 
reporting_rate_dataelement,\n", - " output_data_path = DATA_PATH,\n", - " method = \"dataelement\"\n", - " )\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65b3e4b8-1a62-47c8-877b-1dae4511e4f0", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb b/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb index 27d2cd2..65073fc 100644 --- a/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb @@ -1,998 +1,1113 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "ad4e51fd-ab5a-478c-856a-bc3308ce5781", - "metadata": {}, - "source": [ - "-------------\n", - "🤌🏼 Points to discuss:\n", - "* **what we do want to plot here?**
\n", - " Plot only what is produced by the pipeline (hence reflect choice of parameters from pipeline run) OR all the possible options (all output produced by all pipelines run so far, meaning whatever is writte the to most recent version of the Dataset?)\n", - "* **how to handle missing files?**: namely, situations in which files are not yet been produced. In this reporting rate case, if the user only runs the pipeline to produce the \"Dataset\" reporrting rate file,, then we cannot plot anything for the \"Data Element\" reporting rate as there is no file yet ...\n", - " Atm this is handled with `if` logic, but should be made more elegant to avoid repeating the same code twice (for dataset and for dataelement)\n", - "\n", - "-------------\n", - "\n", - "🚧 To do:\n", - "* **Plots shouls be wrapped as functions (DRY code)**! Cuold save in .R file in this same location to `source()` only here (as plots are specifc to this notebook, no need to save in snt_utils.R)\n", - "* **Display _real_ data**: do **_not_ cap** reporting rate values at 1 (100%)!! It's important to visualize real full range if we want to qualitatively assess and compare different methods!\n", - "* **fix object names**: `routine_data` is NOT routine data ... !!\n", - "* When importing `reporting_rate_data`, try if possible to avoid using `tryCatch`, and use `log_msg(..., \"warning\")` instead (should simplify code and logic ... ). Idea is to **log a meaningful warning without making the pipeline fail** just becauase a file in the report nb is missing ... !\n", - "\n", - "-------------" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "ad4e51fd-ab5a-478c-856a-bc3308ce5781", + "metadata": {}, + "source": [ + "-------------\n", + "🤌🏼 Points to discuss:\n", + "* **what we do want to plot here?**
\n", + " Plot only what is produced by the pipeline (hence reflect choice of parameters from pipeline run) OR all the possible options (all output produced by all pipelines run so far, meaning whatever is written to the most recent version of the Dataset?)\n", + "* **how to handle missing files?**: namely, situations in which files have not yet been produced. In this reporting rate case, if the user only runs the pipeline to produce the \"Dataset\" reporting rate file, then we cannot plot anything for the \"Data Element\" reporting rate as there is no file yet ...\n", + " Atm this is handled with `if` logic, but should be made more elegant to avoid repeating the same code twice (for dataset and for dataelement)\n", + "\n", + "-------------\n", + "\n", + "🚧 To do:\n", + "* **Plots should be wrapped as functions (DRY code)**! Could save in .R file in this same location to `source()` only here (as plots are specific to this notebook, no need to save in snt_utils.R)\n", + "* **Display _real_ data**: do **_not_ cap** reporting rate values at 1 (100%)!! It's important to visualize real full range if we want to qualitatively assess and compare different methods!\n", + "* **fix object names**: `routine_data` is NOT routine data ... !!\n", + "* When importing `reporting_rate_data`, try if possible to avoid using `tryCatch`, and use `log_msg(..., \"warning\")` instead (should simplify code and logic ... ). Idea is to **log a meaningful warning without making the pipeline fail** just because a file in the report nb is missing ... 
!\n", + "\n", + "-------------" + ] + }, + { + "cell_type": "markdown", + "id": "80fa8c3c-ed62-4248-8149-ffe2974a7206", + "metadata": {}, + "source": [ + "# Taux de Rapportage des Formations Sanitaires - Health Facility Reporting Rates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35bc4c99-5e5c-44dc-8c67-7f38eaec708e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate\")\n", + "\n", + "# load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate.r\"))\n", + "\n", + "# List required packages \n", + "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"leaflet\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)\n", + "\n", + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "# Required environment for the sf packages\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"dd297a84-5a55-4374-9d2b-3148fde8072d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration variables\n", + "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa63eb27-746f-420b-87ad-da82139acff9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# printdim() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a603fdb-e3ae-4aa3-a908-0385ae216d49", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import DHIS2 shapes data\n", + "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9656abb7-0085-4feb-974c-fb0b1c68c38f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import pyramid data\n", + "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "# Select distinct (already done in SNT format pipeline)\n", + "ADMIN_1_ID <- str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), \"NAME\", \"ID\")\n", + "ADMIN_2_ID <- 
str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), \"NAME\", \"ID\")\n", + "\n", + "pyramid_data <- pyramid_data %>%\n", + " distinct(across(all_of(c(ADMIN_1_ID, ADMIN_2_ID))), .keep_all = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e912503-5c57-4997-8c68-da673bd14626", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "print(dim(pyramid_data))\n", + "head(pyramid_data)" + ] + }, + { + "cell_type": "markdown", + "id": "78ec55d0-3a0d-413d-97cd-303895275f88", + "metadata": {}, + "source": [ + "## A) Taux de Soumission des Rapports / Dataset Reporting Rate\n", + "\n", + "**[FR]**\n", + "Cette section analyse le **taux de soumission des rapports**, tel que calculé dans le Système National d’Information Sanitaire (SNIS). Ce taux est défini comme le nombre de rapports effectivement reçus (rapports actuels) divisé par le nombre de rapports attendus (rapports attendus) sur une période donnée. Les rapports attendus correspondent au nombre de formations sanitaires qui, selon les paramètres du SNIS, devaient soumettre un rapport. Cet indicateur permet d’évaluer si les structures ont transmis les rapports requis, sans tenir compte du contenu ou de l’exhaustivité des données saisies.\n", + "\n", + "**[EN]**\n", + "This section analyzes the **dataset reporting rate**, as calculated in the Health Management Information System (HMIS). The rate is defined as the number of reports actually submitted (actual reports) divided by the number of reports expected (expected reports) over a given period. Expected reports refer to the number of health facilities that were required to report according to SNIS configuration. This indicator helps assess whether health facilities submitted their required reports, regardless of the content or completeness of the data within those reports." 
+ ] + }, + { + "cell_type": "markdown", + "id": "793a685b-a5cc-4e12-9c78-e548beffa213", + "metadata": {}, + "source": [ + "**Question:** Can the reporting rate file be loaded only once (using a parameter see below)? -> if that's the case, we can remove specific plotting codes for the \"dataset\" and \"dataelement\" files and just keep one \"plotting code\" for both types \n", + "\n", + "**Suggestion (link to previous Question):** The file name can be parameterized by injecting the user selection via parameters={...} from the OpenHexa pipeline. \n", + "\n", + "> paste0(COUNTRY_CODE, \"_reporting_rate_\", REPORTING_RATE_METHOD ,\".parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2d9d428-c5a7-4f35-8a21-8f22adaa6a26", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Import from Dataset\n", + "reporting_rate_data <- tryCatch({\n", + " # Attempt to load the dataset\n", + " get_latest_dataset_file_in_memory(\n", + " DATASET_NAME, \n", + " paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", + " )\n", + " }, \n", + " error = function(e) {\n", + " # If an error occurs, log a warning\n", + " # msg <- paste(\"[WARNING] Warning: Could not load reporting rate file for:\", COUNTRY_CODE, \". Proceeding with empty data. Error:\", conditionMessage(e))\n", + " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataset.parquet` does not exist, skipped loading. \n", + " To generate this file, re-run the reporting rate pipeline. 
Error:\", conditionMessage(e))\n", + " log_msg(msg, level = \"warning\")\n", + " \n", + " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", + " return(\n", + " tibble(\n", + " YEAR = double(),\n", + " MONTH = double(),\n", + " ADM2_ID = character(),\n", + " REPORTING_RATE = double()\n", + " )\n", + " )\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d13dea1-e204-4c27-8a44-14e260bcdad1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add _NAME cols from pyramid\n", + "if (nrow(reporting_rate_data) != 0) {\n", + " \n", + " ADMIN_2_ID <- str_replace(ADM_2, \"NAME\", \"ID\") \n", + " reporting_rate_data <- reporting_rate_data %>% \n", + " left_join(pyramid_data[c(ADM_2, ADMIN_2_ID)], by = c(\"ADM2_ID\" = ADMIN_2_ID))\n", + " \n", + " colnames(reporting_rate_data)[colnames(reporting_rate_data) == ADM_2] <- \"ADM2_NAME\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b96b0b12-e168-421f-b0a9-76e83c48842c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(reporting_rate_data, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "64d787c8-25d3-4b2c-87e9-6d35b206b018", + "metadata": {}, + "source": [ + "**fix:** \n", + " - Just replaced this line with the variable \"ADM2_NAME\" : \n", + "> Plot heatmap \n", + "> options(repr.plot.width = 18, repr.plot.height = 15) \n", + "> ggplot(reporting_rate_data, aes(x = date, y = **ADM2_NAME**, fill = category)) + " + ] + }, + { + "cell_type": "markdown", + "id": "c13f15a4-2788-4d57-9edc-78d0afdbe278", + "metadata": {}, + "source": [ + "### Plot: Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb3b565e-8497-4d88-8d6d-ae6b3e2929b2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + " \n", + " # Prepare 
date column + category\n", + " reporting_rate_data <- reporting_rate_data %>%\n", + " mutate(\n", + " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", + " ADM2_ID = factor(ADM2_ID),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100, # `pmin()` caps to 100%\n", + " reporting_pct = REPORTING_RATE * 100,\n", + " category = cut(\n", + " reporting_pct,\n", + " # breaks = c(-Inf, 50, 80, 90, Inf),\n", + " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", + " # GP 2025-08-07 added this, but double check (seems too many >100!!)\n", + " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", + " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", + " right = TRUE # FALSE: intervals are left-closed: lower bound is included\n", + " )\n", + " )\n", + " \n", + " # Define color scale\n", + " reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50–80\" = \"#fdae61\", # orange\n", + " \"80–90\" = \"#ffffbf\", # yellow\n", + " \"90-100\" = \"#1a9641\", # green\n", + " \">100\" = \"darkgreen\"\n", + " )\n", + " \n", + " # Plot heatmap\n", + " options(repr.plot.width = 18, repr.plot.height = 15)\n", + " ggplot(reporting_rate_data, aes(x = date, y = ADM2_NAME, fill = category)) + # -> Using a ADM2_NAME Variable to select the column !!\n", + " geom_tile() +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Taux de soumission des rapports mensuels par district sanitaire\",\n", + " subtitle = \"Monthly Dataset Reporting Rate by Health District\",\n", + " x = \"Mois - Month\",\n", + " y = \"District Sanitaire - Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", + " plot.subtitle = element_text(hjust = 0.5, size 
= 16),\n", + " # legend.position = \"right\",\n", + " legend.position = \"top\",\n", + " panel.grid = element_blank()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28662f31-8aa9-4f83-8dd6-8eb489723652", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Prepare the data\n", + "reporting_rate_data_box <- reporting_rate_data %>%\n", + " mutate(\n", + " MONTH = as.integer(MONTH),\n", + " YEAR = as.factor(YEAR),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100\n", + " reporting_pct = REPORTING_RATE * 100\n", + " )\n", + "\n", + "# Month labels in French\n", + "month_labels_fr <- c(\n", + " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", + " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", + ")\n", + "\n", + "# Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 15)\n", + "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", + " geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.4) +\n", + " scale_x_discrete(labels = month_labels_fr) +\n", + " # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", + " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", + " labs(\n", + " title = \"Distribution mensuelle du taux de soumission des rapports\",\n", + " subtitle = \"Monthly Distribution of Dataset Reporting Rate by Health District (2021–2024)\",\n", + " x = \"Mois\",\n", + " fill = \"Année\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 20),\n", + " plot.subtitle = element_text(size = 16),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fd09749-2042-49d4-b2a3-9c2e6e5eae52", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": 
[], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Aggregate to annual reporting rate per district\n", + "annual_data <- reporting_rate_data %>%\n", + " group_by(YEAR, ADM2_ID) %>%\n", + " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", + " ungroup()\n", + "\n", + "# Step 2: Join with spatial data (assuming 'shapes_data' contains geometry and ADM2_ID)\n", + "map_data <- shapes_data %>%\n", + " left_join(annual_data, by = \"ADM2_ID\")\n", + "\n", + "# Step 3: Bin the reporting rate into categories\n", + "map_data <- map_data %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-79\", # \"50-80\"\n", + " reporting_rate < 0.9 ~ \"80-89\", # \"80-90\"\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " ),\n", + " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-79\", \"80-89\", \">=90\")) # levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", + " )\n", + "\n", + "# Step 4: Define colors\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\",\n", + " \"50-79\" = \"#fdae61\",\n", + " \"80-89\" = \"#ffffbf\",\n", + " \">=90\" = \"#1a9641\"\n", + ")\n", + "\n", + "# Step 5: Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 10)\n", + "ggplot(map_data) +\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " facet_wrap(~ YEAR) +\n", + " scale_fill_manual(values = reporting_colors, name = \"Taux de soumission (%)\") +\n", + " labs(\n", + " title = \"Taux de soumission des rapports annuels par district sanitaire\",\n", + " subtitle = \"Annual Dataset Reporting Completeness by Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " strip.text = element_text(face = \"bold\", size = 16),\n", + " plot.title = element_text(face = \"bold\")\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "df3f7949-baff-4cd9-a022-594420765289", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", + "mean_reporting_stats <- map_data %>%\n", + " group_by(ADM2_ID) %>%\n", + " summarise(\n", + " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-80\",\n", + " reporting_rate < 0.9 ~ \"80-90\",\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " )\n", + " )\n", + "\n", + "# Set correct factor levels to match legend\n", + "mean_reporting_stats$reporting_cat <- factor(\n", + " mean_reporting_stats$reporting_cat,\n", + " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", + ")\n", + "\n", + "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", + "mean_reporting_map <- shapes_data %>%\n", + " left_join(st_drop_geometry(mean_reporting_stats), by = \"ADM2_ID\") %>%\n", + " st_as_sf()\n", + "\n", + "# Step 3: Define custom color scale\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50-80\" = \"#fdae61\", # orange\n", + " \"80-90\" = \"#ffffbf\", # yellow\n", + " \">=90\" = \"#1a9641\" # green\n", + ")\n", + "\n", + "# Step 4: Plot\n", + "options(repr.plot.width = 20, repr.plot.height = 10)\n", + "ggplot(mean_reporting_map) +\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\",\n", + " drop = FALSE\n", + " ) +\n", + " labs(\n", + " title = \"Taux moyen de soumission des rapports (toutes années confondues)\",\n", + " subtitle = \"Mean Annual Dataset Reporting Rate (All Years Combined)\"\n", + " ) +\n", + 
" theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9497287c-bbd2-446f-946d-88e34233f9f0", + "metadata": {}, + "source": [ + "## B) Taux de rapportage des éléments de données: cas confirmés / Data element Reporting Rate: confirmed cases\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f8cd01e-e4b5-41f5-bff8-d35a1b143d0e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # import data\n", + "# # was: routine_data\n", + "# reporting_rate_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")) }, \n", + "# error = function(e) {\n", + "# msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, conditionMessage(e))\n", + "# # cat(msg)\n", + "# log_msg(msg, level = \"warning\") # GP 20250908\n", + "# # stop(msg) # GP 20250908\n", + "# })\n", + "\n", + "# reporting_rate_data <- reporting_rate_data %>%\n", + "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", + "\n", + "# printdim(reporting_rate_data)" + ] + }, + { + "cell_type": "markdown", + "id": "8c11349d-598a-4882-a156-3e5b969ab76c", + "metadata": {}, + "source": [ + "### Import and format data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7838692f-fe89-446e-bac5-af5cc7324226", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f8eba51-b883-432b-8f7c-c4860cc9e78c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "# ADMIN_2" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "id": "ceeba43a-e9ba-4b5a-8d0b-c4faade1367e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", + "# ADMIN_2_LEVEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54bf61aa-c324-411f-8eea-93049d1bb252", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# THIS CODE SHOULD BE REMOVED, WE SHOULD ONLY LOAD REPORTING RATE ONCE IN THIS REPORT (parameter " + ] + }, + { + "cell_type": "markdown", + "id": "2338477e-2036-42e6-bfd3-c2b2480395c1", + "metadata": {}, + "source": [ + "**suggestion:** \n", + "- If possible I would try to reuse the same plotting code. So we can remove all the code as from here ...**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41da51da-7c8a-45d2-b492-a80071dfe2e3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Import from Dataset\n", + "\n", + "reporting_rate_data <- tryCatch({\n", + " # Attempt to load the dataset\n", + " get_latest_dataset_file_in_memory(\n", + " DATASET_NAME, \n", + " paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", + " )\n", + "}, \n", + "error = function(e) {\n", + " # If an error occurs, log a warning\n", + " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataelement.parquet` does not exist, skipped loading. \n", + " To generate this file, re-run the reporting rate pipeline. 
Error:\", conditionMessage(e))\n", + " log_msg(msg, level = \"warning\")\n", + " \n", + " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", + " return(\n", + " tibble(\n", + " YEAR = double(),\n", + " MONTH = double(),\n", + " ADM2_ID = character(),\n", + " REPORTING_RATE = double()\n", + " )\n", + " )\n", + "})\n", + "\n", + "# Add _NAME cols from pyramid\n", + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + " ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", + " \n", + " reporting_rate_data <- reporting_rate_data %>%\n", + " # left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\")) # old\n", + " left_join(pyramid_data, by = c(\"ADM2_ID\" = ADMIN_2_LEVEL))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dff04b9c-0ee7-44d3-a84f-150bce1c368f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# reporting_rate_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0be95b4e-8678-44be-a361-d2216bcd741c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# if (nrow(reporting_rate_data) != 0) {\n", + "# reporting_rate_data <- reporting_rate_data %>%\n", + "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", + "# }" + ] + }, + { + "cell_type": "markdown", + "id": "42155cf0-0475-45fc-a276-7ff2bd4ed555", + "metadata": {}, + "source": [ + "### Plot: Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d4d40b6-7d01-4ed3-83ba-39ffdfbe4b3d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Prepare date column + category\n", + "\n", + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "reporting_rate_data <- reporting_rate_data %>%\n", + " mutate(\n", + " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", 
+ " ADM2_ID = factor(ADM2_ID),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100,\n", + " reporting_pct = REPORTING_RATE * 100,\n", + " category = cut(\n", + " reporting_pct,\n", + " # breaks = c(-Inf, 50, 80, 90, Inf),\n", + " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", + " # right = FALSE\n", + " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", + " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", + " right = TRUE\n", + " )\n", + " )\n", + "\n", + "# # Define color scale\n", + "# reporting_colors <- c(\n", + "# \"<50\" = \"#d7191c\", # red\n", + "# \"50–80\" = \"#fdae61\", # orange\n", + "# \"80–90\" = \"#ffffbf\", # yellow\n", + "# \"≥90\" = \"#1a9641\" # green\n", + "# )\n", + "\n", + "# Define color scale\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50–80\" = \"#fdae61\", # orange\n", + " \"80–90\" = \"#ffffbf\", # yellow\n", + " \"90-100\" = \"#1a9641\", # green\n", + " \">100\" = \"darkgreen\" # \"darkgreen\"\n", + ")\n", + "\n", + "# Plot heatmap\n", + "options(repr.plot.width = 18, repr.plot.height = 15)\n", + "ggplot(reporting_rate_data, aes(x = date, y = LEVEL_3_NAME, fill = category)) +\n", + " geom_tile() +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Taux de rapportage mensuels par district sanitaire\",\n", + " subtitle = \"Monthly Data Element Reporting Rate by Health District\",\n", + " x = \"Mois - Month\",\n", + " y = \"District Sanitaire - Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", + " plot.subtitle = element_text(hjust = 0.5, size = 16),\n", + " legend.position = \"top\", # \"right\"\n", + " panel.grid = element_blank()\n", + " )\n", 
+ "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "b14436cf-1f78-4c3f-944b-0c1eb845a3f6", + "metadata": {}, + "source": [ + "### Plot: boxplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba3a3e89-17f5-4024-b039-dcedb0f37dc2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Prepare the data\n", + "\n", + "if (nrow(reporting_rate_data) != 0) {\n", + " \n", + "reporting_rate_data_box <- reporting_rate_data %>%\n", + " mutate(\n", + " MONTH = as.integer(MONTH),\n", + " YEAR = as.factor(YEAR),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100 # `pmin()` caps values to 1 (then, 100%)\n", + " reporting_pct = REPORTING_RATE * 100\n", + " )\n", + "\n", + "# Month labels in French\n", + "month_labels_fr <- c(\n", + " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", + " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1472d8f2-56d3-4b7a-82ef-4b344d87d264", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 15)\n", + "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", + " geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.4) +\n", + " scale_x_discrete(labels = month_labels_fr) +\n", + " # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", + " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", + " labs(\n", + " title = \"Distribution mensuelle du taux de rapportage\",\n", + " subtitle = \"Monthly Distribution of Data Element Reporting Rate by Health District (2021–2024)\",\n", + " x = \"Mois\",\n", + " fill = \"Année\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = 
\"bold\", size = 20),\n", + " plot.subtitle = element_text(size = 16),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "0d275753-d21e-410a-b7d7-f265ac6e9235", + "metadata": {}, + "source": [ + "### Plot: choropleth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12674c7e-7745-464c-9cc5-3c1e6dcd63c4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Aggregate to annual reporting rate per district \n", + "annual_data <- reporting_rate_data %>%\n", + " group_by(YEAR, ADM2_ID) %>%\n", + " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", + " ungroup()\n", + "\n", + "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", + "map_data <- shapes_data %>%\n", + " left_join(annual_data, by = \"ADM2_ID\")\n", + "\n", + "# Step 3: Bin the reporting rate into categories\n", + "map_data <- map_data %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-80\",\n", + " reporting_rate < 0.9 ~ \"80-90\",\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " ),\n", + " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\"))\n", + " )\n", + "\n", + "# Step 4: Define colors\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\",\n", + " \"50-80\" = \"#fdae61\",\n", + " \"80-90\" = \"#ffffbf\",\n", + " \">=90\" = \"#1a9641\"\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7777a718-2e6c-4f80-a7d6-ae304a1b49fb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 5: Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 10)\n", + "ggplot(map_data) 
+\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " facet_wrap(~ YEAR) +\n", + " scale_fill_manual(values = reporting_colors, name = \"Taux de soummision (%)\") +\n", + " labs(\n", + " title = \"Taux de rapportage des éléments de donnée annuels par district sanitaire\",\n", + " subtitle = \"Annual Data element Reporting Completeness by Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " strip.text = element_text(face = \"bold\", size = 16),\n", + " plot.title = element_text(face = \"bold\")\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "f25ec0fb-758b-476c-8567-b0dce0a387d1", + "metadata": {}, + "source": [ + "### Plot: choropleth 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c891e330-f56f-4846-88c3-fd13a9fac8e7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", + "mean_reporting_stats <- map_data %>%\n", + " group_by(ADM2_ID) %>%\n", + " summarise(\n", + " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-80\",\n", + " reporting_rate < 0.9 ~ \"80-90\",\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " )\n", + " )\n", + "\n", + "# Set correct factor levels to match legend\n", + "mean_reporting_stats$reporting_cat <- factor(\n", + " mean_reporting_stats$reporting_cat,\n", + " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", + ")\n", + "\n", + "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", + "mean_reporting_map <- shapes_data %>%\n", + " left_join(st_drop_geometry(mean_reporting_stats), by = 
\"ADM2_ID\") %>%\n", + " st_as_sf()\n", + "\n", + "# Step 3: Define custom color scale\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50-80\" = \"#fdae61\", # orange\n", + " \"80-90\" = \"#ffffbf\", # yellow\n", + " \">=90\" = \"#1a9641\" # green\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4032f2be-dc1e-48cf-9a5f-5c0854c32e9a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 4: Plot\n", + "options(repr.plot.width = 20, repr.plot.height = 10)\n", + "ggplot(mean_reporting_map) +\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\",\n", + " drop = FALSE\n", + " ) +\n", + " labs(\n", + " title = \"Taux moyen de rapportage (toutes années confondues)\",\n", + " subtitle = \"Mean Annual Data Element Reporting Rate (All Years Combined)\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0f1ea54-b02b-4523-b8b7-9dcdf30c39ba", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "80fa8c3c-ed62-4248-8149-ffe2974a7206", - "metadata": {}, - "source": [ - "# Taux de Rapportage des Formations Sanitaires - Health Facility Reporting Rates" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "35bc4c99-5e5c-44dc-8c67-7f38eaec708e", - "metadata": {}, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"leaflet\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "# Required environment for the sf packages\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd297a84-5a55-4374-9d2b-3148fde8072d", - "metadata": {}, - "outputs": [], - "source": [ - "# Configuration variables\n", - "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa63eb27-746f-420b-87ad-da82139acff9", - "metadata": {}, - "outputs": [], - "source": [ - "# print function\n", - "printdim 
<- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a603fdb-e3ae-4aa3-a908-0385ae216d49", - "metadata": {}, - "outputs": [], - "source": [ - "# import DHIS2 shapes data\n", - "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9656abb7-0085-4feb-974c-fb0b1c68c38f", - "metadata": {}, - "outputs": [], - "source": [ - "# import pyramid data\n", - "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "# Select distinct (already done in SNT format pipeline)\n", - "ADMIN_1_ID <- str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), \"NAME\", \"ID\")\n", - "ADMIN_2_ID <- str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), \"NAME\", \"ID\")\n", - "\n", - "pyramid_data <- pyramid_data %>%\n", - " distinct(across(all_of(c(ADMIN_1_ID, ADMIN_2_ID))), .keep_all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e912503-5c57-4997-8c68-da673bd14626", - "metadata": {}, - "outputs": [], - "source": [ - "print(dim(pyramid_data))\n", - "head(pyramid_data)" - ] - }, - { - "cell_type": "markdown", - "id": "78ec55d0-3a0d-413d-97cd-303895275f88", - "metadata": {}, - "source": [ - "## A) Taux 
de Soumission des Rapports / Dataset Reporting Rate\n", - "\n", - "**[FR]**\n", - "Cette section analyse le **taux de soumission des rapports**, tel que calculé dans le Système National d’Information Sanitaire (SNIS). Ce taux est défini comme le nombre de rapports effectivement reçus (rapports actuels) divisé par le nombre de rapports attendus (rapports attendus) sur une période donnée. Les rapports attendus correspondent au nombre de formations sanitaires qui, selon les paramètres du SNIS, devaient soumettre un rapport. Cet indicateur permet d’évaluer si les structures ont transmis les rapports requis, sans tenir compte du contenu ou de l’exhaustivité des données saisies.\n", - "\n", - "**[EN]**\n", - "This section analyzes the **dataset reporting rate**, as calculated in the Health Management Information System (HMIS). The rate is defined as the number of reports actually submitted (actual reports) divided by the number of reports expected (expected reports) over a given period. Expected reports refer to the number of health facilities that were required to report according to SNIS configuration. This indicator helps assess whether health facilities submitted their required reports, regardless of the content or completeness of the data within those reports." - ] - }, - { - "cell_type": "markdown", - "id": "793a685b-a5cc-4e12-9c78-e548beffa213", - "metadata": {}, - "source": [ - "**Question:** Can the reporting rate file be loaded only once (using a parameter see below)? -> if that's the case, we can remove specific plotting codes for the \"dataset\" and \"dataelement\" files and just keep one \"plotting code\" for both types \n", - "\n", - "**Suggestion (link to previous Question):** The file name can be parameterized by injecting the user selection via parameters={...} from the OpenHexa pipeline. 
\n", - "\n", - "> paste0(COUNTRY_CODE, \"_reporting_rate_\", REPORTING_RATE_METHOD ,\".parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2d9d428-c5a7-4f35-8a21-8f22adaa6a26", - "metadata": {}, - "outputs": [], - "source": [ - "# Import from Dataset\n", - "reporting_rate_data <- tryCatch({\n", - " # Attempt to load the dataset\n", - " get_latest_dataset_file_in_memory(\n", - " DATASET_NAME, \n", - " paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", - " )\n", - " }, \n", - " error = function(e) {\n", - " # If an error occurs, log a warning\n", - " # msg <- paste(\"[WARNING] Warning: Could not load reporting rate file for:\", COUNTRY_CODE, \". Proceeding with empty data. Error:\", conditionMessage(e))\n", - " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataset.parquet` does not exist, skipped loading. \n", - " To generate this file, re-run the reporting rate pipeline. Error:\", conditionMessage(e))\n", - " log_msg(msg, level = \"warning\")\n", - " \n", - " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", - " return(\n", - " tibble(\n", - " YEAR = double(),\n", - " MONTH = double(),\n", - " ADM2_ID = character(),\n", - " REPORTING_RATE = double()\n", - " )\n", - " )\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d13dea1-e204-4c27-8a44-14e260bcdad1", - "metadata": {}, - "outputs": [], - "source": [ - "# Add _NAME cols from pyramid\n", - "if (nrow(reporting_rate_data) != 0) {\n", - " \n", - " ADMIN_2_ID <- str_replace(ADM_2, \"NAME\", \"ID\") \n", - " reporting_rate_data <- reporting_rate_data %>% \n", - " left_join(pyramid_data[c(ADM_2, ADMIN_2_ID)], by = c(\"ADM2_ID\" = ADMIN_2_ID))\n", - " \n", - " colnames(reporting_rate_data)[colnames(reporting_rate_data) == ADM_2] <- \"ADM2_NAME\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b96b0b12-e168-421f-b0a9-76e83c48842c", 
- "metadata": {}, - "outputs": [], - "source": [ - "head(reporting_rate_data, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "64d787c8-25d3-4b2c-87e9-6d35b206b018", - "metadata": {}, - "source": [ - "**fix:** \n", - " - Just replaced this line with the variable \"ADM2_NAME\" : \n", - "> Plot heatmap \n", - "> options(repr.plot.width = 18, repr.plot.height = 15) \n", - "> ggplot(reporting_rate_data, aes(x = date, y = **ADM2_NAME**, fill = category)) + " - ] - }, - { - "cell_type": "markdown", - "id": "c13f15a4-2788-4d57-9edc-78d0afdbe278", - "metadata": {}, - "source": [ - "### Plot: Heatmap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb3b565e-8497-4d88-8d6d-ae6b3e2929b2", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - " \n", - " # Prepare date column + category\n", - " reporting_rate_data <- reporting_rate_data %>%\n", - " mutate(\n", - " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", - " ADM2_ID = factor(ADM2_ID),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100, # `pmin()` caps to 100%\n", - " reporting_pct = REPORTING_RATE * 100,\n", - " category = cut(\n", - " reporting_pct,\n", - " # breaks = c(-Inf, 50, 80, 90, Inf),\n", - " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", - " # GP 2025-08-07 added this, but double check (seems too many >100!!)\n", - " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", - " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", - " right = TRUE # FALSE: intervals are left-closed: lower bound is included\n", - " )\n", - " )\n", - " \n", - " # Define color scale\n", - " reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50–80\" = \"#fdae61\", # orange\n", - " \"80–90\" = \"#ffffbf\", # yellow\n", - " \"90-100\" = \"#1a9641\", # green\n", - " \">100\" = \"darkgreen\"\n", - " )\n", - " \n", - " # Plot heatmap\n", - " options(repr.plot.width = 18, repr.plot.height = 15)\n", - " 
ggplot(reporting_rate_data, aes(x = date, y = ADM2_NAME, fill = category)) + # -> Using a ADM2_NAME Variable to select the column !!\n", - " geom_tile() +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Taux de soumission des rapports mensuels par district sanitaire\",\n", - " subtitle = \"Monthly Dataset Reporting Rate by Health District\",\n", - " x = \"Mois - Month\",\n", - " y = \"District Sanitaire - Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 16),\n", - " # legend.position = \"right\",\n", - " legend.position = \"top\",\n", - " panel.grid = element_blank()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28662f31-8aa9-4f83-8dd6-8eb489723652", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Prepare the data\n", - "reporting_rate_data_box <- reporting_rate_data %>%\n", - " mutate(\n", - " MONTH = as.integer(MONTH),\n", - " YEAR = as.factor(YEAR),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100\n", - " reporting_pct = REPORTING_RATE * 100\n", - " )\n", - "\n", - "# Month labels in French\n", - "month_labels_fr <- c(\n", - " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", - " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", - ")\n", - "\n", - "# Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 15)\n", - "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", - " geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.4) +\n", - " scale_x_discrete(labels = month_labels_fr) +\n", - 
" # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", - " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", - " labs(\n", - " title = \"Distribution mensuelle du taux de soumission des rapports\",\n", - " subtitle = \"Monthly Distribution of Dataset Reporting Rate by Health District (2021–2024)\",\n", - " x = \"Mois\",\n", - " fill = \"Année\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " plot.subtitle = element_text(size = 16),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fd09749-2042-49d4-b2a3-9c2e6e5eae52", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Aggregate to annual reporting rate per district\n", - "annual_data <- reporting_rate_data %>%\n", - " group_by(YEAR, ADM2_ID) %>%\n", - " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", - " ungroup()\n", - "\n", - "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", - "map_data <- shapes_data %>%\n", - " left_join(annual_data, by = \"ADM2_ID\")\n", - "\n", - "# Step 3: Bin the reporting rate into categories\n", - "map_data <- map_data %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-79\", # \"50-80\"\n", - " reporting_rate < 0.9 ~ \"80-89\", # \"80-90\"\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " ),\n", - " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-79\", \"80-89\", \">=90\")) # levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", - " )\n", - "\n", - "# Step 4: Define colors\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\",\n", - " \"50-79\" = \"#fdae61\",\n", - " \"80-89\" = \"#ffffbf\",\n", - " \">=90\" = 
\"#1a9641\"\n", - ")\n", - "\n", - "# Step 5: Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 10)\n", - "ggplot(map_data) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " facet_wrap(~ YEAR) +\n", - " scale_fill_manual(values = reporting_colors, name = \"Taux de soummision (%)\") +\n", - " labs(\n", - " title = \"Taux de soumission des rapports annuels par district sanitaire\",\n", - " subtitle = \"Annual Dataset Reporting Completeness by Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " strip.text = element_text(face = \"bold\", size = 16),\n", - " plot.title = element_text(face = \"bold\")\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df3f7949-baff-4cd9-a022-594420765289", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", - "mean_reporting_stats <- map_data %>%\n", - " group_by(ADM2_ID) %>%\n", - " summarise(\n", - " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-80\",\n", - " reporting_rate < 0.9 ~ \"80-90\",\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " )\n", - " )\n", - "\n", - "# Set correct factor levels to match legend\n", - "mean_reporting_stats$reporting_cat <- factor(\n", - " mean_reporting_stats$reporting_cat,\n", - " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", - ")\n", - "\n", - "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", - "mean_reporting_map <- shapes_data %>%\n", - " left_join(st_drop_geometry(mean_reporting_stats), by = \"ADM2_ID\") %>%\n", - " st_as_sf()\n", - "\n", - "# Step 3: Define custom 
color scale\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50-80\" = \"#fdae61\", # orange\n", - " \"80-90\" = \"#ffffbf\", # yellow\n", - " \">=90\" = \"#1a9641\" # green\n", - ")\n", - "\n", - "# Step 4: Plot\n", - "options(repr.plot.width = 20, repr.plot.height = 10)\n", - "ggplot(mean_reporting_map) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\",\n", - " drop = FALSE\n", - " ) +\n", - " labs(\n", - " title = \"Taux moyen de soumission des rapports (toutes années confondues)\",\n", - " subtitle = \"Mean Annual Dataset Reporting Rate (All Years Combined)\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9497287c-bbd2-446f-946d-88e34233f9f0", - "metadata": {}, - "source": [ - "## B) Taux de rapportage des éléments de données: cas confirmés / Data element Reporting Rate: confirmed cases\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f8cd01e-e4b5-41f5-bff8-d35a1b143d0e", - "metadata": {}, - "outputs": [], - "source": [ - "# # import data\n", - "# # was: routine_data\n", - "# reporting_rate_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")) }, \n", - "# error = function(e) {\n", - "# msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, conditionMessage(e))\n", - "# # cat(msg)\n", - "# log_msg(msg, level = \"warning\") # GP 20250908\n", - "# # stop(msg) # GP 20250908\n", - "# })\n", - "\n", - "# reporting_rate_data <- reporting_rate_data %>%\n", - "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", - "\n", - "# printdim(reporting_rate_data)" 
- ] - }, - { - "cell_type": "markdown", - "id": "8c11349d-598a-4882-a156-3e5b969ab76c", - "metadata": {}, - "source": [ - "### Import and format data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7838692f-fe89-446e-bac5-af5cc7324226", - "metadata": {}, - "outputs": [], - "source": [ - "# config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f8eba51-b883-432b-8f7c-c4860cc9e78c", - "metadata": {}, - "outputs": [], - "source": [ - "# ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "# ADMIN_2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ceeba43a-e9ba-4b5a-8d0b-c4faade1367e", - "metadata": {}, - "outputs": [], - "source": [ - "# ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", - "# ADMIN_2_LEVEL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54bf61aa-c324-411f-8eea-93049d1bb252", - "metadata": {}, - "outputs": [], - "source": [ - "# THIS CODE SHOULD BE REMOVED, WE SHOULD ONLY LOAD REPORTING RATE ONCE IN THIS REPORT (parameter " - ] - }, - { - "cell_type": "markdown", - "id": "2338477e-2036-42e6-bfd3-c2b2480395c1", - "metadata": {}, - "source": [ - "**suggestion:** \n", - "- If possible I would try to reuse the same plotting code. 
So we can remove all the code as from here ...**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41da51da-7c8a-45d2-b492-a80071dfe2e3", - "metadata": {}, - "outputs": [], - "source": [ - "# Import from Dataset\n", - "\n", - "reporting_rate_data <- tryCatch({\n", - " # Attempt to load the dataset\n", - " get_latest_dataset_file_in_memory(\n", - " DATASET_NAME, \n", - " paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", - " )\n", - "}, \n", - "error = function(e) {\n", - " # If an error occurs, log a warning\n", - " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataelement.parquet` does not exist, skipped loading. \n", - " To generate this file, re-run the reporting rate pipeline. Error:\", conditionMessage(e))\n", - " log_msg(msg, level = \"warning\")\n", - " \n", - " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", - " return(\n", - " tibble(\n", - " YEAR = double(),\n", - " MONTH = double(),\n", - " ADM2_ID = character(),\n", - " REPORTING_RATE = double()\n", - " )\n", - " )\n", - "})\n", - "\n", - "# Add _NAME cols from pyramid\n", - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - " ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", - " \n", - " reporting_rate_data <- reporting_rate_data %>%\n", - " # left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\")) # old\n", - " left_join(pyramid_data, by = c(\"ADM2_ID\" = ADMIN_2_LEVEL))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dff04b9c-0ee7-44d3-a84f-150bce1c368f", - "metadata": {}, - "outputs": [], - "source": [ - "# reporting_rate_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0be95b4e-8678-44be-a361-d2216bcd741c", - "metadata": {}, - "outputs": [], - "source": [ - "# if (nrow(reporting_rate_data) != 0) {\n", - "# reporting_rate_data <- 
reporting_rate_data %>%\n", - "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", - "# }" - ] - }, - { - "cell_type": "markdown", - "id": "42155cf0-0475-45fc-a276-7ff2bd4ed555", - "metadata": {}, - "source": [ - "### Plot: Heatmap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d4d40b6-7d01-4ed3-83ba-39ffdfbe4b3d", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare date column + category\n", - "\n", - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "reporting_rate_data <- reporting_rate_data %>%\n", - " mutate(\n", - " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", - " ADM2_ID = factor(ADM2_ID),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100,\n", - " reporting_pct = REPORTING_RATE * 100,\n", - " category = cut(\n", - " reporting_pct,\n", - " # breaks = c(-Inf, 50, 80, 90, Inf),\n", - " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", - " # right = FALSE\n", - " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", - " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", - " right = TRUE\n", - " )\n", - " )\n", - "\n", - "# # Define color scale\n", - "# reporting_colors <- c(\n", - "# \"<50\" = \"#d7191c\", # red\n", - "# \"50–80\" = \"#fdae61\", # orange\n", - "# \"80–90\" = \"#ffffbf\", # yellow\n", - "# \"≥90\" = \"#1a9641\" # green\n", - "# )\n", - "\n", - "# Define color scale\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50–80\" = \"#fdae61\", # orange\n", - " \"80–90\" = \"#ffffbf\", # yellow\n", - " \"90-100\" = \"#1a9641\", # green\n", - " \">100\" = \"darkgreen\" # \"darkgreen\"\n", - ")\n", - "\n", - "# Plot heatmap\n", - "options(repr.plot.width = 18, repr.plot.height = 15)\n", - "ggplot(reporting_rate_data, aes(x = date, y = LEVEL_3_NAME, fill = category)) +\n", - " geom_tile() +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Taux 
de rapportage mensuels par district sanitaire\",\n", - " subtitle = \"Monthly Data Element Reporting Rate by Health District\",\n", - " x = \"Mois - Month\",\n", - " y = \"District Sanitaire - Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 16),\n", - " legend.position = \"top\", # \"right\"\n", - " panel.grid = element_blank()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "b14436cf-1f78-4c3f-944b-0c1eb845a3f6", - "metadata": {}, - "source": [ - "### Plot: boxplot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba3a3e89-17f5-4024-b039-dcedb0f37dc2", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the data\n", - "\n", - "if (nrow(reporting_rate_data) != 0) {\n", - " \n", - "reporting_rate_data_box <- reporting_rate_data %>%\n", - " mutate(\n", - " MONTH = as.integer(MONTH),\n", - " YEAR = as.factor(YEAR),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100 # `pmin()` caps values to 1 (then, 100%)\n", - " reporting_pct = REPORTING_RATE * 100\n", - " )\n", - "\n", - "# Month labels in French\n", - "month_labels_fr <- c(\n", - " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", - " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1472d8f2-56d3-4b7a-82ef-4b344d87d264", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 15)\n", - "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", - " geom_boxplot(outlier.size = 0.8, outlier.alpha 
= 0.4) +\n", - " scale_x_discrete(labels = month_labels_fr) +\n", - " # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", - " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", - " labs(\n", - " title = \"Distribution mensuelle du taux de rapportage\",\n", - " subtitle = \"Monthly Distribution of Data Element Reporting Rate by Health District (2021–2024)\",\n", - " x = \"Mois\",\n", - " fill = \"Année\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " plot.subtitle = element_text(size = 16),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "0d275753-d21e-410a-b7d7-f265ac6e9235", - "metadata": {}, - "source": [ - "### Plot: choropleth" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12674c7e-7745-464c-9cc5-3c1e6dcd63c4", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Aggregate to annual reporting rate per district \n", - "annual_data <- reporting_rate_data %>%\n", - " group_by(YEAR, ADM2_ID) %>%\n", - " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", - " ungroup()\n", - "\n", - "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", - "map_data <- shapes_data %>%\n", - " left_join(annual_data, by = \"ADM2_ID\")\n", - "\n", - "# Step 3: Bin the reporting rate into categories\n", - "map_data <- map_data %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-80\",\n", - " reporting_rate < 0.9 ~ \"80-90\",\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " ),\n", - " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\"))\n", - " )\n", - "\n", - "# Step 4: Define colors\n", - "reporting_colors <- 
c(\n", - " \"<50\" = \"#d7191c\",\n", - " \"50-80\" = \"#fdae61\",\n", - " \"80-90\" = \"#ffffbf\",\n", - " \">=90\" = \"#1a9641\"\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7777a718-2e6c-4f80-a7d6-ae304a1b49fb", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 5: Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 10)\n", - "ggplot(map_data) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " facet_wrap(~ YEAR) +\n", - " scale_fill_manual(values = reporting_colors, name = \"Taux de soummision (%)\") +\n", - " labs(\n", - " title = \"Taux de rapportage des éléments de donnée annuels par district sanitaire\",\n", - " subtitle = \"Annual Data element Reporting Completeness by Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " strip.text = element_text(face = \"bold\", size = 16),\n", - " plot.title = element_text(face = \"bold\")\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "f25ec0fb-758b-476c-8567-b0dce0a387d1", - "metadata": {}, - "source": [ - "### Plot: choropleth 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c891e330-f56f-4846-88c3-fd13a9fac8e7", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", - "mean_reporting_stats <- map_data %>%\n", - " group_by(ADM2_ID) %>%\n", - " summarise(\n", - " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-80\",\n", - " reporting_rate < 0.9 ~ \"80-90\",\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " )\n", - " 
)\n", - "\n", - "# Set correct factor levels to match legend\n", - "mean_reporting_stats$reporting_cat <- factor(\n", - " mean_reporting_stats$reporting_cat,\n", - " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", - ")\n", - "\n", - "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", - "mean_reporting_map <- shapes_data %>%\n", - " left_join(st_drop_geometry(mean_reporting_stats), by = \"ADM2_ID\") %>%\n", - " st_as_sf()\n", - "\n", - "# Step 3: Define custom color scale\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50-80\" = \"#fdae61\", # orange\n", - " \"80-90\" = \"#ffffbf\", # yellow\n", - " \">=90\" = \"#1a9641\" # green\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4032f2be-dc1e-48cf-9a5f-5c0854c32e9a", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 4: Plot\n", - "options(repr.plot.width = 20, repr.plot.height = 10)\n", - "ggplot(mean_reporting_map) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\",\n", - " drop = FALSE\n", - " ) +\n", - " labs(\n", - " title = \"Taux moyen de rapportage (toutes années confondues)\",\n", - " subtitle = \"Mean Annual Data Element Reporting Rate (All Years Combined)\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0f1ea54-b02b-4523-b8b7-9dcdf30c39ba", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": 
".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r b/pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r new file mode 100644 index 0000000..bc84a21 --- /dev/null +++ b/pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r @@ -0,0 +1,79 @@ +# Shared helpers for snt_dhis2_reporting_rate notebooks. + +inspect_reporting_rate <- function(data_tibble) { + tibble_name_full <- deparse(substitute(data_tibble)) + method <- stringr::str_extract(tibble_name_full, "(?<=reporting_rate_).*") + + values_greater_than_1 <- sum(data_tibble$REPORTING_RATE > 1, na.rm = TRUE) + total_values <- length(data_tibble$REPORTING_RATE) + + if (total_values > 0) { + proportion <- values_greater_than_1 / total_values * 100 + min_rate <- min(data_tibble$REPORTING_RATE, na.rm = TRUE) + max_rate <- max(data_tibble$REPORTING_RATE, na.rm = TRUE) + } else { + proportion <- 0 + min_rate <- NA + max_rate <- NA + } + + clarification <- if (proportion == 0) NULL else " (there are more reports than expected)" + + log_msg( + paste0( + "🔍 For reporting rate method : `", method, "`, the values of REPORTING_RATE range from ", round(min_rate, 2), + " to ", round(max_rate, 2), + ", and ", round(proportion, 2), " % of values are >1", clarification, "." 
+ ) + ) + + hist(data_tibble$REPORTING_RATE, breaks = 50) +} + +is_aire_l5 <- function(x) { + stringr::str_detect(x, stringr::regex("^\\s*aire[^a-zA-Z]?", ignore_case = TRUE)) +} + +is_hospital_l4 <- function(x) { + stringr::str_detect(x, stringr::regex("^(hd|chr|chu|hgr)", ignore_case = TRUE)) +} + +snt_write_csv <- function(x, output_data_path, method, country_code = NULL) { + if (is.null(country_code) && exists("COUNTRY_CODE")) { + country_code <- get("COUNTRY_CODE") + } + if (is.null(country_code)) { + stop("country_code is required to export reporting rate csv.") + } + + full_directory_path <- file.path(output_data_path, "reporting_rate") + if (!dir.exists(full_directory_path)) { + dir.create(full_directory_path, recursive = TRUE) + } + + file_path <- file.path(full_directory_path, paste0(country_code, "_reporting_rate_", method, ".csv")) + readr::write_csv(x, file_path) + log_msg(paste0("Exported : ", file_path)) +} + +snt_write_parquet <- function(x, output_data_path, method, country_code = NULL) { + if (is.null(country_code) && exists("COUNTRY_CODE")) { + country_code <- get("COUNTRY_CODE") + } + if (is.null(country_code)) { + stop("country_code is required to export reporting rate parquet.") + } + + full_directory_path <- file.path(output_data_path, "reporting_rate") + if (!dir.exists(full_directory_path)) { + dir.create(full_directory_path, recursive = TRUE) + } + + file_path <- file.path(full_directory_path, paste0(country_code, "_reporting_rate_", method, ".parquet")) + arrow::write_parquet(x, file_path) + log_msg(paste0("Exported : ", file_path)) +} + +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index 931e223..fca21e6 100644 --- 
a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -1,1232 +1,1087 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2026-01-16T10:22:53.011120", - "exception": false, - "start_time": "2026-01-16T10:22:53.010947", - "status": "completed" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2026-01-16T10:22:53.011120", + "exception": false, + "start_time": "2026-01-16T10:22:53.010947", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Data Element reporting rate: based on reporting of one or more indicators\n", + "Partially following methods by WHO and as per Diallo (2025) paper\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
 \n", + "For this method the user is allowed to **choose** how to calculate both the **numerator** and **denominator**.
\n", + "\n", + "Specifically: \n", + "\n", + "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "
\n", + "
\n", + "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", + "
\n", + "
\n", + "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", + " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", + " * Filename: `XXX_reporting_rate_dataelement.`" + ], + "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65" }, - "tags": [] - }, - "source": [ - "# Data Element reporting rate: based on reporting of one or more indicators\n", - "Partially following methods by WHO and as per Diallo (2025) paper\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "For this method the user is allowed to **chose** how to calculate both the **numerator** and **denominator**.
\n", - "\n", - "Specifically: \n", - "\n", - "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "
\n", - "
\n", - "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", - "
\n", - "
\n", - "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", - " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", - " * Filename: `XXX_reporting_rate_dataelement.`" - ] - }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 0.000228, - "end_time": "2026-01-16T10:22:53.014752", - "exception": false, - "start_time": "2026-01-16T10:22:53.014524", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000228, + "end_time": "2026-01-16T10:22:53.014752", + "exception": false, + "start_time": "2026-01-16T10:22:53.014524", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 1. Setup" + ], + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a" }, - "tags": [] - }, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "papermill": { - "duration": 63.150489, - "end_time": "2026-01-16T10:23:56.165530", - "exception": false, - "start_time": "2026-01-16T10:22:53.015041", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 63.150489, + "end_time": "2026-01-16T10:23:56.165530", + "exception": false, + "start_time": "2026-01-16T10:22:53.015041", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r\")\n", + "snt_environment <- get_setup_variables()\n", + "config_json <- load_snt_config(file.path(snt_environment$CONFIG_PATH, \"SNT_config.json\"))\n" + ], + "execution_count": null, + "outputs": [], + "id": "35ede7cf-257f-439c-a514-26a7290f881d" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project 
paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00011, - "end_time": "2026-01-16T10:23:56.165873", - "exception": false, - "start_time": "2026-01-16T10:23:56.165763", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000095, + "end_time": "2026-01-16T10:23:56.200231", + "exception": false, + "start_time": "2026-01-16T10:23:56.200136", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1.1. Pipeline parameters\n", + "Required parameters are injected by Papermill and validated in the notebook setup.\n", + "" + ], + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be" }, - "tags": [] - }, - "source": [ - "### 1.1. Fallback parameters values\n", - "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", - "The code cell below here provides fallback paramater values needed when running this notebook locally." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "papermill": { - "duration": 0.033954, - "end_time": "2026-01-16T10:23:56.199937", - "exception": false, - "start_time": "2026-01-16T10:23:56.165983", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "markdown" + } + }, + "source": [ + "#### Save variables\n", + "Indicator lists and remaining fields used downstream (mirrors the population notebook block that assigns `COUNTRY_CODE`, `ADMIN_1`, … from `config_json`).\n" + ], + "id": "a1b2c3d4-save-vars-md" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Current options:\n", - "# \"COUNTRY_CODE_routine.parquet\" (raw)\n", - "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", - "# \"COUNTRY_CODE_routine_outliers_removed.parquet\"\n", - "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"NER_routine_outliers_imputed.parquet\"}\n", - "\n", - "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", - "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", - "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", - "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" - ] - }, - { - "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 9.5e-05, - "end_time": "2026-01-16T10:23:56.200231", - "exception": false, - "start_time": "2026-01-16T10:23:56.200136", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 
0.521572, + "end_time": "2026-01-16T10:23:56.721932", + "exception": false, + "start_time": "2026-01-16T10:23:56.200360", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "assert_papermill_dataelement_params()\n", + "\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\")\n", + "\n", + "ACTIVITY_INDICATORS <- as.character(unlist(ACTIVITY_INDICATORS, use.names = FALSE))\n", + "VOLUME_ACTIVITY_INDICATORS <- as.character(unlist(VOLUME_ACTIVITY_INDICATORS, use.names = FALSE))\n", + "\n", + "fixed_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\")\n", + "fixed_cols_rr <- c(\"YEAR\", \"MONTH\", \"ADM2_ID\", \"REPORTING_RATE\")\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f" }, - "tags": [] - }, - "source": [ - "### 1.2. Load and check `snt config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "papermill": { - "duration": 0.521572, - "end_time": "2026-01-16T10:23:56.721932", - "exception": false, - "start_time": "2026-01-16T10:23:56.200360", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.033003, + "end_time": "2026-01-16T10:23:56.755117", + "exception": false, + "start_time": "2026-01-16T10:23:56.722114", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "### 1.2. 
Checks\n", + "Validate activity-indicator selection before heavy joins.\n" + ], + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "papermill": { - "duration": 0.033003, - "end_time": "2026-01-16T10:23:56.755117", - "exception": false, - "start_time": "2026-01-16T10:23:56.722114", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "stop_if_activity_indicators_empty(ACTIVITY_INDICATORS)\n" + ], + "execution_count": null, + "outputs": [], + "id": "8bf4a8bb" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", - "# 🚨 NOTE (2025-01-09): The configuration field `NA_TREATMENT` has been removed from SNT_config.json files.\n", - "# It was legacy code from Ousmane and was only used for Reporting Rate calculations (not anymore).\n", - "# It has been replaced by `0_VALUES_PRESERVED` (boolean: true/false) which specifies whether zero values\n", - "# are stored in the DHIS2 instance (true) or converted to NULL to save space (false).\n", - 
"# See: https://bluesquare.atlassian.net/browse/SNT25-158\n", - "# The variable `NA_TREATMENT` is kept here for backward compatibility but is no longer loaded from config.\n", - "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", - "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) \n", - "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", - "\n", - "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", - "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", - "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ] - }, - { - "cell_type": "markdown", - "id": "8bf4a8bb", - "metadata": {}, - "source": [ - "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18b40207", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!length(ACTIVITY_INDICATORS) > 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 9.3e-05, - "end_time": "2026-01-16T10:23:56.779812", - "exception": false, - "start_time": "2026-01-16T10:23:56.779719", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000093, + "end_time": "2026-01-16T10:23:56.779812", + "exception": false, + "start_time": "2026-01-16T10:23:56.779719", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 2. Load Data" + ], + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b" }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 6.9e-05, - "end_time": "2026-01-16T10:23:56.779987", - "exception": false, - "start_time": "2026-01-16T10:23:56.779918", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000069, + "end_time": "2026-01-16T10:23:56.779987", + "exception": false, + "start_time": "2026-01-16T10:23:56.779918", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.1. Routine data (DHIS2) \n", + "**Note on pipeline behaviour**:
\n", + "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as a parameter." + ], + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532" }, - "tags": [] - }, - "source": [ - "### 2.1. Routine data (DHIS2) \n", - "**Note on pipeline behaviour**:
\n", - "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "papermill": { - "duration": 2.018878, - "end_time": "2026-01-16T10:23:58.798963", - "exception": false, - "start_time": "2026-01-16T10:23:56.780085", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 2.018878, + "end_time": "2026-01-16T10:23:58.798963", + "exception": false, + "start_time": "2026-01-16T10:23:56.780085", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhis2_routine <- load_dataset_file(DATASET_ID, ROUTINE_FILE)\n", + "dhis2_routine <- dhis2_routine %>%\n", + " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric))\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 2)\n" + ], + "execution_count": null, + "outputs": [], + "id": "a1213723-f7e2-4238-9f37-f1795b187232" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# select dataset\n", - "if (ROUTINE_FILE == glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "} else {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "}\n", - " \n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", - "\n", - "# log\n", - 
"log_msg(glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset: {rountine_dataset_name}. Dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", - "metadata": { - "papermill": { - "duration": 0.000138, - "end_time": "2026-01-16T10:23:58.799287", - "exception": false, - "start_time": "2026-01-16T10:23:58.799149", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000138, + "end_time": "2026-01-16T10:23:58.799287", + "exception": false, + "start_time": "2026-01-16T10:23:58.799149", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.2. Organisation units (DHIS2 pyramid)" + ], + "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a" }, - "tags": [] - }, - "source": [ - "### 2.2. Organisation units (DHIS2 pyramid)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fd92901-901e-4019-be78-a7718050c1c4", - "metadata": { - "papermill": { - "duration": 0.992899, - "end_time": "2026-01-16T10:23:59.792385", - "exception": false, - "start_time": "2026-01-16T10:23:58.799486", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.992899, + "end_time": "2026-01-16T10:23:59.792385", + "exception": false, + "start_time": "2026-01-16T10:23:58.799486", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhis2_pyramid_formatted <- load_dataset_file(\n", + " config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED,\n", + " paste0(COUNTRY_CODE, \"_pyramid.parquet\")\n", + ")\n", + "dim(dhis2_pyramid_formatted)\n", + "head(dhis2_pyramid_formatted, 2)\n" + ], + "execution_count": null, + "outputs": [], + "id": "2fd92901-901e-4019-be78-a7718050c1c4" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 
Load file from dataset\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - " \n", - "msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset: `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", - "log_msg(msg)\n", - "dim(dhis2_pyramid_formatted)\n", - "head(dhis2_pyramid_formatted,2)" - ] - }, - { - "cell_type": "markdown", - "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", - "metadata": { - "papermill": { - "duration": 0.000106, - "end_time": "2026-01-16T10:23:59.792710", - "exception": false, - "start_time": "2026-01-16T10:23:59.792604", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000106, + "end_time": "2026-01-16T10:23:59.792710", + "exception": false, + "start_time": "2026-01-16T10:23:59.792604", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.3. Check whether selected indicators are present in routine data\n", + "Extra precaution measure to avoid breaks downstream.
\n", + "\n", + "Note: This logic should be moved to pipeline.py 🐍" + ], + "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1" }, - "tags": [] - }, - "source": [ - "### 2.3. Check whether selected indicators are present in routine data\n", - "Extra precaution measure to avoid breaks downstream.
\n", - "\n", - "Note: This logic should be moved to pipeline.py 🐍" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", - "metadata": { - "papermill": { - "duration": 0.024863, - "end_time": "2026-01-16T10:23:59.817677", - "exception": false, - "start_time": "2026-01-16T10:23:59.792814", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.024863, + "end_time": "2026-01-16T10:23:59.817677", + "exception": false, + "start_time": "2026-01-16T10:23:59.792814", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "validate_required_columns(\n", + " data = dhis2_routine,\n", + " required_columns = ACTIVITY_INDICATORS,\n", + " data_label = \"`dhis2_routine` (activity indicators)\",\n", + " on_missing = \"warning\"\n", + ")\n", + "validate_required_columns(\n", + " data = dhis2_routine,\n", + " required_columns = VOLUME_ACTIVITY_INDICATORS,\n", + " data_label = \"`dhis2_routine` (volume activity indicators)\",\n", + " on_missing = \"error\"\n", + ")\n" + ], + "execution_count": null, + "outputs": [], + "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", - "}\n", - "\n", - "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", - " cat(msg)\n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", - "metadata": { - "papermill": { - "duration": 9.1e-05, - "end_time": "2026-01-16T10:23:59.817949", - "exception": false, - "start_time": "2026-01-16T10:23:59.817858", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000091, + "end_time": "2026-01-16T10:23:59.817949", + "exception": false, + "start_time": "2026-01-16T10:23:59.817858", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 3. Reporting rates computations" + ], + "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295" }, - "tags": [] - }, - "source": [ - "## 3. Reporting rates computations" - ] - }, - { - "cell_type": "markdown", - "id": "7d62cdb6", - "metadata": {}, - "source": [ - "#### 3.0. Define start and end period based on routine data " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", - "metadata": { - "papermill": { - "duration": 0.044172, - "end_time": "2026-01-16T10:23:59.862224", - "exception": false, - "start_time": "2026-01-16T10:23:59.818052", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.0. 
Define start and end period based on routine data " + ], + "id": "7d62cdb6" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", - "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", - "\n", - "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", - "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:23:59.862555", - "exception": false, - "start_time": "2026-01-16T10:23:59.862446", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.044172, + "end_time": "2026-01-16T10:23:59.862224", + "exception": false, + "start_time": "2026-01-16T10:23:59.818052", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "pv <- summarize_routine_period_range_as_month_vector(dhis2_routine)\n", + "PERIOD_START <- pv$PERIOD_START\n", + "PERIOD_END <- pv$PERIOD_END\n", + "period_vector <- pv$period_vector\n", + "log_msg(glue::glue(\"Routine period range: {PERIOD_START} to {PERIOD_END} ({length(period_vector)} months)\"))\n" + ], + "execution_count": null, + "outputs": [], + "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a" }, - "tags": [] - }, - "source": [ - "#### 3.1. 
Build master table (all PERIOD x OU)\n", - "The master table contains all combinations of period x organisation unit " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9308197a-0852-4d34-8888-cf5564f35a9d", - "metadata": { - "papermill": { - "duration": 0.289128, - "end_time": "2026-01-16T10:24:00.151791", - "exception": false, - "start_time": "2026-01-16T10:23:59.862663", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:23:59.862555", + "exception": false, + "start_time": "2026-01-16T10:23:59.862446", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.1. Build master table (all PERIOD x OU)\n", + "The master table contains all combinations of period x organisation unit " + ], + "id": "526bc3af-01c1-4ddc-b3b9-077354e57559" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. 
Periods count: {length(period_vector)}\"))\n", - "\n", - "facility_master <- dhis2_pyramid_formatted %>%\n", - " rename(\n", - " OU_ID = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\"),\n", - " OU_NAME = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_NAME\"),\n", - " ADM2_ID = str_replace(ADMIN_2, \"NAME\", \"ID\"),\n", - " ADM2_NAME = all_of(ADMIN_2),\n", - " ADM1_ID = str_replace(ADMIN_1, \"NAME\", \"ID\"),\n", - " ADM1_NAME = all_of(ADMIN_1)\n", - " ) %>%\n", - " select(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, OPENING_DATE, CLOSED_DATE) %>%\n", - " distinct() %>%\n", - " tidyr::crossing(PERIOD = period_vector) %>%\n", - " mutate(PERIOD=as.numeric(PERIOD))\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", - "metadata": { - "papermill": { - "duration": 0.000114, - "end_time": "2026-01-16T10:24:00.152094", - "exception": false, - "start_time": "2026-01-16T10:24:00.151980", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.289128, + "end_time": "2026-01-16T10:24:00.151791", + "exception": false, + "start_time": "2026-01-16T10:23:59.862663", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. Periods count: {length(period_vector)}\"))\n", + "facility_master <- build_facilities_crossed_with_monthly_periods(\n", + " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", + " period_vector = period_vector,\n", + " config_json = config_json,\n", + " ADMIN_1 = ADMIN_1,\n", + " ADMIN_2 = ADMIN_2\n", + ")\n" + ], + "execution_count": null, + "outputs": [], + "id": "9308197a-0852-4d34-8888-cf5564f35a9d" }, - "tags": [] - }, - "source": [ - "#### 3.2. 
Identify \"Active\" facilities\n", - "\n", - "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b279d27", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Assessing facility reporting activity based on the following indicators: {paste(ACTIVITY_INDICATORS, collapse=', ')}\"))\n", - "\n", - "facility_master_routine <- left_join(\n", - " facility_master,\n", - " # dhis2_routine %>% select(OU_ID, PERIOD, all_of(DHIS2_INDICATORS)), # GP 2026-02-04\n", - " dhis2_routine %>% select(OU_ID, PERIOD, any_of(DHIS2_INDICATORS)), \n", - " by = c(\"OU_ID\", \"PERIOD\")\n", - " ) %>%\n", - " mutate(\n", - " YEAR = as.numeric(substr(PERIOD, 1, 4)),\n", - " ACTIVE_THIS_PERIOD = ifelse(\n", - " rowSums(!is.na(across(all_of(ACTIVITY_INDICATORS))) & across(all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0), \n", - " COUNT = 1 # Counting every facility\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", - "metadata": { - "papermill": { - "duration": 0.000107, - "end_time": "2026-01-16T10:24:01.626760", - "exception": false, - "start_time": "2026-01-16T10:24:01.626653", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000114, + "end_time": "2026-01-16T10:24:00.152094", + "exception": false, + "start_time": "2026-01-16T10:24:00.151980", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.2. Identify \"Active\" facilities\n", + "\n", + "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. 
Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" + ], + "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558" }, - "tags": [] - }, - "source": [ - "#### 3.3. Identify `OPEN` facilities (denominator)\n", - "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", - "\n", - "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", - "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", - "\n", - "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", - "\n", - "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", - "metadata": { - "papermill": { - "duration": 1.317923, - "end_time": "2026-01-16T10:24:02.944800", - "exception": false, - "start_time": "2026-01-16T10:24:01.626877", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Join routine values to the facility master and define monthly activity\n", + "facility_master_routine <- dplyr::left_join(\n", + " facility_master,\n", + " dhis2_routine %>% dplyr::select(OU_ID, PERIOD, dplyr::any_of(DHIS2_INDICATORS)),\n", + " by = c(\"OU_ID\", \"PERIOD\")\n", + ") %>%\n", + " dplyr::mutate(\n", + " YEAR = as.numeric(substr(PERIOD, 1, 4)),\n", + " ACTIVE_THIS_PERIOD = ifelse(\n", + " rowSums(!is.na(dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS))) &\n", + " dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0\n", + " ),\n", + " COUNT = 1\n", + " )\n" + ], + "execution_count": 
null, + "outputs": [], + "id": "7b279d27" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "facility_master_routine <- facility_master_routine %>%\n", - " mutate(\n", - " period_date = as.Date(ym(PERIOD)),\n", - " \n", - " # Flag facilities explicitly marked as closed in their name\n", - " NAME_CLOSED = str_detect(\n", - " toupper(OU_NAME),\n", - " \"CLOTUR|FERM(E|EE)?\"\n", - " ),\n", - "\n", - " # Check whether the facility is open during the period using open/close dates\n", - " OPEN_BY_DATE = \n", - " !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date |\n", - " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)\n", - " ),\n", - " \n", - " # Final definition of an open facility for the period:\n", - " # not explicitly closed, within opening/closing dates,\n", - " # and started reporting\n", - " OPEN = ifelse(\n", - " !NAME_CLOSED & OPEN_BY_DATE,\n", - " 1, 0\n", - " )\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "657fd6ca", - "metadata": {}, - "source": [ - "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" - ] - }, - { - "cell_type": "markdown", - "id": "a598e4b7", - "metadata": {}, - "source": [ - "
\n", - " Important: this step could have a huge influence on reporting rates!
\n", - " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", - "
    \n", - "
  • With YEAR → “active that year”
  • \n", - "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", - "
\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", - "metadata": { - "papermill": { - "duration": 0.173961, - "end_time": "2026-01-16T10:24:05.948136", - "exception": false, - "start_time": "2026-01-16T10:24:05.774175", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000107, + "end_time": "2026-01-16T10:24:01.626760", + "exception": false, + "start_time": "2026-01-16T10:24:01.626653", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.3. Identify `OPEN` facilities (denominator)\n", + "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", + "\n", + "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", + "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", + "\n", + "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", + "\n", + "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." 
+ ], + "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Flag facilities with at least one report in the year\n", - "facility_master_routine_01 <- facility_master_routine %>%\n", - " group_by(OU_ID, YEAR) %>%\n", - " mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>% # use max() to flag if ACTIVE_THIS_PERIOD is 1 at least once\n", - " ungroup()" - ] - }, - { - "cell_type": "markdown", - "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", - "metadata": { - "papermill": { - "duration": 9.8e-05, - "end_time": "2026-01-16T10:24:05.948452", - "exception": false, - "start_time": "2026-01-16T10:24:05.948354", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 1.317923, + "end_time": "2026-01-16T10:24:02.944800", + "exception": false, + "start_time": "2026-01-16T10:24:01.626877", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.3 Identify OPEN facilities from naming and opening/closing dates\n", + "facility_master_routine <- facility_master_routine %>%\n", + " dplyr::mutate(\n", + " period_date = as.Date(zoo::as.yearmon(as.character(PERIOD), \"%Y%m\")),\n", + " NAME_CLOSED = stringr::str_detect(toupper(OU_NAME), \"CLOTUR|FERM(E|EE)?\"),\n", + " OPEN_BY_DATE = !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date |\n", + " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)),\n", + " OPEN = ifelse(!NAME_CLOSED & OPEN_BY_DATE, 1, 0)\n", + " )\n" + ], + "execution_count": null, + "outputs": [], + "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89" }, - "tags": [] - }, - "source": [ - "#### 3.5. 
Compute Weighting factor based on \"volume of activity\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4420e559-4134-4fc3-8950-9972ebede00e", - "metadata": { - "papermill": { - "duration": 0.520673, - "end_time": "2026-01-16T10:24:06.469233", - "exception": false, - "start_time": "2026-01-16T10:24:05.948560", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" + ], + "id": "657fd6ca" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Computing volume of activity using indicator: {paste(VOLUME_ACTIVITY_INDICATORS, collapse=', ')}\"))\n", - "\n", - "# Compute MEAN_REPORTED_CASES_BY_HF as total cases over months with activity\n", - "mean_monthly_cases <- dhis2_routine %>% \n", - " mutate(total_cases_by_hf_month = rowSums(across(all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>%\n", - " group_by(ADM2_ID, OU_ID) %>% \n", - " summarise(\n", - " total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE),\n", - " number_of_reporting_months = length(which(total_cases_by_hf_month > 0)),\n", - " .groups = \"drop\"\n", - " ) %>% \n", - " mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>%\n", - " select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF)\n", - "\n", - "mean_monthly_cases_adm2 <- mean_monthly_cases %>% \n", - " select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>% \n", - " group_by(ADM2_ID) %>% \n", - " summarise(SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm=TRUE), \n", - " NR_OF_HF = n())\n", - "\n", - "# Compute weights\n", - "hf_weights <- mean_monthly_cases %>% \n", - " left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", - " mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n", - "\n", - "# Join with rest of data\n", - "facility_master_routine_02 <- 
facility_master_routine_01 %>%\n", - " left_join(hf_weights %>% select(OU_ID, WEIGHT), by = c(\"OU_ID\"))" - ] - }, - { - "cell_type": "markdown", - "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:06.469622", - "exception": false, - "start_time": "2026-01-16T10:24:06.469514", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " Important: this step could have a huge influence on reporting rates!
\n", + " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
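\n", + " Activity can be evaluated within each year or across all years, depending on the grouping used: group_by(OU_ID, YEAR):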
    \n", + "
  • With YEAR → “active that year”
  • \n", + "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", + "
\n", + "
" + ], + "id": "a598e4b7" }, - "tags": [] - }, - "source": [ - "#### 3.6. Compute Weighted variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", - "metadata": { - "papermill": { - "duration": 0.483413, - "end_time": "2026-01-16T10:24:06.953139", - "exception": false, - "start_time": "2026-01-16T10:24:06.469726", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.173961, + "end_time": "2026-01-16T10:24:05.948136", + "exception": false, + "start_time": "2026-01-16T10:24:05.774175", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.4 Mark facilities active at least once per year\n", + "facility_master_routine <- facility_master_routine %>%\n", + " dplyr::group_by(OU_ID, YEAR) %>%\n", + " dplyr::mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>%\n", + " dplyr::ungroup()\n" + ], + "execution_count": null, + "outputs": [], + "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Computing weighted variables for reporting rate calculation.\"))\n", - "\n", - "facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT \n", - "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n", - "\n", - "dim(facility_master_routine_02)\n", - "head(facility_master_routine_02, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", - "metadata": { - "papermill": { - "duration": 0.000172, - 
"end_time": "2026-01-16T10:24:06.953755", - "exception": false, - "start_time": "2026-01-16T10:24:06.953583", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000098, + "end_time": "2026-01-16T10:24:05.948452", + "exception": false, + "start_time": "2026-01-16T10:24:05.948354", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.5. Compute Weighting factor based on \"volume of activity\"" + ], + "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d" }, - "tags": [] - }, - "source": [ - "#### 3.7. Aggregate data at ADM2 level" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af13191e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Aggregating data at admin level 2.\"))\n", - "\n", - "reporting_rate_adm2 <- facility_master_routine_02 %>% \n", - " group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>%\n", - " summarise(\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE), # (numerator) sum of all facilities active per PERIOD\n", - " NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE), # (denominator) sum of all facilities active at least once in the YEAR\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE),\n", - " NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE), \n", - " .groups = \"drop\")\n", - "\n", - "dim(reporting_rate_adm2)\n", - "# head(reporting_rate_adm2, 5)" - ] - }, - { - "cell_type": "markdown", - "id": "7d381937", - "metadata": {}, - "source": [ - "#### 3.8. 
Calculate Reporting Rates (all methods)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b41263f8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Calculating Reporting Rates at admin level 2. Using all methods, weighted and unweighted.\"))\n", - "\n", - "reporting_rate_adm2 <- reporting_rate_adm2 %>% \n", - " mutate(\n", - " RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2,\n", - " RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2,\n", - " RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2,\n", - " RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED,\n", - " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", - " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", - " )\n", - "\n", - "dim(reporting_rate_adm2)\n", - "head(reporting_rate_adm2, 5)" - ] - }, - { - "cell_type": "markdown", - "id": "5e593659", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:07.310579", - "exception": false, - "start_time": "2026-01-16T10:24:07.310471", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.520673, + "end_time": "2026-01-16T10:24:06.469233", + "exception": false, + "start_time": "2026-01-16T10:24:05.948560", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.5 Compute facility weights from volume of activity\n", + "mean_monthly_cases <- dhis2_routine %>%\n", + " dplyr::mutate(total_cases_by_hf_month = rowSums(dplyr::across(dplyr::all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>%\n", + " dplyr::group_by(ADM2_ID, OU_ID) %>%\n", + " dplyr::summarise(\n", + " total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE),\n", + " number_of_reporting_months = 
length(which(total_cases_by_hf_month > 0)),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " dplyr::mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>%\n", + " dplyr::select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF)\n", + "\n", + "mean_monthly_cases_adm2 <- mean_monthly_cases %>%\n", + " dplyr::select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>%\n", + " dplyr::group_by(ADM2_ID) %>%\n", + " dplyr::summarise(\n", + " SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm = TRUE),\n", + " NR_OF_HF = dplyr::n(),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "hf_weights <- mean_monthly_cases %>%\n", + " dplyr::left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", + " dplyr::mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n" + ], + "execution_count": null, + "outputs": [], + "id": "4420e559-4134-4fc3-8950-9972ebede00e" }, - "tags": [] - }, - "source": [ - "## 4. Select correct col for `REPORTING_RATE` based on denominator method" - ] - }, - { - "cell_type": "markdown", - "id": "c75f2249", - "metadata": { - "papermill": { - "duration": 5.7e-05, - "end_time": "2026-01-16T10:24:07.310743", - "exception": false, - "start_time": "2026-01-16T10:24:07.310686", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:06.469622", + "exception": false, + "start_time": "2026-01-16T10:24:06.469514", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.6. Compute Weighted variables" + ], + "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3" }, - "tags": [] - }, - "source": [ - "### 4.1. 
Select results and format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e71b38", - "metadata": { - "papermill": { - "duration": 0.020644, - "end_time": "2026-01-16T10:24:07.351317", - "exception": false, - "start_time": "2026-01-16T10:24:07.330673", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.483413, + "end_time": "2026-01-16T10:24:06.953139", + "exception": false, + "start_time": "2026-01-16T10:24:06.469726", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.6 Apply weights to monthly status variables\n", + "facility_master_routine_02 <- facility_master_routine %>%\n", + " dplyr::left_join(hf_weights %>% dplyr::select(OU_ID, WEIGHT), by = c(\"OU_ID\"))\n", + "\n", + "facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n" + ], + "execution_count": null, + "outputs": [], + "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") { \n", - " rr_column_selection <- \"RR_ACTIVE_HF\" \n", - " if (USE_WEIGHTED_REPORTING_RATES) {\n", - " rr_column_selection <- \"RR_ACTIVE_HF_W\"\n", - " }\n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " rr_column_selection <- \"RR_OPEN_HF\"\n", - " if (USE_WEIGHTED_REPORTING_RATES) {\n", - " rr_column_selection <- \"RR_OPEN_HF_W\"\n", - " }\n", - "}" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "3df36abb", - "metadata": { - "papermill": { - "duration": 0.140976, - "end_time": "2026-01-16T10:24:07.492479", - "exception": false, - "start_time": "2026-01-16T10:24:07.351503", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2026-01-16T10:24:06.953755", + "exception": false, + "start_time": "2026-01-16T10:24:06.953583", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.7. Aggregate data at ADM2 level" + ], + "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Using reporting rate column: `{rr_column_selection}` \n", - "based on DATAELEMENT_METHOD_DENOMINATOR == {DATAELEMENT_METHOD_DENOMINATOR} \n", - "and USE_WEIGHTED_REPORTING_RATES == {USE_WEIGHTED_REPORTING_RATES}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ccc272c", - "metadata": { - "papermill": { - "duration": 0.182574, - "end_time": "2026-01-16T10:24:07.675242", - "exception": false, - "start_time": "2026-01-16T10:24:07.492668", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.7 Aggregate monthly counts at ADM2 level\n", + "reporting_rate_adm2 <- facility_master_routine_02 %>%\n", + " dplyr::group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>%\n", + " dplyr::summarise(\n", + " HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE),\n", + " NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE),\n", + " NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE),\n", + " NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE),\n", + " NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = 
sum(OPEN_W, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " dplyr::mutate(\n", + " RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2,\n", + " RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2,\n", + " RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2,\n", + " RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED,\n", + " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", + " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", + " )\n" + ], + "execution_count": null, + "outputs": [], + "id": "af13191e" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Formatting table for '{DATAELEMENT_METHOD_DENOMINATOR}' selection.\"))\n", - "\n", - "# Select column and format final table\n", - "reporting_rate_dataelement <- reporting_rate_adm2 %>%\n", - " mutate(MONTH = PERIOD %% 100) %>%\n", - " rename(REPORTING_RATE = !!sym(rr_column_selection)) %>%\n", - " select(all_of(fixed_cols_rr))\n", - "\n", - "print(dim(reporting_rate_dataelement))\n", - "head(reporting_rate_dataelement, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "ca66e785", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:24:07.675637", - "exception": false, - "start_time": "2026-01-16T10:24:07.675528", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.8. Calculate Reporting Rates (all methods)" + ], + "id": "7d381937" }, - "tags": [] - }, - "source": [ - "## 5. 
Inspect reporting rate values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31535459", - "metadata": { - "papermill": { - "duration": 0.160299, - "end_time": "2026-01-16T10:24:07.836039", - "exception": false, - "start_time": "2026-01-16T10:24:07.675740", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.8 Select final reporting-rate definition for export\n", + "rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF\" else \"RR_OPEN_HF\"\n", + "if (USE_WEIGHTED_REPORTING_RATES) {\n", + " rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF_W\" else \"RR_OPEN_HF_W\"\n", + "}\n", + "\n", + "reporting_rate_dataelement <- reporting_rate_adm2 %>%\n", + " dplyr::mutate(MONTH = PERIOD %% 100) %>%\n", + " dplyr::rename(REPORTING_RATE = !!rlang::sym(rr_column_selection)) %>%\n", + " dplyr::select(YEAR, MONTH, ADM2_ID, REPORTING_RATE)\n" + ], + "execution_count": null, + "outputs": [], + "id": "b41263f8" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", - "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", - "xlab=\"REPORTING_RATE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6778f17d", - "metadata": { - "papermill": { - "duration": 0.896382, - "end_time": "2026-01-16T10:24:08.732660", - "exception": false, - "start_time": "2026-01-16T10:24:07.836278", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:07.310579", + "exception": false, + "start_time": "2026-01-16T10:24:07.310471", + "status": "completed" + }, + "tags": 
[] + }, + "source": [ + "## 4. Select correct col for `REPORTING_RATE` based on denominator method" + ], + "id": "5e593659" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Boxplot\n", - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - " geom_boxplot(outlier.alpha = 0.3) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7f013fd", - "metadata": { - "papermill": { - "duration": 0.859448, - "end_time": "2026-01-16T10:24:09.592295", - "exception": false, - "start_time": "2026-01-16T10:24:08.732847", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000057, + "end_time": "2026-01-16T10:24:07.310743", + "exception": false, + "start_time": "2026-01-16T10:24:07.310686", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 4.1. 
Select results and format" + ], + "id": "c75f2249" }, - "tags": [], - "vscode": { - "languageId": "r" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.020644, + "end_time": "2026-01-16T10:24:07.351317", + "exception": false, + "start_time": "2026-01-16T10:24:07.330673", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 4.1 Confirm which denominator/weighting option was selected\n", + "cat(glue::glue(\n", + " \"Selected denominator method: {DATAELEMENT_METHOD_DENOMINATOR} | Weighted reporting rates: {USE_WEIGHTED_REPORTING_RATES}\"\n", + "))\n" + ], + "execution_count": null, + "outputs": [], + "id": "75e71b38" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.140976, + "end_time": "2026-01-16T10:24:07.492479", + "exception": false, + "start_time": "2026-01-16T10:24:07.351503", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Output preview\n", + "dim(reporting_rate_dataelement)\n", + "head(reporting_rate_dataelement, 5)\n" + ], + "execution_count": null, + "outputs": [], + "id": "3df36abb" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.182574, + "end_time": "2026-01-16T10:24:07.675242", + "exception": false, + "start_time": "2026-01-16T10:24:07.492668", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Basic data quality checks\n", + "summary(reporting_rate_dataelement$REPORTING_RATE)\n", + "sum(is.na(reporting_rate_dataelement$REPORTING_RATE))\n" + ], + "execution_count": null, + "outputs": [], + "id": "0ccc272c" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:24:07.675637", + "exception": false, + "start_time": "2026-01-16T10:24:07.675528", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 5. 
Inspect reporting rate values" + ], + "id": "ca66e785" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.160299, + "end_time": "2026-01-16T10:24:07.836039", + "exception": false, + "start_time": "2026-01-16T10:24:07.675740", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", + "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", + "xlab=\"REPORTING_RATE\")" + ], + "execution_count": null, + "outputs": [], + "id": "31535459" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.896382, + "end_time": "2026-01-16T10:24:08.732660", + "exception": false, + "start_time": "2026-01-16T10:24:07.836278", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Boxplot\n", + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + " geom_boxplot(outlier.alpha = 0.3) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ], + "execution_count": null, + "outputs": [], + "id": "6778f17d" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.859448, + "end_time": "2026-01-16T10:24:09.592295", + "exception": false, + "start_time": "2026-01-16T10:24:08.732847", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + "# Boxplot without outliers\n", + " 
geom_boxplot(outlier.alpha = 0) +\n", + " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ], + "execution_count": null, + "outputs": [], + "id": "a7f013fd" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000088, + "end_time": "2026-01-16T10:24:09.592563", + "exception": false, + "start_time": "2026-01-16T10:24:09.592475", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 5. 📁 Export to `data/` folder" + ], + "id": "2866816a-7015-4c5c-b904-f553f3b4790d" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.919937, + "end_time": "2026-01-16T10:24:10.512602", + "exception": false, + "start_time": "2026-01-16T10:24:09.592665", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", + "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "out_msg <- paste0(\"Reporting rate dataelement saved under: \", file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\")))\n", + "\n", + "# write parquet and csv files\n", + "write_parquet(reporting_rate_dataelement, file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")))\n", + "write.csv(reporting_rate_dataelement, file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\")), row.names = FALSE)\n", + "\n", + "# log\n", + "log_msg(out_msg)\n" + ], + "execution_count": null, + "outputs": [], + "id": "bbf27852-8ec5-4370-aae2-49e082928fe1" } - }, - "outputs": [], - "source": [ - 
"ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - "# Boxplot without outliers\n", - " geom_boxplot(outlier.alpha = 0) +\n", - " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "2866816a-7015-4c5c-b904-f553f3b4790d", - "metadata": { - "papermill": { - "duration": 8.8e-05, - "end_time": "2026-01-16T10:24:09.592563", - "exception": false, - "start_time": "2026-01-16T10:24:09.592475", - "status": "completed" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "tags": [] - }, - "source": [ - "## 5. 📁 Export to `data/` folder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", - "metadata": { - "papermill": { - "duration": 0.919937, - "end_time": "2026-01-16T10:24:10.512602", - "exception": false, - "start_time": "2026-01-16T10:24:09.592665", - "status": "completed" + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, - "tags": [], - "vscode": { - "languageId": "r" + "papermill": { + "default_parameters": {}, + "duration": 81.158347, + "end_time": "2026-01-16T10:24:10.736106", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", + "output_path": 
"/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", + "parameters": { + "ACTIVITY_INDICATORS": [ + "CONF", + "PRES" + ], + "DATASET_ID": "DHIS2_OUTLIERS_IMPUTATION", + "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", + "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace", + "USE_WEIGHTED_REPORTING_RATES": true, + "VOLUME_ACTIVITY_INDICATORS": [ + "CONF", + "PRES" + ] + }, + "start_time": "2026-01-16T10:22:49.577759", + "version": "2.6.0" } - }, - "outputs": [], - "source": [ - "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", - "\n", - "# parquet\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\"))\n", - "write_parquet(reporting_rate_dataelement, file_path)\n", - "log_msg(glue(\"Exported : {file_path}\"))\n", - "\n", - "# csv\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\"))\n", - "write.csv(reporting_rate_dataelement, file_path, row.names = FALSE)\n", - "log_msg(glue(\"Exported : {file_path}\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" }, - "papermill": { - "default_parameters": {}, - "duration": 81.158347, - "end_time": "2026-01-16T10:24:10.736106", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", - "parameters": { - 
"AVAILABILITY_INDICATORS": [ - "CONF", - "PRES", - "SUSP", - "TEST" - ], - "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", - "ROUTINE_FILE": "NER_routine_outliers_removed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace", - "USE_WEIGHTED_REPORTING_RATES": true, - "VOLUME_ACTIVITY_INDICATORS": [ - "CONF", - "PRES" - ] - }, - "start_time": "2026-01-16T10:22:49.577759", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb index 1d77c2b..5410101 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb @@ -36,29 +36,19 @@ }, "outputs": [], "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataelement/reporting/outputs\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", + "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r\")\n", "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "# Load palettes\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", + "report_packages <- c(\"arrow\", \"tidyverse\", \"sf\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "snt_environment <- get_setup_variables(packages = report_packages)\n", "\n", - "# 
Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", + "CONFIG_PATH <- snt_environment$CONFIG_PATH\n", + "SNT_ROOT_PATH <- dirname(CONFIG_PATH)\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", + "REPORTING_NB_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "source(file.path(CODE_PATH, \"snt_palettes.r\"))" ] }, { @@ -97,15 +87,8 @@ }, "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + "# we should move this thing in snt_utils at some points\n", + "config_json <- load_snt_config(file.path(CONFIG_PATH, \"SNT_config.json\"))" ] }, { @@ -174,15 +157,7 @@ }, "outputs": [], "source": [ - "# Load SNT metadata\n", - "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, 
\"SNT_metadata.json\")))" + "metadata_json <- load_snt_metadata(file.path(CONFIG_PATH, \"SNT_metadata.json\"))" ] }, { @@ -279,15 +254,8 @@ "\n", "rr_filename <- glue::glue(\"{COUNTRY_CODE}_reporting_rate_dataelement.parquet\")\n", "\n", - "reporting_rate <- tryCatch({ get_latest_dataset_file_in_memory(REPORTING_RATE_DATASET_NAME, rr_filename) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading Reporting Rate (Data Element) data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", + "reporting_rate <- load_dataset_file(REPORTING_RATE_DATASET_NAME, rr_filename)\n", "\n", - "# log\n", - "log_msg(glue::glue(\"Data file `{rr_filename}` loaded from dataset: `{REPORTING_RATE_DATASET_NAME}`. Dataframe dimensions: {paste(dim(reporting_rate), collapse=', ')}\"))\n", "dim(reporting_rate)\n", "head(reporting_rate, 2)" ] @@ -329,14 +297,8 @@ }, "outputs": [], "source": [ - "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) { \n", - " msg <- paste0(COUNTRY_CODE , \" Shapes data is not available in dataset: '\" , DHIS2_FORMATTED_DATASET_NAME, \"' last version.\")\n", - " log_msg(msg, \"warning\")\n", - " shapes <- NULL\n", - " })\n", - "\n", - "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. 
\\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", + "shapes_filename <- paste0(COUNTRY_CODE, \"_shapes.geojson\")\n", + "shapes <- load_dataset_file(DHIS2_FORMATTED_DATASET_NAME, shapes_filename)\n", "names(shapes)" ] }, diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r new file mode 100644 index 0000000..c75f279 --- /dev/null +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r @@ -0,0 +1,244 @@ +# Load base utils +source(file.path("~/workspace/code", "snt_utils.r")) + + +# JSON reader for this pipeline. +read_workspace_json_file <- function(json_path, resource_label = "JSON file") { + json_path <- as.character(json_path)[[1L]] + tryCatch( + jsonlite::fromJSON(json_path), + error = function(e) { + stop(paste0( + "[ERROR] Error while loading ", + resource_label, + " from `", + json_path, + "`: ", + conditionMessage(e) + )) + } + ) +} + + +#' Get Setup Variables for SNT Workspace +#' Initializes workspace paths, loads R packages, and imports OpenHEXA SDK. +#' +#' @param SNT_ROOT_PATH Character. Root path of the SNT workspace. Default: '~/workspace' +#' @param packages Character vector. R packages to install and load. +#' @return List with `paths_to_check` plus `CONFIG_PATH`, `UPLOADS_PATH`, `DATA_PATH` +#' (use as `snt_environment$CONFIG_PATH`, same pattern as population transformation). 
+#' +#' @export +get_setup_variables <- function( + SNT_ROOT_PATH = "~/workspace", + packages = c( + "arrow", "rlang", "dplyr", "tidyr", "lubridate", "ggplot2", + "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate", "zoo" + ) +) { + paths_to_check <- list( + CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"), + UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"), + DATA_PATH = file.path(SNT_ROOT_PATH, "data") + ) + setup_variable <- c( + list(paths_to_check = paths_to_check), + paths_to_check + ) + + install_and_load(packages) + + configure_conda_r_spatial_env() + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + reticulate::py_config()$python + assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv) + + return(setup_variable) +} + + +#' Load SNT Configuration File +#' Reads and parses a JSON configuration file. +#' @param snt_config_path Character. Path to the configuration JSON file. +#' @return List containing parsed configuration. +#' +#' @export +load_snt_config <- function(snt_config_path) { + config_json <- read_workspace_json_file(snt_config_path, "configuration") + log_msg(paste0("SNT configuration loaded from: ", snt_config_path)) + return(config_json) +} + + +#' Load SNT Metadata File +#' Reads and parses `SNT_metadata.json` (or another workspace metadata JSON). +#' @param snt_metadata_path Character. Path to the metadata JSON file. +#' @return List containing parsed metadata. +#' +#' @export +load_snt_metadata <- function(snt_metadata_path) { + metadata_json <- read_workspace_json_file(snt_metadata_path, "SNT metadata") + log_msg(paste0("SNT metadata loaded from: ", snt_metadata_path)) + return(metadata_json) +} + + +#' Load Dataset File from OpenHEXA +#' Retrieves the latest version of a file from an OpenHEXA dataset. +#' +#' @param dataset_id Character. OpenHEXA dataset identifier. +#' @param filename Character. Name of file to load. +#' @param verbose Logical. 
If TRUE, log dataframe dimensions after a successful load. +#' @return Dataframe containing the loaded data. +#' +#' @export +load_dataset_file <- function(dataset_id, filename, verbose = TRUE) { + data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_id, filename) + }, + error = function(e) { + stop(glue::glue("[ERROR] Error while loading {filename} file from dataset: {dataset_id}")) + } + ) + if (verbose) { + log_msg(glue::glue( + "{filename} data loaded from dataset : {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]" + )) + } + return(data) +} + + +#' Conda-friendly defaults for PROJ/GDAL (used when reading spatial data). +configure_conda_r_spatial_env <- function() { + if (Sys.getenv("PROJ_LIB", "") == "") { + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + } + if (Sys.getenv("GDAL_DATA", "") == "") { + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + } +} + + +#' Fail if Papermill did not inject the required pipeline parameters. +assert_papermill_dataelement_params <- function() { + required_pm <- c( + "ROUTINE_FILE", + "DATASET_ID", + "DATAELEMENT_METHOD_DENOMINATOR", + "ACTIVITY_INDICATORS", + "VOLUME_ACTIVITY_INDICATORS", + "USE_WEIGHTED_REPORTING_RATES" + ) + missing_pm <- required_pm[!vapply(required_pm, exists, logical(1), inherits = TRUE)] + if (length(missing_pm) > 0) { + stop( + "[ERROR] Missing pipeline parameters (Papermill): ", + paste(missing_pm, collapse = ", ") + ) + } +} + + +activity_indicator_list_is_nonempty <- function(activity_indicators) { + length(activity_indicators) > 0L +} + + +#' Stop early if the analyst left the activity-indicator list empty. +#' @export +stop_if_activity_indicators_empty <- function(activity_indicators) { + if (!activity_indicator_list_is_nonempty(activity_indicators)) { + stop("[ERROR] No activity indicators selected; choose at least one (e.g. CONF).") + } + invisible(TRUE) +} + + +#' Return required columns that are missing from `data`. 
+#' @export +find_missing_columns <- function(data, required_columns) { + if (!is.data.frame(data)) { + stop("[ERROR] `data` must be a data.frame.") + } + required_columns <- as.character(unlist(required_columns, use.names = FALSE)) + required_columns <- required_columns[!is.na(required_columns) & nzchar(required_columns)] + required_columns <- unique(required_columns) + setdiff(required_columns, names(data)) +} + + +#' Validate that required columns exist in `data`. +#' +#' Returns missing columns invisibly. Behavior on missing columns is controlled by +#' `on_missing`: `"error"`, `"warning"`, or `"none"`. +#' @export +validate_required_columns <- function( + data, + required_columns, + data_label = "data", + on_missing = c("error", "warning", "none") +) { + on_missing <- match.arg(on_missing) + missing_columns <- find_missing_columns(data, required_columns) + if (length(missing_columns) == 0L) { + return(invisible(character(0))) + } + + msg <- glue::glue( + "{data_label} missing required column(s): {paste(missing_columns, collapse = ', ')}" + ) + + if (on_missing == "error") { + log_msg(paste0("[ERROR] ", msg), "error") + stop(paste0("[ERROR] ", msg)) + } + if (on_missing == "warning") { + log_msg(paste0("Warning: ", msg), "warning") + } + invisible(missing_columns) +} + + +#' First / last PERIOD in routine and full vector of YYYYMM months in between. +#' @export +summarize_routine_period_range_as_month_vector <- function(dhis2_routine) { + period_start <- min(dhis2_routine$PERIOD, na.rm = TRUE) + period_end <- max(dhis2_routine$PERIOD, na.rm = TRUE) + pv <- format( + seq(lubridate::ym(period_start), lubridate::ym(period_end), by = "month"), + "%Y%m" + ) + list( + PERIOD_START = period_start, + PERIOD_END = period_end, + period_vector = pv + ) +} + + +#' Pyramid table crossed with every month in the routine period (facility master for RR). 
+#' @export +build_facilities_crossed_with_monthly_periods <- function( + dhis2_pyramid_formatted, + period_vector, + config_json, + ADMIN_1, + ADMIN_2 +) { + dhis2_pyramid_formatted %>% + dplyr::rename( + OU_ID = glue::glue("LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID"), + OU_NAME = glue::glue("LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_NAME"), + ADM2_ID = stringr::str_replace(ADMIN_2, "NAME", "ID"), + ADM2_NAME = dplyr::all_of(ADMIN_2), + ADM1_ID = stringr::str_replace(ADMIN_1, "NAME", "ID"), + ADM1_NAME = dplyr::all_of(ADMIN_1) + ) %>% + dplyr::select(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, OPENING_DATE, CLOSED_DATE) %>% + dplyr::distinct() %>% + tidyr::crossing(PERIOD = period_vector) %>% + dplyr::mutate(PERIOD = as.numeric(PERIOD)) +} diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index dcac610..9450d74 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -1,1205 +1,1048 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "30bf8dfc", - "metadata": {}, - "source": [ - "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", - "\n", - "The **reporting rate** measures the proportion of registered health facilities that submit data. It is calculated for each administrative level 2 (`ADM2`) area and for each reporting period (`PERIOD` in YYYYMM format).\n", - "
\n", - "\n", - "**Dataset Selection**
\n", - "The choice of dataset(s) used for reporting rate calculation is controlled by modifying the SNT_config.json configuration file. This allows flexible selection among multiple datasets extracted from the same DHIS2 instance.\n", - "\n", - "**Calculation Logic**
\n", - "From the selected dataset(s):\n", - "- **Numerator:** Number of facilities that _actually_ reported, derived from the element \"ACTUAL_REPORTS\".\n", - "- **Denominator:** Number of facilities _expected_ to report, derived from the element \"EXPECTED_REPORTS\".\n", - "\n", - "After aggregating these counts at the ADM2 level, the reporting rate is computed as:\n", - "
\n", - "REPORTING RATE = ACTUAL_REPORTS / EXPECTED_REPORTS\n", - "
\n", - "and expressed as a **proportion** between 0 and 1.\n", - "
\n", - "\n", - "-----\n", - "\n", - "### Additional Data Processing Steps\n", - "\n", - "- **Handling Multiple Datasets:** \n", - " When multiple datasets are available, the pipeline uses only those specified in SNT_config.json. For these selected datasets, the counts of actual and expected reports are summed by ADM2 area.\n", - "\n", - "- **Deduplication of Entries:** \n", - " Sometimes, the same organizational unit (OU_ID) may appear in multiple datasets for the same period, risking double counting. To address this, deduplication is performed by keeping only the entry with the **highest** ACTUAL_REPORTS value for each unique combination of OU_ID and PERIOD. \n", - "
    \n", - "
  • Why keep the highest? Because ACTUAL_REPORTS values are binary (0 or 1). If duplicates agree (all 0 or all 1), keeping one suffices. If they differ (some 0, some 1), keeping the 1 ensures that presence of a report is not missed.
  • \n", - "
  • 🚨Important: Deduplication only proceeds if all duplicated values are within {0,1}. If other values are present, deduplication is skipped with a warning to avoid incorrect data handling.
  • \n", - "
\n", - "\n", - "-----\n", - "\n", - "\n", - "### 🇳🇪 Niger-Specific Processing: \n", - " In Niger, datasets for HOP (hospital) facilities are already **pre-aggregated** and may contain values greater than 1 for actual or expected reports, reflecting subunits or departments within a hospital. \n", - "
\n", - " To accurately represent reporting at the facility level and avoid overcounting, all values greater than 1 are converted to 1 (presence/absence). This ensures that the reporting rate reflects whether the hospital as a whole reported, rather than counting multiple subunits separately. This step also prevents cases where ACTUAL_REPORTS exceeds EXPECTED_REPORTS.\n", - "\n", - "------\n", - "\n", - "### Pipeline parameters\n", - "\n", - "- **Routine data source**: Select the routine dataset variant used for reporting rate computation.\n", - "\n", - "- **`raw`**: Loads routine data from the formatted dataset.\n", - "\n", - "- **`imputed`**: Loads routine data from the outliers dataset using imputed values.\n", - "\n", - "- **`outliers_removed`**: Loads routine data from the outliers dataset after outliers removal." - ] - }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 9.2e-05, - "end_time": "2025-12-19T10:21:50.273573", - "exception": false, - "start_time": "2025-12-19T10:21:50.273481", - "status": "completed" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", + "\n", + "The **reporting rate** measures the proportion of registered health facilities that submit data. It is calculated for each administrative level 2 (`ADM2`) area and for each reporting period (`PERIOD` in YYYYMM format).\n", + "
\n", + "\n", + "**Dataset Selection**
\n", + "The choice of dataset(s) used for reporting rate calculation is controlled by modifying the SNT_config.json configuration file. This allows flexible selection among multiple datasets extracted from the same DHIS2 instance.\n", + "\n", + "**Calculation Logic**
\n", + "From the selected dataset(s):\n", + "- **Numerator:** Number of facilities that _actually_ reported, derived from the element \"ACTUAL_REPORTS\".\n", + "- **Denominator:** Number of facilities _expected_ to report, derived from the element \"EXPECTED_REPORTS\".\n", + "\n", + "After aggregating these counts at the ADM2 level, the reporting rate is computed as:\n", + "
\n", + "REPORTING RATE = ACTUAL_REPORTS / EXPECTED_REPORTS\n", + "
\n", + "and expressed as a **proportion** between 0 and 1.\n", + "
\n", + "\n", + "-----\n", + "\n", + "### Additional Data Processing Steps\n", + "\n", + "- **Handling Multiple Datasets:** \n", + " When multiple datasets are available, the pipeline uses only those specified in SNT_config.json. For these selected datasets, the counts of actual and expected reports are summed by ADM2 area.\n", + "\n", + "- **Deduplication of Entries:** \n", + " Sometimes, the same organizational unit (OU_ID) may appear in multiple datasets for the same period, risking double counting. To address this, deduplication is performed by keeping only the entry with the **highest** ACTUAL_REPORTS value for each unique combination of OU_ID and PERIOD. \n", + "
    \n", + "
  • Why keep the highest? Because ACTUAL_REPORTS values are binary (0 or 1). If duplicates agree (all 0 or all 1), keeping one suffices. If they differ (some 0, some 1), keeping the 1 ensures that presence of a report is not missed.
  • \n", + "
  • 🚨Important: Deduplication only proceeds if all duplicated values are within {0,1}. If other values are present, deduplication is skipped with a warning to avoid incorrect data handling.
  • \n", + "
\n", + "\n", + "-----\n", + "\n", + "\n", + "### Pipeline parameters\n", + "\n", + "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill and validated in setup section 1.2.\n", + "\n", + "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. Choose \"Routine data (Raw)\" to use raw routine data.\n", + " \n", + "- **Use routine with outliers removed**: Toggle this on to use the routine data after outliers have been removed (using the outliers detection method selected above). Otherwise, this pipeline uses the imputed routine data (in which the removed outlier values are replaced), or the raw routine data if you selected \"Routine data (Raw)\" as the **Outliers detection method**." + ], + "id": "30bf8dfc" }, - "tags": [] - }, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:21:50.332786Z", - "iopub.status.busy": "2025-12-19T10:21:50.277536Z", - "iopub.status.idle": "2025-12-19T10:23:03.339080Z", - "shell.execute_reply": "2025-12-19T10:23:03.336413Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000092, + "end_time": "2025-12-19T10:21:50.273573", + "exception": false, + "start_time": "2025-12-19T10:21:50.273481", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 1. 
Setup" + ], + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a" }, - "papermill": { - "duration": 73.068006, - "end_time": "2025-12-19T10:23:03.341764", - "exception": false, - "start_time": "2025-12-19T10:21:50.273758", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:21:50.332786Z", + "iopub.status.busy": "2025-12-19T10:21:50.277536Z", + "iopub.status.idle": "2025-12-19T10:23:03.339080Z", + "shell.execute_reply": "2025-12-19T10:23:03.336413Z" + }, + "papermill": { + "duration": 73.068006, + "end_time": "2025-12-19T10:23:03.341764", + "exception": false, + "start_time": "2025-12-19T10:21:50.273758", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r\")\n", + "snt_environment <- get_setup_variables()\n" + ], + "execution_count": null, + "outputs": [], + "id": "35ede7cf-257f-439c-a514-26a7290f881d" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\") \n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": 
"7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 0.00017, - "end_time": "2025-12-19T10:23:03.342235", - "exception": false, - "start_time": "2025-12-19T10:23:03.342065", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.00017, + "end_time": "2025-12-19T10:23:03.342235", + "exception": false, + "start_time": "2025-12-19T10:23:03.342065", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.1. Load and check `config_json` file" + ], + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be" }, - "tags": [] - }, - "source": [ - "#### 1.1. Load and check `config_json` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:03.351367Z", - "iopub.status.busy": "2025-12-19T10:23:03.348819Z", - "iopub.status.idle": "2025-12-19T10:23:03.979814Z", - "shell.execute_reply": "2025-12-19T10:23:03.976617Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:03.351367Z", + "iopub.status.busy": "2025-12-19T10:23:03.348819Z", + "iopub.status.idle": "2025-12-19T10:23:03.979814Z", + "shell.execute_reply": "2025-12-19T10:23:03.976617Z" + }, + "papermill": { + "duration": 0.640406, + "end_time": "2025-12-19T10:23:03.982829", + "exception": false, + "start_time": "2025-12-19T10:23:03.342423", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "config_json <- load_snt_config(file.path(snt_environment$CONFIG_PATH, \"SNT_config.json\"))" + ], + "execution_count": null, + "outputs": [], + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f" }, - "papermill": { - "duration": 0.640406, - "end_time": "2025-12-19T10:23:03.982829", - "exception": false, - "start_time": "2025-12-19T10:23:03.342423", - "status": "completed" + { + "cell_type": "code", + "metadata": { + 
"execution": { + "iopub.execute_input": "2025-12-19T10:23:03.987632Z", + "iopub.status.busy": "2025-12-19T10:23:03.985301Z", + "iopub.status.idle": "2025-12-19T10:23:04.011308Z", + "shell.execute_reply": "2025-12-19T10:23:04.008941Z" + }, + "papermill": { + "duration": 0.031002, + "end_time": "2025-12-19T10:23:04.014107", + "exception": false, + "start_time": "2025-12-19T10:23:03.983105", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "stop_if_dataset_reporting_papermill_params_missing()\n", + "\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID\n", + "fixed_cols_rr <- c(\"YEAR\", \"MONTH\", \"ADM2_ID\", \"REPORTING_RATE\")\n" + ], + "execution_count": null, + "outputs": [], + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:03.987632Z", - "iopub.status.busy": "2025-12-19T10:23:03.985301Z", - "iopub.status.idle": "2025-12-19T10:23:04.011308Z", - "shell.execute_reply": "2025-12-19T10:23:04.008941Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.00015, + "end_time": 
"2025-12-19T10:23:04.014523", + "exception": false, + "start_time": "2025-12-19T10:23:04.014373", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.2. Config + Papermill\n", + "Validate Papermill inputs, then assign country/admin/product and fixed reporting-rate columns explicitly from `config_json`." + ], + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" }, - "papermill": { - "duration": 0.031002, - "end_time": "2025-12-19T10:23:04.014107", - "exception": false, - "start_time": "2025-12-19T10:23:03.983105", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000144, + "end_time": "2025-12-19T10:23:04.043066", + "exception": false, + "start_time": "2025-12-19T10:23:04.042922", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.3. 🔍 Check REPORTING_RATE_PRODUCT_ID is configured" + ], + "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# Which reporting rate PRODUCT_UID to use (DHIS2 dataset id)\n", - "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID \n", - "\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ] - }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00015, - "end_time": "2025-12-19T10:23:04.014523", - "exception": false, - "start_time": "2025-12-19T10:23:04.014373", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3. 
Validate reporting-rate product configuration" + ], + "id": "682a62d5" }, - "tags": [] - }, - "source": [ - "#### 1.2. Validate parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.019283Z", - "iopub.status.busy": "2025-12-19T10:23:04.017257Z", - "iopub.status.idle": "2025-12-19T10:23:04.039652Z", - "shell.execute_reply": "2025-12-19T10:23:04.037292Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.047782Z", + "iopub.status.busy": "2025-12-19T10:23:04.045631Z", + "iopub.status.idle": "2025-12-19T10:23:04.545551Z", + "shell.execute_reply": "2025-12-19T10:23:04.542372Z" + }, + "papermill": { + "duration": 0.505908, + "end_time": "2025-12-19T10:23:04.549148", + "exception": false, + "start_time": "2025-12-19T10:23:04.043240", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", + "if (is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", + " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. \n", + " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "7469898d" }, - "papermill": { - "duration": 0.02788, - "end_time": "2025-12-19T10:23:04.042642", - "exception": false, - "start_time": "2025-12-19T10:23:04.014762", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000139, + "end_time": "2025-12-19T10:23:04.549558", + "exception": false, + "start_time": "2025-12-19T10:23:04.549419", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 2. 
Load Data" + ], + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# default: raw routine\n", - "if (!exists(\"ROUTINE_FILE\")) { ROUTINE_FILE <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\") }" - ] - }, - { - "cell_type": "markdown", - "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650", - "metadata": { - "papermill": { - "duration": 0.000144, - "end_time": "2025-12-19T10:23:04.043066", - "exception": false, - "start_time": "2025-12-19T10:23:04.042922", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000152, + "end_time": "2025-12-19T10:23:04.549924", + "exception": false, + "start_time": "2025-12-19T10:23:04.549772", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.1. Load routine data (DHIS2) \n", + "Already formatted routine data, we use this as the master table
\n", + "(only used at the very end before exporting the table)" + ], + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532" }, - "tags": [] - }, - "source": [ - "#### 1.3. 🔍 Check REPORTING_RATE_PRODUCT_ID is configured" - ] - }, - { - "cell_type": "markdown", - "id": "682a62d5", - "metadata": {}, - "source": [ - "### 🐍 This probably to be moved to pipeline.py code?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7469898d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.047782Z", - "iopub.status.busy": "2025-12-19T10:23:04.045631Z", - "iopub.status.idle": "2025-12-19T10:23:04.545551Z", - "shell.execute_reply": "2025-12-19T10:23:04.542372Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.554212Z", + "iopub.status.busy": "2025-12-19T10:23:04.552423Z", + "iopub.status.idle": "2025-12-19T10:23:05.773324Z", + "shell.execute_reply": "2025-12-19T10:23:05.771316Z" + }, + "papermill": { + "duration": 1.225668, + "end_time": "2025-12-19T10:23:05.775768", + "exception": false, + "start_time": "2025-12-19T10:23:04.550100", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhis2_routine <- load_dataset_file(DATASET_ID, ROUTINE_FILE)\n", + "dhis2_routine <- dhis2_routine %>%\n", + " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric)) %>%\n", + " dplyr::select(dplyr::any_of(fixed_cols_rr)) %>%\n", + " dplyr::distinct()\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 3)\n" + ], + "execution_count": null, + "outputs": [], + "id": "a1213723-f7e2-4238-9f37-f1795b187232" }, - "papermill": { - "duration": 0.505908, - "end_time": "2025-12-19T10:23:04.549148", - "exception": false, - "start_time": "2025-12-19T10:23:04.043240", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000155, + "end_time": "2025-12-19T10:23:05.776205", + "exception": false, + 
"start_time": "2025-12-19T10:23:05.776050", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.2. Load Reporting Rate data (DHIS2)" + ], + "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", - "if (is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", - " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. \n", - " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 0.000139, - "end_time": "2025-12-19T10:23:04.549558", - "exception": false, - "start_time": "2025-12-19T10:23:04.549419", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:05.780487Z", + "iopub.status.busy": "2025-12-19T10:23:05.778651Z", + "iopub.status.idle": "2025-12-19T10:23:07.096742Z", + "shell.execute_reply": "2025-12-19T10:23:07.094774Z" + }, + "papermill": { + "duration": 1.322737, + "end_time": "2025-12-19T10:23:07.099136", + "exception": false, + "start_time": "2025-12-19T10:23:05.776399", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "formatting_dataset_id <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "reporting_parquet_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", + "\n", + "dhis2_reporting <- load_dataset_file(formatting_dataset_id, reporting_parquet_name)\n", + "dhis2_reporting <- dhis2_reporting %>%\n", + " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", + "head(dhis2_reporting, 3)\n" + ], + "execution_count": null, + "outputs": [], + "id": 
"0e352c76-f2fb-43ba-b85d-391d808057a8" }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 0.000152, - "end_time": "2025-12-19T10:23:04.549924", - "exception": false, - "start_time": "2025-12-19T10:23:04.549772", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000151, + "end_time": "2025-12-19T10:23:07.099531", + "exception": false, + "start_time": "2025-12-19T10:23:07.099380", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 3. Transform reporting data" + ], + "id": "4d5f398b" }, - "tags": [] - }, - "source": [ - "### 2.1. Load routine data (DHIS2) \n", - "Already formatted routine data, we use this as the master table
\n", - "(only used at the very end before exporting the table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.554212Z", - "iopub.status.busy": "2025-12-19T10:23:04.552423Z", - "iopub.status.idle": "2025-12-19T10:23:05.773324Z", - "shell.execute_reply": "2025-12-19T10:23:05.771316Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.0001, + "end_time": "2025-12-19T10:23:07.099849", + "exception": false, + "start_time": "2025-12-19T10:23:07.099749", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.1. Filter Reporting Rate data by \"Dataset\" (`PRODUCT_UID`)\n", + "Logic:\n", + "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", + "* If none provided (**empty** field) skip filtering and **keep everything**" + ], + "id": "adcbee0b" }, - "papermill": { - "duration": 1.225668, - "end_time": "2025-12-19T10:23:05.775768", - "exception": false, - "start_time": "2025-12-19T10:23:04.550100", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:07.104617Z", + "iopub.status.busy": "2025-12-19T10:23:07.102475Z", + "iopub.status.idle": "2025-12-19T10:23:08.406561Z", + "shell.execute_reply": "2025-12-19T10:23:08.404419Z" + }, + "papermill": { + "duration": 1.309322, + "end_time": "2025-12-19T10:23:08.409343", + "exception": false, + "start_time": "2025-12-19T10:23:07.100021", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.1 Filter Reporting Rate data by selected dataset PRODUCT_UID(s)\n", + "if (length(REPORTING_RATE_PRODUCT_ID) > 0 && all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", + " dhis2_reporting <- dhis2_reporting %>% dplyr::filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", + "} else if 
(length(REPORTING_RATE_PRODUCT_ID) > 0) {\n", + " log_msg(glue::glue(\n", + " \"?? Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data. Skipping filtering.\"\n", + " ), level = \"warning\")\n", + "}\n", + "\n", + "# 3.2 Pivot wider on PRODUCT_METRIC\n", + "dhis2_reporting_wide <- dhis2_reporting %>%\n", + " tidyr::pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", + "\n", + "# 3.3 Detect duplicated OU_ID / PERIOD combinations across datasets\n", + "dupl_ou_period <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(OU_ID, PERIOD) %>%\n", + " dplyr::filter(dplyr::n() > 1) %>%\n", + " dplyr::ungroup() %>%\n", + " dplyr::select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, dplyr::ends_with(\"REPORTS\"))\n", + "\n", + "# If duplicates are binary reports (0/1), keep the row where ACTUAL_REPORTS is maximal\n", + "if (nrow(dupl_ou_period) > 0 &&\n", + " all(dupl_ou_period$ACTUAL_REPORTS %in% c(0, 1), na.rm = TRUE) &&\n", + " all(dupl_ou_period$EXPECTED_REPORTS %in% c(0, 1), na.rm = TRUE)) {\n", + "\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(PERIOD, OU_ID) %>%\n", + " dplyr::mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", + " dplyr::ungroup() %>%\n", + " dplyr::filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", + " dplyr::select(-ACTUAL_REPORTS_deduplicated)\n", + "}\n", + "\n", + "# Country-specific normalization for Niger where reports can exceed 1\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " log_msg(\"Special handling for NER: capping ACTUAL_REPORTS and EXPECTED_REPORTS values above 1.\")\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " dplyr::mutate(\n", + " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", + " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", + " )\n", + "}\n", 
+ "\n", + "# 3.4 Aggregate at ADM2 and compute reporting rate\n", + "reporting_rate_results <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(PERIOD, YEAR, MONTH, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID) %>%\n", + " dplyr::summarise(\n", + " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", + " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " dplyr::mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n" + ], + "execution_count": null, + "outputs": [], + "id": "795a5e74" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# select dataset\n", - "if (ROUTINE_FILE == glue::glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "} else {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "}\n", - "\n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", - "\n", - "# Subset data to keep only columns defined in fixed_cols_rr (if defined)\n", - "if (exists(\"fixed_cols_rr\")) {\n", - " dhis2_routine <- dhis2_routine %>% \n", - " select(any_of(fixed_cols_rr)) |> \n", - " distinct()\n", - "}\n", - "\n", - "# log\n", - "log_msg(glue::glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 3)" - ] - }, - { - "cell_type": "markdown", - "id": 
"dccc8626-7798-4bcd-ae5f-d7502dfdc452", - "metadata": { - "papermill": { - "duration": 0.000155, - "end_time": "2025-12-19T10:23:05.776205", - "exception": false, - "start_time": "2025-12-19T10:23:05.776050", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000133, + "end_time": "2025-12-19T10:23:08.409660", + "exception": false, + "start_time": "2025-12-19T10:23:08.409527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.2. Pivot wider" + ], + "id": "4237408a" }, - "tags": [] - }, - "source": [ - "### 2.2. Load Reporting Rate data (DHIS2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e352c76-f2fb-43ba-b85d-391d808057a8", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:05.780487Z", - "iopub.status.busy": "2025-12-19T10:23:05.778651Z", - "iopub.status.idle": "2025-12-19T10:23:07.096742Z", - "shell.execute_reply": "2025-12-19T10:23:07.094774Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:08.413415Z", + "iopub.status.busy": "2025-12-19T10:23:08.411805Z", + "iopub.status.idle": "2025-12-19T10:23:08.884793Z", + "shell.execute_reply": "2025-12-19T10:23:08.880916Z" + }, + "papermill": { + "duration": 0.479538, + "end_time": "2025-12-19T10:23:08.889341", + "exception": false, + "start_time": "2025-12-19T10:23:08.409803", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.2 Quick check after pivot\n", + "dim(dhis2_reporting_wide)\n", + "head(dhis2_reporting_wide, 3)\n" + ], + "execution_count": null, + "outputs": [], + "id": "5c3b9a65" }, - "papermill": { - "duration": 1.322737, - "end_time": "2025-12-19T10:23:07.099136", - "exception": false, - "start_time": "2025-12-19T10:23:05.776399", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000186, + "end_time": 
"2025-12-19T10:23:08.889829", + "exception": false, + "start_time": "2025-12-19T10:23:08.889643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 👯 Handle **duplicated** values (`OU_ID`)\n", + "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. Namely, there should be no \"dupliacted\" orgunits that are counted in more than one dataset! Else, we would be **double counting**." + ], + "id": "0f485148" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\") # reporting rate file\n", - "\n", - "# Load file from dataset\n", - "dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 dataset reporting rates file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "dhis2_reporting <- dhis2_reporting %>% mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric)) # numeric values\n", - "\n", - "msg <- paste0(\"DHIS2 Datatset reporting data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). 
\n", - "Dataframe dimensions: \", \n", - " paste(dim(dhis2_reporting), collapse=\", \"))\n", - "log_msg(msg)\n", - "head(dhis2_reporting, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "4d5f398b", - "metadata": { - "papermill": { - "duration": 0.000151, - "end_time": "2025-12-19T10:23:07.099531", - "exception": false, - "start_time": "2025-12-19T10:23:07.099380", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000122, + "end_time": "2025-12-19T10:23:08.890157", + "exception": false, + "start_time": "2025-12-19T10:23:08.890035", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Check for duplicated values (`OU_ID`)" + ], + "id": "55dececa" }, - "tags": [] - }, - "source": [ - "## 3. Transform reporting data" - ] - }, - { - "cell_type": "markdown", - "id": "adcbee0b", - "metadata": { - "papermill": { - "duration": 0.0001, - "end_time": "2025-12-19T10:23:07.099849", - "exception": false, - "start_time": "2025-12-19T10:23:07.099749", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:08.899486Z", + "iopub.status.busy": "2025-12-19T10:23:08.894706Z", + "iopub.status.idle": "2025-12-19T10:23:09.476248Z", + "shell.execute_reply": "2025-12-19T10:23:09.470283Z" + }, + "papermill": { + "duration": 0.590832, + "end_time": "2025-12-19T10:23:09.481144", + "exception": false, + "start_time": "2025-12-19T10:23:08.890312", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Count duplicated OU_ID/PERIOD combinations found\n", + "cat(glue::glue(\"Duplicated OU_ID-PERIOD rows detected: {nrow(dupl_ou_period)}\"))\n", + "head(dupl_ou_period, 5)\n" + ], + "execution_count": null, + "outputs": [], + "id": "d761bd15" }, - "tags": [] - }, - "source": [ - "### 3.1. 
Filter Reporting Rate data by \"Dataset\" (`PRODUCT_UID`)\n", - "Logic:\n", - "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", - "* If none provided (**empty** field) skip filtering and **keep everything**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "795a5e74", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:07.104617Z", - "iopub.status.busy": "2025-12-19T10:23:07.102475Z", - "iopub.status.idle": "2025-12-19T10:23:08.406561Z", - "shell.execute_reply": "2025-12-19T10:23:08.404419Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000139, + "end_time": "2025-12-19T10:23:09.481549", + "exception": false, + "start_time": "2025-12-19T10:23:09.481410", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "#### Remove duplicated OU_IDs (shared across PRODUCT_UIDs)\n", + "Logic: \n", + "1. Identify if any `OU_ID` is present in both datasets\n", + "2. 
For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", + " * if both have the same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", + " * if the values differ, meaning that one dataset says 1 and the other 0 => keep 1 (facility _did_ report)" + ], + "id": "805ed555" }, - "papermill": { - "duration": 1.309322, - "end_time": "2025-12-19T10:23:08.409343", - "exception": false, - "start_time": "2025-12-19T10:23:07.100021", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:09.488856Z", + "iopub.status.busy": "2025-12-19T10:23:09.484674Z", + "iopub.status.idle": "2025-12-19T10:23:13.563200Z", + "shell.execute_reply": "2025-12-19T10:23:13.559294Z" + }, + "papermill": { + "duration": 4.086946, + "end_time": "2025-12-19T10:23:13.568699", + "exception": false, + "start_time": "2025-12-19T10:23:09.481753", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Verify deduplication effect at OU_ID/PERIOD level\n", + "dupl_after_cleaning <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(OU_ID, PERIOD) %>%\n", + " dplyr::filter(dplyr::n() > 1) %>%\n", + " dplyr::ungroup()\n", + "cat(glue::glue(\"Remaining duplicated OU_ID-PERIOD rows after cleaning: {nrow(dupl_after_cleaning)}\"))\n" + ], + "execution_count": null, + "outputs": [], + "id": "593b013a" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if REPORTING_RATE_PRODUCT_ID present in the data: if yes, filter to keep only those, else skip filtering (keep all) and log a warning\n", - "if (all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", - " dhis2_reporting <- dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", - " log_msg(glue::glue(\"🪮 Filtering DHIS2 reporting data to keep only values for REPORTING_RATE_PRODUCT_UID(s): 
{paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')}.\n", - " Removed {nrow(dhis2_reporting) - nrow(dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID))} rows.\n", - " Dataframe dimensions after filtering: {paste(dim(dhis2_reporting), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data PRODUCT_UIDs: {paste(unique(dhis2_reporting$PRODUCT_UID), collapse=', ')}. \n", - " 🦘 Skipping filtering and keeping all data. Dataframe dimensions: {paste(dim(dhis2_reporting), collapse=', ')}\"), level = \"warning\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "4237408a", - "metadata": { - "papermill": { - "duration": 0.000133, - "end_time": "2025-12-19T10:23:08.409660", - "exception": false, - "start_time": "2025-12-19T10:23:08.409527", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:13.581200Z", + "iopub.status.busy": "2025-12-19T10:23:13.574942Z", + "iopub.status.idle": "2025-12-19T10:23:18.911910Z", + "shell.execute_reply": "2025-12-19T10:23:18.907746Z" + }, + "papermill": { + "duration": 5.346749, + "end_time": "2025-12-19T10:23:18.915815", + "exception": false, + "start_time": "2025-12-19T10:23:13.569066", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Optional inspection of cleaned rows\n", + "head(dhis2_reporting_wide, 5)\n" + ], + "execution_count": null, + "outputs": [], + "id": "c72bd93a" }, - "tags": [] - }, - "source": [ - "### 3.2. 
Pivot wider" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c3b9a65", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:08.413415Z", - "iopub.status.busy": "2025-12-19T10:23:08.411805Z", - "iopub.status.idle": "2025-12-19T10:23:08.884793Z", - "shell.execute_reply": "2025-12-19T10:23:08.880916Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000236, + "end_time": "2025-12-19T10:23:18.916421", + "exception": false, + "start_time": "2025-12-19T10:23:18.916185", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.3. ACTUAL / EXPECTED summaries after cleaning\n", + "Niger-specific capping (values > 1 set to 1) is applied above when `COUNTRY_CODE == \"NER\"`. This cell only prints `summary()` for quick QC on all countries." + ], + "id": "2f26c614" }, - "papermill": { - "duration": 0.479538, - "end_time": "2025-12-19T10:23:08.889341", - "exception": false, - "start_time": "2025-12-19T10:23:08.409803", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:18.924306Z", + "iopub.status.busy": "2025-12-19T10:23:18.920810Z", + "iopub.status.idle": "2025-12-19T10:23:19.482033Z", + "shell.execute_reply": "2025-12-19T10:23:19.479013Z" + }, + "papermill": { + "duration": 0.56938, + "end_time": "2025-12-19T10:23:19.486133", + "exception": false, + "start_time": "2025-12-19T10:23:18.916753", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "summary(dhis2_reporting_wide$ACTUAL_REPORTS)\n", + "summary(dhis2_reporting_wide$EXPECTED_REPORTS)\n" + ], + "execution_count": null, + "outputs": [], + "id": "4118991c" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Pivot wider to have one column per PRODUCT_METRIC (which now indicates whether the VALUE is \"ACTUAL_REPORTS\" or \"EXPECTED_REPORTS\")\n", - 
"dhis2_reporting_wide <- dhis2_reporting %>%\n", - " pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", - "\n", - "# Log msg\n", - "log_msg(glue::glue(\"Pivoted DHIS2 reporting data to wide format, with one column per PRODUCT_METRIC (ACTUAL_REPORTS, EXPECTED_REPORTS).\n", - "Dimensions after pivot: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "0f485148", - "metadata": { - "papermill": { - "duration": 0.000186, - "end_time": "2025-12-19T10:23:08.889829", - "exception": false, - "start_time": "2025-12-19T10:23:08.889643", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2025-12-19T10:23:19.486674", + "exception": false, + "start_time": "2025-12-19T10:23:19.486502", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.4. Aggregate at ADM2 level" + ], + "id": "066319a3" }, - "tags": [] - }, - "source": [ - "### 👯 Handle **duplicated** values (`OU_ID`)\n", - "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. Namely, there should be no \"dupliacted\" orgunits that are counted in more than one dataset! Else, we would be **double counting**." 
- ] - }, - { - "cell_type": "markdown", - "id": "55dececa", - "metadata": { - "papermill": { - "duration": 0.000122, - "end_time": "2025-12-19T10:23:08.890157", - "exception": false, - "start_time": "2025-12-19T10:23:08.890035", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:19.494212Z", + "iopub.status.busy": "2025-12-19T10:23:19.491141Z", + "iopub.status.idle": "2025-12-19T10:23:19.791631Z", + "shell.execute_reply": "2025-12-19T10:23:19.786378Z" + }, + "papermill": { + "duration": 0.308903, + "end_time": "2025-12-19T10:23:19.795888", + "exception": false, + "start_time": "2025-12-19T10:23:19.486985", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.4 Aggregate table preview\n", + "dim(reporting_rate_results)\n", + "head(reporting_rate_results, 5)\n" + ], + "execution_count": null, + "outputs": [], + "id": "e94eeddd" }, - "tags": [] - }, - "source": [ - "#### Check for duplicated values (`OU_ID`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d761bd15", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:08.899486Z", - "iopub.status.busy": "2025-12-19T10:23:08.894706Z", - "iopub.status.idle": "2025-12-19T10:23:09.476248Z", - "shell.execute_reply": "2025-12-19T10:23:09.470283Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000151, + "end_time": "2025-12-19T10:23:19.796350", + "exception": false, + "start_time": "2025-12-19T10:23:19.796199", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.5. Calculate REPORTING_RATE\n", + "**numerator**: `ACTUAL_REPORTS`
\n", + "**denominator**: `EXPECTED_REPORTS`" + ], + "id": "eb181891" }, - "papermill": { - "duration": 0.590832, - "end_time": "2025-12-19T10:23:09.481144", - "exception": false, - "start_time": "2025-12-19T10:23:08.890312", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:19.803233Z", + "iopub.status.busy": "2025-12-19T10:23:19.799996Z", + "iopub.status.idle": "2025-12-19T10:23:19.994060Z", + "shell.execute_reply": "2025-12-19T10:23:19.991575Z" + }, + "papermill": { + "duration": 0.200465, + "end_time": "2025-12-19T10:23:19.997024", + "exception": false, + "start_time": "2025-12-19T10:23:19.796559", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.5 Reporting rate range check\n", + "summary(reporting_rate_results$REPORTING_RATE)\n" + ], + "execution_count": null, + "outputs": [], + "id": "e90a1c20" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if any OU_ID is present in more than one PRODUCT_UID\n", - "# and if so list them\n", - "ou_product_counts <- dhis2_reporting %>%\n", - " group_by(OU_ID, OU_NAME) %>%\n", - " mutate(PRODUCT_UID_count = n_distinct(PRODUCT_UID)) %>%\n", - " filter(PRODUCT_UID_count > 1) %>%\n", - " select(ADM1_NAME, ADM2_NAME, OU_ID, OU_NAME, PRODUCT_UID_count) %>%\n", - " distinct() \n", - "\n", - "ou_product_counts\n", - "\n", - "# Log msg: which OU_ID have multiple PRODUCT_UIDs\n", - "if (nrow(ou_product_counts) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The following OU_IDs are associated with multiple PRODUCT_UIDs in the DHIS2 reporting data:\n", - "{paste(apply(ou_product_counts, 1, function(row) paste0(' - ', row['OU_NAME'], ' (', row['OU_ID'], ')')), collapse='\\n')}\"), \n", - " level = \"warning\")\n", - "} else {\n", - " log_msg(\"All OU_IDs are associated with a single PRODUCT_UID in the DHIS2 reporting data.\")\n", - "}" - ] - }, - { - 
"cell_type": "markdown", - "id": "805ed555", - "metadata": { - "papermill": { - "duration": 0.000139, - "end_time": "2025-12-19T10:23:09.481549", - "exception": false, - "start_time": "2025-12-19T10:23:09.481410", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "#### Remove duplicated OU_IDs (shared across PRODUCT_UIDs)\n", - "Logic: \n", - "1. Identify if any `OU_ID` is present in both datasets\n", - "2. For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", - " * if both same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", - " * if else if different values, meaning that one dataset say 1 and the other 0 => keep 1 (facility _did_ report)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "593b013a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:09.488856Z", - "iopub.status.busy": "2025-12-19T10:23:09.484674Z", - "iopub.status.idle": "2025-12-19T10:23:13.563200Z", - "shell.execute_reply": "2025-12-19T10:23:13.559294Z" - }, - "papermill": { - "duration": 4.086946, - "end_time": "2025-12-19T10:23:13.568699", - "exception": false, - "start_time": "2025-12-19T10:23:09.481753", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 1: check for duplicated OU_ID by PERIOD (there should be only 1 value of OU_ID per PERIOD)\n", - "dupl_ou_period <- dhis2_reporting_wide %>%\n", - " group_by(OU_ID, PERIOD) %>%\n", - " filter(n() > 1) %>%\n", - " ungroup() %>%\n", - " select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, ends_with(\"REPORTS\"))\n", - "\n", - "# Log msg\n", - "if (nrow(dupl_ou_period) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The OU_IDs are associated with multiple PRODUCT_UIDs affect {nrow(dupl_ou_period)} PERIOD entries (rows) in the DHIS2 reporting data.\"))\n", - "}\n", - "\n", - "dim(dupl_ou_period)\n", - 
"head(dupl_ou_period, 5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c72bd93a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:13.581200Z", - "iopub.status.busy": "2025-12-19T10:23:13.574942Z", - "iopub.status.idle": "2025-12-19T10:23:18.911910Z", - "shell.execute_reply": "2025-12-19T10:23:18.907746Z" - }, - "papermill": { - "duration": 5.346749, - "end_time": "2025-12-19T10:23:18.915815", - "exception": false, - "start_time": "2025-12-19T10:23:13.569066", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 2: remove duplicated OU_ID by PERIOD\n", - "# Use the following logic:\n", - "# - 1. first, check that values (ACTUAL_REPORTS, EXPECTED_REPORTS) are all 0 or 1 (if not that needs to be handled differently, so skip for now)\n", - "# - 2. then, if multiple PRODUCT_UIDs exist for the same OU_ID and PERIOD, keep the one with the highest ACTUAL_REPORTS value\n", - "# (this is because if values agree, then we can simply keep one, if they don't agree, that means that we have 1 and 0 values, so we keep the 1)\n", - "\n", - "if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0,1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0,1))) {\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " group_by(PERIOD, OU_ID) %>%\n", - " mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", - " ungroup() %>%\n", - " filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", - " select(-ACTUAL_REPORTS_deduplicated)\n", - "\n", - " log_msg(glue::glue(\"✅ Deduplicated DHIS2 reporting data by keeping only one PRODUCT_UID per OU_ID and PERIOD, based on highest ACTUAL_REPORTS value.\n", - " Dataframe dimensions after deduplication: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(\"🚨 Warning: Cannot deduplicate 
OU_ID by PERIOD in DHIS2 reporting data because ACTUAL_REPORTS or EXPECTED_REPORTS contain values other than 0 or 1. \n", - " Analysis will continue without removing duplicated entries.\", level = \"warning\")\n", - "} \n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "2f26c614", - "metadata": { - "papermill": { - "duration": 0.000236, - "end_time": "2025-12-19T10:23:18.916421", - "exception": false, - "start_time": "2025-12-19T10:23:18.916185", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.3. (🇳🇪 NER only) Make HOP aggregated values (0, >1) into presence/absence (0, 1)\n", - "Specific for Niger SNIS instance!
\n", - "Values for dataset HOP (\"ki7YKOfyxjf\" = \"HOP 03 ACTIVITES DE LUTTE CONTRE LE PALUDISME\") count the individual \"sub-units\" (departments, etc ... ) of a given hospital and therefore can have values >1.
\n", - "For consistency with CSI (where all values are raw, and therefore only 0 and 1), we need to convert all HOP value >1 into 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4118991c", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:18.924306Z", - "iopub.status.busy": "2025-12-19T10:23:18.920810Z", - "iopub.status.idle": "2025-12-19T10:23:19.482033Z", - "shell.execute_reply": "2025-12-19T10:23:19.479013Z" - }, - "papermill": { - "duration": 0.56938, - "end_time": "2025-12-19T10:23:19.486133", - "exception": false, - "start_time": "2025-12-19T10:23:18.916753", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Modify dhis2_reporting_wide to replace all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " log_msg(\"🇳🇪 Special handling for NER: replacing all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1.\")\n", - "\n", - " # Check if any values >1 exist\n", - " n_actual_reports_gt1 <- sum(dhis2_reporting_wide$ACTUAL_REPORTS > 1, na.rm = TRUE)\n", - " n_expected_reports_gt1 <- sum(dhis2_reporting_wide$EXPECTED_REPORTS > 1, na.rm = TRUE)\n", - "\n", - " # Extract the PRODUCT_UID and PRODUCT_NAME associated with those values\n", - " if (n_actual_reports_gt1 > 0 | n_expected_reports_gt1 > 0) {\n", - " dupl_actual_reports <- dhis2_reporting_wide %>%\n", - " filter(ACTUAL_REPORTS > 1) %>%\n", - " select(PRODUCT_UID, PRODUCT_NAME) %>%\n", - " distinct()\n", - "\n", - " log_msg(glue::glue(\"Note: Found {n_actual_reports_gt1} entries with ACTUAL_REPORTS > 1 and {n_expected_reports_gt1} entries with EXPECTED_REPORTS > 1.\n", - "Affected PRODUCT_UIDs and PRODUCT_NAMEs for ACTUAL_REPORTS > 1:\n", - "{paste(apply(dupl_actual_reports, 1, function(row) paste0(row['PRODUCT_NAME'], ' (', row['PRODUCT_UID'], ')')), collapse='\\n')}\"))\n", - "\n", - " dhis2_reporting_wide <- 
dhis2_reporting_wide %>%\n", - " mutate(\n", - " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", - " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", - " )\n", - "\n", - " log_msg(\"✅ Replaced all values of ACTUAL_REPORTS and EXPECTED_REPORTS that were >1 with 1.\")\n", - "\n", - "} # else nothing to replace\n", - "\n", - " dim(dhis2_reporting_wide)\n", - " head(dhis2_reporting_wide, 3)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "066319a3", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2025-12-19T10:23:19.486674", - "exception": false, - "start_time": "2025-12-19T10:23:19.486502", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.4. Aggregate at AMD2 level" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e94eeddd", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:19.494212Z", - "iopub.status.busy": "2025-12-19T10:23:19.491141Z", - "iopub.status.idle": "2025-12-19T10:23:19.791631Z", - "shell.execute_reply": "2025-12-19T10:23:19.786378Z" - }, - "papermill": { - "duration": 0.308903, - "end_time": "2025-12-19T10:23:19.795888", - "exception": false, - "start_time": "2025-12-19T10:23:19.486985", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000123, + "end_time": "2025-12-19T10:23:19.997465", + "exception": false, + "start_time": "2025-12-19T10:23:19.997342", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.6. Ensure table consistency\n", + "Left join reporting indicators with DHIS2 routine data.\n", + "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." 
+ ], + "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Sum up values (now at acility level) to get totals per ADM2_ID and PERIOD\n", - "dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>%\n", - " group_by(\n", - " PERIOD, \n", - " YEAR, MONTH, # keep these just for sanity check (not needed for grouping)\n", - " ADM1_NAME, ADM1_ID, # keep these just for sanity check (not needed for grouping)\n", - " ADM2_NAME, ADM2_ID\n", - " ) %>%\n", - " summarise(\n", - " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", - " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", - " .groups = 'drop'\n", - " ) \n", - "\n", - "# Add log messages\n", - "log_msg(glue::glue(\"DHIS2 reporting data pivoted to wide format and aggregated at ADM2 level. \n", - "Dataframe dimensions: {paste(dim(dhis2_reporting_wide_adm2), collapse=', ')}\"))\n", - "head(dhis2_reporting_wide_adm2, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "eb181891", - "metadata": { - "papermill": { - "duration": 0.000151, - "end_time": "2025-12-19T10:23:19.796350", - "exception": false, - "start_time": "2025-12-19T10:23:19.796199", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:20.001909Z", + "iopub.status.busy": "2025-12-19T10:23:19.999878Z", + "iopub.status.idle": "2025-12-19T10:23:20.072344Z", + "shell.execute_reply": "2025-12-19T10:23:20.070004Z" + }, + "papermill": { + "duration": 0.077426, + "end_time": "2025-12-19T10:23:20.075077", + "exception": false, + "start_time": "2025-12-19T10:23:19.997651", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "reporting_rate_dataset <- left_join(dhis2_routine, \n", + " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", + " by=c(\"YEAR\", \"MONTH\", \"ADM2_ID\"))\n", + "\n", + "print(dim(reporting_rate_dataset))\n", + 
"head(reporting_rate_dataset, 3)" + ], + "execution_count": null, + "outputs": [], + "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f" }, - "tags": [] - }, - "source": [ - "### 3.5. Calculate REPORTING_RATE\n", - "**numerator**: `ACTUAL_REPORTS`
\n", - "**denominator**: `EXPECTED_REPORTS`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a1c20", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:19.803233Z", - "iopub.status.busy": "2025-12-19T10:23:19.799996Z", - "iopub.status.idle": "2025-12-19T10:23:19.994060Z", - "shell.execute_reply": "2025-12-19T10:23:19.991575Z" - }, - "papermill": { - "duration": 0.200465, - "end_time": "2025-12-19T10:23:19.997024", - "exception": false, - "start_time": "2025-12-19T10:23:19.796559", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2025-12-19T10:23:20.075561", + "exception": false, + "start_time": "2025-12-19T10:23:20.075388", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.7. Final visual check on REPORTING_RATE values" + ], + "id": "6b19e88d" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Calculate REPORTING_RATE as ACTUAL_REPORTS / EXPECTED_REPORTS\n", - "reporting_rate_results <- dhis2_reporting_wide_adm2 %>%\n", - " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", - "\n", - "log_msg(glue::glue(\"DHIS2 reporting rate calculated as ACTUAL_REPORTS / EXPECTED_REPORTS. 
Dataframe dimensions: {paste(dim(reporting_rate_results), collapse=', ')}\"))\n", - "head(reporting_rate_results, 3) " - ] - }, - { - "cell_type": "markdown", - "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99", - "metadata": { - "papermill": { - "duration": 0.000123, - "end_time": "2025-12-19T10:23:19.997465", - "exception": false, - "start_time": "2025-12-19T10:23:19.997342", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", + "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", + "max_rr <- max(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", + "if (min_rr < 0 | max_rr > 1) { \n", + " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE values are outside the expected range [0,1]. \n", + " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"), level = \"warning\")\n", + "} else {\n", + " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. \n", + " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "fbfec60f" }, - "tags": [] - }, - "source": [ - "### 3.6. Ensure consistency of table (probably can skip because all data comes from the same source!)\n", - "Left join reporting indicators with DHIS2 routine data.\n", - "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:20.001909Z", - "iopub.status.busy": "2025-12-19T10:23:19.999878Z", - "iopub.status.idle": "2025-12-19T10:23:20.072344Z", - "shell.execute_reply": "2025-12-19T10:23:20.070004Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:20.080475Z", + "iopub.status.busy": "2025-12-19T10:23:20.078272Z", + "iopub.status.idle": "2025-12-19T10:23:21.456898Z", + "shell.execute_reply": "2025-12-19T10:23:21.453352Z" + }, + "papermill": { + "duration": 1.384875, + "end_time": "2025-12-19T10:23:21.460674", + "exception": false, + "start_time": "2025-12-19T10:23:20.075799", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Simple plot to visualize distribution of REPORTING_RATE\n", + "ggplot(reporting_rate_dataset, aes(x=REPORTING_RATE)) +\n", + " geom_histogram() +\n", + " labs(\n", + " x=\"Dataset Reporting Rate\", y=\"Frequency\",\n", + " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", + " ) +\n", + " theme_minimal()" + ], + "execution_count": null, + "outputs": [], + "id": "8878192f" }, - "papermill": { - "duration": 0.077426, - "end_time": "2025-12-19T10:23:20.075077", - "exception": false, - "start_time": "2025-12-19T10:23:19.997651", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000104, + "end_time": "2025-12-19T10:23:21.460981", + "exception": false, + "start_time": "2025-12-19T10:23:21.460877", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 4. 📁 Export to `data/` folder\n", + "Export as both .csv and .parquet file formats." 
+ ], + "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60" }, - "tags": [], - "vscode": { - "languageId": "r" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:21.467337Z", + "iopub.status.busy": "2025-12-19T10:23:21.464010Z", + "iopub.status.idle": "2025-12-19T10:23:22.383295Z", + "shell.execute_reply": "2025-12-19T10:23:22.379935Z" + }, + "papermill": { + "duration": 0.926094, + "end_time": "2025-12-19T10:23:22.387190", + "exception": false, + "start_time": "2025-12-19T10:23:21.461096", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", + "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "out_msg <- paste0(\"Reporting rate dataset saved under: \", file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\")))\n", + "\n", + "# write parquet and csv files\n", + "write_parquet(reporting_rate_dataset, file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")))\n", + "write.csv(reporting_rate_dataset, file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\")), row.names = FALSE)\n", + "\n", + "# log\n", + "log_msg(out_msg)\n" + ], + "execution_count": null, + "outputs": [], + "id": "9adc033d-18d6-4786-8f96-21337b3e005f" } - }, - "outputs": [], - "source": [ - "reporting_rate_dataset <- left_join(dhis2_routine, \n", - " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", - " by=c(\"YEAR\", \"MONTH\", \"ADM2_ID\"))\n", - "\n", - "print(dim(reporting_rate_dataset))\n", - "head(reporting_rate_dataset, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "6b19e88d", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2025-12-19T10:23:20.075561", - "exception": false, - "start_time": "2025-12-19T10:23:20.075388", - "status": "completed" + ], + "metadata": { + "kernelspec": { + 
"display_name": "R", + "language": "R", + "name": "ir" }, - "tags": [] - }, - "source": [ - "### 3.7. Final visual check on REPORTING_RATE values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbfec60f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", - "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", - "max_rr <- max(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", - "if (min_rr < 0 | max_rr > 1) { \n", - " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE values are outside the expected range [0,1]. \n", - " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"), level = \"warning\")\n", - "} else {\n", - " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. \n", - " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8878192f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:20.080475Z", - "iopub.status.busy": "2025-12-19T10:23:20.078272Z", - "iopub.status.idle": "2025-12-19T10:23:21.456898Z", - "shell.execute_reply": "2025-12-19T10:23:21.453352Z" + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, "papermill": { - "duration": 1.384875, - "end_time": "2025-12-19T10:23:21.460674", - "exception": false, - "start_time": "2025-12-19T10:23:20.075799", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" + "default_parameters": {}, + "duration": 94.192072, + "end_time": "2025-12-19T10:23:22.614345", + "environment_variables": {}, + "exception": null, + "input_path": 
"/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/papermill_outputs/snt_dhis2_reporting_rate_dataset_OUTPUT_2025-12-19_102148.ipynb", + "parameters": { + "ROUTINE_FILE": "XXX_routine_outliers_imputed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace" + }, + "start_time": "2025-12-19T10:21:48.422273", + "version": "2.6.0" } - }, - "outputs": [], - "source": [ - "# Simple plot to visualize distribution of REPORTING_RATE\n", - "ggplot(reporting_rate_dataset, aes(x=REPORTING_RATE)) +\n", - " geom_histogram() +\n", - " labs(\n", - " x=\"Dataset Reporting Rate\", y=\"Frequency\",\n", - " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", - " ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60", - "metadata": { - "papermill": { - "duration": 0.000104, - "end_time": "2025-12-19T10:23:21.460981", - "exception": false, - "start_time": "2025-12-19T10:23:21.460877", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 4. 📁 Export to `data/` folder\n", - "Export as both .csv and .parquet file formats." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9adc033d-18d6-4786-8f96-21337b3e005f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:21.467337Z", - "iopub.status.busy": "2025-12-19T10:23:21.464010Z", - "iopub.status.idle": "2025-12-19T10:23:22.383295Z", - "shell.execute_reply": "2025-12-19T10:23:22.379935Z" - }, - "papermill": { - "duration": 0.926094, - "end_time": "2025-12-19T10:23:22.387190", - "exception": false, - "start_time": "2025-12-19T10:23:21.461096", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", - "\n", - "# parquet\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")) \n", - "write_parquet(reporting_rate_dataset, file_path)\n", - "log_msg(glue(\"Exported : {file_path}\"))\n", - "\n", - "# csv\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\"))\n", - "write.csv(reporting_rate_dataset, file_path, row.names = FALSE)\n", - "log_msg(glue(\"Exported : {file_path}\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" }, - "papermill": { - "default_parameters": {}, - "duration": 94.192072, - "end_time": "2025-12-19T10:23:22.614345", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/papermill_outputs/snt_dhis2_reporting_rate_dataset_OUTPUT_2025-12-19_102148.ipynb", - "parameters": { - "ROUTINE_FILE": "NER_routine_outliers_imputed.parquet", 
- "SNT_ROOT_PATH": "/home/hexa/workspace" - }, - "start_time": "2025-12-19T10:21:48.422273", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb index 38091fe..50d9708 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "b79cba06", "metadata": { "papermill": { "duration": 0.000249, @@ -15,12 +14,11 @@ }, "source": [ "### 1. Setup" - ] + ], + "id": "b79cba06" }, { "cell_type": "code", - "execution_count": null, - "id": "7ca65bcc", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:27.561213Z", @@ -40,36 +38,27 @@ "languageId": "r" } }, - "outputs": [], "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "# Load palettes\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataset\")\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataset.r\"))\n", "\n", - "# Load libraries \n", - 
"required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", + "report_packages <- c(\"arrow\", \"tidyverse\", \"sf\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "snt_environment <- get_setup_variables(SNT_ROOT_PATH = SNT_ROOT_PATH, packages = report_packages)\n", "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- snt_environment$CONFIG_PATH\n", + "DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\")\n", + "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs\")\n", "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] + "source(file.path(CODE_PATH, \"snt_palettes.r\"))" + ], + "execution_count": null, + "outputs": [], + "id": "7ca65bcc" }, { "cell_type": "markdown", - "id": "c5301aa3", "metadata": { "papermill": { "duration": 0.000116, @@ -82,12 +71,11 @@ }, "source": [ "#### 1.1. 
Load and check `snt config` file" - ] + ], + "id": "c5301aa3" }, { "cell_type": "code", - "execution_count": null, - "id": "76d8a072", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:34.858197Z", @@ -107,23 +95,15 @@ "languageId": "r" } }, - "outputs": [], "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] + "config_json <- load_snt_config(file.path(CONFIG_PATH, \"SNT_config.json\"))" + ], + "execution_count": null, + "outputs": [], + "id": "76d8a072" }, { "cell_type": "code", - "execution_count": null, - "id": "c712ac02", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:35.342494Z", @@ -143,7 +123,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Configuration settings\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", @@ -154,12 +133,13 @@ "DHIS2_FORMATTED_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "\n", "REPORTING_RATE_PRODUCT_UID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID # to add to plots subtitles" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c712ac02" }, { "cell_type": "code", - "execution_count": null, - "id": "e02c652e", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:35.373316Z", @@ -179,16 +159,17 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Make string of product uids for plot subtitles\n", "rr_product_uid <-paste(REPORTING_RATE_PRODUCT_UID,collapse = \", \") \n", "rr_product_uid" - ] + ], + "execution_count": null, + "outputs": [], + "id": "e02c652e" }, { "cell_type": "markdown", - "id": "30b058f4", "metadata": { "papermill": { 
"duration": 0.000094, @@ -202,12 +183,11 @@ "source": [ "#### 1.2. Load and check `snt metadata` file\n", "This is needed for the correct use of palettes and categories (breaks, or scale)" - ] + ], + "id": "30b058f4" }, { "cell_type": "code", - "execution_count": null, - "id": "98a8ee49", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:35.403224Z", @@ -227,7 +207,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Load SNT metadata\n", "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", @@ -238,12 +217,13 @@ " })\n", "\n", "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "98a8ee49" }, { "cell_type": "code", - "execution_count": null, - "id": "00681217", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:36.357945Z", @@ -263,22 +243,22 @@ "languageId": "r" } }, - "outputs": [], "source": [ "scale_raw <- metadata_json$REPORTING_RATE$SCALE\n", - "if (is.character(scale_raw) && length(scale_raw) == 1) {\n", - " break_vals <- jsonlite::fromJSON(scale_raw)\n", + "break_vals <- if (is.character(scale_raw) && length(scale_raw) == 1) {\n", + " jsonlite::fromJSON(scale_raw)\n", "} else {\n", - " break_vals <- unlist(scale_raw, use.names = FALSE)\n", + " as.numeric(unlist(scale_raw, use.names = FALSE))\n", "}\n", - "break_vals <- as.numeric(break_vals)\n", "\n", "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "00681217" }, { "cell_type": "markdown", - "id": "f3470564", "metadata": { "papermill": { "duration": 0.000162, @@ -291,11 +271,11 @@ }, "source": [ "### 2. 
Load Data" - ] + ], + "id": "f3470564" }, { "cell_type": "markdown", - "id": "82397307", "metadata": { "papermill": { "duration": 0.000126, @@ -309,12 +289,11 @@ "source": [ "#### 2.1. Output of pipeline notebook\n", "Import file named `{COUNTRY_CODE}_reporting_rate_dataset.parquet` from **OH Dataset** \"SNT_DHIS2_REPORTING_RATE\" (as in `config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE`)" - ] + ], + "id": "82397307" }, { "cell_type": "code", - "execution_count": null, - "id": "70acb2c5", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:36.543564Z", @@ -334,7 +313,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "\n", "reporting_rate_dataset <- tryCatch({ get_latest_dataset_file_in_memory(REPORTING_RATE_DATASET_NAME, glue::glue(\"{COUNTRY_CODE}_reporting_rate_dataset.parquet\")) }, \n", @@ -348,11 +326,13 @@ "log_msg(glue::glue(\"Data file loaded from dataset : {REPORTING_RATE_DATASET_NAME} dataframe dimensions: {paste(dim(reporting_rate_dataset), collapse=', ')}\"))\n", "dim(reporting_rate_dataset)\n", "head(reporting_rate_dataset, 2)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "70acb2c5" }, { "cell_type": "markdown", - "id": "48833515", "metadata": { "papermill": { "duration": 0.000091, @@ -366,12 +346,11 @@ "source": [ "#### 2.2. Shapes\n", "To make choropleth (map)" - ] + ], + "id": "48833515" }, { "cell_type": "code", - "execution_count": null, - "id": "3febd4f4", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:37.798194Z", @@ -391,7 +370,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", " error = function(e) { \n", @@ -402,11 +380,13 @@ "\n", "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. 
\\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", "names(shapes)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "3febd4f4" }, { "cell_type": "markdown", - "id": "17067d56", "metadata": { "papermill": { "duration": 0.000166, @@ -419,11 +399,11 @@ }, "source": [ "### 3. Plots" - ] + ], + "id": "17067d56" }, { "cell_type": "markdown", - "id": "9a6369ee", "metadata": { "papermill": { "duration": 0.000109, @@ -436,12 +416,11 @@ }, "source": [ "##### 3.0. Add shapes" - ] + ], + "id": "9a6369ee" }, { "cell_type": "code", - "execution_count": null, - "id": "c6641720", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:41.333105Z", @@ -461,17 +440,18 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Join shapes to reporting rate data\n", "\n", "data_to_plot <- reporting_rate_dataset %>%\n", " left_join(shapes, by = c(\"ADM2_ID\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c6641720" }, { "cell_type": "markdown", - "id": "0b0d32f1", "metadata": { "papermill": { "duration": 0.000195, @@ -484,11 +464,11 @@ }, "source": [ "#### 3.1. 🎨 Dynamic categories and color assignement" - ] + ], + "id": "0b0d32f1" }, { "cell_type": "markdown", - "id": "cc765e0c", "metadata": { "papermill": { "duration": 0.000109, @@ -501,12 +481,11 @@ }, "source": [ "##### 1. Define breaks and labels" - ] + ], + "id": "cc765e0c" }, { "cell_type": "code", - "execution_count": null, - "id": "2e79132c", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:41.373558Z", @@ -526,19 +505,19 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Safety code to avoid breaking if nothings is fund in json_metadata\n", "if (is.null(break_vals) || length(break_vals) == 0) {\n", " log_msg(\"[WARNING] No break values found in SNT_metadata.json for REPORTING_RATE$SCALE. 
Using default values.\", \"warning\")\n", " break_vals <- c(0.5, 0.8, 0.9, 0.95, 1.00)\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "2e79132c" }, { "cell_type": "code", - "execution_count": null, - "id": "f04cb888", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:41.401034Z", @@ -558,7 +537,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 1. Define breaks\n", "# Note: assumes that the data starts at 0!\n", @@ -576,11 +554,13 @@ "\n", "# Check\n", "labels" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f04cb888" }, { "cell_type": "markdown", - "id": "cb237801", "metadata": { "papermill": { "duration": 0.000102, @@ -593,12 +573,11 @@ }, "source": [ "##### 2. Create `_CATEGORY` col" - ] + ], + "id": "cb237801" }, { "cell_type": "code", - "execution_count": null, - "id": "f8303488", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:41.439376Z", @@ -618,7 +597,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# reporting_rate_dataset <- reporting_rate_dataset %>%\n", "data_to_plot <- data_to_plot %>%\n", @@ -631,11 +609,13 @@ " include.lowest = TRUE\n", " )\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f8303488" }, { "cell_type": "markdown", - "id": "a10237f8", "metadata": { "papermill": { "duration": 0.000102, @@ -648,12 +628,11 @@ }, "source": [ "##### 3. 
Pick appropriate palette" - ] + ], + "id": "a10237f8" }, { "cell_type": "code", - "execution_count": null, - "id": "2ee6e077", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:41.480216Z", @@ -673,7 +652,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Count nr of breaks\n", "nr_of_colors <- length(labels)\n", @@ -686,11 +664,13 @@ "names(palette_to_use) <- rev(labels)\n", "\n", "palette_to_use\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "2ee6e077" }, { "cell_type": "markdown", - "id": "d08c0c14", "metadata": { "papermill": { "duration": 0.000099, @@ -703,11 +683,11 @@ }, "source": [ "#### 3.2. Plots" - ] + ], + "id": "d08c0c14" }, { "cell_type": "markdown", - "id": "b7781198", "metadata": { "papermill": { "duration": 0.000056, @@ -721,12 +701,11 @@ "source": [ "##### 3.2.1 Scatter plot of RR over time (by ADM2)\n", "With this we can see the actula numbners (although cannot tell which ADM2 have low values)." - ] + ], + "id": "b7781198" }, { "cell_type": "code", - "execution_count": null, - "id": "78d92e4a", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:41.522513Z", @@ -746,7 +725,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Line point plot faceted by YEAR\n", "ggplot(data = data_to_plot) +\n", @@ -791,12 +769,13 @@ " strip.placement = \"outside\",\n", " strip.text = element_text(face = \"bold\", size = 10)\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "78d92e4a" }, { "cell_type": "code", - "execution_count": null, - "id": "1f47064a", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:42.978498Z", @@ -816,7 +795,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Export plot as PNG\n", "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_linepoint_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", @@ -835,11 +813,13 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (linepoint) 
saved to: {file.path(output_location, output_filename)}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1f47064a" }, { "cell_type": "markdown", - "id": "22bb6431", "metadata": { "papermill": { "duration": 0.000147, @@ -853,12 +833,11 @@ "source": [ "##### 3.2.2 Heatmap plot of RR over time (by ADM2)\n", "This is less good for identifying actual values, but allows to see which ADM2 have lower values." - ] + ], + "id": "22bb6431" }, { "cell_type": "code", - "execution_count": null, - "id": "f2445f2a", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:44.094508Z", @@ -878,7 +857,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Tile plot faceted by YEAR\n", "ggplot(data = data_to_plot) +\n", @@ -915,12 +893,13 @@ " strip.text = element_text(face = \"bold\", size = 10)\n", " ) +\n", " guides(fill = guide_legend(nrow = 1))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f2445f2a" }, { "cell_type": "code", - "execution_count": null, - "id": "cbe73312", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:46.311134Z", @@ -940,7 +919,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Export plot as PNG\n", "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_heatmap_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", @@ -956,11 +934,13 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (heatmap) saved to: {file.path(output_location, output_filename)}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "cbe73312" }, { "cell_type": "markdown", - "id": "3eef141a", "metadata": { "papermill": { "duration": 0.000164, @@ -973,12 +953,11 @@ }, "source": [ "##### 3.2.3. 
MAP of Reporting Rate - by month" - ] + ], + "id": "3eef141a" }, { "cell_type": "code", - "execution_count": null, - "id": "83be9c68", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:48.294030Z", @@ -998,7 +977,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Choropleth map with reporting rate data by ADM2\n", "ggplot(data = data_to_plot) +\n", @@ -1027,12 +1005,13 @@ " cols = vars(MONTH),\n", " switch = \"both\") +\n", " guides(fill = guide_legend(nrow = 1))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "83be9c68" }, { "cell_type": "code", - "execution_count": null, - "id": "e877671d", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:53.252696Z", @@ -1052,7 +1031,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -1067,11 +1045,13 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "e877671d" }, { "cell_type": "markdown", - "id": "f0894be9", "metadata": { "papermill": { "duration": 0.000166, @@ -1085,12 +1065,11 @@ "source": [ "##### 3.2.4. 
MAP of Reporting Rate - by YEAR\n", "Use average (`mean()`) of monthly values" - ] + ], + "id": "f0894be9" }, { "cell_type": "code", - "execution_count": null, - "id": "cb1995ab", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:56.755998Z", @@ -1110,7 +1089,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "data_to_plot_year <- data_to_plot %>%\n", " group_by(geometry, ADM2_ID, ADM2_NAME, ADM1_NAME, YEAR) %>%\n", @@ -1128,12 +1106,13 @@ " include.lowest = TRUE\n", " )\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "cb1995ab" }, { "cell_type": "code", - "execution_count": null, - "id": "bd32b0cf", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:56.795010Z", @@ -1153,7 +1132,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Choropleth map with reporting rate data by ADM2\n", "ggplot(data = data_to_plot_year) +\n", @@ -1180,12 +1158,13 @@ " cols = vars(YEAR)\n", " ) +\n", " guides(fill = guide_legend(nrow = 1))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "bd32b0cf" }, { "cell_type": "code", - "execution_count": null, - "id": "0430641e", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:57.594096Z", @@ -1205,7 +1184,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_year_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -1220,11 +1198,13 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "0430641e" }, { "cell_type": "markdown", - "id": "8c3bdca4", "metadata": { "papermill": { "duration": 0.000126, @@ -1237,12 +1217,11 @@ }, "source": [ "#### The End :)" - ] + ], + "id": "8c3bdca4" }, { "cell_type": "code", - "execution_count": null, - 
"id": "f8a62ec5", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:58.523680Z", @@ -1262,10 +1241,12 @@ "languageId": "r" } }, - "outputs": [], "source": [ "log_msg(\"Reporting Rate (Dataset) report notebook completed successfully!\")" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f8a62ec5" } ], "metadata": { @@ -1297,4 +1278,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r new file mode 100644 index 0000000..175e817 --- /dev/null +++ b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r @@ -0,0 +1,112 @@ +# Load base utils +source(file.path("~/workspace/code", "snt_utils.r")) + + +# JSON reader for this pipeline. +read_workspace_json_file <- function(json_path, resource_label = "JSON file") { + json_path <- as.character(json_path)[[1L]] + tryCatch( + jsonlite::fromJSON(json_path), + error = function(e) { + stop(paste0( + "[ERROR] Error while loading ", + resource_label, + " from `", + json_path, + "`: ", + conditionMessage(e) + )) + } + ) +} + + +#' Get Setup Variables for SNT Workspace +#' Initializes workspace paths, loads R packages, and imports OpenHEXA SDK. +#' +#' @param SNT_ROOT_PATH Character. Root path of the SNT workspace. Default: '~/workspace' +#' @param packages Character vector. R packages to install and load. +#' @return List with `paths_to_check`, `CONFIG_PATH`, `UPLOADS_PATH`, and `DATA_PATH`. 
+#' +#' @export +get_setup_variables <- function( + SNT_ROOT_PATH = "~/workspace", + packages = c( + "arrow", "dplyr", "tidyr", "ggplot2", + "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate" + ) +) { + paths_to_check <- list( + CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"), + UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"), + DATA_PATH = file.path(SNT_ROOT_PATH, "data") + ) + setup_variable <- c( + list(paths_to_check = paths_to_check), + paths_to_check + ) + + install_and_load(packages) + + if (Sys.getenv("PROJ_LIB", "") == "") { + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + } + if (Sys.getenv("GDAL_DATA", "") == "") { + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + } + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + reticulate::py_config()$python + assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv) + + return(setup_variable) +} + + +#' Load SNT Configuration File +#' @param snt_config_path Character. Full path to `SNT_config.json`. +#' @export +load_snt_config <- function(snt_config_path) { + config_json <- read_workspace_json_file(snt_config_path, "configuration") + log_msg(paste0("SNT configuration loaded from: ", snt_config_path)) + return(config_json) +} + + +#' Fail if Papermill did not inject `ROUTINE_FILE` and `DATASET_ID`. +#' @export +stop_if_dataset_reporting_papermill_params_missing <- function() { + required <- c("ROUTINE_FILE", "DATASET_ID") + missing <- required[!vapply(required, exists, logical(1), inherits = TRUE)] + if (length(missing) > 0) { + stop( + "[ERROR] Missing pipeline parameters (Papermill): ", + paste(missing, collapse = ", ") + ) + } +} + + +#' Load Dataset File from OpenHEXA +#' +#' @param dataset_id Character. OpenHEXA dataset identifier. +#' @param filename Character. Name of file to load. +#' @param verbose Logical. If TRUE, log dataframe dimensions after a successful load. 
+#' @export +load_dataset_file <- function(dataset_id, filename, verbose = TRUE) { + data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_id, filename) + }, + error = function(e) { + stop(glue::glue("[ERROR] Error while loading {filename} file from dataset: {dataset_id}")) + } + ) + if (verbose) { + log_msg(glue::glue( + "{filename} data loaded from dataset : {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]" + )) + } + return(data) +} + + diff --git a/snt_dhis2_reporting_rate_dataelement/pipeline.py b/snt_dhis2_reporting_rate_dataelement/pipeline.py index 88a2d2e..b81909f 100644 --- a/snt_dhis2_reporting_rate_dataelement/pipeline.py +++ b/snt_dhis2_reporting_rate_dataelement/pipeline.py @@ -31,7 +31,8 @@ "activity_indicators", name="Facility Activity indicators", help="Define which data elements will be used to determine the activity of a facility." - " A facility is considered 'active' if at least one of these indicators has a non-missing value greater than zero.", + " A facility is considered 'active' if at least one of these indicators has a non-missing value" + " greater than zero.", multiple=True, choices=["CONF", "SUSP", "TEST", "PRES"], type=str, @@ -119,6 +120,11 @@ def snt_dhis2_reporting_rate_dataelement( routine_file = resolve_routine_filename( country_code=country_code, routine_data_choice=routine_data_choice ) + if routine_data_choice == "raw": + ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] + else: + ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] + nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, @@ -126,6 +132,7 @@ def snt_dhis2_reporting_rate_dataelement( "ACTIVITY_INDICATORS": activity_indicators, "VOLUME_ACTIVITY_INDICATORS": volume_activity_indicators, "USE_WEIGHTED_REPORTING_RATES": use_weighted_reporting_rates, + "DATASET_ID": ds_outliers_id, } parameters_file = save_pipeline_parameters( 
pipeline_name="snt_dhis2_reporting_rate_dataelement", @@ -136,11 +143,6 @@ def snt_dhis2_reporting_rate_dataelement( current_run.log_info(f"Saved pipeline parameters to {parameters_file}") if not run_report_only: - if routine_data_choice == "raw": - ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] - else: - ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] - # Check the file exists in the dataset if not dataset_file_exists(ds_id=ds_outliers_id, filename=routine_file): current_run.log_warning( @@ -171,7 +173,6 @@ def snt_dhis2_reporting_rate_dataelement( else: current_run.log_info("Skipping calculations, running only the reporting.") - # Compatible with snt_lib (snt_utils): do not pass nb_parameters run_report_notebook( nb_file=pipeline_path / "reporting" / "snt_dhis2_reporting_rate_dataelement_report.ipynb", nb_output_path=pipeline_path / "reporting" / "outputs", @@ -187,7 +188,14 @@ def snt_dhis2_reporting_rate_dataelement( def resolve_routine_filename(country_code: str, routine_data_choice: str) -> str: - """Returns the canonical routine filename for a routine data choice.""" + """Return the canonical routine Parquet filename for a routine data choice. + + Returns: + Filename string (e.g. ``{country_code}_routine_outliers_imputed.parquet``). + + Raises: + ValueError: If ``routine_data_choice`` is not one of the supported values. + """ if routine_data_choice == "raw": return f"{country_code}_routine.parquet" diff --git a/snt_dhis2_reporting_rate_dataset/pipeline.py b/snt_dhis2_reporting_rate_dataset/pipeline.py index b52c32c..ff440f2 100644 --- a/snt_dhis2_reporting_rate_dataset/pipeline.py +++ b/snt_dhis2_reporting_rate_dataset/pipeline.py @@ -90,6 +90,7 @@ def snt_dhis2_reporting_rate_dataset( nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, + "DATASET_ID": ds_outliers_id, } params_file = save_pipeline_parameters(