From 8eb7f53f4a0ebb04d8654596560a07b2ce004d8c Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 3 Apr 2026 12:05:24 +0200 Subject: [PATCH 01/18] reporting rate update --- .../code/snt_dhis2_reporting_rate.ipynb | 4783 +++++++++-------- .../snt_dhis2_reporting_rate_report.ipynb | 2105 ++++---- .../utils/snt_dhis2_reporting_rate.r | 79 + 3 files changed, 3622 insertions(+), 3345 deletions(-) create mode 100644 pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r diff --git a/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb b/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb index 81eded2..4b8cf71 100644 --- a/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb +++ b/pipelines/snt_dhis2_reporting_rate/code/snt_dhis2_reporting_rate.ipynb @@ -1,2355 +1,2438 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "f5827740-2917-4504-9017-9ec7d408e5f4", - "metadata": {}, - "source": [ - "Script structure:\n", - "\n", - " 0. Parameters: set back-up values for parameters, for when the notebook is run manually (_noy_ via pipeline)\n", - " 1. Setup:\n", - " * Paths\n", - " * Utils functions\n", - " 2. Load Data\n", - " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", - " * **Reporting** (DHIS2) pre-computed, already formatted & aggregated (output of pipeline ???)\n", - " * **Shapes** (DHIS2) for plotting (this could be removed if we move the plots to \"report/EDA\" nb)\n", - " 3. Calculate **Reportng Rate (RR)**\n", - " * \"**Dataset**\": using pre-computed reportings from DHIS2/SNIS (was: \"DHIS2\")\n", - " * \"**Data Element**\": using calculated expected nr of report (nr of active facilities) (was: \"CONF\")\n", - " 4. 
**Export** reporting rate data to `.../data/dhis2/reporting_rate/` as .parquet (and .csv) files for **either**:\n", - " * data**set**: \"XXX_reporting_rate_**dataset**.parquet\" **or**\n", - " * data**element**: \"XXX_reporting_rate_**dataelement**.parquet\"" - ] - }, - { - "cell_type": "markdown", - "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", - "metadata": {}, - "source": [ - "--------------------" - ] - }, - { - "cell_type": "markdown", - "id": "e962c5a4-6b09-4485-8d71-d842159118d3", - "metadata": {}, - "source": [ - "### To Do:\n", - "* For `DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\"`: **add code** to count OPEN facilities () for **countries with \"normal\" pyramids** (i.e., when no mixing of facilities and admin levels ... !). Atm only code for Niger, which runs only if `COUNTRY_CODE == NER`. Should add similar (but simpler) code for the rest of the countries (i.e, `COUNTRY_CODE != NER`)\n", - "* Check why Data Element **Denominator** `routine_active_facilities` is **calculated at `YEAR` (aggregated) instead of `MONTH`** ... possibly fix this to match granularity of other alternatives for denominator (which are calculated at MONTH level)\n", - "* Modify **report notebook** and/or pipeline.py code so that it does not make the **pipeline FAIL** if `reporting_rate_dataset` or `reporting_rate_dataelement` is **not found** (which is now always the case since we only output 1 file at each run!!)" - ] - }, - { - "cell_type": "markdown", - "id": "0cdfdc73-bb9a-48a8-a26b-84ecbab2e0aa", - "metadata": {}, - "source": [ - "----------------" - ] - }, - { - "cell_type": "markdown", - "id": "339f6d58-0965-40ef-b718-96195d2463f8", - "metadata": {}, - "source": [ - "## Parameters" - ] - }, - { - "cell_type": "markdown", - "id": "dd6cd6f8-b91b-4902-8801-a60e11776f98", - "metadata": {}, - "source": [ - "Set Default values **if _not_ provided by pipeline**
\n", - "This makes the execution flexible and \"safe\": nb can be run manually from here or be executed via pipeline, without having to change anything in the code!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93aac683-8828-4a42-b841-f16c7e8fbb07", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set BACKUP VALUE: root path - NEVER CHANGE THIS!\n", - "if (!exists(\"SNT_ROOT_PATH\")) {\n", - " SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "}\n", - "\n", - "\n", - "# Choose to run either DataSet OR DataElement method\n", - "if (!exists(\"REPORTING_RATE_METHOD\")) {\n", - " # REPORTING_RATE_METHOD <- \"DATASET\" \n", - " REPORTING_RATE_METHOD <- \"DATAELEMENT\"\n", - "}\n", - "\n", - "\n", - "# Data Elemenet method: Choice of which INDICATORS to use to count the nr of reporting facilities \n", - "# CONF\n", - "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_CONF\")) {\n", - " DATAELEMENT_METHOD_NUMERATOR_CONF <- TRUE # FALSE\n", - "}\n", - "\n", - "# SUSP\n", - "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_SUSP\")) {\n", - " DATAELEMENT_METHOD_NUMERATOR_SUSP <- TRUE # FALSE\n", - "}\n", - "\n", - "# TEST\n", - "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_TEST\")) {\n", - " DATAELEMENT_METHOD_NUMERATOR_TEST <- TRUE # FALSE\n", - "}\n", - "\n", - "\n", - "\n", - "# Data Elemenet RR. Choice: which df to use for nr of `EXPECTED_REPORTS` (DENOMINATOR) \n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {\n", - " # DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\" \n", - " DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\" \n", - " # DATAELEMENT_METHOD_DENOMINATOR <- \"DHIS2_EXPECTED_REPORTS\" # ⚠️ only if `REPORTING_RATE_METHOD == \"DATASET\"` && DataSet is available!! ⚠️\n", - "} \n" - ] - }, - { - "cell_type": "markdown", - "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", - "metadata": {}, - "source": [ - "## 1. 
Setup" - ] - }, - { - "cell_type": "markdown", - "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", - "metadata": {}, - "source": [ - "### 1.1. Paths" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# PROJECT PATHS\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') " - ] - }, - { - "cell_type": "markdown", - "id": "22971de0-1431-4cbd-b8c1-3bd3e1609e0d", - "metadata": {}, - "source": [ - "### 1.2. Utils functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1784fd43-03f3-478b-8148-4b478317ea21", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "source(file.path(CODE_PATH, \"snt_utils.r\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3bbcbd39-54e8-4ece-9244-30d7d30291d2", - "metadata": {}, - "source": [ - "### 1.3. Packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "426ecff6-0b4c-474d-a48d-826002205b89", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# List required pcks ----------------> check what are the really required libraries\n", - "required_packages <- c(\"arrow\", # for .parquet\n", - " \"tidyverse\",\n", - " \"stringi\", \n", - " \"jsonlite\", \n", - " \"httr\", \n", - " \"reticulate\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "markdown", - "id": "18a8e0c1-ac09-4435-b6f4-5f91fd916396", - "metadata": {}, - "source": [ - "### 1.3.1. 
OpenHEXA-specific settings" - ] - }, - { - "cell_type": "markdown", - "id": "ebb8c7d5-7c2c-4dbe-a1ba-238419fbedf3", - "metadata": {}, - "source": [ - "#### For 📦{sf}, tell OH where to find stuff ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91a66fb7-dd5e-43fd-a6a2-d8bb9f0315d6", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "markdown", - "id": "ac9ee427-020e-47c5-b2c9-5ca24e1f2779", - "metadata": {}, - "source": [ - "#### Set environment to load openhexa.sdk from the right path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa331278-573d-4a22-ab16-da6972d7b0be", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set environment to load openhexa.sdk from the right path\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", - "metadata": {}, - "source": [ - "### 1.4. 
Load and check `config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "\n", - "config_file_name <- \"SNT_config.json\" \n", - "config_json <- tryCatch({\n", - " jsonlite::fromJSON(file.path(CONFIG_PATH, config_file_name)) \n", - " },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, config_file_name))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "markdown", - "id": "29182f25-b0cf-46aa-9818-49616cd3f353", - "metadata": {}, - "source": [ - "**Save config fields as variables**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c52654c8-8a19-4e0c-a83b-1bc2eecae6bc", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Generic\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", - "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", - "\n", - "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"Ousmane\"\n", - "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "\n", - "# Which reporting rate PRODUCT_UID to use (not that this is a dataset in COD, but 2 dataElements in BFA!)\n", - "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "412572bc-fb96-4f61-ac49-be7f449219b6", - "metadata": 
{}, - "outputs": [], - "source": [ - "# DHIS2_INDICATORS\n", - "log_msg(paste(\"Expecting the following DHIS2 (aggregated) indicators : \", paste(DHIS2_INDICATORS, collapse=\", \")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a0a8562-4a70-455c-9ccf-aa39f4cf4e31", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Fixed cols for routine data formatting \n", - "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') # (OU_NAME has homonimous values!)\n", - "# print(paste(\"Fixed routine data (`dhis2_routine`) columns (always expected): \", paste(fixed_cols, collapse=\", \")))\n", - "log_msg(paste(\"Expecting the following columns from routine data (`dhis2_routine`) : \", paste(fixed_cols, collapse=\", \")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86e82d54-2b00-4c25-9b34-3497d4c88c52", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Fixed cols for exporting RR tables: to export output tables with consistent structure\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') " - ] - }, - { - "cell_type": "markdown", - "id": "dadc7351-e67e-450b-a046-bc64660a7dde", - "metadata": {}, - "source": [ - "### 1.5. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Alternatively, `CONF` could be made mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cf6e2a4-0822-4a0c-852e-143da5473d20", - "metadata": {}, - "outputs": [], - "source": [ - "nr_of_indicators_selected <- sum(DATAELEMENT_METHOD_NUMERATOR_CONF, DATAELEMENT_METHOD_NUMERATOR_SUSP, DATAELEMENT_METHOD_NUMERATOR_TEST)\n", - "\n", - "if (nr_of_indicators_selected == 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method 'Data Element'! Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", - "metadata": {}, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", - "metadata": {}, - "source": [ - "### 2.1. **Routine** data (DHIS2) \n", - "already formatted & aggregated (output of pipeline XXX)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "586e8da8-4e1c-431a-9b8d-1169167e1c09", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# DHIS2 Dataset extract identifier\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "msg <- paste0(\"DHIS2 routine data loaded from dataset : \", dataset_name, \" dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", - "log_msg(msg)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "a2454183-44f7-4e2e-a0cf-ca112aa183bb", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Ensure correct data type for numerical columns \n", - "dhis2_routine <- dhis2_routine %>%\n", - " mutate(across(c(PERIOD, YEAR, MONTH), as.numeric))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edb2fcdc-ce0a-4c78-b06a-9f4610ab4714", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(dhis2_routine, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "821e1ebf-b2fa-4469-974e-2e4d27d58854", - "metadata": {}, - "source": [ - "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", - "Only when: `DATAELEMENT_METHOD_NUMERATOR == \"CONF|SUSP|TEST\"`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3f8b89e-a04e-4e0b-9892-95ce2150e7da", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "head(dhis2_routine, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "adec5412", - "metadata": {}, - "source": [ - "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", - "Based on which indicator(s) are selected (if any)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0bbcdf8c-873a-4b41-980a-f18d1863ab8f", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize empty vector\n", - "indicators_selected = c()\n", - "\n", - "# Add elements based on user selection(s)\n", - "if (DATAELEMENT_METHOD_NUMERATOR_CONF) {\n", - " indicators_selected = append(indicators_selected, \"CONF\")\n", - "}\n", - "\n", - "if (DATAELEMENT_METHOD_NUMERATOR_SUSP) {\n", - " indicators_selected = append(indicators_selected, \"SUSP\")\n", - "}\n", - "\n", - "if (DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", - " indicators_selected = append(indicators_selected, \"TEST\")\n", - "}\n", - "\n", - 
"print(paste0(\"Selected indicators: \", paste(indicators_selected, collapse = \", \")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b84753f8-aa9c-4563-beae-5e29b3f1e773", - "metadata": {}, - "outputs": [], - "source": [ - "# This is kinda useless now but KEEP in case we ADD MORE CHOICES OF INDICATORS!! \n", - "if(REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " if (DATAELEMENT_METHOD_NUMERATOR_CONF | DATAELEMENT_METHOD_NUMERATOR_SUSP | DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", - " log_msg(paste0(\"Indicator(s) \", paste(indicators_selected, collapse = \", \") , \" selected for calculation of numerator for method `Data Element`.\" ))\n", - " \n", - " if ( length(which(indicators_selected %in% names(dhis2_routine))) < length(indicators_selected) ) {\n", - " log_msg(paste0(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: \", paste(expected_col, collapse = \", \"), \".\"), \"warning\")\n", - " } \n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "c832da26-fe0c-43fe-8300-2fff5c4cbf34", - "metadata": {}, - "source": [ - "### 2.2. 
**Reporting** pre-computed from DHIS2 \n", - "Data granularity:\n", - "* **ADM2**\n", - "* **MONTH** (PERIOD)\n", - "\n", - "Note: data comes from different dataset (`DS_NAME`): `A SERVICES DE BASE`, `B SERVICES SECONDAIRES`,`D SERVICE HOPITAL` \n", - "\n", - "The col `DS_METRIC` indicates whether the `VALUE` is `EXPECTED_REPORTS` or `ACTUAL_REPORTS`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ce295b9-9898-4e12-8a91-92bb25b9e0a2", - "metadata": {}, - "outputs": [], - "source": [ - "# REPORTING_RATE_METHOD <- \"DATAELEMENT\" # \"DATASET\"\n", - "REPORTING_RATE_METHOD" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a32fc96-5b8e-4108-a224-c0d843df9b47", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " # DHIS2 Dataset extract identifier\n", - " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - " file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", - " \n", - " # Load file from dataset\n", - " dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pre-computed REPORTING data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - " \n", - " msg <- paste0(\"DHIS2 pre-computed REPORTING data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). 
Dataframe dimensions: \", \n", - " paste(dim(dhis2_reporting), collapse=\", \"))\n", - " log_msg(msg)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e131d9ee-0e88-4bb6-982b-53b1229fba5f", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " # Convert VALUE col to - should not be needed but keep as safety measure \n", - " dhis2_reporting <- dhis2_reporting |>\n", - " mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", - "\n", - " head(dhis2_reporting, 3)\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46e3dba8-d46b-457e-ba90-c663e30c42d2", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# # Convert VALUE col to - should not be needed but keep as safety measure \n", - "# dhis2_reporting <- dhis2_reporting |>\n", - "# mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5149befe-b6ad-46a9-9879-7637ce5b02be", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# head(dhis2_reporting, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "7a967af3-f6e5-428a-8769-72808f21a125", - "metadata": {}, - "source": [ - "#### 2.2.1. 
**Filter** to keep only values for `PRODUCT_UID` defined in config.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1948c2f7-7a2c-47a2-9dc6-ba29da6d030c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "REPORTING_RATE_PRODUCT_ID" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4258098-e24c-4520-914d-0f73354bb3ab", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - "\n", - " # Handle problems with incorrect configuration - to be improved 🚧\n", - " if (is.null(REPORTING_RATE_PRODUCT_ID)) {\n", - " log_msg(\"🛑 Problem with definition of REPORTING_RATE_PRODUCT_ID, check `SNT_config.json` file!\")\n", - " } else \n", - " product_name <- dhis2_reporting |> filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |> pull(PRODUCT_NAME) |> unique()\n", - " log_msg(glue::glue(\"Using REPORTING_RATE_PRODUCT_ID == `{REPORTING_RATE_PRODUCT_ID}`, corresponding to DHIS2 Product name : `{product_name}`.\"))\n", - "\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c22c6ada-7cb1-4fca-b65e-b51e5eca35a2", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - "\n", - " dhis2_reporting_filtered <- dhis2_reporting |>\n", - " filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |>\n", - " select(-PRODUCT_UID, -PRODUCT_NAME) # useless cols now\n", - " \n", - " print(dim(dhis2_reporting_filtered))\n", - " head(dhis2_reporting_filtered)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9da035cf-5d3f-4df0-a063-2d2497616c82", - "metadata": {}, - "source": [ - "#### 2.2.2. 
Format to produce `dhis2_reporting_expected`\n", - "🚨 Note: Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denominator for REPORTING_RATE calculations (methods dataset and dataelement)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e970f9d-258e-4050-ae69-185b88c79fc3", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " \n", - " dhis2_reporting_wide <- dhis2_reporting_filtered |> \n", - " pivot_wider(\n", - " names_from = PRODUCT_METRIC, \n", - " values_from = VALUE\n", - " )\n", - " \n", - " print(dim(dhis2_reporting_wide))\n", - " head(dhis2_reporting_wide)\n", - " \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eab31756-ae6b-4152-8ec3-8195236d8732", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denomitor for RR calculations (methods ANY and CONF)\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " \n", - " dhis2_reporting_expected <- dhis2_reporting_wide |> \n", - " select(-ACTUAL_REPORTS)\n", - " \n", - " print(dim(dhis2_reporting_expected))\n", - " head(dhis2_reporting_expected)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3c3d0f35-889d-4c16-9741-d6e75e2ef096", - "metadata": {}, - "source": [ - "#### 2.2.3. **Checks** on data completeness: _do **periods match** with routine data?_\n", - "Lack of perfect overlap in periods between routine data and reporting rate data might create headhaches downstream!
\n", - "Specifically, **incidence** calculations will show **N2 smaller than N1** due to **aggregation by YEAR when NA** values are present!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ea57600-418b-45bc-805a-f829e237b4c4", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " \n", - " # --- Check Year Compatibility ---\n", - " routine_years <- sort(unique(as.integer(dhis2_routine$YEAR))) # as.integer\n", - " expected_years <- sort(unique(as.integer(dhis2_reporting_expected$YEAR))) # as.integer\n", - " \n", - " if (!setequal(routine_years, expected_years)) {\n", - " missing_in_routine <- setdiff(expected_years, routine_years)\n", - " missing_in_expected <- setdiff(routine_years, expected_years)\n", - " \n", - " if (length(missing_in_routine) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_reporting_expected' but not in 'dhis2_routine': \",\n", - " paste(missing_in_routine, collapse = \", \")))\n", - " }\n", - " if (length(missing_in_expected) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_routine' but not in 'dhis2_reporting_expected': \",\n", - " paste(missing_in_expected, collapse = \", \")))\n", - " }\n", - " } else {\n", - " log_msg(\"✅ YEAR values are consistent across 'dhis2_routine' and 'dhis2_reporting_expected'.\")\n", - " \n", - " # --- Check Month Compatibility (if years are consistent) ---\n", - " all_years <- unique(routine_years) # Or expected_years, they are the same now\n", - " \n", - " for (year_val in all_years) {\n", - " routine_months_for_year <- dhis2_routine %>%\n", - " filter(YEAR == year_val) %>%\n", - " pull(MONTH) %>%\n", - " unique() %>%\n", - " sort()\n", - " \n", - " expected_months_for_year <- dhis2_reporting_expected %>%\n", - " filter(YEAR == year_val) %>%\n", - " pull(MONTH) %>%\n", - " unique() %>%\n", - " sort()\n", - " \n", - " if 
(!setequal(routine_months_for_year, expected_months_for_year)) {\n", - " missing_in_routine_months <- setdiff(expected_months_for_year, routine_months_for_year)\n", - " missing_in_expected_months <- setdiff(routine_months_for_year, expected_months_for_year)\n", - " \n", - " if (length(missing_in_routine_months) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_routine_months, collapse = \", \"),\n", - " \"' present in 'dhis2_reporting_expected' but not in 'dhis2_routine'!\"\n", - " ))\n", - " }\n", - " if (length(missing_in_expected_months) > 0) {\n", - " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_expected_months, collapse = \", \"), \n", - " \"' present in 'dhis2_routine' but not in 'dhis2_reporting_expected'!\"\n", - " ))\n", - " }\n", - " } else {\n", - " log_msg(paste0(\"✅ For year \", year_val, \", months are consistent across both data frames.\"))\n", - " }\n", - " }\n", - " }\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "5e711191-995b-4f89-b10c-fc2214cdd8b2", - "metadata": {}, - "source": [ - "### 2.3. **Pyramid** to count OPEN facilities (denominator)\n", - "Table (and column) needed for denominator of \"Data Element\" reporting rate if choice == `PYRAMID_OPEN_FACILITIES`\n", - "\n", - "**Important**: the pyramid must contain the `OPENING_DATE` and `CLOSING_DATE` columns (this was implemented in the new extraction pipeline from 2025-09).
\n", - "Then, **depending on the Country** (well, theire pyramid structure) **import** either:\n", - "* **Raw** pyramid for 🇳🇪 Niger: because first need to \"manually\" correctly aggregate the VALUEs for the HF (separate them from admin levels and sum up HD units)\n", - "* **Formatted** pyramid for all other countries encountered so far: 🇨🇩 DRC, 🇧🇫 Burkina Faso ... bevcause their pyramid is already usable right away" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ad3f83b6-2fdd-45da-8a4d-fb06513b6be2", - "metadata": {}, - "outputs": [], - "source": [ - "# DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\"\n", - "DATAELEMENT_METHOD_DENOMINATOR" - ] - }, - { - "cell_type": "markdown", - "id": "e7b80b6e-9e34-4e71-93e8-7e16a110e17c", - "metadata": {}, - "source": [ - "#### **Raw** pyramid for 🇳🇪 **Niger**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "652cf1a7-c9a2-48db-b44d-8fabfd0e072f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - " # DHIS2 Dataset extract identifier\n", - " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", - " \n", - " # Load file from dataset\n", - " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", - " dhis2_pyramid_raw <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pyramid RAW data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - " \n", - " msg <- paste0(\"DHIS2 RAW pyramid data loaded from dataset : `\", dataset_name, \"`. 
Dataframe dimensions: \", paste(dim(dhis2_pyramid_raw), collapse=\", \"))\n", - " log_msg(msg)\n", - " \n", - " head(dhis2_pyramid_raw)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "f1716ac3-ce8f-4223-9729-6ed826e743bc", - "metadata": {}, - "source": [ - "#### **Formatted** pyramid for all other countries (normal pyramid) 🇨🇩 🇧🇫" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc16ae54-4915-4333-b458-2b611e2b1792", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", - " \n", - " # DHIS2 Dataset extract identifier\n", - " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - " \n", - " # Load file from dataset\n", - " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", - " dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - " \n", - " msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset : `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", - " log_msg(msg)\n", - " \n", - " head(dhis2_pyramid_formatted)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "eb4c5c63-d140-46b8-b686-886e612a31dc", - "metadata": {}, - "source": [ - "## 3. Calculate **Reporting Rate** (RR)\n", - "We compute it using 2 approaches, user can decided later on which one to use for incidence adjustment." - ] - }, - { - "cell_type": "markdown", - "id": "cb724aa8-5f06-4e99-aeca-640d0c1b049e", - "metadata": {}, - "source": [ - "## 3.1. 
\"**Dataset**\" reporting rate: pre-computed, from **DHIS2**\n", - "Exrtacted from DHIS2 and formatted. \n", - "\n", - "Straightforward: `ACTUAL_REPORTS` / `EXPECTED_REPORTS` (just pivot `DS_METRIC` and divide)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10b2f52b-0217-43f1-88a3-cd01d98869b1", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - "\n", - " reporting_rate_dataset <- dhis2_reporting_wide |> \n", - " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", - " \n", - " print(dim(reporting_rate_dataset))\n", - " head(reporting_rate_dataset, 3)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3d49eda8-b4fd-437a-8938-17bf0806f281", - "metadata": {}, - "source": [ - "#### Quick data quality check 🔍" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cff33416-ea66-4eeb-9d33-1597c2f05b0c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- Define function ---------------------------\n", - "inspect_reporting_rate <- function(data_tibble) {\n", - "\n", - " # Dynamically get the name of the tibble passed to the function\n", - " # Extract the litteral name of the variable passed (e.g., \"reporting_rate_dhis2_month\")\n", - " tibble_name_full <- deparse(substitute(data_tibble))\n", - "\n", - " # Extract the 'method' part from the tibble name\n", - " method <- stringr::str_extract(tibble_name_full, \"(?<=reporting_rate_).*\") # \"(?<=reporting_rate_).*?(?=_month)\"\n", - "\n", - " # Calculations for proportion of values > 1\n", - " values_greater_than_1 <- sum(data_tibble$REPORTING_RATE > 1, na.rm = TRUE)\n", - " total_values <- length(data_tibble$REPORTING_RATE)\n", - "\n", - " if (total_values > 0) {\n", - " proportion <- values_greater_than_1 / total_values * 100\n", - " min_rate <- 
min(data_tibble$REPORTING_RATE, na.rm = TRUE)\n", - " max_rate <- max(data_tibble$REPORTING_RATE, na.rm = TRUE)\n", - " } else {\n", - " proportion <- 0\n", - " min_rate <- NA # Set to NA if no values to calculate min/max\n", - " max_rate <- NA # Set to NA if no values to calculate min/max\n", - " }\n", - "\n", - " if (proportion == 0) {\n", - " clarification = NULL\n", - " } else {\n", - " clarification = \" (there are more reports than expected)\"\n", - " }\n", - "\n", - " # Print the formatted result\n", - " log_msg(\n", - " paste0(\n", - " \"🔍 For reporting rate method : `\", method, \"`, the values of REPORTING_RATE range from \", round(min_rate, 2),\n", - " \" to \", round(max_rate, 2),\n", - " \", and \", round(proportion, 2), \" % of values are >1\", clarification, \".\"\n", - " )\n", - " )\n", - "\n", - " # Histogram\n", - " hist(data_tibble$REPORTING_RATE, \n", - " breaks = 50)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2f4c11c-c683-4204-ab91-9d41cab4826c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " inspect_reporting_rate(reporting_rate_dataset)\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "04870e93-5385-425b-89fd-b815a87cfa21", - "metadata": {}, - "source": [ - "#### Subset cols" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d90671b0-36f8-4c6e-8736-4ea807079f83", - "metadata": { - "scrolled": true, - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " reporting_rate_dataset <- reporting_rate_dataset |> \n", - " select(all_of(fixed_cols_rr))\n", - " \n", - " dim(reporting_rate_dataset)\n", - " head(reporting_rate_dataset, 3)\n", - "}" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "62e6cb16-0196-447f-b142-aaec2120eecb", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "2dc27c07-80cd-465e-891f-9fb70111dbb0", - "metadata": {}, - "source": [ - "#### Plot by MONTH (heatmap)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9ce56fc-f86a-4a2b-95b7-fb6ec5b89087", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", - " \n", - " # Plot reporting rate heatmap\n", - " options(repr.plot.width = 20, repr.plot.height = 10) \n", - " \n", - " # reporting_rate_conf_month %>%\n", - " reporting_rate_dataset %>%\n", - " mutate(\n", - " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", - " ) %>%\n", - " ggplot(., aes(x = DATE, \n", - " y = factor(ADM2_ID), \n", - " fill = REPORTING_RATE * 100)\n", - " ) + \n", - " geom_tile() +\n", - " scale_fill_viridis_c(\n", - " option = \"C\",\n", - " direction = 1, # blue = low, yellow = high\n", - " limits = c(0, 100),\n", - " name = \"Reporting rate (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Monthly Reporting Rate by Health District - Method 'DataSet'\",\n", - " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", - " x = \"Month\",\n", - " y = \"Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 12),\n", - " legend.position = \"right\",\n", - " panel.grid = element_blank()\n", - " )\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00bf15b7-baa7-4734-8133-8d4a9cc843a3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "40b21c65-1b75-42f7-821a-24d31e436c73", - 
"metadata": {}, - "source": [ - "----------------------------" - ] - }, - { - "cell_type": "markdown", - "id": "17ffece4-9420-4004-993b-b5692cc1d2de", - "metadata": {}, - "source": [ - "## 3.2. **Data Element** reporting rate: based on reporting of one or more indicators\n", - "**_Partially_ following methods by WHO and as per Diallo (2025) paper**\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single indicator (i.e., **confirmed** malaria case as `CONF`) or for _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "\n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "\n", - "For this method the user is allowed to **chose** how to calculate both the **numerator** and the **denominator**.
\n", - "Specifically:\n", - "* **Numerator**: is the number of **facilities that _actually reported_** data, and it is estimated based on whether a facility (FoSa, or HF, or `OU_ID`) **submitted data** for **_any_** of the following **indicators**:\n", - " * `CONF`: confirmed malaria cases and/or\n", - " * `SUSP`: suspected malaria cases and/or\n", - " * `TEST`: tested malaria cases
\n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "\n", - "
\n", - " \n", - "* **Denominator**: is the number of **facilities _expected_ to report**. This number can be obtained in two different ways:\n", - " * `\"DHIS2_EXPECTED_REPORTS\"`: uses the col `EXPECTED_REPORTS` from the df `dhis2_reporting_expected`.
\n", - " This is obtained directly from DHIS2, and is the same denominator used to calculate the \"Dataset\" reporting rate.\n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (`OU_ID`), defined as those that submitted _any_ data **at least once in a given year**, across ***all*** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - "\n", - "
\n", - "\n", - "This method improves over simple binary completeness flags by accounting for both spatial (facility coverage) and temporal (monthly timeliness) dimensions.
" - ] - }, - { - "cell_type": "markdown", - "id": "f5dcd3b9-6f02-4fc5-9e5f-2253c015a3d4", - "metadata": {}, - "source": [ - "### Calculate the **numerator**" - ] - }, - { - "cell_type": "markdown", - "id": "a90d9f4a-a058-4ad5-8ef2-f827987b5def", - "metadata": {}, - "source": [ - "**Note**: the col `REPORTED` keeps the same name regardless of the value of `DATAELEMENT_METHOD_NUMERATOR` because \n", - "in this way the code needs to be parametrized only once (here).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8076609c-46e8-478a-8283-bc63a70102f8", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "dhis2_routine_active <- dhis2_routine %>%\n", - " mutate(\n", - " # if_any() returns TRUE if the condition is met for any of the selected columns\n", - " ACTIVE = if_else(if_any(all_of(indicators_selected), ~ !is.na(.x)), 1, 0)\n", - " )\n", - "\n", - "log_msg(paste0(\"Evaluating reporting facilities based on indicators: \", paste(indicators_selected, collapse = \", \"), \".\"))\n", - "\n", - "dim(dhis2_routine_active)\n", - "head(dhis2_routine_active, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "325faf35-ed25-4b8e-b421-934a2852f27e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1773313-17e5-478d-b60d-c1193233204d", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# --- 1. 
Calculate `SUBMITTED_REPORTS` as the nr of ACTIVE facilities (that REPORTED, each month) ------------------------\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "dhis2_routine_submitted <- dhis2_routine_active %>% # OLD: dhis2_routine_reporting_month <- dhis2_routine_reporting %>%\n", - " group_by(ADM2_ID, YEAR, MONTH) %>% \n", - " summarise(\n", - " SUBMITTED_REPORTS = sum(ACTIVE, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " ungroup() %>% \n", - " mutate(YEAR = as.integer(YEAR),\n", - " MONTH = as.integer(MONTH)\n", - " ) \n", - "\n", - "print(dim(dhis2_routine_submitted))\n", - "head(dhis2_routine_submitted, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a25647e3-5674-44e0-855e-c3a48483310d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "15f4c12f", - "metadata": {}, - "source": [ - "### Calculate the **denominator**" - ] - }, - { - "cell_type": "markdown", - "id": "06b2070d-c672-425f-a78f-b94a8d16a017", - "metadata": {}, - "source": [ - "#### Option: `ROUTINE_ACTIVE_FACILITIES`\n", - "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`ROUTINE_ACTIVE_FACILITIES`** " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08f03ed1-5831-4fe5-8bde-674a513e8110", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Calculate the tot nr of facilities (distinct OU_ID) based on all HF that appear in the routine data (each YEAR)\n", - "# meaning: regardless of what indicators they submit data for, as long as they have submitted something\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", - " routine_active_facilities <- dhis2_routine %>%\n", - " # Keep only rows where at least one indicator has non-NA value\n", - " filter(if_any(any_of(DHIS2_INDICATORS), ~ !is.na(.))) %>%\n", - " 
group_by(YEAR, ADM2_ID) %>%\n", - " summarize(\n", - " EXPECTED_REPORTS = n_distinct(OU_ID),\n", - " .groups = \"drop\" # remove grouping \n", - " )\n", - "\n", - " nr_of_rows <- nrow(routine_active_facilities)\n", - " log_msg(glue::glue(\"Produced df `routine_active_facilities`, with column `EXPECTED_REPORTS` calculated from DHIS2 routine data. Dataframe `routine_active_facilities` has {nr_of_rows} rows.\"))\n", - "\n", - " head(routine_active_facilities, 3)\n", - " \n", - "} \n" - ] - }, - { - "cell_type": "markdown", - "id": "6629dccb-97b0-4b0e-b23f-15b98704323d", - "metadata": {}, - "source": [ - "#### Option: `PYRAMID_OPEN_FACILITIES`\n", - "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`PYRAMID_OPEN_FACILITIES`** " - ] - }, - { - "cell_type": "markdown", - "id": "0972ffca-c14a-4b93-85ff-027d056c3759", - "metadata": {}, - "source": [ - "------------------" - ] - }, - { - "cell_type": "markdown", - "id": "d49219b7-5932-4062-a10d-e1f3a4a81449", - "metadata": {}, - "source": [ - "#### TEMPORARY! 🇳🇪 **Niger-specific method**\n", - "🚨 Specific to **Niger EnDoP**: Pre-processing needed to separate facilities from adm levels!! 🚨
\n", - "\n", - "⚠️⚠️⚠️ **TEMPORARY: This will be moved to a dedicated pipeline!** ⚠️⚠️⚠️
\n", - "\n", - "Specifically:\n", - "* **Hospital**s (HD a Hopital District): at **level 4** together with Aires de Sante\n", - "* All other **FoSa**s: at **level 6**, also mixed with the hospital units\n", - "\n", - "Therefore, to assigned closed/open status, it is necessary to attach to each individual facility the closng and opening data column. \n", - "To do this: \n", - "1) extract list of facilities and id across the 2 levels (4 and 6) and\n", - "2) calculate the nr of open facilities per MONTH (PERIOD) per ADM2, ending up with a df with cols: `ADM2_ID`, `YEAR`, `MONTH`, `OPEN_FACILITIES_COUNT` = `EXPECTED_REPORTS`\n", - "3) add this to the df with the **numerator** (`dhis2_routine_submitted`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6296329-9bcd-4d2c-afb3-520c6a159cdb", - "metadata": {}, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "# names(dhis2_pyramid_raw)\n", - "dim(dhis2_pyramid_raw)\n", - "head(dhis2_pyramid_raw, 3)\n", - " \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "6f651b18-2d85-4e26-8952-45dae9020c40", - "metadata": {}, - "source": [ - "#### 1. 
Create df with list of all **facilities** with their `DATE_OPENED` and `DATE_CLOSED`: `facility_master`\n", - "Separate \"facilities\" (of any type, such as hospitals to CSI, Infermieres etc) from admin levels and hospital units (wards, depts...)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25dea3c7-44ea-42a7-b467-f470892fcfef", - "metadata": {}, - "outputs": [], - "source": [ - "# Helpers to detect Aires and Hospitals:\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "is_aire_l5 <- function(x) str_detect(x, regex(\"^\\\\s*aire[^a-zA-Z]?\", ignore_case = TRUE))\n", - "is_hospital_l4 <- function(x) str_detect(x, regex(\"^(hd|chr|chu|hgr)\", ignore_case = TRUE))\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55760b89-f4d0-40c5-9905-f7c7c4fee5c0", - "metadata": {}, - "outputs": [], - "source": [ - "# List of all FoSa (from Aires → Level 6)\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "fosa_master <- dhis2_pyramid_raw %>%\n", - " filter(is_aire_l5(LEVEL_5_NAME)) %>%\n", - " distinct(\n", - " OU_ID = LEVEL_6_ID,\n", - " OU_NAME = LEVEL_6_NAME,\n", - " region = LEVEL_2_NAME,\n", - " district = LEVEL_3_NAME,\n", - " ADM2_ID = LEVEL_3_ID,\n", - " DATE_OPENED = OPENING_DATE, \n", - " DATE_CLOSED = CLOSED_DATE\n", - " ) %>%\n", - " mutate(OU_TYPE = \"FoSa\")\n", - "\n", - "dim(fosa_master)\n", - "head(fosa_master)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e9c9f59-9c6c-44e4-bbd9-13c3917f5117", - "metadata": {}, - "outputs": [], - "source": [ - "# List of all Hospitals (from Level 4, aggregate dates across children)\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == 
\"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "hosp_master <- dhis2_pyramid_raw %>%\n", - "filter(is_hospital_l4(LEVEL_4_NAME)) %>%\n", - "group_by(LEVEL_4_ID, LEVEL_4_NAME, LEVEL_2_NAME, LEVEL_3_NAME, LEVEL_3_ID) %>%\n", - "summarise(\n", - " OPENING_DATE = suppressWarnings(min(OPENING_DATE, na.rm = TRUE)),\n", - " CLOSED_DATE = suppressWarnings(max(CLOSED_DATE, na.rm = TRUE)),\n", - " .groups = \"drop\"\n", - ") %>%\n", - "mutate(\n", - " DATE_OPENED = ifelse(is.infinite(OPENING_DATE), NA, OPENING_DATE) |> as_datetime(),\n", - " DATE_CLOSED = ifelse(is.infinite(CLOSED_DATE), NA, CLOSED_DATE) |> as_datetime()\n", - " ) %>%\n", - "distinct(\n", - " OU_ID = LEVEL_4_ID, \n", - " OU_NAME = LEVEL_4_NAME,\n", - " region=LEVEL_2_NAME,\n", - " district=LEVEL_3_NAME,\n", - " ADM2_ID=LEVEL_3_ID,\n", - " DATE_OPENED,\n", - " DATE_CLOSED\n", - ") %>%\n", - "mutate(\n", - " OU_TYPE = \"Hospital\"\n", - " )\n", - "\n", - "dim(hosp_master)\n", - "head(hosp_master)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5859e393-bfac-46e6-b103-cb8177100860", - "metadata": {}, - "outputs": [], - "source": [ - "# Merge both\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", - " \n", - "facility_master <- bind_rows(fosa_master, hosp_master) %>% \n", - " select(ADM2_ID, \n", - " OU_ID, \n", - " DATE_OPENED, \n", - " DATE_CLOSED)\n", - "\n", - "dim(facility_master)\n", - "head(facility_master, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "191c3b01-1645-410e-be99-b247bf5f9cfb", - "metadata": {}, - "source": [ - "---------------------" - ] - }, - { - "cell_type": "markdown", - "id": "1ee48156-5ed5-43a5-b927-caa53c10d98e", - "metadata": {}, - "source": [ - "#### **Generic** part: applies to **all countries**" - ] - }, - { - "cell_type": "markdown", - "id": "3aa057a5-6f68-493e-83e2-81bafce42c9e", - 
"metadata": {}, - "source": [ - "#### 2. Calculate nr of **OPEN facilities** for each `MONTH` per `ADM2`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91f972f1-bcc9-458f-9662-5574efc7ac9d", - "metadata": {}, - "outputs": [], - "source": [ - "# Define start and end period based on routine data \n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "PERIOD_START <- dhis2_routine$PERIOD |> min()\n", - "PERIOD_END <- dhis2_routine$PERIOD |> max()\n", - "\n", - "print(paste0(\"Start period: \", PERIOD_START))\n", - "print(paste0(\"End period :\", PERIOD_END))\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c6ed56f-b2a6-460c-a3dc-6c588c40b54c", - "metadata": {}, - "outputs": [], - "source": [ - "## Create a \"complete\" grid of every month and year for the period range ---------------------------------------\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "months_grid <- tibble(\n", - " month_date = seq(\n", - " ymd(paste0(PERIOD_START, \"01\")), # Converts 202201 to \"20220101\" and then to a date\n", - " ymd(paste0(PERIOD_END, \"01\")), # same\n", - " by = \"months\"\n", - " )\n", - ") %>%\n", - " mutate(\n", - " YEAR = year(month_date),\n", - " MONTH = month(month_date)\n", - " )\n", - "\n", - "dim(months_grid) \n", - "head(months_grid, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb7d9a47-053e-43a5-9e0d-c7b717236f3e", - "metadata": {}, - "outputs": [], - "source": [ - "## Create 
`facility_master` for any (🚨 non-NER) countries\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", - "\n", - " # Programmatically define `ADM2_ID`\n", - " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", - " # Programmatically define `OU_ID`\n", - " HF_LEVEL <- glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\")\n", - "\n", - " facility_master <- dhis2_pyramid_formatted |>\n", - " mutate(\n", - " DATE_OPENED = with_tz(OPENING_DATE, \"UTC\"),\n", - " DATE_CLOSED = with_tz(CLOSED_DATE, \"UTC\")\n", - " ) |>\n", - " select(\n", - " ADM2_ID = all_of(ADMIN_2_LEVEL), \n", - " OU_ID = all_of(HF_LEVEL),\n", - " DATE_OPENED, #= OPENING_DATE,\n", - " DATE_CLOSED #= CLOSED_DATE\n", - ")\n", - "\n", - "head(facility_master)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f283cc96-ed69-43a3-964e-57ccb0180a4a", - "metadata": {}, - "outputs": [], - "source": [ - "## Create a \"complete\" grid of every ADM2_ID for every month ---------------------------------------\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "# This ensures that even if an ADM2_ID has zero open facilities in a month,\n", - "# it will still appear in the final result with a count of 0.\n", - "complete_grid <- expand_grid(\n", - " ADM2_ID = unique(facility_master$ADM2_ID),\n", - " month_date = months_grid$month_date\n", - ") %>%\n", - " mutate(\n", - " YEAR = year(month_date),\n", - " MONTH = month(month_date),\n", - " month_date = with_tz(as_datetime(month_date), \"UTC\") # GP added 0809\n", - " )\n", - "\n", - "head(complete_grid, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "6e905c46-036a-4aa9-85ad-216f846f9e1b", - "metadata": {}, - "outputs": [], - "source": [ - "## Calculate the number of open facilities ---------------------------------------\n", - "\n", - "# # The facility must have opened on or before the last day of the current month. \n", - "# # To calculate the last day: add one month and subtract one day from the first day.\n", - "# complete_grid$month_date[1] # \"2022-01-01\"\n", - "# complete_grid$month_date[1] + months(1) - days(1) # \"2022-01-31\"\n", - "# # The facility must either still be open (DATE_CLOSED is NA) OR it must have closed on or after the first day of that month.\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "open_facilities_count <- facility_master %>%\n", - " # Create a row for every possible combination of facility and month\n", - " crossing(months_grid) %>%\n", - " # A facility is \"open\" if it opened BEFORE the end of the month\n", - " # AND it either never closed (NA) or closed AFTER the start of the month.\n", - " filter(\n", - " DATE_OPENED <= month_date + months(1) - days(1) & # opened on or before the last day of the current month\n", - " (is.na(DATE_CLOSED) | DATE_CLOSED >= month_date) # \n", - " ) %>%\n", - " # Count the number of open facilities for each area and month\n", - " count(ADM2_ID, YEAR, MONTH, name = \"OPEN_FACILITIES_COUNT\")\n", - "\n", - "head(open_facilities_count, 3)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ba854f2-5925-4154-b86e-3e4e7bb6c363", - "metadata": {}, - "outputs": [], - "source": [ - "## Join the counts back to the complete grid to include zeros --------------------------------------\n", - "\n", - "# if (COUNTRY_CODE == \"NER\" && 
REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " \n", - "pyramid_open_facilities <- complete_grid %>%\n", - " left_join(open_facilities_count, by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>%\n", - " # If a month had no open facilities, the count will be NA. Change it to 0.\n", - " # Also rename `OPEN_FACILITIES_COUNT` to `EXPECTED_REPORTS` to use same col name as other methods\n", - " mutate(OPEN_FACILITIES_COUNT = replace_na(OPEN_FACILITIES_COUNT, 0)) %>% # DENOMINATOR: consistent col name across all methods \n", - " select(ADM2_ID, YEAR, MONTH, \n", - " EXPECTED_REPORTS = OPEN_FACILITIES_COUNT) %>%\n", - " arrange(ADM2_ID, YEAR, MONTH)\n", - "\n", - "print(dim(pyramid_open_facilities))\n", - "head(pyramid_open_facilities, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff1a8537-d093-4d5c-8a44-4b729090cced", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "811310a9-df85-4fa3-af9a-b931eaffd7e5", - "metadata": {}, - "source": [ - "### Calculate **Reporting Rate** " - ] - }, - { - "cell_type": "markdown", - "id": "8827cfd6-479b-4025-a379-d20bf20fcfb4", - "metadata": {}, - "source": [ - "**Join df for Denominator**\n", - "\n", - "**Note**
\n", - "in both df's (`dhis2_reporting_expected` OR `routine_active_facilities`) the col `EXPECTED_REPORTS` has the same name to simplify parametrization: only difference between the 2 options is the df to be joined (right element in `left_join()`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "670508a0-3075-4f82-aa2c-d26cf867f13d", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# --- 2. Join `dhis2_reporting_expected` OR `dhis2_calculated_expected` to add `EXPECTED_REPORTS` ------------------------------------------------\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "# Parametrized based on DATAELEMENT_METHOD_DENOMINATOR: left_join() the respective df\n", - "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " # Add df of rep rate extracted directly from DHIS2\n", - " dhis2_routine_submitted_expected <- left_join(\n", - " dhis2_routine_submitted, \n", - " dhis2_reporting_expected |> select(ADM2_ID, YEAR, MONTH, EXPECTED_REPORTS), # `dhis2_reporting_expected`\n", - " by = join_by(ADM2_ID, YEAR, MONTH)\n", - " ) \n", - " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` extracted directly from DHIS2.\")\n", - " \n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", - " # Add df of rep rate CALCULATED based on submissiosn in dhis2 routine data \"active\" facilities\n", - " dhis2_routine_submitted_expected <- left_join(\n", - " dhis2_routine_submitted, \n", - " routine_active_facilities, # has only cols: `YEAR`, `ADM2_ID`, `EXPECTED_REPORTS`\n", - " by = join_by(ADM2_ID, YEAR) #, MONTH)\n", - " ) \n", - " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 routine data. 
Here, ACTIVE facilities \n", - " are defined as facilities that reported on any of the extracted indicators at least once per year.\")\n", - " \n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " # Add df of rep rate CALCULATED based on OPEN facilities as per PYRAMID RAW\n", - " dhis2_routine_submitted_expected <- left_join(\n", - " dhis2_routine_submitted, \n", - " pyramid_open_facilities, \n", - " by = join_by(ADM2_ID, YEAR, MONTH)\n", - " ) \n", - " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 pyramid. \n", - " This method counts the number of OPEN facilities for each ADM2 per MONTH.\")\n", - "}\n", - "\n", - "# Safety measures ...\n", - "dhis2_routine_submitted_expected <- dhis2_routine_submitted_expected |>\n", - " # ungroup() %>% \n", - " mutate(YEAR = as.integer(YEAR),\n", - " MONTH = as.integer(MONTH)\n", - " ) \n", - "\n", - "\n", - "print(dim(dhis2_routine_submitted_expected))\n", - "head(dhis2_routine_submitted_expected, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6fad303c-b239-4cf9-93a8-fe3ce5c33c37", - "metadata": {}, - "outputs": [], - "source": [ - "# --- 3. 
Calculate `REPORTING_RATE` ------------------------------------------------\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " \n", - "reporting_rate_dataelement <- dhis2_routine_submitted_expected |>\n", - "mutate(\n", - " REPORTING_RATE = SUBMITTED_REPORTS / EXPECTED_REPORTS\n", - " ) \n", - "\n", - "dim(reporting_rate_dataelement)\n", - "head(reporting_rate_dataelement, 3)\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68023e8e-f7f6-4201-b097-1996bee57671", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# head(hf_active, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "ae3aa127-c20c-4ca5-af0c-4a4260883cac", - "metadata": {}, - "source": [ - "`#### 🚨 Here 👇 swap denominator: join `dhis2_reporting_expected` to replace `TOTAL_HF` with `EXPECTED_REPORTS``" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a97c7d75-3317-48bc-a2f1-770bf38d141a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " inspect_reporting_rate(reporting_rate_dataelement)\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92651472-26e2-4131-ac02-288122138b0b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# # --- 1. create intermediate df `hf_active_month`: summarize nr of \"active\" (reporting) HF by month ------------------------\n", - "# hf_active_month <- hf_active %>% \n", - "# # filter(ADM1_ID == \"rWrCdr321Qu\") |> # ⚠️⚠️⚠️ TEMP subset just for CODE development ... ! 
⚠️⚠️⚠️\n", - "# dplyr::group_by(ADM2_ID, YEAR, MONTH) %>%\n", - "# dplyr::summarize(\n", - "# SUBMITTED_REPORTS = length(which(ACTIVE == TRUE)), # 🚨 GP changed to BOOLEAN to save space\n", - "# .groups = \"drop\") |>\n", - "# mutate(YEAR = as.integer(YEAR), \n", - "# MONTH = as.integer(MONTH)\n", - "# )\n", - "\n", - "# print(dim(hf_active_month))\n", - "# head(hf_active_month)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db5ad094-0601-4a18-9435-db60c1f4e8ff", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - " reporting_rate_dataelement <- reporting_rate_dataelement |> \n", - " select(all_of(fixed_cols_rr))\n", - " \n", - " head(reporting_rate_dataelement, 3)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05f94483-1524-426e-9fe3-4b9bf572c05e", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "id": "f5827740-2917-4504-9017-9ec7d408e5f4", + "metadata": {}, + "source": [ + "Script structure:\n", + "\n", + " 0. Parameters: set back-up values for parameters, for when the notebook is run manually (_noy_ via pipeline)\n", + " 1. Setup:\n", + " * Paths\n", + " * Utils functions\n", + " 2. Load Data\n", + " * **Routine data** (DHIS2) already formatted & aggregated (output of pipeline XXX)\n", + " * **Reporting** (DHIS2) pre-computed, already formatted & aggregated (output of pipeline ???)\n", + " * **Shapes** (DHIS2) for plotting (this could be removed if we move the plots to \"report/EDA\" nb)\n", + " 3. Calculate **Reportng Rate (RR)**\n", + " * \"**Dataset**\": using pre-computed reportings from DHIS2/SNIS (was: \"DHIS2\")\n", + " * \"**Data Element**\": using calculated expected nr of report (nr of active facilities) (was: \"CONF\")\n", + " 4. 
**Export** reporting rate data to `.../data/dhis2/reporting_rate/` as .parquet (and .csv) files for **either**:\n", + " * data**set**: \"XXX_reporting_rate_**dataset**.parquet\" **or**\n", + " * data**element**: \"XXX_reporting_rate_**dataelement**.parquet\"" + ] + }, + { + "cell_type": "markdown", + "id": "5e8f5bf2-922a-468a-8a2c-8e56d7e652df", + "metadata": {}, + "source": [ + "--------------------" + ] + }, + { + "cell_type": "markdown", + "id": "e962c5a4-6b09-4485-8d71-d842159118d3", + "metadata": {}, + "source": [ + "### To Do:\n", + "* For `DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\"`: **add code** to count OPEN facilities () for **countries with \"normal\" pyramids** (i.e., when no mixing of facilities and admin levels ... !). Atm only code for Niger, which runs only if `COUNTRY_CODE == NER`. Should add similar (but simpler) code for the rest of the countries (i.e, `COUNTRY_CODE != NER`)\n", + "* Check why Data Element **Denominator** `routine_active_facilities` is **calculated at `YEAR` (aggregated) instead of `MONTH`** ... possibly fix this to match granularity of other alternatives for denominator (which are calculated at MONTH level)\n", + "* Modify **report notebook** and/or pipeline.py code so that it does not make the **pipeline FAIL** if `reporting_rate_dataset` or `reporting_rate_dataelement` is **not found** (which is now always the case since we only output 1 file at each run!!)" + ] + }, + { + "cell_type": "markdown", + "id": "0cdfdc73-bb9a-48a8-a26b-84ecbab2e0aa", + "metadata": {}, + "source": [ + "----------------" + ] + }, + { + "cell_type": "markdown", + "id": "339f6d58-0965-40ef-b718-96195d2463f8", + "metadata": {}, + "source": [ + "## Parameters" + ] + }, + { + "cell_type": "markdown", + "id": "dd6cd6f8-b91b-4902-8801-a60e11776f98", + "metadata": {}, + "source": [ + "Set Default values **if _not_ provided by pipeline**
\n", + "This makes the execution flexible and \"safe\": nb can be run manually from here or be executed via pipeline, without having to change anything in the code!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93aac683-8828-4a42-b841-f16c7e8fbb07", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set BACKUP VALUE: root path - NEVER CHANGE THIS!\n", + "if (!exists(\"SNT_ROOT_PATH\")) {\n", + " SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "}\n", + "\n", + "\n", + "# Choose to run either DataSet OR DataElement method\n", + "if (!exists(\"REPORTING_RATE_METHOD\")) {\n", + " # REPORTING_RATE_METHOD <- \"DATASET\" \n", + " REPORTING_RATE_METHOD <- \"DATAELEMENT\"\n", + "}\n", + "\n", + "\n", + "# Data Elemenet method: Choice of which INDICATORS to use to count the nr of reporting facilities \n", + "# CONF\n", + "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_CONF\")) {\n", + " DATAELEMENT_METHOD_NUMERATOR_CONF <- TRUE # FALSE\n", + "}\n", + "\n", + "# SUSP\n", + "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_SUSP\")) {\n", + " DATAELEMENT_METHOD_NUMERATOR_SUSP <- TRUE # FALSE\n", + "}\n", + "\n", + "# TEST\n", + "if (!exists(\"DATAELEMENT_METHOD_NUMERATOR_TEST\")) {\n", + " DATAELEMENT_METHOD_NUMERATOR_TEST <- TRUE # FALSE\n", + "}\n", + "\n", + "\n", + "\n", + "# Data Elemenet RR. Choice: which df to use for nr of `EXPECTED_REPORTS` (DENOMINATOR) \n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {\n", + " # DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\" \n", + " DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\" \n", + " # DATAELEMENT_METHOD_DENOMINATOR <- \"DHIS2_EXPECTED_REPORTS\" # ⚠️ only if `REPORTING_RATE_METHOD == \"DATASET\"` && DataSet is available!! ⚠️\n", + "} \n" + ] + }, + { + "cell_type": "markdown", + "id": "af076158-1f5a-408d-8ce2-2f2101d0531c", + "metadata": {}, + "source": [ + "## 1. 
Setup" + ] + }, + { + "cell_type": "markdown", + "id": "3ae826e4-f728-4c8d-81fb-0857234ac622", + "metadata": {}, + "source": [ + "### 1.1. Paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5f1b8ce-db82-4295-8e74-00b765cf0b9d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# PROJECT PATHS\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') " + ] + }, + { + "cell_type": "markdown", + "id": "22971de0-1431-4cbd-b8c1-3bd3e1609e0d", + "metadata": {}, + "source": [ + "### 1.2. Utils functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1784fd43-03f3-478b-8148-4b478317ea21", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate.r\"))" + ] + }, + { + "cell_type": "markdown", + "id": "3bbcbd39-54e8-4ece-9244-30d7d30291d2", + "metadata": {}, + "source": [ + "### 1.3. Packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "426ecff6-0b4c-474d-a48d-826002205b89", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List required pcks ----------------> check what are the really required libraries\n", + "required_packages <- c(\"arrow\", # for .parquet\n", + " \"tidyverse\",\n", + " \"stringi\", \n", + " \"jsonlite\", \n", + " \"httr\", \n", + " \"reticulate\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)" + ] + }, + { + "cell_type": "markdown", + "id": "18a8e0c1-ac09-4435-b6f4-5f91fd916396", + "metadata": {}, + "source": [ + "### 1.3.1. 
OpenHEXA-specific settings" + ] + }, + { + "cell_type": "markdown", + "id": "ebb8c7d5-7c2c-4dbe-a1ba-238419fbedf3", + "metadata": {}, + "source": [ + "#### For 📦{sf}, tell OH where to find stuff ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91a66fb7-dd5e-43fd-a6a2-d8bb9f0315d6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "markdown", + "id": "ac9ee427-020e-47c5-b2c9-5ca24e1f2779", + "metadata": {}, + "source": [ + "#### Set environment to load openhexa.sdk from the right path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa331278-573d-4a22-ab16-da6972d7b0be", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set environment to load openhexa.sdk from the right path\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "markdown", + "id": "339b2e8b-9bf6-4eaf-b283-d9360c1c6899", + "metadata": {}, + "source": [ + "### 1.4. 
Load and check `config` file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c46526-6844-43ae-bb53-d8d1ad2fac24", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "\n", + "config_file_name <- \"SNT_config.json\" \n", + "config_json <- tryCatch({\n", + " jsonlite::fromJSON(file.path(CONFIG_PATH, config_file_name)) \n", + " },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, config_file_name))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "markdown", + "id": "29182f25-b0cf-46aa-9818-49616cd3f353", + "metadata": {}, + "source": [ + "**Save config fields as variables**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c52654c8-8a19-4e0c-a83b-1bc2eecae6bc", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Generic\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", + "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", + "\n", + "# Which (aggregated) indicators to use to evaluate \"activity\" of an HF - for Reporting Rate method \"Ousmane\"\n", + "DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", + "\n", + "# Which reporting rate PRODUCT_UID to use (not that this is a dataset in COD, but 2 dataElements in BFA!)\n", + "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "412572bc-fb96-4f61-ac49-be7f449219b6", + "metadata": { 
+ "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DHIS2_INDICATORS\n", + "log_msg(paste(\"Expecting the following DHIS2 (aggregated) indicators : \", paste(DHIS2_INDICATORS, collapse=\", \")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a0a8562-4a70-455c-9ccf-aa39f4cf4e31", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Fixed cols for routine data formatting \n", + "fixed_cols <- c('OU_ID','PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID') # (OU_NAME has homonymous values!)\n", + "# print(paste(\"Fixed routine data (`dhis2_routine`) columns (always expected): \", paste(fixed_cols, collapse=\", \")))\n", + "log_msg(paste(\"Expecting the following columns from routine data (`dhis2_routine`) : \", paste(fixed_cols, collapse=\", \")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86e82d54-2b00-4c25-9b34-3497d4c88c52", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Fixed cols for exporting RR tables: to export output tables with consistent structure\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') " + ] + }, + { + "cell_type": "markdown", + "id": "dadc7351-e67e-450b-a046-bc64660a7dde", + "metadata": {}, + "source": [ + "### 1.5. 🔍 Check: at least 1 indicator must be selected\n", + "The user can toggle on/off each of the indicators. Therefore, we need to make sure at least one is ON.
\n", + "Alternatively, `CONF` could be made mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cf6e2a4-0822-4a0c-852e-143da5473d20", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "nr_of_indicators_selected <- sum(DATAELEMENT_METHOD_NUMERATOR_CONF, DATAELEMENT_METHOD_NUMERATOR_SUSP, DATAELEMENT_METHOD_NUMERATOR_TEST)\n", + "\n", + "if (nr_of_indicators_selected == 0) {\n", + " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method 'Data Element'! Select at least one (e.g., `CONF`).\"\n", + " cat(msg) \n", + " stop(msg)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "8d8d9be2-bf05-466d-811e-6beea0dccfde", + "metadata": {}, + "source": [ + "## 2. Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "0fa1b169-fc55-4ef1-b58f-6a7dc9d1dec3", + "metadata": {}, + "source": [ + "### 2.1. 
**Routine** data (DHIS2) \n", + "already formatted & aggregated (output of pipeline XXX)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586e8da8-4e1c-431a-9b8d-1169167e1c09", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DHIS2 Dataset extract identifier\n", + "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "# Load file from dataset\n", + "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_routine.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "msg <- paste0(\"DHIS2 routine data loaded from dataset : \", dataset_name, \" dataframe dimensions: \", paste(dim(dhis2_routine), collapse=\", \"))\n", + "log_msg(msg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2454183-44f7-4e2e-a0cf-ca112aa183bb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Ensure correct data type for numerical columns \n", + "dhis2_routine <- dhis2_routine %>%\n", + " mutate(across(c(PERIOD, YEAR, MONTH), as.numeric))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edb2fcdc-ce0a-4c78-b06a-9f4610ab4714", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(dhis2_routine, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "821e1ebf-b2fa-4469-974e-2e4d27d58854", + "metadata": {}, + "source": [ + "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", + "Only when: `DATAELEMENT_METHOD_NUMERATOR == \"CONF|SUSP|TEST\"`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3f8b89e-a04e-4e0b-9892-95ce2150e7da", + "metadata": { + "vscode": { + 
"languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(dhis2_routine, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "adec5412", + "metadata": {}, + "source": [ + "#### 🔍 Check expected cols for method **Data Element**, numerator using multiple indicators.\n", + "Based on which indicator(s) are selected (if any)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bbcdf8c-873a-4b41-980a-f18d1863ab8f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Initialize empty vector\n", + "indicators_selected = c()\n", + "\n", + "# Add elements based on user selection(s)\n", + "if (DATAELEMENT_METHOD_NUMERATOR_CONF) {\n", + " indicators_selected = append(indicators_selected, \"CONF\")\n", + "}\n", + "\n", + "if (DATAELEMENT_METHOD_NUMERATOR_SUSP) {\n", + " indicators_selected = append(indicators_selected, \"SUSP\")\n", + "}\n", + "\n", + "if (DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", + " indicators_selected = append(indicators_selected, \"TEST\")\n", + "}\n", + "\n", + "print(paste0(\"Selected indicators: \", paste(indicators_selected, collapse = \", \")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b84753f8-aa9c-4563-beae-5e29b3f1e773", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# This is kinda useless now but KEEP in case we ADD MORE CHOICES OF INDICATORS!! 
\n", + "if(REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " if (DATAELEMENT_METHOD_NUMERATOR_CONF | DATAELEMENT_METHOD_NUMERATOR_SUSP | DATAELEMENT_METHOD_NUMERATOR_TEST) {\n", + " log_msg(paste0(\"Indicator(s) \", paste(indicators_selected, collapse = \", \") , \" selected for calculation of numerator for method `Data Element`.\" ))\n", + " \n", + " if ( length(which(indicators_selected %in% names(dhis2_routine))) < length(indicators_selected) ) {\n", + " log_msg(paste0(\"🚨 Warning: one or more of the following columns is missing from `dhis2_routine`: \", paste(indicators_selected, collapse = \", \"), \".\"), \"warning\")\n", + " } \n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "c832da26-fe0c-43fe-8300-2fff5c4cbf34", + "metadata": {}, + "source": [ + "### 2.2. **Reporting** pre-computed from DHIS2 \n", + "Data granularity:\n", + "* **ADM2**\n", + "* **MONTH** (PERIOD)\n", + "\n", + "Note: data comes from different datasets (`DS_NAME`): `A SERVICES DE BASE`, `B SERVICES SECONDAIRES`,`D SERVICE HOPITAL` \n", + "\n", + "The col `DS_METRIC` indicates whether the `VALUE` is `EXPECTED_REPORTS` or `ACTUAL_REPORTS`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ce295b9-9898-4e12-8a91-92bb25b9e0a2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# REPORTING_RATE_METHOD <- \"DATAELEMENT\" # \"DATASET\"\n", + "REPORTING_RATE_METHOD" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a32fc96-5b8e-4108-a224-c0d843df9b47", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " # DHIS2 Dataset extract identifier\n", + " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + " file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", + " \n", + " # Load file from dataset\n", + " 
dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pre-computed REPORTING data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + " \n", + " msg <- paste0(\"DHIS2 pre-computed REPORTING data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). Dataframe dimensions: \", \n", + " paste(dim(dhis2_reporting), collapse=\", \"))\n", + " log_msg(msg)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e131d9ee-0e88-4bb6-982b-53b1229fba5f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " # Convert VALUE col to - should not be needed but keep as safety measure \n", + " dhis2_reporting <- dhis2_reporting |>\n", + " mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", + "\n", + " head(dhis2_reporting, 3)\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46e3dba8-d46b-457e-ba90-c663e30c42d2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # Convert VALUE col to - should not be needed but keep as safety measure \n", + "# dhis2_reporting <- dhis2_reporting |>\n", + "# mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5149befe-b6ad-46a9-9879-7637ce5b02be", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# head(dhis2_reporting, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "7a967af3-f6e5-428a-8769-72808f21a125", + "metadata": {}, + "source": [ + "#### 2.2.1. 
**Filter** to keep only values for `PRODUCT_UID` defined in config.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1948c2f7-7a2c-47a2-9dc6-ba29da6d030c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "REPORTING_RATE_PRODUCT_ID" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4258098-e24c-4520-914d-0f73354bb3ab", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + "\n", + " # Handle problems with incorrect configuration - to be improved 🚧\n", + " if (is.null(REPORTING_RATE_PRODUCT_ID)) {\n", + " log_msg(\"🛑 Problem with definition of REPORTING_RATE_PRODUCT_ID, check `SNT_config.json` file!\")\n", + " } else {\n", + " product_name <- dhis2_reporting |> filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |> pull(PRODUCT_NAME) |> unique()\n", + " log_msg(glue::glue(\"Using REPORTING_RATE_PRODUCT_ID == `{REPORTING_RATE_PRODUCT_ID}`, corresponding to DHIS2 Product name : `{product_name}`.\"))\n", + " }\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c22c6ada-7cb1-4fca-b65e-b51e5eca35a2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + "\n", + " dhis2_reporting_filtered <- dhis2_reporting |>\n", + " filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) |>\n", + " select(-PRODUCT_UID, -PRODUCT_NAME) # useless cols now\n", + " \n", + " print(dim(dhis2_reporting_filtered))\n", + " head(dhis2_reporting_filtered)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9da035cf-5d3f-4df0-a063-2d2497616c82", + "metadata": {}, + "source": [ + "#### 2.2.2. 
Format to produce `dhis2_reporting_expected`\n", + "🚨 Note: Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denominator for REPORTING_RATE calculations (methods dataset and dataelement)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e970f9d-258e-4050-ae69-185b88c79fc3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " \n", + " dhis2_reporting_wide <- dhis2_reporting_filtered |> \n", + " pivot_wider(\n", + " names_from = PRODUCT_METRIC, \n", + " values_from = VALUE\n", + " )\n", + " \n", + " print(dim(dhis2_reporting_wide))\n", + " head(dhis2_reporting_wide)\n", + " \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eab31756-ae6b-4152-8ec3-8195236d8732", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Use `dhis2_reporting_expected$EXPECTED_REPORTS` as new denominator for RR calculations (methods ANY and CONF)\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " \n", + " dhis2_reporting_expected <- dhis2_reporting_wide |> \n", + " select(-ACTUAL_REPORTS)\n", + " \n", + " print(dim(dhis2_reporting_expected))\n", + " head(dhis2_reporting_expected)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3c3d0f35-889d-4c16-9741-d6e75e2ef096", + "metadata": {}, + "source": [ + "#### 2.2.3. **Checks** on data completeness: _do **periods match** with routine data?_\n", + "Lack of perfect overlap in periods between routine data and reporting rate data might create headaches downstream!
\n", + "Specifically, **incidence** calculations will show **N2 smaller than N1** due to **aggregation by YEAR when NA** values are present!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ea57600-418b-45bc-805a-f829e237b4c4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " \n", + " # --- Check Year Compatibility ---\n", + " routine_years <- sort(unique(as.integer(dhis2_routine$YEAR))) # as.integer\n", + " expected_years <- sort(unique(as.integer(dhis2_reporting_expected$YEAR))) # as.integer\n", + " \n", + " if (!setequal(routine_years, expected_years)) {\n", + " missing_in_routine <- setdiff(expected_years, routine_years)\n", + " missing_in_expected <- setdiff(routine_years, expected_years)\n", + " \n", + " if (length(missing_in_routine) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_reporting_expected' but not in 'dhis2_routine': \",\n", + " paste(missing_in_routine, collapse = \", \")))\n", + " }\n", + " if (length(missing_in_expected) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: YEAR value(s) present in 'dhis2_routine' but not in 'dhis2_reporting_expected': \",\n", + " paste(missing_in_expected, collapse = \", \")))\n", + " }\n", + " } else {\n", + " log_msg(\"✅ YEAR values are consistent across 'dhis2_routine' and 'dhis2_reporting_expected'.\")\n", + " \n", + " # --- Check Month Compatibility (if years are consistent) ---\n", + " all_years <- unique(routine_years) # Or expected_years, they are the same now\n", + " \n", + " for (year_val in all_years) {\n", + " routine_months_for_year <- dhis2_routine %>%\n", + " filter(YEAR == year_val) %>%\n", + " pull(MONTH) %>%\n", + " unique() %>%\n", + " sort()\n", + " \n", + " expected_months_for_year <- dhis2_reporting_expected %>%\n", + " filter(YEAR == year_val) %>%\n", + " pull(MONTH) %>%\n", + " unique() 
%>%\n", + " sort()\n", + " \n", + " if (!setequal(routine_months_for_year, expected_months_for_year)) {\n", + " missing_in_routine_months <- setdiff(expected_months_for_year, routine_months_for_year)\n", + " missing_in_expected_months <- setdiff(routine_months_for_year, expected_months_for_year)\n", + " \n", + " if (length(missing_in_routine_months) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_routine_months, collapse = \", \"),\n", + " \"' present in 'dhis2_reporting_expected' but not in 'dhis2_routine'!\"\n", + " ))\n", + " }\n", + " if (length(missing_in_expected_months) > 0) {\n", + " log_msg(paste0(\"🚨 Warning: for YEAR \", year_val, \", MONTH value(s) '\", paste(missing_in_expected_months, collapse = \", \"), \n", + " \"' present in 'dhis2_routine' but not in 'dhis2_reporting_expected'!\"\n", + " ))\n", + " }\n", + " } else {\n", + " log_msg(paste0(\"✅ For year \", year_val, \", months are consistent across both data frames.\"))\n", + " }\n", + " }\n", + " }\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "5e711191-995b-4f89-b10c-fc2214cdd8b2", + "metadata": {}, + "source": [ + "### 2.3. **Pyramid** to count OPEN facilities (denominator)\n", + "Table (and column) needed for denominator of \"Data Element\" reporting rate if choice == `PYRAMID_OPEN_FACILITIES`\n", + "\n", + "**Important**: the pyramid must contain the `OPENING_DATE` and `CLOSING_DATE` columns (this was implemented in the new extraction pipeline from 2025-09).
\n", + "Then, **depending on the Country** (well, their pyramid structure) **import** either:\n", + "* **Raw** pyramid for 🇳🇪 Niger: because we first need to \"manually\" correctly aggregate the VALUEs for the HF (separate them from admin levels and sum up HD units)\n", + "* **Formatted** pyramid for all other countries encountered so far: 🇨🇩 DRC, 🇧🇫 Burkina Faso ... because their pyramid is already usable right away" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad3f83b6-2fdd-45da-8a4d-fb06513b6be2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# DATAELEMENT_METHOD_DENOMINATOR <- \"PYRAMID_OPEN_FACILITIES\"\n", + "DATAELEMENT_METHOD_DENOMINATOR" + ] + }, + { + "cell_type": "markdown", + "id": "e7b80b6e-9e34-4e71-93e8-7e16a110e17c", + "metadata": {}, + "source": [ + "#### **Raw** pyramid for 🇳🇪 **Niger**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "652cf1a7-c9a2-48db-b44d-8fabfd0e072f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + " # DHIS2 Dataset extract identifier\n", + " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS\n", + " \n", + " # Load file from dataset\n", + " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", + " dhis2_pyramid_raw <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_dhis2_raw_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pyramid RAW data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + " \n", + " msg <- paste0(\"DHIS2 RAW pyramid data loaded from dataset : `\", dataset_name, \"`. 
Dataframe dimensions: \", paste(dim(dhis2_pyramid_raw), collapse=\", \"))\n", + " log_msg(msg)\n", + " \n", + " head(dhis2_pyramid_raw)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "f1716ac3-ce8f-4223-9729-6ed826e743bc", + "metadata": {}, + "source": [ + "#### **Formatted** pyramid for all other countries (normal pyramid) 🇨🇩 🇧🇫" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc16ae54-4915-4333-b458-2b611e2b1792", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", + " \n", + " # DHIS2 Dataset extract identifier\n", + " dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + " \n", + " # Load file from dataset\n", + " # Rename `dhis2_pyramid`?? Check with downstream processes ... 🚧\n", + " dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + " \n", + " msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset : `\", dataset_name, \"`. Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", + " log_msg(msg)\n", + " \n", + " head(dhis2_pyramid_formatted)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "eb4c5c63-d140-46b8-b686-886e612a31dc", + "metadata": {}, + "source": [ + "## 3. Calculate **Reporting Rate** (RR)\n", + "We compute it using 2 approaches, user can decided later on which one to use for incidence adjustment." + ] + }, + { + "cell_type": "markdown", + "id": "cb724aa8-5f06-4e99-aeca-640d0c1b049e", + "metadata": {}, + "source": [ + "## 3.1. 
\"**Dataset**\" reporting rate: pre-computed, from **DHIS2**\n", + "Exrtacted from DHIS2 and formatted. \n", + "\n", + "Straightforward: `ACTUAL_REPORTS` / `EXPECTED_REPORTS` (just pivot `DS_METRIC` and divide)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10b2f52b-0217-43f1-88a3-cd01d98869b1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + "\n", + " reporting_rate_dataset <- dhis2_reporting_wide |> \n", + " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", + " \n", + " print(dim(reporting_rate_dataset))\n", + " head(reporting_rate_dataset, 3)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3d49eda8-b4fd-437a-8938-17bf0806f281", + "metadata": {}, + "source": [ + "#### Quick data quality check 🔍" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cff33416-ea66-4eeb-9d33-1597c2f05b0c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# inspect_reporting_rate() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2f4c11c-c683-4204-ab91-9d41cab4826c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " inspect_reporting_rate(reporting_rate_dataset)\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "04870e93-5385-425b-89fd-b815a87cfa21", + "metadata": {}, + "source": [ + "#### Subset cols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d90671b0-36f8-4c6e-8736-4ea807079f83", + "metadata": { + "scrolled": true, + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\" | 
DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " reporting_rate_dataset <- reporting_rate_dataset |> \n", + " select(all_of(fixed_cols_rr))\n", + " \n", + " dim(reporting_rate_dataset)\n", + " head(reporting_rate_dataset, 3)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62e6cb16-0196-447f-b142-aaec2120eecb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "2dc27c07-80cd-465e-891f-9fb70111dbb0", + "metadata": {}, + "source": [ + "#### Plot by MONTH (heatmap)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9ce56fc-f86a-4a2b-95b7-fb6ec5b89087", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", + " \n", + " # Plot reporting rate heatmap\n", + " options(repr.plot.width = 20, repr.plot.height = 10) \n", + " \n", + " # reporting_rate_conf_month %>%\n", + " reporting_rate_dataset %>%\n", + " mutate(\n", + " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", + " ) %>%\n", + " ggplot(., aes(x = DATE, \n", + " y = factor(ADM2_ID), \n", + " fill = REPORTING_RATE * 100)\n", + " ) + \n", + " geom_tile() +\n", + " scale_fill_viridis_c(\n", + " option = \"C\",\n", + " direction = 1, # blue = low, yellow = high\n", + " limits = c(0, 100),\n", + " name = \"Reporting rate (%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Monthly Reporting Rate by Health District - Method 'DataSet'\",\n", + " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", + " x = \"Month\",\n", + " y = \"Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 13) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", + " plot.subtitle = 
element_text(hjust = 0.5, size = 12),\n", + " legend.position = \"right\",\n", + " panel.grid = element_blank()\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00bf15b7-baa7-4734-8133-8d4a9cc843a3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "40b21c65-1b75-42f7-821a-24d31e436c73", + "metadata": {}, + "source": [ + "----------------------------" + ] + }, + { + "cell_type": "markdown", + "id": "17ffece4-9420-4004-993b-b5692cc1d2de", + "metadata": {}, + "source": [ + "## 3.2. **Data Element** reporting rate: based on reporting of one or more indicators\n", + "**_Partially_ following methods by WHO and as per Diallo (2025) paper**\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single indicator (i.e., **confirmed** malaria case as `CONF`) or for _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "\n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", + "\n", + "For this method the user is allowed to **choose** how to calculate both the **numerator** and the **denominator**.
\n", + "Specifically:\n", + "* **Numerator**: is the number of **facilities that _actually reported_** data, and it is estimated based on whether a facility (FoSa, or HF, or `OU_ID`) **submitted data** for **_any_** of the following **indicators**:\n", + " * `CONF`: confirmed malaria cases and/or\n", + " * `SUSP`: suspected malaria cases and/or\n", + " * `TEST`: tested malaria cases
\n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "\n", + "
\n", + " \n", + "* **Denominator**: is the number of **facilities _expected_ to report**. This number can be obtained in three different ways (the third, `\"PYRAMID_OPEN_FACILITIES\"`, is described in its own section below):\n", + " * `\"DHIS2_EXPECTED_REPORTS\"`: uses the col `EXPECTED_REPORTS` from the df `dhis2_reporting_expected`.
\n", + " This is obtained directly from DHIS2, and is the same denominator used to calculate the \"Dataset\" reporting rate.\n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `routine_active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (`OU_ID`), defined as those that submitted _any_ data **at least once in a given year**, across ***all*** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + "\n", + "
\n", + "\n", + "This method improves over simple binary completeness flags by accounting for both spatial (facility coverage) and temporal (monthly timeliness) dimensions.
" + ] + }, + { + "cell_type": "markdown", + "id": "f5dcd3b9-6f02-4fc5-9e5f-2253c015a3d4", + "metadata": {}, + "source": [ + "### Calculate the **numerator**" + ] + }, + { + "cell_type": "markdown", + "id": "a90d9f4a-a058-4ad5-8ef2-f827987b5def", + "metadata": {}, + "source": [ + "**Note**: the col `REPORTED` keeps the same name regardless of the value of `DATAELEMENT_METHOD_NUMERATOR` because \n", + "in this way the code needs to be parametrized only once (here).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8076609c-46e8-478a-8283-bc63a70102f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "dhis2_routine_active <- dhis2_routine %>%\n", + " mutate(\n", + " # if_any() returns TRUE if the condition is met for any of the selected columns\n", + " ACTIVE = if_else(if_any(all_of(indicators_selected), ~ !is.na(.x)), 1, 0)\n", + " )\n", + "\n", + "log_msg(paste0(\"Evaluating reporting facilities based on indicators: \", paste(indicators_selected, collapse = \", \"), \".\"))\n", + "\n", + "dim(dhis2_routine_active)\n", + "head(dhis2_routine_active, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "325faf35-ed25-4b8e-b421-934a2852f27e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1773313-17e5-478d-b60d-c1193233204d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# --- 1. 
Calculate `SUBMITTED_REPORTS` as the nr of ACTIVE facilities (that REPORTED, each month) ------------------------\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "dhis2_routine_submitted <- dhis2_routine_active %>% # OLD: dhis2_routine_reporting_month <- dhis2_routine_reporting %>%\n", + " group_by(ADM2_ID, YEAR, MONTH) %>% \n", + " summarise(\n", + " SUBMITTED_REPORTS = sum(ACTIVE, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " ungroup() %>% \n", + " mutate(YEAR = as.integer(YEAR),\n", + " MONTH = as.integer(MONTH)\n", + " ) \n", + "\n", + "print(dim(dhis2_routine_submitted))\n", + "head(dhis2_routine_submitted, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a25647e3-5674-44e0-855e-c3a48483310d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "15f4c12f", + "metadata": {}, + "source": [ + "### Calculate the **denominator**" + ] + }, + { + "cell_type": "markdown", + "id": "06b2070d-c672-425f-a78f-b94a8d16a017", + "metadata": {}, + "source": [ + "#### Option: `ROUTINE_ACTIVE_FACILITIES`\n", + "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`ROUTINE_ACTIVE_FACILITIES`** " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08f03ed1-5831-4fe5-8bde-674a513e8110", + "metadata": { + "scrolled": true, + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate the tot nr of facilities (distinct OU_ID) based on all HF that appear in the routine data (each YEAR)\n", + "# meaning: regardless of what indicators they submit data for, as long as they have submitted something\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", + " routine_active_facilities <- dhis2_routine %>%\n", + " # Keep only rows where at least one indicator has non-NA value\n", + 
" filter(if_any(any_of(DHIS2_INDICATORS), ~ !is.na(.))) %>%\n", + " group_by(YEAR, ADM2_ID) %>%\n", + " summarize(\n", + " EXPECTED_REPORTS = n_distinct(OU_ID),\n", + " .groups = \"drop\" # remove grouping \n", + " )\n", + "\n", + " nr_of_rows <- nrow(routine_active_facilities)\n", + " log_msg(glue::glue(\"Produced df `routine_active_facilities`, with column `EXPECTED_REPORTS` calculated from DHIS2 routine data. Dataframe `routine_active_facilities` has {nr_of_rows} rows.\"))\n", + "\n", + " head(routine_active_facilities, 3)\n", + " \n", + "} \n" + ] + }, + { + "cell_type": "markdown", + "id": "6629dccb-97b0-4b0e-b23f-15b98704323d", + "metadata": {}, + "source": [ + "#### Option: `PYRAMID_OPEN_FACILITIES`\n", + "This is to be used **only when** `DATAELEMENT_METHOD_DENOMINATOR ==`**`PYRAMID_OPEN_FACILITIES`** " + ] + }, + { + "cell_type": "markdown", + "id": "0972ffca-c14a-4b93-85ff-027d056c3759", + "metadata": {}, + "source": [ + "------------------" + ] + }, + { + "cell_type": "markdown", + "id": "d49219b7-5932-4062-a10d-e1f3a4a81449", + "metadata": {}, + "source": [ + "#### TEMPORARY! 🇳🇪 **Niger-specific method**\n", + "🚨 Specific to **Niger EnDoP**: Pre-processing needed to separate facilities from adm levels!! 🚨
\n", + "\n", + "⚠️⚠️⚠️ **TEMPORARY: This will be moved to a dedicated pipeline!** ⚠️⚠️⚠️
\n", + "\n", + "Specifically:\n", + "* **Hospital**s (HD a Hopital District): at **level 4** together with Aires de Sante\n", + "* All other **FoSa**s: at **level 6**, also mixed with the hospital units\n", + "\n", + "Therefore, to assign closed/open status, it is necessary to attach to each individual facility the closing and opening date columns. \n", + "To do this: \n", + "1) extract list of facilities and id across the 2 levels (4 and 6) and\n", + "2) calculate the nr of open facilities per MONTH (PERIOD) per ADM2, ending up with a df with cols: `ADM2_ID`, `YEAR`, `MONTH`, `OPEN_FACILITIES_COUNT` = `EXPECTED_REPORTS`\n", + "3) add this to the df with the **numerator** (`dhis2_routine_submitted`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6296329-9bcd-4d2c-afb3-520c6a159cdb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "# names(dhis2_pyramid_raw)\n", + "dim(dhis2_pyramid_raw)\n", + "head(dhis2_pyramid_raw, 3)\n", + " \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "6f651b18-2d85-4e26-8952-45dae9020c40", + "metadata": {}, + "source": [ + "#### 1. 
Create df with list of all **facilities** with their `DATE_OPENED` and `DATE_CLOSED`: `facility_master`\n", + "Separate \"facilities\" (of any type, such as hospitals to CSI, Infermieres etc) from admin levels and hospital units (wards, depts...)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25dea3c7-44ea-42a7-b467-f470892fcfef", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# is_aire_l5() and is_hospital_l4() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55760b89-f4d0-40c5-9905-f7c7c4fee5c0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List of all FoSa (from Aires → Level 6)\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "fosa_master <- dhis2_pyramid_raw %>%\n", + " filter(is_aire_l5(LEVEL_5_NAME)) %>%\n", + " distinct(\n", + " OU_ID = LEVEL_6_ID,\n", + " OU_NAME = LEVEL_6_NAME,\n", + " region = LEVEL_2_NAME,\n", + " district = LEVEL_3_NAME,\n", + " ADM2_ID = LEVEL_3_ID,\n", + " DATE_OPENED = OPENING_DATE, \n", + " DATE_CLOSED = CLOSED_DATE\n", + " ) %>%\n", + " mutate(OU_TYPE = \"FoSa\")\n", + "\n", + "dim(fosa_master)\n", + "head(fosa_master)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e9c9f59-9c6c-44e4-bbd9-13c3917f5117", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# List of all Hospitals (from Level 4, aggregate dates across children)\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "hosp_master <- dhis2_pyramid_raw %>%\n", + "filter(is_hospital_l4(LEVEL_4_NAME)) %>%\n", + "group_by(LEVEL_4_ID, LEVEL_4_NAME, LEVEL_2_NAME, LEVEL_3_NAME, 
LEVEL_3_ID) %>%\n", + "summarise(\n", + " OPENING_DATE = suppressWarnings(min(OPENING_DATE, na.rm = TRUE)),\n", + " CLOSED_DATE = suppressWarnings(max(CLOSED_DATE, na.rm = TRUE)),\n", + " .groups = \"drop\"\n", + ") %>%\n", + "mutate(\n", + " DATE_OPENED = ifelse(is.infinite(OPENING_DATE), NA, OPENING_DATE) |> as_datetime(),\n", + " DATE_CLOSED = ifelse(is.infinite(CLOSED_DATE), NA, CLOSED_DATE) |> as_datetime()\n", + " ) %>%\n", + "distinct(\n", + " OU_ID = LEVEL_4_ID, \n", + " OU_NAME = LEVEL_4_NAME,\n", + " region=LEVEL_2_NAME,\n", + " district=LEVEL_3_NAME,\n", + " ADM2_ID=LEVEL_3_ID,\n", + " DATE_OPENED,\n", + " DATE_CLOSED\n", + ") %>%\n", + "mutate(\n", + " OU_TYPE = \"Hospital\"\n", + " )\n", + "\n", + "dim(hosp_master)\n", + "head(hosp_master)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5859e393-bfac-46e6-b103-cb8177100860", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Merge both\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE == \"NER\") {\n", + " \n", + "facility_master <- bind_rows(fosa_master, hosp_master) %>% \n", + " select(ADM2_ID, \n", + " OU_ID, \n", + " DATE_OPENED, \n", + " DATE_CLOSED)\n", + "\n", + "dim(facility_master)\n", + "head(facility_master, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "191c3b01-1645-410e-be99-b247bf5f9cfb", + "metadata": {}, + "source": [ + "---------------------" + ] + }, + { + "cell_type": "markdown", + "id": "1ee48156-5ed5-43a5-b927-caa53c10d98e", + "metadata": {}, + "source": [ + "#### **Generic** part: applies to **all countries**" + ] + }, + { + "cell_type": "markdown", + "id": "3aa057a5-6f68-493e-83e2-81bafce42c9e", + "metadata": {}, + "source": [ + "#### 2. 
Calculate nr of **OPEN facilities** for each `MONTH` per `ADM2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91f972f1-bcc9-458f-9662-5574efc7ac9d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Define start and end period based on routine data \n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "PERIOD_START <- dhis2_routine$PERIOD |> min()\n", + "PERIOD_END <- dhis2_routine$PERIOD |> max()\n", + "\n", + "print(paste0(\"Start period: \", PERIOD_START))\n", + "print(paste0(\"End period :\", PERIOD_END))\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c6ed56f-b2a6-460c-a3dc-6c588c40b54c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create a \"complete\" grid of every month and year for the period range ---------------------------------------\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "months_grid <- tibble(\n", + " month_date = seq(\n", + " ymd(paste0(PERIOD_START, \"01\")), # Converts 202201 to \"20220101\" and then to a date\n", + " ymd(paste0(PERIOD_END, \"01\")), # same\n", + " by = \"months\"\n", + " )\n", + ") %>%\n", + " mutate(\n", + " YEAR = year(month_date),\n", + " MONTH = month(month_date)\n", + " )\n", + "\n", + "dim(months_grid) \n", + "head(months_grid, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb7d9a47-053e-43a5-9e0d-c7b717236f3e", + "metadata": { + 
"vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create `facility_master` for any (🚨 non-NER) countries\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\" && COUNTRY_CODE != \"NER\") {\n", + "\n", + " # Programmatically define `ADM2_ID`\n", + " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", + " # Programmatically define `OU_ID`\n", + " HF_LEVEL <- glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\")\n", + "\n", + " facility_master <- dhis2_pyramid_formatted |>\n", + " mutate(\n", + " DATE_OPENED = with_tz(OPENING_DATE, \"UTC\"),\n", + " DATE_CLOSED = with_tz(CLOSED_DATE, \"UTC\")\n", + " ) |>\n", + " select(\n", + " ADM2_ID = all_of(ADMIN_2_LEVEL), \n", + " OU_ID = all_of(HF_LEVEL),\n", + " DATE_OPENED, #= OPENING_DATE,\n", + " DATE_CLOSED #= CLOSED_DATE\n", + ")\n", + "\n", + "head(facility_master)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f283cc96-ed69-43a3-964e-57ccb0180a4a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create a \"complete\" grid of every ADM2_ID for every month ---------------------------------------\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "# This ensures that even if an ADM2_ID has zero open facilities in a month,\n", + "# it will still appear in the final result with a count of 0.\n", + "complete_grid <- expand_grid(\n", + " ADM2_ID = unique(facility_master$ADM2_ID),\n", + " month_date = months_grid$month_date\n", + ") %>%\n", + " mutate(\n", + " YEAR = year(month_date),\n", + " MONTH = month(month_date),\n", + " month_date = with_tz(as_datetime(month_date), 
\"UTC\") # GP added 0809\n", + " )\n", + "\n", + "head(complete_grid, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e905c46-036a-4aa9-85ad-216f846f9e1b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Calculate the number of open facilities ---------------------------------------\n", + "\n", + "# # The facility must have opened on or before the last day of the current month. \n", + "# # To calculate the last day: add one month and subtract one day from the first day.\n", + "# complete_grid$month_date[1] # \"2022-01-01\"\n", + "# complete_grid$month_date[1] + months(1) - days(1) # \"2022-01-31\"\n", + "# # The facility must either still be open (DATE_CLOSED is NA) OR it must have closed on or after the first day of that month.\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "open_facilities_count <- facility_master %>%\n", + " # Create a row for every possible combination of facility and month\n", + " crossing(months_grid) %>%\n", + " # A facility is \"open\" if it opened BEFORE the end of the month\n", + " # AND it either never closed (NA) or closed AFTER the start of the month.\n", + " filter(\n", + " DATE_OPENED <= month_date + months(1) - days(1) & # opened on or before the last day of the current month\n", + " (is.na(DATE_CLOSED) | DATE_CLOSED >= month_date) # \n", + " ) %>%\n", + " # Count the number of open facilities for each area and month\n", + " count(ADM2_ID, YEAR, MONTH, name = \"OPEN_FACILITIES_COUNT\")\n", + "\n", + "head(open_facilities_count, 3)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ba854f2-5925-4154-b86e-3e4e7bb6c363", + "metadata": { + "vscode": { + "languageId": "r" + 
} + }, + "outputs": [], + "source": [ + "## Join the counts back to the complete grid to include zeros --------------------------------------\n", + "\n", + "# if (COUNTRY_CODE == \"NER\" && REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\" && DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " \n", + "pyramid_open_facilities <- complete_grid %>%\n", + " left_join(open_facilities_count, by = c(\"ADM2_ID\", \"YEAR\", \"MONTH\")) %>%\n", + " # If a month had no open facilities, the count will be NA. Change it to 0.\n", + " # Also rename `OPEN_FACILITIES_COUNT` to `EXPECTED_REPORTS` to use same col name as other methods\n", + " mutate(OPEN_FACILITIES_COUNT = replace_na(OPEN_FACILITIES_COUNT, 0)) %>% # DENOMINATOR: consistent col name across all methods \n", + " select(ADM2_ID, YEAR, MONTH, \n", + " EXPECTED_REPORTS = OPEN_FACILITIES_COUNT) %>%\n", + " arrange(ADM2_ID, YEAR, MONTH)\n", + "\n", + "print(dim(pyramid_open_facilities))\n", + "head(pyramid_open_facilities, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff1a8537-d093-4d5c-8a44-4b729090cced", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "811310a9-df85-4fa3-af9a-b931eaffd7e5", + "metadata": {}, + "source": [ + "### Calculate **Reporting Rate** " + ] + }, + { + "cell_type": "markdown", + "id": "8827cfd6-479b-4025-a379-d20bf20fcfb4", + "metadata": {}, + "source": [ + "**Join df for Denominator**\n", + "\n", + "**Note**
\n", + "in both df's (`dhis2_reporting_expected` OR `routine_active_facilities`) the col `EXPECTED_REPORTS` has the same name to simplify parametrization: only difference between the 2 options is the df to be joined (right element in `left_join()`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "670508a0-3075-4f82-aa2c-d26cf867f13d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# --- 2. Join `dhis2_reporting_expected` OR `dhis2_calculated_expected` to add `EXPECTED_REPORTS` ------------------------------------------------\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "# Parametrized based on DATAELEMENT_METHOD_DENOMINATOR: left_join() the respective df\n", + "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " # Add df of rep rate extracted directly from DHIS2\n", + " dhis2_routine_submitted_expected <- left_join(\n", + " dhis2_routine_submitted, \n", + " dhis2_reporting_expected |> select(ADM2_ID, YEAR, MONTH, EXPECTED_REPORTS), # `dhis2_reporting_expected`\n", + " by = join_by(ADM2_ID, YEAR, MONTH)\n", + " ) \n", + " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` extracted directly from DHIS2.\")\n", + " \n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", + " # Add df of rep rate CALCULATED based on submissiosn in dhis2 routine data \"active\" facilities\n", + " dhis2_routine_submitted_expected <- left_join(\n", + " dhis2_routine_submitted, \n", + " routine_active_facilities, # has only cols: `YEAR`, `ADM2_ID`, `EXPECTED_REPORTS`\n", + " by = join_by(ADM2_ID, YEAR) #, MONTH)\n", + " ) \n", + " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 routine data. 
Here, ACTIVE facilities \n", + " are defined as facilities that reported on any of the extracted indicators at least once per year.\")\n", + " \n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " # Add df of rep rate CALCULATED based on OPEN facilities as per PYRAMID RAW\n", + " dhis2_routine_submitted_expected <- left_join(\n", + " dhis2_routine_submitted, \n", + " pyramid_open_facilities, \n", + " by = join_by(ADM2_ID, YEAR, MONTH)\n", + " ) \n", + " log_msg(\"Calculating `Data Element` reporting rate, using as denominator `EXPECTED_REPORTS` as CALCULATED from DHIS2 pyramid. \n", + " This method counts the number of OPEN facilities for each ADM2 per MONTH.\")\n", + "}\n", + "\n", + "# Safety measures ...\n", + "dhis2_routine_submitted_expected <- dhis2_routine_submitted_expected |>\n", + " # ungroup() %>% \n", + " mutate(YEAR = as.integer(YEAR),\n", + " MONTH = as.integer(MONTH)\n", + " ) \n", + "\n", + "\n", + "print(dim(dhis2_routine_submitted_expected))\n", + "head(dhis2_routine_submitted_expected, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fad303c-b239-4cf9-93a8-fe3ce5c33c37", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# --- 3. 
Calculate `REPORTING_RATE` ------------------------------------------------\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " \n", + "reporting_rate_dataelement <- dhis2_routine_submitted_expected |>\n", + "mutate(\n", + " REPORTING_RATE = SUBMITTED_REPORTS / EXPECTED_REPORTS\n", + " ) \n", + "\n", + "dim(reporting_rate_dataelement)\n", + "head(reporting_rate_dataelement, 3)\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68023e8e-f7f6-4201-b097-1996bee57671", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# head(hf_active, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "ae3aa127-c20c-4ca5-af0c-4a4260883cac", + "metadata": {}, + "source": [ + "`#### 🚨 Here 👇 swap denominator: join `dhis2_reporting_expected` to replace `TOTAL_HF` with `EXPECTED_REPORTS``" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a97c7d75-3317-48bc-a2f1-770bf38d141a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " inspect_reporting_rate(reporting_rate_dataelement)\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92651472-26e2-4131-ac02-288122138b0b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # --- 1. create intermediate df `hf_active_month`: summarize nr of \"active\" (reporting) HF by month ------------------------\n", + "# hf_active_month <- hf_active %>% \n", + "# # filter(ADM1_ID == \"rWrCdr321Qu\") |> # ⚠️⚠️⚠️ TEMP subset just for CODE development ... ! 
⚠️⚠️⚠️\n", + "# dplyr::group_by(ADM2_ID, YEAR, MONTH) %>%\n", + "# dplyr::summarize(\n", + "# SUBMITTED_REPORTS = length(which(ACTIVE == TRUE)), # 🚨 GP changed to BOOLEAN to save space\n", + "# .groups = \"drop\") |>\n", + "# mutate(YEAR = as.integer(YEAR), \n", + "# MONTH = as.integer(MONTH)\n", + "# )\n", + "\n", + "# print(dim(hf_active_month))\n", + "# head(hf_active_month)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db5ad094-0601-4a18-9435-db60c1f4e8ff", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + " reporting_rate_dataelement <- reporting_rate_dataelement |> \n", + " select(all_of(fixed_cols_rr))\n", + " \n", + " head(reporting_rate_dataelement, 3)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05f94483-1524-426e-9fe3-4b9bf572c05e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "73ed8e24-1aab-47af-9d91-5bc4899a40e9", + "metadata": {}, + "source": [ + "`#### Quick data quality check 🔍`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "713a5ed3-2aeb-4949-8ecc-6ee3f787a719", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + "\n", + "# Plot reporting rate heatmap\n", + "options(repr.plot.width = 20, repr.plot.height = 10) \n", + "\n", + "# reporting_rate_conf_month %>%\n", + "reporting_rate_dataelement %>%\n", + "mutate(\n", + " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", + " ) %>%\n", + "ggplot(., aes(x = DATE, \n", + " y = factor(ADM2_ID), \n", + " fill = REPORTING_RATE * 100)\n", + " ) + \n", + " geom_tile() +\n", + " scale_fill_viridis_c(\n", + " option = \"C\",\n", + " direction = 1, # blue = low, yellow = high\n", + " limits = c(0, 100),\n", + " name = \"Reporting rate 
(%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Monthly Reporting Rate by Health District - Method 'DataElement'\",\n", + " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", + " x = \"Month\",\n", + " y = \"Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 13) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", + " plot.subtitle = element_text(hjust = 0.5, size = 12),\n", + " legend.position = \"right\",\n", + " panel.grid = element_blank()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93f5b7f0-bf5e-4567-9d16-da2091125988", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "6729e183-5395-4fb7-a535-978c61124710", + "metadata": {}, + "source": [ + "# 4. Export 📁 /data/ folder" + ] + }, + { + "cell_type": "markdown", + "id": "ef68ae41-a0a9-4b45-8b7d-3d1c9b535ad9", + "metadata": {}, + "source": [ + "### 🧹 Clear output directory\n", + "This is needed to ensure that only 2 files are written to the new version of the Dataset:\n", + "* **Data Set** reporting rate (only one way to calculate it, not parametrized as nothing to \"decide\" here)\n", + "* **Data Element** reporting rate: here there are 7 possible combinations of numerator times 3 possible combinations of denominator.
\n", + " These are too many options to give to the incidence pipeline (the step that ingests this data), where these would need to be hardcoded in the pipeline module. When running the incidence pipeline, the user simply chooses whether to use `\"dataset\"` or `\"dataelement\"`, and therefore there must be only one file for each option.
\n", + " However, we want to **preserve the info** on the choice of **numerator** and **denominator** in the **filename**. The import function used in incidence therefore only looks for the fixed pattern in the filename, and ignores the tags for numerator and denominator (e.g., \"n-conf-susp-test\", \"d-dexrep\")." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eb882f3-a443-4363-bcad-be5b4ebc7d8f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Cleanup\n", + "path_to_clear <- file.path(DATA_PATH, \"reporting_rate\")\n", + "files_to_delete <- list.files(path_to_clear, full.names = TRUE, recursive = TRUE)\n", + "unlink(files_to_delete, recursive = TRUE)\n", + "log_msg(glue::glue(\"🧹 Deleting all existing files from `{path_to_clear}`. Output of current pipeline run will replace output of previous run.\"))" + ] + }, + { + "cell_type": "markdown", + "id": "1372184e-a1a9-472a-87d4-69e38a1b139d", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### CSV" + ] + }, + { + "cell_type": "markdown", + "id": "c266c99e-a08e-471b-93dd-dbedb4841483", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Build up file name for **data Element** method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b6532c9-292e-4e8c-8e9d-987867920a6d", + "metadata": { + "papermill": { + "duration": 0.198788, + "end_time": "2025-08-26T09:50:02.770154", + "exception": false, + "start_time": "2025-08-26T09:50:02.571366", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 🚨 
Currently not in use! Keeping for future update to method 🚨 (GP 2025-08-29)\n", + "\n", + "# Abbreviation for Data Element chosen NUMERATOR\n", + "method_num = tolower(paste0(\"n-\", paste(indicators_selected, collapse = \"-\")))\n", + "method_num\n", + "\n", + "\n", + "# Abbreviation for Data Element chosen DENOMINATOR\n", + "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", + " method_den = \"d-dexrep\" # \"d1\"\n", + "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", + " method_den = \"d-actfac\" # \"d2\"\n", + " } else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", + " method_den = \"d-opnfcl\" # \"d3\"\n", + " }\n", + "\n", + "method_den" + ] + }, + { + "cell_type": "markdown", + "id": "cf5bcd47-dba1-4a7a-81cf-d036fd0ee4db", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Write function to assemble path based on method - for .**csv**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b01c022-aa52-4a1b-a2fe-7bcb145a2049", + "metadata": { + "papermill": { + "duration": 0.108587, + "end_time": "2025-08-26T09:50:02.884462", + "exception": false, + "start_time": "2025-08-26T09:50:02.775875", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# snt_write_csv() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "markdown", + "id": "512ba94e-b7fc-4e45-bdda-8f5533e4e665", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Use function to export .csv files" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "8689ebc3-d975-45be-92fd-1fedfc733f49", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Dataset\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", + " snt_write_csv(x = reporting_rate_dataset, \n", + " output_data_path = DATA_PATH, \n", + " method = \"dataset\",\n", + " country_code = COUNTRY_CODE) \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b16e74ae-64f2-4267-a6ae-8413c8463af6", + "metadata": { + "papermill": { + "duration": 2.659797, + "end_time": "2025-08-26T09:50:05.545618", + "exception": false, + "start_time": "2025-08-26T09:50:02.885821", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Data Element\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " snt_write_csv(x = reporting_rate_dataelement,\n", + " output_data_path = DATA_PATH, \n", + " method = \"dataelement\",\n", + " country_code = COUNTRY_CODE)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "cfd1679e-dc0e-4805-9420-0788884a7713", + "metadata": { + "papermill": { + "duration": 0.000345, + "end_time": "2025-08-26T09:50:05.546427", + "exception": false, + "start_time": "2025-08-26T09:50:05.546082", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### parquet" + ] + }, + { + "cell_type": "markdown", + "id": "bed7679d-c392-4e3a-9fc7-4d6ae6982517", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Write function to assemble path based on method - for .**parquet**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26e68499-ef55-46f6-a017-8d03fdbff1b4", + "metadata": { + "papermill": { + "duration": 0.100077, + "end_time": 
"2025-08-26T09:50:05.647079", + "exception": false, + "start_time": "2025-08-26T09:50:05.547002", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# snt_write_parquet() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "markdown", + "id": "8250b998-2669-4590-a4fe-770e42b2d43f", + "metadata": { + "papermill": { + "duration": 0.000436, + "end_time": "2025-08-26T09:50:02.570794", + "exception": false, + "start_time": "2025-08-26T09:50:02.570358", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Use function to export .parquet files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52b5720c-864e-49ac-bf40-6b5551214eaa", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Dataset\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", + " snt_write_parquet(x = reporting_rate_dataset,\n", + " output_data_path = DATA_PATH,\n", + " method = \"dataset\",\n", + " country_code = COUNTRY_CODE\n", + " ) \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95bf17f5-6015-464c-9388-df2397d1609c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Method \"Data Element\"\n", + "\n", + "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", + " snt_write_parquet(x = reporting_rate_dataelement,\n", + " output_data_path = DATA_PATH,\n", + " method = \"dataelement\",\n", + " country_code = COUNTRY_CODE\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65b3e4b8-1a62-47c8-877b-1dae4511e4f0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "73ed8e24-1aab-47af-9d91-5bc4899a40e9", - "metadata": {}, - "source": [ - "`#### Quick data quality check 🔍`" - ] - }, - { -
"cell_type": "code", - "execution_count": null, - "id": "713a5ed3-2aeb-4949-8ecc-6ee3f787a719", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - "\n", - "# Plot reporting rate heatmap\n", - "options(repr.plot.width = 20, repr.plot.height = 10) \n", - "\n", - "# reporting_rate_conf_month %>%\n", - "reporting_rate_dataelement %>%\n", - "mutate(\n", - " DATE = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\"))\n", - " ) %>%\n", - "ggplot(., aes(x = DATE, \n", - " y = factor(ADM2_ID), \n", - " fill = REPORTING_RATE * 100)\n", - " ) + \n", - " geom_tile() +\n", - " scale_fill_viridis_c(\n", - " option = \"C\",\n", - " direction = 1, # blue = low, yellow = high\n", - " limits = c(0, 100),\n", - " name = \"Reporting rate (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Monthly Reporting Rate by Health District - Method 'DataElement'\",\n", - " subtitle = \"Each tile represents the reporting completeness per district per month\",\n", - " x = \"Month\",\n", - " y = \"Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 13) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 14),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 12),\n", - " legend.position = \"right\",\n", - " panel.grid = element_blank()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93f5b7f0-bf5e-4567-9d16-da2091125988", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": 
"6729e183-5395-4fb7-a535-978c61124710", - "metadata": {}, - "source": [ - "# 4. Export 📁 /data/ folder" - ] - }, - { - "cell_type": "markdown", - "id": "ef68ae41-a0a9-4b45-8b7d-3d1c9b535ad9", - "metadata": {}, - "source": [ - "### 🧹 Clear output directory\n", - "This is needed to ensure that only 2 files are written to the new version of the Dataset:\n", - "* **Data Set** reporting rate (only one way to calculate it, not parametrized as nothing to \"decide\" here)\n", - "* **Data Element** reporting rate: here there are 7 possible combinations of numerator times 3 possible combinatiosn of denominator.
\n", - " These are too many optiosn to give to the incidence pipeline (the step that ingests this data), where these would need to be hardcoded in the pipeline module. When running the incidence pipeline, the user simply choses whether to use `\"dataset\"` or `\"dataelement\"`, and therefore there must be only one file for each option.
\n", - " However, we want to **preserve the info** on the choice of **numerator** and **denominator** in the **filename**. The import function used in incidence therefore only looks for the fixed pattern in the filename, and ignores the tags for numerator and denominator (e.g., \"n-conf-susp-test\", \"d-dexrep\")." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7eb882f3-a443-4363-bcad-be5b4ebc7d8f", - "metadata": {}, - "outputs": [], - "source": [ - "# Cleanup\n", - "path_to_clear <- file.path(DATA_PATH, \"reporting_rate\")\n", - "files_to_delete <- list.files(path_to_clear, full.names = TRUE, recursive = TRUE)\n", - "unlink(files_to_delete, recursive = TRUE)\n", - "log_msg(glue::glue(\"🧹 Deleting all existing files from `{path_to_clear}`. Output of current pipeline run will replace output of previous run.\"))" - ] - }, - { - "cell_type": "markdown", - "id": "1372184e-a1a9-472a-87d4-69e38a1b139d", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### CSV" - ] - }, - { - "cell_type": "markdown", - "id": "c266c99e-a08e-471b-93dd-dbedb4841483", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Build up file name for **data Element** method" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b6532c9-292e-4e8c-8e9d-987867920a6d", - "metadata": { - "papermill": { - "duration": 0.198788, - "end_time": "2025-08-26T09:50:02.770154", - "exception": false, - "start_time": "2025-08-26T09:50:02.571366", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# 🚨 Currently not in use! 
Keeping for future update to method 🚨 (GP 2025-08-29)\n", - "\n", - "# Abbreviation for Data Elememnt chosen NUMERATOR\n", - "method_num = tolower(paste0(\"n-\", paste(indicators_selected, collapse = \"-\")))\n", - "method_num\n", - "\n", - "\n", - "# Abbreviation for Data Elememnt chosen DENOMINATOR\n", - "if (DATAELEMENT_METHOD_DENOMINATOR == \"DHIS2_EXPECTED_REPORTS\") {\n", - " method_den = \"d-dexrep\" # \"d1\"\n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") {\n", - " method_den = \"d-actfac\" # \"d2\"\n", - " } else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " method_den = \"d-opnfcl\" # \"d2\"\n", - " }\n", - "\n", - "method_den" - ] - }, - { - "cell_type": "markdown", - "id": "cf5bcd47-dba1-4a7a-81cf-d036fd0ee4db", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Write function to assemble path based on method - for .**csv**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b01c022-aa52-4a1b-a2fe-7bcb145a2049", - "metadata": { - "papermill": { - "duration": 0.108587, - "end_time": "2025-08-26T09:50:02.884462", - "exception": false, - "start_time": "2025-08-26T09:50:02.775875", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# write function\n", - "snt_write_csv <- function(x, output_data_path, method) {\n", - " \n", - " full_directory_path <- file.path(output_data_path, \"reporting_rate\")\n", - " \n", - " if (!dir.exists(full_directory_path)) {\n", - " dir.create(full_directory_path, recursive = TRUE)\n", - " }\n", - "\n", - " file_path <- file.path(full_directory_path, paste0(COUNTRY_CODE, \"_reporting_rate_\", method, \".csv\")) \n", - " \n", - " write_csv(x, file_path)\n", - "\n", - " log_msg(paste0(\"Exported : \", file_path))\n", - "}" - ] - }, 
- { - "cell_type": "markdown", - "id": "512ba94e-b7fc-4e45-bdda-8f5533e4e665", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Use function to export .csv files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8689ebc3-d975-45be-92fd-1fedfc733f49", - "metadata": {}, - "outputs": [], - "source": [ - "# Method \"Dataset\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", - " snt_write_csv(x = reporting_rate_dataset, \n", - " output_data_path = DATA_PATH, \n", - " method = \"dataset\") \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b16e74ae-64f2-4267-a6ae-8413c8463af6", - "metadata": { - "papermill": { - "duration": 2.659797, - "end_time": "2025-08-26T09:50:05.545618", - "exception": false, - "start_time": "2025-08-26T09:50:02.885821", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Method \"Data Element\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " snt_write_csv(x = reporting_rate_dataelement,\n", - " output_data_path = DATA_PATH, \n", - " method = \"dataelement\")\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "cfd1679e-dc0e-4805-9420-0788884a7713", - "metadata": { - "papermill": { - "duration": 0.000345, - "end_time": "2025-08-26T09:50:05.546427", - "exception": false, - "start_time": "2025-08-26T09:50:05.546082", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### parquet" - ] - }, - { - "cell_type": "markdown", - "id": "bed7679d-c392-4e3a-9fc7-4d6ae6982517", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Write function to assemble 
path based on method - for .**parquet**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26e68499-ef55-46f6-a017-8d03fdbff1b4", - "metadata": { - "papermill": { - "duration": 0.100077, - "end_time": "2025-08-26T09:50:05.647079", - "exception": false, - "start_time": "2025-08-26T09:50:05.547002", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# write function\n", - "snt_write_parquet <- function(x, output_data_path, method) {\n", - " \n", - " full_directory_path <- file.path(output_data_path, \"reporting_rate\")\n", - " \n", - " if (!dir.exists(full_directory_path)) {\n", - " dir.create(full_directory_path, recursive = TRUE)\n", - " }\n", - "\n", - " file_path <- file.path(full_directory_path, paste0(COUNTRY_CODE, \"_reporting_rate_\", method, \".parquet\")) \n", - " \n", - " arrow::write_parquet(x, file_path)\n", - "\n", - " log_msg(paste0(\"Exported : \", file_path))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "8250b998-2669-4590-a4fe-770e42b2d43f", - "metadata": { - "papermill": { - "duration": 0.000436, - "end_time": "2025-08-26T09:50:02.570794", - "exception": false, - "start_time": "2025-08-26T09:50:02.570358", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### Use function to export .csv files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52b5720c-864e-49ac-bf40-6b5551214eaa", - "metadata": {}, - "outputs": [], - "source": [ - "# Method \"Dataset\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATASET\") {\n", - " snt_write_parquet(x = reporting_rate_dataset,\n", - " output_data_path = DATA_PATH,\n", - " method = \"dataset\"\n", - " ) \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95bf17f5-6015-464c-9388-df2397d1609c", - "metadata": {}, - "outputs": [], - "source": [ - "# Method \"Data Element\"\n", - "\n", - "if (REPORTING_RATE_METHOD == \"DATAELEMENT\") {\n", - " snt_write_parquet(x = 
reporting_rate_dataelement,\n", - " output_data_path = DATA_PATH,\n", - " method = \"dataelement\"\n", - " )\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65b3e4b8-1a62-47c8-877b-1dae4511e4f0", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb b/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb index 27d2cd2..65073fc 100644 --- a/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate/reporting/snt_dhis2_reporting_rate_report.ipynb @@ -1,998 +1,1113 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "ad4e51fd-ab5a-478c-856a-bc3308ce5781", - "metadata": {}, - "source": [ - "-------------\n", - "🤌🏼 Points to discuss:\n", - "* **what we do want to plot here?**
\n", - " Plot only what is produced by the pipeline (hence reflect choice of parameters from pipeline run) OR all the possible options (all output produced by all pipelines run so far, meaning whatever is writte the to most recent version of the Dataset?)\n", - "* **how to handle missing files?**: namely, situations in which files are not yet been produced. In this reporting rate case, if the user only runs the pipeline to produce the \"Dataset\" reporrting rate file,, then we cannot plot anything for the \"Data Element\" reporting rate as there is no file yet ...\n", - " Atm this is handled with `if` logic, but should be made more elegant to avoid repeating the same code twice (for dataset and for dataelement)\n", - "\n", - "-------------\n", - "\n", - "🚧 To do:\n", - "* **Plots shouls be wrapped as functions (DRY code)**! Cuold save in .R file in this same location to `source()` only here (as plots are specifc to this notebook, no need to save in snt_utils.R)\n", - "* **Display _real_ data**: do **_not_ cap** reporting rate values at 1 (100%)!! It's important to visualize real full range if we want to qualitatively assess and compare different methods!\n", - "* **fix object names**: `routine_data` is NOT routine data ... !!\n", - "* When importing `reporting_rate_data`, try if possible to avoid using `tryCatch`, and use `log_msg(..., \"warning\")` instead (should simplify code and logic ... ). Idea is to **log a meaningful warning without making the pipeline fail** just becauase a file in the report nb is missing ... !\n", - "\n", - "-------------" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "ad4e51fd-ab5a-478c-856a-bc3308ce5781", + "metadata": {}, + "source": [ + "-------------\n", + "🤌🏼 Points to discuss:\n", + "* **what we do want to plot here?**
\n", + " Plot only what is produced by the pipeline (hence reflect choice of parameters from pipeline run) OR all the possible options (all output produced by all pipelines run so far, meaning whatever is written to the most recent version of the Dataset?)\n", + "* **how to handle missing files?**: namely, situations in which files have not yet been produced. In this reporting rate case, if the user only runs the pipeline to produce the \"Dataset\" reporting rate file, then we cannot plot anything for the \"Data Element\" reporting rate as there is no file yet ...\n", + "  Atm this is handled with `if` logic, but should be made more elegant to avoid repeating the same code twice (for dataset and for dataelement)\n", + "\n", + "-------------\n", + "\n", + "🚧 To do:\n", + "* **Plots should be wrapped as functions (DRY code)**! Could save in .R file in this same location to `source()` only here (as plots are specific to this notebook, no need to save in snt_utils.R)\n", + "* **Display _real_ data**: do **_not_ cap** reporting rate values at 1 (100%)!! It's important to visualize real full range if we want to qualitatively assess and compare different methods!\n", + "* **fix object names**: `routine_data` is NOT routine data ... !!\n", + "* When importing `reporting_rate_data`, try if possible to avoid using `tryCatch`, and use `log_msg(..., \"warning\")` instead (should simplify code and logic ... ). Idea is to **log a meaningful warning without making the pipeline fail** just because a file in the report nb is missing ...
!\n", + "\n", + "-------------" + ] + }, + { + "cell_type": "markdown", + "id": "80fa8c3c-ed62-4248-8149-ffe2974a7206", + "metadata": {}, + "source": [ + "# Taux de Rapportage des Formations Sanitaires - Health Facility Reporting Rates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35bc4c99-5e5c-44dc-8c67-7f38eaec708e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set SNT Paths\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate\")\n", + "\n", + "# load util functions\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate.r\"))\n", + "\n", + "# List required packages \n", + "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"leaflet\")\n", + "\n", + "# Execute function\n", + "install_and_load(required_packages)\n", + "\n", + "# Set environment to load openhexa.sdk from the right environment\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "reticulate::py_config()$python\n", + "openhexa <- import(\"openhexa.sdk\")\n", + "\n", + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "# Required environment for the sf packages\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"dd297a84-5a55-4374-9d2b-3148fde8072d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration variables\n", + "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa63eb27-746f-420b-87ad-da82139acff9", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# printdim() loaded from utils/snt_dhis2_reporting_rate.r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a603fdb-e3ae-4aa3-a908-0385ae216d49", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import DHIS2 shapes data\n", + "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9656abb7-0085-4feb-974c-fb0b1c68c38f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# import pyramid data\n", + "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", + " cat(msg)\n", + " stop(msg)\n", + " })\n", + "\n", + "# Select distinct (already done in SNT format pipeline)\n", + "ADMIN_1_ID <- str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), \"NAME\", \"ID\")\n", + "ADMIN_2_ID <- 
str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), \"NAME\", \"ID\")\n", + "\n", + "pyramid_data <- pyramid_data %>%\n", + " distinct(across(all_of(c(ADMIN_1_ID, ADMIN_2_ID))), .keep_all = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e912503-5c57-4997-8c68-da673bd14626", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "print(dim(pyramid_data))\n", + "head(pyramid_data)" + ] + }, + { + "cell_type": "markdown", + "id": "78ec55d0-3a0d-413d-97cd-303895275f88", + "metadata": {}, + "source": [ + "## A) Taux de Soumission des Rapports / Dataset Reporting Rate\n", + "\n", + "**[FR]**\n", + "Cette section analyse le **taux de soumission des rapports**, tel que calculé dans le Système National d’Information Sanitaire (SNIS). Ce taux est défini comme le nombre de rapports effectivement reçus (rapports actuels) divisé par le nombre de rapports attendus (rapports attendus) sur une période donnée. Les rapports attendus correspondent au nombre de formations sanitaires qui, selon les paramètres du SNIS, devaient soumettre un rapport. Cet indicateur permet d’évaluer si les structures ont transmis les rapports requis, sans tenir compte du contenu ou de l’exhaustivité des données saisies.\n", + "\n", + "**[EN]**\n", + "This section analyzes the **dataset reporting rate**, as calculated in the Health Management Information System (HMIS). The rate is defined as the number of reports actually submitted (actual reports) divided by the number of reports expected (expected reports) over a given period. Expected reports refer to the number of health facilities that were required to report according to SNIS configuration. This indicator helps assess whether health facilities submitted their required reports, regardless of the content or completeness of the data within those reports." 
+ ] + }, + { + "cell_type": "markdown", + "id": "793a685b-a5cc-4e12-9c78-e548beffa213", + "metadata": {}, + "source": [ + "**Question:** Can the reporting rate file be loaded only once (using a parameter see below)? -> if that's the case, we can remove specific plotting codes for the \"dataset\" and \"dataelement\" files and just keep one \"plotting code\" for both types \n", + "\n", + "**Suggestion (link to previous Question):** The file name can be parameterized by injecting the user selection via parameters={...} from the OpenHexa pipeline. \n", + "\n", + "> paste0(COUNTRY_CODE, \"_reporting_rate_\", REPORTING_RATE_METHOD ,\".parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2d9d428-c5a7-4f35-8a21-8f22adaa6a26", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Import from Dataset\n", + "reporting_rate_data <- tryCatch({\n", + " # Attempt to load the dataset\n", + " get_latest_dataset_file_in_memory(\n", + " DATASET_NAME, \n", + " paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", + " )\n", + " }, \n", + " error = function(e) {\n", + " # If an error occurs, log a warning\n", + " # msg <- paste(\"[WARNING] Warning: Could not load reporting rate file for:\", COUNTRY_CODE, \". Proceeding with empty data. Error:\", conditionMessage(e))\n", + " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataset.parquet` does not exist, skipped loading. \n", + " To generate this file, re-run the reporting rate pipeline. 
Error:\", conditionMessage(e))\n", + " log_msg(msg, level = \"warning\")\n", + " \n", + " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", + " return(\n", + " tibble(\n", + " YEAR = double(),\n", + " MONTH = double(),\n", + " ADM2_ID = character(),\n", + " REPORTING_RATE = double()\n", + " )\n", + " )\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d13dea1-e204-4c27-8a44-14e260bcdad1", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add _NAME cols from pyramid\n", + "if (nrow(reporting_rate_data) != 0) {\n", + " \n", + " ADMIN_2_ID <- str_replace(ADM_2, \"NAME\", \"ID\") \n", + " reporting_rate_data <- reporting_rate_data %>% \n", + " left_join(pyramid_data[c(ADM_2, ADMIN_2_ID)], by = c(\"ADM2_ID\" = ADMIN_2_ID))\n", + " \n", + " colnames(reporting_rate_data)[colnames(reporting_rate_data) == ADM_2] <- \"ADM2_NAME\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b96b0b12-e168-421f-b0a9-76e83c48842c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "head(reporting_rate_data, 3)" + ] + }, + { + "cell_type": "markdown", + "id": "64d787c8-25d3-4b2c-87e9-6d35b206b018", + "metadata": {}, + "source": [ + "**fix:** \n", + " - Just replaced this line with the variable \"ADM2_NAME\" : \n", + "> Plot heatmap \n", + "> options(repr.plot.width = 18, repr.plot.height = 15) \n", + "> ggplot(reporting_rate_data, aes(x = date, y = **ADM2_NAME**, fill = category)) + " + ] + }, + { + "cell_type": "markdown", + "id": "c13f15a4-2788-4d57-9edc-78d0afdbe278", + "metadata": {}, + "source": [ + "### Plot: Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb3b565e-8497-4d88-8d6d-ae6b3e2929b2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + " \n", + " # Prepare 
date column + category\n", + " reporting_rate_data <- reporting_rate_data %>%\n", + " mutate(\n", + " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", + " ADM2_ID = factor(ADM2_ID),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100, # `pmin()` caps to 100%\n", + " reporting_pct = REPORTING_RATE * 100,\n", + " category = cut(\n", + " reporting_pct,\n", + " # breaks = c(-Inf, 50, 80, 90, Inf),\n", + " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", + " # GP 2025-08-07 added this, but double check (seems too many >100!!)\n", + " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", + " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", + " right = TRUE # FALSE: intervals are left-closed: lower bound is included\n", + " )\n", + " )\n", + " \n", + " # Define color scale\n", + " reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50–80\" = \"#fdae61\", # orange\n", + " \"80–90\" = \"#ffffbf\", # yellow\n", + " \"90-100\" = \"#1a9641\", # green\n", + " \">100\" = \"darkgreen\"\n", + " )\n", + " \n", + " # Plot heatmap\n", + " options(repr.plot.width = 18, repr.plot.height = 15)\n", + " ggplot(reporting_rate_data, aes(x = date, y = ADM2_NAME, fill = category)) + # -> Using a ADM2_NAME Variable to select the column !!\n", + " geom_tile() +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Taux de soumission des rapports mensuels par district sanitaire\",\n", + " subtitle = \"Monthly Dataset Reporting Rate by Health District\",\n", + " x = \"Mois - Month\",\n", + " y = \"District Sanitaire - Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", + " plot.subtitle = element_text(hjust = 0.5, size 
= 16),\n", + " # legend.position = \"right\",\n", + " legend.position = \"top\",\n", + " panel.grid = element_blank()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28662f31-8aa9-4f83-8dd6-8eb489723652", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Prepare the data\n", + "reporting_rate_data_box <- reporting_rate_data %>%\n", + " mutate(\n", + " MONTH = as.integer(MONTH),\n", + " YEAR = as.factor(YEAR),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100\n", + " reporting_pct = REPORTING_RATE * 100\n", + " )\n", + "\n", + "# Month labels in French\n", + "month_labels_fr <- c(\n", + " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", + " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", + ")\n", + "\n", + "# Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 15)\n", + "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", + " geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.4) +\n", + " scale_x_discrete(labels = month_labels_fr) +\n", + " # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", + " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", + " labs(\n", + " title = \"Distribution mensuelle du taux de soumission des rapports\",\n", + " subtitle = \"Monthly Distribution of Dataset Reporting Rate by Health District (2021–2024)\",\n", + " x = \"Mois\",\n", + " fill = \"Année\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 20),\n", + " plot.subtitle = element_text(size = 16),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fd09749-2042-49d4-b2a3-9c2e6e5eae52", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": 
[], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Aggregate to annual reporting rate per district\n", + "annual_data <- reporting_rate_data %>%\n", + " group_by(YEAR, ADM2_ID) %>%\n", + " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", + " ungroup()\n", + "\n", + "# Step 2: Join with spatial data (assuming 'shapes_data' contains geometry and ADM2_ID)\n", + "map_data <- shapes_data %>%\n", + " left_join(annual_data, by = \"ADM2_ID\")\n", + "\n", + "# Step 3: Bin the reporting rate into categories\n", + "map_data <- map_data %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-79\", # \"50-80\"\n", + " reporting_rate < 0.9 ~ \"80-89\", # \"80-90\"\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " ),\n", + " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-79\", \"80-89\", \">=90\")) # levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", + " )\n", + "\n", + "# Step 4: Define colors\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\",\n", + " \"50-79\" = \"#fdae61\",\n", + " \"80-89\" = \"#ffffbf\",\n", + " \">=90\" = \"#1a9641\"\n", + ")\n", + "\n", + "# Step 5: Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 10)\n", + "ggplot(map_data) +\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " facet_wrap(~ YEAR) +\n", + " scale_fill_manual(values = reporting_colors, name = \"Taux de soumission (%)\") +\n", + " labs(\n", + " title = \"Taux de soumission des rapports annuels par district sanitaire\",\n", + " subtitle = \"Annual Dataset Reporting Completeness by Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " strip.text = element_text(face = \"bold\", size = 16),\n", + " plot.title = element_text(face = \"bold\")\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "df3f7949-baff-4cd9-a022-594420765289", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", + "mean_reporting_stats <- map_data %>%\n", + " group_by(ADM2_ID) %>%\n", + " summarise(\n", + " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-80\",\n", + " reporting_rate < 0.9 ~ \"80-90\",\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " )\n", + " )\n", + "\n", + "# Set correct factor levels to match legend\n", + "mean_reporting_stats$reporting_cat <- factor(\n", + " mean_reporting_stats$reporting_cat,\n", + " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", + ")\n", + "\n", + "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", + "mean_reporting_map <- shapes_data %>%\n", + " left_join(st_drop_geometry(mean_reporting_stats), by = \"ADM2_ID\") %>%\n", + " st_as_sf()\n", + "\n", + "# Step 3: Define custom color scale\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50-80\" = \"#fdae61\", # orange\n", + " \"80-90\" = \"#ffffbf\", # yellow\n", + " \">=90\" = \"#1a9641\" # green\n", + ")\n", + "\n", + "# Step 4: Plot\n", + "options(repr.plot.width = 20, repr.plot.height = 10)\n", + "ggplot(mean_reporting_map) +\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\",\n", + " drop = FALSE\n", + " ) +\n", + " labs(\n", + " title = \"Taux moyen de soumission des rapports (toutes années confondues)\",\n", + " subtitle = \"Mean Annual Dataset Reporting Rate (All Years Combined)\"\n", + " ) +\n", + 
" theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9497287c-bbd2-446f-946d-88e34233f9f0", + "metadata": {}, + "source": [ + "## B) Taux de rapportage des éléments de données: cas confirmés / Data element Reporting Rate: confirmed cases\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f8cd01e-e4b5-41f5-bff8-d35a1b143d0e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# # import data\n", + "# # was: routine_data\n", + "# reporting_rate_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")) }, \n", + "# error = function(e) {\n", + "# msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, conditionMessage(e))\n", + "# # cat(msg)\n", + "# log_msg(msg, level = \"warning\") # GP 20250908\n", + "# # stop(msg) # GP 20250908\n", + "# })\n", + "\n", + "# reporting_rate_data <- reporting_rate_data %>%\n", + "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", + "\n", + "# printdim(reporting_rate_data)" + ] + }, + { + "cell_type": "markdown", + "id": "8c11349d-598a-4882-a156-3e5b969ab76c", + "metadata": {}, + "source": [ + "### Import and format data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7838692f-fe89-446e-bac5-af5cc7324226", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f8eba51-b883-432b-8f7c-c4860cc9e78c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "# ADMIN_2" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "id": "ceeba43a-e9ba-4b5a-8d0b-c4faade1367e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", + "# ADMIN_2_LEVEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54bf61aa-c324-411f-8eea-93049d1bb252", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# THIS CODE SHOULD BE REMOVED, WE SHOULD ONLY LOAD REPORTING RATE ONCE IN THIS REPORT (parameterized file name)" + ] + }, + { + "cell_type": "markdown", + "id": "2338477e-2036-42e6-bfd3-c2b2480395c1", + "metadata": {}, + "source": [ + "**suggestion:** \n", + "- If possible, I would reuse the same plotting code, so that all the code from here onwards can be removed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41da51da-7c8a-45d2-b492-a80071dfe2e3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Import from Dataset\n", + "\n", + "reporting_rate_data <- tryCatch({\n", + " # Attempt to load the dataset\n", + " get_latest_dataset_file_in_memory(\n", + " DATASET_NAME, \n", + " paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", + " )\n", + "}, \n", + "error = function(e) {\n", + " # If an error occurs, log a warning\n", + " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataelement.parquet` does not exist, skipped loading. \n", + " To generate this file, re-run the reporting rate pipeline. 
Error:\", conditionMessage(e))\n", + " log_msg(msg, level = \"warning\")\n", + " \n", + " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", + " return(\n", + " tibble(\n", + " YEAR = double(),\n", + " MONTH = double(),\n", + " ADM2_ID = character(),\n", + " REPORTING_RATE = double()\n", + " )\n", + " )\n", + "})\n", + "\n", + "# Add _NAME cols from pyramid\n", + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + " ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", + " \n", + " reporting_rate_data <- reporting_rate_data %>%\n", + " # left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\")) # old\n", + " left_join(pyramid_data, by = c(\"ADM2_ID\" = ADMIN_2_LEVEL))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dff04b9c-0ee7-44d3-a84f-150bce1c368f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# reporting_rate_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0be95b4e-8678-44be-a361-d2216bcd741c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# if (nrow(reporting_rate_data) != 0) {\n", + "# reporting_rate_data <- reporting_rate_data %>%\n", + "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", + "# }" + ] + }, + { + "cell_type": "markdown", + "id": "42155cf0-0475-45fc-a276-7ff2bd4ed555", + "metadata": {}, + "source": [ + "### Plot: Heatmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d4d40b6-7d01-4ed3-83ba-39ffdfbe4b3d", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Prepare date column + category\n", + "\n", + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "reporting_rate_data <- reporting_rate_data %>%\n", + " mutate(\n", + " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", 
+ " ADM2_ID = factor(ADM2_ID),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100,\n", + " reporting_pct = REPORTING_RATE * 100,\n", + " category = cut(\n", + " reporting_pct,\n", + " # breaks = c(-Inf, 50, 80, 90, Inf),\n", + " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", + " # right = FALSE\n", + " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", + " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", + " right = TRUE\n", + " )\n", + " )\n", + "\n", + "# # Define color scale\n", + "# reporting_colors <- c(\n", + "# \"<50\" = \"#d7191c\", # red\n", + "# \"50–80\" = \"#fdae61\", # orange\n", + "# \"80–90\" = \"#ffffbf\", # yellow\n", + "# \"≥90\" = \"#1a9641\" # green\n", + "# )\n", + "\n", + "# Define color scale\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50–80\" = \"#fdae61\", # orange\n", + " \"80–90\" = \"#ffffbf\", # yellow\n", + " \"90-100\" = \"#1a9641\", # green\n", + " \">100\" = \"darkgreen\" # \"darkgreen\"\n", + ")\n", + "\n", + "# Plot heatmap\n", + "options(repr.plot.width = 18, repr.plot.height = 15)\n", + "ggplot(reporting_rate_data, aes(x = date, y = LEVEL_3_NAME, fill = category)) +\n", + " geom_tile() +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\"\n", + " ) +\n", + " labs(\n", + " title = \"Taux de rapportage mensuels par district sanitaire\",\n", + " subtitle = \"Monthly Data Element Reporting Rate by Health District\",\n", + " x = \"Mois - Month\",\n", + " y = \"District Sanitaire - Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", + " axis.text.y = element_text(size = 9),\n", + " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", + " plot.subtitle = element_text(hjust = 0.5, size = 16),\n", + " legend.position = \"top\", # \"right\"\n", + " panel.grid = element_blank()\n", + " )\n", 
+ "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "b14436cf-1f78-4c3f-944b-0c1eb845a3f6", + "metadata": {}, + "source": [ + "### Plot: boxplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba3a3e89-17f5-4024-b039-dcedb0f37dc2", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Prepare the data\n", + "\n", + "if (nrow(reporting_rate_data) != 0) {\n", + " \n", + "reporting_rate_data_box <- reporting_rate_data %>%\n", + " mutate(\n", + " MONTH = as.integer(MONTH),\n", + " YEAR = as.factor(YEAR),\n", + " # reporting_pct = pmin(REPORTING_RATE, 1) * 100 # `pmin()` caps values to 1 (then, 100%)\n", + " reporting_pct = REPORTING_RATE * 100\n", + " )\n", + "\n", + "# Month labels in French\n", + "month_labels_fr <- c(\n", + " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", + " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1472d8f2-56d3-4b7a-82ef-4b344d87d264", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 15)\n", + "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", + " geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.4) +\n", + " scale_x_discrete(labels = month_labels_fr) +\n", + " # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", + " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", + " labs(\n", + " title = \"Distribution mensuelle du taux de rapportage\",\n", + " subtitle = \"Monthly Distribution of Data Element Reporting Rate by Health District (2021–2024)\",\n", + " x = \"Mois\",\n", + " fill = \"Année\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " plot.title = element_text(face = 
\"bold\", size = 20),\n", + " plot.subtitle = element_text(size = 16),\n", + " legend.position = \"bottom\"\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "0d275753-d21e-410a-b7d7-f265ac6e9235", + "metadata": {}, + "source": [ + "### Plot: choropleth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12674c7e-7745-464c-9cc5-3c1e6dcd63c4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Aggregate to annual reporting rate per district \n", + "annual_data <- reporting_rate_data %>%\n", + " group_by(YEAR, ADM2_ID) %>%\n", + " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", + " ungroup()\n", + "\n", + "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", + "map_data <- shapes_data %>%\n", + " left_join(annual_data, by = \"ADM2_ID\")\n", + "\n", + "# Step 3: Bin the reporting rate into categories\n", + "map_data <- map_data %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-80\",\n", + " reporting_rate < 0.9 ~ \"80-90\",\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " ),\n", + " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\"))\n", + " )\n", + "\n", + "# Step 4: Define colors\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\",\n", + " \"50-80\" = \"#fdae61\",\n", + " \"80-90\" = \"#ffffbf\",\n", + " \">=90\" = \"#1a9641\"\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7777a718-2e6c-4f80-a7d6-ae304a1b49fb", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 5: Plot\n", + "options(repr.plot.width = 18, repr.plot.height = 10)\n", + "ggplot(map_data) 
+\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " facet_wrap(~ YEAR) +\n", + " scale_fill_manual(values = reporting_colors, name = \"Taux de soumission (%)\") +\n", + " labs(\n", + " title = \"Taux de rapportage des éléments de données annuels par district sanitaire\",\n", + " subtitle = \"Annual Data Element Reporting Completeness by Health District\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " strip.text = element_text(face = \"bold\", size = 16),\n", + " plot.title = element_text(face = \"bold\")\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "f25ec0fb-758b-476c-8567-b0dce0a387d1", + "metadata": {}, + "source": [ + "### Plot: choropleth 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c891e330-f56f-4846-88c3-fd13a9fac8e7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", + "mean_reporting_stats <- map_data %>%\n", + " group_by(ADM2_ID) %>%\n", + " summarise(\n", + " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " mutate(\n", + " reporting_cat = case_when(\n", + " reporting_rate < 0.5 ~ \"<50\",\n", + " reporting_rate < 0.8 ~ \"50-80\",\n", + " reporting_rate < 0.9 ~ \"80-90\",\n", + " reporting_rate >= 0.9 ~ \">=90\",\n", + " TRUE ~ NA_character_\n", + " )\n", + " )\n", + "\n", + "# Set correct factor levels to match legend\n", + "mean_reporting_stats$reporting_cat <- factor(\n", + " mean_reporting_stats$reporting_cat,\n", + " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", + ")\n", + "\n", + "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", + "mean_reporting_map <- shapes_data %>%\n", + " left_join(st_drop_geometry(mean_reporting_stats), by = 
\"ADM2_ID\") %>%\n", + " st_as_sf()\n", + "\n", + "# Step 3: Define custom color scale\n", + "reporting_colors <- c(\n", + " \"<50\" = \"#d7191c\", # red\n", + " \"50-80\" = \"#fdae61\", # orange\n", + " \"80-90\" = \"#ffffbf\", # yellow\n", + " \">=90\" = \"#1a9641\" # green\n", + ")\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4032f2be-dc1e-48cf-9a5f-5c0854c32e9a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (nrow(reporting_rate_data) != 0) {\n", + "\n", + "# Step 4: Plot\n", + "options(repr.plot.width = 20, repr.plot.height = 10)\n", + "ggplot(mean_reporting_map) +\n", + " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", + " scale_fill_manual(\n", + " values = reporting_colors,\n", + " name = \"Taux de soumission (%)\",\n", + " drop = FALSE\n", + " ) +\n", + " labs(\n", + " title = \"Taux moyen de rapportage (toutes années confondues)\",\n", + " subtitle = \"Mean Annual Data Element Reporting Rate (All Years Combined)\"\n", + " ) +\n", + " theme_minimal(base_size = 16) +\n", + " theme(\n", + " legend.position = \"right\",\n", + " plot.title = element_text(face = \"bold\"),\n", + " plot.subtitle = element_text()\n", + " )\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0f1ea54-b02b-4523-b8b7-9dcdf30c39ba", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } }, - { - "cell_type": "markdown", - "id": "80fa8c3c-ed62-4248-8149-ffe2974a7206", - "metadata": {}, - "source": [ - "# Taux de Rapportage des Formations Sanitaires - Health Facility Reporting Rates" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "35bc4c99-5e5c-44dc-8c67-7f38eaec708e", - "metadata": {}, - "outputs": [], - "source": [ - "# Set SNT Paths\n", - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "\n", - "# load util functions\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# List required packages \n", - "required_packages <- c(\"dplyr\", \"tidyr\", \"terra\", \"ggplot2\", \"stringr\", \"lubridate\", \"viridis\", \"patchwork\", \"zoo\", \"purrr\", \"arrow\", \"sf\", \"reticulate\", \"leaflet\")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)\n", - "\n", - "# Set environment to load openhexa.sdk from the right environment\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))},\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "# Required environment for the sf packages\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd297a84-5a55-4374-9d2b-3148fde8072d", - "metadata": {}, - "outputs": [], - "source": [ - "# Configuration variables\n", - "DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa63eb27-746f-420b-87ad-da82139acff9", - "metadata": {}, - "outputs": [], - "source": [ - "# print function\n", - "printdim 
<- function(df, name = deparse(substitute(df))) {\n", - " cat(\"Dimensions of\", name, \":\", nrow(df), \"rows x\", ncol(df), \"columns\\n\\n\")\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a603fdb-e3ae-4aa3-a908-0385ae216d49", - "metadata": {}, - "outputs": [], - "source": [ - "# import DHIS2 shapes data\n", - "DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "shapes_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9656abb7-0085-4feb-974c-fb0b1c68c38f", - "metadata": {}, - "outputs": [], - "source": [ - "# import pyramid data\n", - "pyramid_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 Shapes data for: \" , COUNTRY_CODE, conditionMessage(e))\n", - " cat(msg)\n", - " stop(msg)\n", - " })\n", - "\n", - "# Select distinct (already done in SNT format pipeline)\n", - "ADMIN_1_ID <- str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), \"NAME\", \"ID\")\n", - "ADMIN_2_ID <- str_replace(toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), \"NAME\", \"ID\")\n", - "\n", - "pyramid_data <- pyramid_data %>%\n", - " distinct(across(all_of(c(ADMIN_1_ID, ADMIN_2_ID))), .keep_all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e912503-5c57-4997-8c68-da673bd14626", - "metadata": {}, - "outputs": [], - "source": [ - "print(dim(pyramid_data))\n", - "head(pyramid_data)" - ] - }, - { - "cell_type": "markdown", - "id": "78ec55d0-3a0d-413d-97cd-303895275f88", - "metadata": {}, - "source": [ - "## A) Taux 
de Soumission des Rapports / Dataset Reporting Rate\n", - "\n", - "**[FR]**\n", - "Cette section analyse le **taux de soumission des rapports**, tel que calculé dans le Système National d’Information Sanitaire (SNIS). Ce taux est défini comme le nombre de rapports effectivement reçus (rapports actuels) divisé par le nombre de rapports attendus (rapports attendus) sur une période donnée. Les rapports attendus correspondent au nombre de formations sanitaires qui, selon les paramètres du SNIS, devaient soumettre un rapport. Cet indicateur permet d’évaluer si les structures ont transmis les rapports requis, sans tenir compte du contenu ou de l’exhaustivité des données saisies.\n", - "\n", - "**[EN]**\n", - "This section analyzes the **dataset reporting rate**, as calculated in the Health Management Information System (HMIS). The rate is defined as the number of reports actually submitted (actual reports) divided by the number of reports expected (expected reports) over a given period. Expected reports refer to the number of health facilities that were required to report according to SNIS configuration. This indicator helps assess whether health facilities submitted their required reports, regardless of the content or completeness of the data within those reports." - ] - }, - { - "cell_type": "markdown", - "id": "793a685b-a5cc-4e12-9c78-e548beffa213", - "metadata": {}, - "source": [ - "**Question:** Can the reporting rate file be loaded only once (using a parameter see below)? -> if that's the case, we can remove specific plotting codes for the \"dataset\" and \"dataelement\" files and just keep one \"plotting code\" for both types \n", - "\n", - "**Suggestion (link to previous Question):** The file name can be parameterized by injecting the user selection via parameters={...} from the OpenHexa pipeline. 
\n", - "\n", - "> paste0(COUNTRY_CODE, \"_reporting_rate_\", REPORTING_RATE_METHOD ,\".parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2d9d428-c5a7-4f35-8a21-8f22adaa6a26", - "metadata": {}, - "outputs": [], - "source": [ - "# Import from Dataset\n", - "reporting_rate_data <- tryCatch({\n", - " # Attempt to load the dataset\n", - " get_latest_dataset_file_in_memory(\n", - " DATASET_NAME, \n", - " paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")\n", - " )\n", - " }, \n", - " error = function(e) {\n", - " # If an error occurs, log a warning\n", - " # msg <- paste(\"[WARNING] Warning: Could not load reporting rate file for:\", COUNTRY_CODE, \". Proceeding with empty data. Error:\", conditionMessage(e))\n", - " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataset.parquet` does not exist, skipped loading. \n", - " To generate this file, re-run the reporting rate pipeline. Error:\", conditionMessage(e))\n", - " log_msg(msg, level = \"warning\")\n", - " \n", - " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", - " return(\n", - " tibble(\n", - " YEAR = double(),\n", - " MONTH = double(),\n", - " ADM2_ID = character(),\n", - " REPORTING_RATE = double()\n", - " )\n", - " )\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d13dea1-e204-4c27-8a44-14e260bcdad1", - "metadata": {}, - "outputs": [], - "source": [ - "# Add _NAME cols from pyramid\n", - "if (nrow(reporting_rate_data) != 0) {\n", - " \n", - " ADMIN_2_ID <- str_replace(ADM_2, \"NAME\", \"ID\") \n", - " reporting_rate_data <- reporting_rate_data %>% \n", - " left_join(pyramid_data[c(ADM_2, ADMIN_2_ID)], by = c(\"ADM2_ID\" = ADMIN_2_ID))\n", - " \n", - " colnames(reporting_rate_data)[colnames(reporting_rate_data) == ADM_2] <- \"ADM2_NAME\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b96b0b12-e168-421f-b0a9-76e83c48842c", 
- "metadata": {}, - "outputs": [], - "source": [ - "head(reporting_rate_data, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "64d787c8-25d3-4b2c-87e9-6d35b206b018", - "metadata": {}, - "source": [ - "**fix:** \n", - " - Just replaced this line with the variable \"ADM2_NAME\" : \n", - "> Plot heatmap \n", - "> options(repr.plot.width = 18, repr.plot.height = 15) \n", - "> ggplot(reporting_rate_data, aes(x = date, y = **ADM2_NAME**, fill = category)) + " - ] - }, - { - "cell_type": "markdown", - "id": "c13f15a4-2788-4d57-9edc-78d0afdbe278", - "metadata": {}, - "source": [ - "### Plot: Heatmap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb3b565e-8497-4d88-8d6d-ae6b3e2929b2", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - " \n", - " # Prepare date column + category\n", - " reporting_rate_data <- reporting_rate_data %>%\n", - " mutate(\n", - " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", - " ADM2_ID = factor(ADM2_ID),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100, # `pmin()` caps to 100%\n", - " reporting_pct = REPORTING_RATE * 100,\n", - " category = cut(\n", - " reporting_pct,\n", - " # breaks = c(-Inf, 50, 80, 90, Inf),\n", - " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", - " # GP 2025-08-07 added this, but double check (seems too many >100!!)\n", - " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", - " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", - " right = TRUE # FALSE: intervals are left-closed: lower bound is included\n", - " )\n", - " )\n", - " \n", - " # Define color scale\n", - " reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50–80\" = \"#fdae61\", # orange\n", - " \"80–90\" = \"#ffffbf\", # yellow\n", - " \"90-100\" = \"#1a9641\", # green\n", - " \">100\" = \"darkgreen\"\n", - " )\n", - " \n", - " # Plot heatmap\n", - " options(repr.plot.width = 18, repr.plot.height = 15)\n", - " 
ggplot(reporting_rate_data, aes(x = date, y = ADM2_NAME, fill = category)) + # -> Using a ADM2_NAME Variable to select the column !!\n", - " geom_tile() +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Taux de soumission des rapports mensuels par district sanitaire\",\n", - " subtitle = \"Monthly Dataset Reporting Rate by Health District\",\n", - " x = \"Mois - Month\",\n", - " y = \"District Sanitaire - Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 16),\n", - " # legend.position = \"right\",\n", - " legend.position = \"top\",\n", - " panel.grid = element_blank()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28662f31-8aa9-4f83-8dd6-8eb489723652", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Prepare the data\n", - "reporting_rate_data_box <- reporting_rate_data %>%\n", - " mutate(\n", - " MONTH = as.integer(MONTH),\n", - " YEAR = as.factor(YEAR),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100\n", - " reporting_pct = REPORTING_RATE * 100\n", - " )\n", - "\n", - "# Month labels in French\n", - "month_labels_fr <- c(\n", - " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", - " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", - ")\n", - "\n", - "# Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 15)\n", - "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", - " geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.4) +\n", - " scale_x_discrete(labels = month_labels_fr) +\n", - 
" # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", - " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", - " labs(\n", - " title = \"Distribution mensuelle du taux de soumission des rapports\",\n", - " subtitle = \"Monthly Distribution of Dataset Reporting Rate by Health District (2021–2024)\",\n", - " x = \"Mois\",\n", - " fill = \"Année\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " plot.subtitle = element_text(size = 16),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fd09749-2042-49d4-b2a3-9c2e6e5eae52", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Aggregate to annual reporting rate per district\n", - "annual_data <- reporting_rate_data %>%\n", - " group_by(YEAR, ADM2_ID) %>%\n", - " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", - " ungroup()\n", - "\n", - "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", - "map_data <- shapes_data %>%\n", - " left_join(annual_data, by = \"ADM2_ID\")\n", - "\n", - "# Step 3: Bin the reporting rate into categories\n", - "map_data <- map_data %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-79\", # \"50-80\"\n", - " reporting_rate < 0.9 ~ \"80-89\", # \"80-90\"\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " ),\n", - " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-79\", \"80-89\", \">=90\")) # levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", - " )\n", - "\n", - "# Step 4: Define colors\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\",\n", - " \"50-79\" = \"#fdae61\",\n", - " \"80-89\" = \"#ffffbf\",\n", - " \">=90\" = 
\"#1a9641\"\n", - ")\n", - "\n", - "# Step 5: Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 10)\n", - "ggplot(map_data) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " facet_wrap(~ YEAR) +\n", - " scale_fill_manual(values = reporting_colors, name = \"Taux de soummision (%)\") +\n", - " labs(\n", - " title = \"Taux de soumission des rapports annuels par district sanitaire\",\n", - " subtitle = \"Annual Dataset Reporting Completeness by Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " strip.text = element_text(face = \"bold\", size = 16),\n", - " plot.title = element_text(face = \"bold\")\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df3f7949-baff-4cd9-a022-594420765289", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", - "mean_reporting_stats <- map_data %>%\n", - " group_by(ADM2_ID) %>%\n", - " summarise(\n", - " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-80\",\n", - " reporting_rate < 0.9 ~ \"80-90\",\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " )\n", - " )\n", - "\n", - "# Set correct factor levels to match legend\n", - "mean_reporting_stats$reporting_cat <- factor(\n", - " mean_reporting_stats$reporting_cat,\n", - " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", - ")\n", - "\n", - "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", - "mean_reporting_map <- shapes_data %>%\n", - " left_join(st_drop_geometry(mean_reporting_stats), by = \"ADM2_ID\") %>%\n", - " st_as_sf()\n", - "\n", - "# Step 3: Define custom 
color scale\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50-80\" = \"#fdae61\", # orange\n", - " \"80-90\" = \"#ffffbf\", # yellow\n", - " \">=90\" = \"#1a9641\" # green\n", - ")\n", - "\n", - "# Step 4: Plot\n", - "options(repr.plot.width = 20, repr.plot.height = 10)\n", - "ggplot(mean_reporting_map) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\",\n", - " drop = FALSE\n", - " ) +\n", - " labs(\n", - " title = \"Taux moyen de soumission des rapports (toutes années confondues)\",\n", - " subtitle = \"Mean Annual Dataset Reporting Rate (All Years Combined)\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9497287c-bbd2-446f-946d-88e34233f9f0", - "metadata": {}, - "source": [ - "## B) Taux de rapportage des éléments de données: cas confirmés / Data element Reporting Rate: confirmed cases\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f8cd01e-e4b5-41f5-bff8-d35a1b143d0e", - "metadata": {}, - "outputs": [], - "source": [ - "# # import data\n", - "# # was: routine_data\n", - "# reporting_rate_data <- tryCatch({ get_latest_dataset_file_in_memory(DATASET_NAME, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")) }, \n", - "# error = function(e) {\n", - "# msg <- paste(\"Error while loading seasonality file for: \" , COUNTRY_CODE, conditionMessage(e))\n", - "# # cat(msg)\n", - "# log_msg(msg, level = \"warning\") # GP 20250908\n", - "# # stop(msg) # GP 20250908\n", - "# })\n", - "\n", - "# reporting_rate_data <- reporting_rate_data %>%\n", - "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", - "\n", - "# printdim(reporting_rate_data)" 
- ] - }, - { - "cell_type": "markdown", - "id": "8c11349d-598a-4882-a156-3e5b969ab76c", - "metadata": {}, - "source": [ - "### Import and format data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7838692f-fe89-446e-bac5-af5cc7324226", - "metadata": {}, - "outputs": [], - "source": [ - "# config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f8eba51-b883-432b-8f7c-c4860cc9e78c", - "metadata": {}, - "outputs": [], - "source": [ - "# ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "# ADMIN_2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ceeba43a-e9ba-4b5a-8d0b-c4faade1367e", - "metadata": {}, - "outputs": [], - "source": [ - "# ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", - "# ADMIN_2_LEVEL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54bf61aa-c324-411f-8eea-93049d1bb252", - "metadata": {}, - "outputs": [], - "source": [ - "# THIS CODE SHOULD BE REMOVED, WE SHOULD ONLY LOAD REPORTING RATE ONCE IN THIS REPORT (parameter " - ] - }, - { - "cell_type": "markdown", - "id": "2338477e-2036-42e6-bfd3-c2b2480395c1", - "metadata": {}, - "source": [ - "**suggestion:** \n", - "- If possible I would try to reuse the same plotting code. 
So we can remove all the code as from here ...**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41da51da-7c8a-45d2-b492-a80071dfe2e3", - "metadata": {}, - "outputs": [], - "source": [ - "# Import from Dataset\n", - "\n", - "reporting_rate_data <- tryCatch({\n", - " # Attempt to load the dataset\n", - " get_latest_dataset_file_in_memory(\n", - " DATASET_NAME, \n", - " paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")\n", - " )\n", - "}, \n", - "error = function(e) {\n", - " # If an error occurs, log a warning\n", - " msg <- paste(\"[WARNING] Warning: file `\", COUNTRY_CODE, \"_reporting_rate_dataelement.parquet` does not exist, skipped loading. \n", - " To generate this file, re-run the reporting rate pipeline. Error:\", conditionMessage(e))\n", - " log_msg(msg, level = \"warning\")\n", - " \n", - " # IMPORTANT: Return an empty tibble with the correct structure SO PIPELINE DOES NOT FAIL\n", - " return(\n", - " tibble(\n", - " YEAR = double(),\n", - " MONTH = double(),\n", - " ADM2_ID = character(),\n", - " REPORTING_RATE = double()\n", - " )\n", - " )\n", - "})\n", - "\n", - "# Add _NAME cols from pyramid\n", - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - " ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - " ADMIN_2_LEVEL <- str_replace(ADMIN_2, \"NAME\", \"ID\")\n", - " \n", - " reporting_rate_data <- reporting_rate_data %>%\n", - " # left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\")) # old\n", - " left_join(pyramid_data, by = c(\"ADM2_ID\" = ADMIN_2_LEVEL))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dff04b9c-0ee7-44d3-a84f-150bce1c368f", - "metadata": {}, - "outputs": [], - "source": [ - "# reporting_rate_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0be95b4e-8678-44be-a361-d2216bcd741c", - "metadata": {}, - "outputs": [], - "source": [ - "# if (nrow(reporting_rate_data) != 0) {\n", - "# reporting_rate_data <- 
reporting_rate_data %>%\n", - "# left_join(pyramid_data, by = c(\"ADM2_ID\" = \"LEVEL_3_ID\"))\n", - "# }" - ] - }, - { - "cell_type": "markdown", - "id": "42155cf0-0475-45fc-a276-7ff2bd4ed555", - "metadata": {}, - "source": [ - "### Plot: Heatmap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d4d40b6-7d01-4ed3-83ba-39ffdfbe4b3d", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare date column + category\n", - "\n", - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "reporting_rate_data <- reporting_rate_data %>%\n", - " mutate(\n", - " date = as.Date(paste0(YEAR, \"-\", MONTH, \"-01\")),\n", - " ADM2_ID = factor(ADM2_ID),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100,\n", - " reporting_pct = REPORTING_RATE * 100,\n", - " category = cut(\n", - " reporting_pct,\n", - " # breaks = c(-Inf, 50, 80, 90, Inf),\n", - " # labels = c(\"<50\", \"50–80\", \"80–90\", \"≥90\"),\n", - " # right = FALSE\n", - " breaks = c(-Inf, 50, 80, 90, 100, Inf),\n", - " labels = c(\"<50\", \"50–80\", \"80–90\", \"90-100\", \">100\"),\n", - " right = TRUE\n", - " )\n", - " )\n", - "\n", - "# # Define color scale\n", - "# reporting_colors <- c(\n", - "# \"<50\" = \"#d7191c\", # red\n", - "# \"50–80\" = \"#fdae61\", # orange\n", - "# \"80–90\" = \"#ffffbf\", # yellow\n", - "# \"≥90\" = \"#1a9641\" # green\n", - "# )\n", - "\n", - "# Define color scale\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50–80\" = \"#fdae61\", # orange\n", - " \"80–90\" = \"#ffffbf\", # yellow\n", - " \"90-100\" = \"#1a9641\", # green\n", - " \">100\" = \"darkgreen\" # \"darkgreen\"\n", - ")\n", - "\n", - "# Plot heatmap\n", - "options(repr.plot.width = 18, repr.plot.height = 15)\n", - "ggplot(reporting_rate_data, aes(x = date, y = LEVEL_3_NAME, fill = category)) +\n", - " geom_tile() +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\"\n", - " ) +\n", - " labs(\n", - " title = \"Taux 
de rapportage mensuels par district sanitaire\",\n", - " subtitle = \"Monthly Data Element Reporting Rate by Health District\",\n", - " x = \"Mois - Month\",\n", - " y = \"District Sanitaire - Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 16),\n", - " axis.text.y = element_text(size = 9),\n", - " plot.title = element_text(face = \"bold\", hjust = 0.5, size = 20),\n", - " plot.subtitle = element_text(hjust = 0.5, size = 16),\n", - " legend.position = \"top\", # \"right\"\n", - " panel.grid = element_blank()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "b14436cf-1f78-4c3f-944b-0c1eb845a3f6", - "metadata": {}, - "source": [ - "### Plot: boxplot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba3a3e89-17f5-4024-b039-dcedb0f37dc2", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the data\n", - "\n", - "if (nrow(reporting_rate_data) != 0) {\n", - " \n", - "reporting_rate_data_box <- reporting_rate_data %>%\n", - " mutate(\n", - " MONTH = as.integer(MONTH),\n", - " YEAR = as.factor(YEAR),\n", - " # reporting_pct = pmin(REPORTING_RATE, 1) * 100 # `pmin()` caps values to 1 (then, 100%)\n", - " reporting_pct = REPORTING_RATE * 100\n", - " )\n", - "\n", - "# Month labels in French\n", - "month_labels_fr <- c(\n", - " \"Janv\", \"Févr\", \"Mars\", \"Avril\", \"Mai\", \"Juin\",\n", - " \"Juil\", \"Août\", \"Sept\", \"Oct\", \"Nov\", \"Déc\"\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1472d8f2-56d3-4b7a-82ef-4b344d87d264", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 15)\n", - "ggplot(reporting_rate_data_box, aes(x = factor(MONTH), y = reporting_pct, fill = YEAR)) +\n", - " geom_boxplot(outlier.size = 0.8, outlier.alpha 
= 0.4) +\n", - " scale_x_discrete(labels = month_labels_fr) +\n", - " # scale_y_continuous(name = \"Taux de soumission (%)\", limits = c(0, 100)) +\n", - " scale_y_continuous(name = \"Taux de soumission (%)\") +\n", - " labs(\n", - " title = \"Distribution mensuelle du taux de rapportage\",\n", - " subtitle = \"Monthly Distribution of Data Element Reporting Rate by Health District (2021–2024)\",\n", - " x = \"Mois\",\n", - " fill = \"Année\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 20),\n", - " plot.subtitle = element_text(size = 16),\n", - " legend.position = \"bottom\"\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "0d275753-d21e-410a-b7d7-f265ac6e9235", - "metadata": {}, - "source": [ - "### Plot: choropleth" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12674c7e-7745-464c-9cc5-3c1e6dcd63c4", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Aggregate to annual reporting rate per district \n", - "annual_data <- reporting_rate_data %>%\n", - " group_by(YEAR, ADM2_ID) %>%\n", - " summarise(reporting_rate = mean(REPORTING_RATE, na.rm = TRUE)) %>%\n", - " ungroup()\n", - "\n", - "# Step 2: Join with spatial data (assuming 'map_sf' contains geometry and ADM2_ID)\n", - "map_data <- shapes_data %>%\n", - " left_join(annual_data, by = \"ADM2_ID\")\n", - "\n", - "# Step 3: Bin the reporting rate into categories\n", - "map_data <- map_data %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-80\",\n", - " reporting_rate < 0.9 ~ \"80-90\",\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " ),\n", - " reporting_cat = factor(reporting_cat, levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\"))\n", - " )\n", - "\n", - "# Step 4: Define colors\n", - "reporting_colors <- 
c(\n", - " \"<50\" = \"#d7191c\",\n", - " \"50-80\" = \"#fdae61\",\n", - " \"80-90\" = \"#ffffbf\",\n", - " \">=90\" = \"#1a9641\"\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7777a718-2e6c-4f80-a7d6-ae304a1b49fb", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 5: Plot\n", - "options(repr.plot.width = 18, repr.plot.height = 10)\n", - "ggplot(map_data) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " facet_wrap(~ YEAR) +\n", - " scale_fill_manual(values = reporting_colors, name = \"Taux de soummision (%)\") +\n", - " labs(\n", - " title = \"Taux de rapportage des éléments de donnée annuels par district sanitaire\",\n", - " subtitle = \"Annual Data element Reporting Completeness by Health District\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " strip.text = element_text(face = \"bold\", size = 16),\n", - " plot.title = element_text(face = \"bold\")\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "f25ec0fb-758b-476c-8567-b0dce0a387d1", - "metadata": {}, - "source": [ - "### Plot: choropleth 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c891e330-f56f-4846-88c3-fd13a9fac8e7", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 1: Compute mean reporting rate per ADM2_ID over all years\n", - "mean_reporting_stats <- map_data %>%\n", - " group_by(ADM2_ID) %>%\n", - " summarise(\n", - " reporting_rate = mean(reporting_rate, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " mutate(\n", - " reporting_cat = case_when(\n", - " reporting_rate < 0.5 ~ \"<50\",\n", - " reporting_rate < 0.8 ~ \"50-80\",\n", - " reporting_rate < 0.9 ~ \"80-90\",\n", - " reporting_rate >= 0.9 ~ \">=90\",\n", - " TRUE ~ NA_character_\n", - " )\n", - " 
)\n", - "\n", - "# Set correct factor levels to match legend\n", - "mean_reporting_stats$reporting_cat <- factor(\n", - " mean_reporting_stats$reporting_cat,\n", - " levels = c(\"<50\", \"50-80\", \"80-90\", \">=90\")\n", - ")\n", - "\n", - "# Step 2: Join with shapes (drop geometry to avoid spatial join conflict)\n", - "mean_reporting_map <- shapes_data %>%\n", - " left_join(st_drop_geometry(mean_reporting_stats), by = \"ADM2_ID\") %>%\n", - " st_as_sf()\n", - "\n", - "# Step 3: Define custom color scale\n", - "reporting_colors <- c(\n", - " \"<50\" = \"#d7191c\", # red\n", - " \"50-80\" = \"#fdae61\", # orange\n", - " \"80-90\" = \"#ffffbf\", # yellow\n", - " \">=90\" = \"#1a9641\" # green\n", - ")\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4032f2be-dc1e-48cf-9a5f-5c0854c32e9a", - "metadata": {}, - "outputs": [], - "source": [ - "if (nrow(reporting_rate_data) != 0) {\n", - "\n", - "# Step 4: Plot\n", - "options(repr.plot.width = 20, repr.plot.height = 10)\n", - "ggplot(mean_reporting_map) +\n", - " geom_sf(aes(fill = reporting_cat), color = \"white\", size = 0.2) +\n", - " scale_fill_manual(\n", - " values = reporting_colors,\n", - " name = \"Taux de soumission (%)\",\n", - " drop = FALSE\n", - " ) +\n", - " labs(\n", - " title = \"Taux moyen de rapportage (toutes années confondues)\",\n", - " subtitle = \"Mean Annual Data Element Reporting Rate (All Years Combined)\"\n", - " ) +\n", - " theme_minimal(base_size = 16) +\n", - " theme(\n", - " legend.position = \"right\",\n", - " plot.title = element_text(face = \"bold\"),\n", - " plot.subtitle = element_text()\n", - " )\n", - "\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0f1ea54-b02b-4523-b8b7-9dcdf30c39ba", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": 
".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r b/pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r new file mode 100644 index 0000000..bc84a21 --- /dev/null +++ b/pipelines/snt_dhis2_reporting_rate/utils/snt_dhis2_reporting_rate.r @@ -0,0 +1,79 @@ +# Shared helpers for snt_dhis2_reporting_rate notebooks. + +inspect_reporting_rate <- function(data_tibble) { + tibble_name_full <- deparse(substitute(data_tibble)) + method <- stringr::str_extract(tibble_name_full, "(?<=reporting_rate_).*") + + values_greater_than_1 <- sum(data_tibble$REPORTING_RATE > 1, na.rm = TRUE) + total_values <- length(data_tibble$REPORTING_RATE) + + if (total_values > 0) { + proportion <- values_greater_than_1 / total_values * 100 + min_rate <- min(data_tibble$REPORTING_RATE, na.rm = TRUE) + max_rate <- max(data_tibble$REPORTING_RATE, na.rm = TRUE) + } else { + proportion <- 0 + min_rate <- NA + max_rate <- NA + } + + clarification <- if (proportion == 0) NULL else " (there are more reports than expected)" + + log_msg( + paste0( + "🔍 For reporting rate method : `", method, "`, the values of REPORTING_RATE range from ", round(min_rate, 2), + " to ", round(max_rate, 2), + ", and ", round(proportion, 2), " % of values are >1", clarification, "." 
+ ) + ) + + hist(data_tibble$REPORTING_RATE, breaks = 50) +} + +is_aire_l5 <- function(x) { + stringr::str_detect(x, stringr::regex("^\\s*aire[^a-zA-Z]?", ignore_case = TRUE)) +} + +is_hospital_l4 <- function(x) { + stringr::str_detect(x, stringr::regex("^(hd|chr|chu|hgr)", ignore_case = TRUE)) +} + +snt_write_csv <- function(x, output_data_path, method, country_code = NULL) { + if (is.null(country_code) && exists("COUNTRY_CODE")) { + country_code <- get("COUNTRY_CODE") + } + if (is.null(country_code)) { + stop("country_code is required to export reporting rate csv.") + } + + full_directory_path <- file.path(output_data_path, "reporting_rate") + if (!dir.exists(full_directory_path)) { + dir.create(full_directory_path, recursive = TRUE) + } + + file_path <- file.path(full_directory_path, paste0(country_code, "_reporting_rate_", method, ".csv")) + readr::write_csv(x, file_path) + log_msg(paste0("Exported : ", file_path)) +} + +snt_write_parquet <- function(x, output_data_path, method, country_code = NULL) { + if (is.null(country_code) && exists("COUNTRY_CODE")) { + country_code <- get("COUNTRY_CODE") + } + if (is.null(country_code)) { + stop("country_code is required to export reporting rate parquet.") + } + + full_directory_path <- file.path(output_data_path, "reporting_rate") + if (!dir.exists(full_directory_path)) { + dir.create(full_directory_path, recursive = TRUE) + } + + file_path <- file.path(full_directory_path, paste0(country_code, "_reporting_rate_", method, ".parquet")) + arrow::write_parquet(x, file_path) + log_msg(paste0("Exported : ", file_path)) +} + +printdim <- function(df, name = deparse(substitute(df))) { + cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n") +} From 87a52134aa750b5d7e3b847cc5fcf08503132f4b Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 3 Apr 2026 12:09:08 +0200 Subject: [PATCH 02/18] dataset & dataelement update --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 2212 +++++++------- 
.../snt_dhis2_reporting_rate_dataelement.r | 183 ++ .../snt_dhis2_reporting_rate_dataset.ipynb | 237 +- ..._dhis2_reporting_rate_dataset_report.ipynb | 2585 ++++++++--------- .../utils/snt_dhis2_reporting_rate_dataset.r | 109 + .../pipeline.py | 89 +- snt_dhis2_reporting_rate_dataset/pipeline.py | 97 +- 7 files changed, 2780 insertions(+), 2732 deletions(-) create mode 100644 pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r create mode 100644 pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index 931e223..8f46501 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -1,1232 +1,1068 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2026-01-16T10:22:53.011120", - "exception": false, - "start_time": "2026-01-16T10:22:53.010947", - "status": "completed" + "cells": [ + { + "cell_type": "markdown", + "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2026-01-16T10:22:53.011120", + "exception": false, + "start_time": "2026-01-16T10:22:53.010947", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Data Element reporting rate: based on reporting of one or more indicators\n", + "Partially following methods by WHO and as per Diallo (2025) paper\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted 
data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", + "For this method the user is allowed to **choose** how to calculate both the **numerator** and **denominator**.
\n", + "\n", + "Specifically: \n", + "\n", + "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "
\n", + "
\n", + "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", + "
\n", + "
\n", + "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", + " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", + " * Filename: `XXX_reporting_rate_dataelement.`" + ] }, - "tags": [] - }, - "source": [ - "# Data Element reporting rate: based on reporting of one or more indicators\n", - "Partially following methods by WHO and as per Diallo (2025) paper\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "For this method the user is allowed to **chose** how to calculate both the **numerator** and **denominator**.
\n", - "\n", - "Specifically: \n", - "\n", - "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "
\n", - "
\n", - "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", - "
\n", - "
\n", - "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", - " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", - " * Filename: `XXX_reporting_rate_dataelement.`" - ] - }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 0.000228, - "end_time": "2026-01-16T10:22:53.014752", - "exception": false, - "start_time": "2026-01-16T10:22:53.014524", - "status": "completed" + { + "cell_type": "markdown", + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", + "metadata": { + "papermill": { + "duration": 0.000228, + "end_time": "2026-01-16T10:22:53.014752", + "exception": false, + "start_time": "2026-01-16T10:22:53.014524", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 1. Setup" + ] }, - "tags": [] - }, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "papermill": { - "duration": 63.150489, - "end_time": "2026-01-16T10:23:56.165530", - "exception": false, - "start_time": "2026-01-16T10:22:53.015041", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "35ede7cf-257f-439c-a514-26a7290f881d", + "metadata": { + "papermill": { + "duration": 63.150489, + "end_time": "2026-01-16T10:23:56.165530", + "exception": false, + "start_time": "2026-01-16T10:22:53.015041", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Project paths\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", + "\n", + "# 
Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", + "\n", + "# Load libraries\n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\", \"zoo\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Environment variables\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load OpenHEXA sdk\n", + "openhexa <- import(\"openhexa.sdk\")\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00011, - "end_time": "2026-01-16T10:23:56.165873", - "exception": false, - "start_time": "2026-01-16T10:23:56.165763", - "status": "completed" + { + "cell_type": "markdown", + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", + "metadata": { + "papermill": { + "duration": 0.00011, + 
"end_time": "2026-01-16T10:23:56.165873", + "exception": false, + "start_time": "2026-01-16T10:23:56.165763", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1.1. Fallback parameters values\n", + "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", + "The code cell below here provides fallback parameter values needed when running this notebook locally." + ] }, - "tags": [] - }, - "source": [ - "### 1.1. Fallback parameters values\n", - "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", - "The code cell below here provides fallback paramater values needed when running this notebook locally." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "papermill": { - "duration": 0.033954, - "end_time": "2026-01-16T10:23:56.199937", - "exception": false, - "start_time": "2026-01-16T10:23:56.165983", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", + "metadata": { + "papermill": { + "duration": 0.033954, + "end_time": "2026-01-16T10:23:56.199937", + "exception": false, + "start_time": "2026-01-16T10:23:56.165983", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Current options: \n", + "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", + "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", + "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", + "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", + "\n", + "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", + "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", + "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", + "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Current options:\n", - "# \"COUNTRY_CODE_routine.parquet\" (raw)\n", - "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", - "# \"COUNTRY_CODE_routine_outliers_removed.parquet\"\n", - "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"NER_routine_outliers_imputed.parquet\"}\n", - "\n", 
- "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", - "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", - "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", - "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" - ] - }, - { - "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 9.5e-05, - "end_time": "2026-01-16T10:23:56.200231", - "exception": false, - "start_time": "2026-01-16T10:23:56.200136", - "status": "completed" + { + "cell_type": "markdown", + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", + "metadata": { + "papermill": { + "duration": 0.000095, + "end_time": "2026-01-16T10:23:56.200231", + "exception": false, + "start_time": "2026-01-16T10:23:56.200136", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1.2. Load and check `snt config` file" + ] }, - "tags": [] - }, - "source": [ - "### 1.2. 
Load and check `snt config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "papermill": { - "duration": 0.521572, - "end_time": "2026-01-16T10:23:56.721932", - "exception": false, - "start_time": "2026-01-16T10:23:56.200360", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", + "metadata": { + "papermill": { + "duration": 0.521572, + "end_time": "2026-01-16T10:23:56.721932", + "exception": false, + "start_time": "2026-01-16T10:23:56.200360", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "papermill": { - "duration": 0.033003, - "end_time": "2026-01-16T10:23:56.755117", - "exception": false, - "start_time": "2026-01-16T10:23:56.722114", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": 
"c26c981c-dadd-48ac-ae35-613b8ba61a82", + "metadata": { + "papermill": { + "duration": 0.033003, + "end_time": "2026-01-16T10:23:56.755117", + "exception": false, + "start_time": "2026-01-16T10:23:56.722114", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", + "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", + "\n", + "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", + "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", + "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# How to treat 0 values (in this case: \"SET_0_TO_NA\" converts 0 to NAs)\n", - "# 🚨 NOTE (2025-01-09): The configuration field `NA_TREATMENT` has been removed from SNT_config.json files.\n", - "# It was legacy code from Ousmane and was only used for Reporting Rate calculations (not anymore).\n", - "# It has been replaced by `0_VALUES_PRESERVED` (boolean: true/false) which specifies whether zero values\n", - "# are stored in the DHIS2 instance (true) or converted to NULL to save space (false).\n", - "# See: https://bluesquare.atlassian.net/browse/SNT25-158\n", - "# The 
variable `NA_TREATMENT` is kept here for backward compatibility but is no longer loaded from config.\n", - "NA_TREATMENT <- config_json$SNT_CONFIG$NA_TREATMENT\n", - "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) \n", - "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", - "\n", - "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", - "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", - "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ] - }, - { - "cell_type": "markdown", - "id": "8bf4a8bb", - "metadata": {}, - "source": [ - "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18b40207", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!length(ACTIVITY_INDICATORS) > 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 9.3e-05, - "end_time": "2026-01-16T10:23:56.779812", - "exception": false, - "start_time": "2026-01-16T10:23:56.779719", - "status": "completed" + { + "cell_type": "markdown", + "id": "8bf4a8bb", + "metadata": {}, + "source": [ + "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", + "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", + "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." + ] }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 6.9e-05, - "end_time": "2026-01-16T10:23:56.779987", - "exception": false, - "start_time": "2026-01-16T10:23:56.779918", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "18b40207", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!length(ACTIVITY_INDICATORS) > 0) {\n", + " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", + " cat(msg) \n", + " stop(msg)\n", + "}" + ] }, - "tags": [] - }, - "source": [ - "### 2.1. Routine data (DHIS2) \n", - "**Note on pipeline behaviour**:
\n", - "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "papermill": { - "duration": 2.018878, - "end_time": "2026-01-16T10:23:58.798963", - "exception": false, - "start_time": "2026-01-16T10:23:56.780085", - "status": "completed" + { + "cell_type": "markdown", + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", + "metadata": { + "papermill": { + "duration": 0.000093, + "end_time": "2026-01-16T10:23:56.779812", + "exception": false, + "start_time": "2026-01-16T10:23:56.779719", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 2. Load Data" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# select dataset\n", - "if (ROUTINE_FILE == glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "} else {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "}\n", - " \n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", - "\n", - "# log\n", - "log_msg(glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset: {rountine_dataset_name}. 
Dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", - "metadata": { - "papermill": { - "duration": 0.000138, - "end_time": "2026-01-16T10:23:58.799287", - "exception": false, - "start_time": "2026-01-16T10:23:58.799149", - "status": "completed" + { + "cell_type": "markdown", + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", + "metadata": { + "papermill": { + "duration": 0.000069, + "end_time": "2026-01-16T10:23:56.779987", + "exception": false, + "start_time": "2026-01-16T10:23:56.779918", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.1. Routine data (DHIS2) \n", + "**Note on pipeline behaviour**:
\n", + "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." + ] }, - "tags": [] - }, - "source": [ - "### 2.2. Organisation units (DHIS2 pyramid)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fd92901-901e-4019-be78-a7718050c1c4", - "metadata": { - "papermill": { - "duration": 0.992899, - "end_time": "2026-01-16T10:23:59.792385", - "exception": false, - "start_time": "2026-01-16T10:23:58.799486", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "a1213723-f7e2-4238-9f37-f1795b187232", + "metadata": { + "papermill": { + "duration": 2.018878, + "end_time": "2026-01-16T10:23:58.798963", + "exception": false, + "start_time": "2026-01-16T10:23:56.780085", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rountine_dataset_name <- select_routine_dataset_name_dataelement(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", + "dhis2_routine <- load_routine_data_dataelement(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE)\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 2)\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load file from dataset\n", - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "dhis2_pyramid_formatted <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, \"_pyramid.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 pyramid FORMATTED data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - " \n", - "msg <- paste0(\"DHIS2 pyramid FORMATTED data loaded from dataset: `\", dataset_name, \"`. 
Dataframe dimensions: \", paste(dim(dhis2_pyramid_formatted), collapse=\", \"))\n", - "log_msg(msg)\n", - "dim(dhis2_pyramid_formatted)\n", - "head(dhis2_pyramid_formatted,2)" - ] - }, - { - "cell_type": "markdown", - "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", - "metadata": { - "papermill": { - "duration": 0.000106, - "end_time": "2026-01-16T10:23:59.792710", - "exception": false, - "start_time": "2026-01-16T10:23:59.792604", - "status": "completed" + { + "cell_type": "markdown", + "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", + "metadata": { + "papermill": { + "duration": 0.000138, + "end_time": "2026-01-16T10:23:58.799287", + "exception": false, + "start_time": "2026-01-16T10:23:58.799149", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.2. Organisation units (DHIS2 pyramid)" + ] }, - "tags": [] - }, - "source": [ - "### 2.3. Check whether selected indicators are present in routine data\n", - "Extra precaution measure to avoid breaks downstream.
\n", - "\n", - "Note: This logic should be moved to pipeline.py 🐍" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", - "metadata": { - "papermill": { - "duration": 0.024863, - "end_time": "2026-01-16T10:23:59.817677", - "exception": false, - "start_time": "2026-01-16T10:23:59.792814", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "2fd92901-901e-4019-be78-a7718050c1c4", + "metadata": { + "papermill": { + "duration": 0.992899, + "end_time": "2026-01-16T10:23:59.792385", + "exception": false, + "start_time": "2026-01-16T10:23:58.799486", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhis2_pyramid_formatted <- load_pyramid_data_dataelement(config_json, COUNTRY_CODE)\n", + "dim(dhis2_pyramid_formatted)\n", + "head(dhis2_pyramid_formatted, 2)\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", - "}\n", - "\n", - "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", - " cat(msg)\n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", - "metadata": { - "papermill": { - "duration": 9.1e-05, - "end_time": "2026-01-16T10:23:59.817949", - "exception": false, - "start_time": "2026-01-16T10:23:59.817858", - "status": "completed" + { + "cell_type": "markdown", + "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", + "metadata": { + "papermill": { + "duration": 0.000106, + "end_time": "2026-01-16T10:23:59.792710", + "exception": false, + "start_time": "2026-01-16T10:23:59.792604", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.3. Check whether selected indicators are present in routine data\n", + "Extra precaution measure to avoid breaks downstream.
\n", + "\n", + "Note: This logic should be moved to pipeline.py 🐍" + ] }, - "tags": [] - }, - "source": [ - "## 3. Reporting rates computations" - ] - }, - { - "cell_type": "markdown", - "id": "7d62cdb6", - "metadata": {}, - "source": [ - "#### 3.0. Define start and end period based on routine data " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", - "metadata": { - "papermill": { - "duration": 0.044172, - "end_time": "2026-01-16T10:23:59.862224", - "exception": false, - "start_time": "2026-01-16T10:23:59.818052", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", + "metadata": { + "papermill": { + "duration": 0.024863, + "end_time": "2026-01-16T10:23:59.817677", + "exception": false, + "start_time": "2026-01-16T10:23:59.792814", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", + "}\n", + "\n", + "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", + " cat(msg)\n", + " stop(msg)\n", + "}" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", - "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", - "\n", - "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", - "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:23:59.862555", - "exception": false, - "start_time": "2026-01-16T10:23:59.862446", - "status": "completed" + { + "cell_type": "markdown", + "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", + "metadata": { + "papermill": { + "duration": 0.000091, + "end_time": "2026-01-16T10:23:59.817949", + "exception": false, + "start_time": "2026-01-16T10:23:59.817858", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 3. Reporting rates computations" + ] }, - "tags": [] - }, - "source": [ - "#### 3.1. Build master table (all PERIOD x OU)\n", - "The master table contains all combinations of period x organisation unit " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9308197a-0852-4d34-8888-cf5564f35a9d", - "metadata": { - "papermill": { - "duration": 0.289128, - "end_time": "2026-01-16T10:24:00.151791", - "exception": false, - "start_time": "2026-01-16T10:23:59.862663", - "status": "completed" + { + "cell_type": "markdown", + "id": "7d62cdb6", + "metadata": {}, + "source": [ + "#### 3.0. Define start and end period based on routine data " + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. 
Periods count: {length(period_vector)}\"))\n", - "\n", - "facility_master <- dhis2_pyramid_formatted %>%\n", - " rename(\n", - " OU_ID = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID\"),\n", - " OU_NAME = glue::glue(\"LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_NAME\"),\n", - " ADM2_ID = str_replace(ADMIN_2, \"NAME\", \"ID\"),\n", - " ADM2_NAME = all_of(ADMIN_2),\n", - " ADM1_ID = str_replace(ADMIN_1, \"NAME\", \"ID\"),\n", - " ADM1_NAME = all_of(ADMIN_1)\n", - " ) %>%\n", - " select(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, OPENING_DATE, CLOSED_DATE) %>%\n", - " distinct() %>%\n", - " tidyr::crossing(PERIOD = period_vector) %>%\n", - " mutate(PERIOD=as.numeric(PERIOD))\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", - "metadata": { - "papermill": { - "duration": 0.000114, - "end_time": "2026-01-16T10:24:00.152094", - "exception": false, - "start_time": "2026-01-16T10:24:00.151980", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", + "metadata": { + "papermill": { + "duration": 0.044172, + "end_time": "2026-01-16T10:23:59.862224", + "exception": false, + "start_time": "2026-01-16T10:23:59.818052", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", + "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", + "\n", + "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", + "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" + ] }, - "tags": [] - }, - "source": [ - "#### 3.2. Identify \"Active\" facilities\n", - "\n", - "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. 
Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b279d27", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Assessing facility reporting activity based on the following indicators: {paste(ACTIVITY_INDICATORS, collapse=', ')}\"))\n", - "\n", - "facility_master_routine <- left_join(\n", - " facility_master,\n", - " # dhis2_routine %>% select(OU_ID, PERIOD, all_of(DHIS2_INDICATORS)), # GP 2026-02-04\n", - " dhis2_routine %>% select(OU_ID, PERIOD, any_of(DHIS2_INDICATORS)), \n", - " by = c(\"OU_ID\", \"PERIOD\")\n", - " ) %>%\n", - " mutate(\n", - " YEAR = as.numeric(substr(PERIOD, 1, 4)),\n", - " ACTIVE_THIS_PERIOD = ifelse(\n", - " rowSums(!is.na(across(all_of(ACTIVITY_INDICATORS))) & across(all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0), \n", - " COUNT = 1 # Counting every facility\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", - "metadata": { - "papermill": { - "duration": 0.000107, - "end_time": "2026-01-16T10:24:01.626760", - "exception": false, - "start_time": "2026-01-16T10:24:01.626653", - "status": "completed" + { + "cell_type": "markdown", + "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:23:59.862555", + "exception": false, + "start_time": "2026-01-16T10:23:59.862446", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.1. Build master table (all PERIOD x OU)\n", + "The master table contains all combinations of period x organisation unit " + ] }, - "tags": [] - }, - "source": [ - "#### 3.3. 
Identify `OPEN` facilities (denominator)\n", - "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", - "\n", - "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", - "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", - "\n", - "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", - "\n", - "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", - "metadata": { - "papermill": { - "duration": 1.317923, - "end_time": "2026-01-16T10:24:02.944800", - "exception": false, - "start_time": "2026-01-16T10:24:01.626877", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "9308197a-0852-4d34-8888-cf5564f35a9d", + "metadata": { + "papermill": { + "duration": 0.289128, + "end_time": "2026-01-16T10:24:00.151791", + "exception": false, + "start_time": "2026-01-16T10:23:59.862663", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. 
Periods count: {length(period_vector)}\"))\n", + "facility_master <- build_facility_master_dataelement(\n", + " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", + " period_vector = period_vector,\n", + " config_json = config_json,\n", + " ADMIN_1 = ADMIN_1,\n", + " ADMIN_2 = ADMIN_2\n", + ")\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "facility_master_routine <- facility_master_routine %>%\n", - " mutate(\n", - " period_date = as.Date(ym(PERIOD)),\n", - " \n", - " # Flag facilities explicitly marked as closed in their name\n", - " NAME_CLOSED = str_detect(\n", - " toupper(OU_NAME),\n", - " \"CLOTUR|FERM(E|EE)?\"\n", - " ),\n", - "\n", - " # Check whether the facility is open during the period using open/close dates\n", - " OPEN_BY_DATE = \n", - " !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date |\n", - " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)\n", - " ),\n", - " \n", - " # Final definition of an open facility for the period:\n", - " # not explicitly closed, within opening/closing dates,\n", - " # and started reporting\n", - " OPEN = ifelse(\n", - " !NAME_CLOSED & OPEN_BY_DATE,\n", - " 1, 0\n", - " )\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "657fd6ca", - "metadata": {}, - "source": [ - "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" - ] - }, - { - "cell_type": "markdown", - "id": "a598e4b7", - "metadata": {}, - "source": [ - "
\n", - " Important: this step could have a huge influence on reporting rates!
\n", - " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", - "
    \n", - "
  • With YEAR → “active that year”
  • \n", - "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", - "
\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", - "metadata": { - "papermill": { - "duration": 0.173961, - "end_time": "2026-01-16T10:24:05.948136", - "exception": false, - "start_time": "2026-01-16T10:24:05.774175", - "status": "completed" + { + "cell_type": "markdown", + "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", + "metadata": { + "papermill": { + "duration": 0.000114, + "end_time": "2026-01-16T10:24:00.152094", + "exception": false, + "start_time": "2026-01-16T10:24:00.151980", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.2. Identify \"Active\" facilities\n", + "\n", + "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Flag facilities with at least one report in the year\n", - "facility_master_routine_01 <- facility_master_routine %>%\n", - " group_by(OU_ID, YEAR) %>%\n", - " mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>% # use max() to flag if ACTIVE_THIS_PERIOD is 1 at least once\n", - " ungroup()" - ] - }, - { - "cell_type": "markdown", - "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", - "metadata": { - "papermill": { - "duration": 9.8e-05, - "end_time": "2026-01-16T10:24:05.948452", - "exception": false, - "start_time": "2026-01-16T10:24:05.948354", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "7b279d27", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "reporting_rate_dataelement <- compute_reporting_rate_dataelement(\n", + " facility_master = facility_master,\n", + " dhis2_routine = dhis2_routine,\n", + " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", + " 
ACTIVITY_INDICATORS = ACTIVITY_INDICATORS,\n", + " VOLUME_ACTIVITY_INDICATORS = VOLUME_ACTIVITY_INDICATORS,\n", + " DATAELEMENT_METHOD_DENOMINATOR = DATAELEMENT_METHOD_DENOMINATOR,\n", + " USE_WEIGHTED_REPORTING_RATES = USE_WEIGHTED_REPORTING_RATES\n", + ")\n" + ] }, - "tags": [] - }, - "source": [ - "#### 3.5. Compute Weighting factor based on \"volume of activity\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4420e559-4134-4fc3-8950-9972ebede00e", - "metadata": { - "papermill": { - "duration": 0.520673, - "end_time": "2026-01-16T10:24:06.469233", - "exception": false, - "start_time": "2026-01-16T10:24:05.948560", - "status": "completed" + { + "cell_type": "markdown", + "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", + "metadata": { + "papermill": { + "duration": 0.000107, + "end_time": "2026-01-16T10:24:01.626760", + "exception": false, + "start_time": "2026-01-16T10:24:01.626653", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.3. Identify `OPEN` facilities (denominator)\n", + "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", + "\n", + "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", + "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", + "\n", + "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", + "\n", + "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." 
+ ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Computing volume of activity using indicator: {paste(VOLUME_ACTIVITY_INDICATORS, collapse=', ')}\"))\n", - "\n", - "# Compute MEAN_REPORTED_CASES_BY_HF as total cases over months with activity\n", - "mean_monthly_cases <- dhis2_routine %>% \n", - " mutate(total_cases_by_hf_month = rowSums(across(all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>%\n", - " group_by(ADM2_ID, OU_ID) %>% \n", - " summarise(\n", - " total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE),\n", - " number_of_reporting_months = length(which(total_cases_by_hf_month > 0)),\n", - " .groups = \"drop\"\n", - " ) %>% \n", - " mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>%\n", - " select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF)\n", - "\n", - "mean_monthly_cases_adm2 <- mean_monthly_cases %>% \n", - " select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>% \n", - " group_by(ADM2_ID) %>% \n", - " summarise(SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm=TRUE), \n", - " NR_OF_HF = n())\n", - "\n", - "# Compute weights\n", - "hf_weights <- mean_monthly_cases %>% \n", - " left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", - " mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n", - "\n", - "# Join with rest of data\n", - "facility_master_routine_02 <- facility_master_routine_01 %>%\n", - " left_join(hf_weights %>% select(OU_ID, WEIGHT), by = c(\"OU_ID\"))" - ] - }, - { - "cell_type": "markdown", - "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:06.469622", - "exception": false, - "start_time": "2026-01-16T10:24:06.469514", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", + "metadata": { + "papermill": { 
+ "duration": 1.317923, + "end_time": "2026-01-16T10:24:02.944800", + "exception": false, + "start_time": "2026-01-16T10:24:01.626877", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [] - }, - "source": [ - "#### 3.6. Compute Weighted variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", - "metadata": { - "papermill": { - "duration": 0.483413, - "end_time": "2026-01-16T10:24:06.953139", - "exception": false, - "start_time": "2026-01-16T10:24:06.469726", - "status": "completed" + { + "cell_type": "markdown", + "id": "657fd6ca", + "metadata": {}, + "source": [ + "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Computing weighted variables for reporting rate calculation.\"))\n", - "\n", - "facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT \n", - "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n", - "\n", - "dim(facility_master_routine_02)\n", - "head(facility_master_routine_02, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2026-01-16T10:24:06.953755", - "exception": false, - "start_time": "2026-01-16T10:24:06.953583", - "status": "completed" + { + "cell_type": "markdown", + "id": "a598e4b7", + "metadata": {}, + "source": [ + "
\n", + " Important: this step could have a huge influence on reporting rates!
\n", + " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", + "
    \n", + "
  • With YEAR → “active that year”
  • \n", + "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", + "
\n", + "
" + ] }, - "tags": [] - }, - "source": [ - "#### 3.7. Aggregate data at ADM2 level" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af13191e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Aggregating data at admin level 2.\"))\n", - "\n", - "reporting_rate_adm2 <- facility_master_routine_02 %>% \n", - " group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>%\n", - " summarise(\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE), # (numerator) sum of all facilities active per PERIOD\n", - " NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE), # (denominator) sum of all facilities active at least once in the YEAR\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE),\n", - " NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE), \n", - " .groups = \"drop\")\n", - "\n", - "dim(reporting_rate_adm2)\n", - "# head(reporting_rate_adm2, 5)" - ] - }, - { - "cell_type": "markdown", - "id": "7d381937", - "metadata": {}, - "source": [ - "#### 3.8. Calculate Reporting Rates (all methods)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b41263f8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Calculating Reporting Rates at admin level 2. 
Using all methods, weighted and unweighted.\"))\n", - "\n", - "reporting_rate_adm2 <- reporting_rate_adm2 %>% \n", - " mutate(\n", - " RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2,\n", - " RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2,\n", - " RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2,\n", - " RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED,\n", - " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", - " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", - " )\n", - "\n", - "dim(reporting_rate_adm2)\n", - "head(reporting_rate_adm2, 5)" - ] - }, - { - "cell_type": "markdown", - "id": "5e593659", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:07.310579", - "exception": false, - "start_time": "2026-01-16T10:24:07.310471", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", + "metadata": { + "papermill": { + "duration": 0.173961, + "end_time": "2026-01-16T10:24:05.948136", + "exception": false, + "start_time": "2026-01-16T10:24:05.774175", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [] - }, - "source": [ - "## 4. 
Select correct col for `REPORTING_RATE` based on denominator method" - ] - }, - { - "cell_type": "markdown", - "id": "c75f2249", - "metadata": { - "papermill": { - "duration": 5.7e-05, - "end_time": "2026-01-16T10:24:07.310743", - "exception": false, - "start_time": "2026-01-16T10:24:07.310686", - "status": "completed" + { + "cell_type": "markdown", + "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", + "metadata": { + "papermill": { + "duration": 0.000098, + "end_time": "2026-01-16T10:24:05.948452", + "exception": false, + "start_time": "2026-01-16T10:24:05.948354", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.5. Compute Weighting factor based on \"volume of activity\"" + ] }, - "tags": [] - }, - "source": [ - "### 4.1. Select results and format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e71b38", - "metadata": { - "papermill": { - "duration": 0.020644, - "end_time": "2026-01-16T10:24:07.351317", - "exception": false, - "start_time": "2026-01-16T10:24:07.330673", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "4420e559-4134-4fc3-8950-9972ebede00e", + "metadata": { + "papermill": { + "duration": 0.520673, + "end_time": "2026-01-16T10:24:06.469233", + "exception": false, + "start_time": "2026-01-16T10:24:05.948560", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") { \n", - " rr_column_selection <- \"RR_ACTIVE_HF\" \n", - " if (USE_WEIGHTED_REPORTING_RATES) {\n", - " rr_column_selection <- \"RR_ACTIVE_HF_W\"\n", - " }\n", - "} else if (DATAELEMENT_METHOD_DENOMINATOR == \"PYRAMID_OPEN_FACILITIES\") {\n", - " rr_column_selection <- \"RR_OPEN_HF\"\n", - " if (USE_WEIGHTED_REPORTING_RATES) {\n", - " 
rr_column_selection <- \"RR_OPEN_HF_W\"\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3df36abb", - "metadata": { - "papermill": { - "duration": 0.140976, - "end_time": "2026-01-16T10:24:07.492479", - "exception": false, - "start_time": "2026-01-16T10:24:07.351503", - "status": "completed" + { + "cell_type": "markdown", + "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:06.469622", + "exception": false, + "start_time": "2026-01-16T10:24:06.469514", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.6. Compute Weighted variables" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Using reporting rate column: `{rr_column_selection}` \n", - "based on DATAELEMENT_METHOD_DENOMINATOR == {DATAELEMENT_METHOD_DENOMINATOR} \n", - "and USE_WEIGHTED_REPORTING_RATES == {USE_WEIGHTED_REPORTING_RATES}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ccc272c", - "metadata": { - "papermill": { - "duration": 0.182574, - "end_time": "2026-01-16T10:24:07.675242", - "exception": false, - "start_time": "2026-01-16T10:24:07.492668", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", + "metadata": { + "papermill": { + "duration": 0.483413, + "end_time": "2026-01-16T10:24:06.953139", + "exception": false, + "start_time": "2026-01-16T10:24:06.469726", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Formatting table for '{DATAELEMENT_METHOD_DENOMINATOR}' selection.\"))\n", - "\n", - "# Select column and format final table\n", - "reporting_rate_dataelement <- 
reporting_rate_adm2 %>%\n", - " mutate(MONTH = PERIOD %% 100) %>%\n", - " rename(REPORTING_RATE = !!sym(rr_column_selection)) %>%\n", - " select(all_of(fixed_cols_rr))\n", - "\n", - "print(dim(reporting_rate_dataelement))\n", - "head(reporting_rate_dataelement, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "ca66e785", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:24:07.675637", - "exception": false, - "start_time": "2026-01-16T10:24:07.675528", - "status": "completed" + { + "cell_type": "markdown", + "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2026-01-16T10:24:06.953755", + "exception": false, + "start_time": "2026-01-16T10:24:06.953583", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.7. Aggregate data at ADM2 level" + ] }, - "tags": [] - }, - "source": [ - "## 5. Inspect reporting rate values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31535459", - "metadata": { - "papermill": { - "duration": 0.160299, - "end_time": "2026-01-16T10:24:07.836039", - "exception": false, - "start_time": "2026-01-16T10:24:07.675740", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "af13191e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", - "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", - "xlab=\"REPORTING_RATE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6778f17d", - "metadata": { - "papermill": { - "duration": 0.896382, - "end_time": "2026-01-16T10:24:08.732660", - "exception": 
false, - "start_time": "2026-01-16T10:24:07.836278", - "status": "completed" + { + "cell_type": "markdown", + "id": "7d381937", + "metadata": {}, + "source": [ + "#### 3.8. Calculate Reporting Rates (all methods)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Boxplot\n", - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - " geom_boxplot(outlier.alpha = 0.3) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7f013fd", - "metadata": { - "papermill": { - "duration": 0.859448, - "end_time": "2026-01-16T10:24:09.592295", - "exception": false, - "start_time": "2026-01-16T10:24:08.732847", - "status": "completed" + { + "cell_type": "code", + "execution_count": null, + "id": "b41263f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "5e593659", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:07.310579", + "exception": false, + "start_time": "2026-01-16T10:24:07.310471", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 4. Select correct col for `REPORTING_RATE` based on denominator method" + ] }, - "tags": [], - "vscode": { - "languageId": "r" + { + "cell_type": "markdown", + "id": "c75f2249", + "metadata": { + "papermill": { + "duration": 0.000057, + "end_time": "2026-01-16T10:24:07.310743", + "exception": false, + "start_time": "2026-01-16T10:24:07.310686", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 4.1. 
Select results and format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75e71b38", + "metadata": { + "papermill": { + "duration": 0.020644, + "end_time": "2026-01-16T10:24:07.351317", + "exception": false, + "start_time": "2026-01-16T10:24:07.330673", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3df36abb", + "metadata": { + "papermill": { + "duration": 0.140976, + "end_time": "2026-01-16T10:24:07.492479", + "exception": false, + "start_time": "2026-01-16T10:24:07.351503", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ccc272c", + "metadata": { + "papermill": { + "duration": 0.182574, + "end_time": "2026-01-16T10:24:07.675242", + "exception": false, + "start_time": "2026-01-16T10:24:07.492668", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Moved to utils for readability.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ca66e785", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:24:07.675637", + "exception": false, + "start_time": "2026-01-16T10:24:07.675528", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 5. 
Inspect reporting rate values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31535459", + "metadata": { + "papermill": { + "duration": 0.160299, + "end_time": "2026-01-16T10:24:07.836039", + "exception": false, + "start_time": "2026-01-16T10:24:07.675740", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", + "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", + "xlab=\"REPORTING_RATE\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6778f17d", + "metadata": { + "papermill": { + "duration": 0.896382, + "end_time": "2026-01-16T10:24:08.732660", + "exception": false, + "start_time": "2026-01-16T10:24:07.836278", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Boxplot\n", + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + " geom_boxplot(outlier.alpha = 0.3) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7f013fd", + "metadata": { + "papermill": { + "duration": 0.859448, + "end_time": "2026-01-16T10:24:09.592295", + "exception": false, + "start_time": "2026-01-16T10:24:08.732847", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + "# Boxplot 
without outliers\n", + " geom_boxplot(outlier.alpha = 0) +\n", + " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "markdown", + "id": "2866816a-7015-4c5c-b904-f553f3b4790d", + "metadata": { + "papermill": { + "duration": 0.000088, + "end_time": "2026-01-16T10:24:09.592563", + "exception": false, + "start_time": "2026-01-16T10:24:09.592475", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 5. 📁 Export to `data/` folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", + "metadata": { + "papermill": { + "duration": 0.919937, + "end_time": "2026-01-16T10:24:10.512602", + "exception": false, + "start_time": "2026-01-16T10:24:09.592665", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "export_reporting_rate_dataelement(\n", + " reporting_rate_dataelement = reporting_rate_dataelement,\n", + " DATA_PATH = DATA_PATH,\n", + " COUNTRY_CODE = COUNTRY_CODE\n", + ")\n" + ] } - }, - "outputs": [], - "source": [ - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - "# Boxplot without outliers\n", - " geom_boxplot(outlier.alpha = 0) +\n", - " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - 
" ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "2866816a-7015-4c5c-b904-f553f3b4790d", - "metadata": { - "papermill": { - "duration": 8.8e-05, - "end_time": "2026-01-16T10:24:09.592563", - "exception": false, - "start_time": "2026-01-16T10:24:09.592475", - "status": "completed" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "tags": [] - }, - "source": [ - "## 5. 📁 Export to `data/` folder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", - "metadata": { - "papermill": { - "duration": 0.919937, - "end_time": "2026-01-16T10:24:10.512602", - "exception": false, - "start_time": "2026-01-16T10:24:09.592665", - "status": "completed" + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, - "tags": [], - "vscode": { - "languageId": "r" + "papermill": { + "default_parameters": {}, + "duration": 81.158347, + "end_time": "2026-01-16T10:24:10.736106", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", + "parameters": { + "AVAILABILITY_INDICATORS": [ + "CONF", + "PRES", + "SUSP", + "TEST" + ], + "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", + "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace", + "USE_WEIGHTED_REPORTING_RATES": true, + "VOLUME_ACTIVITY_INDICATORS": [ + "CONF", + "PRES" + ] + }, + "start_time": "2026-01-16T10:22:49.577759", + "version": "2.6.0" } - }, - "outputs": [], - "source": [ - "output_data_path <- file.path(DATA_PATH, 
\"reporting_rate\")\n", - "\n", - "# parquet\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\"))\n", - "write_parquet(reporting_rate_dataelement, file_path)\n", - "log_msg(glue(\"Exported : {file_path}\"))\n", - "\n", - "# csv\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\"))\n", - "write.csv(reporting_rate_dataelement, file_path, row.names = FALSE)\n", - "log_msg(glue(\"Exported : {file_path}\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" }, - "papermill": { - "default_parameters": {}, - "duration": 81.158347, - "end_time": "2026-01-16T10:24:10.736106", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", - "parameters": { - "AVAILABILITY_INDICATORS": [ - "CONF", - "PRES", - "SUSP", - "TEST" - ], - "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", - "ROUTINE_FILE": "NER_routine_outliers_removed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace", - "USE_WEIGHTED_REPORTING_RATES": true, - "VOLUME_ACTIVITY_INDICATORS": [ - "CONF", - "PRES" - ] - }, - "start_time": "2026-01-16T10:22:49.577759", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r new file 
mode 100644 index 0000000..c73601b --- /dev/null +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r @@ -0,0 +1,183 @@ +select_routine_dataset_name_dataelement <- function(ROUTINE_FILE, COUNTRY_CODE, config_json) { + if (ROUTINE_FILE == glue::glue("{COUNTRY_CODE}_routine.parquet")) { + return(config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED) + } + config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION +} + + +load_routine_data_dataelement <- function(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE) { + dhis2_routine <- tryCatch({ + get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) + }, error = function(e) { + msg <- paste("[ERROR] Error while loading DHIS2 routine data file for: ", COUNTRY_CODE, conditionMessage(e)) + cat(msg) + stop(msg) + }) + + dhis2_routine <- dhis2_routine %>% + dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric)) + + log_msg(glue::glue( + "DHIS2 routine file {ROUTINE_FILE} loaded from dataset: {rountine_dataset_name}. Dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}" + )) + + dhis2_routine +} + + +load_pyramid_data_dataelement <- function(config_json, COUNTRY_CODE) { + dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED + + dhis2_pyramid_formatted <- tryCatch({ + get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_pyramid.parquet")) + }, error = function(e) { + msg <- paste("Error while loading DHIS2 pyramid FORMATTED data file for: ", COUNTRY_CODE, conditionMessage(e)) + cat(msg) + stop(msg) + }) + + log_msg(paste0( + "DHIS2 pyramid FORMATTED data loaded from dataset: `", dataset_name, + "`. 
Dataframe dimensions: ", paste(dim(dhis2_pyramid_formatted), collapse = ", ") + )) + + dhis2_pyramid_formatted +} + + +build_facility_master_dataelement <- function( + dhis2_pyramid_formatted, + period_vector, + config_json, + ADMIN_1, + ADMIN_2 +) { + dhis2_pyramid_formatted %>% + dplyr::rename( + OU_ID = glue::glue("LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_ID"), + OU_NAME = glue::glue("LEVEL_{config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL}_NAME"), + ADM2_ID = stringr::str_replace(ADMIN_2, "NAME", "ID"), + ADM2_NAME = dplyr::all_of(ADMIN_2), + ADM1_ID = stringr::str_replace(ADMIN_1, "NAME", "ID"), + ADM1_NAME = dplyr::all_of(ADMIN_1) + ) %>% + dplyr::select(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, OPENING_DATE, CLOSED_DATE) %>% + dplyr::distinct() %>% + tidyr::crossing(PERIOD = period_vector) %>% + dplyr::mutate(PERIOD = as.numeric(PERIOD)) +} + + +compute_reporting_rate_dataelement <- function( + facility_master, + dhis2_routine, + DHIS2_INDICATORS, + ACTIVITY_INDICATORS, + VOLUME_ACTIVITY_INDICATORS, + DATAELEMENT_METHOD_DENOMINATOR, + USE_WEIGHTED_REPORTING_RATES +) { + facility_master_routine <- dplyr::left_join( + facility_master, + dhis2_routine %>% dplyr::select(OU_ID, PERIOD, dplyr::any_of(DHIS2_INDICATORS)), + by = c("OU_ID", "PERIOD") + ) %>% + dplyr::mutate( + YEAR = as.numeric(substr(PERIOD, 1, 4)), + ACTIVE_THIS_PERIOD = ifelse( + rowSums(!is.na(dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS))) & + dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0 + ), + COUNT = 1 + ) %>% + dplyr::mutate( + period_date = as.Date(zoo::as.yearmon(as.character(PERIOD), "%Y%m")), + NAME_CLOSED = stringr::str_detect(toupper(OU_NAME), "CLOTUR|FERM(E|EE)?"), + OPEN_BY_DATE = !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date | + (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)), + OPEN = ifelse(!NAME_CLOSED & OPEN_BY_DATE, 1, 0) + ) %>% + dplyr::group_by(OU_ID, YEAR) %>% + dplyr::mutate(ACTIVE_THIS_YEAR = 
max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>% + dplyr::ungroup() + + mean_monthly_cases <- dhis2_routine %>% + dplyr::mutate(total_cases_by_hf_month = rowSums(dplyr::across(dplyr::all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>% + dplyr::group_by(ADM2_ID, OU_ID) %>% + dplyr::summarise( + total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE), + number_of_reporting_months = length(which(total_cases_by_hf_month > 0)), + .groups = "drop" + ) %>% + dplyr::mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>% + dplyr::select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF) + + mean_monthly_cases_adm2 <- mean_monthly_cases %>% + dplyr::select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>% + dplyr::group_by(ADM2_ID) %>% + dplyr::summarise( + SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm = TRUE), + NR_OF_HF = dplyr::n() + ) + + hf_weights <- mean_monthly_cases %>% + dplyr::left_join(mean_monthly_cases_adm2, by = "ADM2_ID") %>% + dplyr::mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF) + + facility_master_routine_02 <- facility_master_routine %>% + dplyr::left_join(hf_weights %>% dplyr::select(OU_ID, WEIGHT), by = c("OU_ID")) + + facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT + facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT + facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT + facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT + + reporting_rate_adm2 <- facility_master_routine_02 %>% + dplyr::group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>% + dplyr::summarise( + HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE), + NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = 
TRUE), + NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE), + HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE), + HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE), + NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE), + NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE), + HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::mutate( + RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2, + RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2, + RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2, + RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED, + RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED, + RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED + ) + + rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == "ROUTINE_ACTIVE_FACILITIES") "RR_ACTIVE_HF" else "RR_OPEN_HF" + if (USE_WEIGHTED_REPORTING_RATES) { + rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == "ROUTINE_ACTIVE_FACILITIES") "RR_ACTIVE_HF_W" else "RR_OPEN_HF_W" + } + + reporting_rate_adm2 %>% + dplyr::mutate(MONTH = PERIOD %% 100) %>% + dplyr::rename(REPORTING_RATE = !!rlang::sym(rr_column_selection)) %>% + dplyr::select(YEAR, MONTH, ADM2_ID, REPORTING_RATE) +} + + +export_reporting_rate_dataelement <- function(reporting_rate_dataelement, DATA_PATH, COUNTRY_CODE) { + output_data_path <- file.path(DATA_PATH, "reporting_rate") + if (!dir.exists(output_data_path)) { + dir.create(output_data_path, recursive = TRUE) + } + + file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataelement.parquet")) + arrow::write_parquet(reporting_rate_dataelement, file_path) + log_msg(glue::glue("Exported : {file_path}")) + + file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, 
"_reporting_rate_dataelement.csv")) + write.csv(reporting_rate_dataelement, file_path, row.names = FALSE) + log_msg(glue::glue("Exported : {file_path}")) +} diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index dcac610..f26b3b6 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -51,13 +51,9 @@ "\n", "### Pipeline parameters\n", "\n", - "- **Routine data source**: Select the routine dataset variant used for reporting rate computation.\n", - "\n", - "- **`raw`**: Loads routine data from the formatted dataset.\n", - "\n", - "- **`imputed`**: Loads routine data from the outliers dataset using imputed values.\n", - "\n", - "- **`outliers_removed`**: Loads routine data from the outliers dataset after outliers removal." + "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. Choose \"Routine data (Raw)\" to use raw routine data.\n", + " \n", + "- **Use routine with outliers removed**: Toggle this on to use the routine data after outliers have been removed (using the outliers detection method selected above). Else, this pipeline will use either the imputed routine data (to replace the outlier values removed) or the raw routine data if you selected \"Routine data (Raw)\" as your choice of “Outlier processing method”." 
] }, { @@ -103,16 +99,18 @@ "outputs": [], "source": [ "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') \n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') \n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataset\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", "\n", "# Load utils\n", "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataset.r\"))\n", "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\") \n", + "# Load libraries\n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\")\n", "install_and_load(required_packages)\n", "\n", "# Environment variables\n", @@ -121,7 +119,7 @@ "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", "\n", "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "openhexa <- import(\"openhexa.sdk\")\n" ] }, { @@ -377,34 +375,10 @@ }, "outputs": [], "source": [ - "# select dataset\n", - "if (ROUTINE_FILE == glue::glue(\"{COUNTRY_CODE}_routine.parquet\")) {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "} else {\n", - " rountine_dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "}\n", - "\n", - "# Load file from dataset\n", - "dhis2_routine <- tryCatch({ get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading DHIS2 routine data file for: \" , COUNTRY_CODE, 
conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "dhis2_routine <- dhis2_routine %>% mutate(across(c(PERIOD, YEAR, MONTH), as.numeric)) # Ensure correct data type for numerical columns \n", - "\n", - "# Subset data to keep only columns defined in fixed_cols_rr (if defined)\n", - "if (exists(\"fixed_cols_rr\")) {\n", - " dhis2_routine <- dhis2_routine %>% \n", - " select(any_of(fixed_cols_rr)) |> \n", - " distinct()\n", - "}\n", - "\n", - "# log\n", - "log_msg(glue::glue(\"DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}\"))\n", + "rountine_dataset_name <- select_routine_dataset_name_dataset(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", + "dhis2_routine <- load_routine_data_dataset(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE, fixed_cols_rr)\n", "dim(dhis2_routine)\n", - "head(dhis2_routine, 3)" + "head(dhis2_routine, 3)\n" ] }, { @@ -449,23 +423,8 @@ }, "outputs": [], "source": [ - "dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "file_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\") # reporting rate file\n", - "\n", - "# Load file from dataset\n", - "dhis2_reporting <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, file_name) }, \n", - " error = function(e) {\n", - " msg <- paste(\"[ERROR] Error while loading DHIS2 dataset reporting rates file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "dhis2_reporting <- dhis2_reporting %>% mutate(across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric)) # numeric values\n", - "\n", - "msg <- paste0(\"DHIS2 Datatset reporting data loaded from file `\", file_name, \"` (from dataset : `\", dataset_name, \"`). 
\n", - "Dataframe dimensions: \", \n", - " paste(dim(dhis2_reporting), collapse=\", \"))\n", - "log_msg(msg)\n", - "head(dhis2_reporting, 3)" + "dhis2_reporting <- load_reporting_data_dataset(config_json, COUNTRY_CODE)\n", + "head(dhis2_reporting, 3)\n" ] }, { @@ -530,16 +489,11 @@ }, "outputs": [], "source": [ - "# Check if REPORTING_RATE_PRODUCT_ID present in the data: if yes, filter to keep only those, else skip filtering (keep all) and log a warning\n", - "if (all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", - " dhis2_reporting <- dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", - " log_msg(glue::glue(\"🪮 Filtering DHIS2 reporting data to keep only values for REPORTING_RATE_PRODUCT_UID(s): {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')}.\n", - " Removed {nrow(dhis2_reporting) - nrow(dhis2_reporting %>% filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID))} rows.\n", - " Dataframe dimensions after filtering: {paste(dim(dhis2_reporting), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data PRODUCT_UIDs: {paste(unique(dhis2_reporting$PRODUCT_UID), collapse=', ')}. \n", - " 🦘 Skipping filtering and keeping all data. 
Dataframe dimensions: {paste(dim(dhis2_reporting), collapse=', ')}\"), level = \"warning\")\n", - "}" + "reporting_rate_results <- compute_reporting_rate_dataset(\n", + " dhis2_reporting = dhis2_reporting,\n", + " REPORTING_RATE_PRODUCT_ID = REPORTING_RATE_PRODUCT_ID,\n", + " COUNTRY_CODE = COUNTRY_CODE\n", + ")\n" ] }, { @@ -584,16 +538,7 @@ }, "outputs": [], "source": [ - "# Pivot wider to have one column per PRODUCT_METRIC (which now indicates whether the VALUE is \"ACTUAL_REPORTS\" or \"EXPECTED_REPORTS\")\n", - "dhis2_reporting_wide <- dhis2_reporting %>%\n", - " pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", - "\n", - "# Log msg\n", - "log_msg(glue::glue(\"Pivoted DHIS2 reporting data to wide format, with one column per PRODUCT_METRIC (ACTUAL_REPORTS, EXPECTED_REPORTS).\n", - "Dimensions after pivot: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" + "# Moved to utils for readability.\n" ] }, { @@ -656,25 +601,7 @@ }, "outputs": [], "source": [ - "# Check if any OU_ID is present in more than one PRODUCT_UID\n", - "# and if so list them\n", - "ou_product_counts <- dhis2_reporting %>%\n", - " group_by(OU_ID, OU_NAME) %>%\n", - " mutate(PRODUCT_UID_count = n_distinct(PRODUCT_UID)) %>%\n", - " filter(PRODUCT_UID_count > 1) %>%\n", - " select(ADM1_NAME, ADM2_NAME, OU_ID, OU_NAME, PRODUCT_UID_count) %>%\n", - " distinct() \n", - "\n", - "ou_product_counts\n", - "\n", - "# Log msg: which OU_ID have multiple PRODUCT_UIDs\n", - "if (nrow(ou_product_counts) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The following OU_IDs are associated with multiple PRODUCT_UIDs in the DHIS2 reporting data:\n", - "{paste(apply(ou_product_counts, 1, function(row) paste0(' - ', row['OU_NAME'], ' (', row['OU_ID'], ')')), collapse='\\n')}\"), \n", - " level = \"warning\")\n", - "} else {\n", - " log_msg(\"All OU_IDs are associated with a single PRODUCT_UID in the DHIS2 reporting 
data.\")\n", - "}" + "# Moved to utils for readability.\n" ] }, { @@ -727,20 +654,7 @@ }, "outputs": [], "source": [ - "# Step 1: check for duplicated OU_ID by PERIOD (there should be only 1 value of OU_ID per PERIOD)\n", - "dupl_ou_period <- dhis2_reporting_wide %>%\n", - " group_by(OU_ID, PERIOD) %>%\n", - " filter(n() > 1) %>%\n", - " ungroup() %>%\n", - " select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, ends_with(\"REPORTS\"))\n", - "\n", - "# Log msg\n", - "if (nrow(dupl_ou_period) > 0) {\n", - " log_msg(glue::glue(\"🚨 Warning: The OU_IDs are associated with multiple PRODUCT_UIDs affect {nrow(dupl_ou_period)} PERIOD entries (rows) in the DHIS2 reporting data.\"))\n", - "}\n", - "\n", - "dim(dupl_ou_period)\n", - "head(dupl_ou_period, 5)" + "# Moved to utils for readability.\n" ] }, { @@ -768,29 +682,7 @@ }, "outputs": [], "source": [ - "# Step 2: remove duplicated OU_ID by PERIOD\n", - "# Use the following logic:\n", - "# - 1. first, check that values (ACTUAL_REPORTS, EXPECTED_REPORTS) are all 0 or 1 (if not that needs to be handled differently, so skip for now)\n", - "# - 2. 
then, if multiple PRODUCT_UIDs exist for the same OU_ID and PERIOD, keep the one with the highest ACTUAL_REPORTS value\n", - "# (this is because if values agree, then we can simply keep one, if they don't agree, that means that we have 1 and 0 values, so we keep the 1)\n", - "\n", - "if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0,1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0,1))) {\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " group_by(PERIOD, OU_ID) %>%\n", - " mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", - " ungroup() %>%\n", - " filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", - " select(-ACTUAL_REPORTS_deduplicated)\n", - "\n", - " log_msg(glue::glue(\"✅ Deduplicated DHIS2 reporting data by keeping only one PRODUCT_UID per OU_ID and PERIOD, based on highest ACTUAL_REPORTS value.\n", - " Dataframe dimensions after deduplication: {paste(dim(dhis2_reporting_wide), collapse=', ')}\"))\n", - "} else {\n", - " log_msg(\"🚨 Warning: Cannot deduplicate OU_ID by PERIOD in DHIS2 reporting data because ACTUAL_REPORTS or EXPECTED_REPORTS contain values other than 0 or 1. 
\n", - " Analysis will continue without removing duplicated entries.\", level = \"warning\")\n", - "} \n", - "\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)" + "# Moved to utils for readability.\n" ] }, { @@ -838,38 +730,7 @@ }, "outputs": [], "source": [ - "# Modify dhis2_reporting_wide to replace all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " log_msg(\"🇳🇪 Special handling for NER: replacing all values of ACTUAL_REPORTS and EXPECTED_REPORTS that are >1 with 1.\")\n", - "\n", - " # Check if any values >1 exist\n", - " n_actual_reports_gt1 <- sum(dhis2_reporting_wide$ACTUAL_REPORTS > 1, na.rm = TRUE)\n", - " n_expected_reports_gt1 <- sum(dhis2_reporting_wide$EXPECTED_REPORTS > 1, na.rm = TRUE)\n", - "\n", - " # Extract the PRODUCT_UID and PRODUCT_NAME associated with those values\n", - " if (n_actual_reports_gt1 > 0 | n_expected_reports_gt1 > 0) {\n", - " dupl_actual_reports <- dhis2_reporting_wide %>%\n", - " filter(ACTUAL_REPORTS > 1) %>%\n", - " select(PRODUCT_UID, PRODUCT_NAME) %>%\n", - " distinct()\n", - "\n", - " log_msg(glue::glue(\"Note: Found {n_actual_reports_gt1} entries with ACTUAL_REPORTS > 1 and {n_expected_reports_gt1} entries with EXPECTED_REPORTS > 1.\n", - "Affected PRODUCT_UIDs and PRODUCT_NAMEs for ACTUAL_REPORTS > 1:\n", - "{paste(apply(dupl_actual_reports, 1, function(row) paste0(row['PRODUCT_NAME'], ' (', row['PRODUCT_UID'], ')')), collapse='\\n')}\"))\n", - "\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " mutate(\n", - " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", - " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", - " )\n", - "\n", - " log_msg(\"✅ Replaced all values of ACTUAL_REPORTS and EXPECTED_REPORTS that were >1 with 1.\")\n", - "\n", - "} # else nothing to replace\n", - "\n", - " dim(dhis2_reporting_wide)\n", - " head(dhis2_reporting_wide, 3)\n", - "}" + "# Moved to utils for 
readability.\n" ] }, { @@ -914,24 +775,7 @@ }, "outputs": [], "source": [ - "# Sum up values (now at acility level) to get totals per ADM2_ID and PERIOD\n", - "dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>%\n", - " group_by(\n", - " PERIOD, \n", - " YEAR, MONTH, # keep these just for sanity check (not needed for grouping)\n", - " ADM1_NAME, ADM1_ID, # keep these just for sanity check (not needed for grouping)\n", - " ADM2_NAME, ADM2_ID\n", - " ) %>%\n", - " summarise(\n", - " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", - " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", - " .groups = 'drop'\n", - " ) \n", - "\n", - "# Add log messages\n", - "log_msg(glue::glue(\"DHIS2 reporting data pivoted to wide format and aggregated at ADM2 level. \n", - "Dataframe dimensions: {paste(dim(dhis2_reporting_wide_adm2), collapse=', ')}\"))\n", - "head(dhis2_reporting_wide_adm2, 3)" + "# Moved to utils for readability.\n" ] }, { @@ -978,12 +822,7 @@ }, "outputs": [], "source": [ - "# Calculate REPORTING_RATE as ACTUAL_REPORTS / EXPECTED_REPORTS\n", - "reporting_rate_results <- dhis2_reporting_wide_adm2 %>%\n", - " mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n", - "\n", - "log_msg(glue::glue(\"DHIS2 reporting rate calculated as ACTUAL_REPORTS / EXPECTED_REPORTS. 
Dataframe dimensions: {paste(dim(reporting_rate_results), collapse=', ')}\"))\n", - "head(reporting_rate_results, 3) " + "# Moved to utils for readability.\n" ] }, { @@ -1156,17 +995,11 @@ }, "outputs": [], "source": [ - "output_data_path <- file.path(DATA_PATH, \"reporting_rate\")\n", - "\n", - "# parquet\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")) \n", - "write_parquet(reporting_rate_dataset, file_path)\n", - "log_msg(glue(\"Exported : {file_path}\"))\n", - "\n", - "# csv\n", - "file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\"))\n", - "write.csv(reporting_rate_dataset, file_path, row.names = FALSE)\n", - "log_msg(glue(\"Exported : {file_path}\"))" + "export_reporting_rate_dataset(\n", + " reporting_rate_dataset = reporting_rate_dataset,\n", + " DATA_PATH = DATA_PATH,\n", + " COUNTRY_CODE = COUNTRY_CODE\n", + ")\n" ] } ], @@ -1193,7 +1026,7 @@ "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb", "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/papermill_outputs/snt_dhis2_reporting_rate_dataset_OUTPUT_2025-12-19_102148.ipynb", "parameters": { - "ROUTINE_FILE": "NER_routine_outliers_imputed.parquet", + "ROUTINE_FILE": "XXX_routine_outliers_imputed.parquet", "SNT_ROOT_PATH": "/home/hexa/workspace" }, "start_time": "2025-12-19T10:21:48.422273", diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb index 38091fe..90e4762 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb @@ -1,1300 +1,1299 @@ { - "cells": [ - { - "cell_type": "markdown", - 
"id": "b79cba06", - "metadata": { - "papermill": { - "duration": 0.000249, - "end_time": "2025-12-19T10:23:27.548651", - "exception": false, - "start_time": "2025-12-19T10:23:27.548402", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ca65bcc", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:27.561213Z", - "iopub.status.busy": "2025-12-19T10:23:27.553197Z", - "iopub.status.idle": "2025-12-19T10:23:34.811467Z", - "shell.execute_reply": "2025-12-19T10:23:34.808478Z" - }, - "papermill": { - "duration": 7.265364, - "end_time": "2025-12-19T10:23:34.814448", - "exception": false, - "start_time": "2025-12-19T10:23:27.549084", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "# Load palettes\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", 
- "id": "c5301aa3", - "metadata": { - "papermill": { - "duration": 0.000116, - "end_time": "2025-12-19T10:23:34.814852", - "exception": false, - "start_time": "2025-12-19T10:23:34.814736", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.1. Load and check `snt config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "76d8a072", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:34.858197Z", - "iopub.status.busy": "2025-12-19T10:23:34.817039Z", - "iopub.status.idle": "2025-12-19T10:23:35.335737Z", - "shell.execute_reply": "2025-12-19T10:23:35.333547Z" - }, - "papermill": { - "duration": 0.52329, - "end_time": "2025-12-19T10:23:35.338288", - "exception": false, - "start_time": "2025-12-19T10:23:34.814998", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c712ac02", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:35.342494Z", - "iopub.status.busy": "2025-12-19T10:23:35.340803Z", - "iopub.status.idle": "2025-12-19T10:23:35.366376Z", - "shell.execute_reply": "2025-12-19T10:23:35.364165Z" - }, - "papermill": { - "duration": 0.030446, - "end_time": "2025-12-19T10:23:35.368977", - "exception": false, - "start_time": "2025-12-19T10:23:35.338531", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- 
config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "REPORTING_RATE_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "DHIS2_FORMATTED_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "REPORTING_RATE_PRODUCT_UID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID # to add to plots subtitles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e02c652e", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:35.373316Z", - "iopub.status.busy": "2025-12-19T10:23:35.371377Z", - "iopub.status.idle": "2025-12-19T10:23:35.396646Z", - "shell.execute_reply": "2025-12-19T10:23:35.394442Z" - }, - "papermill": { - "duration": 0.029675, - "end_time": "2025-12-19T10:23:35.398945", - "exception": false, - "start_time": "2025-12-19T10:23:35.369270", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Make string of product uids for plot subtitles\n", - "rr_product_uid <-paste(REPORTING_RATE_PRODUCT_UID,collapse = \", \") \n", - "rr_product_uid" - ] - }, - { - "cell_type": "markdown", - "id": "30b058f4", - "metadata": { - "papermill": { - "duration": 0.000094, - "end_time": "2025-12-19T10:23:35.399231", - "exception": false, - "start_time": "2025-12-19T10:23:35.399137", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.2. 
Load and check `snt metadata` file\n", - "This is needed for the correct use of palettes and categories (breaks, or scale)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98a8ee49", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:35.403224Z", - "iopub.status.busy": "2025-12-19T10:23:35.401458Z", - "iopub.status.idle": "2025-12-19T10:23:36.335964Z", - "shell.execute_reply": "2025-12-19T10:23:36.330643Z" - }, - "papermill": { - "duration": 0.940593, - "end_time": "2025-12-19T10:23:36.339927", - "exception": false, - "start_time": "2025-12-19T10:23:35.399334", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT metadata\n", - "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00681217", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:36.357945Z", - "iopub.status.busy": "2025-12-19T10:23:36.343228Z", - "iopub.status.idle": "2025-12-19T10:23:36.535579Z", - "shell.execute_reply": "2025-12-19T10:23:36.533231Z" - }, - "papermill": { - "duration": 0.198107, - "end_time": "2025-12-19T10:23:36.538224", - "exception": false, - "start_time": "2025-12-19T10:23:36.340117", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "scale_raw <- metadata_json$REPORTING_RATE$SCALE\n", - "if (is.character(scale_raw) && length(scale_raw) == 1) {\n", - " break_vals <- jsonlite::fromJSON(scale_raw)\n", - "} else {\n", - " break_vals <- unlist(scale_raw, use.names = FALSE)\n", - 
"}\n", - "break_vals <- as.numeric(break_vals)\n", - "\n", - "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))" - ] - }, - { - "cell_type": "markdown", - "id": "f3470564", - "metadata": { - "papermill": { - "duration": 0.000162, - "end_time": "2025-12-19T10:23:36.538638", - "exception": false, - "start_time": "2025-12-19T10:23:36.538476", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "82397307", - "metadata": { - "papermill": { - "duration": 0.000126, - "end_time": "2025-12-19T10:23:36.538947", - "exception": false, - "start_time": "2025-12-19T10:23:36.538821", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 2.1. Output of pipeline notebook\n", - "Import file named `{COUNTRY_CODE}_reporting_rate_dataset.parquet` from **OH Dataset** \"SNT_DHIS2_REPORTING_RATE\" (as in `config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70acb2c5", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:36.543564Z", - "iopub.status.busy": "2025-12-19T10:23:36.541311Z", - "iopub.status.idle": "2025-12-19T10:23:37.788619Z", - "shell.execute_reply": "2025-12-19T10:23:37.785121Z" - }, - "papermill": { - "duration": 1.253125, - "end_time": "2025-12-19T10:23:37.792249", - "exception": false, - "start_time": "2025-12-19T10:23:36.539124", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "\n", - "reporting_rate_dataset <- tryCatch({ get_latest_dataset_file_in_memory(REPORTING_RATE_DATASET_NAME, glue::glue(\"{COUNTRY_CODE}_reporting_rate_dataset.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading Reporting Rate (Dataset) data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " 
cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "# log\n", - "log_msg(glue::glue(\"Data file loaded from dataset : {REPORTING_RATE_DATASET_NAME} dataframe dimensions: {paste(dim(reporting_rate_dataset), collapse=', ')}\"))\n", - "dim(reporting_rate_dataset)\n", - "head(reporting_rate_dataset, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "48833515", - "metadata": { - "papermill": { - "duration": 0.000091, - "end_time": "2025-12-19T10:23:37.792528", - "exception": false, - "start_time": "2025-12-19T10:23:37.792437", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 2.2. Shapes\n", - "To make choropleth (map)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3febd4f4", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:37.798194Z", - "iopub.status.busy": "2025-12-19T10:23:37.795402Z", - "iopub.status.idle": "2025-12-19T10:23:41.325848Z", - "shell.execute_reply": "2025-12-19T10:23:41.323895Z" - }, - "papermill": { - "duration": 3.535554, - "end_time": "2025-12-19T10:23:41.328226", - "exception": false, - "start_time": "2025-12-19T10:23:37.792672", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) { \n", - " msg <- paste0(COUNTRY_CODE , \" Shapes data is not available in dataset: '\" , DHIS2_FORMATTED_DATASET_NAME, \"' last version.\")\n", - " log_msg(msg, \"warning\")\n", - " shapes <- NULL\n", - " })\n", - "\n", - "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. 
\\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", - "names(shapes)" - ] - }, - { - "cell_type": "markdown", - "id": "17067d56", - "metadata": { - "papermill": { - "duration": 0.000166, - "end_time": "2025-12-19T10:23:41.328651", - "exception": false, - "start_time": "2025-12-19T10:23:41.328485", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3. Plots" - ] - }, - { - "cell_type": "markdown", - "id": "9a6369ee", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2025-12-19T10:23:41.328959", - "exception": false, - "start_time": "2025-12-19T10:23:41.328850", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.0. Add shapes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6641720", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.333105Z", - "iopub.status.busy": "2025-12-19T10:23:41.331427Z", - "iopub.status.idle": "2025-12-19T10:23:41.365417Z", - "shell.execute_reply": "2025-12-19T10:23:41.363294Z" - }, - "papermill": { - "duration": 0.03905, - "end_time": "2025-12-19T10:23:41.368213", - "exception": false, - "start_time": "2025-12-19T10:23:41.329163", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Join shapes to reporting rate data\n", - "\n", - "data_to_plot <- reporting_rate_dataset %>%\n", - " left_join(shapes, by = c(\"ADM2_ID\"))" - ] - }, - { - "cell_type": "markdown", - "id": "0b0d32f1", - "metadata": { - "papermill": { - "duration": 0.000195, - "end_time": "2025-12-19T10:23:41.368739", - "exception": false, - "start_time": "2025-12-19T10:23:41.368544", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.1. 
🎨 Dynamic categories and color assignement" - ] - }, - { - "cell_type": "markdown", - "id": "cc765e0c", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2025-12-19T10:23:41.369057", - "exception": false, - "start_time": "2025-12-19T10:23:41.368948", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 1. Define breaks and labels" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e79132c", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.373558Z", - "iopub.status.busy": "2025-12-19T10:23:41.371555Z", - "iopub.status.idle": "2025-12-19T10:23:41.392950Z", - "shell.execute_reply": "2025-12-19T10:23:41.390333Z" - }, - "papermill": { - "duration": 0.026996, - "end_time": "2025-12-19T10:23:41.396238", - "exception": false, - "start_time": "2025-12-19T10:23:41.369242", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Safety code to avoid breaking if nothings is fund in json_metadata\n", - "if (is.null(break_vals) || length(break_vals) == 0) {\n", - " log_msg(\"[WARNING] No break values found in SNT_metadata.json for REPORTING_RATE$SCALE. Using default values.\", \"warning\")\n", - " break_vals <- c(0.5, 0.8, 0.9, 0.95, 1.00)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f04cb888", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.401034Z", - "iopub.status.busy": "2025-12-19T10:23:41.398849Z", - "iopub.status.idle": "2025-12-19T10:23:41.430720Z", - "shell.execute_reply": "2025-12-19T10:23:41.428238Z" - }, - "papermill": { - "duration": 0.037712, - "end_time": "2025-12-19T10:23:41.434131", - "exception": false, - "start_time": "2025-12-19T10:23:41.396419", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 1. 
Define breaks\n", - "# Note: assumes that the data starts at 0!\n", - "# break_vals <- metadata_json$REPORTING_RATE$SCALE # moved upstream\n", - "\n", - "# 2. Create the full set of cut points (0 to Infinity)\n", - "full_breaks <- c(0, break_vals, Inf)\n", - "\n", - "# 3. Create dynamic labels\n", - "labels <- c(\n", - " paste0(\"< \", break_vals[1]), # First label\n", - " paste0(break_vals[-length(break_vals)], \" - \", break_vals[-1]), # Middle labels\n", - " paste0(\"> \", break_vals[length(break_vals)]) # Last label\n", - ")\n", - "\n", - "# Check\n", - "labels" - ] - }, - { - "cell_type": "markdown", - "id": "cb237801", - "metadata": { - "papermill": { - "duration": 0.000102, - "end_time": "2025-12-19T10:23:41.434442", - "exception": false, - "start_time": "2025-12-19T10:23:41.434340", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 2. Create `_CATEGORY` col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8303488", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.439376Z", - "iopub.status.busy": "2025-12-19T10:23:41.437165Z", - "iopub.status.idle": "2025-12-19T10:23:41.471891Z", - "shell.execute_reply": "2025-12-19T10:23:41.469251Z" - }, - "papermill": { - "duration": 0.040632, - "end_time": "2025-12-19T10:23:41.475176", - "exception": false, - "start_time": "2025-12-19T10:23:41.434544", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# reporting_rate_dataset <- reporting_rate_dataset %>%\n", - "data_to_plot <- data_to_plot %>%\n", - " mutate(\n", - " REPORTING_RATE_CATEGORY = cut(\n", - " REPORTING_RATE,\n", - " breaks = full_breaks,\n", - " labels = labels,\n", - " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", - " include.lowest = TRUE\n", - " )\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "a10237f8", - "metadata": { - "papermill": { - "duration": 0.000102, - "end_time": 
"2025-12-19T10:23:41.475483", - "exception": false, - "start_time": "2025-12-19T10:23:41.475381", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3. Pick appropriate palette" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ee6e077", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.480216Z", - "iopub.status.busy": "2025-12-19T10:23:41.478061Z", - "iopub.status.idle": "2025-12-19T10:23:41.513805Z", - "shell.execute_reply": "2025-12-19T10:23:41.511268Z" - }, - "papermill": { - "duration": 0.04138, - "end_time": "2025-12-19T10:23:41.516984", - "exception": false, - "start_time": "2025-12-19T10:23:41.475604", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Count nr of breaks\n", - "nr_of_colors <- length(labels)\n", - "\n", - "# nr_of_colors\n", - "palette_to_use <- get_range_from_count(nr_of_colors)\n", - "\n", - "# Need to make palettes as named vectors so that scale_color_manual() and scale_fill_manual() can use them properly\n", - "# Note: need to reverse order of labels to match the palette order \"meaning\" (red \"\" should correcpond to lowest value)\n", - "names(palette_to_use) <- rev(labels)\n", - "\n", - "palette_to_use\n" - ] - }, - { - "cell_type": "markdown", - "id": "d08c0c14", - "metadata": { - "papermill": { - "duration": 0.000099, - "end_time": "2025-12-19T10:23:41.517267", - "exception": false, - "start_time": "2025-12-19T10:23:41.517168", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.2. 
Plots" - ] - }, - { - "cell_type": "markdown", - "id": "b7781198", - "metadata": { - "papermill": { - "duration": 0.000056, - "end_time": "2025-12-19T10:23:41.517425", - "exception": false, - "start_time": "2025-12-19T10:23:41.517369", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.1 Scatter plot of RR over time (by ADM2)\n", - "With this we can see the actula numbners (although cannot tell which ADM2 have low values)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78d92e4a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.522513Z", - "iopub.status.busy": "2025-12-19T10:23:41.520272Z", - "iopub.status.idle": "2025-12-19T10:23:42.935181Z", - "shell.execute_reply": "2025-12-19T10:23:42.932661Z" - }, - "papermill": { - "duration": 1.456494, - "end_time": "2025-12-19T10:23:42.974012", - "exception": false, - "start_time": "2025-12-19T10:23:41.517518", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Line point plot faceted by YEAR\n", - "ggplot(data = data_to_plot) +\n", - " geom_line(aes(x = MONTH,\n", - " y = REPORTING_RATE,\n", - " group = ADM2_ID,\n", - " color = REPORTING_RATE_CATEGORY), \n", - " alpha = 0.3,\n", - " show.legend = FALSE\n", - " ) +\n", - " geom_point(aes(x = MONTH,\n", - " y = REPORTING_RATE,\n", - " group = ADM2_ID,\n", - " color = REPORTING_RATE_CATEGORY)) + \n", - " facet_grid(~YEAR) + \n", - " scale_color_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\",\n", - " name = \"Reporting Rate Categories\"\n", - " ) +\n", - " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", - " scale_y_continuous(\n", - " breaks = c(0, break_vals), # 🎨 NEW dynamic colors & breaks!\n", - " # Dynamically set max value to fit actual data (do show values >1 if present)\n", - " limits = c(0, max(data_to_plot$REPORTING_RATE, na.rm = TRUE) + 0.1)\n", - " ) +\n", - " 
labs(\n", - " title = \"Reporting Rate (Dataset)\",\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " x = \"Month\",\n", - " y = \"Reporting Rate\\n(Dataset)\" ) +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", - " legend.position = \"none\",\n", - " legend.title = element_blank(),\n", - " # legend.key.width = unit(3, \"cm\"),\n", - " # legend.key.height = unit(0.25, \"cm\"),\n", - " axis.title.y = element_blank(),\n", - " panel.grid.minor = element_blank(),\n", - " panel.grid.major.x = element_blank(),\n", - " strip.placement = \"outside\",\n", - " strip.text = element_text(face = \"bold\", size = 10)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f47064a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:42.978498Z", - "iopub.status.busy": "2025-12-19T10:23:42.976659Z", - "iopub.status.idle": "2025-12-19T10:23:44.087244Z", - "shell.execute_reply": "2025-12-19T10:23:44.085182Z" - }, - "papermill": { - "duration": 1.11568, - "end_time": "2025-12-19T10:23:44.089891", - "exception": false, - "start_time": "2025-12-19T10:23:42.974211", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Export plot as PNG\n", - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_linepoint_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " height = 15,\n", - " width = 45,\n", - " units = \"cm\",\n", - " bg = \"white\",\n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (linepoint) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": 
"22bb6431", - "metadata": { - "papermill": { - "duration": 0.000147, - "end_time": "2025-12-19T10:23:44.090320", - "exception": false, - "start_time": "2025-12-19T10:23:44.090173", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.2 Heatmap plot of RR over time (by ADM2)\n", - "This is less good for identifying actual values, but allows to see which ADM2 have lower values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f2445f2a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:44.094508Z", - "iopub.status.busy": "2025-12-19T10:23:44.092577Z", - "iopub.status.idle": "2025-12-19T10:23:46.262550Z", - "shell.execute_reply": "2025-12-19T10:23:46.259633Z" - }, - "papermill": { - "duration": 2.21647, - "end_time": "2025-12-19T10:23:46.306927", - "exception": false, - "start_time": "2025-12-19T10:23:44.090457", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Tile plot faceted by YEAR\n", - "ggplot(data = data_to_plot) +\n", - " geom_tile(aes(x = MONTH,\n", - " y = fct_rev(ADM2_NAME),\n", - " fill = REPORTING_RATE_CATEGORY), \n", - " color = \"white\",\n", - " show.legend = TRUE\n", - " ) +\n", - " scale_fill_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\",\n", - " name = \"Reporting Rate: \"\n", - " ) +\n", - " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", - " labs(\n", - " title = \"Reporting Rate (Dataset)\",\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " x = \"Month\"\n", - " ) +\n", - " facet_grid(rows = vars(ADM1_NAME), cols = vars(YEAR), \n", - " scales = \"free_y\", space = \"free_y\",\n", - " switch = \"y\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " legend.key.height = unit(0.25, \"cm\"),\n", - " axis.text.x = element_text(size 
= 7),\n", - " axis.title.y = element_blank(),\n", - " panel.grid.minor = element_blank(),\n", - " panel.grid.major = element_blank(),\n", - " strip.placement = \"outside\", \n", - " strip.text = element_text(face = \"bold\", size = 10)\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cbe73312", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:46.311134Z", - "iopub.status.busy": "2025-12-19T10:23:46.309412Z", - "iopub.status.idle": "2025-12-19T10:23:48.286664Z", - "shell.execute_reply": "2025-12-19T10:23:48.284571Z" - }, - "papermill": { - "duration": 1.982105, - "end_time": "2025-12-19T10:23:48.289215", - "exception": false, - "start_time": "2025-12-19T10:23:46.307110", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Export plot as PNG\n", - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_heatmap_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " width = 20, height = 30, units = \"cm\", \n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (heatmap) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3eef141a", - "metadata": { - "papermill": { - "duration": 0.000164, - "end_time": "2025-12-19T10:23:48.289656", - "exception": false, - "start_time": "2025-12-19T10:23:48.289492", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.3. 
MAP of Reporting Rate - by month" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83be9c68", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:48.294030Z", - "iopub.status.busy": "2025-12-19T10:23:48.292256Z", - "iopub.status.idle": "2025-12-19T10:23:53.205670Z", - "shell.execute_reply": "2025-12-19T10:23:53.203104Z" - }, - "papermill": { - "duration": 4.958481, - "end_time": "2025-12-19T10:23:53.248341", - "exception": false, - "start_time": "2025-12-19T10:23:48.289860", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Choropleth map with reporting rate data by ADM2\n", - "ggplot(data = data_to_plot) +\n", - " geom_sf(aes(\n", - " fill = REPORTING_RATE_CATEGORY,\n", - " geometry = geometry), \n", - " color = \"white\",\n", - " size = 0.01) +\n", - " scale_fill_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\",\n", - " ) +\n", - " theme_void() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " legend.key.height = unit(0.25, \"cm\")\n", - " ) +\n", - " labs(\n", - " title = paste(\"Reporting Rate (Dataset)\"),\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " ) +\n", - " facet_grid(\n", - " rows = vars(YEAR), \n", - " cols = vars(MONTH),\n", - " switch = \"both\") +\n", - " guides(fill = guide_legend(nrow = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e877671d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:53.252696Z", - "iopub.status.busy": "2025-12-19T10:23:53.250972Z", - "iopub.status.idle": "2025-12-19T10:23:56.748868Z", - "shell.execute_reply": "2025-12-19T10:23:56.746990Z" - }, - "papermill": { - "duration": 3.502689, - "end_time": "2025-12-19T10:23:56.751218", - "exception": false, - 
"start_time": "2025-12-19T10:23:53.248529", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " width = 50, height = 20, units = \"cm\", \n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "f0894be9", - "metadata": { - "papermill": { - "duration": 0.000166, - "end_time": "2025-12-19T10:23:56.751636", - "exception": false, - "start_time": "2025-12-19T10:23:56.751470", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.4. 
MAP of Reporting Rate - by YEAR\n", - "Use average (`mean()`) of monthly values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb1995ab", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:56.755998Z", - "iopub.status.busy": "2025-12-19T10:23:56.753982Z", - "iopub.status.idle": "2025-12-19T10:23:56.788391Z", - "shell.execute_reply": "2025-12-19T10:23:56.786447Z" - }, - "papermill": { - "duration": 0.039325, - "end_time": "2025-12-19T10:23:56.791143", - "exception": false, - "start_time": "2025-12-19T10:23:56.751818", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "data_to_plot_year <- data_to_plot %>%\n", - " group_by(geometry, ADM2_ID, ADM2_NAME, ADM1_NAME, YEAR) %>%\n", - " summarise(\n", - " REPORTING_RATE = mean(REPORTING_RATE, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " # Calculate REPORTING_RATE_CATEGORY again based on the yearly average\n", - " mutate(\n", - " REPORTING_RATE_CATEGORY = cut(\n", - " REPORTING_RATE,\n", - " breaks = full_breaks,\n", - " labels = labels,\n", - " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", - " include.lowest = TRUE\n", - " )\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd32b0cf", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:56.795010Z", - "iopub.status.busy": "2025-12-19T10:23:56.793453Z", - "iopub.status.idle": "2025-12-19T10:23:57.582261Z", - "shell.execute_reply": "2025-12-19T10:23:57.579294Z" - }, - "papermill": { - "duration": 0.798686, - "end_time": "2025-12-19T10:23:57.590023", - "exception": false, - "start_time": "2025-12-19T10:23:56.791337", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Choropleth map with reporting rate data by ADM2\n", - "ggplot(data = data_to_plot_year) +\n", - " geom_sf(aes(\n", - " fill = 
REPORTING_RATE_CATEGORY,\n", - " geometry = geometry), \n", - " color = \"white\",\n", - " size = 0.01) +\n", - " scale_fill_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\"\n", - " ) +\n", - " theme_void() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " ) +\n", - " labs(\n", - " title = \"Reporting Rate (Dataset) - mean per Year\",\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " fill = \"Reporting Rate: \"\n", - " ) +\n", - " facet_grid(\n", - " cols = vars(YEAR)\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0430641e", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:57.594096Z", - "iopub.status.busy": "2025-12-19T10:23:57.592357Z", - "iopub.status.idle": "2025-12-19T10:23:58.516754Z", - "shell.execute_reply": "2025-12-19T10:23:58.514785Z" - }, - "papermill": { - "duration": 0.928933, - "end_time": "2025-12-19T10:23:58.519148", - "exception": false, - "start_time": "2025-12-19T10:23:57.590215", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_year_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " width = 31, height = 13, units = \"cm\", \n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "8c3bdca4", - "metadata": { - "papermill": { - "duration": 0.000126, - "end_time": 
"2025-12-19T10:23:58.519515", - "exception": false, - "start_time": "2025-12-19T10:23:58.519389", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### The End :)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8a62ec5", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:58.523680Z", - "iopub.status.busy": "2025-12-19T10:23:58.522024Z", - "iopub.status.idle": "2025-12-19T10:23:58.733860Z", - "shell.execute_reply": "2025-12-19T10:23:58.731929Z" - }, - "papermill": { - "duration": 0.216448, - "end_time": "2025-12-19T10:23:58.736160", - "exception": false, - "start_time": "2025-12-19T10:23:58.519712", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(\"Reporting Rate (Dataset) report notebook completed successfully!\")" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "b79cba06", + "metadata": { + "papermill": { + "duration": 0.000249, + "end_time": "2025-12-19T10:23:27.548651", + "exception": false, + "start_time": "2025-12-19T10:23:27.548402", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1. 
Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ca65bcc", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:27.561213Z", + "iopub.status.busy": "2025-12-19T10:23:27.553197Z", + "iopub.status.idle": "2025-12-19T10:23:34.811467Z", + "shell.execute_reply": "2025-12-19T10:23:34.808478Z" + }, + "papermill": { + "duration": 7.265364, + "end_time": "2025-12-19T10:23:34.814448", + "exception": false, + "start_time": "2025-12-19T10:23:27.549084", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Project paths\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", + "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", + "\n", + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "# Load palettes\n", + "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", + "\n", + "# Load libraries \n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Environment variables\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load OpenHEXA sdk\n", + "openhexa <- import(\"openhexa.sdk\")" + ] + }, + { + "cell_type": "markdown", + "id": "c5301aa3", + "metadata": { + "papermill": { + "duration": 0.000116, + "end_time": "2025-12-19T10:23:34.814852", + "exception": false, + "start_time": "2025-12-19T10:23:34.814736", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 
1.1. Load and check `snt config` file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76d8a072", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:34.858197Z", + "iopub.status.busy": "2025-12-19T10:23:34.817039Z", + "iopub.status.idle": "2025-12-19T10:23:35.335737Z", + "shell.execute_reply": "2025-12-19T10:23:35.333547Z" + }, + "papermill": { + "duration": 0.52329, + "end_time": "2025-12-19T10:23:35.338288", + "exception": false, + "start_time": "2025-12-19T10:23:34.814998", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c712ac02", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:35.342494Z", + "iopub.status.busy": "2025-12-19T10:23:35.340803Z", + "iopub.status.idle": "2025-12-19T10:23:35.366376Z", + "shell.execute_reply": "2025-12-19T10:23:35.364165Z" + }, + "papermill": { + "duration": 0.030446, + "end_time": "2025-12-19T10:23:35.368977", + "exception": false, + "start_time": "2025-12-19T10:23:35.338531", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "REPORTING_RATE_DATASET_NAME <- 
config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", + "DHIS2_FORMATTED_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "REPORTING_RATE_PRODUCT_UID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID # to add to plots subtitles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e02c652e", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:35.373316Z", + "iopub.status.busy": "2025-12-19T10:23:35.371377Z", + "iopub.status.idle": "2025-12-19T10:23:35.396646Z", + "shell.execute_reply": "2025-12-19T10:23:35.394442Z" + }, + "papermill": { + "duration": 0.029675, + "end_time": "2025-12-19T10:23:35.398945", + "exception": false, + "start_time": "2025-12-19T10:23:35.369270", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + }, + "outputs": [], + "source": [ + "# Make string of product uids for plot subtitles\n", + "rr_product_uid <-paste(REPORTING_RATE_PRODUCT_UID,collapse = \", \") \n", + "rr_product_uid" + ] + }, + { + "cell_type": "markdown", + "id": "30b058f4", + "metadata": { + "papermill": { + "duration": 0.000094, + "end_time": "2025-12-19T10:23:35.399231", + "exception": false, + "start_time": "2025-12-19T10:23:35.399137", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.2. 
Load and check `snt metadata` file\n", + "This is needed for the correct use of palettes and categories (breaks, or scale)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98a8ee49", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:35.403224Z", + "iopub.status.busy": "2025-12-19T10:23:35.401458Z", + "iopub.status.idle": "2025-12-19T10:23:36.335964Z", + "shell.execute_reply": "2025-12-19T10:23:36.330643Z" }, "papermill": { - "default_parameters": {}, - "duration": 32.950872, - "end_time": "2025-12-19T10:23:59.058917", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs/snt_dhis2_reporting_rate_dataset_report_OUTPUT_2025-12-19_102325.ipynb", - "parameters": {}, - "start_time": "2025-12-19T10:23:26.108045", - "version": "2.6.0" + "duration": 0.940593, + "end_time": "2025-12-19T10:23:36.339927", + "exception": false, + "start_time": "2025-12-19T10:23:35.399334", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "# Load SNT metadata\n", + "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00681217", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:36.357945Z", + "iopub.status.busy": "2025-12-19T10:23:36.343228Z", + "iopub.status.idle": "2025-12-19T10:23:36.535579Z", + "shell.execute_reply": 
"2025-12-19T10:23:36.533231Z" + }, + "papermill": { + "duration": 0.198107, + "end_time": "2025-12-19T10:23:36.538224", + "exception": false, + "start_time": "2025-12-19T10:23:36.340117", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "scale_raw <- metadata_json$REPORTING_RATE$SCALE\n", + "break_vals <- if (is.character(scale_raw) && length(scale_raw) == 1) {\n", + " jsonlite::fromJSON(scale_raw)\n", + "} else {\n", + " as.numeric(unlist(scale_raw, use.names = FALSE))\n", + "}\n", + "\n", + "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))" + ] + }, + { + "cell_type": "markdown", + "id": "f3470564", + "metadata": { + "papermill": { + "duration": 0.000162, + "end_time": "2025-12-19T10:23:36.538638", + "exception": false, + "start_time": "2025-12-19T10:23:36.538476", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2. Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "82397307", + "metadata": { + "papermill": { + "duration": 0.000126, + "end_time": "2025-12-19T10:23:36.538947", + "exception": false, + "start_time": "2025-12-19T10:23:36.538821", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 2.1. 
Output of pipeline notebook\n", + "Import file named `{COUNTRY_CODE}_reporting_rate_dataset.parquet` from **OH Dataset** \"SNT_DHIS2_REPORTING_RATE\" (as in `config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70acb2c5", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:36.543564Z", + "iopub.status.busy": "2025-12-19T10:23:36.541311Z", + "iopub.status.idle": "2025-12-19T10:23:37.788619Z", + "shell.execute_reply": "2025-12-19T10:23:37.785121Z" + }, + "papermill": { + "duration": 1.253125, + "end_time": "2025-12-19T10:23:37.792249", + "exception": false, + "start_time": "2025-12-19T10:23:36.539124", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "\n", + "reporting_rate_dataset <- tryCatch({ get_latest_dataset_file_in_memory(REPORTING_RATE_DATASET_NAME, glue::glue(\"{COUNTRY_CODE}_reporting_rate_dataset.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading Reporting Rate (Dataset) data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "# log\n", + "log_msg(glue::glue(\"Data file loaded from dataset : {REPORTING_RATE_DATASET_NAME} dataframe dimensions: {paste(dim(reporting_rate_dataset), collapse=', ')}\"))\n", + "dim(reporting_rate_dataset)\n", + "head(reporting_rate_dataset, 2)" + ] + }, + { + "cell_type": "markdown", + "id": "48833515", + "metadata": { + "papermill": { + "duration": 0.000091, + "end_time": "2025-12-19T10:23:37.792528", + "exception": false, + "start_time": "2025-12-19T10:23:37.792437", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 2.2. 
Shapes\n", + "To make choropleth (map)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3febd4f4", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:37.798194Z", + "iopub.status.busy": "2025-12-19T10:23:37.795402Z", + "iopub.status.idle": "2025-12-19T10:23:41.325848Z", + "shell.execute_reply": "2025-12-19T10:23:41.323895Z" + }, + "papermill": { + "duration": 3.535554, + "end_time": "2025-12-19T10:23:41.328226", + "exception": false, + "start_time": "2025-12-19T10:23:37.792672", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) { \n", + " msg <- paste0(COUNTRY_CODE , \" Shapes data is not available in dataset: '\" , DHIS2_FORMATTED_DATASET_NAME, \"' last version.\")\n", + " log_msg(msg, \"warning\")\n", + " shapes <- NULL\n", + " })\n", + "\n", + "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. \\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", + "names(shapes)" + ] + }, + { + "cell_type": "markdown", + "id": "17067d56", + "metadata": { + "papermill": { + "duration": 0.000166, + "end_time": "2025-12-19T10:23:41.328651", + "exception": false, + "start_time": "2025-12-19T10:23:41.328485", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3. Plots" + ] + }, + { + "cell_type": "markdown", + "id": "9a6369ee", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2025-12-19T10:23:41.328959", + "exception": false, + "start_time": "2025-12-19T10:23:41.328850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.0. 
Add shapes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6641720", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.333105Z", + "iopub.status.busy": "2025-12-19T10:23:41.331427Z", + "iopub.status.idle": "2025-12-19T10:23:41.365417Z", + "shell.execute_reply": "2025-12-19T10:23:41.363294Z" + }, + "papermill": { + "duration": 0.03905, + "end_time": "2025-12-19T10:23:41.368213", + "exception": false, + "start_time": "2025-12-19T10:23:41.329163", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Join shapes to reporting rate data\n", + "\n", + "data_to_plot <- reporting_rate_dataset %>%\n", + " left_join(shapes, by = c(\"ADM2_ID\"))" + ] + }, + { + "cell_type": "markdown", + "id": "0b0d32f1", + "metadata": { + "papermill": { + "duration": 0.000195, + "end_time": "2025-12-19T10:23:41.368739", + "exception": false, + "start_time": "2025-12-19T10:23:41.368544", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.1. 🎨 Dynamic categories and color assignement" + ] + }, + { + "cell_type": "markdown", + "id": "cc765e0c", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2025-12-19T10:23:41.369057", + "exception": false, + "start_time": "2025-12-19T10:23:41.368948", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 1. 
Define breaks and labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e79132c", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.373558Z", + "iopub.status.busy": "2025-12-19T10:23:41.371555Z", + "iopub.status.idle": "2025-12-19T10:23:41.392950Z", + "shell.execute_reply": "2025-12-19T10:23:41.390333Z" + }, + "papermill": { + "duration": 0.026996, + "end_time": "2025-12-19T10:23:41.396238", + "exception": false, + "start_time": "2025-12-19T10:23:41.369242", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Safety code to avoid breaking if nothings is fund in json_metadata\n", + "if (is.null(break_vals) || length(break_vals) == 0) {\n", + " log_msg(\"[WARNING] No break values found in SNT_metadata.json for REPORTING_RATE$SCALE. Using default values.\", \"warning\")\n", + " break_vals <- c(0.5, 0.8, 0.9, 0.95, 1.00)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f04cb888", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.401034Z", + "iopub.status.busy": "2025-12-19T10:23:41.398849Z", + "iopub.status.idle": "2025-12-19T10:23:41.430720Z", + "shell.execute_reply": "2025-12-19T10:23:41.428238Z" + }, + "papermill": { + "duration": 0.037712, + "end_time": "2025-12-19T10:23:41.434131", + "exception": false, + "start_time": "2025-12-19T10:23:41.396419", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 1. Define breaks\n", + "# Note: assumes that the data starts at 0!\n", + "# break_vals <- metadata_json$REPORTING_RATE$SCALE # moved upstream\n", + "\n", + "# 2. Create the full set of cut points (0 to Infinity)\n", + "full_breaks <- c(0, break_vals, Inf)\n", + "\n", + "# 3. 
Create dynamic labels\n", + "labels <- c(\n", + " paste0(\"< \", break_vals[1]), # First label\n", + " paste0(break_vals[-length(break_vals)], \" - \", break_vals[-1]), # Middle labels\n", + " paste0(\"> \", break_vals[length(break_vals)]) # Last label\n", + ")\n", + "\n", + "# Check\n", + "labels" + ] + }, + { + "cell_type": "markdown", + "id": "cb237801", + "metadata": { + "papermill": { + "duration": 0.000102, + "end_time": "2025-12-19T10:23:41.434442", + "exception": false, + "start_time": "2025-12-19T10:23:41.434340", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 2. Create `_CATEGORY` col" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8303488", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.439376Z", + "iopub.status.busy": "2025-12-19T10:23:41.437165Z", + "iopub.status.idle": "2025-12-19T10:23:41.471891Z", + "shell.execute_reply": "2025-12-19T10:23:41.469251Z" + }, + "papermill": { + "duration": 0.040632, + "end_time": "2025-12-19T10:23:41.475176", + "exception": false, + "start_time": "2025-12-19T10:23:41.434544", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# reporting_rate_dataset <- reporting_rate_dataset %>%\n", + "data_to_plot <- data_to_plot %>%\n", + " mutate(\n", + " REPORTING_RATE_CATEGORY = cut(\n", + " REPORTING_RATE,\n", + " breaks = full_breaks,\n", + " labels = labels,\n", + " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", + " include.lowest = TRUE\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "a10237f8", + "metadata": { + "papermill": { + "duration": 0.000102, + "end_time": "2025-12-19T10:23:41.475483", + "exception": false, + "start_time": "2025-12-19T10:23:41.475381", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3. 
Pick appropriate palette" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ee6e077", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.480216Z", + "iopub.status.busy": "2025-12-19T10:23:41.478061Z", + "iopub.status.idle": "2025-12-19T10:23:41.513805Z", + "shell.execute_reply": "2025-12-19T10:23:41.511268Z" + }, + "papermill": { + "duration": 0.04138, + "end_time": "2025-12-19T10:23:41.516984", + "exception": false, + "start_time": "2025-12-19T10:23:41.475604", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Count nr of breaks\n", + "nr_of_colors <- length(labels)\n", + "\n", + "# nr_of_colors\n", + "palette_to_use <- get_range_from_count(nr_of_colors)\n", + "\n", + "# Need to make palettes as named vectors so that scale_color_manual() and scale_fill_manual() can use them properly\n", + "# Note: need to reverse order of labels to match the palette order \"meaning\" (red \"\" should correcpond to lowest value)\n", + "names(palette_to_use) <- rev(labels)\n", + "\n", + "palette_to_use\n" + ] + }, + { + "cell_type": "markdown", + "id": "d08c0c14", + "metadata": { + "papermill": { + "duration": 0.000099, + "end_time": "2025-12-19T10:23:41.517267", + "exception": false, + "start_time": "2025-12-19T10:23:41.517168", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.2. Plots" + ] + }, + { + "cell_type": "markdown", + "id": "b7781198", + "metadata": { + "papermill": { + "duration": 0.000056, + "end_time": "2025-12-19T10:23:41.517425", + "exception": false, + "start_time": "2025-12-19T10:23:41.517369", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.1 Scatter plot of RR over time (by ADM2)\n", + "With this we can see the actula numbners (although cannot tell which ADM2 have low values)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78d92e4a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.522513Z", + "iopub.status.busy": "2025-12-19T10:23:41.520272Z", + "iopub.status.idle": "2025-12-19T10:23:42.935181Z", + "shell.execute_reply": "2025-12-19T10:23:42.932661Z" + }, + "papermill": { + "duration": 1.456494, + "end_time": "2025-12-19T10:23:42.974012", + "exception": false, + "start_time": "2025-12-19T10:23:41.517518", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Line point plot faceted by YEAR\n", + "ggplot(data = data_to_plot) +\n", + " geom_line(aes(x = MONTH,\n", + " y = REPORTING_RATE,\n", + " group = ADM2_ID,\n", + " color = REPORTING_RATE_CATEGORY), \n", + " alpha = 0.3,\n", + " show.legend = FALSE\n", + " ) +\n", + " geom_point(aes(x = MONTH,\n", + " y = REPORTING_RATE,\n", + " group = ADM2_ID,\n", + " color = REPORTING_RATE_CATEGORY)) + \n", + " facet_grid(~YEAR) + \n", + " scale_color_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\",\n", + " name = \"Reporting Rate Categories\"\n", + " ) +\n", + " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", + " scale_y_continuous(\n", + " breaks = c(0, break_vals), # 🎨 NEW dynamic colors & breaks!\n", + " # Dynamically set max value to fit actual data (do show values >1 if present)\n", + " limits = c(0, max(data_to_plot$REPORTING_RATE, na.rm = TRUE) + 0.1)\n", + " ) +\n", + " labs(\n", + " title = \"Reporting Rate (Dataset)\",\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " x = \"Month\",\n", + " y = \"Reporting Rate\\n(Dataset)\" ) +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", + " legend.position = \"none\",\n", + " legend.title = element_blank(),\n", + " # legend.key.width = unit(3, \"cm\"),\n", + " # legend.key.height = unit(0.25, 
\"cm\"),\n", + " axis.title.y = element_blank(),\n", + " panel.grid.minor = element_blank(),\n", + " panel.grid.major.x = element_blank(),\n", + " strip.placement = \"outside\",\n", + " strip.text = element_text(face = \"bold\", size = 10)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f47064a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:42.978498Z", + "iopub.status.busy": "2025-12-19T10:23:42.976659Z", + "iopub.status.idle": "2025-12-19T10:23:44.087244Z", + "shell.execute_reply": "2025-12-19T10:23:44.085182Z" + }, + "papermill": { + "duration": 1.11568, + "end_time": "2025-12-19T10:23:44.089891", + "exception": false, + "start_time": "2025-12-19T10:23:42.974211", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Export plot as PNG\n", + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_linepoint_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " height = 15,\n", + " width = 45,\n", + " units = \"cm\",\n", + " bg = \"white\",\n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (linepoint) saved to: {file.path(output_location, output_filename)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "22bb6431", + "metadata": { + "papermill": { + "duration": 0.000147, + "end_time": "2025-12-19T10:23:44.090320", + "exception": false, + "start_time": "2025-12-19T10:23:44.090173", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.2 Heatmap plot of RR over time (by ADM2)\n", + "This is less good for identifying actual values, but allows to see which ADM2 have lower values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2445f2a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:44.094508Z", + "iopub.status.busy": "2025-12-19T10:23:44.092577Z", + "iopub.status.idle": "2025-12-19T10:23:46.262550Z", + "shell.execute_reply": "2025-12-19T10:23:46.259633Z" + }, + "papermill": { + "duration": 2.21647, + "end_time": "2025-12-19T10:23:46.306927", + "exception": false, + "start_time": "2025-12-19T10:23:44.090457", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Tile plot faceted by YEAR\n", + "ggplot(data = data_to_plot) +\n", + " geom_tile(aes(x = MONTH,\n", + " y = fct_rev(ADM2_NAME),\n", + " fill = REPORTING_RATE_CATEGORY), \n", + " color = \"white\",\n", + " show.legend = TRUE\n", + " ) +\n", + " scale_fill_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\",\n", + " name = \"Reporting Rate: \"\n", + " ) +\n", + " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", + " labs(\n", + " title = \"Reporting Rate (Dataset)\",\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " x = \"Month\"\n", + " ) +\n", + " facet_grid(rows = vars(ADM1_NAME), cols = vars(YEAR), \n", + " scales = \"free_y\", space = \"free_y\",\n", + " switch = \"y\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " legend.key.height = unit(0.25, \"cm\"),\n", + " axis.text.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " panel.grid.minor = element_blank(),\n", + " panel.grid.major = element_blank(),\n", + " strip.placement = \"outside\", \n", + " strip.text = element_text(face = \"bold\", size = 10)\n", + " ) +\n", + " guides(fill = guide_legend(nrow = 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbe73312", + "metadata": { + 
"execution": { + "iopub.execute_input": "2025-12-19T10:23:46.311134Z", + "iopub.status.busy": "2025-12-19T10:23:46.309412Z", + "iopub.status.idle": "2025-12-19T10:23:48.286664Z", + "shell.execute_reply": "2025-12-19T10:23:48.284571Z" + }, + "papermill": { + "duration": 1.982105, + "end_time": "2025-12-19T10:23:48.289215", + "exception": false, + "start_time": "2025-12-19T10:23:46.307110", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Export plot as PNG\n", + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_heatmap_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " width = 20, height = 30, units = \"cm\", \n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (heatmap) saved to: {file.path(output_location, output_filename)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "3eef141a", + "metadata": { + "papermill": { + "duration": 0.000164, + "end_time": "2025-12-19T10:23:48.289656", + "exception": false, + "start_time": "2025-12-19T10:23:48.289492", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.3. 
MAP of Reporting Rate - by month" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83be9c68", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:48.294030Z", + "iopub.status.busy": "2025-12-19T10:23:48.292256Z", + "iopub.status.idle": "2025-12-19T10:23:53.205670Z", + "shell.execute_reply": "2025-12-19T10:23:53.203104Z" + }, + "papermill": { + "duration": 4.958481, + "end_time": "2025-12-19T10:23:53.248341", + "exception": false, + "start_time": "2025-12-19T10:23:48.289860", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Choropleth map with reporting rate data by ADM2\n", + "ggplot(data = data_to_plot) +\n", + " geom_sf(aes(\n", + " fill = REPORTING_RATE_CATEGORY,\n", + " geometry = geometry), \n", + " color = \"white\",\n", + " size = 0.01) +\n", + " scale_fill_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\",\n", + " ) +\n", + " theme_void() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " legend.key.height = unit(0.25, \"cm\")\n", + " ) +\n", + " labs(\n", + " title = paste(\"Reporting Rate (Dataset)\"),\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " ) +\n", + " facet_grid(\n", + " rows = vars(YEAR), \n", + " cols = vars(MONTH),\n", + " switch = \"both\") +\n", + " guides(fill = guide_legend(nrow = 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e877671d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:53.252696Z", + "iopub.status.busy": "2025-12-19T10:23:53.250972Z", + "iopub.status.idle": "2025-12-19T10:23:56.748868Z", + "shell.execute_reply": "2025-12-19T10:23:56.746990Z" + }, + "papermill": { + "duration": 3.502689, + "end_time": "2025-12-19T10:23:56.751218", + "exception": false, + 
"start_time": "2025-12-19T10:23:53.248529", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " width = 50, height = 20, units = \"cm\", \n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "f0894be9", + "metadata": { + "papermill": { + "duration": 0.000166, + "end_time": "2025-12-19T10:23:56.751636", + "exception": false, + "start_time": "2025-12-19T10:23:56.751470", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.4. 
MAP of Reporting Rate - by YEAR\n", + "Use average (`mean()`) of monthly values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb1995ab", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:56.755998Z", + "iopub.status.busy": "2025-12-19T10:23:56.753982Z", + "iopub.status.idle": "2025-12-19T10:23:56.788391Z", + "shell.execute_reply": "2025-12-19T10:23:56.786447Z" + }, + "papermill": { + "duration": 0.039325, + "end_time": "2025-12-19T10:23:56.791143", + "exception": false, + "start_time": "2025-12-19T10:23:56.751818", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "data_to_plot_year <- data_to_plot %>%\n", + " group_by(geometry, ADM2_ID, ADM2_NAME, ADM1_NAME, YEAR) %>%\n", + " summarise(\n", + " REPORTING_RATE = mean(REPORTING_RATE, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " # Calculate REPORTING_RATE_CATEGORY again based on the yearly average\n", + " mutate(\n", + " REPORTING_RATE_CATEGORY = cut(\n", + " REPORTING_RATE,\n", + " breaks = full_breaks,\n", + " labels = labels,\n", + " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", + " include.lowest = TRUE\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd32b0cf", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:56.795010Z", + "iopub.status.busy": "2025-12-19T10:23:56.793453Z", + "iopub.status.idle": "2025-12-19T10:23:57.582261Z", + "shell.execute_reply": "2025-12-19T10:23:57.579294Z" + }, + "papermill": { + "duration": 0.798686, + "end_time": "2025-12-19T10:23:57.590023", + "exception": false, + "start_time": "2025-12-19T10:23:56.791337", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Choropleth map with reporting rate data by ADM2\n", + "ggplot(data = data_to_plot_year) +\n", + " geom_sf(aes(\n", + " fill = 
REPORTING_RATE_CATEGORY,\n", + " geometry = geometry), \n", + " color = \"white\",\n", + " size = 0.01) +\n", + " scale_fill_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\"\n", + " ) +\n", + " theme_void() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " ) +\n", + " labs(\n", + " title = \"Reporting Rate (Dataset) - mean per Year\",\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " fill = \"Reporting Rate: \"\n", + " ) +\n", + " facet_grid(\n", + " cols = vars(YEAR)\n", + " ) +\n", + " guides(fill = guide_legend(nrow = 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0430641e", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:57.594096Z", + "iopub.status.busy": "2025-12-19T10:23:57.592357Z", + "iopub.status.idle": "2025-12-19T10:23:58.516754Z", + "shell.execute_reply": "2025-12-19T10:23:58.514785Z" + }, + "papermill": { + "duration": 0.928933, + "end_time": "2025-12-19T10:23:58.519148", + "exception": false, + "start_time": "2025-12-19T10:23:57.590215", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_year_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " width = 31, height = 13, units = \"cm\", \n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "8c3bdca4", + "metadata": { + "papermill": { + "duration": 0.000126, + "end_time": 
"2025-12-19T10:23:58.519515", + "exception": false, + "start_time": "2025-12-19T10:23:58.519389", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### The End :)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8a62ec5", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:58.523680Z", + "iopub.status.busy": "2025-12-19T10:23:58.522024Z", + "iopub.status.idle": "2025-12-19T10:23:58.733860Z", + "shell.execute_reply": "2025-12-19T10:23:58.731929Z" + }, + "papermill": { + "duration": 0.216448, + "end_time": "2025-12-19T10:23:58.736160", + "exception": false, + "start_time": "2025-12-19T10:23:58.519712", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(\"Reporting Rate (Dataset) report notebook completed successfully!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, - "nbformat": 4, - "nbformat_minor": 5 + "papermill": { + "default_parameters": {}, + "duration": 32.950872, + "end_time": "2025-12-19T10:23:59.058917", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs/snt_dhis2_reporting_rate_dataset_report_OUTPUT_2025-12-19_102325.ipynb", + "parameters": {}, + "start_time": "2025-12-19T10:23:26.108045", + "version": "2.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r new file 
mode 100644 index 0000000..285727c --- /dev/null +++ b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r @@ -0,0 +1,109 @@ +select_routine_dataset_name_dataset <- function(ROUTINE_FILE, COUNTRY_CODE, config_json) { + if (ROUTINE_FILE == glue::glue("{COUNTRY_CODE}_routine.parquet")) { + return(config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED) + } + config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION +} + + +load_routine_data_dataset <- function(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE, fixed_cols_rr) { + dhis2_routine <- tryCatch({ + get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) + }, error = function(e) { + msg <- paste("Error while loading DHIS2 routine data file for: ", COUNTRY_CODE, conditionMessage(e)) + cat(msg) + stop(msg) + }) + + dhis2_routine <- dhis2_routine %>% dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric)) + dhis2_routine <- dhis2_routine %>% dplyr::select(dplyr::any_of(fixed_cols_rr)) %>% dplyr::distinct() + + log_msg(glue::glue( + "DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}" + )) + dhis2_routine +} + + +load_reporting_data_dataset <- function(config_json, COUNTRY_CODE) { + dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED + file_name <- paste0(COUNTRY_CODE, "_reporting.parquet") + + dhis2_reporting <- tryCatch({ + get_latest_dataset_file_in_memory(dataset_name, file_name) + }, error = function(e) { + msg <- paste("[ERROR] Error while loading DHIS2 dataset reporting rates file for: ", COUNTRY_CODE, conditionMessage(e)) + cat(msg) + stop(msg) + }) + dhis2_reporting <- dhis2_reporting %>% dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric)) + + log_msg(paste0( + "DHIS2 Datatset reporting data loaded from file `", file_name, "` (from dataset : `", dataset_name, "`). 
Dataframe dimensions: ", + paste(dim(dhis2_reporting), collapse = ", ") + )) + dhis2_reporting +} + + +compute_reporting_rate_dataset <- function(dhis2_reporting, REPORTING_RATE_PRODUCT_ID, COUNTRY_CODE) { + if (all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) { + dhis2_reporting <- dhis2_reporting %>% dplyr::filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) + } else { + log_msg(glue::glue( + "🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data. Skipping filtering." + ), level = "warning") + } + + dhis2_reporting_wide <- dhis2_reporting %>% tidyr::pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE) + + dupl_ou_period <- dhis2_reporting_wide %>% + dplyr::group_by(OU_ID, PERIOD) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::ungroup() %>% + dplyr::select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, dplyr::ends_with("REPORTS")) + + if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0, 1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0, 1))) { + dhis2_reporting_wide <- dhis2_reporting_wide %>% + dplyr::group_by(PERIOD, OU_ID) %>% + dplyr::mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>% + dplyr::ungroup() %>% + dplyr::filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>% + dplyr::select(-ACTUAL_REPORTS_deduplicated) + } + + if (COUNTRY_CODE == "NER") { + dhis2_reporting_wide <- dhis2_reporting_wide %>% + dplyr::mutate( + ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS), + EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS) + ) + } + + dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>% + dplyr::group_by(PERIOD, YEAR, MONTH, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID) %>% + dplyr::summarise( + ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE), + EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE), + .groups = "drop" + ) + + 
dhis2_reporting_wide_adm2 %>% + dplyr::mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS) +} + + +export_reporting_rate_dataset <- function(reporting_rate_dataset, DATA_PATH, COUNTRY_CODE) { + output_data_path <- file.path(DATA_PATH, "reporting_rate") + if (!dir.exists(output_data_path)) { + dir.create(output_data_path, recursive = TRUE) + } + + file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataset.parquet")) + arrow::write_parquet(reporting_rate_dataset, file_path) + log_msg(glue::glue("Exported : {file_path}")) + + file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataset.csv")) + write.csv(reporting_rate_dataset, file_path, row.names = FALSE) + log_msg(glue::glue("Exported : {file_path}")) +} diff --git a/snt_dhis2_reporting_rate_dataelement/pipeline.py b/snt_dhis2_reporting_rate_dataelement/pipeline.py index 88a2d2e..da862da 100644 --- a/snt_dhis2_reporting_rate_dataelement/pipeline.py +++ b/snt_dhis2_reporting_rate_dataelement/pipeline.py @@ -15,18 +15,36 @@ @pipeline("snt_dhis2_reporting_rate_dataelement") @parameter( - "routine_data_choice", - name="Routine data source", - help="Select which routine data to use. " - "'raw' loads formatted routine data, " - "'imputed' loads outliers-imputed routine data, " - "'outliers_removed' loads routine data with outliers removed.", + "outliers_method", + name="Outliers detection method", + help="Specify which method was used to detect outliers in routine data. 
" + "Chose 'Routine data (Raw)' to use raw routine data.", multiple=False, - choices=["raw", "imputed", "outliers_removed"], + choices=[ + "Routine data (Raw)", + "Mean (Classic)", + "Median (Classic)", + "IQR (Classic)", + "Trend (PATH)", + "MG Partial (MagicGlasses2)", + "MG Complete (MagicGlasses2)", + ], type=str, - default="imputed", + default="Routine data (Raw)", required=True, ) +@parameter( + "use_removed_outliers", + name="Use routine data with outliers removed (else: uses imputed)", + help="Enable this option to use routine data after outliers have been removed, " + "based on the outlier detection method you selected above. " + " If you leave this off, the pipeline will instead use either:" + " A) the imputed routine data (where outlier values have been replaced), or" + " B) the raw routine data, if you chose 'Routine data (Raw)' as your outlier processing method.", + type=bool, + default=False, + required=False, +) @parameter( "activity_indicators", name="Facility Activity indicators", @@ -86,7 +104,8 @@ required=False, ) def snt_dhis2_reporting_rate_dataelement( - routine_data_choice: str, + outliers_method: str, + use_removed_outliers: bool, activity_indicators: str, volume_activity_indicators: str, dataelement_method_denominator: str, @@ -116,9 +135,7 @@ def snt_dhis2_reporting_rate_dataelement( country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] # Build parameters dict and save to JSON in all cases (like other pipelines) - routine_file = resolve_routine_filename( - country_code=country_code, routine_data_choice=routine_data_choice - ) + routine_file = f"{country_code}{resolve_routine_filename(outliers_method, use_removed_outliers)}" nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, @@ -136,7 +153,7 @@ def snt_dhis2_reporting_rate_dataelement( current_run.log_info(f"Saved pipeline parameters to {parameters_file}") if not run_report_only: - if routine_data_choice == "raw": + if outliers_method == "Routine data 
(Raw)": ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] else: ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] @@ -186,18 +203,44 @@ def snt_dhis2_reporting_rate_dataelement( raise -def resolve_routine_filename(country_code: str, routine_data_choice: str) -> str: - """Returns the canonical routine filename for a routine data choice.""" - if routine_data_choice == "raw": - return f"{country_code}_routine.parquet" +def resolve_routine_filename(outliers_method: str, is_removed: bool) -> str: + """Returns the routine data filename based on the selected outliers method. + + Parameters + ---------- + outliers_method : str + The method used for outlier removal. + is_removed : bool + Whether to return the filename for removed outliers or imputed outliers. + + Returns + ------- + str + The filename corresponding to the selected outliers method. + + Raises + ------ + ValueError + If the outliers method is unknown. + """ + if outliers_method == "Routine data (Raw)": + return "_routine.parquet" + + method_suffix_map = { + "Mean (Classic)": "mean", + "Median (Classic)": "median", + "IQR (Classic)": "iqr", + "Trend (PATH)": "trend", + "MG Partial (MagicGlasses2)": "mg-partial", + "MG Complete (MagicGlasses2)": "mg-complete", + } - if routine_data_choice == "imputed": - return f"{country_code}_routine_outliers_imputed.parquet" - - if routine_data_choice == "outliers_removed": - return f"{country_code}_routine_outliers_removed.parquet" + try: + suffix = method_suffix_map[outliers_method] + except KeyError as err: + raise ValueError(f"Unknown outliers method: {outliers_method}") from err - raise ValueError(f"Unknown routine data choice: {routine_data_choice}") + return f"_routine_outliers{'_removed' if is_removed else '_imputed'}.parquet" if __name__ == "__main__": diff --git a/snt_dhis2_reporting_rate_dataset/pipeline.py b/snt_dhis2_reporting_rate_dataset/pipeline.py index b52c32c..77926dc 100644 --- 
a/snt_dhis2_reporting_rate_dataset/pipeline.py +++ b/snt_dhis2_reporting_rate_dataset/pipeline.py @@ -15,18 +15,36 @@ @pipeline("snt_dhis2_reporting_rate_dataset") @parameter( - "routine_data_choice", - name="Routine data source", - help="Select which routine data to use. " - "'raw' loads formatted routine data, " - "'imputed' loads outliers-imputed routine data, " - "'outliers_removed' loads routine data with outliers removed.", + "outliers_method", + name="Outlier processing method", + help="Specify which method was used to detect outliers in routine data. " + "Chose 'Routine data (Raw)' to use raw routine data.", multiple=False, - choices=["raw", "imputed", "outliers_removed"], + choices=[ + "Routine data (Raw)", + "Mean (Classic)", + "Median (Classic)", + "IQR (Classic)", + "Trend (PATH)", + "MG Partial (MagicGlasses2)", + "MG Complete (MagicGlasses2)", + ], type=str, - default="imputed", + default="Routine data (Raw)", required=True, ) +@parameter( + "use_removed_outliers", + name="Use routine data with outliers removed (else: uses imputed)", + help="Enable this option to use routine data after outliers have been removed, " + "based on the outlier detection method you selected above. " + " If you leave this off, the pipeline will instead use either:" + " A) the imputed routine data (where outlier values have been replaced), or" + " B) the raw routine data, if you chose 'Routine data (Raw)' as your outlier processing method.", + type=bool, + default=False, + required=False, +) @parameter( "run_report_only", name="Run reporting notebook only", @@ -48,7 +66,7 @@ required=False, ) def snt_dhis2_reporting_rate_dataset( - routine_data_choice: str, run_report_only: bool, pull_scripts: bool + outliers_method: list, use_removed_outliers: bool, run_report_only: bool, pull_scripts: bool ): """Orchestration function. 
Calls other functions within the pipeline.""" if pull_scripts: @@ -72,18 +90,19 @@ def snt_dhis2_reporting_rate_dataset( country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] if not run_report_only: - routine_file = resolve_routine_filename( - country_code=country_code, routine_data_choice=routine_data_choice - ) - if routine_data_choice == "raw": + routine_file = resolve_routine_filename(outliers_method, use_removed_outliers) + routine_file = f"{country_code}{routine_file}" + if outliers_method == "Routine data (Raw)": ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] else: ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] + # Check the file exists in the dataset if not dataset_file_exists(ds_id=ds_outliers_id, filename=routine_file): current_run.log_warning( - f"Routine file {routine_file} was not found in dataset {ds_outliers_id}. " - "Perhaps the outliers-imputation pipeline has not been run yet. Processing cannot continue." + f"Routine file {routine_file} not found in the dataset {ds_outliers_id}, " + "perhaps the outliers imputation pipeline has not been run yet. " + "Processing cannot continue." ) return @@ -93,7 +112,7 @@ def snt_dhis2_reporting_rate_dataset( } params_file = save_pipeline_parameters( - pipeline_name="snt_dhis2_reporting_rate_dataset", + pipeline_name="snt_dhis2_reporting_rate_dataelement", parameters=nb_parameters, output_path=data_path, country_code=country_code, @@ -134,18 +153,44 @@ def snt_dhis2_reporting_rate_dataset( raise -def resolve_routine_filename(country_code: str, routine_data_choice: str) -> str: - """Returns the canonical routine filename for a routine data choice.""" - if routine_data_choice == "raw": - return f"{country_code}_routine.parquet" +def resolve_routine_filename(outliers_method: str, is_removed: bool) -> str: + """Returns the routine data filename based on the selected outliers method. 
+ + Parameters + ---------- + outliers_method : str + The method used for outlier removal. + is_removed : bool + Whether to return the filename for removed outliers or imputed outliers. + + Returns + ------- + str + The filename corresponding to the selected outliers method. + + Raises + ------ + ValueError + If the outliers method is unknown. + """ + if outliers_method == "Routine data (Raw)": + return "_routine.parquet" + + method_suffix_map = { + "Mean (Classic)": "mean", + "Median (Classic)": "median", + "IQR (Classic)": "iqr", + "Trend (PATH)": "trend", + "MG Partial (MagicGlasses2)": "mg-partial", + "MG Complete (MagicGlasses2)": "mg-complete", + } - if routine_data_choice == "imputed": - return f"{country_code}_routine_outliers_imputed.parquet" - - if routine_data_choice == "outliers_removed": - return f"{country_code}_routine_outliers_removed.parquet" + try: + suffix = method_suffix_map[outliers_method] + except KeyError as err: + raise ValueError(f"Unknown outliers method: {outliers_method}") from err - raise ValueError(f"Unknown routine data choice: {routine_data_choice}") + return f"_routine_outliers{'_removed' if is_removed else '_imputed'}.parquet" if __name__ == "__main__": From eb3113ed0f4d4b6de44e6f4782fc143005db48cc Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 3 Apr 2026 12:26:50 +0200 Subject: [PATCH 03/18] last fix --- .../pipeline.py | 89 +++++------------ snt_dhis2_reporting_rate_dataset/pipeline.py | 97 +++++-------------- 2 files changed, 49 insertions(+), 137 deletions(-) diff --git a/snt_dhis2_reporting_rate_dataelement/pipeline.py b/snt_dhis2_reporting_rate_dataelement/pipeline.py index da862da..88a2d2e 100644 --- a/snt_dhis2_reporting_rate_dataelement/pipeline.py +++ b/snt_dhis2_reporting_rate_dataelement/pipeline.py @@ -15,36 +15,18 @@ @pipeline("snt_dhis2_reporting_rate_dataelement") @parameter( - "outliers_method", - name="Outliers detection method", - help="Specify which method was used to detect outliers in routine data. 
" - "Chose 'Routine data (Raw)' to use raw routine data.", + "routine_data_choice", + name="Routine data source", + help="Select which routine data to use. " + "'raw' loads formatted routine data, " + "'imputed' loads outliers-imputed routine data, " + "'outliers_removed' loads routine data with outliers removed.", multiple=False, - choices=[ - "Routine data (Raw)", - "Mean (Classic)", - "Median (Classic)", - "IQR (Classic)", - "Trend (PATH)", - "MG Partial (MagicGlasses2)", - "MG Complete (MagicGlasses2)", - ], + choices=["raw", "imputed", "outliers_removed"], type=str, - default="Routine data (Raw)", + default="imputed", required=True, ) -@parameter( - "use_removed_outliers", - name="Use routine data with outliers removed (else: uses imputed)", - help="Enable this option to use routine data after outliers have been removed, " - "based on the outlier detection method you selected above. " - " If you leave this off, the pipeline will instead use either:" - " A) the imputed routine data (where outlier values have been replaced), or" - " B) the raw routine data, if you chose 'Routine data (Raw)' as your outlier processing method.", - type=bool, - default=False, - required=False, -) @parameter( "activity_indicators", name="Facility Activity indicators", @@ -104,8 +86,7 @@ required=False, ) def snt_dhis2_reporting_rate_dataelement( - outliers_method: str, - use_removed_outliers: bool, + routine_data_choice: str, activity_indicators: str, volume_activity_indicators: str, dataelement_method_denominator: str, @@ -135,7 +116,9 @@ def snt_dhis2_reporting_rate_dataelement( country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] # Build parameters dict and save to JSON in all cases (like other pipelines) - routine_file = f"{country_code}{resolve_routine_filename(outliers_method, use_removed_outliers)}" + routine_file = resolve_routine_filename( + country_code=country_code, routine_data_choice=routine_data_choice + ) nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), 
"ROUTINE_FILE": routine_file, @@ -153,7 +136,7 @@ def snt_dhis2_reporting_rate_dataelement( current_run.log_info(f"Saved pipeline parameters to {parameters_file}") if not run_report_only: - if outliers_method == "Routine data (Raw)": + if routine_data_choice == "raw": ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] else: ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] @@ -203,44 +186,18 @@ def snt_dhis2_reporting_rate_dataelement( raise -def resolve_routine_filename(outliers_method: str, is_removed: bool) -> str: - """Returns the routine data filename based on the selected outliers method. - - Parameters - ---------- - outliers_method : str - The method used for outlier removal. - is_removed : bool - Whether to return the filename for removed outliers or imputed outliers. - - Returns - ------- - str - The filename corresponding to the selected outliers method. - - Raises - ------ - ValueError - If the outliers method is unknown. 
- """ - if outliers_method == "Routine data (Raw)": - return "_routine.parquet" - - method_suffix_map = { - "Mean (Classic)": "mean", - "Median (Classic)": "median", - "IQR (Classic)": "iqr", - "Trend (PATH)": "trend", - "MG Partial (MagicGlasses2)": "mg-partial", - "MG Complete (MagicGlasses2)": "mg-complete", - } +def resolve_routine_filename(country_code: str, routine_data_choice: str) -> str: + """Returns the canonical routine filename for a routine data choice.""" + if routine_data_choice == "raw": + return f"{country_code}_routine.parquet" - try: - suffix = method_suffix_map[outliers_method] - except KeyError as err: - raise ValueError(f"Unknown outliers method: {outliers_method}") from err + if routine_data_choice == "imputed": + return f"{country_code}_routine_outliers_imputed.parquet" + + if routine_data_choice == "outliers_removed": + return f"{country_code}_routine_outliers_removed.parquet" - return f"_routine_outliers{'_removed' if is_removed else '_imputed'}.parquet" + raise ValueError(f"Unknown routine data choice: {routine_data_choice}") if __name__ == "__main__": diff --git a/snt_dhis2_reporting_rate_dataset/pipeline.py b/snt_dhis2_reporting_rate_dataset/pipeline.py index 77926dc..b52c32c 100644 --- a/snt_dhis2_reporting_rate_dataset/pipeline.py +++ b/snt_dhis2_reporting_rate_dataset/pipeline.py @@ -15,36 +15,18 @@ @pipeline("snt_dhis2_reporting_rate_dataset") @parameter( - "outliers_method", - name="Outlier processing method", - help="Specify which method was used to detect outliers in routine data. " - "Chose 'Routine data (Raw)' to use raw routine data.", + "routine_data_choice", + name="Routine data source", + help="Select which routine data to use. 
" + "'raw' loads formatted routine data, " + "'imputed' loads outliers-imputed routine data, " + "'outliers_removed' loads routine data with outliers removed.", multiple=False, - choices=[ - "Routine data (Raw)", - "Mean (Classic)", - "Median (Classic)", - "IQR (Classic)", - "Trend (PATH)", - "MG Partial (MagicGlasses2)", - "MG Complete (MagicGlasses2)", - ], + choices=["raw", "imputed", "outliers_removed"], type=str, - default="Routine data (Raw)", + default="imputed", required=True, ) -@parameter( - "use_removed_outliers", - name="Use routine data with outliers removed (else: uses imputed)", - help="Enable this option to use routine data after outliers have been removed, " - "based on the outlier detection method you selected above. " - " If you leave this off, the pipeline will instead use either:" - " A) the imputed routine data (where outlier values have been replaced), or" - " B) the raw routine data, if you chose 'Routine data (Raw)' as your outlier processing method.", - type=bool, - default=False, - required=False, -) @parameter( "run_report_only", name="Run reporting notebook only", @@ -66,7 +48,7 @@ required=False, ) def snt_dhis2_reporting_rate_dataset( - outliers_method: list, use_removed_outliers: bool, run_report_only: bool, pull_scripts: bool + routine_data_choice: str, run_report_only: bool, pull_scripts: bool ): """Orchestration function. 
Calls other functions within the pipeline.""" if pull_scripts: @@ -90,19 +72,18 @@ def snt_dhis2_reporting_rate_dataset( country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] if not run_report_only: - routine_file = resolve_routine_filename(outliers_method, use_removed_outliers) - routine_file = f"{country_code}{routine_file}" - if outliers_method == "Routine data (Raw)": + routine_file = resolve_routine_filename( + country_code=country_code, routine_data_choice=routine_data_choice + ) + if routine_data_choice == "raw": ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] else: ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] - # Check the file exists in the dataset if not dataset_file_exists(ds_id=ds_outliers_id, filename=routine_file): current_run.log_warning( - f"Routine file {routine_file} not found in the dataset {ds_outliers_id}, " - "perhaps the outliers imputation pipeline has not been run yet. " - "Processing cannot continue." + f"Routine file {routine_file} was not found in dataset {ds_outliers_id}. " + "Perhaps the outliers-imputation pipeline has not been run yet. Processing cannot continue." ) return @@ -112,7 +93,7 @@ def snt_dhis2_reporting_rate_dataset( } params_file = save_pipeline_parameters( - pipeline_name="snt_dhis2_reporting_rate_dataelement", + pipeline_name="snt_dhis2_reporting_rate_dataset", parameters=nb_parameters, output_path=data_path, country_code=country_code, @@ -153,44 +134,18 @@ def snt_dhis2_reporting_rate_dataset( raise -def resolve_routine_filename(outliers_method: str, is_removed: bool) -> str: - """Returns the routine data filename based on the selected outliers method. - - Parameters - ---------- - outliers_method : str - The method used for outlier removal. - is_removed : bool - Whether to return the filename for removed outliers or imputed outliers. - - Returns - ------- - str - The filename corresponding to the selected outliers method. 
- - Raises - ------ - ValueError - If the outliers method is unknown. - """ - if outliers_method == "Routine data (Raw)": - return "_routine.parquet" - - method_suffix_map = { - "Mean (Classic)": "mean", - "Median (Classic)": "median", - "IQR (Classic)": "iqr", - "Trend (PATH)": "trend", - "MG Partial (MagicGlasses2)": "mg-partial", - "MG Complete (MagicGlasses2)": "mg-complete", - } +def resolve_routine_filename(country_code: str, routine_data_choice: str) -> str: + """Returns the canonical routine filename for a routine data choice.""" + if routine_data_choice == "raw": + return f"{country_code}_routine.parquet" - try: - suffix = method_suffix_map[outliers_method] - except KeyError as err: - raise ValueError(f"Unknown outliers method: {outliers_method}") from err + if routine_data_choice == "imputed": + return f"{country_code}_routine_outliers_imputed.parquet" + + if routine_data_choice == "outliers_removed": + return f"{country_code}_routine_outliers_removed.parquet" - return f"_routine_outliers{'_removed' if is_removed else '_imputed'}.parquet" + raise ValueError(f"Unknown routine data choice: {routine_data_choice}") if __name__ == "__main__": From 674a41d20f2f870deefb89965155f19b71a443fc Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 10 Apr 2026 09:58:07 +0200 Subject: [PATCH 04/18] less black boxish pipelines --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 2132 +++++++++-------- .../snt_dhis2_reporting_rate_dataset.ipynb | 89 +- 2 files changed, 1185 insertions(+), 1036 deletions(-) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index 8f46501..8a94d83 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -1,1068 +1,1152 @@ { - "cells": [ 
- { - "cell_type": "markdown", - "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2026-01-16T10:22:53.011120", - "exception": false, - "start_time": "2026-01-16T10:22:53.010947", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# Data Element reporting rate: based on reporting of one or more indicators\n", - "Partially following methods by WHO and as per Diallo (2025) paper\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "For this method the user is allowed to **chose** how to calculate both the **numerator** and **denominator**.
\n", - "\n", - "Specifically: \n", - "\n", - "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "
\n", - "
\n", - "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", - "
\n", - "
\n", - "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", - " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", - " * Filename: `XXX_reporting_rate_dataelement.`" - ] - }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 0.000228, - "end_time": "2026-01-16T10:22:53.014752", - "exception": false, - "start_time": "2026-01-16T10:22:53.014524", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "papermill": { - "duration": 63.150489, - "end_time": "2026-01-16T10:23:56.165530", - "exception": false, - "start_time": "2026-01-16T10:22:53.015041", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", - "\n", - "# Load libraries\n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - 
"openhexa <- import(\"openhexa.sdk\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00011, - "end_time": "2026-01-16T10:23:56.165873", - "exception": false, - "start_time": "2026-01-16T10:23:56.165763", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 1.1. Fallback parameters values\n", - "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", - "The code cell below here provides fallback paramater values needed when running this notebook locally." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "papermill": { - "duration": 0.033954, - "end_time": "2026-01-16T10:23:56.199937", - "exception": false, - "start_time": "2026-01-16T10:23:56.165983", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Current options: \n", - "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", - "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", - "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", - "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", - "\n", - "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", - "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", - "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", - "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" - ] - }, - { - "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 0.000095, - "end_time": "2026-01-16T10:23:56.200231", - "exception": false, - "start_time": "2026-01-16T10:23:56.200136", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 1.2. 
Load and check `snt config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "papermill": { - "duration": 0.521572, - "end_time": "2026-01-16T10:23:56.721932", - "exception": false, - "start_time": "2026-01-16T10:23:56.200360", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "papermill": { - "duration": 0.033003, - "end_time": "2026-01-16T10:23:56.755117", - "exception": false, - "start_time": "2026-01-16T10:23:56.722114", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", - "\n", - "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", - "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", - "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ] - }, - { - 
"cell_type": "markdown", - "id": "8bf4a8bb", - "metadata": {}, - "source": [ - "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18b40207", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!length(ACTIVITY_INDICATORS) > 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 0.000093, - "end_time": "2026-01-16T10:23:56.779812", - "exception": false, - "start_time": "2026-01-16T10:23:56.779719", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 0.000069, - "end_time": "2026-01-16T10:23:56.779987", - "exception": false, - "start_time": "2026-01-16T10:23:56.779918", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.1. Routine data (DHIS2) \n", - "**Note on pipeline behaviour**:
\n", - "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." - ] + "cells": [ + { + "cell_type": "markdown", + "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2026-01-16T10:22:53.011120", + "exception": false, + "start_time": "2026-01-16T10:22:53.010947", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "papermill": { - "duration": 2.018878, - "end_time": "2026-01-16T10:23:58.798963", - "exception": false, - "start_time": "2026-01-16T10:23:56.780085", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "rountine_dataset_name <- select_routine_dataset_name_dataelement(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", - "dhis2_routine <- load_routine_data_dataelement(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE)\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)\n" - ] + "tags": [] + }, + "source": [ + "# Data Element reporting rate: based on reporting of one or more indicators\n", + "Partially following methods by WHO and as per Diallo (2025) paper\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", + "For this method the user is allowed to **chose** how to calculate both the **numerator** and **denominator**.
\n", + "\n", + "Specifically: \n", + "\n", + "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "
\n", + "
\n", + "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", + "
\n", + "
\n", + "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", + " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", + " * Filename: `XXX_reporting_rate_dataelement.`" + ] + }, + { + "cell_type": "markdown", + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", + "metadata": { + "papermill": { + "duration": 0.000228, + "end_time": "2026-01-16T10:22:53.014752", + "exception": false, + "start_time": "2026-01-16T10:22:53.014524", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", - "metadata": { - "papermill": { - "duration": 0.000138, - "end_time": "2026-01-16T10:23:58.799287", - "exception": false, - "start_time": "2026-01-16T10:23:58.799149", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.2. Organisation units (DHIS2 pyramid)" - ] + "tags": [] + }, + "source": [ + "## 1. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35ede7cf-257f-439c-a514-26a7290f881d", + "metadata": { + "papermill": { + "duration": 63.150489, + "end_time": "2026-01-16T10:23:56.165530", + "exception": false, + "start_time": "2026-01-16T10:22:53.015041", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fd92901-901e-4019-be78-a7718050c1c4", - "metadata": { - "papermill": { - "duration": 0.992899, - "end_time": "2026-01-16T10:23:59.792385", - "exception": false, - "start_time": "2026-01-16T10:23:58.799486", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhis2_pyramid_formatted <- load_pyramid_data_dataelement(config_json, COUNTRY_CODE)\n", - "dim(dhis2_pyramid_formatted)\n", - "head(dhis2_pyramid_formatted, 2)\n" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Project paths\n", + "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + 
"PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", + "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", + "\n", + "# Load utils\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", + "\n", + "# Load libraries\n", + "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\", \"zoo\")\n", + "install_and_load(required_packages)\n", + "\n", + "# Environment variables\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "\n", + "# Load OpenHEXA sdk\n", + "openhexa <- import(\"openhexa.sdk\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", + "metadata": { + "papermill": { + "duration": 0.00011, + "end_time": "2026-01-16T10:23:56.165873", + "exception": false, + "start_time": "2026-01-16T10:23:56.165763", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", - "metadata": { - "papermill": { - "duration": 0.000106, - "end_time": "2026-01-16T10:23:59.792710", - "exception": false, - "start_time": "2026-01-16T10:23:59.792604", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2.3. Check whether selected indicators are present in routine data\n", - "Extra precaution measure to avoid breaks downstream.
\n", - "\n", - "Note: This logic should be moved to pipeline.py 🐍" - ] + "tags": [] + }, + "source": [ + "### 1.1. Fallback parameters values\n", + "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", + "The code cell below here provides fallback paramater values needed when running this notebook locally." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", + "metadata": { + "papermill": { + "duration": 0.033954, + "end_time": "2026-01-16T10:23:56.199937", + "exception": false, + "start_time": "2026-01-16T10:23:56.165983", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", - "metadata": { - "papermill": { - "duration": 0.024863, - "end_time": "2026-01-16T10:23:59.817677", - "exception": false, - "start_time": "2026-01-16T10:23:59.792814", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", - "}\n", - "\n", - "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", - " cat(msg)\n", - " stop(msg)\n", - "}" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Current options: \n", + "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", + "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", + "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", + "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", + "\n", + "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", + "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", + "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", + "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" + ] + }, + { + "cell_type": "markdown", + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", + "metadata": { + "papermill": { + "duration": 9.5e-05, + "end_time": "2026-01-16T10:23:56.200231", + "exception": false, + "start_time": "2026-01-16T10:23:56.200136", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", - "metadata": { - "papermill": { - "duration": 0.000091, - "end_time": "2026-01-16T10:23:59.817949", - "exception": false, - "start_time": "2026-01-16T10:23:59.817858", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 3. Reporting rates computations" - ] + "tags": [] + }, + "source": [ + "### 1.2. 
Load and check `snt config` file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", + "metadata": { + "papermill": { + "duration": 0.521572, + "end_time": "2026-01-16T10:23:56.721932", + "exception": false, + "start_time": "2026-01-16T10:23:56.200360", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "7d62cdb6", - "metadata": {}, - "source": [ - "#### 3.0. Define start and end period based on routine data " - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", + "metadata": { + "papermill": { + "duration": 0.033003, + "end_time": "2026-01-16T10:23:56.755117", + "exception": false, + "start_time": "2026-01-16T10:23:56.722114", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", - "metadata": { - "papermill": { - "duration": 0.044172, - "end_time": "2026-01-16T10:23:59.862224", - "exception": false, - "start_time": "2026-01-16T10:23:59.818052", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", - "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", - "\n", - "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", - "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: 
{length(period_vector)}\"))" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", + "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", + "\n", + "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", + "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", + "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + ] + }, + { + "cell_type": "markdown", + "id": "8bf4a8bb", + "metadata": {}, + "source": [ + "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", + "The user can toggle on/off each of the indicators. Therefore, we need to make sure at least one is ON.
\n", + "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18b40207", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!length(ACTIVITY_INDICATORS) > 0) {\n", + " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", + " cat(msg) \n", + " stop(msg)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", + "metadata": { + "papermill": { + "duration": 9.3e-05, + "end_time": "2026-01-16T10:23:56.779812", + "exception": false, + "start_time": "2026-01-16T10:23:56.779719", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:23:59.862555", - "exception": false, - "start_time": "2026-01-16T10:23:59.862446", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.1. Build master table (all PERIOD x OU)\n", - "The master table contains all combinations of period x organisation unit " - ] + "tags": [] + }, + "source": [ + "## 2. 
Load Data" + ] + }, + { + "cell_type": "markdown", + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", + "metadata": { + "papermill": { + "duration": 6.9e-05, + "end_time": "2026-01-16T10:23:56.779987", + "exception": false, + "start_time": "2026-01-16T10:23:56.779918", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "9308197a-0852-4d34-8888-cf5564f35a9d", - "metadata": { - "papermill": { - "duration": 0.289128, - "end_time": "2026-01-16T10:24:00.151791", - "exception": false, - "start_time": "2026-01-16T10:23:59.862663", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. Periods count: {length(period_vector)}\"))\n", - "facility_master <- build_facility_master_dataelement(\n", - " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", - " period_vector = period_vector,\n", - " config_json = config_json,\n", - " ADMIN_1 = ADMIN_1,\n", - " ADMIN_2 = ADMIN_2\n", - ")\n" - ] + "tags": [] + }, + "source": [ + "### 2.1. Routine data (DHIS2) \n", + "**Note on pipeline behaviour**:
\n", + "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1213723-f7e2-4238-9f37-f1795b187232", + "metadata": { + "papermill": { + "duration": 2.018878, + "end_time": "2026-01-16T10:23:58.798963", + "exception": false, + "start_time": "2026-01-16T10:23:56.780085", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", - "metadata": { - "papermill": { - "duration": 0.000114, - "end_time": "2026-01-16T10:24:00.152094", - "exception": false, - "start_time": "2026-01-16T10:24:00.151980", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.2. Identify \"Active\" facilities\n", - "\n", - "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "rountine_dataset_name <- select_routine_dataset_name_dataelement(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", + "dhis2_routine <- load_routine_data_dataelement(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE)\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 2)\n" + ] + }, + { + "cell_type": "markdown", + "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", + "metadata": { + "papermill": { + "duration": 0.000138, + "end_time": "2026-01-16T10:23:58.799287", + "exception": false, + "start_time": "2026-01-16T10:23:58.799149", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b279d27", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "reporting_rate_dataelement <- compute_reporting_rate_dataelement(\n", - " facility_master = facility_master,\n", - " 
dhis2_routine = dhis2_routine,\n", - " DHIS2_INDICATORS = DHIS2_INDICATORS,\n", - " ACTIVITY_INDICATORS = ACTIVITY_INDICATORS,\n", - " VOLUME_ACTIVITY_INDICATORS = VOLUME_ACTIVITY_INDICATORS,\n", - " DATAELEMENT_METHOD_DENOMINATOR = DATAELEMENT_METHOD_DENOMINATOR,\n", - " USE_WEIGHTED_REPORTING_RATES = USE_WEIGHTED_REPORTING_RATES\n", - ")\n" - ] + "tags": [] + }, + "source": [ + "### 2.2. Organisation units (DHIS2 pyramid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fd92901-901e-4019-be78-a7718050c1c4", + "metadata": { + "papermill": { + "duration": 0.992899, + "end_time": "2026-01-16T10:23:59.792385", + "exception": false, + "start_time": "2026-01-16T10:23:58.799486", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", - "metadata": { - "papermill": { - "duration": 0.000107, - "end_time": "2026-01-16T10:24:01.626760", - "exception": false, - "start_time": "2026-01-16T10:24:01.626653", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.3. Identify `OPEN` facilities (denominator)\n", - "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", - "\n", - "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", - "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", - "\n", - "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", - "\n", - "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." 
- ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "dhis2_pyramid_formatted <- load_pyramid_data_dataelement(config_json, COUNTRY_CODE)\n", + "dim(dhis2_pyramid_formatted)\n", + "head(dhis2_pyramid_formatted, 2)\n" + ] + }, + { + "cell_type": "markdown", + "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", + "metadata": { + "papermill": { + "duration": 0.000106, + "end_time": "2026-01-16T10:23:59.792710", + "exception": false, + "start_time": "2026-01-16T10:23:59.792604", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", - "metadata": { - "papermill": { - "duration": 1.317923, - "end_time": "2026-01-16T10:24:02.944800", - "exception": false, - "start_time": "2026-01-16T10:24:01.626877", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] + "tags": [] + }, + "source": [ + "### 2.3. Check whether selected indicators are present in routine data\n", + "Extra precaution measure to avoid breaks downstream.
\n", + "\n", + "Note: This logic should be moved to pipeline.py 🐍" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", + "metadata": { + "papermill": { + "duration": 0.024863, + "end_time": "2026-01-16T10:23:59.817677", + "exception": false, + "start_time": "2026-01-16T10:23:59.792814", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "657fd6ca", - "metadata": {}, - "source": [ - "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", + "}\n", + "\n", + "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. Process cannot continue.\")\n", + " cat(msg)\n", + " stop(msg)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", + "metadata": { + "papermill": { + "duration": 9.1e-05, + "end_time": "2026-01-16T10:23:59.817949", + "exception": false, + "start_time": "2026-01-16T10:23:59.817858", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "a598e4b7", - "metadata": {}, - "source": [ - "
\n", - " Important: this step could have a huge influence on reporting rates!
\n", - " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", - "
    \n", - "
  • With YEAR → “active that year”
  • \n", - "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", - "
\n", - "
" - ] + "tags": [] + }, + "source": [ + "## 3. Reporting rates computations" + ] + }, + { + "cell_type": "markdown", + "id": "7d62cdb6", + "metadata": {}, + "source": [ + "#### 3.0. Define start and end period based on routine data " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", + "metadata": { + "papermill": { + "duration": 0.044172, + "end_time": "2026-01-16T10:23:59.862224", + "exception": false, + "start_time": "2026-01-16T10:23:59.818052", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", - "metadata": { - "papermill": { - "duration": 0.173961, - "end_time": "2026-01-16T10:24:05.948136", - "exception": false, - "start_time": "2026-01-16T10:24:05.774175", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", + "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", + "\n", + "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", + "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:23:59.862555", + "exception": false, + "start_time": "2026-01-16T10:23:59.862446", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", - "metadata": { - "papermill": { - "duration": 0.000098, - "end_time": "2026-01-16T10:24:05.948452", - "exception": false, - "start_time": "2026-01-16T10:24:05.948354", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.5. 
Compute Weighting factor based on \"volume of activity\"" - ] + "tags": [] + }, + "source": [ + "#### 3.1. Build master table (all PERIOD x OU)\n", + "The master table contains all combinations of period x organisation unit " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9308197a-0852-4d34-8888-cf5564f35a9d", + "metadata": { + "papermill": { + "duration": 0.289128, + "end_time": "2026-01-16T10:24:00.151791", + "exception": false, + "start_time": "2026-01-16T10:23:59.862663", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "4420e559-4134-4fc3-8950-9972ebede00e", - "metadata": { - "papermill": { - "duration": 0.520673, - "end_time": "2026-01-16T10:24:06.469233", - "exception": false, - "start_time": "2026-01-16T10:24:05.948560", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. 
Periods count: {length(period_vector)}\"))\n", + "facility_master <- build_facility_master_dataelement(\n", + " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", + " period_vector = period_vector,\n", + " config_json = config_json,\n", + " ADMIN_1 = ADMIN_1,\n", + " ADMIN_2 = ADMIN_2\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", + "metadata": { + "papermill": { + "duration": 0.000114, + "end_time": "2026-01-16T10:24:00.152094", + "exception": false, + "start_time": "2026-01-16T10:24:00.151980", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:06.469622", - "exception": false, - "start_time": "2026-01-16T10:24:06.469514", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.6. Compute Weighted variables" - ] + "tags": [] + }, + "source": [ + "#### 3.2. Identify \"Active\" facilities\n", + "\n", + "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. 
Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b279d27", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Join routine values to the facility master and define monthly activity\n", + "facility_master_routine <- dplyr::left_join(\n", + " facility_master,\n", + " dhis2_routine %>% dplyr::select(OU_ID, PERIOD, dplyr::any_of(DHIS2_INDICATORS)),\n", + " by = c(\"OU_ID\", \"PERIOD\")\n", + ") %>%\n", + " dplyr::mutate(\n", + " YEAR = as.numeric(substr(PERIOD, 1, 4)),\n", + " ACTIVE_THIS_PERIOD = ifelse(\n", + " rowSums(!is.na(dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS))) &\n", + " dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0\n", + " ),\n", + " COUNT = 1\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", + "metadata": { + "papermill": { + "duration": 0.000107, + "end_time": "2026-01-16T10:24:01.626760", + "exception": false, + "start_time": "2026-01-16T10:24:01.626653", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", - "metadata": { - "papermill": { - "duration": 0.483413, - "end_time": "2026-01-16T10:24:06.953139", - "exception": false, - "start_time": "2026-01-16T10:24:06.469726", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] + "tags": [] + }, + "source": [ + "#### 3.3. Identify `OPEN` facilities (denominator)\n", + "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", + "\n", + "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", + "1. No explicit closure in the facility name. 
The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", + "\n", + "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", + "\n", + "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", + "metadata": { + "papermill": { + "duration": 1.317923, + "end_time": "2026-01-16T10:24:02.944800", + "exception": false, + "start_time": "2026-01-16T10:24:01.626877", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2026-01-16T10:24:06.953755", - "exception": false, - "start_time": "2026-01-16T10:24:06.953583", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.7. Aggregate data at ADM2 level" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 3.3 Identify OPEN facilities from naming and opening/closing dates\n", + "facility_master_routine <- facility_master_routine %>%\n", + " dplyr::mutate(\n", + " period_date = as.Date(zoo::as.yearmon(as.character(PERIOD), \"%Y%m\")),\n", + " NAME_CLOSED = stringr::str_detect(toupper(OU_NAME), \"CLOTUR|FERM(E|EE)?\"),\n", + " OPEN_BY_DATE = !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date |\n", + " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)),\n", + " OPEN = ifelse(!NAME_CLOSED & OPEN_BY_DATE, 1, 0)\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "657fd6ca", + "metadata": {}, + "source": [ + "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" + ] + }, + { + "cell_type": "markdown", + "id": "a598e4b7", + "metadata": {}, + "source": [ + "
\n", + " Important: this step could have a huge influence on reporting rates!
\n", + " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", + "
    \n", + "
  • With YEAR → “active that year”
  • \n", + "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", + "metadata": { + "papermill": { + "duration": 0.173961, + "end_time": "2026-01-16T10:24:05.948136", + "exception": false, + "start_time": "2026-01-16T10:24:05.774175", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "af13191e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 3.4 Mark facilities active at least once per year\n", + "facility_master_routine <- facility_master_routine %>%\n", + " dplyr::group_by(OU_ID, YEAR) %>%\n", + " dplyr::mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>%\n", + " dplyr::ungroup()\n" + ] + }, + { + "cell_type": "markdown", + "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", + "metadata": { + "papermill": { + "duration": 9.8e-05, + "end_time": "2026-01-16T10:24:05.948452", + "exception": false, + "start_time": "2026-01-16T10:24:05.948354", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "7d381937", - "metadata": {}, - "source": [ - "#### 3.8. Calculate Reporting Rates (all methods)" - ] + "tags": [] + }, + "source": [ + "#### 3.5. 
Compute Weighting factor based on \"volume of activity\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4420e559-4134-4fc3-8950-9972ebede00e", + "metadata": { + "papermill": { + "duration": 0.520673, + "end_time": "2026-01-16T10:24:06.469233", + "exception": false, + "start_time": "2026-01-16T10:24:05.948560", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "b41263f8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 3.5 Compute facility weights from volume of activity\n", + "mean_monthly_cases <- dhis2_routine %>%\n", + " dplyr::mutate(total_cases_by_hf_month = rowSums(dplyr::across(dplyr::all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>%\n", + " dplyr::group_by(ADM2_ID, OU_ID) %>%\n", + " dplyr::summarise(\n", + " total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE),\n", + " number_of_reporting_months = length(which(total_cases_by_hf_month > 0)),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " dplyr::mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>%\n", + " dplyr::select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF)\n", + "\n", + "mean_monthly_cases_adm2 <- mean_monthly_cases %>%\n", + " dplyr::select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>%\n", + " dplyr::group_by(ADM2_ID) %>%\n", + " dplyr::summarise(\n", + " SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm = TRUE),\n", + " NR_OF_HF = dplyr::n(),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "hf_weights <- mean_monthly_cases %>%\n", + " dplyr::left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", + " dplyr::mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n" + ] + }, + { + "cell_type": "markdown", + "id": 
"2fed8529-70e9-4e2e-a498-fe3dd7499bb3", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:06.469622", + "exception": false, + "start_time": "2026-01-16T10:24:06.469514", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "5e593659", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:07.310579", - "exception": false, - "start_time": "2026-01-16T10:24:07.310471", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 4. Select correct col for `REPORTING_RATE` based on denominator method" - ] + "tags": [] + }, + "source": [ + "#### 3.6. Compute Weighted variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", + "metadata": { + "papermill": { + "duration": 0.483413, + "end_time": "2026-01-16T10:24:06.953139", + "exception": false, + "start_time": "2026-01-16T10:24:06.469726", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "c75f2249", - "metadata": { - "papermill": { - "duration": 0.000057, - "end_time": "2026-01-16T10:24:07.310743", - "exception": false, - "start_time": "2026-01-16T10:24:07.310686", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 4.1. 
Select results and format" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 3.6 Apply weights to monthly status variables\n", + "facility_master_routine_02 <- facility_master_routine %>%\n", + " dplyr::left_join(hf_weights %>% dplyr::select(OU_ID, WEIGHT), by = c(\"OU_ID\"))\n", + "\n", + "facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n" + ] + }, + { + "cell_type": "markdown", + "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2026-01-16T10:24:06.953755", + "exception": false, + "start_time": "2026-01-16T10:24:06.953583", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e71b38", - "metadata": { - "papermill": { - "duration": 0.020644, - "end_time": "2026-01-16T10:24:07.351317", - "exception": false, - "start_time": "2026-01-16T10:24:07.330673", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] + "tags": [] + }, + "source": [ + "#### 3.7. 
Aggregate data at ADM2 level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af13191e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 3.7 Aggregate monthly counts at ADM2 level\n", + "reporting_rate_adm2 <- facility_master_routine_02 %>%\n", + " dplyr::group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>%\n", + " dplyr::summarise(\n", + " HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE),\n", + " NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE),\n", + " NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE),\n", + " NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE),\n", + " NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " dplyr::mutate(\n", + " RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2,\n", + " RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2,\n", + " RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2,\n", + " RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED,\n", + " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", + " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "7d381937", + "metadata": {}, + "source": [ + "#### 3.8. 
Calculate Reporting Rates (all methods)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b41263f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 3.8 Select final reporting-rate definition for export\n", + "rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF\" else \"RR_OPEN_HF\"\n", + "if (USE_WEIGHTED_REPORTING_RATES) {\n", + " rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF_W\" else \"RR_OPEN_HF_W\"\n", + "}\n", + "\n", + "reporting_rate_dataelement <- reporting_rate_adm2 %>%\n", + " dplyr::mutate(MONTH = PERIOD %% 100) %>%\n", + " dplyr::rename(REPORTING_RATE = !!rlang::sym(rr_column_selection)) %>%\n", + " dplyr::select(YEAR, MONTH, ADM2_ID, REPORTING_RATE)\n" + ] + }, + { + "cell_type": "markdown", + "id": "5e593659", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:07.310579", + "exception": false, + "start_time": "2026-01-16T10:24:07.310471", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "3df36abb", - "metadata": { - "papermill": { - "duration": 0.140976, - "end_time": "2026-01-16T10:24:07.492479", - "exception": false, - "start_time": "2026-01-16T10:24:07.351503", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] + "tags": [] + }, + "source": [ + "## 4. 
Select correct col for `REPORTING_RATE` based on denominator method" + ] + }, + { + "cell_type": "markdown", + "id": "c75f2249", + "metadata": { + "papermill": { + "duration": 5.7e-05, + "end_time": "2026-01-16T10:24:07.310743", + "exception": false, + "start_time": "2026-01-16T10:24:07.310686", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ccc272c", - "metadata": { - "papermill": { - "duration": 0.182574, - "end_time": "2026-01-16T10:24:07.675242", - "exception": false, - "start_time": "2026-01-16T10:24:07.492668", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Moved to utils for readability.\n" - ] + "tags": [] + }, + "source": [ + "### 4.1. Select results and format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75e71b38", + "metadata": { + "papermill": { + "duration": 0.020644, + "end_time": "2026-01-16T10:24:07.351317", + "exception": false, + "start_time": "2026-01-16T10:24:07.330673", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "ca66e785", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:24:07.675637", - "exception": false, - "start_time": "2026-01-16T10:24:07.675528", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 5. 
Inspect reporting rate values" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# 4.1 Confirm which denominator/weighting option was selected\n", + "cat(glue::glue(\n", + " \"Selected denominator method: {DATAELEMENT_METHOD_DENOMINATOR} | Weighted reporting rates: {USE_WEIGHTED_REPORTING_RATES}\"\n", + "))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3df36abb", + "metadata": { + "papermill": { + "duration": 0.140976, + "end_time": "2026-01-16T10:24:07.492479", + "exception": false, + "start_time": "2026-01-16T10:24:07.351503", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "31535459", - "metadata": { - "papermill": { - "duration": 0.160299, - "end_time": "2026-01-16T10:24:07.836039", - "exception": false, - "start_time": "2026-01-16T10:24:07.675740", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", - "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", - "xlab=\"REPORTING_RATE\")" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Output preview\n", + "dim(reporting_rate_dataelement)\n", + "head(reporting_rate_dataelement, 5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ccc272c", + "metadata": { + "papermill": { + "duration": 0.182574, + "end_time": "2026-01-16T10:24:07.675242", + "exception": false, + "start_time": "2026-01-16T10:24:07.492668", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "6778f17d", - "metadata": { - "papermill": { - "duration": 0.896382, - "end_time": "2026-01-16T10:24:08.732660", - "exception": false, - "start_time": "2026-01-16T10:24:07.836278", - "status": 
"completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Boxplot\n", - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - " geom_boxplot(outlier.alpha = 0.3) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Basic data quality checks\n", + "summary(reporting_rate_dataelement$REPORTING_RATE)\n", + "sum(is.na(reporting_rate_dataelement$REPORTING_RATE))\n" + ] + }, + { + "cell_type": "markdown", + "id": "ca66e785", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:24:07.675637", + "exception": false, + "start_time": "2026-01-16T10:24:07.675528", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7f013fd", - "metadata": { - "papermill": { - "duration": 0.859448, - "end_time": "2026-01-16T10:24:09.592295", - "exception": false, - "start_time": "2026-01-16T10:24:08.732847", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - "# Boxplot without outliers\n", - " geom_boxplot(outlier.alpha = 0) +\n", - " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - 
" ) +\n", - " theme_minimal()" - ] + "tags": [] + }, + "source": [ + "## 5. Inspect reporting rate values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31535459", + "metadata": { + "papermill": { + "duration": 0.160299, + "end_time": "2026-01-16T10:24:07.836039", + "exception": false, + "start_time": "2026-01-16T10:24:07.675740", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "2866816a-7015-4c5c-b904-f553f3b4790d", - "metadata": { - "papermill": { - "duration": 0.000088, - "end_time": "2026-01-16T10:24:09.592563", - "exception": false, - "start_time": "2026-01-16T10:24:09.592475", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 5. 📁 Export to `data/` folder" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", + "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", + "xlab=\"REPORTING_RATE\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6778f17d", + "metadata": { + "papermill": { + "duration": 0.896382, + "end_time": "2026-01-16T10:24:08.732660", + "exception": false, + "start_time": "2026-01-16T10:24:07.836278", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", - "metadata": { - "papermill": { - "duration": 0.919937, - "end_time": "2026-01-16T10:24:10.512602", - "exception": false, - "start_time": "2026-01-16T10:24:09.592665", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "export_reporting_rate_dataelement(\n", - " reporting_rate_dataelement = reporting_rate_dataelement,\n", - " DATA_PATH = DATA_PATH,\n", - " COUNTRY_CODE = COUNTRY_CODE\n", - ")\n" - ] + "tags": [], + "vscode": { + "languageId": "r" } - 
], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + }, + "outputs": [], + "source": [ + "# Boxplot\n", + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + " geom_boxplot(outlier.alpha = 0.3) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7f013fd", + "metadata": { + "papermill": { + "duration": 0.859448, + "end_time": "2026-01-16T10:24:09.592295", + "exception": false, + "start_time": "2026-01-16T10:24:08.732847", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + "# Boxplot without outliers\n", + " geom_boxplot(outlier.alpha = 0) +\n", + " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ] + }, + { + "cell_type": "markdown", + "id": "2866816a-7015-4c5c-b904-f553f3b4790d", + "metadata": { + "papermill": { + "duration": 8.8e-05, + "end_time": "2026-01-16T10:24:09.592563", + "exception": false, + "start_time": "2026-01-16T10:24:09.592475", + "status": 
"completed" }, + "tags": [] + }, + "source": [ + "## 5. 📁 Export to `data/` folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", + "metadata": { "papermill": { - "default_parameters": {}, - "duration": 81.158347, - "end_time": "2026-01-16T10:24:10.736106", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", - "parameters": { - "AVAILABILITY_INDICATORS": [ - "CONF", - "PRES", - "SUSP", - "TEST" - ], - "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", - "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace", - "USE_WEIGHTED_REPORTING_RATES": true, - "VOLUME_ACTIVITY_INDICATORS": [ - "CONF", - "PRES" - ] - }, - "start_time": "2026-01-16T10:22:49.577759", - "version": "2.6.0" + "duration": 0.919937, + "end_time": "2026-01-16T10:24:10.512602", + "exception": false, + "start_time": "2026-01-16T10:24:09.592665", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "export_reporting_rate_dataelement(\n", + " reporting_rate_dataelement = reporting_rate_dataelement,\n", + " DATA_PATH = DATA_PATH,\n", + " COUNTRY_CODE = COUNTRY_CODE\n", + ")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, - "nbformat": 4, - "nbformat_minor": 5 + "papermill": { + "default_parameters": {}, + "duration": 81.158347, + "end_time": "2026-01-16T10:24:10.736106", + 
"environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", + "parameters": { + "AVAILABILITY_INDICATORS": [ + "CONF", + "PRES", + "SUSP", + "TEST" + ], + "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", + "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace", + "USE_WEIGHTED_REPORTING_RATES": true, + "VOLUME_ACTIVITY_INDICATORS": [ + "CONF", + "PRES" + ] + }, + "start_time": "2026-01-16T10:22:49.577759", + "version": "2.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index f26b3b6..f39b231 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -489,11 +489,57 @@ }, "outputs": [], "source": [ - "reporting_rate_results <- compute_reporting_rate_dataset(\n", - " dhis2_reporting = dhis2_reporting,\n", - " REPORTING_RATE_PRODUCT_ID = REPORTING_RATE_PRODUCT_ID,\n", - " COUNTRY_CODE = COUNTRY_CODE\n", - ")\n" + "# 3.1 Filter Reporting Rate data by selected dataset PRODUCT_UID(s)\n", + "if (length(REPORTING_RATE_PRODUCT_ID) > 0 && all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", + " dhis2_reporting <- dhis2_reporting %>% dplyr::filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", + "} else if (length(REPORTING_RATE_PRODUCT_ID) > 0) {\n", + " log_msg(glue::glue(\n", + " \"?? 
Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data. Skipping filtering.\"\n", + " ), level = \"warning\")\n", + "}\n", + "\n", + "# 3.2 Pivot wider on PRODUCT_METRIC\n", + "dhis2_reporting_wide <- dhis2_reporting %>%\n", + " tidyr::pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", + "\n", + "# 3.3 Detect duplicated OU_ID / PERIOD combinations across datasets\n", + "dupl_ou_period <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(OU_ID, PERIOD) %>%\n", + " dplyr::filter(dplyr::n() > 1) %>%\n", + " dplyr::ungroup() %>%\n", + " dplyr::select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, dplyr::ends_with(\"REPORTS\"))\n", + "\n", + "# If duplicates are binary reports (0/1), keep the row where ACTUAL_REPORTS is maximal\n", + "if (nrow(dupl_ou_period) > 0 &&\n", + " all(dupl_ou_period$ACTUAL_REPORTS %in% c(0, 1), na.rm = TRUE) &&\n", + " all(dupl_ou_period$EXPECTED_REPORTS %in% c(0, 1), na.rm = TRUE)) {\n", + "\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(PERIOD, OU_ID) %>%\n", + " dplyr::mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", + " dplyr::ungroup() %>%\n", + " dplyr::filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", + " dplyr::select(-ACTUAL_REPORTS_deduplicated)\n", + "}\n", + "\n", + "# Country-specific normalization for Niger where reports can exceed 1\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " dplyr::mutate(\n", + " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", + " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", + " )\n", + "}\n", + "\n", + "# 3.4 Aggregate at ADM2 and compute reporting rate\n", + "reporting_rate_results <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(PERIOD, YEAR, MONTH, ADM1_NAME, ADM1_ID, 
ADM2_NAME, ADM2_ID) %>%\n", + " dplyr::summarise(\n", + " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", + " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " dplyr::mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n" ] }, { @@ -538,7 +584,9 @@ }, "outputs": [], "source": [ - "# Moved to utils for readability.\n" + "# 3.2 Quick check after pivot\n", + "dim(dhis2_reporting_wide)\n", + "head(dhis2_reporting_wide, 3)\n" ] }, { @@ -601,7 +649,9 @@ }, "outputs": [], "source": [ - "# Moved to utils for readability.\n" + "# Count duplicated OU_ID/PERIOD combinations found\n", + "cat(glue::glue(\"Duplicated OU_ID-PERIOD rows detected: {nrow(dupl_ou_period)}\"))\n", + "head(dupl_ou_period, 5)\n" ] }, { @@ -654,7 +704,12 @@ }, "outputs": [], "source": [ - "# Moved to utils for readability.\n" + "# Verify deduplication effect at OU_ID/PERIOD level\n", + "dupl_after_cleaning <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(OU_ID, PERIOD) %>%\n", + " dplyr::filter(dplyr::n() > 1) %>%\n", + " dplyr::ungroup()\n", + "cat(glue::glue(\"Remaining duplicated OU_ID-PERIOD rows after cleaning: {nrow(dupl_after_cleaning)}\"))\n" ] }, { @@ -682,7 +737,8 @@ }, "outputs": [], "source": [ - "# Moved to utils for readability.\n" + "# Optional inspection of cleaned rows\n", + "head(dhis2_reporting_wide, 5)\n" ] }, { @@ -730,7 +786,13 @@ }, "outputs": [], "source": [ - "# Moved to utils for readability.\n" + "# NER-specific normalization quality check\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " cat(\"Applied NER normalization: ACTUAL_REPORTS and EXPECTED_REPORTS capped at 1.\n", + "\")\n", + "}\n", + "summary(dhis2_reporting_wide$ACTUAL_REPORTS)\n", + "summary(dhis2_reporting_wide$EXPECTED_REPORTS)\n" ] }, { @@ -775,7 +837,9 @@ }, "outputs": [], "source": [ - "# Moved to utils for readability.\n" + "# 3.4 Aggregate table preview\n", + "dim(reporting_rate_results)\n", + "head(reporting_rate_results, 5)\n" ] }, { @@ 
-822,7 +886,8 @@ }, "outputs": [], "source": [ - "# Moved to utils for readability.\n" + "# 3.5 Reporting rate range check\n", + "summary(reporting_rate_results$REPORTING_RATE)\n" ] }, { From 19e8566bccf50241601dea7588e10ff944458278 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 10 Apr 2026 11:05:13 +0200 Subject: [PATCH 05/18] fix parameters --- .../pipeline.py | 49 ------------------- 1 file changed, 49 deletions(-) diff --git a/snt_dhis2_reporting_rate_dataelement/pipeline.py b/snt_dhis2_reporting_rate_dataelement/pipeline.py index 88a2d2e..6a96b15 100644 --- a/snt_dhis2_reporting_rate_dataelement/pipeline.py +++ b/snt_dhis2_reporting_rate_dataelement/pipeline.py @@ -27,47 +27,6 @@ default="imputed", required=True, ) -@parameter( - "activity_indicators", - name="Facility Activity indicators", - help="Define which data elements will be used to determine the activity of a facility." - " A facility is considered 'active' if at least one of these indicators has a non-missing value greater than zero.", - multiple=True, - choices=["CONF", "SUSP", "TEST", "PRES"], - type=str, - default=["CONF", "PRES"], - required=True, -) -@parameter( - "volume_activity_indicators", - name="Volume activity indicators", - help="Define which data elements will be used to determine the volume of activity at a facility." - " Volume of activity is used to calculate WEIGHTED reporting rates.", - multiple=True, - choices=["CONF", "SUSP", "TEST", "PRES"], - type=str, - default=["CONF", "PRES"], - required=True, -) -@parameter( - "dataelement_method_denominator", - name="Denominator method", - help="How to calculate the total nr of facilities expected to report.", - type=str, - choices=["ROUTINE_ACTIVE_FACILITIES", "PYRAMID_OPEN_FACILITIES"], - default="ROUTINE_ACTIVE_FACILITIES", - required=True, -) -@parameter( - "use_weighted_reporting_rates", - name="Use weighted reporting rates", - help="Weighted reporting rates are calculated using the volume of activity. 
" - "If TRUE, these values will populate the REPORTING_RATE column of the exported data. " - "If FALSE, unweighted reporting rates will be used instead.", - type=bool, - default=False, - required=False, -) @parameter( "run_report_only", name="Run reporting only", @@ -87,10 +46,6 @@ ) def snt_dhis2_reporting_rate_dataelement( routine_data_choice: str, - activity_indicators: str, - volume_activity_indicators: str, - dataelement_method_denominator: str, - use_weighted_reporting_rates: bool, run_report_only: bool, pull_scripts: bool, ): @@ -122,10 +77,6 @@ def snt_dhis2_reporting_rate_dataelement( nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, - "DATAELEMENT_METHOD_DENOMINATOR": dataelement_method_denominator, - "ACTIVITY_INDICATORS": activity_indicators, - "VOLUME_ACTIVITY_INDICATORS": volume_activity_indicators, - "USE_WEIGHTED_REPORTING_RATES": use_weighted_reporting_rates, } parameters_file = save_pipeline_parameters( pipeline_name="snt_dhis2_reporting_rate_dataelement", From 1f9ea6ed052bb35e770ef9a53ab4a01642dca726 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Mon, 13 Apr 2026 17:35:35 +0200 Subject: [PATCH 06/18] quick fix --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 2214 ++++++++--------- .../snt_dhis2_reporting_rate_dataelement.r | 215 +- .../snt_dhis2_reporting_rate_dataset.ipynb | 2114 ++++++++-------- .../utils/snt_dhis2_reporting_rate_dataset.r | 168 +- .../pipeline.py | 1 + snt_dhis2_reporting_rate_dataset/pipeline.py | 1 + 6 files changed, 2294 insertions(+), 2419 deletions(-) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index 8a94d83..438b6eb 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb 
@@ -1,1152 +1,1148 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2026-01-16T10:22:53.011120", - "exception": false, - "start_time": "2026-01-16T10:22:53.010947", - "status": "completed" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2026-01-16T10:22:53.011120", + "exception": false, + "start_time": "2026-01-16T10:22:53.010947", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Data Element reporting rate: based on reporting of one or more indicators\n", + "Partially following methods by WHO and as per Diallo (2025) paper\n", + "\n", + "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", + "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", + "For this method the user is allowed to **choose** how to calculate both the **numerator** and **denominator**.
\n", + "\n", + "Specifically: \n", + "\n", + "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", + " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", + "
\n", + "
\n", + "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", + " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", + " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", + " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", + "
\n", + "
\n", + "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", + " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", + " * Filename: `XXX_reporting_rate_dataelement.`" + ], + "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65" }, - "tags": [] - }, - "source": [ - "# Data Element reporting rate: based on reporting of one or more indicators\n", - "Partially following methods by WHO and as per Diallo (2025) paper\n", - "\n", - "To accurately measure data completeness, we calculate the **monthly** reporting rate per **ADM2**, as the **proportion** of **facilities** (HF or `OU_ID`) that in a given month submitted data for either a single or _any_ of the chosen indicators (i.e., `CONF`, `SUSP`, `TEST`). \n", - "Basically, \"Data Element\" reporting rate is the number of facilities reporting on 1 or more given indicators, over the total number of facilities.
\n", - "For this method the user is allowed to **chose** how to calculate both the **numerator** and **denominator**.
\n", - "\n", - "Specifically: \n", - "\n", - "* **Numerator**: Number of facilities that _actually reported_ data, and it is estimated based on whether a facility (OU_ID) submitted data for **_any_** of the **selected indicators**. \n", - " Note: we **recommend** always including `CONF` because it is a core indicator consistently tracked across the dataset. This choice ensures alignment with the structure of the incidence calculation, which is also mainly based on confirmed cases.\n", - "
\n", - "
\n", - "* **Denominator**: Number of facilities _expected_ to report. This number can be obtained in two different ways: \n", - " * `\"ROUTINE_ACTIVE_FACILITIES\"`: uses the col `EXPECTED_REPORTS` from the df `active_facilities`.
\n", - " This is calculated as the number of \"**active**\" facilities (OU_ID), defined as those that submitted _any_ data **at least once in a given year**, across **all** indicators extracted in `dhis2_routine` (namely: all aggregated indicators as defined in the SNT_config.json file, see: `config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS`)\n", - " * `\"PYRAMID_OPEN_FACILITIES\"`: This method uses the opening and closing dates in DHIS2 (stored in the DHIS2 organisation units) to determine whether a facility was open, and thus expected to report, at the time of calculation.\n", - "
\n", - "
\n", - "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", - " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", - " * Filename: `XXX_reporting_rate_dataelement.`" - ] - }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 0.000228, - "end_time": "2026-01-16T10:22:53.014752", - "exception": false, - "start_time": "2026-01-16T10:22:53.014524", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000228, + "end_time": "2026-01-16T10:22:53.014752", + "exception": false, + "start_time": "2026-01-16T10:22:53.014524", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 1. Setup" + ], + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a" }, - "tags": [] - }, - "source": [ - "## 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "papermill": { - "duration": 63.150489, - "end_time": "2026-01-16T10:23:56.165530", - "exception": false, - "start_time": "2026-01-16T10:22:53.015041", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 63.150489, + "end_time": "2026-01-16T10:23:56.165530", + "exception": false, + "start_time": "2026-01-16T10:22:53.015041", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", + "\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", + "setup <- get_setup_variables(SNT_ROOT_PATH = SNT_ROOT_PATH)\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "35ede7cf-257f-439c-a514-26a7290f881d" }, - "tags": [], - "vscode": { - 
"languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", - "\n", - "# Load libraries\n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\", \"zoo\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00011, - "end_time": "2026-01-16T10:23:56.165873", - "exception": false, - "start_time": "2026-01-16T10:23:56.165763", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.00011, + "end_time": "2026-01-16T10:23:56.165873", + "exception": false, + "start_time": "2026-01-16T10:23:56.165763", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1.1. Fallback parameters values\n", + "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", + "The code cell below here provides fallback parameter values needed when running this notebook locally." + ], + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" }, - "tags": [] - }, - "source": [ - "### 1.1. Fallback parameters values\n", - "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", - "The code cell below here provides fallback paramater values needed when running this notebook locally." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "papermill": { - "duration": 0.033954, - "end_time": "2026-01-16T10:23:56.199937", - "exception": false, - "start_time": "2026-01-16T10:23:56.165983", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.033954, + "end_time": "2026-01-16T10:23:56.199937", + "exception": false, + "start_time": "2026-01-16T10:23:56.165983", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Current options: \n", + "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", + "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", + "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", + "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", + "\n", + "# Resolved by pipeline.py based on routine_data_choice; fallback to formatted dataset\n", + "if (!exists(\"DATASET_ID\")) {DATASET_ID <- \"\"}\n", + "\n", + "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", + "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", + "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", + "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" + ], + "execution_count": null, + "outputs": [], + "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Current options: \n", - "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", - "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", 
- "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", - "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", - "\n", - "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", - "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", - "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", - "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" - ] - }, - { - "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 9.5e-05, - "end_time": "2026-01-16T10:23:56.200231", - "exception": false, - "start_time": "2026-01-16T10:23:56.200136", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000095, + "end_time": "2026-01-16T10:23:56.200231", + "exception": false, + "start_time": "2026-01-16T10:23:56.200136", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1.2. Load and check `snt config` file" + ], + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be" }, - "tags": [] - }, - "source": [ - "### 1.2. 
Load and check `snt config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "papermill": { - "duration": 0.521572, - "end_time": "2026-01-16T10:23:56.721932", - "exception": false, - "start_time": "2026-01-16T10:23:56.200360", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.521572, + "end_time": "2026-01-16T10:23:56.721932", + "exception": false, + "start_time": "2026-01-16T10:23:56.200360", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "config_json <- load_snt_config(file.path(setup$CONFIG_PATH, \"SNT_config.json\"))" + ], + "execution_count": null, + "outputs": [], + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "papermill": { - "duration": 0.033003, - "end_time": "2026-01-16T10:23:56.755117", - "exception": false, - "start_time": "2026-01-16T10:23:56.722114", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.033003, + "end_time": "2026-01-16T10:23:56.755117", + "exception": false, + "start_time": "2026-01-16T10:23:56.722114", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- 
config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", + "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", + "\n", + "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", + "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", + "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + ], + "execution_count": null, + "outputs": [], + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", - "\n", - "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", - "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", - "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ] - }, - { - "cell_type": "markdown", - "id": "8bf4a8bb", - "metadata": {}, - "source": [ - "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18b40207", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!length(ACTIVITY_INDICATORS) > 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 9.3e-05, - "end_time": "2026-01-16T10:23:56.779812", - "exception": false, - "start_time": "2026-01-16T10:23:56.779719", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", + "The user can toggle on/off each of the indicators. Therefore, we need to make sure at least one is ON.
\n", + "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." + ], + "id": "8bf4a8bb" }, - "tags": [] - }, - "source": [ - "## 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 6.9e-05, - "end_time": "2026-01-16T10:23:56.779987", - "exception": false, - "start_time": "2026-01-16T10:23:56.779918", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "if (!length(ACTIVITY_INDICATORS) > 0) {\n", + " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", + " cat(msg) \n", + " stop(msg)\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "18b40207" }, - "tags": [] - }, - "source": [ - "### 2.1. Routine data (DHIS2) \n", - "**Note on pipeline behaviour**:
\n", - "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "papermill": { - "duration": 2.018878, - "end_time": "2026-01-16T10:23:58.798963", - "exception": false, - "start_time": "2026-01-16T10:23:56.780085", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000093, + "end_time": "2026-01-16T10:23:56.779812", + "exception": false, + "start_time": "2026-01-16T10:23:56.779719", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 2. Load Data" + ], + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "rountine_dataset_name <- select_routine_dataset_name_dataelement(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", - "dhis2_routine <- load_routine_data_dataelement(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE)\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)\n" - ] - }, - { - "cell_type": "markdown", - "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", - "metadata": { - "papermill": { - "duration": 0.000138, - "end_time": "2026-01-16T10:23:58.799287", - "exception": false, - "start_time": "2026-01-16T10:23:58.799149", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000069, + "end_time": "2026-01-16T10:23:56.779987", + "exception": false, + "start_time": "2026-01-16T10:23:56.779918", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.1. Routine data (DHIS2) \n", + "**Note on pipeline behaviour**:
\n", + "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." + ], + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532" }, - "tags": [] - }, - "source": [ - "### 2.2. Organisation units (DHIS2 pyramid)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fd92901-901e-4019-be78-a7718050c1c4", - "metadata": { - "papermill": { - "duration": 0.992899, - "end_time": "2026-01-16T10:23:59.792385", - "exception": false, - "start_time": "2026-01-16T10:23:58.799486", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 2.018878, + "end_time": "2026-01-16T10:23:58.798963", + "exception": false, + "start_time": "2026-01-16T10:23:56.780085", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhis2_routine <- load_dataset_file(DATASET_ID, ROUTINE_FILE)\n", + "dhis2_routine <- dhis2_routine %>%\n", + " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric))\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 2)\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "a1213723-f7e2-4238-9f37-f1795b187232" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhis2_pyramid_formatted <- load_pyramid_data_dataelement(config_json, COUNTRY_CODE)\n", - "dim(dhis2_pyramid_formatted)\n", - "head(dhis2_pyramid_formatted, 2)\n" - ] - }, - { - "cell_type": "markdown", - "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", - "metadata": { - "papermill": { - "duration": 0.000106, - "end_time": "2026-01-16T10:23:59.792710", - "exception": false, - "start_time": "2026-01-16T10:23:59.792604", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000138, + "end_time": "2026-01-16T10:23:58.799287", + "exception": false, + "start_time": "2026-01-16T10:23:58.799149", + "status": "completed" + }, + "tags": [] + }, + "source": [ + 
"### 2.2. Organisation units (DHIS2 pyramid)" + ], + "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a" }, - "tags": [] - }, - "source": [ - "### 2.3. Check whether selected indicators are present in routine data\n", - "Extra precaution measure to avoid breaks downstream.
\n", - "\n", - "Note: This logic should be moved to pipeline.py 🐍" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", - "metadata": { - "papermill": { - "duration": 0.024863, - "end_time": "2026-01-16T10:23:59.817677", - "exception": false, - "start_time": "2026-01-16T10:23:59.792814", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.992899, + "end_time": "2026-01-16T10:23:59.792385", + "exception": false, + "start_time": "2026-01-16T10:23:58.799486", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhis2_pyramid_formatted <- load_dataset_file(\n", + " config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED,\n", + " paste0(COUNTRY_CODE, \"_pyramid.parquet\")\n", + ")\n", + "dim(dhis2_pyramid_formatted)\n", + "head(dhis2_pyramid_formatted, 2)\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "2fd92901-901e-4019-be78-a7718050c1c4" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", - "}\n", - "\n", - "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", - " cat(msg)\n", - " stop(msg)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", - "metadata": { - "papermill": { - "duration": 9.1e-05, - "end_time": "2026-01-16T10:23:59.817949", - "exception": false, - "start_time": "2026-01-16T10:23:59.817858", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000106, + "end_time": "2026-01-16T10:23:59.792710", + "exception": false, + "start_time": "2026-01-16T10:23:59.792604", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.3. Check whether selected indicators are present in routine data\n", + "Extra precaution measure to avoid breaks downstream.
\n", + "\n", + "Note: This logic should be moved to pipeline.py 🐍" + ], + "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1" }, - "tags": [] - }, - "source": [ - "## 3. Reporting rates computations" - ] - }, - { - "cell_type": "markdown", - "id": "7d62cdb6", - "metadata": {}, - "source": [ - "#### 3.0. Define start and end period based on routine data " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", - "metadata": { - "papermill": { - "duration": 0.044172, - "end_time": "2026-01-16T10:23:59.862224", - "exception": false, - "start_time": "2026-01-16T10:23:59.818052", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.024863, + "end_time": "2026-01-16T10:23:59.817677", + "exception": false, + "start_time": "2026-01-16T10:23:59.792814", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", + "}\n", + "\n", + "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", + " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. 
Process cannot continue.\")\n", + " cat(msg)\n", + " stop(msg)\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", - "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", - "\n", - "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", - "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2026-01-16T10:23:59.862555", - "exception": false, - "start_time": "2026-01-16T10:23:59.862446", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000091, + "end_time": "2026-01-16T10:23:59.817949", + "exception": false, + "start_time": "2026-01-16T10:23:59.817858", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 3. Reporting rates computations" + ], + "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295" }, - "tags": [] - }, - "source": [ - "#### 3.1. Build master table (all PERIOD x OU)\n", - "The master table contains all combinations of period x organisation unit " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9308197a-0852-4d34-8888-cf5564f35a9d", - "metadata": { - "papermill": { - "duration": 0.289128, - "end_time": "2026-01-16T10:24:00.151791", - "exception": false, - "start_time": "2026-01-16T10:23:59.862663", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.0. 
Define start and end period based on routine data " + ], + "id": "7d62cdb6" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. Periods count: {length(period_vector)}\"))\n", - "facility_master <- build_facility_master_dataelement(\n", - " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", - " period_vector = period_vector,\n", - " config_json = config_json,\n", - " ADMIN_1 = ADMIN_1,\n", - " ADMIN_2 = ADMIN_2\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", - "metadata": { - "papermill": { - "duration": 0.000114, - "end_time": "2026-01-16T10:24:00.152094", - "exception": false, - "start_time": "2026-01-16T10:24:00.151980", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.044172, + "end_time": "2026-01-16T10:23:59.862224", + "exception": false, + "start_time": "2026-01-16T10:23:59.818052", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", + "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", + "\n", + "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", + "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" + ], + "execution_count": null, + "outputs": [], + "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a" }, - "tags": [] - }, - "source": [ - "#### 3.2. Identify \"Active\" facilities\n", - "\n", - "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. 
Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b279d27", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Join routine values to the facility master and define monthly activity\n", - "facility_master_routine <- dplyr::left_join(\n", - " facility_master,\n", - " dhis2_routine %>% dplyr::select(OU_ID, PERIOD, dplyr::any_of(DHIS2_INDICATORS)),\n", - " by = c(\"OU_ID\", \"PERIOD\")\n", - ") %>%\n", - " dplyr::mutate(\n", - " YEAR = as.numeric(substr(PERIOD, 1, 4)),\n", - " ACTIVE_THIS_PERIOD = ifelse(\n", - " rowSums(!is.na(dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS))) &\n", - " dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0\n", - " ),\n", - " COUNT = 1\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", - "metadata": { - "papermill": { - "duration": 0.000107, - "end_time": "2026-01-16T10:24:01.626760", - "exception": false, - "start_time": "2026-01-16T10:24:01.626653", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:23:59.862555", + "exception": false, + "start_time": "2026-01-16T10:23:59.862446", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.1. Build master table (all PERIOD x OU)\n", + "The master table contains all combinations of period x organisation unit " + ], + "id": "526bc3af-01c1-4ddc-b3b9-077354e57559" }, - "tags": [] - }, - "source": [ - "#### 3.3. Identify `OPEN` facilities (denominator)\n", - "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", - "\n", - "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", - "1. No explicit closure in the facility name. 
The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", - "\n", - "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", - "\n", - "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", - "metadata": { - "papermill": { - "duration": 1.317923, - "end_time": "2026-01-16T10:24:02.944800", - "exception": false, - "start_time": "2026-01-16T10:24:01.626877", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.289128, + "end_time": "2026-01-16T10:24:00.151791", + "exception": false, + "start_time": "2026-01-16T10:23:59.862663", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. 
Periods count: {length(period_vector)}\"))\n", + "facility_master <- build_facility_master_dataelement(\n", + " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", + " period_vector = period_vector,\n", + " config_json = config_json,\n", + " ADMIN_1 = ADMIN_1,\n", + " ADMIN_2 = ADMIN_2\n", + ")\n" + ], + "execution_count": null, + "outputs": [], + "id": "9308197a-0852-4d34-8888-cf5564f35a9d" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.3 Identify OPEN facilities from naming and opening/closing dates\n", - "facility_master_routine <- facility_master_routine %>%\n", - " dplyr::mutate(\n", - " period_date = as.Date(zoo::as.yearmon(as.character(PERIOD), \"%Y%m\")),\n", - " NAME_CLOSED = stringr::str_detect(toupper(OU_NAME), \"CLOTUR|FERM(E|EE)?\"),\n", - " OPEN_BY_DATE = !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date |\n", - " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)),\n", - " OPEN = ifelse(!NAME_CLOSED & OPEN_BY_DATE, 1, 0)\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": "657fd6ca", - "metadata": {}, - "source": [ - "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" - ] - }, - { - "cell_type": "markdown", - "id": "a598e4b7", - "metadata": {}, - "source": [ - "
\n", - " Important: this step could have a huge influence on reporting rates!
\n", - " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", - "
    \n", - "
  • With YEAR → “active that year”
  • \n", - "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", - "
\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", - "metadata": { - "papermill": { - "duration": 0.173961, - "end_time": "2026-01-16T10:24:05.948136", - "exception": false, - "start_time": "2026-01-16T10:24:05.774175", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000114, + "end_time": "2026-01-16T10:24:00.152094", + "exception": false, + "start_time": "2026-01-16T10:24:00.151980", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.2. Identify \"Active\" facilities\n", + "\n", + "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" + ], + "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.4 Mark facilities active at least once per year\n", - "facility_master_routine <- facility_master_routine %>%\n", - " dplyr::group_by(OU_ID, YEAR) %>%\n", - " dplyr::mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>%\n", - " dplyr::ungroup()\n" - ] - }, - { - "cell_type": "markdown", - "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", - "metadata": { - "papermill": { - "duration": 9.8e-05, - "end_time": "2026-01-16T10:24:05.948452", - "exception": false, - "start_time": "2026-01-16T10:24:05.948354", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Join routine values to the facility master and define monthly activity\n", + "facility_master_routine <- dplyr::left_join(\n", + " facility_master,\n", + " dhis2_routine %>% dplyr::select(OU_ID, PERIOD, dplyr::any_of(DHIS2_INDICATORS)),\n", + " by = c(\"OU_ID\", \"PERIOD\")\n", + ") %>%\n", + " 
dplyr::mutate(\n", + " YEAR = as.numeric(substr(PERIOD, 1, 4)),\n", + " ACTIVE_THIS_PERIOD = ifelse(\n", + " rowSums(!is.na(dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS))) &\n", + " dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0\n", + " ),\n", + " COUNT = 1\n", + " )\n" + ], + "execution_count": null, + "outputs": [], + "id": "7b279d27" }, - "tags": [] - }, - "source": [ - "#### 3.5. Compute Weighting factor based on \"volume of activity\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4420e559-4134-4fc3-8950-9972ebede00e", - "metadata": { - "papermill": { - "duration": 0.520673, - "end_time": "2026-01-16T10:24:06.469233", - "exception": false, - "start_time": "2026-01-16T10:24:05.948560", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000107, + "end_time": "2026-01-16T10:24:01.626760", + "exception": false, + "start_time": "2026-01-16T10:24:01.626653", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.3. Identify `OPEN` facilities (denominator)\n", + "The \"OPEN\" variable indicates whether a facility is considered structurally open for a given reporting period.\n", + "\n", + "A facility is flagged as open (OPEN = 1) for a period if both of the following conditions are met:\n", + "1. No explicit closure in the facility name. The facility name does not contain closure keywords such as “CLOTUR”, “FERMÉ”, “FERMEE”, or similar.\n", + "\n", + "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", + "\n", + "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." 
+ ], + "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.5 Compute facility weights from volume of activity\n", - "mean_monthly_cases <- dhis2_routine %>%\n", - " dplyr::mutate(total_cases_by_hf_month = rowSums(dplyr::across(dplyr::all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>%\n", - " dplyr::group_by(ADM2_ID, OU_ID) %>%\n", - " dplyr::summarise(\n", - " total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE),\n", - " number_of_reporting_months = length(which(total_cases_by_hf_month > 0)),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " dplyr::mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>%\n", - " dplyr::select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF)\n", - "\n", - "mean_monthly_cases_adm2 <- mean_monthly_cases %>%\n", - " dplyr::select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>%\n", - " dplyr::group_by(ADM2_ID) %>%\n", - " dplyr::summarise(\n", - " SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm = TRUE),\n", - " NR_OF_HF = dplyr::n(),\n", - " .groups = \"drop\"\n", - " )\n", - "\n", - "hf_weights <- mean_monthly_cases %>%\n", - " dplyr::left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", - " dplyr::mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n" - ] - }, - { - "cell_type": "markdown", - "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:06.469622", - "exception": false, - "start_time": "2026-01-16T10:24:06.469514", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 1.317923, + "end_time": "2026-01-16T10:24:02.944800", + "exception": false, + "start_time": "2026-01-16T10:24:01.626877", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.3 Identify 
OPEN facilities from naming and opening/closing dates\n", + "facility_master_routine <- facility_master_routine %>%\n", + " dplyr::mutate(\n", + " period_date = as.Date(zoo::as.yearmon(as.character(PERIOD), \"%Y%m\")),\n", + " NAME_CLOSED = stringr::str_detect(toupper(OU_NAME), \"CLOTUR|FERM(E|EE)?\"),\n", + " OPEN_BY_DATE = !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date |\n", + " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)),\n", + " OPEN = ifelse(!NAME_CLOSED & OPEN_BY_DATE, 1, 0)\n", + " )\n" + ], + "execution_count": null, + "outputs": [], + "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89" }, - "tags": [] - }, - "source": [ - "#### 3.6. Compute Weighted variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", - "metadata": { - "papermill": { - "duration": 0.483413, - "end_time": "2026-01-16T10:24:06.953139", - "exception": false, - "start_time": "2026-01-16T10:24:06.469726", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.4. 
Identify \"Active\" facilities for each YEAR (denominator)" + ], + "id": "657fd6ca" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.6 Apply weights to monthly status variables\n", - "facility_master_routine_02 <- facility_master_routine %>%\n", - " dplyr::left_join(hf_weights %>% dplyr::select(OU_ID, WEIGHT), by = c(\"OU_ID\"))\n", - "\n", - "facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", - "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n" - ] - }, - { - "cell_type": "markdown", - "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2026-01-16T10:24:06.953755", - "exception": false, - "start_time": "2026-01-16T10:24:06.953583", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " Important: this step could have a huge influence on reporting rates!
\n", + " Activity can be evaluated over 1 year or across all years, based on grouping: group_by(OU_ID, YEAR):
\n", + "
    \n", + "
  • With YEAR → “active that year”
  • \n", + "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", + "
\n", + "
" + ], + "id": "a598e4b7" }, - "tags": [] - }, - "source": [ - "#### 3.7. Aggregate data at ADM2 level" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af13191e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.7 Aggregate monthly counts at ADM2 level\n", - "reporting_rate_adm2 <- facility_master_routine_02 %>%\n", - " dplyr::group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>%\n", - " dplyr::summarise(\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE),\n", - " NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE),\n", - " NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE),\n", - " NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE),\n", - " HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " dplyr::mutate(\n", - " RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2,\n", - " RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2,\n", - " RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2,\n", - " RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED,\n", - " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", - " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": "7d381937", - "metadata": {}, - "source": [ - "#### 3.8. 
Calculate Reporting Rates (all methods)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b41263f8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.8 Select final reporting-rate definition for export\n", - "rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF\" else \"RR_OPEN_HF\"\n", - "if (USE_WEIGHTED_REPORTING_RATES) {\n", - " rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF_W\" else \"RR_OPEN_HF_W\"\n", - "}\n", - "\n", - "reporting_rate_dataelement <- reporting_rate_adm2 %>%\n", - " dplyr::mutate(MONTH = PERIOD %% 100) %>%\n", - " dplyr::rename(REPORTING_RATE = !!rlang::sym(rr_column_selection)) %>%\n", - " dplyr::select(YEAR, MONTH, ADM2_ID, REPORTING_RATE)\n" - ] - }, - { - "cell_type": "markdown", - "id": "5e593659", - "metadata": { - "papermill": { - "duration": 0.000108, - "end_time": "2026-01-16T10:24:07.310579", - "exception": false, - "start_time": "2026-01-16T10:24:07.310471", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.173961, + "end_time": "2026-01-16T10:24:05.948136", + "exception": false, + "start_time": "2026-01-16T10:24:05.774175", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.4 Mark facilities active at least once per year\n", + "facility_master_routine <- facility_master_routine %>%\n", + " dplyr::group_by(OU_ID, YEAR) %>%\n", + " dplyr::mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>%\n", + " dplyr::ungroup()\n" + ], + "execution_count": null, + "outputs": [], + "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d" }, - "tags": [] - }, - "source": [ - "## 4. 
Select correct col for `REPORTING_RATE` based on denominator method" - ] - }, - { - "cell_type": "markdown", - "id": "c75f2249", - "metadata": { - "papermill": { - "duration": 5.7e-05, - "end_time": "2026-01-16T10:24:07.310743", - "exception": false, - "start_time": "2026-01-16T10:24:07.310686", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000098, + "end_time": "2026-01-16T10:24:05.948452", + "exception": false, + "start_time": "2026-01-16T10:24:05.948354", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.5. Compute Weighting factor based on \"volume of activity\"" + ], + "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d" }, - "tags": [] - }, - "source": [ - "### 4.1. Select results and format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e71b38", - "metadata": { - "papermill": { - "duration": 0.020644, - "end_time": "2026-01-16T10:24:07.351317", - "exception": false, - "start_time": "2026-01-16T10:24:07.330673", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.520673, + "end_time": "2026-01-16T10:24:06.469233", + "exception": false, + "start_time": "2026-01-16T10:24:05.948560", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.5 Compute facility weights from volume of activity\n", + "mean_monthly_cases <- dhis2_routine %>%\n", + " dplyr::mutate(total_cases_by_hf_month = rowSums(dplyr::across(dplyr::all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>%\n", + " dplyr::group_by(ADM2_ID, OU_ID) %>%\n", + " dplyr::summarise(\n", + " total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE),\n", + " number_of_reporting_months = length(which(total_cases_by_hf_month > 0)),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " dplyr::mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>%\n", + " dplyr::select(ADM2_ID, 
OU_ID, MEAN_REPORTED_CASES_BY_HF)\n", + "\n", + "mean_monthly_cases_adm2 <- mean_monthly_cases %>%\n", + " dplyr::select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>%\n", + " dplyr::group_by(ADM2_ID) %>%\n", + " dplyr::summarise(\n", + " SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm = TRUE),\n", + " NR_OF_HF = dplyr::n(),\n", + " .groups = \"drop\"\n", + " )\n", + "\n", + "hf_weights <- mean_monthly_cases %>%\n", + " dplyr::left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", + " dplyr::mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n" + ], + "execution_count": null, + "outputs": [], + "id": "4420e559-4134-4fc3-8950-9972ebede00e" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 4.1 Confirm which denominator/weighting option was selected\n", - "cat(glue::glue(\n", - " \"Selected denominator method: {DATAELEMENT_METHOD_DENOMINATOR} | Weighted reporting rates: {USE_WEIGHTED_REPORTING_RATES}\"\n", - "))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3df36abb", - "metadata": { - "papermill": { - "duration": 0.140976, - "end_time": "2026-01-16T10:24:07.492479", - "exception": false, - "start_time": "2026-01-16T10:24:07.351503", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:06.469622", + "exception": false, + "start_time": "2026-01-16T10:24:06.469514", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.6. 
Compute Weighted variables" + ], + "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Output preview\n", - "dim(reporting_rate_dataelement)\n", - "head(reporting_rate_dataelement, 5)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ccc272c", - "metadata": { - "papermill": { - "duration": 0.182574, - "end_time": "2026-01-16T10:24:07.675242", - "exception": false, - "start_time": "2026-01-16T10:24:07.492668", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.483413, + "end_time": "2026-01-16T10:24:06.953139", + "exception": false, + "start_time": "2026-01-16T10:24:06.469726", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.6 Apply weights to monthly status variables\n", + "facility_master_routine_02 <- facility_master_routine %>%\n", + " dplyr::left_join(hf_weights %>% dplyr::select(OU_ID, WEIGHT), by = c(\"OU_ID\"))\n", + "\n", + "facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", + "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n" + ], + "execution_count": null, + "outputs": [], + "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Basic data quality checks\n", - "summary(reporting_rate_dataelement$REPORTING_RATE)\n", - "sum(is.na(reporting_rate_dataelement$REPORTING_RATE))\n" - ] - }, - { - "cell_type": "markdown", - "id": "ca66e785", - "metadata": { - "papermill": { 
- "duration": 0.000109, - "end_time": "2026-01-16T10:24:07.675637", - "exception": false, - "start_time": "2026-01-16T10:24:07.675528", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2026-01-16T10:24:06.953755", + "exception": false, + "start_time": "2026-01-16T10:24:06.953583", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.7. Aggregate data at ADM2 level" + ], + "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02" }, - "tags": [] - }, - "source": [ - "## 5. Inspect reporting rate values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31535459", - "metadata": { - "papermill": { - "duration": 0.160299, - "end_time": "2026-01-16T10:24:07.836039", - "exception": false, - "start_time": "2026-01-16T10:24:07.675740", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.7 Aggregate monthly counts at ADM2 level\n", + "reporting_rate_adm2 <- facility_master_routine_02 %>%\n", + " dplyr::group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>%\n", + " dplyr::summarise(\n", + " HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE),\n", + " NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE),\n", + " NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE),\n", + " NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE),\n", + " NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE),\n", + " HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " dplyr::mutate(\n", + " RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2,\n", + " RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2,\n", + " RR_ACTIVE_HF = 
HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2,\n", + " RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED,\n", + " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", + " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", + " )\n" + ], + "execution_count": null, + "outputs": [], + "id": "af13191e" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", - "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", - "xlab=\"REPORTING_RATE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6778f17d", - "metadata": { - "papermill": { - "duration": 0.896382, - "end_time": "2026-01-16T10:24:08.732660", - "exception": false, - "start_time": "2026-01-16T10:24:07.836278", - "status": "completed" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.8. 
Calculate Reporting Rates (all methods)" + ], + "id": "7d381937" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Boxplot\n", - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - " geom_boxplot(outlier.alpha = 0.3) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7f013fd", - "metadata": { - "papermill": { - "duration": 0.859448, - "end_time": "2026-01-16T10:24:09.592295", - "exception": false, - "start_time": "2026-01-16T10:24:08.732847", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.8 Select final reporting-rate definition for export\n", + "rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF\" else \"RR_OPEN_HF\"\n", + "if (USE_WEIGHTED_REPORTING_RATES) {\n", + " rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF_W\" else \"RR_OPEN_HF_W\"\n", + "}\n", + "\n", + "reporting_rate_dataelement <- reporting_rate_adm2 %>%\n", + " dplyr::mutate(MONTH = PERIOD %% 100) %>%\n", + " dplyr::rename(REPORTING_RATE = !!rlang::sym(rr_column_selection)) %>%\n", + " dplyr::select(YEAR, MONTH, ADM2_ID, REPORTING_RATE)\n" + ], + "execution_count": null, + "outputs": [], + "id": "b41263f8" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000108, + "end_time": "2026-01-16T10:24:07.310579", + "exception": false, + "start_time": "2026-01-16T10:24:07.310471", + "status": "completed" + }, + "tags": [] + }, + "source": [ + 
"## 4. Select correct col for `REPORTING_RATE` based on denominator method" + ], + "id": "5e593659" }, - "tags": [], - "vscode": { - "languageId": "r" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000057, + "end_time": "2026-01-16T10:24:07.310743", + "exception": false, + "start_time": "2026-01-16T10:24:07.310686", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 4.1. Select results and format" + ], + "id": "c75f2249" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.020644, + "end_time": "2026-01-16T10:24:07.351317", + "exception": false, + "start_time": "2026-01-16T10:24:07.330673", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 4.1 Confirm which denominator/weighting option was selected\n", + "cat(glue::glue(\n", + " \"Selected denominator method: {DATAELEMENT_METHOD_DENOMINATOR} | Weighted reporting rates: {USE_WEIGHTED_REPORTING_RATES}\"\n", + "))\n" + ], + "execution_count": null, + "outputs": [], + "id": "75e71b38" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.140976, + "end_time": "2026-01-16T10:24:07.492479", + "exception": false, + "start_time": "2026-01-16T10:24:07.351503", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Output preview\n", + "dim(reporting_rate_dataelement)\n", + "head(reporting_rate_dataelement, 5)\n" + ], + "execution_count": null, + "outputs": [], + "id": "3df36abb" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.182574, + "end_time": "2026-01-16T10:24:07.675242", + "exception": false, + "start_time": "2026-01-16T10:24:07.492668", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Basic data quality checks\n", + "summary(reporting_rate_dataelement$REPORTING_RATE)\n", + 
"sum(is.na(reporting_rate_dataelement$REPORTING_RATE))\n" + ], + "execution_count": null, + "outputs": [], + "id": "0ccc272c" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2026-01-16T10:24:07.675637", + "exception": false, + "start_time": "2026-01-16T10:24:07.675528", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 5. Inspect reporting rate values" + ], + "id": "ca66e785" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.160299, + "end_time": "2026-01-16T10:24:07.836039", + "exception": false, + "start_time": "2026-01-16T10:24:07.675740", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", + "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", + "xlab=\"REPORTING_RATE\")" + ], + "execution_count": null, + "outputs": [], + "id": "31535459" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.896382, + "end_time": "2026-01-16T10:24:08.732660", + "exception": false, + "start_time": "2026-01-16T10:24:07.836278", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Boxplot\n", + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + " geom_boxplot(outlier.alpha = 0.3) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE ({DATAELEMENT_METHOD_DENOMINATOR})\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ], + "execution_count": null, + "outputs": [], + "id": "6778f17d" + }, + { + "cell_type": "code", + "metadata": { + 
"papermill": { + "duration": 0.859448, + "end_time": "2026-01-16T10:24:09.592295", + "exception": false, + "start_time": "2026-01-16T10:24:08.732847", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "ggplot(reporting_rate_dataelement,\n", + " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", + "# Boxplot without outliers\n", + " geom_boxplot(outlier.alpha = 0) +\n", + " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", + " labs(\n", + " x = \"Year\",\n", + " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", + " title = \"Distribution of REPORTING_RATE per year\",\n", + " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", + " ) +\n", + " theme_minimal()" + ], + "execution_count": null, + "outputs": [], + "id": "a7f013fd" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000088, + "end_time": "2026-01-16T10:24:09.592563", + "exception": false, + "start_time": "2026-01-16T10:24:09.592475", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 5. 
📁 Export to `data/` folder" + ], + "id": "2866816a-7015-4c5c-b904-f553f3b4790d" + }, + { + "cell_type": "code", + "metadata": { + "papermill": { + "duration": 0.919937, + "end_time": "2026-01-16T10:24:10.512602", + "exception": false, + "start_time": "2026-01-16T10:24:09.592665", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "output_dir <- file.path(setup$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", + "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "write.csv(\n", + " reporting_rate_dataelement,\n", + " file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\")),\n", + " row.names = FALSE\n", + ")\n", + "log_msg(glue::glue(\"Exported: {file.path(output_dir, paste0(COUNTRY_CODE, '_reporting_rate_dataelement.csv'))}\"))\n", + "\n", + "arrow::write_parquet(\n", + " reporting_rate_dataelement,\n", + " file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\"))\n", + ")\n", + "log_msg(glue::glue(\"Exported: {file.path(output_dir, paste0(COUNTRY_CODE, '_reporting_rate_dataelement.parquet'))}\"))\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "bbf27852-8ec5-4370-aae2-49e082928fe1" } - }, - "outputs": [], - "source": [ - "ggplot(reporting_rate_dataelement,\n", - " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", - "# Boxplot without outliers\n", - " geom_boxplot(outlier.alpha = 0) +\n", - " geom_point(alpha = 0.3, position = position_jitter(width = 0.35)) +\n", - " labs(\n", - " x = \"Year\",\n", - " y = glue::glue(\"REPORTING_RATE based on {DATAELEMENT_METHOD_DENOMINATOR}\"),\n", - " title = \"Distribution of REPORTING_RATE per year\",\n", - " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", - " ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "2866816a-7015-4c5c-b904-f553f3b4790d", - "metadata": { - "papermill": { - "duration": 
8.8e-05, - "end_time": "2026-01-16T10:24:09.592563", - "exception": false, - "start_time": "2026-01-16T10:24:09.592475", - "status": "completed" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "tags": [] - }, - "source": [ - "## 5. 📁 Export to `data/` folder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", - "metadata": { - "papermill": { - "duration": 0.919937, - "end_time": "2026-01-16T10:24:10.512602", - "exception": false, - "start_time": "2026-01-16T10:24:09.592665", - "status": "completed" + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, - "tags": [], - "vscode": { - "languageId": "r" + "papermill": { + "default_parameters": {}, + "duration": 81.158347, + "end_time": "2026-01-16T10:24:10.736106", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", + "parameters": { + "AVAILABILITY_INDICATORS": [ + "CONF", + "PRES", + "SUSP", + "TEST" + ], + "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", + "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace", + "USE_WEIGHTED_REPORTING_RATES": true, + "VOLUME_ACTIVITY_INDICATORS": [ + "CONF", + "PRES" + ] + }, + "start_time": "2026-01-16T10:22:49.577759", + "version": "2.6.0" } - }, - "outputs": [], - "source": [ - "export_reporting_rate_dataelement(\n", - " reporting_rate_dataelement = reporting_rate_dataelement,\n", - " DATA_PATH = DATA_PATH,\n", - " COUNTRY_CODE = COUNTRY_CODE\n", - ")\n" - ] - } - ], - "metadata": { - 
"kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" }, - "papermill": { - "default_parameters": {}, - "duration": 81.158347, - "end_time": "2026-01-16T10:24:10.736106", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", - "parameters": { - "AVAILABILITY_INDICATORS": [ - "CONF", - "PRES", - "SUSP", - "TEST" - ], - "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", - "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace", - "USE_WEIGHTED_REPORTING_RATES": true, - "VOLUME_ACTIVITY_INDICATORS": [ - "CONF", - "PRES" - ] - }, - "start_time": "2026-01-16T10:22:49.577759", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r index c73601b..936e586 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r @@ -1,48 +1,75 @@ -select_routine_dataset_name_dataelement <- function(ROUTINE_FILE, COUNTRY_CODE, config_json) { - if (ROUTINE_FILE == glue::glue("{COUNTRY_CODE}_routine.parquet")) { - return(config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED) - } - 
config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION -} - - -load_routine_data_dataelement <- function(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE) { - dhis2_routine <- tryCatch({ - get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) - }, error = function(e) { - msg <- paste("[ERROR] Error while loading DHIS2 routine data file for: ", COUNTRY_CODE, conditionMessage(e)) - cat(msg) - stop(msg) - }) +# Load base utils +source(file.path("~/workspace/code", "snt_utils.r")) + + +#' Get Setup Variables for SNT Workspace +#' Initializes workspace paths, loads R packages, and imports OpenHEXA SDK. +#' +#' @param SNT_ROOT_PATH Character. Root path of the SNT workspace. Default: '~/workspace' +#' @param packages Character vector. R packages to install and load. +#' @return List with CONFIG_PATH, UPLOADS_PATH, DATA_PATH. +#' +#' @export +get_setup_variables <- function( + SNT_ROOT_PATH = "~/workspace", + packages = c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate", "zoo") +) { + setup_variable <- list( + CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"), + UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"), + DATA_PATH = file.path(SNT_ROOT_PATH, "data") + ) - dhis2_routine <- dhis2_routine %>% - dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric)) + install_and_load(packages) - log_msg(glue::glue( - "DHIS2 routine file {ROUTINE_FILE} loaded from dataset: {rountine_dataset_name}. 
Dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}" - )) + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + reticulate::py_config()$python + assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv) - dhis2_routine + return(setup_variable) } -load_pyramid_data_dataelement <- function(config_json, COUNTRY_CODE) { - dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED - - dhis2_pyramid_formatted <- tryCatch({ - get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_pyramid.parquet")) - }, error = function(e) { - msg <- paste("Error while loading DHIS2 pyramid FORMATTED data file for: ", COUNTRY_CODE, conditionMessage(e)) - cat(msg) - stop(msg) - }) +#' Load SNT Configuration File +#' Reads and parses a JSON configuration file. +#' @param snt_config_path Character. Path to the configuration JSON file. +#' @return List containing parsed configuration. +#' +#' @export +load_snt_config <- function(snt_config_path) { + config_json <- tryCatch( + { jsonlite::fromJSON(snt_config_path) }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration: {snt_config_path}") + cat(msg) + stop(msg) + } + ) + log_msg(paste0("SNT configuration loaded from: ", snt_config_path)) + return(config_json) +} - log_msg(paste0( - "DHIS2 pyramid FORMATTED data loaded from dataset: `", dataset_name, - "`. Dataframe dimensions: ", paste(dim(dhis2_pyramid_formatted), collapse = ", ") - )) - dhis2_pyramid_formatted +#' Load Dataset File from OpenHEXA +#' Retrieves the latest version of a file from an OpenHEXA dataset. +#' +#' @param dataset_id Character. OpenHEXA dataset identifier. +#' @param filename Character. Name of file to load. +#' @return Dataframe containing the loaded data. 
+#' +#' @export +load_dataset_file <- function(dataset_id, filename) { + data <- tryCatch( + { get_latest_dataset_file_in_memory(dataset_id, filename) }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading {filename} file: {conditionMessage(e)}") + log_msg(msg, "error") + stop(msg) + } + ) + msg <- glue::glue("{filename} data loaded from dataset: {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]") + log_msg(msg) + return(data) } @@ -67,117 +94,3 @@ build_facility_master_dataelement <- function( tidyr::crossing(PERIOD = period_vector) %>% dplyr::mutate(PERIOD = as.numeric(PERIOD)) } - - -compute_reporting_rate_dataelement <- function( - facility_master, - dhis2_routine, - DHIS2_INDICATORS, - ACTIVITY_INDICATORS, - VOLUME_ACTIVITY_INDICATORS, - DATAELEMENT_METHOD_DENOMINATOR, - USE_WEIGHTED_REPORTING_RATES -) { - facility_master_routine <- dplyr::left_join( - facility_master, - dhis2_routine %>% dplyr::select(OU_ID, PERIOD, dplyr::any_of(DHIS2_INDICATORS)), - by = c("OU_ID", "PERIOD") - ) %>% - dplyr::mutate( - YEAR = as.numeric(substr(PERIOD, 1, 4)), - ACTIVE_THIS_PERIOD = ifelse( - rowSums(!is.na(dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS))) & - dplyr::across(dplyr::all_of(ACTIVITY_INDICATORS)) >= 0) > 0, 1, 0 - ), - COUNT = 1 - ) %>% - dplyr::mutate( - period_date = as.Date(zoo::as.yearmon(as.character(PERIOD), "%Y%m")), - NAME_CLOSED = stringr::str_detect(toupper(OU_NAME), "CLOTUR|FERM(E|EE)?"), - OPEN_BY_DATE = !(is.na(OPENING_DATE) | as.Date(OPENING_DATE) > period_date | - (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)), - OPEN = ifelse(!NAME_CLOSED & OPEN_BY_DATE, 1, 0) - ) %>% - dplyr::group_by(OU_ID, YEAR) %>% - dplyr::mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>% - dplyr::ungroup() - - mean_monthly_cases <- dhis2_routine %>% - dplyr::mutate(total_cases_by_hf_month = rowSums(dplyr::across(dplyr::all_of(VOLUME_ACTIVITY_INDICATORS)), na.rm = TRUE)) %>% - 
dplyr::group_by(ADM2_ID, OU_ID) %>% - dplyr::summarise( - total_cases_by_hf_year = sum(total_cases_by_hf_month, na.rm = TRUE), - number_of_reporting_months = length(which(total_cases_by_hf_month > 0)), - .groups = "drop" - ) %>% - dplyr::mutate(MEAN_REPORTED_CASES_BY_HF = total_cases_by_hf_year / number_of_reporting_months) %>% - dplyr::select(ADM2_ID, OU_ID, MEAN_REPORTED_CASES_BY_HF) - - mean_monthly_cases_adm2 <- mean_monthly_cases %>% - dplyr::select(ADM2_ID, MEAN_REPORTED_CASES_BY_HF) %>% - dplyr::group_by(ADM2_ID) %>% - dplyr::summarise( - SUMMED_MEAN_REPORTED_CASES_BY_ADM2 = sum(MEAN_REPORTED_CASES_BY_HF, na.rm = TRUE), - NR_OF_HF = dplyr::n() - ) - - hf_weights <- mean_monthly_cases %>% - dplyr::left_join(mean_monthly_cases_adm2, by = "ADM2_ID") %>% - dplyr::mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF) - - facility_master_routine_02 <- facility_master_routine %>% - dplyr::left_join(hf_weights %>% dplyr::select(OU_ID, WEIGHT), by = c("OU_ID")) - - facility_master_routine_02$ACTIVE_THIS_PERIOD_W <- facility_master_routine_02$ACTIVE_THIS_PERIOD * facility_master_routine_02$WEIGHT - facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT - facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT - facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT - - reporting_rate_adm2 <- facility_master_routine_02 %>% - dplyr::group_by(ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, YEAR, PERIOD) %>% - dplyr::summarise( - HF_ACTIVE_THIS_PERIOD_BY_ADM2 = sum(ACTIVE_THIS_PERIOD, na.rm = TRUE), - NR_OF_HF_BY_ADM2 = sum(COUNT, na.rm = TRUE), - NR_OF_OPEN_HF_BY_ADM2 = sum(OPEN, na.rm = TRUE), - HF_ACTIVE_THIS_YEAR_BY_ADM2 = sum(ACTIVE_THIS_YEAR, na.rm = TRUE), - HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_PERIOD_W, na.rm = TRUE), - 
NR_OF_HF_BY_ADM2_WEIGHTED = sum(COUNT_W, na.rm = TRUE), - NR_OF_OPEN_HF_BY_ADM2_WEIGHTED = sum(OPEN_W, na.rm = TRUE), - HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED = sum(ACTIVE_THIS_YEAR_W, na.rm = TRUE), - .groups = "drop" - ) %>% - dplyr::mutate( - RR_TOTAL_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_HF_BY_ADM2, - RR_OPEN_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / NR_OF_OPEN_HF_BY_ADM2, - RR_ACTIVE_HF = HF_ACTIVE_THIS_PERIOD_BY_ADM2 / HF_ACTIVE_THIS_YEAR_BY_ADM2, - RR_TOTAL_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_HF_BY_ADM2_WEIGHTED, - RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED, - RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED - ) - - rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == "ROUTINE_ACTIVE_FACILITIES") "RR_ACTIVE_HF" else "RR_OPEN_HF" - if (USE_WEIGHTED_REPORTING_RATES) { - rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == "ROUTINE_ACTIVE_FACILITIES") "RR_ACTIVE_HF_W" else "RR_OPEN_HF_W" - } - - reporting_rate_adm2 %>% - dplyr::mutate(MONTH = PERIOD %% 100) %>% - dplyr::rename(REPORTING_RATE = !!rlang::sym(rr_column_selection)) %>% - dplyr::select(YEAR, MONTH, ADM2_ID, REPORTING_RATE) -} - - -export_reporting_rate_dataelement <- function(reporting_rate_dataelement, DATA_PATH, COUNTRY_CODE) { - output_data_path <- file.path(DATA_PATH, "reporting_rate") - if (!dir.exists(output_data_path)) { - dir.create(output_data_path, recursive = TRUE) - } - - file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataelement.parquet")) - arrow::write_parquet(reporting_rate_dataelement, file_path) - log_msg(glue::glue("Exported : {file_path}")) - - file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataelement.csv")) - write.csv(reporting_rate_dataelement, file_path, row.names = FALSE) - log_msg(glue::glue("Exported : {file_path}")) -} diff --git 
a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index f39b231..7e22286 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -1,1103 +1,1103 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "30bf8dfc", - "metadata": {}, - "source": [ - "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", - "\n", - "The **reporting rate** measures the proportion of registered health facilities that submit data. It is calculated for each administrative level 2 (`ADM2`) area and for each reporting period (`PERIOD` in YYYYMM format).\n", - "
\n", - "\n", - "**Dataset Selection**
\n", - "The choice of dataset(s) used for reporting rate calculation is controlled by modifying the SNT_config.json configuration file. This allows flexible selection among multiple datasets extracted from the same DHIS2 instance.\n", - "\n", - "**Calculation Logic**
\n", - "From the selected dataset(s):\n", - "- **Numerator:** Number of facilities that _actually_ reported, derived from the element \"ACTUAL_REPORTS\".\n", - "- **Denominator:** Number of facilities _expected_ to report, derived from the element \"EXPECTED_REPORTS\".\n", - "\n", - "After aggregating these counts at the ADM2 level, the reporting rate is computed as:\n", - "
\n", - "REPORTING RATE = ACTUAL_REPORTS / EXPECTED_REPORTS\n", - "
\n", - "and expressed as a **proportion** between 0 and 1.\n", - "
\n", - "\n", - "-----\n", - "\n", - "### Additional Data Processing Steps\n", - "\n", - "- **Handling Multiple Datasets:** \n", - " When multiple datasets are available, the pipeline uses only those specified in SNT_config.json. For these selected datasets, the counts of actual and expected reports are summed by ADM2 area.\n", - "\n", - "- **Deduplication of Entries:** \n", - " Sometimes, the same organizational unit (OU_ID) may appear in multiple datasets for the same period, risking double counting. To address this, deduplication is performed by keeping only the entry with the **highest** ACTUAL_REPORTS value for each unique combination of OU_ID and PERIOD. \n", - "
    \n", - "
  • Why keep the highest? Because ACTUAL_REPORTS values are binary (0 or 1). If duplicates agree (all 0 or all 1), keeping one suffices. If they differ (some 0, some 1), keeping the 1 ensures that presence of a report is not missed.
  • \n", - "
  • 🚨Important: Deduplication only proceeds if all duplicated values are within {0,1}. If other values are present, deduplication is skipped with a warning to avoid incorrect data handling.
  • \n", - "
\n", - "\n", - "-----\n", - "\n", - "\n", - "### 🇳🇪 Niger-Specific Processing: \n", - " In Niger, datasets for HOP (hospital) facilities are already **pre-aggregated** and may contain values greater than 1 for actual or expected reports, reflecting subunits or departments within a hospital. \n", - "
\n", - " To accurately represent reporting at the facility level and avoid overcounting, all values greater than 1 are converted to 1 (presence/absence). This ensures that the reporting rate reflects whether the hospital as a whole reported, rather than counting multiple subunits separately. This step also prevents cases where ACTUAL_REPORTS exceeds EXPECTED_REPORTS.\n", - "\n", - "------\n", - "\n", - "### Pipeline parameters\n", - "\n", - "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. Choose \"Routine data (Raw)\" to use raw routine data.\n", - " \n", - "- **Use routine with outliers removed**: Toggle this on to use the routine data after outliers have been removed (using the outliers detection method selected above). Else, this pipeline will use either the imputed routine data (to replace the outlier values removed) or the raw routine data if you selected \"Routine data (Raw)\" as your choice of “Outlier processing method”." - ] - }, - { - "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", - "metadata": { - "papermill": { - "duration": 9.2e-05, - "end_time": "2025-12-19T10:21:50.273573", - "exception": false, - "start_time": "2025-12-19T10:21:50.273481", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 1. 
Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:21:50.332786Z", - "iopub.status.busy": "2025-12-19T10:21:50.277536Z", - "iopub.status.idle": "2025-12-19T10:23:03.339080Z", - "shell.execute_reply": "2025-12-19T10:23:03.336413Z" - }, - "papermill": { - "duration": 73.068006, - "end_time": "2025-12-19T10:23:03.341764", - "exception": false, - "start_time": "2025-12-19T10:21:50.273758", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataset\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, \"configuration\")\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, \"data\", \"dhis2\")\n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataset.r\"))\n", - "\n", - "# Load libraries\n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"glue\", \"jsonlite\", \"httr\", \"reticulate\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", - "metadata": { - "papermill": { - "duration": 0.00017, - "end_time": "2025-12-19T10:23:03.342235", - "exception": false, - "start_time": "2025-12-19T10:23:03.342065", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.1. 
Load and check `config_json` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:03.351367Z", - "iopub.status.busy": "2025-12-19T10:23:03.348819Z", - "iopub.status.idle": "2025-12-19T10:23:03.979814Z", - "shell.execute_reply": "2025-12-19T10:23:03.976617Z" - }, - "papermill": { - "duration": 0.640406, - "end_time": "2025-12-19T10:23:03.982829", - "exception": false, - "start_time": "2025-12-19T10:23:03.342423", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:03.987632Z", - "iopub.status.busy": "2025-12-19T10:23:03.985301Z", - "iopub.status.idle": "2025-12-19T10:23:04.011308Z", - "shell.execute_reply": "2025-12-19T10:23:04.008941Z" - }, - "papermill": { - "duration": 0.031002, - "end_time": "2025-12-19T10:23:04.014107", - "exception": false, - "start_time": "2025-12-19T10:23:03.983105", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# Which reporting rate PRODUCT_UID 
to use (DHIS2 dataset id)\n", - "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID \n", - "\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ] - }, - { - "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", - "metadata": { - "papermill": { - "duration": 0.00015, - "end_time": "2025-12-19T10:23:04.014523", - "exception": false, - "start_time": "2025-12-19T10:23:04.014373", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.2. Validate parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.019283Z", - "iopub.status.busy": "2025-12-19T10:23:04.017257Z", - "iopub.status.idle": "2025-12-19T10:23:04.039652Z", - "shell.execute_reply": "2025-12-19T10:23:04.037292Z" - }, - "papermill": { - "duration": 0.02788, - "end_time": "2025-12-19T10:23:04.042642", - "exception": false, - "start_time": "2025-12-19T10:23:04.014762", - "status": "completed" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", + "\n", + "The **reporting rate** measures the proportion of registered health facilities that submit data. It is calculated for each administrative level 2 (`ADM2`) area and for each reporting period (`PERIOD` in YYYYMM format).\n", + "
\n", + "\n", + "**Dataset Selection**
\n", + "The choice of dataset(s) used for reporting rate calculation is controlled by modifying the SNT_config.json configuration file. This allows flexible selection among multiple datasets extracted from the same DHIS2 instance.\n", + "\n", + "**Calculation Logic**
\n", + "From the selected dataset(s):\n", + "- **Numerator:** Number of facilities that _actually_ reported, derived from the element \"ACTUAL_REPORTS\".\n", + "- **Denominator:** Number of facilities _expected_ to report, derived from the element \"EXPECTED_REPORTS\".\n", + "\n", + "After aggregating these counts at the ADM2 level, the reporting rate is computed as:\n", + "
\n", + "REPORTING RATE = ACTUAL_REPORTS / EXPECTED_REPORTS\n", + "
\n", + "and expressed as a **proportion** between 0 and 1.\n", + "
\n", + "\n", + "-----\n", + "\n", + "### Additional Data Processing Steps\n", + "\n", + "- **Handling Multiple Datasets:** \n", + " When multiple datasets are available, the pipeline uses only those specified in SNT_config.json. For these selected datasets, the counts of actual and expected reports are summed by ADM2 area.\n", + "\n", + "- **Deduplication of Entries:** \n", + " Sometimes, the same organizational unit (OU_ID) may appear in multiple datasets for the same period, risking double counting. To address this, deduplication is performed by keeping only the entry with the **highest** ACTUAL_REPORTS value for each unique combination of OU_ID and PERIOD. \n", + "
    \n", + "
  • Why keep the highest? Because ACTUAL_REPORTS values are binary (0 or 1). If duplicates agree (all 0 or all 1), keeping one suffices. If they differ (some 0, some 1), keeping the 1 ensures that presence of a report is not missed.
  • \n", + "
  • 🚨Important: Deduplication only proceeds if all duplicated values are within {0,1}. If other values are present, deduplication is skipped with a warning to avoid incorrect data handling.
  • \n", + "
\n", + "\n", + "-----\n", + "\n", + "\n", + "### 🇳🇪 Niger-Specific Processing: \n", + " In Niger, datasets for HOP (hospital) facilities are already **pre-aggregated** and may contain values greater than 1 for actual or expected reports, reflecting subunits or departments within a hospital. \n", + "
\n", + " To accurately represent reporting at the facility level and avoid overcounting, all values greater than 1 are converted to 1 (presence/absence). This ensures that the reporting rate reflects whether the hospital as a whole reported, rather than counting multiple subunits separately. This step also prevents cases where ACTUAL_REPORTS exceeds EXPECTED_REPORTS.\n", + "\n", + "------\n", + "\n", + "### Pipeline parameters\n", + "\n", + "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. Choose \"Routine data (Raw)\" to use raw routine data.\n", + " \n", + "- **Use routine with outliers removed**: Toggle this on to use the routine data after outliers have been removed (using the outliers detection method selected above). Else, this pipeline will use either the imputed routine data (to replace the outlier values removed) or the raw routine data if you selected \"Routine data (Raw)\" as your choice of “Outlier processing method”." + ], + "id": "30bf8dfc" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# default: raw routine\n", - "if (!exists(\"ROUTINE_FILE\")) { ROUTINE_FILE <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\") }" - ] - }, - { - "cell_type": "markdown", - "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650", - "metadata": { - "papermill": { - "duration": 0.000144, - "end_time": "2025-12-19T10:23:04.043066", - "exception": false, - "start_time": "2025-12-19T10:23:04.042922", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000092, + "end_time": "2025-12-19T10:21:50.273573", + "exception": false, + "start_time": "2025-12-19T10:21:50.273481", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 1. Setup" + ], + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a" }, - "tags": [] - }, - "source": [ - "#### 1.3. 
🔍 Check REPORTING_RATE_PRODUCT_ID is configured" - ] - }, - { - "cell_type": "markdown", - "id": "682a62d5", - "metadata": {}, - "source": [ - "### 🐍 This probably to be moved to pipeline.py code?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7469898d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.047782Z", - "iopub.status.busy": "2025-12-19T10:23:04.045631Z", - "iopub.status.idle": "2025-12-19T10:23:04.545551Z", - "shell.execute_reply": "2025-12-19T10:23:04.542372Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:21:50.332786Z", + "iopub.status.busy": "2025-12-19T10:21:50.277536Z", + "iopub.status.idle": "2025-12-19T10:23:03.339080Z", + "shell.execute_reply": "2025-12-19T10:23:03.336413Z" + }, + "papermill": { + "duration": 73.068006, + "end_time": "2025-12-19T10:23:03.341764", + "exception": false, + "start_time": "2025-12-19T10:21:50.273758", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataset\")\n", + "\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataset.r\"))\n", + "setup <- get_setup_variables(SNT_ROOT_PATH = SNT_ROOT_PATH)\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "35ede7cf-257f-439c-a514-26a7290f881d" }, - "papermill": { - "duration": 0.505908, - "end_time": "2025-12-19T10:23:04.549148", - "exception": false, - "start_time": "2025-12-19T10:23:04.043240", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.00017, + "end_time": "2025-12-19T10:23:03.342235", + "exception": false, + "start_time": "2025-12-19T10:23:03.342065", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.1. 
Load and check `config_json` file" + ], + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", - "if (is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", - " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. \n", - " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", - "metadata": { - "papermill": { - "duration": 0.000139, - "end_time": "2025-12-19T10:23:04.549558", - "exception": false, - "start_time": "2025-12-19T10:23:04.549419", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:03.351367Z", + "iopub.status.busy": "2025-12-19T10:23:03.348819Z", + "iopub.status.idle": "2025-12-19T10:23:03.979814Z", + "shell.execute_reply": "2025-12-19T10:23:03.976617Z" + }, + "papermill": { + "duration": 0.640406, + "end_time": "2025-12-19T10:23:03.982829", + "exception": false, + "start_time": "2025-12-19T10:23:03.342423", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "config_json <- load_snt_config(file.path(setup$CONFIG_PATH, \"SNT_config.json\"))" + ], + "execution_count": null, + "outputs": [], + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f" }, - "tags": [] - }, - "source": [ - "## 2. 
Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", - "metadata": { - "papermill": { - "duration": 0.000152, - "end_time": "2025-12-19T10:23:04.549924", - "exception": false, - "start_time": "2025-12-19T10:23:04.549772", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:03.987632Z", + "iopub.status.busy": "2025-12-19T10:23:03.985301Z", + "iopub.status.idle": "2025-12-19T10:23:04.011308Z", + "shell.execute_reply": "2025-12-19T10:23:04.008941Z" + }, + "papermill": { + "duration": 0.031002, + "end_time": "2025-12-19T10:23:04.014107", + "exception": false, + "start_time": "2025-12-19T10:23:03.983105", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "# Which reporting rate PRODUCT_UID to use (DHIS2 dataset id)\n", + "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID \n", + "\n", + "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + ], + "execution_count": null, + "outputs": [], + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82" }, - "tags": [] - }, - "source": [ - "### 2.1. Load routine data (DHIS2) \n", - "Already formatted routine data, we use this as the master table
\n", - "(only used at the very end before exporting the table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.554212Z", - "iopub.status.busy": "2025-12-19T10:23:04.552423Z", - "iopub.status.idle": "2025-12-19T10:23:05.773324Z", - "shell.execute_reply": "2025-12-19T10:23:05.771316Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.00015, + "end_time": "2025-12-19T10:23:04.014523", + "exception": false, + "start_time": "2025-12-19T10:23:04.014373", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.2. Validate parameters" + ], + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" }, - "papermill": { - "duration": 1.225668, - "end_time": "2025-12-19T10:23:05.775768", - "exception": false, - "start_time": "2025-12-19T10:23:04.550100", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.019283Z", + "iopub.status.busy": "2025-12-19T10:23:04.017257Z", + "iopub.status.idle": "2025-12-19T10:23:04.039652Z", + "shell.execute_reply": "2025-12-19T10:23:04.037292Z" + }, + "papermill": { + "duration": 0.02788, + "end_time": "2025-12-19T10:23:04.042642", + "exception": false, + "start_time": "2025-12-19T10:23:04.014762", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# default: raw routine\n", + "if (!exists(\"ROUTINE_FILE\")) { ROUTINE_FILE <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\") }\n", + "\n", + "# Resolved by pipeline.py based on routine_data_choice; fallback to empty string\n", + "if (!exists(\"DATASET_ID\")) { DATASET_ID <- \"\" }" + ], + "execution_count": null, + "outputs": [], + "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "rountine_dataset_name <- 
select_routine_dataset_name_dataset(ROUTINE_FILE, COUNTRY_CODE, config_json)\n", - "dhis2_routine <- load_routine_data_dataset(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE, fixed_cols_rr)\n", - "dim(dhis2_routine)\n", - "head(dhis2_routine, 3)\n" - ] - }, - { - "cell_type": "markdown", - "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452", - "metadata": { - "papermill": { - "duration": 0.000155, - "end_time": "2025-12-19T10:23:05.776205", - "exception": false, - "start_time": "2025-12-19T10:23:05.776050", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000144, + "end_time": "2025-12-19T10:23:04.043066", + "exception": false, + "start_time": "2025-12-19T10:23:04.042922", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.3. 🔍 Check REPORTING_RATE_PRODUCT_ID is configured" + ], + "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650" }, - "tags": [] - }, - "source": [ - "### 2.2. Load Reporting Rate data (DHIS2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e352c76-f2fb-43ba-b85d-391d808057a8", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:05.780487Z", - "iopub.status.busy": "2025-12-19T10:23:05.778651Z", - "iopub.status.idle": "2025-12-19T10:23:07.096742Z", - "shell.execute_reply": "2025-12-19T10:23:07.094774Z" + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🐍 This probably to be moved to pipeline.py code?" 
+ ], + "id": "682a62d5" }, - "papermill": { - "duration": 1.322737, - "end_time": "2025-12-19T10:23:07.099136", - "exception": false, - "start_time": "2025-12-19T10:23:05.776399", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "dhis2_reporting <- load_reporting_data_dataset(config_json, COUNTRY_CODE)\n", - "head(dhis2_reporting, 3)\n" - ] - }, - { - "cell_type": "markdown", - "id": "4d5f398b", - "metadata": { - "papermill": { - "duration": 0.000151, - "end_time": "2025-12-19T10:23:07.099531", - "exception": false, - "start_time": "2025-12-19T10:23:07.099380", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 3. Transform reporting data" - ] - }, - { - "cell_type": "markdown", - "id": "adcbee0b", - "metadata": { - "papermill": { - "duration": 0.0001, - "end_time": "2025-12-19T10:23:07.099849", - "exception": false, - "start_time": "2025-12-19T10:23:07.099749", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3.1. 
Filter Reporting Rate data by \"Dataset\" (`PRODUCT_UID`)\n", - "Logic:\n", - "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", - "* If none provided (**empty** field) skip filtering and **keep everything**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "795a5e74", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:07.104617Z", - "iopub.status.busy": "2025-12-19T10:23:07.102475Z", - "iopub.status.idle": "2025-12-19T10:23:08.406561Z", - "shell.execute_reply": "2025-12-19T10:23:08.404419Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.047782Z", + "iopub.status.busy": "2025-12-19T10:23:04.045631Z", + "iopub.status.idle": "2025-12-19T10:23:04.545551Z", + "shell.execute_reply": "2025-12-19T10:23:04.542372Z" + }, + "papermill": { + "duration": 0.505908, + "end_time": "2025-12-19T10:23:04.549148", + "exception": false, + "start_time": "2025-12-19T10:23:04.043240", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", + "if (is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", + " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. \n", + " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "7469898d" }, - "papermill": { - "duration": 1.309322, - "end_time": "2025-12-19T10:23:08.409343", - "exception": false, - "start_time": "2025-12-19T10:23:07.100021", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000139, + "end_time": "2025-12-19T10:23:04.549558", + "exception": false, + "start_time": "2025-12-19T10:23:04.549419", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 2. 
Load Data" + ], + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.1 Filter Reporting Rate data by selected dataset PRODUCT_UID(s)\n", - "if (length(REPORTING_RATE_PRODUCT_ID) > 0 && all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", - " dhis2_reporting <- dhis2_reporting %>% dplyr::filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", - "} else if (length(REPORTING_RATE_PRODUCT_ID) > 0) {\n", - " log_msg(glue::glue(\n", - " \"?? Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data. Skipping filtering.\"\n", - " ), level = \"warning\")\n", - "}\n", - "\n", - "# 3.2 Pivot wider on PRODUCT_METRIC\n", - "dhis2_reporting_wide <- dhis2_reporting %>%\n", - " tidyr::pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", - "\n", - "# 3.3 Detect duplicated OU_ID / PERIOD combinations across datasets\n", - "dupl_ou_period <- dhis2_reporting_wide %>%\n", - " dplyr::group_by(OU_ID, PERIOD) %>%\n", - " dplyr::filter(dplyr::n() > 1) %>%\n", - " dplyr::ungroup() %>%\n", - " dplyr::select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, dplyr::ends_with(\"REPORTS\"))\n", - "\n", - "# If duplicates are binary reports (0/1), keep the row where ACTUAL_REPORTS is maximal\n", - "if (nrow(dupl_ou_period) > 0 &&\n", - " all(dupl_ou_period$ACTUAL_REPORTS %in% c(0, 1), na.rm = TRUE) &&\n", - " all(dupl_ou_period$EXPECTED_REPORTS %in% c(0, 1), na.rm = TRUE)) {\n", - "\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " dplyr::group_by(PERIOD, OU_ID) %>%\n", - " dplyr::mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", - " dplyr::ungroup() %>%\n", - " dplyr::filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", - " dplyr::select(-ACTUAL_REPORTS_deduplicated)\n", - "}\n", - 
"\n", - "# Country-specific normalization for Niger where reports can exceed 1\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " dplyr::mutate(\n", - " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", - " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", - " )\n", - "}\n", - "\n", - "# 3.4 Aggregate at ADM2 and compute reporting rate\n", - "reporting_rate_results <- dhis2_reporting_wide %>%\n", - " dplyr::group_by(PERIOD, YEAR, MONTH, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID) %>%\n", - " dplyr::summarise(\n", - " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", - " EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " dplyr::mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n" - ] - }, - { - "cell_type": "markdown", - "id": "4237408a", - "metadata": { - "papermill": { - "duration": 0.000133, - "end_time": "2025-12-19T10:23:08.409660", - "exception": false, - "start_time": "2025-12-19T10:23:08.409527", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000152, + "end_time": "2025-12-19T10:23:04.549924", + "exception": false, + "start_time": "2025-12-19T10:23:04.549772", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.1. Load routine data (DHIS2) \n", + "Already formatted routine data, we use this as the master table
\n", + "(only used at the very end before exporting the table)" + ], + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532" }, - "tags": [] - }, - "source": [ - "### 3.2. Pivot wider" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c3b9a65", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:08.413415Z", - "iopub.status.busy": "2025-12-19T10:23:08.411805Z", - "iopub.status.idle": "2025-12-19T10:23:08.884793Z", - "shell.execute_reply": "2025-12-19T10:23:08.880916Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:04.554212Z", + "iopub.status.busy": "2025-12-19T10:23:04.552423Z", + "iopub.status.idle": "2025-12-19T10:23:05.773324Z", + "shell.execute_reply": "2025-12-19T10:23:05.771316Z" + }, + "papermill": { + "duration": 1.225668, + "end_time": "2025-12-19T10:23:05.775768", + "exception": false, + "start_time": "2025-12-19T10:23:04.550100", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhis2_routine <- load_dataset_file(DATASET_ID, ROUTINE_FILE)\n", + "dhis2_routine <- dhis2_routine %>%\n", + " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric)) %>%\n", + " dplyr::select(dplyr::any_of(fixed_cols_rr)) %>%\n", + " dplyr::distinct()\n", + "dim(dhis2_routine)\n", + "head(dhis2_routine, 3)\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "a1213723-f7e2-4238-9f37-f1795b187232" }, - "papermill": { - "duration": 0.479538, - "end_time": "2025-12-19T10:23:08.889341", - "exception": false, - "start_time": "2025-12-19T10:23:08.409803", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000155, + "end_time": "2025-12-19T10:23:05.776205", + "exception": false, + "start_time": "2025-12-19T10:23:05.776050", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2.2. 
Load Reporting Rate data (DHIS2)" + ], + "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.2 Quick check after pivot\n", - "dim(dhis2_reporting_wide)\n", - "head(dhis2_reporting_wide, 3)\n" - ] - }, - { - "cell_type": "markdown", - "id": "0f485148", - "metadata": { - "papermill": { - "duration": 0.000186, - "end_time": "2025-12-19T10:23:08.889829", - "exception": false, - "start_time": "2025-12-19T10:23:08.889643", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:05.780487Z", + "iopub.status.busy": "2025-12-19T10:23:05.778651Z", + "iopub.status.idle": "2025-12-19T10:23:07.096742Z", + "shell.execute_reply": "2025-12-19T10:23:07.094774Z" + }, + "papermill": { + "duration": 1.322737, + "end_time": "2025-12-19T10:23:07.099136", + "exception": false, + "start_time": "2025-12-19T10:23:05.776399", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "dhis2_reporting <- load_dataset_file(\n", + " config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED,\n", + " paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", + ")\n", + "dhis2_reporting <- dhis2_reporting %>%\n", + " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", + "head(dhis2_reporting, 3)\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "0e352c76-f2fb-43ba-b85d-391d808057a8" }, - "tags": [] - }, - "source": [ - "### 👯 Handle **duplicated** values (`OU_ID`)\n", - "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. Namely, there should be no \"dupliacted\" orgunits that are counted in more than one dataset! Else, we would be **double counting**." 
- ] - }, - { - "cell_type": "markdown", - "id": "55dececa", - "metadata": { - "papermill": { - "duration": 0.000122, - "end_time": "2025-12-19T10:23:08.890157", - "exception": false, - "start_time": "2025-12-19T10:23:08.890035", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000151, + "end_time": "2025-12-19T10:23:07.099531", + "exception": false, + "start_time": "2025-12-19T10:23:07.099380", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 3. Transform reporting data" + ], + "id": "4d5f398b" }, - "tags": [] - }, - "source": [ - "#### Check for duplicated values (`OU_ID`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d761bd15", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:08.899486Z", - "iopub.status.busy": "2025-12-19T10:23:08.894706Z", - "iopub.status.idle": "2025-12-19T10:23:09.476248Z", - "shell.execute_reply": "2025-12-19T10:23:09.470283Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.0001, + "end_time": "2025-12-19T10:23:07.099849", + "exception": false, + "start_time": "2025-12-19T10:23:07.099749", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.1. 
Filter Reporting Rate data by \"Dataset\" (`PRODUCT_UID`)\n", + "Logic:\n", + "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", + "* If none provided (**empty** field) skip filtering and **keep everything**" + ], + "id": "adcbee0b" }, - "papermill": { - "duration": 0.590832, - "end_time": "2025-12-19T10:23:09.481144", - "exception": false, - "start_time": "2025-12-19T10:23:08.890312", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:07.104617Z", + "iopub.status.busy": "2025-12-19T10:23:07.102475Z", + "iopub.status.idle": "2025-12-19T10:23:08.406561Z", + "shell.execute_reply": "2025-12-19T10:23:08.404419Z" + }, + "papermill": { + "duration": 1.309322, + "end_time": "2025-12-19T10:23:08.409343", + "exception": false, + "start_time": "2025-12-19T10:23:07.100021", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.1 Filter Reporting Rate data by selected dataset PRODUCT_UID(s)\n", + "if (length(REPORTING_RATE_PRODUCT_ID) > 0 && all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", + " dhis2_reporting <- dhis2_reporting %>% dplyr::filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID)\n", + "} else if (length(REPORTING_RATE_PRODUCT_ID) > 0) {\n", + " log_msg(glue::glue(\n", + " \"?? Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data. 
Skipping filtering.\"\n", + " ), level = \"warning\")\n", + "}\n", + "\n", + "# 3.2 Pivot wider on PRODUCT_METRIC\n", + "dhis2_reporting_wide <- dhis2_reporting %>%\n", + " tidyr::pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE)\n", + "\n", + "# 3.3 Detect duplicated OU_ID / PERIOD combinations across datasets\n", + "dupl_ou_period <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(OU_ID, PERIOD) %>%\n", + " dplyr::filter(dplyr::n() > 1) %>%\n", + " dplyr::ungroup() %>%\n", + " dplyr::select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, dplyr::ends_with(\"REPORTS\"))\n", + "\n", + "# If duplicates are binary reports (0/1), keep the row where ACTUAL_REPORTS is maximal\n", + "if (nrow(dupl_ou_period) > 0 &&\n", + " all(dupl_ou_period$ACTUAL_REPORTS %in% c(0, 1), na.rm = TRUE) &&\n", + " all(dupl_ou_period$EXPECTED_REPORTS %in% c(0, 1), na.rm = TRUE)) {\n", + "\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(PERIOD, OU_ID) %>%\n", + " dplyr::mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>%\n", + " dplyr::ungroup() %>%\n", + " dplyr::filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>%\n", + " dplyr::select(-ACTUAL_REPORTS_deduplicated)\n", + "}\n", + "\n", + "# Country-specific normalization for Niger where reports can exceed 1\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " dplyr::mutate(\n", + " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", + " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", + " )\n", + "}\n", + "\n", + "# 3.4 Aggregate at ADM2 and compute reporting rate\n", + "reporting_rate_results <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(PERIOD, YEAR, MONTH, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID) %>%\n", + " dplyr::summarise(\n", + " ACTUAL_REPORTS = sum(ACTUAL_REPORTS, na.rm = TRUE),\n", + " 
EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " dplyr::mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n" + ], + "execution_count": null, + "outputs": [], + "id": "795a5e74" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Count duplicated OU_ID/PERIOD combinations found\n", - "cat(glue::glue(\"Duplicated OU_ID-PERIOD rows detected: {nrow(dupl_ou_period)}\"))\n", - "head(dupl_ou_period, 5)\n" - ] - }, - { - "cell_type": "markdown", - "id": "805ed555", - "metadata": { - "papermill": { - "duration": 0.000139, - "end_time": "2025-12-19T10:23:09.481549", - "exception": false, - "start_time": "2025-12-19T10:23:09.481410", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000133, + "end_time": "2025-12-19T10:23:08.409660", + "exception": false, + "start_time": "2025-12-19T10:23:08.409527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.2. Pivot wider" + ], + "id": "4237408a" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "#### Remove duplicated OU_IDs (shared across PRODUCT_UIDs)\n", - "Logic: \n", - "1. Identify if any `OU_ID` is present in both datasets\n", - "2. 
For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", - " * if both same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", - " * if else if different values, meaning that one dataset say 1 and the other 0 => keep 1 (facility _did_ report)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "593b013a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:09.488856Z", - "iopub.status.busy": "2025-12-19T10:23:09.484674Z", - "iopub.status.idle": "2025-12-19T10:23:13.563200Z", - "shell.execute_reply": "2025-12-19T10:23:13.559294Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:08.413415Z", + "iopub.status.busy": "2025-12-19T10:23:08.411805Z", + "iopub.status.idle": "2025-12-19T10:23:08.884793Z", + "shell.execute_reply": "2025-12-19T10:23:08.880916Z" + }, + "papermill": { + "duration": 0.479538, + "end_time": "2025-12-19T10:23:08.889341", + "exception": false, + "start_time": "2025-12-19T10:23:08.409803", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.2 Quick check after pivot\n", + "dim(dhis2_reporting_wide)\n", + "head(dhis2_reporting_wide, 3)\n" + ], + "execution_count": null, + "outputs": [], + "id": "5c3b9a65" }, - "papermill": { - "duration": 4.086946, - "end_time": "2025-12-19T10:23:13.568699", - "exception": false, - "start_time": "2025-12-19T10:23:09.481753", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000186, + "end_time": "2025-12-19T10:23:08.889829", + "exception": false, + "start_time": "2025-12-19T10:23:08.889643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 👯 Handle **duplicated** values (`OU_ID`)\n", + "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. 
Namely, there should be no \"duplicated\" orgunits that are counted in more than one dataset! Else, we would be **double counting**." + ], + "id": "0f485148" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Verify deduplication effect at OU_ID/PERIOD level\n", - "dupl_after_cleaning <- dhis2_reporting_wide %>%\n", - " dplyr::group_by(OU_ID, PERIOD) %>%\n", - " dplyr::filter(dplyr::n() > 1) %>%\n", - " dplyr::ungroup()\n", - "cat(glue::glue(\"Remaining duplicated OU_ID-PERIOD rows after cleaning: {nrow(dupl_after_cleaning)}\"))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c72bd93a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:13.581200Z", - "iopub.status.busy": "2025-12-19T10:23:13.574942Z", - "iopub.status.idle": "2025-12-19T10:23:18.911910Z", - "shell.execute_reply": "2025-12-19T10:23:18.907746Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000122, + "end_time": "2025-12-19T10:23:08.890157", + "exception": false, + "start_time": "2025-12-19T10:23:08.890035", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### Check for duplicated values (`OU_ID`)" + ], + "id": "55dececa" }, - "papermill": { - "duration": 5.346749, - "end_time": "2025-12-19T10:23:18.915815", - "exception": false, - "start_time": "2025-12-19T10:23:13.569066", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:08.899486Z", + "iopub.status.busy": "2025-12-19T10:23:08.894706Z", + "iopub.status.idle": "2025-12-19T10:23:09.476248Z", + "shell.execute_reply": "2025-12-19T10:23:09.470283Z" + }, + "papermill": { + "duration": 0.590832, + "end_time": "2025-12-19T10:23:09.481144", + "exception": false, + "start_time": "2025-12-19T10:23:08.890312", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Count duplicated 
OU_ID/PERIOD combinations found\n", + "cat(glue::glue(\"Duplicated OU_ID-PERIOD rows detected: {nrow(dupl_ou_period)}\"))\n", + "head(dupl_ou_period, 5)\n" + ], + "execution_count": null, + "outputs": [], + "id": "d761bd15" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Optional inspection of cleaned rows\n", - "head(dhis2_reporting_wide, 5)\n" - ] - }, - { - "cell_type": "markdown", - "id": "2f26c614", - "metadata": { - "papermill": { - "duration": 0.000236, - "end_time": "2025-12-19T10:23:18.916421", - "exception": false, - "start_time": "2025-12-19T10:23:18.916185", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000139, + "end_time": "2025-12-19T10:23:09.481549", + "exception": false, + "start_time": "2025-12-19T10:23:09.481410", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "#### Remove duplicated OU_IDs (shared across PRODUCT_UIDs)\n", + "Logic: \n", + "1. Identify if any `OU_ID` is present in both datasets\n", + "2. For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", + " * if both same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", + " * if else if different values, meaning that one dataset say 1 and the other 0 => keep 1 (facility _did_ report)" + ], + "id": "805ed555" }, - "tags": [] - }, - "source": [ - "### 3.3. (🇳🇪 NER only) Make HOP aggregated values (0, >1) into presence/absence (0, 1)\n", - "Specific for Niger SNIS instance!
\n", - "Values for dataset HOP (\"ki7YKOfyxjf\" = \"HOP 03 ACTIVITES DE LUTTE CONTRE LE PALUDISME\") count the individual \"sub-units\" (departments, etc ... ) of a given hospital and therefore can have values >1.
\n", - "For consistency with CSI (where all values are raw, and therefore only 0 and 1), we need to convert all HOP value >1 into 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4118991c", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:18.924306Z", - "iopub.status.busy": "2025-12-19T10:23:18.920810Z", - "iopub.status.idle": "2025-12-19T10:23:19.482033Z", - "shell.execute_reply": "2025-12-19T10:23:19.479013Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:09.488856Z", + "iopub.status.busy": "2025-12-19T10:23:09.484674Z", + "iopub.status.idle": "2025-12-19T10:23:13.563200Z", + "shell.execute_reply": "2025-12-19T10:23:13.559294Z" + }, + "papermill": { + "duration": 4.086946, + "end_time": "2025-12-19T10:23:13.568699", + "exception": false, + "start_time": "2025-12-19T10:23:09.481753", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Verify deduplication effect at OU_ID/PERIOD level\n", + "dupl_after_cleaning <- dhis2_reporting_wide %>%\n", + " dplyr::group_by(OU_ID, PERIOD) %>%\n", + " dplyr::filter(dplyr::n() > 1) %>%\n", + " dplyr::ungroup()\n", + "cat(glue::glue(\"Remaining duplicated OU_ID-PERIOD rows after cleaning: {nrow(dupl_after_cleaning)}\"))\n" + ], + "execution_count": null, + "outputs": [], + "id": "593b013a" }, - "papermill": { - "duration": 0.56938, - "end_time": "2025-12-19T10:23:19.486133", - "exception": false, - "start_time": "2025-12-19T10:23:18.916753", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:13.581200Z", + "iopub.status.busy": "2025-12-19T10:23:13.574942Z", + "iopub.status.idle": "2025-12-19T10:23:18.911910Z", + "shell.execute_reply": "2025-12-19T10:23:18.907746Z" + }, + "papermill": { + "duration": 5.346749, + "end_time": "2025-12-19T10:23:18.915815", + "exception": false, + "start_time": 
"2025-12-19T10:23:13.569066", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Optional inspection of cleaned rows\n", + "head(dhis2_reporting_wide, 5)\n" + ], + "execution_count": null, + "outputs": [], + "id": "c72bd93a" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# NER-specific normalization quality check\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " cat(\"Applied NER normalization: ACTUAL_REPORTS and EXPECTED_REPORTS capped at 1.\n", - "\")\n", - "}\n", - "summary(dhis2_reporting_wide$ACTUAL_REPORTS)\n", - "summary(dhis2_reporting_wide$EXPECTED_REPORTS)\n" - ] - }, - { - "cell_type": "markdown", - "id": "066319a3", - "metadata": { - "papermill": { - "duration": 0.000172, - "end_time": "2025-12-19T10:23:19.486674", - "exception": false, - "start_time": "2025-12-19T10:23:19.486502", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000236, + "end_time": "2025-12-19T10:23:18.916421", + "exception": false, + "start_time": "2025-12-19T10:23:18.916185", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.3. (🇳🇪 NER only) Make HOP aggregated values (0, >1) into presence/absence (0, 1)\n", + "Specific for Niger SNIS instance!
\n", + "Values for dataset HOP (\"ki7YKOfyxjf\" = \"HOP 03 ACTIVITES DE LUTTE CONTRE LE PALUDISME\") count the individual \"sub-units\" (departments, etc ... ) of a given hospital and therefore can have values >1.
\n", + "For consistency with CSI (where all values are raw, and therefore only 0 and 1), we need to convert all HOP value >1 into 1." + ], + "id": "2f26c614" }, - "tags": [] - }, - "source": [ - "### 3.4. Aggregate at AMD2 level" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e94eeddd", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:19.494212Z", - "iopub.status.busy": "2025-12-19T10:23:19.491141Z", - "iopub.status.idle": "2025-12-19T10:23:19.791631Z", - "shell.execute_reply": "2025-12-19T10:23:19.786378Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:18.924306Z", + "iopub.status.busy": "2025-12-19T10:23:18.920810Z", + "iopub.status.idle": "2025-12-19T10:23:19.482033Z", + "shell.execute_reply": "2025-12-19T10:23:19.479013Z" + }, + "papermill": { + "duration": 0.56938, + "end_time": "2025-12-19T10:23:19.486133", + "exception": false, + "start_time": "2025-12-19T10:23:18.916753", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# NER-specific normalization quality check\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " cat(\"Applied NER normalization: ACTUAL_REPORTS and EXPECTED_REPORTS capped at 1.\n", + "\")\n", + "}\n", + "summary(dhis2_reporting_wide$ACTUAL_REPORTS)\n", + "summary(dhis2_reporting_wide$EXPECTED_REPORTS)\n" + ], + "execution_count": null, + "outputs": [], + "id": "4118991c" }, - "papermill": { - "duration": 0.308903, - "end_time": "2025-12-19T10:23:19.795888", - "exception": false, - "start_time": "2025-12-19T10:23:19.486985", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000172, + "end_time": "2025-12-19T10:23:19.486674", + "exception": false, + "start_time": "2025-12-19T10:23:19.486502", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.4. 
Aggregate at ADM2 level" + ], + "id": "066319a3" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.4 Aggregate table preview\n", - "dim(reporting_rate_results)\n", - "head(reporting_rate_results, 5)\n" - ] - }, - { - "cell_type": "markdown", - "id": "eb181891", - "metadata": { - "papermill": { - "duration": 0.000151, - "end_time": "2025-12-19T10:23:19.796350", - "exception": false, - "start_time": "2025-12-19T10:23:19.796199", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:19.494212Z", + "iopub.status.busy": "2025-12-19T10:23:19.491141Z", + "iopub.status.idle": "2025-12-19T10:23:19.791631Z", + "shell.execute_reply": "2025-12-19T10:23:19.786378Z" + }, + "papermill": { + "duration": 0.308903, + "end_time": "2025-12-19T10:23:19.795888", + "exception": false, + "start_time": "2025-12-19T10:23:19.486985", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.4 Aggregate table preview\n", + "dim(reporting_rate_results)\n", + "head(reporting_rate_results, 5)\n" + ], + "execution_count": null, + "outputs": [], + "id": "e94eeddd" }, - "tags": [] - }, - "source": [ - "### 3.5. Calculate REPORTING_RATE\n", - "**numerator**: `ACTUAL_REPORTS`
\n", - "**denominator**: `EXPECTED_REPORTS`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a1c20", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:19.803233Z", - "iopub.status.busy": "2025-12-19T10:23:19.799996Z", - "iopub.status.idle": "2025-12-19T10:23:19.994060Z", - "shell.execute_reply": "2025-12-19T10:23:19.991575Z" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000151, + "end_time": "2025-12-19T10:23:19.796350", + "exception": false, + "start_time": "2025-12-19T10:23:19.796199", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.5. Calculate REPORTING_RATE\n", + "**numerator**: `ACTUAL_REPORTS`
\n", + "**denominator**: `EXPECTED_REPORTS`" + ], + "id": "eb181891" }, - "papermill": { - "duration": 0.200465, - "end_time": "2025-12-19T10:23:19.997024", - "exception": false, - "start_time": "2025-12-19T10:23:19.796559", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:19.803233Z", + "iopub.status.busy": "2025-12-19T10:23:19.799996Z", + "iopub.status.idle": "2025-12-19T10:23:19.994060Z", + "shell.execute_reply": "2025-12-19T10:23:19.991575Z" + }, + "papermill": { + "duration": 0.200465, + "end_time": "2025-12-19T10:23:19.997024", + "exception": false, + "start_time": "2025-12-19T10:23:19.796559", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 3.5 Reporting rate range check\n", + "summary(reporting_rate_results$REPORTING_RATE)\n" + ], + "execution_count": null, + "outputs": [], + "id": "e90a1c20" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 3.5 Reporting rate range check\n", - "summary(reporting_rate_results$REPORTING_RATE)\n" - ] - }, - { - "cell_type": "markdown", - "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99", - "metadata": { - "papermill": { - "duration": 0.000123, - "end_time": "2025-12-19T10:23:19.997465", - "exception": false, - "start_time": "2025-12-19T10:23:19.997342", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000123, + "end_time": "2025-12-19T10:23:19.997465", + "exception": false, + "start_time": "2025-12-19T10:23:19.997342", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.6. Ensure consistency of table (probably can skip because all data comes from the same source!)\n", + "Left join reporting indicators with DHIS2 routine data.\n", + "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." 
+ ], + "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99" }, - "tags": [] - }, - "source": [ - "### 3.6. Ensure consistency of table (probably can skip because all data comes from the same source!)\n", - "Left join reporting indicators with DHIS2 routine data.\n", - "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:20.001909Z", - "iopub.status.busy": "2025-12-19T10:23:19.999878Z", - "iopub.status.idle": "2025-12-19T10:23:20.072344Z", - "shell.execute_reply": "2025-12-19T10:23:20.070004Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:20.001909Z", + "iopub.status.busy": "2025-12-19T10:23:19.999878Z", + "iopub.status.idle": "2025-12-19T10:23:20.072344Z", + "shell.execute_reply": "2025-12-19T10:23:20.070004Z" + }, + "papermill": { + "duration": 0.077426, + "end_time": "2025-12-19T10:23:20.075077", + "exception": false, + "start_time": "2025-12-19T10:23:19.997651", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "reporting_rate_dataset <- left_join(dhis2_routine, \n", + " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", + " by=c(\"YEAR\", \"MONTH\", \"ADM2_ID\"))\n", + "\n", + "print(dim(reporting_rate_dataset))\n", + "head(reporting_rate_dataset, 3)" + ], + "execution_count": null, + "outputs": [], + "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f" }, - "papermill": { - "duration": 0.077426, - "end_time": "2025-12-19T10:23:20.075077", - "exception": false, - "start_time": "2025-12-19T10:23:19.997651", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000173, + "end_time": "2025-12-19T10:23:20.075561", + "exception": false, + "start_time": 
"2025-12-19T10:23:20.075388", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3.7. Final visual check on REPORTING_RATE values" + ], + "id": "6b19e88d" }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "reporting_rate_dataset <- left_join(dhis2_routine, \n", - " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", - " by=c(\"YEAR\", \"MONTH\", \"ADM2_ID\"))\n", - "\n", - "print(dim(reporting_rate_dataset))\n", - "head(reporting_rate_dataset, 3)" - ] - }, - { - "cell_type": "markdown", - "id": "6b19e88d", - "metadata": { - "papermill": { - "duration": 0.000173, - "end_time": "2025-12-19T10:23:20.075561", - "exception": false, - "start_time": "2025-12-19T10:23:20.075388", - "status": "completed" + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", + "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", + "max_rr <- max(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", + "if (min_rr < 0 | max_rr > 1) { \n", + " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE values are outside the expected range [0,1]. \n", + " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"), level = \"warning\")\n", + "} else {\n", + " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. \n", + " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "fbfec60f" }, - "tags": [] - }, - "source": [ - "### 3.7. 
Final visual check on REPORTING_RATE values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbfec60f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", - "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", - "max_rr <- max(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", - "if (min_rr < 0 | max_rr > 1) { \n", - " log_msg(glue::glue(\"🚨 Warning: REPORTING_RATE values are outside the expected range [0,1]. \n", - " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"), level = \"warning\")\n", - "} else {\n", - " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. \n", - " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8878192f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:20.080475Z", - "iopub.status.busy": "2025-12-19T10:23:20.078272Z", - "iopub.status.idle": "2025-12-19T10:23:21.456898Z", - "shell.execute_reply": "2025-12-19T10:23:21.453352Z" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:20.080475Z", + "iopub.status.busy": "2025-12-19T10:23:20.078272Z", + "iopub.status.idle": "2025-12-19T10:23:21.456898Z", + "shell.execute_reply": "2025-12-19T10:23:21.453352Z" + }, + "papermill": { + "duration": 1.384875, + "end_time": "2025-12-19T10:23:21.460674", + "exception": false, + "start_time": "2025-12-19T10:23:20.075799", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Simple plot to visualize distribution of REPORTING_RATE\n", + "ggplot(reporting_rate_dataset, aes(x=REPORTING_RATE)) +\n", + " geom_histogram() +\n", + " labs(\n", + " 
x=\"Dataset Reporting Rate\", y=\"Frequency\",\n", + " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", + " ) +\n", + " theme_minimal()" + ], + "execution_count": null, + "outputs": [], + "id": "8878192f" }, - "papermill": { - "duration": 1.384875, - "end_time": "2025-12-19T10:23:21.460674", - "exception": false, - "start_time": "2025-12-19T10:23:20.075799", - "status": "completed" + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000104, + "end_time": "2025-12-19T10:23:21.460981", + "exception": false, + "start_time": "2025-12-19T10:23:21.460877", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## 4. 📁 Export to `data/` folder\n", + "Export as both .csv and .parquet file formats." + ], + "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60" }, - "tags": [], - "vscode": { - "languageId": "r" + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:21.467337Z", + "iopub.status.busy": "2025-12-19T10:23:21.464010Z", + "iopub.status.idle": "2025-12-19T10:23:22.383295Z", + "shell.execute_reply": "2025-12-19T10:23:22.379935Z" + }, + "papermill": { + "duration": 0.926094, + "end_time": "2025-12-19T10:23:22.387190", + "exception": false, + "start_time": "2025-12-19T10:23:21.461096", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "output_dir <- file.path(setup$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", + "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "write.csv(\n", + " reporting_rate_dataset,\n", + " file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\")),\n", + " row.names = FALSE\n", + ")\n", + "log_msg(glue::glue(\"Exported: {file.path(output_dir, paste0(COUNTRY_CODE, '_reporting_rate_dataset.csv'))}\"))\n", + "\n", + "arrow::write_parquet(\n", + " 
reporting_rate_dataset,\n", + " file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\"))\n", + ")\n", + "log_msg(glue::glue(\"Exported: {file.path(output_dir, paste0(COUNTRY_CODE, '_reporting_rate_dataset.parquet'))}\"))\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "9adc033d-18d6-4786-8f96-21337b3e005f" } - }, - "outputs": [], - "source": [ - "# Simple plot to visualize distribution of REPORTING_RATE\n", - "ggplot(reporting_rate_dataset, aes(x=REPORTING_RATE)) +\n", - " geom_histogram() +\n", - " labs(\n", - " x=\"Dataset Reporting Rate\", y=\"Frequency\",\n", - " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", - " ) +\n", - " theme_minimal()" - ] - }, - { - "cell_type": "markdown", - "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60", - "metadata": { - "papermill": { - "duration": 0.000104, - "end_time": "2025-12-19T10:23:21.460981", - "exception": false, - "start_time": "2025-12-19T10:23:21.460877", - "status": "completed" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "tags": [] - }, - "source": [ - "## 4. 📁 Export to `data/` folder\n", - "Export as both .csv and .parquet file formats." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9adc033d-18d6-4786-8f96-21337b3e005f", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:21.467337Z", - "iopub.status.busy": "2025-12-19T10:23:21.464010Z", - "iopub.status.idle": "2025-12-19T10:23:22.383295Z", - "shell.execute_reply": "2025-12-19T10:23:22.379935Z" + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, "papermill": { - "duration": 0.926094, - "end_time": "2025-12-19T10:23:22.387190", - "exception": false, - "start_time": "2025-12-19T10:23:21.461096", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" + "default_parameters": {}, + "duration": 94.192072, + "end_time": "2025-12-19T10:23:22.614345", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/papermill_outputs/snt_dhis2_reporting_rate_dataset_OUTPUT_2025-12-19_102148.ipynb", + "parameters": { + "ROUTINE_FILE": "XXX_routine_outliers_imputed.parquet", + "SNT_ROOT_PATH": "/home/hexa/workspace" + }, + "start_time": "2025-12-19T10:21:48.422273", + "version": "2.6.0" } - }, - "outputs": [], - "source": [ - "export_reporting_rate_dataset(\n", - " reporting_rate_dataset = reporting_rate_dataset,\n", - " DATA_PATH = DATA_PATH,\n", - " COUNTRY_CODE = COUNTRY_CODE\n", - ")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" }, - "papermill": { - "default_parameters": {}, - "duration": 94.192072, - "end_time": "2025-12-19T10:23:22.614345", - 
"environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/papermill_outputs/snt_dhis2_reporting_rate_dataset_OUTPUT_2025-12-19_102148.ipynb", - "parameters": { - "ROUTINE_FILE": "XXX_routine_outliers_imputed.parquet", - "SNT_ROOT_PATH": "/home/hexa/workspace" - }, - "start_time": "2025-12-19T10:21:48.422273", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r index 285727c..57dc948 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r +++ b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r @@ -1,109 +1,73 @@ -select_routine_dataset_name_dataset <- function(ROUTINE_FILE, COUNTRY_CODE, config_json) { - if (ROUTINE_FILE == glue::glue("{COUNTRY_CODE}_routine.parquet")) { - return(config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED) - } - config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION +# Load base utils +source(file.path("~/workspace/code", "snt_utils.r")) + + +#' Get Setup Variables for SNT Workspace +#' Initializes workspace paths, loads R packages, and imports OpenHEXA SDK. +#' +#' @param SNT_ROOT_PATH Character. Root path of the SNT workspace. Default: '~/workspace' +#' @param packages Character vector. R packages to install and load. +#' @return List with CONFIG_PATH, UPLOADS_PATH, DATA_PATH. 
+#' +#' @export +get_setup_variables <- function( + SNT_ROOT_PATH = "~/workspace", + packages = c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate") +) { + setup_variable <- list( + CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"), + UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"), + DATA_PATH = file.path(SNT_ROOT_PATH, "data") + ) + + install_and_load(packages) + + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + reticulate::py_config()$python + assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv) + + return(setup_variable) } -load_routine_data_dataset <- function(rountine_dataset_name, ROUTINE_FILE, COUNTRY_CODE, fixed_cols_rr) { - dhis2_routine <- tryCatch({ - get_latest_dataset_file_in_memory(rountine_dataset_name, ROUTINE_FILE) - }, error = function(e) { - msg <- paste("Error while loading DHIS2 routine data file for: ", COUNTRY_CODE, conditionMessage(e)) - cat(msg) - stop(msg) - }) - - dhis2_routine <- dhis2_routine %>% dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric)) - dhis2_routine <- dhis2_routine %>% dplyr::select(dplyr::any_of(fixed_cols_rr)) %>% dplyr::distinct() - - log_msg(glue::glue( - "DHIS2 routine file {ROUTINE_FILE} loaded from dataset : {rountine_dataset_name} dataframe dimensions: {paste(dim(dhis2_routine), collapse=', ')}" - )) - dhis2_routine -} - - -load_reporting_data_dataset <- function(config_json, COUNTRY_CODE) { - dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED - file_name <- paste0(COUNTRY_CODE, "_reporting.parquet") - - dhis2_reporting <- tryCatch({ - get_latest_dataset_file_in_memory(dataset_name, file_name) - }, error = function(e) { - msg <- paste("[ERROR] Error while loading DHIS2 dataset reporting rates file for: ", COUNTRY_CODE, conditionMessage(e)) - cat(msg) - stop(msg) - }) - dhis2_reporting <- dhis2_reporting %>% dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric)) - - 
log_msg(paste0( - "DHIS2 Datatset reporting data loaded from file `", file_name, "` (from dataset : `", dataset_name, "`). Dataframe dimensions: ", - paste(dim(dhis2_reporting), collapse = ", ") - )) - dhis2_reporting -} - - -compute_reporting_rate_dataset <- function(dhis2_reporting, REPORTING_RATE_PRODUCT_ID, COUNTRY_CODE) { - if (all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) { - dhis2_reporting <- dhis2_reporting %>% dplyr::filter(PRODUCT_UID %in% REPORTING_RATE_PRODUCT_ID) - } else { - log_msg(glue::glue( - "🚨 Warning: REPORTING_RATE_PRODUCT_UID: {paste(REPORTING_RATE_PRODUCT_ID, collapse=', ')} not found in DHIS2 reporting data. Skipping filtering." - ), level = "warning") - } - - dhis2_reporting_wide <- dhis2_reporting %>% tidyr::pivot_wider(names_from = PRODUCT_METRIC, values_from = VALUE) - - dupl_ou_period <- dhis2_reporting_wide %>% - dplyr::group_by(OU_ID, PERIOD) %>% - dplyr::filter(dplyr::n() > 1) %>% - dplyr::ungroup() %>% - dplyr::select(OU_ID, OU_NAME, PERIOD, PRODUCT_UID, dplyr::ends_with("REPORTS")) - - if (all(dupl_ou_period$ACTUAL_REPORTS %in% c(0, 1)) & all(dupl_ou_period$EXPECTED_REPORTS %in% c(0, 1))) { - dhis2_reporting_wide <- dhis2_reporting_wide %>% - dplyr::group_by(PERIOD, OU_ID) %>% - dplyr::mutate(ACTUAL_REPORTS_deduplicated = ifelse(OU_ID %in% dupl_ou_period$OU_ID, max(ACTUAL_REPORTS), ACTUAL_REPORTS)) %>% - dplyr::ungroup() %>% - dplyr::filter(!(OU_ID %in% dupl_ou_period$OU_ID) | (ACTUAL_REPORTS == ACTUAL_REPORTS_deduplicated)) %>% - dplyr::select(-ACTUAL_REPORTS_deduplicated) - } - - if (COUNTRY_CODE == "NER") { - dhis2_reporting_wide <- dhis2_reporting_wide %>% - dplyr::mutate( - ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS), - EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS) - ) - } - - dhis2_reporting_wide_adm2 <- dhis2_reporting_wide %>% - dplyr::group_by(PERIOD, YEAR, MONTH, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID) %>% - dplyr::summarise( - ACTUAL_REPORTS = 
sum(ACTUAL_REPORTS, na.rm = TRUE), - EXPECTED_REPORTS = sum(EXPECTED_REPORTS, na.rm = TRUE), - .groups = "drop" - ) - - dhis2_reporting_wide_adm2 %>% - dplyr::mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS) +#' Load SNT Configuration File +#' Reads and parses a JSON configuration file. +#' @param snt_config_path Character. Path to the configuration JSON file. +#' @return List containing parsed configuration. +#' +#' @export +load_snt_config <- function(snt_config_path) { + config_json <- tryCatch( + { jsonlite::fromJSON(snt_config_path) }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading configuration: {snt_config_path}") + cat(msg) + stop(msg) + } + ) + log_msg(paste0("SNT configuration loaded from: ", snt_config_path)) + return(config_json) } -export_reporting_rate_dataset <- function(reporting_rate_dataset, DATA_PATH, COUNTRY_CODE) { - output_data_path <- file.path(DATA_PATH, "reporting_rate") - if (!dir.exists(output_data_path)) { - dir.create(output_data_path, recursive = TRUE) - } - - file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataset.parquet")) - arrow::write_parquet(reporting_rate_dataset, file_path) - log_msg(glue::glue("Exported : {file_path}")) - - file_path <- file.path(output_data_path, paste0(COUNTRY_CODE, "_reporting_rate_dataset.csv")) - write.csv(reporting_rate_dataset, file_path, row.names = FALSE) - log_msg(glue::glue("Exported : {file_path}")) +#' Load Dataset File from OpenHEXA +#' Retrieves the latest version of a file from an OpenHEXA dataset. +#' +#' @param dataset_id Character. OpenHEXA dataset identifier. +#' @param filename Character. Name of file to load. +#' @return Dataframe containing the loaded data. 
+#' +#' @export +load_dataset_file <- function(dataset_id, filename) { + data <- tryCatch( + { get_latest_dataset_file_in_memory(dataset_id, filename) }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading {filename} file: {conditionMessage(e)}") + log_msg(msg, "error") + stop(msg) + } + ) + msg <- glue::glue("{filename} data loaded from dataset: {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]") + log_msg(msg) + return(data) } diff --git a/snt_dhis2_reporting_rate_dataelement/pipeline.py b/snt_dhis2_reporting_rate_dataelement/pipeline.py index 6a96b15..50c1d08 100644 --- a/snt_dhis2_reporting_rate_dataelement/pipeline.py +++ b/snt_dhis2_reporting_rate_dataelement/pipeline.py @@ -77,6 +77,7 @@ def snt_dhis2_reporting_rate_dataelement( nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, + "DATASET_ID": ds_outliers_id, } parameters_file = save_pipeline_parameters( pipeline_name="snt_dhis2_reporting_rate_dataelement", diff --git a/snt_dhis2_reporting_rate_dataset/pipeline.py b/snt_dhis2_reporting_rate_dataset/pipeline.py index b52c32c..ff440f2 100644 --- a/snt_dhis2_reporting_rate_dataset/pipeline.py +++ b/snt_dhis2_reporting_rate_dataset/pipeline.py @@ -90,6 +90,7 @@ def snt_dhis2_reporting_rate_dataset( nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, + "DATASET_ID": ds_outliers_id, } params_file = save_pipeline_parameters( From 94366ed626b6ad97bce47b2fbfd4f059fd9a2957 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 14 Apr 2026 12:47:28 +0200 Subject: [PATCH 07/18] tested & working in snt dev --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 9 +- ...s2_reporting_rate_dataelement_report.ipynb | 256 +++++++++--------- .../snt_dhis2_reporting_rate_dataelement.r | 14 +- .../snt_dhis2_reporting_rate_dataset.ipynb | 4 +- .../utils/snt_dhis2_reporting_rate_dataset.r | 14 +- .../pipeline.py | 12 +- 6 files changed, 167 insertions(+), 142 
deletions(-) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index 438b6eb..03f68be 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -71,7 +71,9 @@ } }, "source": [ - "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "if (!exists(\"SNT_ROOT_PATH\")) {\n", + " SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "}\n", "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", "\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", @@ -438,7 +440,10 @@ "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", "\n", - "period_vector <- format(seq(ym(PERIOD_START), ym(PERIOD_END), by = \"month\"), \"%Y%m\")\n", + "period_vector <- format(\n", + " seq(lubridate::ym(PERIOD_START), lubridate::ym(PERIOD_END), by = \"month\"),\n", + " \"%Y%m\"\n", + ")\n", "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" ], "execution_count": null, diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb index 1d77c2b..c9c5775 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "b79cba06", "metadata": { "papermill": { "duration": 0.000249, @@ -15,12 +14,11 @@ }, "source": [ "### 
1. Setup" - ] + ], + "id": "b79cba06" }, { "cell_type": "code", - "execution_count": null, - "id": "7ca65bcc", "metadata": { "papermill": { "duration": 7.265364, @@ -34,7 +32,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Project paths\n", "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", @@ -48,8 +45,10 @@ "# Load palettes\n", "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "# Load libraries (sf: geom_sf + geojson from get_latest_dataset_file_in_memory)\n", + "required_packages <- c(\n", + " \"arrow\", \"tidyverse\", \"sf\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\"\n", + ")\n", "install_and_load(required_packages)\n", "\n", "# Environment variables\n", @@ -59,11 +58,13 @@ "\n", "# Load OpenHEXA sdk\n", "openhexa <- import(\"openhexa.sdk\")" - ] + ], + "execution_count": null, + "outputs": [], + "id": "7ca65bcc" }, { "cell_type": "markdown", - "id": "c5301aa3", "metadata": { "papermill": { "duration": 0.000116, @@ -76,12 +77,11 @@ }, "source": [ "#### 1.1. 
Load and check `snt config` file" - ] + ], + "id": "c5301aa3" }, { "cell_type": "code", - "execution_count": null, - "id": "76d8a072", "metadata": { "papermill": { "duration": 0.52329, @@ -95,7 +95,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Load SNT config\n", "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", @@ -106,12 +105,13 @@ " })\n", "\n", "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "76d8a072" }, { "cell_type": "code", - "execution_count": null, - "id": "c712ac02", "metadata": { "papermill": { "duration": 0.030446, @@ -125,7 +125,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Configuration settings\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", @@ -135,11 +134,13 @@ "# Reporting Rate data is stored in the same OH Dataset regardless of whether is comes from DataSet or DataElement method\n", "REPORTING_RATE_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", "DHIS2_FORMATTED_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c712ac02" }, { "cell_type": "markdown", - "id": "30b058f4", "metadata": { "papermill": { "duration": 0.000094, @@ -153,12 +154,11 @@ "source": [ "#### 1.2. 
Load and check `snt metadata` file\n", "This is needed for the correct use of palettes and categories (breaks, or scale)" - ] + ], + "id": "30b058f4" }, { "cell_type": "code", - "execution_count": null, - "id": "98a8ee49", "metadata": { "papermill": { "duration": 0.940593, @@ -172,7 +172,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Load SNT metadata\n", "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", @@ -183,12 +182,13 @@ " })\n", "\n", "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "98a8ee49" }, { "cell_type": "code", - "execution_count": null, - "id": "00681217", "metadata": { "papermill": { "duration": 0.198107, @@ -202,7 +202,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "scale_raw <- metadata_json$REPORTING_RATE$SCALE\n", "\n", @@ -219,11 +218,13 @@ "break_vals <- break_vals / 100\n", "\n", "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json (treated as percentages): \", paste(break_vals, collapse = \", \")))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "00681217" }, { "cell_type": "markdown", - "id": "f3470564", "metadata": { "papermill": { "duration": 0.000162, @@ -236,11 +237,11 @@ }, "source": [ "### 2. Load Data" - ] + ], + "id": "f3470564" }, { "cell_type": "markdown", - "id": "82397307", "metadata": { "papermill": { "duration": 0.000126, @@ -253,12 +254,11 @@ }, "source": [ "#### 2.1. 
Output of main pipeline notebook" - ] + ], + "id": "82397307" }, { "cell_type": "code", - "execution_count": null, - "id": "70acb2c5", "metadata": { "papermill": { "duration": 1.253125, @@ -272,7 +272,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Important: this will break if reporting rate was calculated as DataSet method because it will not find the file\n", "# (will find \"{COUNTRY_CODE}_reporting_rate_dataset.parquet\" instead)\n", @@ -290,11 +289,13 @@ "log_msg(glue::glue(\"Data file `{rr_filename}` loaded from dataset: `{REPORTING_RATE_DATASET_NAME}`. Dataframe dimensions: {paste(dim(reporting_rate), collapse=', ')}\"))\n", "dim(reporting_rate)\n", "head(reporting_rate, 2)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "70acb2c5" }, { "cell_type": "markdown", - "id": "48833515", "metadata": { "papermill": { "duration": 0.000091, @@ -308,12 +309,11 @@ "source": [ "#### 2.2. Shapes\n", "To make choropleth (map)" - ] + ], + "id": "48833515" }, { "cell_type": "code", - "execution_count": null, - "id": "3febd4f4", "metadata": { "papermill": { "duration": 3.535554, @@ -327,7 +327,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", " error = function(e) { \n", @@ -338,11 +337,13 @@ "\n", "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. \\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", "names(shapes)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "3febd4f4" }, { "cell_type": "markdown", - "id": "17067d56", "metadata": { "papermill": { "duration": 0.000166, @@ -355,11 +356,11 @@ }, "source": [ "### 3. Plots" - ] + ], + "id": "17067d56" }, { "cell_type": "markdown", - "id": "9a6369ee", "metadata": { "papermill": { "duration": 0.000109, @@ -372,12 +373,11 @@ }, "source": [ "##### 3.0. 
Add shapes" - ] + ], + "id": "9a6369ee" }, { "cell_type": "code", - "execution_count": null, - "id": "c6641720", "metadata": { "papermill": { "duration": 0.03905, @@ -391,15 +391,16 @@ "languageId": "r" } }, - "outputs": [], "source": [ "data_to_plot <- reporting_rate %>%\n", " left_join(shapes, by = c(\"ADM2_ID\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c6641720" }, { "cell_type": "markdown", - "id": "0b0d32f1", "metadata": { "papermill": { "duration": 0.000195, @@ -412,11 +413,11 @@ }, "source": [ "#### 3.1. 🎨 Dynamic categories and color assignement" - ] + ], + "id": "0b0d32f1" }, { "cell_type": "markdown", - "id": "cc765e0c", "metadata": { "papermill": { "duration": 0.000109, @@ -429,12 +430,11 @@ }, "source": [ "##### 1. Define breaks and labels" - ] + ], + "id": "cc765e0c" }, { "cell_type": "code", - "execution_count": null, - "id": "2e79132c", "metadata": { "papermill": { "duration": 0.026996, @@ -448,19 +448,19 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Safety code to avoid breaking if nothings is fund in json_metadata\n", "if (is.null(break_vals) || length(break_vals) == 0) {\n", " log_msg(\"[WARNING] No break values found in SNT_metadata.json for REPORTING_RATE$SCALE. Using default values.\", \"warning\")\n", " break_vals <- c(0.5, 0.8, 0.9, 0.95, 1.00)\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "2e79132c" }, { "cell_type": "code", - "execution_count": null, - "id": "f04cb888", "metadata": { "papermill": { "duration": 0.037712, @@ -474,7 +474,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 1. Define breaks\n", "# Note: assumes that the data starts at 0!\n", @@ -492,11 +491,13 @@ "\n", "# Check\n", "labels" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f04cb888" }, { "cell_type": "markdown", - "id": "cb237801", "metadata": { "papermill": { "duration": 0.000102, @@ -509,12 +510,11 @@ }, "source": [ "##### 2. 
Create `_CATEGORY` col" - ] + ], + "id": "cb237801" }, { "cell_type": "code", - "execution_count": null, - "id": "f8303488", "metadata": { "papermill": { "duration": 0.040632, @@ -528,7 +528,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# reporting_rate_dataset <- reporting_rate_dataset %>%\n", "data_to_plot <- data_to_plot %>%\n", @@ -541,11 +540,13 @@ " include.lowest = TRUE\n", " )\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f8303488" }, { "cell_type": "markdown", - "id": "a10237f8", "metadata": { "papermill": { "duration": 0.000102, @@ -558,12 +559,11 @@ }, "source": [ "##### 3. Pick appropriate palette" - ] + ], + "id": "a10237f8" }, { "cell_type": "code", - "execution_count": null, - "id": "2ee6e077", "metadata": { "papermill": { "duration": 0.04138, @@ -577,7 +577,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Count nr of breaks\n", "nr_of_colors <- length(labels)\n", @@ -590,11 +589,13 @@ "names(palette_to_use) <- rev(labels)\n", "\n", "palette_to_use\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "2ee6e077" }, { "cell_type": "markdown", - "id": "d08c0c14", "metadata": { "papermill": { "duration": 0.000099, @@ -607,11 +608,11 @@ }, "source": [ "#### 3.2. Plots" - ] + ], + "id": "d08c0c14" }, { "cell_type": "markdown", - "id": "b7781198", "metadata": { "papermill": { "duration": 0.000056, @@ -625,12 +626,11 @@ "source": [ "##### 3.2.1 Scatter plot of RR over time (by ADM2)\n", "With this we can see the actula numbners (although cannot tell which ADM2 have low values)." 
- ] + ], + "id": "b7781198" }, { "cell_type": "code", - "execution_count": null, - "id": "78d92e4a", "metadata": { "papermill": { "duration": 1.456494, @@ -644,7 +644,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Line point plot faceted by YEAR\n", "ggplot(data = data_to_plot) +\n", @@ -686,12 +685,13 @@ " strip.placement = \"outside\",\n", " strip.text = element_text(face = \"bold\", size = 10)\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "78d92e4a" }, { "cell_type": "code", - "execution_count": null, - "id": "1f47064a", "metadata": { "papermill": { "duration": 1.11568, @@ -705,7 +705,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "output_file <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement_adm2_linepoint.png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -723,11 +722,13 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (linepoint) saved to: {file.path(output_location, output_file)}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1f47064a" }, { "cell_type": "markdown", - "id": "22bb6431", "metadata": { "papermill": { "duration": 0.000147, @@ -741,12 +742,11 @@ "source": [ "##### 3.2.2 Heatmap plot of RR over time (by ADM2)\n", "This is less good for identifying actual values, but allows to see which ADM2 have lower values." 
- ] + ], + "id": "22bb6431" }, { "cell_type": "code", - "execution_count": null, - "id": "f2445f2a", "metadata": { "papermill": { "duration": 2.21647, @@ -760,7 +760,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Tile plot faceted by YEAR\n", "ggplot(data = data_to_plot) +\n", @@ -796,12 +795,13 @@ " strip.text = element_text(face = \"bold\", size = 10)\n", " ) +\n", " guides(fill = guide_legend(nrow = 1))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f2445f2a" }, { "cell_type": "code", - "execution_count": null, - "id": "cbe73312", "metadata": { "papermill": { "duration": 1.982105, @@ -815,7 +815,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "output_file <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement_adm2_heatmap.png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -830,11 +829,13 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (heatmap) saved to: {file.path(output_location, output_file)}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "cbe73312" }, { "cell_type": "markdown", - "id": "3eef141a", "metadata": { "papermill": { "duration": 0.000164, @@ -847,12 +848,11 @@ }, "source": [ "##### 3.2.3. 
MAP of Reporting Rate - by month" - ] + ], + "id": "3eef141a" }, { "cell_type": "code", - "execution_count": null, - "id": "83be9c68", "metadata": { "papermill": { "duration": 4.958481, @@ -866,7 +866,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Choropleth map with reporting rate data by ADM2\n", "ggplot(data = data_to_plot) +\n", @@ -894,12 +893,13 @@ " cols = vars(MONTH),\n", " switch = \"both\") +\n", " guides(fill = guide_legend(nrow = 1))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "83be9c68" }, { "cell_type": "code", - "execution_count": null, - "id": "e877671d", "metadata": { "papermill": { "duration": 3.502689, @@ -913,7 +913,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "output_file <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement_adm2_month_map.png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -928,11 +927,13 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_file)}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "e877671d" }, { "cell_type": "markdown", - "id": "f0894be9", "metadata": { "papermill": { "duration": 0.000166, @@ -946,12 +947,11 @@ "source": [ "##### 3.2.4. 
MAP of Reporting Rate - by YEAR\n", "Use average (`mean()`) of monthly values" - ] + ], + "id": "f0894be9" }, { "cell_type": "code", - "execution_count": null, - "id": "cb1995ab", "metadata": { "papermill": { "duration": 0.039325, @@ -965,7 +965,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "data_to_plot_year <- data_to_plot %>%\n", " group_by(geometry, ADM2_ID, ADM2_NAME, ADM1_NAME, YEAR) %>%\n", @@ -983,12 +982,13 @@ " include.lowest = TRUE\n", " )\n", " )" - ] + ], + "execution_count": null, + "outputs": [], + "id": "cb1995ab" }, { "cell_type": "code", - "execution_count": null, - "id": "bd32b0cf", "metadata": { "papermill": { "duration": 0.798686, @@ -1002,7 +1002,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Choropleth map with reporting rate data by ADM2\n", "ggplot(data = data_to_plot_year) +\n", @@ -1028,12 +1027,13 @@ " cols = vars(YEAR)\n", " ) +\n", " guides(fill = guide_legend(nrow = 1))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "bd32b0cf" }, { "cell_type": "code", - "execution_count": null, - "id": "0430641e", "metadata": { "papermill": { "duration": 0.928933, @@ -1047,7 +1047,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "output_file <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement_adm2_year_map.png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -1062,11 +1061,13 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_file)}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "0430641e" }, { "cell_type": "markdown", - "id": "8c3bdca4", "metadata": { "papermill": { "duration": 0.000126, @@ -1079,12 +1080,11 @@ }, "source": [ "#### The End :)" - ] + ], + "id": "8c3bdca4" }, { "cell_type": "code", - "execution_count": null, - "id": "f8a62ec5", "metadata": { "papermill": { "duration": 0.216448, @@ -1098,10 +1098,12 @@ "languageId": "r" } }, - "outputs": [], "source": [ "log_msg(\"Reporting 
Rate (Data Element) report notebook completed successfully!\")" - ] + ], + "execution_count": null, + "outputs": [], + "id": "f8a62ec5" } ], "metadata": { @@ -1133,4 +1135,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r index 936e586..1172d9c 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r @@ -7,18 +7,26 @@ source(file.path("~/workspace/code", "snt_utils.r")) #' #' @param SNT_ROOT_PATH Character. Root path of the SNT workspace. Default: '~/workspace' #' @param packages Character vector. R packages to install and load. -#' @return List with CONFIG_PATH, UPLOADS_PATH, DATA_PATH. +#' @return List with `paths_to_check` (CONFIG_PATH, UPLOADS_PATH, DATA_PATH) and the +#' same three paths at the top level for backward compatibility (`setup$CONFIG_PATH`, …). 
#' #' @export get_setup_variables <- function( SNT_ROOT_PATH = "~/workspace", - packages = c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate", "zoo") + packages = c( + "arrow", "rlang", "dplyr", "tidyr", "lubridate", "ggplot2", + "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate", "zoo" + ) ) { - setup_variable <- list( + paths_to_check <- list( CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"), UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"), DATA_PATH = file.path(SNT_ROOT_PATH, "data") ) + setup_variable <- c( + list(paths_to_check = paths_to_check), + paths_to_check + ) install_and_load(packages) diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index 7e22286..ccc4465 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -95,7 +95,9 @@ } }, "source": [ - "SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "if (!exists(\"SNT_ROOT_PATH\")) {\n", + " SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", + "}\n", "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataset\")\n", "\n", "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataset.r\"))\n", diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r index 57dc948..559e4fe 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r +++ b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r @@ -7,18 +7,26 @@ source(file.path("~/workspace/code", "snt_utils.r")) #' #' @param SNT_ROOT_PATH Character. Root path of the SNT workspace. 
Default: '~/workspace' #' @param packages Character vector. R packages to install and load. -#' @return List with CONFIG_PATH, UPLOADS_PATH, DATA_PATH. +#' @return List with `paths_to_check` (CONFIG_PATH, UPLOADS_PATH, DATA_PATH) and the +#' same three paths at the top level for backward compatibility (`setup$CONFIG_PATH`, …). #' #' @export get_setup_variables <- function( SNT_ROOT_PATH = "~/workspace", - packages = c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate") + packages = c( + "arrow", "dplyr", "tidyr", "ggplot2", + "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate" + ) ) { - setup_variable <- list( + paths_to_check <- list( CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"), UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"), DATA_PATH = file.path(SNT_ROOT_PATH, "data") ) + setup_variable <- c( + list(paths_to_check = paths_to_check), + paths_to_check + ) install_and_load(packages) diff --git a/snt_dhis2_reporting_rate_dataelement/pipeline.py b/snt_dhis2_reporting_rate_dataelement/pipeline.py index 50c1d08..7d8ea36 100644 --- a/snt_dhis2_reporting_rate_dataelement/pipeline.py +++ b/snt_dhis2_reporting_rate_dataelement/pipeline.py @@ -70,10 +70,15 @@ def snt_dhis2_reporting_rate_dataelement( validate_config(snt_config) country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] - # Build parameters dict and save to JSON in all cases (like other pipelines) routine_file = resolve_routine_filename( country_code=country_code, routine_data_choice=routine_data_choice ) + if routine_data_choice == "raw": + ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] + else: + ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] + + # Build parameters dict and save to JSON in all cases (like other pipelines) nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, @@ -88,11 +93,6 @@ def snt_dhis2_reporting_rate_dataelement( 
current_run.log_info(f"Saved pipeline parameters to {parameters_file}") if not run_report_only: - if routine_data_choice == "raw": - ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_DATASET_FORMATTED"] - else: - ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] - # Check the file exists in the dataset if not dataset_file_exists(ds_id=ds_outliers_id, filename=routine_file): current_run.log_warning( From c219e51bd3c1650305b9c1389e7a8d0e3caab5d2 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 14 Apr 2026 14:22:15 +0200 Subject: [PATCH 08/18] a lot of changes, make theipeline easier --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 419 +-- ...s2_reporting_rate_dataelement_report.ipynb | 42 +- .../snt_dhis2_reporting_rate_dataelement.r | 163 +- .../snt_dhis2_reporting_rate_dataset.ipynb | 61 +- ..._dhis2_reporting_rate_dataset_report.ipynb | 2568 ++++++++--------- .../utils/snt_dhis2_reporting_rate_dataset.r | 57 + snt_dhis2_reporting_rate_dataset/pipeline.py | 1 - 7 files changed, 1668 insertions(+), 1643 deletions(-) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index 03f68be..c4b57f0 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", "metadata": { "papermill": { "duration": 0.000173, @@ -35,11 +36,11 @@ "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", " * Filename: `XXX_reporting_rate_dataelement.`" - ], - "id": 
"6e8d006c-fd3d-4186-bc8f-b83fdf234e65" + ] }, { "cell_type": "markdown", + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", "metadata": { "papermill": { "duration": 0.000228, @@ -52,11 +53,12 @@ }, "source": [ "## 1. Setup" - ], - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "35ede7cf-257f-439c-a514-26a7290f881d", "metadata": { "papermill": { "duration": 63.150489, @@ -70,76 +72,16 @@ "languageId": "r" } }, - "source": [ - "if (!exists(\"SNT_ROOT_PATH\")) {\n", - " SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", - "}\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", - "\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", - "setup <- get_setup_variables(SNT_ROOT_PATH = SNT_ROOT_PATH)\n", - "" - ], - "execution_count": null, "outputs": [], - "id": "35ede7cf-257f-439c-a514-26a7290f881d" - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.00011, - "end_time": "2026-01-16T10:23:56.165873", - "exception": false, - "start_time": "2026-01-16T10:23:56.165763", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 1.1. Fallback parameters values\n", - "This parameters are injected by papermill when running in OH via pipeline run interface.
\n", - "The code cell below here provides fallback paramater values needed when running this notebook locally." - ], - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" - }, - { - "cell_type": "code", - "metadata": { - "papermill": { - "duration": 0.033954, - "end_time": "2026-01-16T10:23:56.199937", - "exception": false, - "start_time": "2026-01-16T10:23:56.165983", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, "source": [ - "# Current options: \n", - "# \"COUNTRY_CODE_routine.parquet\" (RAW data)\n", - "# \"COUNTRY_CODE_routine_outliers_removed.parquet\" \n", - "# \"COUNTRY_CODE_routine_outliers_imputed.parquet\"\n", - "if (!exists(\"ROUTINE_FILE\")) {ROUTINE_FILE <- \"XXX_routine_outliers_imputed.parquet\"}\n", - "\n", - "# Resolved by pipeline.py based on routine_data_choice; fallback to formatted dataset\n", - "if (!exists(\"DATASET_ID\")) {DATASET_ID <- \"\"}\n", - "\n", - "# Options: \"ROUTINE_ACTIVE_FACILITIES\", \"PYRAMID_OPEN_FACILITIES\"\n", - "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\")) {DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"}\n", - "if (!exists(\"ACTIVITY_INDICATORS\")) {ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")} \n", - "if (!exists(\"VOLUME_ACTIVITY_INDICATORS\")) {VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")}\n", - "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\")) {USE_WEIGHTED_REPORTING_RATES <- FALSE}" - ], - "execution_count": null, - "outputs": [], - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb" + "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r\")\n", + "snt_environment <- get_setup_variables()\n", + "config_json <- load_snt_config(file.path(snt_environment$CONFIG_PATH, \"SNT_config.json\"))\n" + ] }, { "cell_type": "markdown", + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", "metadata": { "papermill": { "duration": 0.000095, @@ -151,12 +93,14 @@ "tags": [] }, "source": [ - "### 1.2. 
Load and check `snt config` file" - ], - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be" + "### 1.1. Pipeline parameters\n", + "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill (checked inside `parse_dataelement_snt_settings()`). Denominator, weighting, and indicator lists come from `SNT_config.json` (`SNT_CONFIG$REPORTING_RATE_DATAELEMENT`) when present; otherwise that function applies the documented defaults in the pipeline utils.\n" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", "metadata": { "papermill": { "duration": 0.521572, @@ -170,15 +114,15 @@ "languageId": "r" } }, - "source": [ - "config_json <- load_snt_config(file.path(setup$CONFIG_PATH, \"SNT_config.json\"))" - ], - "execution_count": null, "outputs": [], - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f" + "source": [ + "cx <- parse_dataelement_snt_settings(config_json)\n", + "list2env(cx, envir = .GlobalEnv)\n" + ] }, { - "cell_type": "code", + "cell_type": "markdown", + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", "metadata": { "papermill": { "duration": 0.033003, @@ -193,53 +137,27 @@ } }, "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# DHIS2_INDICATORS <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS)\n", - "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\") # GP 20260205\n", - "\n", - "ACTIVITY_INDICATORS <- unlist(ACTIVITY_INDICATORS)\n", - "VOLUME_ACTIVITY_INDICATORS <- unlist(VOLUME_ACTIVITY_INDICATORS)\n", - "fixed_cols <- c('PERIOD', 'YEAR', 'MONTH', 'ADM1_ID', 'ADM2_ID', 'OU_ID')\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" - ], - "execution_count": null, - "outputs": [], - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82" - 
}, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.3. 🔍 Check: at least 1 indicator must be selected\n", - "The use can toggle on/off each of the indicators. Therefore, need to make sure at least one is ON.
\n", - "Indicator `CONF` is mandatory, but I think it looks better if they're all displayed in the Run pipeline view (more intuitive)." - ], - "id": "8bf4a8bb" + "### 1.2. Checks\n", + "Validate activity-indicator selection before heavy joins.\n" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "8bf4a8bb", "metadata": { "vscode": { "languageId": "r" } }, - "source": [ - "if (!length(ACTIVITY_INDICATORS) > 0) {\n", - " msg <- \"[ERROR] Error: no indicator selected, cannot perform calculation of reporting rate method. Select at least one (e.g., `CONF`).\"\n", - " cat(msg) \n", - " stop(msg)\n", - "}" - ], - "execution_count": null, "outputs": [], - "id": "18b40207" + "source": [ + "stopifnot_nonempty_activity_indicators(ACTIVITY_INDICATORS)\n" + ] }, { "cell_type": "markdown", + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", "metadata": { "papermill": { "duration": 0.000093, @@ -252,11 +170,11 @@ }, "source": [ "## 2. Load Data" - ], - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b" + ] }, { "cell_type": "markdown", + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", "metadata": { "papermill": { "duration": 0.000069, @@ -271,11 +189,12 @@ "### 2.1. Routine data (DHIS2) \n", "**Note on pipeline behaviour**:
\n", "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." - ], - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "a1213723-f7e2-4238-9f37-f1795b187232", "metadata": { "papermill": { "duration": 2.018878, @@ -289,20 +208,18 @@ "languageId": "r" } }, + "outputs": [], "source": [ "dhis2_routine <- load_dataset_file(DATASET_ID, ROUTINE_FILE)\n", "dhis2_routine <- dhis2_routine %>%\n", " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric))\n", "dim(dhis2_routine)\n", - "head(dhis2_routine, 2)\n", - "" - ], - "execution_count": null, - "outputs": [], - "id": "a1213723-f7e2-4238-9f37-f1795b187232" + "head(dhis2_routine, 2)\n" + ] }, { "cell_type": "markdown", + "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", "metadata": { "papermill": { "duration": 0.000138, @@ -315,11 +232,12 @@ }, "source": [ "### 2.2. Organisation units (DHIS2 pyramid)" - ], - "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "2fd92901-901e-4019-be78-a7718050c1c4", "metadata": { "papermill": { "duration": 0.992899, @@ -333,21 +251,19 @@ "languageId": "r" } }, + "outputs": [], "source": [ "dhis2_pyramid_formatted <- load_dataset_file(\n", " config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED,\n", " paste0(COUNTRY_CODE, \"_pyramid.parquet\")\n", ")\n", "dim(dhis2_pyramid_formatted)\n", - "head(dhis2_pyramid_formatted, 2)\n", - "" - ], - "execution_count": null, - "outputs": [], - "id": "2fd92901-901e-4019-be78-a7718050c1c4" + "head(dhis2_pyramid_formatted, 2)\n" + ] }, { "cell_type": "markdown", + "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", "metadata": { "papermill": { "duration": 0.000106, @@ -363,11 +279,12 @@ "Extra precaution measure to avoid breaks downstream.
\n", "\n", "Note: This logic should be moved to pipeline.py 🐍" - ], - "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", "metadata": { "papermill": { "duration": 0.024863, @@ -381,23 +298,14 @@ "languageId": "r" } }, - "source": [ - "if (!all(ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " log_msg(glue(\"🚨 Warning: one or more of the follow column is missing from `dhis2_routine`: {paste(ACTIVITY_INDICATORS, collapse = ', ')}\"), \"warning\")\n", - "}\n", - "\n", - "if (!all(VOLUME_ACTIVITY_INDICATORS %in% names(dhis2_routine))) {\n", - " msg <- glue(\"[ERROR] Volume activity indicator {VOLUME_ACTIVITY_INDICATORS} not present in the routine data. Process cannot continue.\")\n", - " cat(msg)\n", - " stop(msg)\n", - "}" - ], - "execution_count": null, "outputs": [], - "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c" + "source": [ + "validate_indicator_columns_in_routine(dhis2_routine, ACTIVITY_INDICATORS, VOLUME_ACTIVITY_INDICATORS)\n" + ] }, { "cell_type": "markdown", + "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", "metadata": { "papermill": { "duration": 0.000091, @@ -410,19 +318,20 @@ }, "source": [ "## 3. Reporting rates computations" - ], - "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295" + ] }, { "cell_type": "markdown", + "id": "7d62cdb6", "metadata": {}, "source": [ "#### 3.0. 
Define start and end period based on routine data " - ], - "id": "7d62cdb6" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", "metadata": { "papermill": { "duration": 0.044172, @@ -436,22 +345,18 @@ "languageId": "r" } }, - "source": [ - "PERIOD_START <- dhis2_routine$PERIOD %>% min()\n", - "PERIOD_END <- dhis2_routine$PERIOD %>% max()\n", - "\n", - "period_vector <- format(\n", - " seq(lubridate::ym(PERIOD_START), lubridate::ym(PERIOD_END), by = \"month\"),\n", - " \"%Y%m\"\n", - ")\n", - "cat(glue(\"Start period: {PERIOD_START} \\nEnd period: {PERIOD_END} \\nPeriods count: {length(period_vector)}\"))" - ], - "execution_count": null, "outputs": [], - "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a" + "source": [ + "pv <- monthly_period_vector_from_routine(dhis2_routine)\n", + "PERIOD_START <- pv$PERIOD_START\n", + "PERIOD_END <- pv$PERIOD_END\n", + "period_vector <- pv$period_vector\n", + "log_msg(glue::glue(\"Routine period range: {PERIOD_START} to {PERIOD_END} ({length(period_vector)} months)\"))\n" + ] }, { "cell_type": "markdown", + "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", "metadata": { "papermill": { "duration": 0.000109, @@ -465,11 +370,12 @@ "source": [ "#### 3.1. Build master table (all PERIOD x OU)\n", "The master table contains all combinations of period x organisation unit " - ], - "id": "526bc3af-01c1-4ddc-b3b9-077354e57559" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9308197a-0852-4d34-8888-cf5564f35a9d", "metadata": { "papermill": { "duration": 0.289128, @@ -483,6 +389,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. 
Periods count: {length(period_vector)}\"))\n", "facility_master <- build_facility_master_dataelement(\n", @@ -492,13 +399,11 @@ " ADMIN_1 = ADMIN_1,\n", " ADMIN_2 = ADMIN_2\n", ")\n" - ], - "execution_count": null, - "outputs": [], - "id": "9308197a-0852-4d34-8888-cf5564f35a9d" + ] }, { "cell_type": "markdown", + "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", "metadata": { "papermill": { "duration": 0.000114, @@ -513,16 +418,18 @@ "#### 3.2. Identify \"Active\" facilities\n", "\n", "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" - ], - "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "7b279d27", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Join routine values to the facility master and define monthly activity\n", "facility_master_routine <- dplyr::left_join(\n", @@ -538,13 +445,11 @@ " ),\n", " COUNT = 1\n", " )\n" - ], - "execution_count": null, - "outputs": [], - "id": "7b279d27" + ] }, { "cell_type": "markdown", + "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", "metadata": { "papermill": { "duration": 0.000107, @@ -565,11 +470,12 @@ "2. The period falls within the facility’s opening and closing dates. The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", "\n", "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." 
- ], - "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", "metadata": { "papermill": { "duration": 1.317923, @@ -583,6 +489,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 3.3 Identify OPEN facilities from naming and opening/closing dates\n", "facility_master_routine <- facility_master_routine %>%\n", @@ -593,21 +500,19 @@ " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)),\n", " OPEN = ifelse(!NAME_CLOSED & OPEN_BY_DATE, 1, 0)\n", " )\n" - ], - "execution_count": null, - "outputs": [], - "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89" + ] }, { "cell_type": "markdown", + "id": "657fd6ca", "metadata": {}, "source": [ "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" - ], - "id": "657fd6ca" + ] }, { "cell_type": "markdown", + "id": "a598e4b7", "metadata": {}, "source": [ "
\n", @@ -618,11 +523,12 @@ "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", " \n", "
    " - ], - "id": "a598e4b7" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", "metadata": { "papermill": { "duration": 0.173961, @@ -636,19 +542,18 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 3.4 Mark facilities active at least once per year\n", "facility_master_routine <- facility_master_routine %>%\n", " dplyr::group_by(OU_ID, YEAR) %>%\n", " dplyr::mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>%\n", " dplyr::ungroup()\n" - ], - "execution_count": null, - "outputs": [], - "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d" + ] }, { "cell_type": "markdown", + "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", "metadata": { "papermill": { "duration": 0.000098, @@ -661,11 +566,12 @@ }, "source": [ "#### 3.5. Compute Weighting factor based on \"volume of activity\"" - ], - "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "4420e559-4134-4fc3-8950-9972ebede00e", "metadata": { "papermill": { "duration": 0.520673, @@ -679,6 +585,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 3.5 Compute facility weights from volume of activity\n", "mean_monthly_cases <- dhis2_routine %>%\n", @@ -704,13 +611,11 @@ "hf_weights <- mean_monthly_cases %>%\n", " dplyr::left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", " dplyr::mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n" - ], - "execution_count": null, - "outputs": [], - "id": "4420e559-4134-4fc3-8950-9972ebede00e" + ] }, { "cell_type": "markdown", + "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", "metadata": { "papermill": { "duration": 0.000108, @@ -723,11 +628,12 @@ }, "source": [ "#### 3.6. 
Compute Weighted variables" - ], - "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", "metadata": { "papermill": { "duration": 0.483413, @@ -741,6 +647,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 3.6 Apply weights to monthly status variables\n", "facility_master_routine_02 <- facility_master_routine %>%\n", @@ -750,13 +657,11 @@ "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT\n", "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n" - ], - "execution_count": null, - "outputs": [], - "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259" + ] }, { "cell_type": "markdown", + "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", "metadata": { "papermill": { "duration": 0.000172, @@ -769,16 +674,18 @@ }, "source": [ "#### 3.7. Aggregate data at ADM2 level" - ], - "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "af13191e", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# 3.7 Aggregate monthly counts at ADM2 level\n", "reporting_rate_adm2 <- facility_master_routine_02 %>%\n", @@ -802,26 +709,26 @@ " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", " )\n" - ], - "execution_count": null, - "outputs": [], - "id": "af13191e" + ] }, { "cell_type": "markdown", + "id": "7d381937", "metadata": {}, "source": [ "#### 3.8. 
Calculate Reporting Rates (all methods)" - ], - "id": "7d381937" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "b41263f8", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# 3.8 Select final reporting-rate definition for export\n", "rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF\" else \"RR_OPEN_HF\"\n", @@ -833,13 +740,11 @@ " dplyr::mutate(MONTH = PERIOD %% 100) %>%\n", " dplyr::rename(REPORTING_RATE = !!rlang::sym(rr_column_selection)) %>%\n", " dplyr::select(YEAR, MONTH, ADM2_ID, REPORTING_RATE)\n" - ], - "execution_count": null, - "outputs": [], - "id": "b41263f8" + ] }, { "cell_type": "markdown", + "id": "5e593659", "metadata": { "papermill": { "duration": 0.000108, @@ -852,11 +757,11 @@ }, "source": [ "## 4. Select correct col for `REPORTING_RATE` based on denominator method" - ], - "id": "5e593659" + ] }, { "cell_type": "markdown", + "id": "c75f2249", "metadata": { "papermill": { "duration": 0.000057, @@ -869,11 +774,12 @@ }, "source": [ "### 4.1. 
Select results and format" - ], - "id": "c75f2249" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "75e71b38", "metadata": { "papermill": { "duration": 0.020644, @@ -887,18 +793,18 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 4.1 Confirm which denominator/weighting option was selected\n", "cat(glue::glue(\n", " \"Selected denominator method: {DATAELEMENT_METHOD_DENOMINATOR} | Weighted reporting rates: {USE_WEIGHTED_REPORTING_RATES}\"\n", "))\n" - ], - "execution_count": null, - "outputs": [], - "id": "75e71b38" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "3df36abb", "metadata": { "papermill": { "duration": 0.140976, @@ -912,17 +818,17 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Output preview\n", "dim(reporting_rate_dataelement)\n", "head(reporting_rate_dataelement, 5)\n" - ], - "execution_count": null, - "outputs": [], - "id": "3df36abb" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "0ccc272c", "metadata": { "papermill": { "duration": 0.182574, @@ -936,17 +842,16 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Basic data quality checks\n", "summary(reporting_rate_dataelement$REPORTING_RATE)\n", "sum(is.na(reporting_rate_dataelement$REPORTING_RATE))\n" - ], - "execution_count": null, - "outputs": [], - "id": "0ccc272c" + ] }, { "cell_type": "markdown", + "id": "ca66e785", "metadata": { "papermill": { "duration": 0.000109, @@ -959,11 +864,12 @@ }, "source": [ "## 5. 
Inspect reporting rate values" - ], - "id": "ca66e785" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "31535459", "metadata": { "papermill": { "duration": 0.160299, @@ -977,17 +883,17 @@ "languageId": "r" } }, + "outputs": [], "source": [ "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", "xlab=\"REPORTING_RATE\")" - ], - "execution_count": null, - "outputs": [], - "id": "31535459" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "6778f17d", "metadata": { "papermill": { "duration": 0.896382, @@ -1001,6 +907,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Boxplot\n", "ggplot(reporting_rate_dataelement,\n", @@ -1013,13 +920,12 @@ " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", " ) +\n", " theme_minimal()" - ], - "execution_count": null, - "outputs": [], - "id": "6778f17d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "a7f013fd", "metadata": { "papermill": { "duration": 0.859448, @@ -1033,6 +939,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "ggplot(reporting_rate_dataelement,\n", " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", @@ -1046,13 +953,11 @@ " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", " ) +\n", " theme_minimal()" - ], - "execution_count": null, - "outputs": [], - "id": "a7f013fd" + ] }, { "cell_type": "markdown", + "id": "2866816a-7015-4c5c-b904-f553f3b4790d", "metadata": { "papermill": { "duration": 0.000088, @@ -1065,11 +970,12 @@ }, "source": [ "## 5. 
📁 Export to `data/` folder" - ], - "id": "2866816a-7015-4c5c-b904-f553f3b4790d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", "metadata": { "papermill": { "duration": 0.919937, @@ -1083,27 +989,10 @@ "languageId": "r" } }, - "source": [ - "output_dir <- file.path(setup$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", - "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", - "\n", - "write.csv(\n", - " reporting_rate_dataelement,\n", - " file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\")),\n", - " row.names = FALSE\n", - ")\n", - "log_msg(glue::glue(\"Exported: {file.path(output_dir, paste0(COUNTRY_CODE, '_reporting_rate_dataelement.csv'))}\"))\n", - "\n", - "arrow::write_parquet(\n", - " reporting_rate_dataelement,\n", - " file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\"))\n", - ")\n", - "log_msg(glue::glue(\"Exported: {file.path(output_dir, paste0(COUNTRY_CODE, '_reporting_rate_dataelement.parquet'))}\"))\n", - "" - ], - "execution_count": null, "outputs": [], - "id": "bbf27852-8ec5-4370-aae2-49e082928fe1" + "source": [ + "write_reporting_rate_dataelement_outputs(reporting_rate_dataelement, snt_environment, COUNTRY_CODE)\n" + ] } ], "metadata": { @@ -1150,4 +1039,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb index c9c5775..2d91055 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb @@ -33,31 +33,19 @@ } }, "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - 
"REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataelement/reporting/outputs\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "# Load palettes\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataelement.r\"))\n", "\n", - "# Load libraries (sf: geom_sf + geojson from get_latest_dataset_file_in_memory)\n", - "required_packages <- c(\n", - " \"arrow\", \"tidyverse\", \"sf\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\"\n", - ")\n", - "install_and_load(required_packages)\n", + "report_packages <- c(\"arrow\", \"tidyverse\", \"sf\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "snt_environment <- get_setup_variables(SNT_ROOT_PATH = SNT_ROOT_PATH, packages = report_packages)\n", "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- snt_environment$CONFIG_PATH\n", + "DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\")\n", + "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataelement/reporting/outputs\")\n", "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" + "source(file.path(CODE_PATH, \"snt_palettes.r\"))" ], "execution_count": null, "outputs": [], @@ -96,15 +84,7 @@ } }, "source": 
[ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" + "config_json <- load_snt_config(file.path(CONFIG_PATH, \"SNT_config.json\"))" ], "execution_count": null, "outputs": [], diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r index 1172d9c..a76010b 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r @@ -1,4 +1,7 @@ # Load base utils +# Bootstrap matches `snt_dhis2_population_transformation`: fixed-path `source()` of this +# file, `snt_environment <- get_setup_variables()`, then `load_snt_config()`. +# Helpers: `load_dataset_file()`, optional `paths_to_check` in the setup list. source(file.path("~/workspace/code", "snt_utils.r")) @@ -7,8 +10,8 @@ source(file.path("~/workspace/code", "snt_utils.r")) #' #' @param SNT_ROOT_PATH Character. Root path of the SNT workspace. Default: '~/workspace' #' @param packages Character vector. R packages to install and load. -#' @return List with `paths_to_check` (CONFIG_PATH, UPLOADS_PATH, DATA_PATH) and the -#' same three paths at the top level for backward compatibility (`setup$CONFIG_PATH`, …). +#' @return List with `paths_to_check` plus `CONFIG_PATH`, `UPLOADS_PATH`, `DATA_PATH` +#' (use as `snt_environment$CONFIG_PATH`, same pattern as population transformation). 
#' #' @export get_setup_variables <- function( @@ -30,6 +33,7 @@ get_setup_variables <- function( install_and_load(packages) + configure_conda_r_spatial_env() Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") reticulate::py_config()$python assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv) @@ -81,6 +85,161 @@ load_dataset_file <- function(dataset_id, filename) { } +#' Conda-friendly defaults for PROJ/GDAL (used when reading spatial data). +configure_conda_r_spatial_env <- function() { + if (Sys.getenv("PROJ_LIB", "") == "") { + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + } + if (Sys.getenv("GDAL_DATA", "") == "") { + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + } +} + + +#' Standard aggregated indicator codes present in formatted routine extracts. +standard_dhis2_indicator_codes_for_dataelement <- function() { + c("CONF", "PRES", "SUSP", "TEST") +} + + +#' Fail if Papermill did not inject `ROUTINE_FILE` and `DATASET_ID`. +#' +#' Kept as a named entry point so older notebooks that call this before +#' `parse_dataelement_snt_settings()` keep working after utils refactors. +assert_papermill_dataelement_params <- function() { + required_pm <- c("ROUTINE_FILE", "DATASET_ID") + missing_pm <- required_pm[!vapply(required_pm, exists, logical(1), inherits = TRUE)] + if (length(missing_pm) > 0) { + stop( + "[ERROR] Missing pipeline parameters (Papermill): ", + paste(missing_pm, collapse = ", "), + ". Expected only ROUTINE_FILE and DATASET_ID from `snt_dhis2_reporting_rate_dataelement`." + ) + } +} + + +#' Normalize optional `SNT_CONFIG$REPORTING_RATE_DATAELEMENT` list from JSON. +#' +#' When absent, uses the same defaults as the historical OpenHEXA parameters +#' (denominator `ROUTINE_ACTIVE_FACILITIES`, unweighted, activity CONF/PRES/SUSP, +#' volume CONF/PRES). +#' +#' Also calls `assert_papermill_dataelement_params()` (redundant if the notebook +#' already called it). 
+parse_dataelement_snt_settings <- function(config_json) { + assert_papermill_dataelement_params() + + rc <- config_json$SNT_CONFIG$REPORTING_RATE_DATAELEMENT + if (is.null(rc) || length(rc) == 0) { + rc <- list() + } + + denom <- rc$DATAELEMENT_METHOD_DENOMINATOR + denom_ch <- if (is.null(denom)) "" else as.character(denom)[[1]] + if (!nzchar(denom_ch) || is.na(denom_ch)) { + denom <- "ROUTINE_ACTIVE_FACILITIES" + } else { + denom <- denom_ch + } + + use_w <- rc$USE_WEIGHTED_REPORTING_RATES + if (is.null(use_w)) { + use_w <- FALSE + } else { + use_w <- isTRUE(use_w) + } + + act <- rc$ACTIVITY_INDICATORS + if (is.null(act)) { + act <- c("CONF", "PRES", "SUSP") + } + act <- as.character(unlist(act, use.names = FALSE)) + + vol <- rc$VOLUME_ACTIVITY_INDICATORS + if (is.null(vol)) { + vol <- c("CONF", "PRES") + } + vol <- as.character(unlist(vol, use.names = FALSE)) + + list( + COUNTRY_CODE = config_json$SNT_CONFIG$COUNTRY_CODE, + ADMIN_1 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), + ADMIN_2 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), + DHIS2_INDICATORS = standard_dhis2_indicator_codes_for_dataelement(), + DATAELEMENT_METHOD_DENOMINATOR = denom, + USE_WEIGHTED_REPORTING_RATES = use_w, + ACTIVITY_INDICATORS = act, + VOLUME_ACTIVITY_INDICATORS = vol, + fixed_cols = c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID"), + fixed_cols_rr = c("YEAR", "MONTH", "ADM2_ID", "REPORTING_RATE") + ) +} + + +stopifnot_nonempty_activity_indicators <- function(activity_indicators) { + if (!length(activity_indicators)) { + stop("[ERROR] No activity indicators selected; choose at least one (e.g. 
CONF).") + } +} + + +validate_indicator_columns_in_routine <- function( + dhis2_routine, + activity_indicators, + volume_activity_indicators +) { + if (!all(activity_indicators %in% names(dhis2_routine))) { + log_msg( + glue::glue( + "Warning: one or more activity indicators are missing from `dhis2_routine`: ", + "{paste(activity_indicators, collapse = ', ')}" + ), + "warning" + ) + } + if (!all(volume_activity_indicators %in% names(dhis2_routine))) { + msg <- glue::glue( + "[ERROR] Volume activity indicator(s) not present in routine data: ", + "{paste(volume_activity_indicators, collapse = ', ')}" + ) + log_msg(msg, "error") + stop(msg) + } +} + + +#' YYYYMM sequence covering the routine period range (inclusive by month). +monthly_period_vector_from_routine <- function(dhis2_routine) { + period_start <- min(dhis2_routine$PERIOD, na.rm = TRUE) + period_end <- max(dhis2_routine$PERIOD, na.rm = TRUE) + pv <- format( + seq(lubridate::ym(period_start), lubridate::ym(period_end), by = "month"), + "%Y%m" + ) + list( + PERIOD_START = period_start, + PERIOD_END = period_end, + period_vector = pv + ) +} + + +#' Write CSV + Parquet under `/dhis2/reporting_rate/`. 
+write_reporting_rate_dataelement_outputs <- function(reporting_rate_tbl, snt_environment, country_code) { + output_dir <- file.path(snt_environment$DATA_PATH, "dhis2", "reporting_rate") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + base <- paste0(country_code, "_reporting_rate_dataelement") + csv_path <- file.path(output_dir, paste0(base, ".csv")) + pq_path <- file.path(output_dir, paste0(base, ".parquet")) + utils::write.csv(reporting_rate_tbl, csv_path, row.names = FALSE) + log_msg(glue::glue("Exported: {csv_path}")) + arrow::write_parquet(reporting_rate_tbl, pq_path) + log_msg(glue::glue("Exported: {pq_path}")) + invisible(list(csv_path = csv_path, parquet_path = pq_path)) +} + + build_facility_master_dataelement <- function( dhis2_pyramid_formatted, period_vector, diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index ccc4465..a4ef3b6 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -50,6 +50,8 @@ "\n", "### Pipeline parameters\n", "\n", + "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill (checked inside `parse_reporting_rate_dataset_snt_settings()`).\n", + "\n", "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. Choose \"Routine data (Raw)\" to use raw routine data.\n", " \n", "- **Use routine with outliers removed**: Toggle this on to use the routine data after outliers have been removed (using the outliers detection method selected above). Else, this pipeline will use either the imputed routine data (to replace the outlier values removed) or the raw routine data if you selected \"Routine data (Raw)\" as your choice of “Outlier processing method”." 
@@ -95,13 +97,8 @@ } }, "source": [ - "if (!exists(\"SNT_ROOT_PATH\")) {\n", - " SNT_ROOT_PATH <- \"/home/hexa/workspace\"\n", - "}\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataset\")\n", - "\n", - "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataset.r\"))\n", - "setup <- get_setup_variables(SNT_ROOT_PATH = SNT_ROOT_PATH)\n", + "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r\")\n", + "snt_environment <- get_setup_variables()\n", "" ], "execution_count": null, @@ -147,7 +144,7 @@ } }, "source": [ - "config_json <- load_snt_config(file.path(setup$CONFIG_PATH, \"SNT_config.json\"))" + "config_json <- load_snt_config(file.path(snt_environment$CONFIG_PATH, \"SNT_config.json\"))" ], "execution_count": null, "outputs": [], @@ -175,15 +172,8 @@ } }, "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "# Which reporting rate PRODUCT_UID to use (DHIS2 dataset id)\n", - "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID \n", - "\n", - "fixed_cols_rr <- c('YEAR', 'MONTH', 'ADM2_ID', 'REPORTING_RATE') # Fixed cols for exporting RR tables" + "cx <- parse_reporting_rate_dataset_snt_settings(config_json)\n", + "list2env(cx, envir = .GlobalEnv)\n" ], "execution_count": null, "outputs": [], @@ -202,42 +192,11 @@ "tags": [] }, "source": [ - "#### 1.2. Validate parameters" + "#### 1.2. Config + Papermill\n", + "`ROUTINE_FILE` and `DATASET_ID` are checked inside `parse_reporting_rate_dataset_snt_settings()` (older notebooks may still call `assert_papermill_reporting_rate_dataset_params()` explicitly)." 
], "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" }, - { - "cell_type": "code", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:04.019283Z", - "iopub.status.busy": "2025-12-19T10:23:04.017257Z", - "iopub.status.idle": "2025-12-19T10:23:04.039652Z", - "shell.execute_reply": "2025-12-19T10:23:04.037292Z" - }, - "papermill": { - "duration": 0.02788, - "end_time": "2025-12-19T10:23:04.042642", - "exception": false, - "start_time": "2025-12-19T10:23:04.014762", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# default: raw routine\n", - "if (!exists(\"ROUTINE_FILE\")) { ROUTINE_FILE <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\") }\n", - "\n", - "# Resolved by pipeline.py based on routine_data_choice; fallback to empty string\n", - "if (!exists(\"DATASET_ID\")) { DATASET_ID <- \"\" }" - ], - "execution_count": null, - "outputs": [], - "id": "b17f7685-5291-4e5d-9eec-2d1f9435fccb" - }, { "cell_type": "markdown", "metadata": { @@ -1048,7 +1007,7 @@ } }, "source": [ - "output_dir <- file.path(setup$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", + "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", "\n", "write.csv(\n", diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb index 90e4762..50d9708 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb @@ -1,1299 +1,1281 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "b79cba06", - "metadata": { - "papermill": { - "duration": 0.000249, - "end_time": "2025-12-19T10:23:27.548651", - "exception": false, - "start_time": 
"2025-12-19T10:23:27.548402", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ca65bcc", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:27.561213Z", - "iopub.status.busy": "2025-12-19T10:23:27.553197Z", - "iopub.status.idle": "2025-12-19T10:23:34.811467Z", - "shell.execute_reply": "2025-12-19T10:23:34.808478Z" - }, - "papermill": { - "duration": 7.265364, - "end_time": "2025-12-19T10:23:34.814448", - "exception": false, - "start_time": "2025-12-19T10:23:27.549084", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Project paths\n", - "SNT_ROOT_PATH <- \"/home/hexa/workspace\" \n", - "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs\")\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') # this is where we store snt_utils.r\n", - "CONFIG_PATH <- file.path(SNT_ROOT_PATH, 'configuration') # .json config file\n", - "DATA_PATH <- file.path(SNT_ROOT_PATH, 'data', 'dhis2') \n", - "\n", - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "# Load palettes\n", - "source(file.path(CODE_PATH, \"snt_palettes.r\"))\n", - "\n", - "# Load libraries \n", - "required_packages <- c(\"arrow\", \"tidyverse\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "install_and_load(required_packages)\n", - "\n", - "# Environment variables\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "\n", - "# Load OpenHEXA sdk\n", - "openhexa <- import(\"openhexa.sdk\")" - ] - }, - { - "cell_type": "markdown", - "id": "c5301aa3", - "metadata": { - "papermill": { - "duration": 0.000116, - "end_time": "2025-12-19T10:23:34.814852", - "exception": false, - 
"start_time": "2025-12-19T10:23:34.814736", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.1. Load and check `snt config` file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "76d8a072", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:34.858197Z", - "iopub.status.busy": "2025-12-19T10:23:34.817039Z", - "iopub.status.idle": "2025-12-19T10:23:35.335737Z", - "shell.execute_reply": "2025-12-19T10:23:35.333547Z" - }, - "papermill": { - "duration": 0.52329, - "end_time": "2025-12-19T10:23:35.338288", - "exception": false, - "start_time": "2025-12-19T10:23:34.814998", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, \"SNT_config.json\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c712ac02", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:35.342494Z", - "iopub.status.busy": "2025-12-19T10:23:35.340803Z", - "iopub.status.idle": "2025-12-19T10:23:35.366376Z", - "shell.execute_reply": "2025-12-19T10:23:35.364165Z" - }, - "papermill": { - "duration": 0.030446, - "end_time": "2025-12-19T10:23:35.368977", - "exception": false, - "start_time": "2025-12-19T10:23:35.338531", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Configuration settings\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", - "ADMIN_2 <- 
toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", - "\n", - "REPORTING_RATE_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", - "DHIS2_FORMATTED_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "REPORTING_RATE_PRODUCT_UID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID # to add to plots subtitles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e02c652e", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:35.373316Z", - "iopub.status.busy": "2025-12-19T10:23:35.371377Z", - "iopub.status.idle": "2025-12-19T10:23:35.396646Z", - "shell.execute_reply": "2025-12-19T10:23:35.394442Z" - }, - "papermill": { - "duration": 0.029675, - "end_time": "2025-12-19T10:23:35.398945", - "exception": false, - "start_time": "2025-12-19T10:23:35.369270", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000249, + "end_time": "2025-12-19T10:23:27.548651", + "exception": false, + "start_time": "2025-12-19T10:23:27.548402", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 1. 
Setup" + ], + "id": "b79cba06" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:27.561213Z", + "iopub.status.busy": "2025-12-19T10:23:27.553197Z", + "iopub.status.idle": "2025-12-19T10:23:34.811467Z", + "shell.execute_reply": "2025-12-19T10:23:34.808478Z" + }, + "papermill": { + "duration": 7.265364, + "end_time": "2025-12-19T10:23:34.814448", + "exception": false, + "start_time": "2025-12-19T10:23:27.549084", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "SNT_ROOT_PATH <- \"~/workspace\"\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataset\")\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_dhis2_reporting_rate_dataset.r\"))\n", + "\n", + "report_packages <- c(\"arrow\", \"tidyverse\", \"sf\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", + "snt_environment <- get_setup_variables(SNT_ROOT_PATH = SNT_ROOT_PATH, packages = report_packages)\n", + "\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", + "CONFIG_PATH <- snt_environment$CONFIG_PATH\n", + "DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\")\n", + "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs\")\n", + "\n", + "source(file.path(CODE_PATH, \"snt_palettes.r\"))" + ], + "execution_count": null, + "outputs": [], + "id": "7ca65bcc" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000116, + "end_time": "2025-12-19T10:23:34.814852", + "exception": false, + "start_time": "2025-12-19T10:23:34.814736", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.1. 
Load and check `snt config` file" + ], + "id": "c5301aa3" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:34.858197Z", + "iopub.status.busy": "2025-12-19T10:23:34.817039Z", + "iopub.status.idle": "2025-12-19T10:23:35.335737Z", + "shell.execute_reply": "2025-12-19T10:23:35.333547Z" + }, + "papermill": { + "duration": 0.52329, + "end_time": "2025-12-19T10:23:35.338288", + "exception": false, + "start_time": "2025-12-19T10:23:34.814998", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "config_json <- load_snt_config(file.path(CONFIG_PATH, \"SNT_config.json\"))" + ], + "execution_count": null, + "outputs": [], + "id": "76d8a072" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:35.342494Z", + "iopub.status.busy": "2025-12-19T10:23:35.340803Z", + "iopub.status.idle": "2025-12-19T10:23:35.366376Z", + "shell.execute_reply": "2025-12-19T10:23:35.364165Z" + }, + "papermill": { + "duration": 0.030446, + "end_time": "2025-12-19T10:23:35.368977", + "exception": false, + "start_time": "2025-12-19T10:23:35.338531", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Configuration settings\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "\n", + "REPORTING_RATE_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", + "DHIS2_FORMATTED_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "REPORTING_RATE_PRODUCT_UID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID # to add to plots subtitles" + ], + "execution_count": null, + "outputs": [], + "id": "c712ac02" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + 
"iopub.execute_input": "2025-12-19T10:23:35.373316Z", + "iopub.status.busy": "2025-12-19T10:23:35.371377Z", + "iopub.status.idle": "2025-12-19T10:23:35.396646Z", + "shell.execute_reply": "2025-12-19T10:23:35.394442Z" + }, + "papermill": { + "duration": 0.029675, + "end_time": "2025-12-19T10:23:35.398945", + "exception": false, + "start_time": "2025-12-19T10:23:35.369270", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Make string of product uids for plot subtitles\n", + "rr_product_uid <-paste(REPORTING_RATE_PRODUCT_UID,collapse = \", \") \n", + "rr_product_uid" + ], + "execution_count": null, + "outputs": [], + "id": "e02c652e" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000094, + "end_time": "2025-12-19T10:23:35.399231", + "exception": false, + "start_time": "2025-12-19T10:23:35.399137", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 1.2. Load and check `snt metadata` file\n", + "This is needed for the correct use of palettes and categories (breaks, or scale)" + ], + "id": "30b058f4" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:35.403224Z", + "iopub.status.busy": "2025-12-19T10:23:35.401458Z", + "iopub.status.idle": "2025-12-19T10:23:36.335964Z", + "shell.execute_reply": "2025-12-19T10:23:36.330643Z" + }, + "papermill": { + "duration": 0.940593, + "end_time": "2025-12-19T10:23:36.339927", + "exception": false, + "start_time": "2025-12-19T10:23:35.399334", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load SNT metadata\n", + "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", + " error = function(e) {\n", + " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "log_msg(paste0(\"SNT metadata 
loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" + ], + "execution_count": null, + "outputs": [], + "id": "98a8ee49" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:36.357945Z", + "iopub.status.busy": "2025-12-19T10:23:36.343228Z", + "iopub.status.idle": "2025-12-19T10:23:36.535579Z", + "shell.execute_reply": "2025-12-19T10:23:36.533231Z" + }, + "papermill": { + "duration": 0.198107, + "end_time": "2025-12-19T10:23:36.538224", + "exception": false, + "start_time": "2025-12-19T10:23:36.340117", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "scale_raw <- metadata_json$REPORTING_RATE$SCALE\n", + "break_vals <- if (is.character(scale_raw) && length(scale_raw) == 1) {\n", + " jsonlite::fromJSON(scale_raw)\n", + "} else {\n", + " as.numeric(unlist(scale_raw, use.names = FALSE))\n", + "}\n", + "\n", + "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))" + ], + "execution_count": null, + "outputs": [], + "id": "00681217" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000162, + "end_time": "2025-12-19T10:23:36.538638", + "exception": false, + "start_time": "2025-12-19T10:23:36.538476", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 2. Load Data" + ], + "id": "f3470564" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000126, + "end_time": "2025-12-19T10:23:36.538947", + "exception": false, + "start_time": "2025-12-19T10:23:36.538821", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 2.1. 
Output of pipeline notebook\n", + "Import file named `{COUNTRY_CODE}_reporting_rate_dataset.parquet` from **OH Dataset** \"SNT_DHIS2_REPORTING_RATE\" (as in `config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE`)" + ], + "id": "82397307" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:36.543564Z", + "iopub.status.busy": "2025-12-19T10:23:36.541311Z", + "iopub.status.idle": "2025-12-19T10:23:37.788619Z", + "shell.execute_reply": "2025-12-19T10:23:37.785121Z" + }, + "papermill": { + "duration": 1.253125, + "end_time": "2025-12-19T10:23:37.792249", + "exception": false, + "start_time": "2025-12-19T10:23:36.539124", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "\n", + "reporting_rate_dataset <- tryCatch({ get_latest_dataset_file_in_memory(REPORTING_RATE_DATASET_NAME, glue::glue(\"{COUNTRY_CODE}_reporting_rate_dataset.parquet\")) }, \n", + " error = function(e) {\n", + " msg <- paste(\"Error while loading Reporting Rate (Dataset) data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", + " cat(msg)\n", + " stop(msg)\n", + "})\n", + "\n", + "# log\n", + "log_msg(glue::glue(\"Data file loaded from dataset : {REPORTING_RATE_DATASET_NAME} dataframe dimensions: {paste(dim(reporting_rate_dataset), collapse=', ')}\"))\n", + "dim(reporting_rate_dataset)\n", + "head(reporting_rate_dataset, 2)" + ], + "execution_count": null, + "outputs": [], + "id": "70acb2c5" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000091, + "end_time": "2025-12-19T10:23:37.792528", + "exception": false, + "start_time": "2025-12-19T10:23:37.792437", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 2.2. 
Shapes\n", + "To make choropleth (map)" + ], + "id": "48833515" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:37.798194Z", + "iopub.status.busy": "2025-12-19T10:23:37.795402Z", + "iopub.status.idle": "2025-12-19T10:23:41.325848Z", + "shell.execute_reply": "2025-12-19T10:23:41.323895Z" + }, + "papermill": { + "duration": 3.535554, + "end_time": "2025-12-19T10:23:41.328226", + "exception": false, + "start_time": "2025-12-19T10:23:37.792672", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", + " error = function(e) { \n", + " msg <- paste0(COUNTRY_CODE , \" Shapes data is not available in dataset: '\" , DHIS2_FORMATTED_DATASET_NAME, \"' last version.\")\n", + " log_msg(msg, \"warning\")\n", + " shapes <- NULL\n", + " })\n", + "\n", + "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. \\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", + "names(shapes)" + ], + "execution_count": null, + "outputs": [], + "id": "3febd4f4" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000166, + "end_time": "2025-12-19T10:23:41.328651", + "exception": false, + "start_time": "2025-12-19T10:23:41.328485", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### 3. Plots" + ], + "id": "17067d56" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2025-12-19T10:23:41.328959", + "exception": false, + "start_time": "2025-12-19T10:23:41.328850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.0. 
Add shapes" + ], + "id": "9a6369ee" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.333105Z", + "iopub.status.busy": "2025-12-19T10:23:41.331427Z", + "iopub.status.idle": "2025-12-19T10:23:41.365417Z", + "shell.execute_reply": "2025-12-19T10:23:41.363294Z" + }, + "papermill": { + "duration": 0.03905, + "end_time": "2025-12-19T10:23:41.368213", + "exception": false, + "start_time": "2025-12-19T10:23:41.329163", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Join shapes to reporting rate data\n", + "\n", + "data_to_plot <- reporting_rate_dataset %>%\n", + " left_join(shapes, by = c(\"ADM2_ID\"))" + ], + "execution_count": null, + "outputs": [], + "id": "c6641720" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000195, + "end_time": "2025-12-19T10:23:41.368739", + "exception": false, + "start_time": "2025-12-19T10:23:41.368544", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.1. 🎨 Dynamic categories and color assignment" + ], + "id": "0b0d32f1" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000109, + "end_time": "2025-12-19T10:23:41.369057", + "exception": false, + "start_time": "2025-12-19T10:23:41.368948", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 1. 
Define breaks and labels" + ], + "id": "cc765e0c" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.373558Z", + "iopub.status.busy": "2025-12-19T10:23:41.371555Z", + "iopub.status.idle": "2025-12-19T10:23:41.392950Z", + "shell.execute_reply": "2025-12-19T10:23:41.390333Z" + }, + "papermill": { + "duration": 0.026996, + "end_time": "2025-12-19T10:23:41.396238", + "exception": false, + "start_time": "2025-12-19T10:23:41.369242", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Safety code to avoid breaking if nothing is found in metadata_json\n", + "if (is.null(break_vals) || length(break_vals) == 0) {\n", + " log_msg(\"[WARNING] No break values found in SNT_metadata.json for REPORTING_RATE$SCALE. Using default values.\", \"warning\")\n", + " break_vals <- c(0.5, 0.8, 0.9, 0.95, 1.00)\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "2e79132c" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.401034Z", + "iopub.status.busy": "2025-12-19T10:23:41.398849Z", + "iopub.status.idle": "2025-12-19T10:23:41.430720Z", + "shell.execute_reply": "2025-12-19T10:23:41.428238Z" + }, + "papermill": { + "duration": 0.037712, + "end_time": "2025-12-19T10:23:41.434131", + "exception": false, + "start_time": "2025-12-19T10:23:41.396181", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# 1. Define breaks\n", + "# Note: assumes that the data starts at 0!\n", + "# break_vals <- metadata_json$REPORTING_RATE$SCALE # moved upstream\n", + "\n", + "# 2. Create the full set of cut points (0 to Infinity)\n", + "full_breaks <- c(0, break_vals, Inf)\n", + "\n", + "# 3. 
Create dynamic labels\n", + "labels <- c(\n", + " paste0(\"< \", break_vals[1]), # First label\n", + " paste0(break_vals[-length(break_vals)], \" - \", break_vals[-1]), # Middle labels\n", + " paste0(\"> \", break_vals[length(break_vals)]) # Last label\n", + ")\n", + "\n", + "# Check\n", + "labels" + ], + "execution_count": null, + "outputs": [], + "id": "f04cb888" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000102, + "end_time": "2025-12-19T10:23:41.434442", + "exception": false, + "start_time": "2025-12-19T10:23:41.434340", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 2. Create `_CATEGORY` col" + ], + "id": "cb237801" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.439376Z", + "iopub.status.busy": "2025-12-19T10:23:41.437165Z", + "iopub.status.idle": "2025-12-19T10:23:41.471891Z", + "shell.execute_reply": "2025-12-19T10:23:41.469251Z" + }, + "papermill": { + "duration": 0.040632, + "end_time": "2025-12-19T10:23:41.475176", + "exception": false, + "start_time": "2025-12-19T10:23:41.434544", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# reporting_rate_dataset <- reporting_rate_dataset %>%\n", + "data_to_plot <- data_to_plot %>%\n", + " mutate(\n", + " REPORTING_RATE_CATEGORY = cut(\n", + " REPORTING_RATE,\n", + " breaks = full_breaks,\n", + " labels = labels,\n", + " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", + " include.lowest = TRUE\n", + " )\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "f8303488" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000102, + "end_time": "2025-12-19T10:23:41.475483", + "exception": false, + "start_time": "2025-12-19T10:23:41.475381", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3. 
Pick appropriate palette" + ], + "id": "a10237f8" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.480216Z", + "iopub.status.busy": "2025-12-19T10:23:41.478061Z", + "iopub.status.idle": "2025-12-19T10:23:41.513805Z", + "shell.execute_reply": "2025-12-19T10:23:41.511268Z" + }, + "papermill": { + "duration": 0.04138, + "end_time": "2025-12-19T10:23:41.516984", + "exception": false, + "start_time": "2025-12-19T10:23:41.475604", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Count nr of breaks\n", + "nr_of_colors <- length(labels)\n", + "\n", + "# nr_of_colors\n", + "palette_to_use <- get_range_from_count(nr_of_colors)\n", + "\n", + "# Need to make palettes as named vectors so that scale_color_manual() and scale_fill_manual() can use them properly\n", + "# Note: need to reverse order of labels to match the palette order \"meaning\" (red \"\" should correspond to lowest value)\n", + "names(palette_to_use) <- rev(labels)\n", + "\n", + "palette_to_use\n" + ], + "execution_count": null, + "outputs": [], + "id": "2ee6e077" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000099, + "end_time": "2025-12-19T10:23:41.517267", + "exception": false, + "start_time": "2025-12-19T10:23:41.517168", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### 3.2. Plots" + ], + "id": "d08c0c14" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000056, + "end_time": "2025-12-19T10:23:41.517425", + "exception": false, + "start_time": "2025-12-19T10:23:41.517369", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.1 Scatter plot of RR over time (by ADM2)\n", + "With this we can see the actual numbers (although we cannot tell which ADM2 have low values)."
+ ], + "id": "b7781198" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:41.522513Z", + "iopub.status.busy": "2025-12-19T10:23:41.520272Z", + "iopub.status.idle": "2025-12-19T10:23:42.935181Z", + "shell.execute_reply": "2025-12-19T10:23:42.932661Z" + }, + "papermill": { + "duration": 1.456494, + "end_time": "2025-12-19T10:23:42.974012", + "exception": false, + "start_time": "2025-12-19T10:23:41.517518", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Line point plot faceted by YEAR\n", + "ggplot(data = data_to_plot) +\n", + " geom_line(aes(x = MONTH,\n", + " y = REPORTING_RATE,\n", + " group = ADM2_ID,\n", + " color = REPORTING_RATE_CATEGORY), \n", + " alpha = 0.3,\n", + " show.legend = FALSE\n", + " ) +\n", + " geom_point(aes(x = MONTH,\n", + " y = REPORTING_RATE,\n", + " group = ADM2_ID,\n", + " color = REPORTING_RATE_CATEGORY)) + \n", + " facet_grid(~YEAR) + \n", + " scale_color_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\",\n", + " name = \"Reporting Rate Categories\"\n", + " ) +\n", + " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", + " scale_y_continuous(\n", + " breaks = c(0, break_vals), # 🎨 NEW dynamic colors & breaks!\n", + " # Dynamically set max value to fit actual data (do show values >1 if present)\n", + " limits = c(0, max(data_to_plot$REPORTING_RATE, na.rm = TRUE) + 0.1)\n", + " ) +\n", + " labs(\n", + " title = \"Reporting Rate (Dataset)\",\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " x = \"Month\",\n", + " y = \"Reporting Rate\\n(Dataset)\" ) +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", + " legend.position = \"none\",\n", + " legend.title = element_blank(),\n", + " # legend.key.width = unit(3, \"cm\"),\n", + " # legend.key.height = unit(0.25, \"cm\"),\n", + " axis.title.y = 
element_blank(),\n", + " panel.grid.minor = element_blank(),\n", + " panel.grid.major.x = element_blank(),\n", + " strip.placement = \"outside\",\n", + " strip.text = element_text(face = \"bold\", size = 10)\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "78d92e4a" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:42.978498Z", + "iopub.status.busy": "2025-12-19T10:23:42.976659Z", + "iopub.status.idle": "2025-12-19T10:23:44.087244Z", + "shell.execute_reply": "2025-12-19T10:23:44.085182Z" + }, + "papermill": { + "duration": 1.11568, + "end_time": "2025-12-19T10:23:44.089891", + "exception": false, + "start_time": "2025-12-19T10:23:42.974211", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Export plot as PNG\n", + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_linepoint_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " height = 15,\n", + " width = 45,\n", + " units = \"cm\",\n", + " bg = \"white\",\n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (linepoint) saved to: {file.path(output_location, output_filename)}\"))" + ], + "execution_count": null, + "outputs": [], + "id": "1f47064a" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000147, + "end_time": "2025-12-19T10:23:44.090320", + "exception": false, + "start_time": "2025-12-19T10:23:44.090173", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.2 Heatmap plot of RR over time (by ADM2)\n", + "This is less good for identifying actual values, but allows to see which ADM2 have lower values." 
+ ], + "id": "22bb6431" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:44.094508Z", + "iopub.status.busy": "2025-12-19T10:23:44.092577Z", + "iopub.status.idle": "2025-12-19T10:23:46.262550Z", + "shell.execute_reply": "2025-12-19T10:23:46.259633Z" + }, + "papermill": { + "duration": 2.21647, + "end_time": "2025-12-19T10:23:46.306927", + "exception": false, + "start_time": "2025-12-19T10:23:44.090457", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Tile plot faceted by YEAR\n", + "ggplot(data = data_to_plot) +\n", + " geom_tile(aes(x = MONTH,\n", + " y = fct_rev(ADM2_NAME),\n", + " fill = REPORTING_RATE_CATEGORY), \n", + " color = \"white\",\n", + " show.legend = TRUE\n", + " ) +\n", + " scale_fill_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\",\n", + " name = \"Reporting Rate: \"\n", + " ) +\n", + " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", + " labs(\n", + " title = \"Reporting Rate (Dataset)\",\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " x = \"Month\"\n", + " ) +\n", + " facet_grid(rows = vars(ADM1_NAME), cols = vars(YEAR), \n", + " scales = \"free_y\", space = \"free_y\",\n", + " switch = \"y\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " legend.key.height = unit(0.25, \"cm\"),\n", + " axis.text.x = element_text(size = 7),\n", + " axis.title.y = element_blank(),\n", + " panel.grid.minor = element_blank(),\n", + " panel.grid.major = element_blank(),\n", + " strip.placement = \"outside\", \n", + " strip.text = element_text(face = \"bold\", size = 10)\n", + " ) +\n", + " guides(fill = guide_legend(nrow = 1))" + ], + "execution_count": null, + "outputs": [], + "id": "f2445f2a" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + 
"iopub.execute_input": "2025-12-19T10:23:46.311134Z", + "iopub.status.busy": "2025-12-19T10:23:46.309412Z", + "iopub.status.idle": "2025-12-19T10:23:48.286664Z", + "shell.execute_reply": "2025-12-19T10:23:48.284571Z" + }, + "papermill": { + "duration": 1.982105, + "end_time": "2025-12-19T10:23:48.289215", + "exception": false, + "start_time": "2025-12-19T10:23:46.307110", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Export plot as PNG\n", + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_heatmap_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " width = 20, height = 30, units = \"cm\", \n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (heatmap) saved to: {file.path(output_location, output_filename)}\"))" + ], + "execution_count": null, + "outputs": [], + "id": "cbe73312" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000164, + "end_time": "2025-12-19T10:23:48.289656", + "exception": false, + "start_time": "2025-12-19T10:23:48.289492", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.3. 
MAP of Reporting Rate - by month" + ], + "id": "3eef141a" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:48.294030Z", + "iopub.status.busy": "2025-12-19T10:23:48.292256Z", + "iopub.status.idle": "2025-12-19T10:23:53.205670Z", + "shell.execute_reply": "2025-12-19T10:23:53.203104Z" + }, + "papermill": { + "duration": 4.958481, + "end_time": "2025-12-19T10:23:53.248341", + "exception": false, + "start_time": "2025-12-19T10:23:48.289860", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Choropleth map with reporting rate data by ADM2\n", + "ggplot(data = data_to_plot) +\n", + " geom_sf(aes(\n", + " fill = REPORTING_RATE_CATEGORY,\n", + " geometry = geometry), \n", + " color = \"white\",\n", + " size = 0.01) +\n", + " scale_fill_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\",\n", + " ) +\n", + " theme_void() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " legend.title = element_blank(),\n", + " legend.key.height = unit(0.25, \"cm\")\n", + " ) +\n", + " labs(\n", + " title = paste(\"Reporting Rate (Dataset)\"),\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " ) +\n", + " facet_grid(\n", + " rows = vars(YEAR), \n", + " cols = vars(MONTH),\n", + " switch = \"both\") +\n", + " guides(fill = guide_legend(nrow = 1))" + ], + "execution_count": null, + "outputs": [], + "id": "83be9c68" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:53.252696Z", + "iopub.status.busy": "2025-12-19T10:23:53.250972Z", + "iopub.status.idle": "2025-12-19T10:23:56.748868Z", + "shell.execute_reply": "2025-12-19T10:23:56.746990Z" + }, + "papermill": { + "duration": 3.502689, + "end_time": "2025-12-19T10:23:56.751218", + "exception": false, + "start_time": 
"2025-12-19T10:23:53.248529", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " width = 50, height = 20, units = \"cm\", \n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" + ], + "execution_count": null, + "outputs": [], + "id": "e877671d" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000166, + "end_time": "2025-12-19T10:23:56.751636", + "exception": false, + "start_time": "2025-12-19T10:23:56.751470", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "##### 3.2.4. 
MAP of Reporting Rate - by YEAR\n", + "Use average (`mean()`) of monthly values" + ], + "id": "f0894be9" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:56.755998Z", + "iopub.status.busy": "2025-12-19T10:23:56.753982Z", + "iopub.status.idle": "2025-12-19T10:23:56.788391Z", + "shell.execute_reply": "2025-12-19T10:23:56.786447Z" + }, + "papermill": { + "duration": 0.039325, + "end_time": "2025-12-19T10:23:56.791143", + "exception": false, + "start_time": "2025-12-19T10:23:56.751818", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "data_to_plot_year <- data_to_plot %>%\n", + " group_by(geometry, ADM2_ID, ADM2_NAME, ADM1_NAME, YEAR) %>%\n", + " summarise(\n", + " REPORTING_RATE = mean(REPORTING_RATE, na.rm = TRUE),\n", + " .groups = \"drop\"\n", + " ) %>%\n", + " # Calculate REPORTING_RATE_CATEGORY again based on the yearly average\n", + " mutate(\n", + " REPORTING_RATE_CATEGORY = cut(\n", + " REPORTING_RATE,\n", + " breaks = full_breaks,\n", + " labels = labels,\n", + " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", + " include.lowest = TRUE\n", + " )\n", + " )" + ], + "execution_count": null, + "outputs": [], + "id": "cb1995ab" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:56.795010Z", + "iopub.status.busy": "2025-12-19T10:23:56.793453Z", + "iopub.status.idle": "2025-12-19T10:23:57.582261Z", + "shell.execute_reply": "2025-12-19T10:23:57.579294Z" + }, + "papermill": { + "duration": 0.798686, + "end_time": "2025-12-19T10:23:57.590023", + "exception": false, + "start_time": "2025-12-19T10:23:56.791337", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Choropleth map with reporting rate data by ADM2\n", + "ggplot(data = data_to_plot_year) +\n", + " geom_sf(aes(\n", + " fill = REPORTING_RATE_CATEGORY,\n", + " geometry = 
geometry), \n", + " color = \"white\",\n", + " size = 0.01) +\n", + " scale_fill_manual(\n", + " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", + " na.value = \"white\"\n", + " ) +\n", + " theme_void() +\n", + " theme(\n", + " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", + " legend.position = \"bottom\",\n", + " ) +\n", + " labs(\n", + " title = \"Reporting Rate (Dataset) - mean per Year\",\n", + " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", + " fill = \"Reporting Rate: \"\n", + " ) +\n", + " facet_grid(\n", + " cols = vars(YEAR)\n", + " ) +\n", + " guides(fill = guide_legend(nrow = 1))" + ], + "execution_count": null, + "outputs": [], + "id": "bd32b0cf" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:57.594096Z", + "iopub.status.busy": "2025-12-19T10:23:57.592357Z", + "iopub.status.idle": "2025-12-19T10:23:58.516754Z", + "shell.execute_reply": "2025-12-19T10:23:58.514785Z" + }, + "papermill": { + "duration": 0.928933, + "end_time": "2025-12-19T10:23:58.519148", + "exception": false, + "start_time": "2025-12-19T10:23:57.590215", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_year_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", + "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", + "\n", + "ggsave(\n", + " filename = output_filename, \n", + " path = output_location, \n", + " create.dir = TRUE,\n", + " width = 31, height = 13, units = \"cm\", \n", + " dpi = 200\n", + " )\n", + "\n", + "# Add log message\n", + "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" + ], + "execution_count": null, + "outputs": [], + "id": "0430641e" + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.000126, + "end_time": 
"2025-12-19T10:23:58.519515", + "exception": false, + "start_time": "2025-12-19T10:23:58.519389", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#### The End :)" + ], + "id": "8c3bdca4" + }, + { + "cell_type": "code", + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-19T10:23:58.523680Z", + "iopub.status.busy": "2025-12-19T10:23:58.522024Z", + "iopub.status.idle": "2025-12-19T10:23:58.733860Z", + "shell.execute_reply": "2025-12-19T10:23:58.731929Z" + }, + "papermill": { + "duration": 0.216448, + "end_time": "2025-12-19T10:23:58.736160", + "exception": false, + "start_time": "2025-12-19T10:23:58.519712", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "source": [ + "log_msg(\"Reporting Rate (Dataset) report notebook completed successfully!\")" + ], + "execution_count": null, + "outputs": [], + "id": "f8a62ec5" } - }, - "outputs": [], - "source": [ - "# Make string of product uids for plot subtitles\n", - "rr_product_uid <-paste(REPORTING_RATE_PRODUCT_UID,collapse = \", \") \n", - "rr_product_uid" - ] - }, - { - "cell_type": "markdown", - "id": "30b058f4", - "metadata": { - "papermill": { - "duration": 0.000094, - "end_time": "2025-12-19T10:23:35.399231", - "exception": false, - "start_time": "2025-12-19T10:23:35.399137", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 1.2. 
Load and check `snt metadata` file\n", - "This is needed for the correct use of palettes and categories (breaks, or scale)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98a8ee49", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:35.403224Z", - "iopub.status.busy": "2025-12-19T10:23:35.401458Z", - "iopub.status.idle": "2025-12-19T10:23:36.335964Z", - "shell.execute_reply": "2025-12-19T10:23:36.330643Z" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" }, "papermill": { - "duration": 0.940593, - "end_time": "2025-12-19T10:23:36.339927", - "exception": false, - "start_time": "2025-12-19T10:23:35.399334", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" + "default_parameters": {}, + "duration": 32.950872, + "end_time": "2025-12-19T10:23:59.058917", + "environment_variables": {}, + "exception": null, + "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb", + "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs/snt_dhis2_reporting_rate_dataset_report_OUTPUT_2025-12-19_102325.ipynb", + "parameters": {}, + "start_time": "2025-12-19T10:23:26.108045", + "version": "2.6.0" } - }, - "outputs": [], - "source": [ - "# Load SNT metadata\n", - "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "00681217", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:36.357945Z", - "iopub.status.busy": "2025-12-19T10:23:36.343228Z", - "iopub.status.idle": "2025-12-19T10:23:36.535579Z", - "shell.execute_reply": "2025-12-19T10:23:36.533231Z" - }, - "papermill": { - "duration": 0.198107, - "end_time": "2025-12-19T10:23:36.538224", - "exception": false, - "start_time": "2025-12-19T10:23:36.340117", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "scale_raw <- metadata_json$REPORTING_RATE$SCALE\n", - "break_vals <- if (is.character(scale_raw) && length(scale_raw) == 1) {\n", - " jsonlite::fromJSON(scale_raw)\n", - "} else {\n", - " as.numeric(unlist(scale_raw, use.names = FALSE))\n", - "}\n", - "\n", - "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json : \", paste(break_vals, collapse = \", \")))" - ] - }, - { - "cell_type": "markdown", - "id": "f3470564", - "metadata": { - "papermill": { - "duration": 0.000162, - "end_time": "2025-12-19T10:23:36.538638", - "exception": false, - "start_time": "2025-12-19T10:23:36.538476", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 2. Load Data" - ] - }, - { - "cell_type": "markdown", - "id": "82397307", - "metadata": { - "papermill": { - "duration": 0.000126, - "end_time": "2025-12-19T10:23:36.538947", - "exception": false, - "start_time": "2025-12-19T10:23:36.538821", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 2.1. 
Output of pipeline notebook\n", - "Import file named `{COUNTRY_CODE}_reporting_rate_dataset.parquet` from **OH Dataset** \"SNT_DHIS2_REPORTING_RATE\" (as in `config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70acb2c5", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:36.543564Z", - "iopub.status.busy": "2025-12-19T10:23:36.541311Z", - "iopub.status.idle": "2025-12-19T10:23:37.788619Z", - "shell.execute_reply": "2025-12-19T10:23:37.785121Z" - }, - "papermill": { - "duration": 1.253125, - "end_time": "2025-12-19T10:23:37.792249", - "exception": false, - "start_time": "2025-12-19T10:23:36.539124", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "\n", - "reporting_rate_dataset <- tryCatch({ get_latest_dataset_file_in_memory(REPORTING_RATE_DATASET_NAME, glue::glue(\"{COUNTRY_CODE}_reporting_rate_dataset.parquet\")) }, \n", - " error = function(e) {\n", - " msg <- paste(\"Error while loading Reporting Rate (Dataset) data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", - "\n", - "# log\n", - "log_msg(glue::glue(\"Data file loaded from dataset : {REPORTING_RATE_DATASET_NAME} dataframe dimensions: {paste(dim(reporting_rate_dataset), collapse=', ')}\"))\n", - "dim(reporting_rate_dataset)\n", - "head(reporting_rate_dataset, 2)" - ] - }, - { - "cell_type": "markdown", - "id": "48833515", - "metadata": { - "papermill": { - "duration": 0.000091, - "end_time": "2025-12-19T10:23:37.792528", - "exception": false, - "start_time": "2025-12-19T10:23:37.792437", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 2.2. 
Shapes\n", - "To make choropleth (map)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3febd4f4", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:37.798194Z", - "iopub.status.busy": "2025-12-19T10:23:37.795402Z", - "iopub.status.idle": "2025-12-19T10:23:41.325848Z", - "shell.execute_reply": "2025-12-19T10:23:41.323895Z" - }, - "papermill": { - "duration": 3.535554, - "end_time": "2025-12-19T10:23:41.328226", - "exception": false, - "start_time": "2025-12-19T10:23:37.792672", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) { \n", - " msg <- paste0(COUNTRY_CODE , \" Shapes data is not available in dataset: '\" , DHIS2_FORMATTED_DATASET_NAME, \"' last version.\")\n", - " log_msg(msg, \"warning\")\n", - " shapes <- NULL\n", - " })\n", - "\n", - "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. \\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", - "names(shapes)" - ] - }, - { - "cell_type": "markdown", - "id": "17067d56", - "metadata": { - "papermill": { - "duration": 0.000166, - "end_time": "2025-12-19T10:23:41.328651", - "exception": false, - "start_time": "2025-12-19T10:23:41.328485", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### 3. Plots" - ] - }, - { - "cell_type": "markdown", - "id": "9a6369ee", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2025-12-19T10:23:41.328959", - "exception": false, - "start_time": "2025-12-19T10:23:41.328850", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.0. 
Add shapes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6641720", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.333105Z", - "iopub.status.busy": "2025-12-19T10:23:41.331427Z", - "iopub.status.idle": "2025-12-19T10:23:41.365417Z", - "shell.execute_reply": "2025-12-19T10:23:41.363294Z" - }, - "papermill": { - "duration": 0.03905, - "end_time": "2025-12-19T10:23:41.368213", - "exception": false, - "start_time": "2025-12-19T10:23:41.329163", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Join shapes to reporting rate data\n", - "\n", - "data_to_plot <- reporting_rate_dataset %>%\n", - " left_join(shapes, by = c(\"ADM2_ID\"))" - ] - }, - { - "cell_type": "markdown", - "id": "0b0d32f1", - "metadata": { - "papermill": { - "duration": 0.000195, - "end_time": "2025-12-19T10:23:41.368739", - "exception": false, - "start_time": "2025-12-19T10:23:41.368544", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.1. 🎨 Dynamic categories and color assignement" - ] - }, - { - "cell_type": "markdown", - "id": "cc765e0c", - "metadata": { - "papermill": { - "duration": 0.000109, - "end_time": "2025-12-19T10:23:41.369057", - "exception": false, - "start_time": "2025-12-19T10:23:41.368948", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 1. 
Define breaks and labels" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e79132c", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.373558Z", - "iopub.status.busy": "2025-12-19T10:23:41.371555Z", - "iopub.status.idle": "2025-12-19T10:23:41.392950Z", - "shell.execute_reply": "2025-12-19T10:23:41.390333Z" - }, - "papermill": { - "duration": 0.026996, - "end_time": "2025-12-19T10:23:41.396238", - "exception": false, - "start_time": "2025-12-19T10:23:41.369242", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Safety code to avoid breaking if nothings is fund in json_metadata\n", - "if (is.null(break_vals) || length(break_vals) == 0) {\n", - " log_msg(\"[WARNING] No break values found in SNT_metadata.json for REPORTING_RATE$SCALE. Using default values.\", \"warning\")\n", - " break_vals <- c(0.5, 0.8, 0.9, 0.95, 1.00)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f04cb888", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.401034Z", - "iopub.status.busy": "2025-12-19T10:23:41.398849Z", - "iopub.status.idle": "2025-12-19T10:23:41.430720Z", - "shell.execute_reply": "2025-12-19T10:23:41.428238Z" - }, - "papermill": { - "duration": 0.037712, - "end_time": "2025-12-19T10:23:41.434131", - "exception": false, - "start_time": "2025-12-19T10:23:41.396419", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# 1. Define breaks\n", - "# Note: assumes that the data starts at 0!\n", - "# break_vals <- metadata_json$REPORTING_RATE$SCALE # moved upstream\n", - "\n", - "# 2. Create the full set of cut points (0 to Infinity)\n", - "full_breaks <- c(0, break_vals, Inf)\n", - "\n", - "# 3. 
Create dynamic labels\n", - "labels <- c(\n", - " paste0(\"< \", break_vals[1]), # First label\n", - " paste0(break_vals[-length(break_vals)], \" - \", break_vals[-1]), # Middle labels\n", - " paste0(\"> \", break_vals[length(break_vals)]) # Last label\n", - ")\n", - "\n", - "# Check\n", - "labels" - ] - }, - { - "cell_type": "markdown", - "id": "cb237801", - "metadata": { - "papermill": { - "duration": 0.000102, - "end_time": "2025-12-19T10:23:41.434442", - "exception": false, - "start_time": "2025-12-19T10:23:41.434340", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 2. Create `_CATEGORY` col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8303488", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.439376Z", - "iopub.status.busy": "2025-12-19T10:23:41.437165Z", - "iopub.status.idle": "2025-12-19T10:23:41.471891Z", - "shell.execute_reply": "2025-12-19T10:23:41.469251Z" - }, - "papermill": { - "duration": 0.040632, - "end_time": "2025-12-19T10:23:41.475176", - "exception": false, - "start_time": "2025-12-19T10:23:41.434544", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# reporting_rate_dataset <- reporting_rate_dataset %>%\n", - "data_to_plot <- data_to_plot %>%\n", - " mutate(\n", - " REPORTING_RATE_CATEGORY = cut(\n", - " REPORTING_RATE,\n", - " breaks = full_breaks,\n", - " labels = labels,\n", - " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", - " include.lowest = TRUE\n", - " )\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "a10237f8", - "metadata": { - "papermill": { - "duration": 0.000102, - "end_time": "2025-12-19T10:23:41.475483", - "exception": false, - "start_time": "2025-12-19T10:23:41.475381", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3. 
Pick appropriate palette" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ee6e077", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.480216Z", - "iopub.status.busy": "2025-12-19T10:23:41.478061Z", - "iopub.status.idle": "2025-12-19T10:23:41.513805Z", - "shell.execute_reply": "2025-12-19T10:23:41.511268Z" - }, - "papermill": { - "duration": 0.04138, - "end_time": "2025-12-19T10:23:41.516984", - "exception": false, - "start_time": "2025-12-19T10:23:41.475604", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Count nr of breaks\n", - "nr_of_colors <- length(labels)\n", - "\n", - "# nr_of_colors\n", - "palette_to_use <- get_range_from_count(nr_of_colors)\n", - "\n", - "# Need to make palettes as named vectors so that scale_color_manual() and scale_fill_manual() can use them properly\n", - "# Note: need to reverse order of labels to match the palette order \"meaning\" (red \"\" should correcpond to lowest value)\n", - "names(palette_to_use) <- rev(labels)\n", - "\n", - "palette_to_use\n" - ] - }, - { - "cell_type": "markdown", - "id": "d08c0c14", - "metadata": { - "papermill": { - "duration": 0.000099, - "end_time": "2025-12-19T10:23:41.517267", - "exception": false, - "start_time": "2025-12-19T10:23:41.517168", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### 3.2. Plots" - ] - }, - { - "cell_type": "markdown", - "id": "b7781198", - "metadata": { - "papermill": { - "duration": 0.000056, - "end_time": "2025-12-19T10:23:41.517425", - "exception": false, - "start_time": "2025-12-19T10:23:41.517369", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.1 Scatter plot of RR over time (by ADM2)\n", - "With this we can see the actula numbners (although cannot tell which ADM2 have low values)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78d92e4a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:41.522513Z", - "iopub.status.busy": "2025-12-19T10:23:41.520272Z", - "iopub.status.idle": "2025-12-19T10:23:42.935181Z", - "shell.execute_reply": "2025-12-19T10:23:42.932661Z" - }, - "papermill": { - "duration": 1.456494, - "end_time": "2025-12-19T10:23:42.974012", - "exception": false, - "start_time": "2025-12-19T10:23:41.517518", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Line point plot faceted by YEAR\n", - "ggplot(data = data_to_plot) +\n", - " geom_line(aes(x = MONTH,\n", - " y = REPORTING_RATE,\n", - " group = ADM2_ID,\n", - " color = REPORTING_RATE_CATEGORY), \n", - " alpha = 0.3,\n", - " show.legend = FALSE\n", - " ) +\n", - " geom_point(aes(x = MONTH,\n", - " y = REPORTING_RATE,\n", - " group = ADM2_ID,\n", - " color = REPORTING_RATE_CATEGORY)) + \n", - " facet_grid(~YEAR) + \n", - " scale_color_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\",\n", - " name = \"Reporting Rate Categories\"\n", - " ) +\n", - " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", - " scale_y_continuous(\n", - " breaks = c(0, break_vals), # 🎨 NEW dynamic colors & breaks!\n", - " # Dynamically set max value to fit actual data (do show values >1 if present)\n", - " limits = c(0, max(data_to_plot$REPORTING_RATE, na.rm = TRUE) + 0.1)\n", - " ) +\n", - " labs(\n", - " title = \"Reporting Rate (Dataset)\",\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " x = \"Month\",\n", - " y = \"Reporting Rate\\n(Dataset)\" ) +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", - " legend.position = \"none\",\n", - " legend.title = element_blank(),\n", - " # legend.key.width = unit(3, \"cm\"),\n", - " # legend.key.height = unit(0.25, 
\"cm\"),\n", - " axis.title.y = element_blank(),\n", - " panel.grid.minor = element_blank(),\n", - " panel.grid.major.x = element_blank(),\n", - " strip.placement = \"outside\",\n", - " strip.text = element_text(face = \"bold\", size = 10)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f47064a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:42.978498Z", - "iopub.status.busy": "2025-12-19T10:23:42.976659Z", - "iopub.status.idle": "2025-12-19T10:23:44.087244Z", - "shell.execute_reply": "2025-12-19T10:23:44.085182Z" - }, - "papermill": { - "duration": 1.11568, - "end_time": "2025-12-19T10:23:44.089891", - "exception": false, - "start_time": "2025-12-19T10:23:42.974211", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Export plot as PNG\n", - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_linepoint_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " height = 15,\n", - " width = 45,\n", - " units = \"cm\",\n", - " bg = \"white\",\n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (linepoint) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "22bb6431", - "metadata": { - "papermill": { - "duration": 0.000147, - "end_time": "2025-12-19T10:23:44.090320", - "exception": false, - "start_time": "2025-12-19T10:23:44.090173", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.2 Heatmap plot of RR over time (by ADM2)\n", - "This is less good for identifying actual values, but allows to see which ADM2 have lower values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f2445f2a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:44.094508Z", - "iopub.status.busy": "2025-12-19T10:23:44.092577Z", - "iopub.status.idle": "2025-12-19T10:23:46.262550Z", - "shell.execute_reply": "2025-12-19T10:23:46.259633Z" - }, - "papermill": { - "duration": 2.21647, - "end_time": "2025-12-19T10:23:46.306927", - "exception": false, - "start_time": "2025-12-19T10:23:44.090457", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Tile plot faceted by YEAR\n", - "ggplot(data = data_to_plot) +\n", - " geom_tile(aes(x = MONTH,\n", - " y = fct_rev(ADM2_NAME),\n", - " fill = REPORTING_RATE_CATEGORY), \n", - " color = \"white\",\n", - " show.legend = TRUE\n", - " ) +\n", - " scale_fill_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\",\n", - " name = \"Reporting Rate: \"\n", - " ) +\n", - " scale_x_continuous(breaks = seq(1, 12, 1)) +\n", - " labs(\n", - " title = \"Reporting Rate (Dataset)\",\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " x = \"Month\"\n", - " ) +\n", - " facet_grid(rows = vars(ADM1_NAME), cols = vars(YEAR), \n", - " scales = \"free_y\", space = \"free_y\",\n", - " switch = \"y\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(0,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " legend.key.height = unit(0.25, \"cm\"),\n", - " axis.text.x = element_text(size = 7),\n", - " axis.title.y = element_blank(),\n", - " panel.grid.minor = element_blank(),\n", - " panel.grid.major = element_blank(),\n", - " strip.placement = \"outside\", \n", - " strip.text = element_text(face = \"bold\", size = 10)\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cbe73312", - "metadata": { - 
"execution": { - "iopub.execute_input": "2025-12-19T10:23:46.311134Z", - "iopub.status.busy": "2025-12-19T10:23:46.309412Z", - "iopub.status.idle": "2025-12-19T10:23:48.286664Z", - "shell.execute_reply": "2025-12-19T10:23:48.284571Z" - }, - "papermill": { - "duration": 1.982105, - "end_time": "2025-12-19T10:23:48.289215", - "exception": false, - "start_time": "2025-12-19T10:23:46.307110", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Export plot as PNG\n", - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_heatmap_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " width = 20, height = 30, units = \"cm\", \n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (heatmap) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3eef141a", - "metadata": { - "papermill": { - "duration": 0.000164, - "end_time": "2025-12-19T10:23:48.289656", - "exception": false, - "start_time": "2025-12-19T10:23:48.289492", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.3. 
MAP of Reporting Rate - by month" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83be9c68", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:48.294030Z", - "iopub.status.busy": "2025-12-19T10:23:48.292256Z", - "iopub.status.idle": "2025-12-19T10:23:53.205670Z", - "shell.execute_reply": "2025-12-19T10:23:53.203104Z" - }, - "papermill": { - "duration": 4.958481, - "end_time": "2025-12-19T10:23:53.248341", - "exception": false, - "start_time": "2025-12-19T10:23:48.289860", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Choropleth map with reporting rate data by ADM2\n", - "ggplot(data = data_to_plot) +\n", - " geom_sf(aes(\n", - " fill = REPORTING_RATE_CATEGORY,\n", - " geometry = geometry), \n", - " color = \"white\",\n", - " size = 0.01) +\n", - " scale_fill_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\",\n", - " ) +\n", - " theme_void() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " legend.title = element_blank(),\n", - " legend.key.height = unit(0.25, \"cm\")\n", - " ) +\n", - " labs(\n", - " title = paste(\"Reporting Rate (Dataset)\"),\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " ) +\n", - " facet_grid(\n", - " rows = vars(YEAR), \n", - " cols = vars(MONTH),\n", - " switch = \"both\") +\n", - " guides(fill = guide_legend(nrow = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e877671d", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:53.252696Z", - "iopub.status.busy": "2025-12-19T10:23:53.250972Z", - "iopub.status.idle": "2025-12-19T10:23:56.748868Z", - "shell.execute_reply": "2025-12-19T10:23:56.746990Z" - }, - "papermill": { - "duration": 3.502689, - "end_time": "2025-12-19T10:23:56.751218", - "exception": false, - 
"start_time": "2025-12-19T10:23:53.248529", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " width = 50, height = 20, units = \"cm\", \n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "f0894be9", - "metadata": { - "papermill": { - "duration": 0.000166, - "end_time": "2025-12-19T10:23:56.751636", - "exception": false, - "start_time": "2025-12-19T10:23:56.751470", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "##### 3.2.4. 
MAP of Reporting Rate - by YEAR\n", - "Use average (`mean()`) of monthly values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb1995ab", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:56.755998Z", - "iopub.status.busy": "2025-12-19T10:23:56.753982Z", - "iopub.status.idle": "2025-12-19T10:23:56.788391Z", - "shell.execute_reply": "2025-12-19T10:23:56.786447Z" - }, - "papermill": { - "duration": 0.039325, - "end_time": "2025-12-19T10:23:56.791143", - "exception": false, - "start_time": "2025-12-19T10:23:56.751818", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "data_to_plot_year <- data_to_plot %>%\n", - " group_by(geometry, ADM2_ID, ADM2_NAME, ADM1_NAME, YEAR) %>%\n", - " summarise(\n", - " REPORTING_RATE = mean(REPORTING_RATE, na.rm = TRUE),\n", - " .groups = \"drop\"\n", - " ) %>%\n", - " # Calculate REPORTING_RATE_CATEGORY again based on the yearly average\n", - " mutate(\n", - " REPORTING_RATE_CATEGORY = cut(\n", - " REPORTING_RATE,\n", - " breaks = full_breaks,\n", - " labels = labels,\n", - " right = TRUE, # so that 1.00 is assigned to \"0.95 - 1.00\"\n", - " include.lowest = TRUE\n", - " )\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd32b0cf", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:56.795010Z", - "iopub.status.busy": "2025-12-19T10:23:56.793453Z", - "iopub.status.idle": "2025-12-19T10:23:57.582261Z", - "shell.execute_reply": "2025-12-19T10:23:57.579294Z" - }, - "papermill": { - "duration": 0.798686, - "end_time": "2025-12-19T10:23:57.590023", - "exception": false, - "start_time": "2025-12-19T10:23:56.791337", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Choropleth map with reporting rate data by ADM2\n", - "ggplot(data = data_to_plot_year) +\n", - " geom_sf(aes(\n", - " fill = 
REPORTING_RATE_CATEGORY,\n", - " geometry = geometry), \n", - " color = \"white\",\n", - " size = 0.01) +\n", - " scale_fill_manual(\n", - " values = palette_to_use, # 🎨 NEW dynamic colors & breaks!\n", - " na.value = \"white\"\n", - " ) +\n", - " theme_void() +\n", - " theme(\n", - " plot.subtitle = element_text(margin=margin(5,0,20,0)),\n", - " legend.position = \"bottom\",\n", - " ) +\n", - " labs(\n", - " title = \"Reporting Rate (Dataset) - mean per Year\",\n", - " subtitle = paste0(\"Product UID : \", rr_product_uid),\n", - " fill = \"Reporting Rate: \"\n", - " ) +\n", - " facet_grid(\n", - " cols = vars(YEAR)\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0430641e", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:57.594096Z", - "iopub.status.busy": "2025-12-19T10:23:57.592357Z", - "iopub.status.idle": "2025-12-19T10:23:58.516754Z", - "shell.execute_reply": "2025-12-19T10:23:58.514785Z" - }, - "papermill": { - "duration": 0.928933, - "end_time": "2025-12-19T10:23:58.519148", - "exception": false, - "start_time": "2025-12-19T10:23:57.590215", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "output_filename <- paste0(COUNTRY_CODE, \"_reporting_rate_dataset_adm2_map_year_\", paste(REPORTING_RATE_PRODUCT_UID, collapse = \"_\"), \".png\")\n", - "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", - "\n", - "ggsave(\n", - " filename = output_filename, \n", - " path = output_location, \n", - " create.dir = TRUE,\n", - " width = 31, height = 13, units = \"cm\", \n", - " dpi = 200\n", - " )\n", - "\n", - "# Add log message\n", - "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_filename)}\"))" - ] - }, - { - "cell_type": "markdown", - "id": "8c3bdca4", - "metadata": { - "papermill": { - "duration": 0.000126, - "end_time": 
"2025-12-19T10:23:58.519515", - "exception": false, - "start_time": "2025-12-19T10:23:58.519389", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "#### The End :)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8a62ec5", - "metadata": { - "execution": { - "iopub.execute_input": "2025-12-19T10:23:58.523680Z", - "iopub.status.busy": "2025-12-19T10:23:58.522024Z", - "iopub.status.idle": "2025-12-19T10:23:58.733860Z", - "shell.execute_reply": "2025-12-19T10:23:58.731929Z" - }, - "papermill": { - "duration": 0.216448, - "end_time": "2025-12-19T10:23:58.736160", - "exception": false, - "start_time": "2025-12-19T10:23:58.519712", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "log_msg(\"Reporting Rate (Dataset) report notebook completed successfully!\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" }, - "papermill": { - "default_parameters": {}, - "duration": 32.950872, - "end_time": "2025-12-19T10:23:59.058917", - "environment_variables": {}, - "exception": null, - "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/snt_dhis2_reporting_rate_dataset_report.ipynb", - "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataset/reporting/outputs/snt_dhis2_reporting_rate_dataset_report_OUTPUT_2025-12-19_102325.ipynb", - "parameters": {}, - "start_time": "2025-12-19T10:23:26.108045", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r 
b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r index 559e4fe..4aa744d 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r +++ b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r @@ -1,4 +1,6 @@ # Load base utils +# Same bootstrap pattern as reporting_rate_dataelement / formatting review: +# `load_dataset_file()`, `load_snt_config()`, `get_setup_variables()` + paths_to_check. source(file.path("~/workspace/code", "snt_utils.r")) @@ -30,6 +32,12 @@ get_setup_variables <- function( install_and_load(packages) + if (Sys.getenv("PROJ_LIB", "") == "") { + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + } + if (Sys.getenv("GDAL_DATA", "") == "") { + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + } Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") reticulate::py_config()$python assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv) @@ -58,6 +66,40 @@ load_snt_config <- function(snt_config_path) { } +#' Fail if Papermill did not inject `ROUTINE_FILE` and `DATASET_ID`. +#' +#' Kept as a named entry point so older notebooks that call this before other +#' setup keep working after utils refactors. +assert_papermill_reporting_rate_dataset_params <- function() { + required <- c("ROUTINE_FILE", "DATASET_ID") + missing <- required[!vapply(required, exists, logical(1), inherits = TRUE)] + if (length(missing) > 0) { + stop( + "[ERROR] Missing pipeline parameters (Papermill): ", + paste(missing, collapse = ", "), + ". Expected only ROUTINE_FILE and DATASET_ID from `snt_dhis2_reporting_rate_dataset`." + ) + } +} + + +#' Build globals used in the dataset reporting-rate notebook from `SNT_config.json`. +#' +#' Calls `assert_papermill_reporting_rate_dataset_params()` first (redundant if the +#' notebook already called it). 
+parse_reporting_rate_dataset_snt_settings <- function(config_json) { + assert_papermill_reporting_rate_dataset_params() + + list( + COUNTRY_CODE = config_json$SNT_CONFIG$COUNTRY_CODE, + ADMIN_1 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), + ADMIN_2 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), + REPORTING_RATE_PRODUCT_ID = config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID, + fixed_cols_rr = c("YEAR", "MONTH", "ADM2_ID", "REPORTING_RATE") + ) +} + + #' Load Dataset File from OpenHEXA #' Retrieves the latest version of a file from an OpenHEXA dataset. #' @@ -79,3 +121,18 @@ load_dataset_file <- function(dataset_id, filename) { log_msg(msg) return(data) } + + +#' Write CSV + Parquet under `/dhis2/reporting_rate/`. +write_reporting_rate_dataset_outputs <- function(reporting_rate_tbl, snt_environment, country_code) { + output_dir <- file.path(snt_environment$DATA_PATH, "dhis2", "reporting_rate") + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + base <- paste0(country_code, "_reporting_rate_dataset") + csv_path <- file.path(output_dir, paste0(base, ".csv")) + pq_path <- file.path(output_dir, paste0(base, ".parquet")) + utils::write.csv(reporting_rate_tbl, csv_path, row.names = FALSE) + log_msg(glue::glue("Exported: {csv_path}")) + arrow::write_parquet(reporting_rate_tbl, pq_path) + log_msg(glue::glue("Exported: {pq_path}")) + invisible(list(csv_path = csv_path, parquet_path = pq_path)) +} diff --git a/snt_dhis2_reporting_rate_dataset/pipeline.py b/snt_dhis2_reporting_rate_dataset/pipeline.py index ff440f2..8d15ce4 100644 --- a/snt_dhis2_reporting_rate_dataset/pipeline.py +++ b/snt_dhis2_reporting_rate_dataset/pipeline.py @@ -88,7 +88,6 @@ def snt_dhis2_reporting_rate_dataset( return nb_parameters = { - "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, "DATASET_ID": ds_outliers_id, } From a14fc9c3678cecfcbd8dd54b409bac561dcab78c Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 16 Apr 2026 11:56:43 +0200 
Subject: [PATCH 09/18] fix for moment --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 48 +++- ...s2_reporting_rate_dataelement_report.ipynb | 41 +--- .../snt_dhis2_reporting_rate_dataelement.r | 218 +++++++++++++----- .../snt_dhis2_reporting_rate_dataset.ipynb | 48 ++-- .../utils/snt_dhis2_reporting_rate_dataset.r | 115 +++++---- 5 files changed, 307 insertions(+), 163 deletions(-) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index c4b57f0..d152bbb 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -94,7 +94,20 @@ }, "source": [ "### 1.1. Pipeline parameters\n", - "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill (checked inside `parse_dataelement_snt_settings()`). Denominator, weighting, and indicator lists come from `SNT_config.json` (`SNT_CONFIG$REPORTING_RATE_DATAELEMENT`) when present; otherwise that function applies the documented defaults in the pipeline utils.\n" + "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill (validated in `build_dataelement_reporting_settings_from_config()`). 
Same layout as `snt_dhis2_population_transformation.ipynb`: define inputs in the notebook, then **Save variables** from config / parsed settings.\n" + ] + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-save-vars-md", + "metadata": { + "vscode": { + "languageId": "markdown" + } + }, + "source": [ + "#### Save variables\n", + "Indicator lists and remaining fields used downstream (mirrors the population notebook block that assigns `COUNTRY_CODE`, `ADMIN_1`, … from `config_json`).\n" ] }, { @@ -116,8 +129,27 @@ }, "outputs": [], "source": [ - "cx <- parse_dataelement_snt_settings(config_json)\n", - "list2env(cx, envir = .GlobalEnv)\n" + "# Routine column names for numerators (edit here; passed through to reporting-rate logic).\n", + "activity_indicators <- c(\"CONF\", \"PRES\", \"SUSP\")\n", + "volume_activity_indicators <- c(\"CONF\", \"PRES\")\n", + "\n", + "settings <- build_dataelement_reporting_settings_from_config(\n", + " config_json,\n", + " activity_indicators = activity_indicators,\n", + " volume_activity_indicators = volume_activity_indicators\n", + ")\n", + "\n", + "# Save variables (explicit assignments like population transformation `config_json` block).\n", + "COUNTRY_CODE <- settings$COUNTRY_CODE\n", + "ADMIN_1 <- settings$ADMIN_1\n", + "ADMIN_2 <- settings$ADMIN_2\n", + "DHIS2_INDICATORS <- settings$DHIS2_INDICATORS\n", + "DATAELEMENT_METHOD_DENOMINATOR <- settings$DATAELEMENT_METHOD_DENOMINATOR\n", + "USE_WEIGHTED_REPORTING_RATES <- settings$USE_WEIGHTED_REPORTING_RATES\n", + "ACTIVITY_INDICATORS <- settings$ACTIVITY_INDICATORS\n", + "VOLUME_ACTIVITY_INDICATORS <- settings$VOLUME_ACTIVITY_INDICATORS\n", + "fixed_cols <- settings$fixed_cols\n", + "fixed_cols_rr <- settings$fixed_cols_rr\n" ] }, { @@ -152,7 +184,7 @@ }, "outputs": [], "source": [ - "stopifnot_nonempty_activity_indicators(ACTIVITY_INDICATORS)\n" + "stop_if_activity_indicators_empty(ACTIVITY_INDICATORS)\n" ] }, { @@ -300,7 +332,7 @@ }, "outputs": [], "source": [ - 
"validate_indicator_columns_in_routine(dhis2_routine, ACTIVITY_INDICATORS, VOLUME_ACTIVITY_INDICATORS)\n" + "check_required_indicators_present_in_routine(dhis2_routine, ACTIVITY_INDICATORS, VOLUME_ACTIVITY_INDICATORS)\n" ] }, { @@ -347,7 +379,7 @@ }, "outputs": [], "source": [ - "pv <- monthly_period_vector_from_routine(dhis2_routine)\n", + "pv <- summarize_routine_period_range_as_month_vector(dhis2_routine)\n", "PERIOD_START <- pv$PERIOD_START\n", "PERIOD_END <- pv$PERIOD_END\n", "period_vector <- pv$period_vector\n", @@ -392,7 +424,7 @@ "outputs": [], "source": [ "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. Periods count: {length(period_vector)}\"))\n", - "facility_master <- build_facility_master_dataelement(\n", + "facility_master <- build_facilities_crossed_with_monthly_periods(\n", " dhis2_pyramid_formatted = dhis2_pyramid_formatted,\n", " period_vector = period_vector,\n", " config_json = config_json,\n", @@ -991,7 +1023,7 @@ }, "outputs": [], "source": [ - "write_reporting_rate_dataelement_outputs(reporting_rate_dataelement, snt_environment, COUNTRY_CODE)\n" + "save_dataelement_reporting_rate_csv_and_parquet(reporting_rate_dataelement, snt_environment, COUNTRY_CODE)\n" ] } ], diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb index 2d91055..836aaa4 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb @@ -33,17 +33,17 @@ } }, "source": [ - "SNT_ROOT_PATH <- \"~/workspace\"\n", - "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", - "source(file.path(PIPELINE_PATH, \"utils\", 
\"snt_dhis2_reporting_rate_dataelement.r\"))\n", + "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r\")\n", "\n", "report_packages <- c(\"arrow\", \"tidyverse\", \"sf\", \"stringi\", \"jsonlite\", \"httr\", \"reticulate\", \"glue\")\n", - "snt_environment <- get_setup_variables(SNT_ROOT_PATH = SNT_ROOT_PATH, packages = report_packages)\n", + "snt_environment <- get_setup_variables(packages = report_packages)\n", "\n", - "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", "CONFIG_PATH <- snt_environment$CONFIG_PATH\n", + "SNT_ROOT_PATH <- dirname(CONFIG_PATH)\n", + "CODE_PATH <- file.path(SNT_ROOT_PATH, \"code\")\n", "DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\")\n", - "REPORTING_NB_OUTPUTS_PATH <- file.path(SNT_ROOT_PATH, \"pipelines/snt_dhis2_reporting_rate_dataelement/reporting/outputs\")\n", + "PIPELINE_PATH <- file.path(SNT_ROOT_PATH, \"pipelines\", \"snt_dhis2_reporting_rate_dataelement\")\n", + "REPORTING_NB_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", "\n", "source(file.path(CODE_PATH, \"snt_palettes.r\"))" ], @@ -153,15 +153,7 @@ } }, "source": [ - "# Load SNT metadata\n", - "metadata_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_metadata.json\")) },\n", - " error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading metadata\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "log_msg(paste0(\"SNT metadata loaded from : \", file.path(CONFIG_PATH, \"SNT_metadata.json\")))" + "metadata_json <- load_snt_metadata(file.path(CONFIG_PATH, \"SNT_metadata.json\"))" ], "execution_count": null, "outputs": [], @@ -258,15 +250,8 @@ "\n", "rr_filename <- glue::glue(\"{COUNTRY_CODE}_reporting_rate_dataelement.parquet\")\n", "\n", - "reporting_rate <- tryCatch({ get_latest_dataset_file_in_memory(REPORTING_RATE_DATASET_NAME, rr_filename) }, \n", - " error = function(e) {\n", - " msg <- 
paste(\"Error while loading Reporting Rate (Data Element) data file for: \" , COUNTRY_CODE, conditionMessage(e)) # log error message\n", - " cat(msg)\n", - " stop(msg)\n", - "})\n", + "reporting_rate <- load_dataset_file(REPORTING_RATE_DATASET_NAME, rr_filename)\n", "\n", - "# log\n", - "log_msg(glue::glue(\"Data file `{rr_filename}` loaded from dataset: `{REPORTING_RATE_DATASET_NAME}`. Dataframe dimensions: {paste(dim(reporting_rate), collapse=', ')}\"))\n", "dim(reporting_rate)\n", "head(reporting_rate, 2)" ], @@ -308,14 +293,8 @@ } }, "source": [ - "shapes <- tryCatch({ get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET_NAME, paste0(COUNTRY_CODE, \"_shapes.geojson\")) }, \n", - " error = function(e) { \n", - " msg <- paste0(COUNTRY_CODE , \" Shapes data is not available in dataset: '\" , DHIS2_FORMATTED_DATASET_NAME, \"' last version.\")\n", - " log_msg(msg, \"warning\")\n", - " shapes <- NULL\n", - " })\n", - "\n", - "log_msg(glue::glue(\"Shapes loaded from dataset: '{DHIS2_FORMATTED_DATASET_NAME}'. \\nDataframe with dimensions: {paste(dim(shapes), collapse=', ')}\"))\n", + "shapes_filename <- paste0(COUNTRY_CODE, \"_shapes.geojson\")\n", + "shapes <- load_dataset_file(DHIS2_FORMATTED_DATASET_NAME, shapes_filename)\n", "names(shapes)" ], "execution_count": null, diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r index a76010b..5cc3da9 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r @@ -1,10 +1,31 @@ # Load base utils # Bootstrap matches `snt_dhis2_population_transformation`: fixed-path `source()` of this # file, `snt_environment <- get_setup_variables()`, then `load_snt_config()`. 
-# Helpers: `load_dataset_file()`, optional `paths_to_check` in the setup list. +# Helpers are named to read like notebook steps (see Esteban's note on structuring +# workflow): `load_dataset_file()`, `build_dataelement_reporting_settings_from_config()`, +# `save_dataelement_reporting_rate_csv_and_parquet()`, etc. source(file.path("~/workspace/code", "snt_utils.r")) +# JSON reader for this pipeline only (`snt_utils.r` must stay untouched per project rules). +read_workspace_json_file <- function(json_path, resource_label = "JSON file") { + json_path <- as.character(json_path)[[1L]] + tryCatch( + jsonlite::fromJSON(json_path), + error = function(e) { + stop(paste0( + "[ERROR] Error while loading ", + resource_label, + " from `", + json_path, + "`: ", + conditionMessage(e) + )) + } + ) +} + + #' Get Setup Variables for SNT Workspace #' Initializes workspace paths, loads R packages, and imports OpenHEXA SDK. #' @@ -49,38 +70,48 @@ get_setup_variables <- function( #' #' @export load_snt_config <- function(snt_config_path) { - config_json <- tryCatch( - { jsonlite::fromJSON(snt_config_path) }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration: {snt_config_path}") - cat(msg) - stop(msg) - } - ) + config_json <- read_workspace_json_file(snt_config_path, "configuration") log_msg(paste0("SNT configuration loaded from: ", snt_config_path)) return(config_json) } +#' Load SNT Metadata File +#' Reads and parses `SNT_metadata.json` (or another workspace metadata JSON). +#' @param snt_metadata_path Character. Path to the metadata JSON file. +#' @return List containing parsed metadata. +#' +#' @export +load_snt_metadata <- function(snt_metadata_path) { + metadata_json <- read_workspace_json_file(snt_metadata_path, "SNT metadata") + log_msg(paste0("SNT metadata loaded from: ", snt_metadata_path)) + return(metadata_json) +} + + #' Load Dataset File from OpenHEXA #' Retrieves the latest version of a file from an OpenHEXA dataset. 
#' #' @param dataset_id Character. OpenHEXA dataset identifier. #' @param filename Character. Name of file to load. +#' @param verbose Logical. If TRUE, log dataframe dimensions after a successful load. #' @return Dataframe containing the loaded data. #' #' @export -load_dataset_file <- function(dataset_id, filename) { +load_dataset_file <- function(dataset_id, filename, verbose = TRUE) { data <- tryCatch( - { get_latest_dataset_file_in_memory(dataset_id, filename) }, + { + get_latest_dataset_file_in_memory(dataset_id, filename) + }, error = function(e) { - msg <- glue::glue("[ERROR] Error while loading {filename} file: {conditionMessage(e)}") - log_msg(msg, "error") - stop(msg) + stop(glue::glue("[ERROR] Error while loading {filename} file from dataset: {dataset_id}")) } ) - msg <- glue::glue("{filename} data loaded from dataset: {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]") - log_msg(msg) + if (verbose) { + log_msg(glue::glue( + "{filename} data loaded from dataset : {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]" + )) + } return(data) } @@ -96,16 +127,14 @@ configure_conda_r_spatial_env <- function() { } -#' Standard aggregated indicator codes present in formatted routine extracts. -standard_dhis2_indicator_codes_for_dataelement <- function() { - c("CONF", "PRES", "SUSP", "TEST") -} +# Standard aggregated indicator codes present in formatted routine extracts. +STANDARD_DHIS2_INDICATOR_CODES_DATAELEMENT <- c("CONF", "PRES", "SUSP", "TEST") #' Fail if Papermill did not inject `ROUTINE_FILE` and `DATASET_ID`. #' #' Kept as a named entry point so older notebooks that call this before -#' `parse_dataelement_snt_settings()` keep working after utils refactors. +#' `build_dataelement_reporting_settings_from_config()` keep working after utils refactors. 
assert_papermill_dataelement_params <- function() { required_pm <- c("ROUTINE_FILE", "DATASET_ID") missing_pm <- required_pm[!vapply(required_pm, exists, logical(1), inherits = TRUE)] @@ -119,54 +148,97 @@ assert_papermill_dataelement_params <- function() { } -#' Normalize optional `SNT_CONFIG$REPORTING_RATE_DATAELEMENT` list from JSON. -#' -#' When absent, uses the same defaults as the historical OpenHEXA parameters -#' (denominator `ROUTINE_ACTIVE_FACILITIES`, unweighted, activity CONF/PRES/SUSP, -#' volume CONF/PRES). -#' -#' Also calls `assert_papermill_dataelement_params()` (redundant if the notebook -#' already called it). -parse_dataelement_snt_settings <- function(config_json) { - assert_papermill_dataelement_params() +# --- `SNT_CONFIG$REPORTING_RATE_DATAELEMENT` : small steps with explicit names -------- +read_reporting_rate_dataelement_config_block <- function(config_json) { rc <- config_json$SNT_CONFIG$REPORTING_RATE_DATAELEMENT if (is.null(rc) || length(rc) == 0) { - rc <- list() + return(list()) } + rc +} + +resolve_dataelement_denominator_method <- function(rc) { denom <- rc$DATAELEMENT_METHOD_DENOMINATOR denom_ch <- if (is.null(denom)) "" else as.character(denom)[[1]] if (!nzchar(denom_ch) || is.na(denom_ch)) { - denom <- "ROUTINE_ACTIVE_FACILITIES" + "ROUTINE_ACTIVE_FACILITIES" } else { - denom <- denom_ch + denom_ch } +} + +resolve_weighted_reporting_rate_toggle <- function(rc) { use_w <- rc$USE_WEIGHTED_REPORTING_RATES if (is.null(use_w)) { - use_w <- FALSE + FALSE } else { - use_w <- isTRUE(use_w) + isTRUE(use_w) } +} + - act <- rc$ACTIVITY_INDICATORS - if (is.null(act)) { - act <- c("CONF", "PRES", "SUSP") +resolve_activity_indicator_column_names <- function(rc, activity_indicators) { + if (is.null(activity_indicators)) { + act <- rc$ACTIVITY_INDICATORS + if (is.null(act)) { + act <- c("CONF", "PRES", "SUSP") + } + as.character(unlist(act, use.names = FALSE)) + } else { + as.character(unlist(activity_indicators, use.names = FALSE)) } - act <- 
as.character(unlist(act, use.names = FALSE)) +} - vol <- rc$VOLUME_ACTIVITY_INDICATORS - if (is.null(vol)) { - vol <- c("CONF", "PRES") + +resolve_volume_indicator_column_names <- function(rc, volume_activity_indicators) { + if (is.null(volume_activity_indicators)) { + vol <- rc$VOLUME_ACTIVITY_INDICATORS + if (is.null(vol)) { + vol <- c("CONF", "PRES") + } + as.character(unlist(vol, use.names = FALSE)) + } else { + as.character(unlist(volume_activity_indicators, use.names = FALSE)) } - vol <- as.character(unlist(vol, use.names = FALSE)) +} + + +#' Build the named settings list used by the dataelement reporting-rate notebook. +#' +#' Reads `SNT_config.json` (country, admins, optional `REPORTING_RATE_DATAELEMENT` +#' overrides). When absent, uses the same defaults as the historical OpenHEXA parameters +#' (denominator `ROUTINE_ACTIVE_FACILITIES`, unweighted, activity CONF/PRES/SUSP, +#' volume CONF/PRES). +#' +#' Pass non-NULL `activity_indicators` / `volume_activity_indicators` from the notebook +#' to make column choices visible in the notebook; pass `NULL` to take them from JSON +#' (then built-in defaults if still missing). +#' +#' Also calls `assert_papermill_dataelement_params()` (redundant if the notebook +#' already called it). 
+#' +#' @export +build_dataelement_reporting_settings_from_config <- function( + config_json, + activity_indicators = NULL, + volume_activity_indicators = NULL +) { + assert_papermill_dataelement_params() + + rc <- read_reporting_rate_dataelement_config_block(config_json) + denom <- resolve_dataelement_denominator_method(rc) + use_w <- resolve_weighted_reporting_rate_toggle(rc) + act <- resolve_activity_indicator_column_names(rc, activity_indicators) + vol <- resolve_volume_indicator_column_names(rc, volume_activity_indicators) list( COUNTRY_CODE = config_json$SNT_CONFIG$COUNTRY_CODE, ADMIN_1 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), ADMIN_2 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), - DHIS2_INDICATORS = standard_dhis2_indicator_codes_for_dataelement(), + DHIS2_INDICATORS = STANDARD_DHIS2_INDICATOR_CODES_DATAELEMENT, DATAELEMENT_METHOD_DENOMINATOR = denom, USE_WEIGHTED_REPORTING_RATES = use_w, ACTIVITY_INDICATORS = act, @@ -177,14 +249,34 @@ parse_dataelement_snt_settings <- function(config_json) { } -stopifnot_nonempty_activity_indicators <- function(activity_indicators) { - if (!length(activity_indicators)) { +# Legacy alias (same function; prefer `build_dataelement_reporting_settings_from_config`). +parse_dataelement_snt_settings <- build_dataelement_reporting_settings_from_config + + +activity_indicator_list_is_nonempty <- function(activity_indicators) { + length(activity_indicators) > 0L +} + + +#' Stop early if the analyst left the activity-indicator list empty. +#' @export +stop_if_activity_indicators_empty <- function(activity_indicators) { + if (!activity_indicator_list_is_nonempty(activity_indicators)) { stop("[ERROR] No activity indicators selected; choose at least one (e.g. CONF).") } + invisible(TRUE) } -validate_indicator_columns_in_routine <- function( +# Legacy alias. 
+assert_activity_indicators <- stop_if_activity_indicators_empty + +has_activity_indicators <- activity_indicator_list_is_nonempty + + +#' Check that routine columns exist for the chosen activity / volume indicators. +#' @export +check_required_indicators_present_in_routine <- function( dhis2_routine, activity_indicators, volume_activity_indicators @@ -209,8 +301,13 @@ validate_indicator_columns_in_routine <- function( } -#' YYYYMM sequence covering the routine period range (inclusive by month). -monthly_period_vector_from_routine <- function(dhis2_routine) { +# Legacy alias. +validate_indicator_columns_in_routine <- check_required_indicators_present_in_routine + + +#' First / last PERIOD in routine and full vector of YYYYMM months in between. +#' @export +summarize_routine_period_range_as_month_vector <- function(dhis2_routine) { period_start <- min(dhis2_routine$PERIOD, na.rm = TRUE) period_end <- max(dhis2_routine$PERIOD, na.rm = TRUE) pv <- format( @@ -225,8 +322,13 @@ monthly_period_vector_from_routine <- function(dhis2_routine) { } -#' Write CSV + Parquet under `/dhis2/reporting_rate/`. -write_reporting_rate_dataelement_outputs <- function(reporting_rate_tbl, snt_environment, country_code) { +# Legacy alias. +monthly_period_vector_from_routine <- summarize_routine_period_range_as_month_vector + + +#' Save the final reporting-rate table as CSV + Parquet under `data/dhis2/reporting_rate/`. +#' @export +save_dataelement_reporting_rate_csv_and_parquet <- function(reporting_rate_tbl, snt_environment, country_code) { output_dir <- file.path(snt_environment$DATA_PATH, "dhis2", "reporting_rate") dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) base <- paste0(country_code, "_reporting_rate_dataelement") @@ -240,7 +342,13 @@ write_reporting_rate_dataelement_outputs <- function(reporting_rate_tbl, snt_env } -build_facility_master_dataelement <- function( +# Legacy alias. 
+write_reporting_rate_dataelement_outputs <- save_dataelement_reporting_rate_csv_and_parquet + + +#' Pyramid table crossed with every month in the routine period (facility master for RR). +#' @export +build_facilities_crossed_with_monthly_periods <- function( dhis2_pyramid_formatted, period_vector, config_json, @@ -261,3 +369,7 @@ build_facility_master_dataelement <- function( tidyr::crossing(PERIOD = period_vector) %>% dplyr::mutate(PERIOD = as.numeric(PERIOD)) } + + +# Legacy alias. +build_facility_master_dataelement <- build_facilities_crossed_with_monthly_periods diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index a4ef3b6..838837d 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -50,7 +50,7 @@ "\n", "### Pipeline parameters\n", "\n", - "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill (checked inside `parse_reporting_rate_dataset_snt_settings()`).\n", + "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill; they are checked when building notebook settings from `config_json` (`build_dataset_method_reporting_settings_from_config()` in section 1.2).\n", "\n", "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. 
Choose \"Routine data (Raw)\" to use raw routine data.\n", " \n", @@ -172,8 +172,13 @@ } }, "source": [ - "cx <- parse_reporting_rate_dataset_snt_settings(config_json)\n", - "list2env(cx, envir = .GlobalEnv)\n" + "settings_dataset_rr <- build_dataset_method_reporting_settings_from_config(config_json)\n", + "COUNTRY_CODE <- settings_dataset_rr$COUNTRY_CODE\n", + "ADMIN_1 <- settings_dataset_rr$ADMIN_1\n", + "ADMIN_2 <- settings_dataset_rr$ADMIN_2\n", + "REPORTING_RATE_PRODUCT_ID <- settings_dataset_rr$REPORTING_RATE_PRODUCT_ID\n", + "fixed_cols_rr <- settings_dataset_rr$fixed_cols_rr\n", + "" ], "execution_count": null, "outputs": [], @@ -193,7 +198,7 @@ }, "source": [ "#### 1.2. Config + Papermill\n", - "`ROUTINE_FILE` and `DATASET_ID` are checked inside `parse_reporting_rate_dataset_snt_settings()` (older notebooks may still call `assert_papermill_reporting_rate_dataset_params()` explicitly)." + "`build_dataset_method_reporting_settings_from_config(config_json)` checks Papermill parameters, then reads country, admins, product UID, and the fixed routine column list from `config_json` (same idea as `snt_dhis2_population_transformation.ipynb`)." ], "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" }, @@ -364,10 +369,10 @@ } }, "source": [ - "dhis2_reporting <- load_dataset_file(\n", - " config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED,\n", - " paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", - ")\n", + "formatting_dataset_id <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "reporting_parquet_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", + "\n", + "dhis2_reporting <- load_dataset_file(formatting_dataset_id, reporting_parquet_name)\n", "dhis2_reporting <- dhis2_reporting %>%\n", " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", "head(dhis2_reporting, 3)\n", @@ -704,10 +709,8 @@ "tags": [] }, "source": [ - "### 3.3. 
(🇳🇪 NER only) Make HOP aggregated values (0, >1) into presence/absence (0, 1)\n", - "Specific for Niger SNIS instance!
    \n", - "Values for dataset HOP (\"ki7YKOfyxjf\" = \"HOP 03 ACTIVITES DE LUTTE CONTRE LE PALUDISME\") count the individual \"sub-units\" (departments, etc ... ) of a given hospital and therefore can have values >1.
    \n", - "For consistency with CSI (where all values are raw, and therefore only 0 and 1), we need to convert all HOP value >1 into 1." + "### 3.3. ACTUAL / EXPECTED summaries after cleaning\n", + "Niger-specific capping (values > 1 set to 1) is applied in **step 3.1** above when `COUNTRY_CODE == \"NER\"`. This cell only prints `summary()` for quick QC on all countries." ], "id": "2f26c614" }, @@ -733,11 +736,6 @@ } }, "source": [ - "# NER-specific normalization quality check\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " cat(\"Applied NER normalization: ACTUAL_REPORTS and EXPECTED_REPORTS capped at 1.\n", - "\")\n", - "}\n", "summary(dhis2_reporting_wide$ACTUAL_REPORTS)\n", "summary(dhis2_reporting_wide$EXPECTED_REPORTS)\n" ], @@ -1007,21 +1005,11 @@ } }, "source": [ - "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", - "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", - "\n", - "write.csv(\n", - " reporting_rate_dataset,\n", - " file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\")),\n", - " row.names = FALSE\n", - ")\n", - "log_msg(glue::glue(\"Exported: {file.path(output_dir, paste0(COUNTRY_CODE, '_reporting_rate_dataset.csv'))}\"))\n", - "\n", - "arrow::write_parquet(\n", + "save_dataset_method_reporting_rate_csv_and_parquet(\n", " reporting_rate_dataset,\n", - " file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\"))\n", + " snt_environment,\n", + " COUNTRY_CODE\n", ")\n", - "log_msg(glue::glue(\"Exported: {file.path(output_dir, paste0(COUNTRY_CODE, '_reporting_rate_dataset.parquet'))}\"))\n", "" ], "execution_count": null, diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r index 4aa744d..05f0822 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r +++ 
b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r @@ -1,9 +1,28 @@ # Load base utils -# Same bootstrap pattern as reporting_rate_dataelement / formatting review: -# `load_dataset_file()`, `load_snt_config()`, `get_setup_variables()` + paths_to_check. +# Helpers are named so the dataset-method reporting notebook reads like a checklist +# (same idea as `snt_dhis2_reporting_rate_dataelement` utils). source(file.path("~/workspace/code", "snt_utils.r")) +# JSON reader for this pipeline only (`snt_utils.r` unchanged). +read_workspace_json_file <- function(json_path, resource_label = "JSON file") { + json_path <- as.character(json_path)[[1L]] + tryCatch( + jsonlite::fromJSON(json_path), + error = function(e) { + stop(paste0( + "[ERROR] Error while loading ", + resource_label, + " from `", + json_path, + "`: ", + conditionMessage(e) + )) + } + ) +} + + #' Get Setup Variables for SNT Workspace #' Initializes workspace paths, loads R packages, and imports OpenHEXA SDK. #' @@ -47,30 +66,18 @@ get_setup_variables <- function( #' Load SNT Configuration File -#' Reads and parses a JSON configuration file. -#' @param snt_config_path Character. Path to the configuration JSON file. -#' @return List containing parsed configuration. -#' +#' @param snt_config_path Character. Full path to `SNT_config.json`. #' @export load_snt_config <- function(snt_config_path) { - config_json <- tryCatch( - { jsonlite::fromJSON(snt_config_path) }, - error = function(e) { - msg <- glue::glue("[ERROR] Error while loading configuration: {snt_config_path}") - cat(msg) - stop(msg) - } - ) + config_json <- read_workspace_json_file(snt_config_path, "configuration") log_msg(paste0("SNT configuration loaded from: ", snt_config_path)) return(config_json) } #' Fail if Papermill did not inject `ROUTINE_FILE` and `DATASET_ID`. -#' -#' Kept as a named entry point so older notebooks that call this before other -#' setup keep working after utils refactors. 
-assert_papermill_reporting_rate_dataset_params <- function() { +#' @export +stop_if_dataset_reporting_papermill_params_missing <- function() { required <- c("ROUTINE_FILE", "DATASET_ID") missing <- required[!vapply(required, exists, logical(1), inherits = TRUE)] if (length(missing) > 0) { @@ -83,56 +90,82 @@ assert_papermill_reporting_rate_dataset_params <- function() { } -#' Build globals used in the dataset reporting-rate notebook from `SNT_config.json`. -#' -#' Calls `assert_papermill_reporting_rate_dataset_params()` first (redundant if the -#' notebook already called it). -parse_reporting_rate_dataset_snt_settings <- function(config_json) { - assert_papermill_reporting_rate_dataset_params() +# Legacy alias. +assert_papermill_reporting_rate_dataset_params <- stop_if_dataset_reporting_papermill_params_missing + +#' Country, admins, and product filter from `SNT_config.json` (dataset-method RR). +read_dataset_reporting_identity_from_config <- function(config_json) { list( COUNTRY_CODE = config_json$SNT_CONFIG$COUNTRY_CODE, ADMIN_1 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), ADMIN_2 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), - REPORTING_RATE_PRODUCT_ID = config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID, - fixed_cols_rr = c("YEAR", "MONTH", "ADM2_ID", "REPORTING_RATE") + REPORTING_RATE_PRODUCT_ID = config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID ) } +#' Column names kept when trimming routine extracts to reporting-rate grain. +fixed_columns_for_dataset_reporting_rate_routine_slice <- function() { + c("YEAR", "MONTH", "ADM2_ID", "REPORTING_RATE") +} + + +#' Build the named settings list used by the dataset-method reporting-rate notebook. +#' +#' Calls `stop_if_dataset_reporting_papermill_params_missing()` first, then reads +#' country / admins / product UID and the fixed routine column list from `config_json`. 
+#' +#' @export +build_dataset_method_reporting_settings_from_config <- function(config_json) { + stop_if_dataset_reporting_papermill_params_missing() + id <- read_dataset_reporting_identity_from_config(config_json) + c(id, list(fixed_cols_rr = fixed_columns_for_dataset_reporting_rate_routine_slice())) +} + + +# Legacy alias (same as removed `parse_reporting_rate_dataset_snt_settings`). +parse_reporting_rate_dataset_snt_settings <- build_dataset_method_reporting_settings_from_config + + #' Load Dataset File from OpenHEXA -#' Retrieves the latest version of a file from an OpenHEXA dataset. #' #' @param dataset_id Character. OpenHEXA dataset identifier. #' @param filename Character. Name of file to load. -#' @return Dataframe containing the loaded data. -#' +#' @param verbose Logical. If TRUE, log dataframe dimensions after a successful load. #' @export -load_dataset_file <- function(dataset_id, filename) { +load_dataset_file <- function(dataset_id, filename, verbose = TRUE) { data <- tryCatch( - { get_latest_dataset_file_in_memory(dataset_id, filename) }, + { + get_latest_dataset_file_in_memory(dataset_id, filename) + }, error = function(e) { - msg <- glue::glue("[ERROR] Error while loading {filename} file: {conditionMessage(e)}") - log_msg(msg, "error") - stop(msg) + stop(glue::glue("[ERROR] Error while loading {filename} file from dataset: {dataset_id}")) } ) - msg <- glue::glue("{filename} data loaded from dataset: {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]") - log_msg(msg) + if (verbose) { + log_msg(glue::glue( + "{filename} data loaded from dataset : {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]" + )) + } return(data) } -#' Write CSV + Parquet under `/dhis2/reporting_rate/`. -write_reporting_rate_dataset_outputs <- function(reporting_rate_tbl, snt_environment, country_code) { +#' Save final dataset-method reporting-rate table as CSV + Parquet under `data/dhis2/reporting_rate/`. 
+#' @export +save_dataset_method_reporting_rate_csv_and_parquet <- function(reporting_rate_tbl, snt_environment, country_code) { output_dir <- file.path(snt_environment$DATA_PATH, "dhis2", "reporting_rate") dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) - base <- paste0(country_code, "_reporting_rate_dataset") - csv_path <- file.path(output_dir, paste0(base, ".csv")) - pq_path <- file.path(output_dir, paste0(base, ".parquet")) + csv_path <- file.path(output_dir, paste0(country_code, "_reporting_rate_dataset.csv")) + pq_path <- file.path(output_dir, paste0(country_code, "_reporting_rate_dataset.parquet")) utils::write.csv(reporting_rate_tbl, csv_path, row.names = FALSE) log_msg(glue::glue("Exported: {csv_path}")) arrow::write_parquet(reporting_rate_tbl, pq_path) log_msg(glue::glue("Exported: {pq_path}")) invisible(list(csv_path = csv_path, parquet_path = pq_path)) } + + +# Legacy alias. +write_reporting_rate_dataset_outputs <- save_dataset_method_reporting_rate_csv_and_parquet From 33bc8975636f4c520e775ba9ea7f1e081ef04d68 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 16 Apr 2026 13:03:55 +0200 Subject: [PATCH 10/18] final fix --- ...s2_reporting_rate_dataelement_report.ipynb | 251 +++++++++--------- 1 file changed, 126 insertions(+), 125 deletions(-) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb index 836aaa4..5410101 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/reporting/snt_dhis2_reporting_rate_dataelement_report.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "b79cba06", "metadata": { "papermill": { "duration": 0.000249, @@ -14,11 +15,12 @@ }, "source": [ "### 1. 
Setup" - ], - "id": "b79cba06" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "7ca65bcc", "metadata": { "papermill": { "duration": 7.265364, @@ -32,6 +34,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r\")\n", "\n", @@ -46,13 +49,11 @@ "REPORTING_NB_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", "\n", "source(file.path(CODE_PATH, \"snt_palettes.r\"))" - ], - "execution_count": null, - "outputs": [], - "id": "7ca65bcc" + ] }, { "cell_type": "markdown", + "id": "c5301aa3", "metadata": { "papermill": { "duration": 0.000116, @@ -65,11 +66,12 @@ }, "source": [ "#### 1.1. Load and check `snt config` file" - ], - "id": "c5301aa3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "76d8a072", "metadata": { "papermill": { "duration": 0.52329, @@ -83,15 +85,16 @@ "languageId": "r" } }, + "outputs": [], "source": [ + "# we should move this thing in snt_utils at some points\n", "config_json <- load_snt_config(file.path(CONFIG_PATH, \"SNT_config.json\"))" - ], - "execution_count": null, - "outputs": [], - "id": "76d8a072" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c712ac02", "metadata": { "papermill": { "duration": 0.030446, @@ -105,6 +108,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Configuration settings\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", @@ -114,13 +118,11 @@ "# Reporting Rate data is stored in the same OH Dataset regardless of whether is comes from DataSet or DataElement method\n", "REPORTING_RATE_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_REPORTING_RATE\n", "DHIS2_FORMATTED_DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED" - ], - "execution_count": null, - "outputs": [], - "id": "c712ac02" + ] }, { "cell_type": "markdown", + "id": "30b058f4", "metadata": { "papermill": { "duration": 0.000094, 
@@ -134,11 +136,12 @@ "source": [ "#### 1.2. Load and check `snt metadata` file\n", "This is needed for the correct use of palettes and categories (breaks, or scale)" - ], - "id": "30b058f4" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "98a8ee49", "metadata": { "papermill": { "duration": 0.940593, @@ -152,15 +155,15 @@ "languageId": "r" } }, + "outputs": [], "source": [ "metadata_json <- load_snt_metadata(file.path(CONFIG_PATH, \"SNT_metadata.json\"))" - ], - "execution_count": null, - "outputs": [], - "id": "98a8ee49" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "00681217", "metadata": { "papermill": { "duration": 0.198107, @@ -174,6 +177,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "scale_raw <- metadata_json$REPORTING_RATE$SCALE\n", "\n", @@ -190,13 +194,11 @@ "break_vals <- break_vals / 100\n", "\n", "log_msg(paste0(\"Reporting Rate scale break values loaded from SNT_metadata.json (treated as percentages): \", paste(break_vals, collapse = \", \")))" - ], - "execution_count": null, - "outputs": [], - "id": "00681217" + ] }, { "cell_type": "markdown", + "id": "f3470564", "metadata": { "papermill": { "duration": 0.000162, @@ -209,11 +211,11 @@ }, "source": [ "### 2. Load Data" - ], - "id": "f3470564" + ] }, { "cell_type": "markdown", + "id": "82397307", "metadata": { "papermill": { "duration": 0.000126, @@ -226,11 +228,12 @@ }, "source": [ "#### 2.1. 
Output of main pipeline notebook" - ], - "id": "82397307" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "70acb2c5", "metadata": { "papermill": { "duration": 1.253125, @@ -244,6 +247,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Important: this will break if reporting rate was calculated as DataSet method because it will not find the file\n", "# (will find \"{COUNTRY_CODE}_reporting_rate_dataset.parquet\" instead)\n", @@ -254,13 +258,11 @@ "\n", "dim(reporting_rate)\n", "head(reporting_rate, 2)" - ], - "execution_count": null, - "outputs": [], - "id": "70acb2c5" + ] }, { "cell_type": "markdown", + "id": "48833515", "metadata": { "papermill": { "duration": 0.000091, @@ -274,11 +276,12 @@ "source": [ "#### 2.2. Shapes\n", "To make choropleth (map)" - ], - "id": "48833515" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "3febd4f4", "metadata": { "papermill": { "duration": 3.535554, @@ -292,17 +295,16 @@ "languageId": "r" } }, + "outputs": [], "source": [ "shapes_filename <- paste0(COUNTRY_CODE, \"_shapes.geojson\")\n", "shapes <- load_dataset_file(DHIS2_FORMATTED_DATASET_NAME, shapes_filename)\n", "names(shapes)" - ], - "execution_count": null, - "outputs": [], - "id": "3febd4f4" + ] }, { "cell_type": "markdown", + "id": "17067d56", "metadata": { "papermill": { "duration": 0.000166, @@ -315,11 +317,11 @@ }, "source": [ "### 3. Plots" - ], - "id": "17067d56" + ] }, { "cell_type": "markdown", + "id": "9a6369ee", "metadata": { "papermill": { "duration": 0.000109, @@ -332,11 +334,12 @@ }, "source": [ "##### 3.0. 
Add shapes" - ], - "id": "9a6369ee" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c6641720", "metadata": { "papermill": { "duration": 0.03905, @@ -350,16 +353,15 @@ "languageId": "r" } }, + "outputs": [], "source": [ "data_to_plot <- reporting_rate %>%\n", " left_join(shapes, by = c(\"ADM2_ID\"))" - ], - "execution_count": null, - "outputs": [], - "id": "c6641720" + ] }, { "cell_type": "markdown", + "id": "0b0d32f1", "metadata": { "papermill": { "duration": 0.000195, @@ -372,11 +374,11 @@ }, "source": [ "#### 3.1. 🎨 Dynamic categories and color assignement" - ], - "id": "0b0d32f1" + ] }, { "cell_type": "markdown", + "id": "cc765e0c", "metadata": { "papermill": { "duration": 0.000109, @@ -389,11 +391,12 @@ }, "source": [ "##### 1. Define breaks and labels" - ], - "id": "cc765e0c" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "2e79132c", "metadata": { "papermill": { "duration": 0.026996, @@ -407,19 +410,19 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Safety code to avoid breaking if nothings is fund in json_metadata\n", "if (is.null(break_vals) || length(break_vals) == 0) {\n", " log_msg(\"[WARNING] No break values found in SNT_metadata.json for REPORTING_RATE$SCALE. Using default values.\", \"warning\")\n", " break_vals <- c(0.5, 0.8, 0.9, 0.95, 1.00)\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "2e79132c" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f04cb888", "metadata": { "papermill": { "duration": 0.037712, @@ -433,6 +436,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 1. Define breaks\n", "# Note: assumes that the data starts at 0!\n", @@ -450,13 +454,11 @@ "\n", "# Check\n", "labels" - ], - "execution_count": null, - "outputs": [], - "id": "f04cb888" + ] }, { "cell_type": "markdown", + "id": "cb237801", "metadata": { "papermill": { "duration": 0.000102, @@ -469,11 +471,12 @@ }, "source": [ "##### 2. 
Create `_CATEGORY` col" - ], - "id": "cb237801" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f8303488", "metadata": { "papermill": { "duration": 0.040632, @@ -487,6 +490,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# reporting_rate_dataset <- reporting_rate_dataset %>%\n", "data_to_plot <- data_to_plot %>%\n", @@ -499,13 +503,11 @@ " include.lowest = TRUE\n", " )\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "f8303488" + ] }, { "cell_type": "markdown", + "id": "a10237f8", "metadata": { "papermill": { "duration": 0.000102, @@ -518,11 +520,12 @@ }, "source": [ "##### 3. Pick appropriate palette" - ], - "id": "a10237f8" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "2ee6e077", "metadata": { "papermill": { "duration": 0.04138, @@ -536,6 +539,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Count nr of breaks\n", "nr_of_colors <- length(labels)\n", @@ -548,13 +552,11 @@ "names(palette_to_use) <- rev(labels)\n", "\n", "palette_to_use\n" - ], - "execution_count": null, - "outputs": [], - "id": "2ee6e077" + ] }, { "cell_type": "markdown", + "id": "d08c0c14", "metadata": { "papermill": { "duration": 0.000099, @@ -567,11 +569,11 @@ }, "source": [ "#### 3.2. Plots" - ], - "id": "d08c0c14" + ] }, { "cell_type": "markdown", + "id": "b7781198", "metadata": { "papermill": { "duration": 0.000056, @@ -585,11 +587,12 @@ "source": [ "##### 3.2.1 Scatter plot of RR over time (by ADM2)\n", "With this we can see the actula numbners (although cannot tell which ADM2 have low values)." 
- ], - "id": "b7781198" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "78d92e4a", "metadata": { "papermill": { "duration": 1.456494, @@ -603,6 +606,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Line point plot faceted by YEAR\n", "ggplot(data = data_to_plot) +\n", @@ -644,13 +648,12 @@ " strip.placement = \"outside\",\n", " strip.text = element_text(face = \"bold\", size = 10)\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "78d92e4a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1f47064a", "metadata": { "papermill": { "duration": 1.11568, @@ -664,6 +667,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "output_file <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement_adm2_linepoint.png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -681,13 +685,11 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (linepoint) saved to: {file.path(output_location, output_file)}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "1f47064a" + ] }, { "cell_type": "markdown", + "id": "22bb6431", "metadata": { "papermill": { "duration": 0.000147, @@ -701,11 +703,12 @@ "source": [ "##### 3.2.2 Heatmap plot of RR over time (by ADM2)\n", "This is less good for identifying actual values, but allows to see which ADM2 have lower values." 
- ], - "id": "22bb6431" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f2445f2a", "metadata": { "papermill": { "duration": 2.21647, @@ -719,6 +722,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Tile plot faceted by YEAR\n", "ggplot(data = data_to_plot) +\n", @@ -754,13 +758,12 @@ " strip.text = element_text(face = \"bold\", size = 10)\n", " ) +\n", " guides(fill = guide_legend(nrow = 1))" - ], - "execution_count": null, - "outputs": [], - "id": "f2445f2a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "cbe73312", "metadata": { "papermill": { "duration": 1.982105, @@ -774,6 +777,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "output_file <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement_adm2_heatmap.png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -788,13 +792,11 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (heatmap) saved to: {file.path(output_location, output_file)}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "cbe73312" + ] }, { "cell_type": "markdown", + "id": "3eef141a", "metadata": { "papermill": { "duration": 0.000164, @@ -807,11 +809,12 @@ }, "source": [ "##### 3.2.3. 
MAP of Reporting Rate - by month" - ], - "id": "3eef141a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "83be9c68", "metadata": { "papermill": { "duration": 4.958481, @@ -825,6 +828,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Choropleth map with reporting rate data by ADM2\n", "ggplot(data = data_to_plot) +\n", @@ -852,13 +856,12 @@ " cols = vars(MONTH),\n", " switch = \"both\") +\n", " guides(fill = guide_legend(nrow = 1))" - ], - "execution_count": null, - "outputs": [], - "id": "83be9c68" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "e877671d", "metadata": { "papermill": { "duration": 3.502689, @@ -872,6 +875,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "output_file <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement_adm2_month_map.png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -886,13 +890,11 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_file)}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "e877671d" + ] }, { "cell_type": "markdown", + "id": "f0894be9", "metadata": { "papermill": { "duration": 0.000166, @@ -906,11 +908,12 @@ "source": [ "##### 3.2.4. 
MAP of Reporting Rate - by YEAR\n", "Use average (`mean()`) of monthly values" - ], - "id": "f0894be9" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "cb1995ab", "metadata": { "papermill": { "duration": 0.039325, @@ -924,6 +927,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "data_to_plot_year <- data_to_plot %>%\n", " group_by(geometry, ADM2_ID, ADM2_NAME, ADM1_NAME, YEAR) %>%\n", @@ -941,13 +945,12 @@ " include.lowest = TRUE\n", " )\n", " )" - ], - "execution_count": null, - "outputs": [], - "id": "cb1995ab" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "bd32b0cf", "metadata": { "papermill": { "duration": 0.798686, @@ -961,6 +964,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Choropleth map with reporting rate data by ADM2\n", "ggplot(data = data_to_plot_year) +\n", @@ -986,13 +990,12 @@ " cols = vars(YEAR)\n", " ) +\n", " guides(fill = guide_legend(nrow = 1))" - ], - "execution_count": null, - "outputs": [], - "id": "bd32b0cf" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "0430641e", "metadata": { "papermill": { "duration": 0.928933, @@ -1006,6 +1009,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "output_file <- paste0(COUNTRY_CODE, \"_reporting_rate_dataelement_adm2_year_map.png\")\n", "output_location <- file.path(REPORTING_NB_OUTPUTS_PATH, \"figures\")\n", @@ -1020,13 +1024,11 @@ "\n", "# Add log message\n", "log_msg(glue::glue(\"📊 Plot (map) saved to: {file.path(output_location, output_file)}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "0430641e" + ] }, { "cell_type": "markdown", + "id": "8c3bdca4", "metadata": { "papermill": { "duration": 0.000126, @@ -1039,11 +1041,12 @@ }, "source": [ "#### The End :)" - ], - "id": "8c3bdca4" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "f8a62ec5", "metadata": { "papermill": { "duration": 0.216448, @@ -1057,12 +1060,10 @@ "languageId": "r" } }, + "outputs": [], "source": [ "log_msg(\"Reporting Rate 
(Data Element) report notebook completed successfully!\")" - ], - "execution_count": null, - "outputs": [], - "id": "f8a62ec5" + ] } ], "metadata": { @@ -1094,4 +1095,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From 3eaefbe26276dbb4db20f879ebd7467a1b98a195 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Mon, 20 Apr 2026 23:09:58 +0200 Subject: [PATCH 11/18] milestone comments --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 69 ++++-- .../snt_dhis2_reporting_rate_dataelement.r | 208 ++++-------------- .../snt_dhis2_reporting_rate_dataset.ipynb | 46 ++-- .../utils/snt_dhis2_reporting_rate_dataset.r | 58 +---- 4 files changed, 102 insertions(+), 279 deletions(-) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index d152bbb..d8a52a9 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -94,7 +94,7 @@ }, "source": [ "### 1.1. Pipeline parameters\n", - "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill (validated in `build_dataelement_reporting_settings_from_config()`). Same layout as `snt_dhis2_population_transformation.ipynb`: define inputs in the notebook, then **Save variables** from config / parsed settings.\n" + "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill. 
Same layout as `snt_dhis2_population_transformation.ipynb`: define and assign variables explicitly in the notebook setup.\n" ] }, { @@ -129,27 +129,34 @@ }, "outputs": [], "source": [ - "# Routine column names for numerators (edit here; passed through to reporting-rate logic).\n", - "activity_indicators <- c(\"CONF\", \"PRES\", \"SUSP\")\n", - "volume_activity_indicators <- c(\"CONF\", \"PRES\")\n", + "assert_papermill_dataelement_params()\n", "\n", - "settings <- build_dataelement_reporting_settings_from_config(\n", - " config_json,\n", - " activity_indicators = activity_indicators,\n", - " volume_activity_indicators = volume_activity_indicators\n", - ")\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\")\n", + "\n", + "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\", inherits = TRUE)) {\n", + " DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"\n", + "}\n", + "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\", inherits = TRUE)) {\n", + " USE_WEIGHTED_REPORTING_RATES <- FALSE\n", + "}\n", + "\n", + "if (exists(\"AVAILABILITY_INDICATORS\", inherits = TRUE)) {\n", + " ACTIVITY_INDICATORS <- as.character(unlist(AVAILABILITY_INDICATORS, use.names = FALSE))\n", + "} else {\n", + " ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")\n", + "}\n", + "\n", + "if (exists(\"VOLUME_ACTIVITY_INDICATORS\", inherits = TRUE)) {\n", + " VOLUME_ACTIVITY_INDICATORS <- as.character(unlist(VOLUME_ACTIVITY_INDICATORS, use.names = FALSE))\n", + "} else {\n", + " VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")\n", + "}\n", "\n", - "# Save variables (explicit assignments like population transformation `config_json` block).\n", - "COUNTRY_CODE <- settings$COUNTRY_CODE\n", - "ADMIN_1 <- settings$ADMIN_1\n", - "ADMIN_2 <- settings$ADMIN_2\n", - 
"DHIS2_INDICATORS <- settings$DHIS2_INDICATORS\n", - "DATAELEMENT_METHOD_DENOMINATOR <- settings$DATAELEMENT_METHOD_DENOMINATOR\n", - "USE_WEIGHTED_REPORTING_RATES <- settings$USE_WEIGHTED_REPORTING_RATES\n", - "ACTIVITY_INDICATORS <- settings$ACTIVITY_INDICATORS\n", - "VOLUME_ACTIVITY_INDICATORS <- settings$VOLUME_ACTIVITY_INDICATORS\n", - "fixed_cols <- settings$fixed_cols\n", - "fixed_cols_rr <- settings$fixed_cols_rr\n" + "fixed_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\")\n", + "fixed_cols_rr <- c(\"YEAR\", \"MONTH\", \"ADM2_ID\", \"REPORTING_RATE\")\n" ] }, { @@ -332,7 +339,18 @@ }, "outputs": [], "source": [ - "check_required_indicators_present_in_routine(dhis2_routine, ACTIVITY_INDICATORS, VOLUME_ACTIVITY_INDICATORS)\n" + "validate_required_columns(\n", + " data = dhis2_routine,\n", + " required_columns = ACTIVITY_INDICATORS,\n", + " data_label = \"`dhis2_routine` (activity indicators)\",\n", + " on_missing = \"warning\"\n", + ")\n", + "validate_required_columns(\n", + " data = dhis2_routine,\n", + " required_columns = VOLUME_ACTIVITY_INDICATORS,\n", + " data_label = \"`dhis2_routine` (volume activity indicators)\",\n", + " on_missing = \"error\"\n", + ")\n" ] }, { @@ -1023,7 +1041,14 @@ }, "outputs": [], "source": [ - "save_dataelement_reporting_rate_csv_and_parquet(reporting_rate_dataelement, snt_environment, COUNTRY_CODE)\n" + "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", + "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", + "csv_path <- file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\"))\n", + "parquet_path <- file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\"))\n", + "utils::write.csv(reporting_rate_dataelement, csv_path, row.names = FALSE)\n", + "log_msg(glue::glue(\"Exported: {csv_path}\"))\n", + "arrow::write_parquet(reporting_rate_dataelement, parquet_path)\n", + "log_msg(glue::glue(\"Exported: 
{parquet_path}\"))\n" ] } ], diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r index 5cc3da9..21381a7 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r @@ -1,9 +1,7 @@ # Load base utils # Bootstrap matches `snt_dhis2_population_transformation`: fixed-path `source()` of this # file, `snt_environment <- get_setup_variables()`, then `load_snt_config()`. -# Helpers are named to read like notebook steps (see Esteban's note on structuring -# workflow): `load_dataset_file()`, `build_dataelement_reporting_settings_from_config()`, -# `save_dataelement_reporting_rate_csv_and_parquet()`, etc. +# Keep helpers small and reusable; pipeline-specific assignments stay in notebooks. source(file.path("~/workspace/code", "snt_utils.r")) @@ -127,14 +125,7 @@ configure_conda_r_spatial_env <- function() { } -# Standard aggregated indicator codes present in formatted routine extracts. -STANDARD_DHIS2_INDICATOR_CODES_DATAELEMENT <- c("CONF", "PRES", "SUSP", "TEST") - - #' Fail if Papermill did not inject `ROUTINE_FILE` and `DATASET_ID`. -#' -#' Kept as a named entry point so older notebooks that call this before -#' `build_dataelement_reporting_settings_from_config()` keep working after utils refactors. 
assert_papermill_dataelement_params <- function() { required_pm <- c("ROUTINE_FILE", "DATASET_ID") missing_pm <- required_pm[!vapply(required_pm, exists, logical(1), inherits = TRUE)] @@ -148,111 +139,6 @@ assert_papermill_dataelement_params <- function() { } -# --- `SNT_CONFIG$REPORTING_RATE_DATAELEMENT` : small steps with explicit names -------- - -read_reporting_rate_dataelement_config_block <- function(config_json) { - rc <- config_json$SNT_CONFIG$REPORTING_RATE_DATAELEMENT - if (is.null(rc) || length(rc) == 0) { - return(list()) - } - rc -} - - -resolve_dataelement_denominator_method <- function(rc) { - denom <- rc$DATAELEMENT_METHOD_DENOMINATOR - denom_ch <- if (is.null(denom)) "" else as.character(denom)[[1]] - if (!nzchar(denom_ch) || is.na(denom_ch)) { - "ROUTINE_ACTIVE_FACILITIES" - } else { - denom_ch - } -} - - -resolve_weighted_reporting_rate_toggle <- function(rc) { - use_w <- rc$USE_WEIGHTED_REPORTING_RATES - if (is.null(use_w)) { - FALSE - } else { - isTRUE(use_w) - } -} - - -resolve_activity_indicator_column_names <- function(rc, activity_indicators) { - if (is.null(activity_indicators)) { - act <- rc$ACTIVITY_INDICATORS - if (is.null(act)) { - act <- c("CONF", "PRES", "SUSP") - } - as.character(unlist(act, use.names = FALSE)) - } else { - as.character(unlist(activity_indicators, use.names = FALSE)) - } -} - - -resolve_volume_indicator_column_names <- function(rc, volume_activity_indicators) { - if (is.null(volume_activity_indicators)) { - vol <- rc$VOLUME_ACTIVITY_INDICATORS - if (is.null(vol)) { - vol <- c("CONF", "PRES") - } - as.character(unlist(vol, use.names = FALSE)) - } else { - as.character(unlist(volume_activity_indicators, use.names = FALSE)) - } -} - - -#' Build the named settings list used by the dataelement reporting-rate notebook. -#' -#' Reads `SNT_config.json` (country, admins, optional `REPORTING_RATE_DATAELEMENT` -#' overrides). 
When absent, uses the same defaults as the historical OpenHEXA parameters -#' (denominator `ROUTINE_ACTIVE_FACILITIES`, unweighted, activity CONF/PRES/SUSP, -#' volume CONF/PRES). -#' -#' Pass non-NULL `activity_indicators` / `volume_activity_indicators` from the notebook -#' to make column choices visible in the notebook; pass `NULL` to take them from JSON -#' (then built-in defaults if still missing). -#' -#' Also calls `assert_papermill_dataelement_params()` (redundant if the notebook -#' already called it). -#' -#' @export -build_dataelement_reporting_settings_from_config <- function( - config_json, - activity_indicators = NULL, - volume_activity_indicators = NULL -) { - assert_papermill_dataelement_params() - - rc <- read_reporting_rate_dataelement_config_block(config_json) - denom <- resolve_dataelement_denominator_method(rc) - use_w <- resolve_weighted_reporting_rate_toggle(rc) - act <- resolve_activity_indicator_column_names(rc, activity_indicators) - vol <- resolve_volume_indicator_column_names(rc, volume_activity_indicators) - - list( - COUNTRY_CODE = config_json$SNT_CONFIG$COUNTRY_CODE, - ADMIN_1 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), - ADMIN_2 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), - DHIS2_INDICATORS = STANDARD_DHIS2_INDICATOR_CODES_DATAELEMENT, - DATAELEMENT_METHOD_DENOMINATOR = denom, - USE_WEIGHTED_REPORTING_RATES = use_w, - ACTIVITY_INDICATORS = act, - VOLUME_ACTIVITY_INDICATORS = vol, - fixed_cols = c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID"), - fixed_cols_rr = c("YEAR", "MONTH", "ADM2_ID", "REPORTING_RATE") - ) -} - - -# Legacy alias (same function; prefer `build_dataelement_reporting_settings_from_config`). 
-parse_dataelement_snt_settings <- build_dataelement_reporting_settings_from_config - - activity_indicator_list_is_nonempty <- function(activity_indicators) { length(activity_indicators) > 0L } @@ -268,41 +154,49 @@ stop_if_activity_indicators_empty <- function(activity_indicators) { } -# Legacy alias. -assert_activity_indicators <- stop_if_activity_indicators_empty - -has_activity_indicators <- activity_indicator_list_is_nonempty +#' Return required columns that are missing from `data`. +#' @export +find_missing_columns <- function(data, required_columns) { + if (!is.data.frame(data)) { + stop("[ERROR] `data` must be a data.frame.") + } + required_columns <- as.character(unlist(required_columns, use.names = FALSE)) + required_columns <- required_columns[!is.na(required_columns) & nzchar(required_columns)] + required_columns <- unique(required_columns) + setdiff(required_columns, names(data)) +} -#' Check that routine columns exist for the chosen activity / volume indicators. +#' Validate that required columns exist in `data`. +#' +#' Returns missing columns invisibly. Behavior on missing columns is controlled by +#' `on_missing`: `"error"`, `"warning"`, or `"none"`. 
#' @export -check_required_indicators_present_in_routine <- function( - dhis2_routine, - activity_indicators, - volume_activity_indicators +validate_required_columns <- function( + data, + required_columns, + data_label = "data", + on_missing = c("error", "warning", "none") ) { - if (!all(activity_indicators %in% names(dhis2_routine))) { - log_msg( - glue::glue( - "Warning: one or more activity indicators are missing from `dhis2_routine`: ", - "{paste(activity_indicators, collapse = ', ')}" - ), - "warning" - ) - } - if (!all(volume_activity_indicators %in% names(dhis2_routine))) { - msg <- glue::glue( - "[ERROR] Volume activity indicator(s) not present in routine data: ", - "{paste(volume_activity_indicators, collapse = ', ')}" - ) - log_msg(msg, "error") - stop(msg) + on_missing <- match.arg(on_missing) + missing_columns <- find_missing_columns(data, required_columns) + if (length(missing_columns) == 0L) { + return(invisible(character(0))) } -} + msg <- glue::glue( + "{data_label} missing required column(s): {paste(missing_columns, collapse = ', ')}" + ) -# Legacy alias. -validate_indicator_columns_in_routine <- check_required_indicators_present_in_routine + if (on_missing == "error") { + log_msg(paste0("[ERROR] ", msg), "error") + stop(paste0("[ERROR] ", msg)) + } + if (on_missing == "warning") { + log_msg(paste0("Warning: ", msg), "warning") + } + invisible(missing_columns) +} #' First / last PERIOD in routine and full vector of YYYYMM months in between. @@ -322,30 +216,6 @@ summarize_routine_period_range_as_month_vector <- function(dhis2_routine) { } -# Legacy alias. -monthly_period_vector_from_routine <- summarize_routine_period_range_as_month_vector - - -#' Save the final reporting-rate table as CSV + Parquet under `data/dhis2/reporting_rate/`. 
-#' @export -save_dataelement_reporting_rate_csv_and_parquet <- function(reporting_rate_tbl, snt_environment, country_code) { - output_dir <- file.path(snt_environment$DATA_PATH, "dhis2", "reporting_rate") - dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) - base <- paste0(country_code, "_reporting_rate_dataelement") - csv_path <- file.path(output_dir, paste0(base, ".csv")) - pq_path <- file.path(output_dir, paste0(base, ".parquet")) - utils::write.csv(reporting_rate_tbl, csv_path, row.names = FALSE) - log_msg(glue::glue("Exported: {csv_path}")) - arrow::write_parquet(reporting_rate_tbl, pq_path) - log_msg(glue::glue("Exported: {pq_path}")) - invisible(list(csv_path = csv_path, parquet_path = pq_path)) -} - - -# Legacy alias. -write_reporting_rate_dataelement_outputs <- save_dataelement_reporting_rate_csv_and_parquet - - #' Pyramid table crossed with every month in the routine period (facility master for RR). #' @export build_facilities_crossed_with_monthly_periods <- function( @@ -369,7 +239,3 @@ build_facilities_crossed_with_monthly_periods <- function( tidyr::crossing(PERIOD = period_vector) %>% dplyr::mutate(PERIOD = as.numeric(PERIOD)) } - - -# Legacy alias. -build_facility_master_dataelement <- build_facilities_crossed_with_monthly_periods diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index 838837d..478fb3f 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -41,16 +41,9 @@ "-----\n", "\n", "\n", - "### 🇳🇪 Niger-Specific Processing: \n", - " In Niger, datasets for HOP (hospital) facilities are already **pre-aggregated** and may contain values greater than 1 for actual or expected reports, reflecting subunits or departments within a hospital. \n", - "
    \n", - " To accurately represent reporting at the facility level and avoid overcounting, all values greater than 1 are converted to 1 (presence/absence). This ensures that the reporting rate reflects whether the hospital as a whole reported, rather than counting multiple subunits separately. This step also prevents cases where ACTUAL_REPORTS exceeds EXPECTED_REPORTS.\n", - "\n", - "------\n", - "\n", "### Pipeline parameters\n", "\n", - "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill; they are checked when building notebook settings from `config_json` (`build_dataset_method_reporting_settings_from_config()` in section 1.2).\n", + "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill and validated in setup section 1.2.\n", "\n", "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. Choose \"Routine data (Raw)\" to use raw routine data.\n", " \n", @@ -172,12 +165,13 @@ } }, "source": [ - "settings_dataset_rr <- build_dataset_method_reporting_settings_from_config(config_json)\n", - "COUNTRY_CODE <- settings_dataset_rr$COUNTRY_CODE\n", - "ADMIN_1 <- settings_dataset_rr$ADMIN_1\n", - "ADMIN_2 <- settings_dataset_rr$ADMIN_2\n", - "REPORTING_RATE_PRODUCT_ID <- settings_dataset_rr$REPORTING_RATE_PRODUCT_ID\n", - "fixed_cols_rr <- settings_dataset_rr$fixed_cols_rr\n", + "stop_if_dataset_reporting_papermill_params_missing()\n", + "\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", + "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", + "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID\n", + "fixed_cols_rr <- c(\"YEAR\", \"MONTH\", \"ADM2_ID\", \"REPORTING_RATE\")\n", "" ], "execution_count": null, @@ -198,7 +192,7 @@ }, "source": [ "#### 1.2. 
Config + Papermill\n", - "`build_dataset_method_reporting_settings_from_config(config_json)` checks Papermill parameters, then reads country, admins, product UID, and the fixed routine column list from `config_json` (same idea as `snt_dhis2_population_transformation.ipynb`)." + "Validate Papermill inputs, then assign country/admin/product and fixed reporting-rate columns explicitly from `config_json`." ], "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" }, @@ -474,15 +468,6 @@ " dplyr::select(-ACTUAL_REPORTS_deduplicated)\n", "}\n", "\n", - "# Country-specific normalization for Niger where reports can exceed 1\n", - "if (COUNTRY_CODE == \"NER\") {\n", - " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", - " dplyr::mutate(\n", - " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", - " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", - " )\n", - "}\n", - "\n", "# 3.4 Aggregate at ADM2 and compute reporting rate\n", "reporting_rate_results <- dhis2_reporting_wide %>%\n", " dplyr::group_by(PERIOD, YEAR, MONTH, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID) %>%\n", @@ -1005,11 +990,14 @@ } }, "source": [ - "save_dataset_method_reporting_rate_csv_and_parquet(\n", - " reporting_rate_dataset,\n", - " snt_environment,\n", - " COUNTRY_CODE\n", - ")\n", + "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", + "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", + "csv_path <- file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\"))\n", + "parquet_path <- file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\"))\n", + "utils::write.csv(reporting_rate_dataset, csv_path, row.names = FALSE)\n", + "log_msg(glue::glue(\"Exported: {csv_path}\"))\n", + "arrow::write_parquet(reporting_rate_dataset, parquet_path)\n", + "log_msg(glue::glue(\"Exported: {parquet_path}\"))\n", "" ], "execution_count": null, diff --git 
a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r index 05f0822..5b11b62 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r +++ b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r @@ -1,6 +1,5 @@ # Load base utils -# Helpers are named so the dataset-method reporting notebook reads like a checklist -# (same idea as `snt_dhis2_reporting_rate_dataelement` utils). +# Keep helpers small and reusable; pipeline-specific assignments stay in notebook code. source(file.path("~/workspace/code", "snt_utils.r")) @@ -90,44 +89,6 @@ stop_if_dataset_reporting_papermill_params_missing <- function() { } -# Legacy alias. -assert_papermill_reporting_rate_dataset_params <- stop_if_dataset_reporting_papermill_params_missing - - -#' Country, admins, and product filter from `SNT_config.json` (dataset-method RR). -read_dataset_reporting_identity_from_config <- function(config_json) { - list( - COUNTRY_CODE = config_json$SNT_CONFIG$COUNTRY_CODE, - ADMIN_1 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1), - ADMIN_2 = toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2), - REPORTING_RATE_PRODUCT_ID = config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID - ) -} - - -#' Column names kept when trimming routine extracts to reporting-rate grain. -fixed_columns_for_dataset_reporting_rate_routine_slice <- function() { - c("YEAR", "MONTH", "ADM2_ID", "REPORTING_RATE") -} - - -#' Build the named settings list used by the dataset-method reporting-rate notebook. -#' -#' Calls `stop_if_dataset_reporting_papermill_params_missing()` first, then reads -#' country / admins / product UID and the fixed routine column list from `config_json`. 
-#' -#' @export -build_dataset_method_reporting_settings_from_config <- function(config_json) { - stop_if_dataset_reporting_papermill_params_missing() - id <- read_dataset_reporting_identity_from_config(config_json) - c(id, list(fixed_cols_rr = fixed_columns_for_dataset_reporting_rate_routine_slice())) -} - - -# Legacy alias (same as removed `parse_reporting_rate_dataset_snt_settings`). -parse_reporting_rate_dataset_snt_settings <- build_dataset_method_reporting_settings_from_config - - #' Load Dataset File from OpenHEXA #' #' @param dataset_id Character. OpenHEXA dataset identifier. @@ -152,20 +113,3 @@ load_dataset_file <- function(dataset_id, filename, verbose = TRUE) { } -#' Save final dataset-method reporting-rate table as CSV + Parquet under `data/dhis2/reporting_rate/`. -#' @export -save_dataset_method_reporting_rate_csv_and_parquet <- function(reporting_rate_tbl, snt_environment, country_code) { - output_dir <- file.path(snt_environment$DATA_PATH, "dhis2", "reporting_rate") - dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) - csv_path <- file.path(output_dir, paste0(country_code, "_reporting_rate_dataset.csv")) - pq_path <- file.path(output_dir, paste0(country_code, "_reporting_rate_dataset.parquet")) - utils::write.csv(reporting_rate_tbl, csv_path, row.names = FALSE) - log_msg(glue::glue("Exported: {csv_path}")) - arrow::write_parquet(reporting_rate_tbl, pq_path) - log_msg(glue::glue("Exported: {pq_path}")) - invisible(list(csv_path = csv_path, parquet_path = pq_path)) -} - - -# Legacy alias. 
-write_reporting_rate_dataset_outputs <- save_dataset_method_reporting_rate_csv_and_parquet From 65b914642de6ae27958fab980b0c36ab417f6b80 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Mon, 20 Apr 2026 23:40:11 +0200 Subject: [PATCH 12/18] should be it --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 15 +- .../snt_dhis2_reporting_rate_dataset.ipynb | 270 +++++++++--------- 2 files changed, 148 insertions(+), 137 deletions(-) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index d8a52a9..3aa4e02 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -1043,12 +1043,15 @@ "source": [ "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", - "csv_path <- file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\"))\n", - "parquet_path <- file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\"))\n", - "utils::write.csv(reporting_rate_dataelement, csv_path, row.names = FALSE)\n", - "log_msg(glue::glue(\"Exported: {csv_path}\"))\n", - "arrow::write_parquet(reporting_rate_dataelement, parquet_path)\n", - "log_msg(glue::glue(\"Exported: {parquet_path}\"))\n" + "\n", + "out_msg <- paste0(\"Reporting rate dataelement saved under: \", file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\")))\n", + "\n", + "# write parquet and csv files\n", + "write_parquet(reporting_rate_dataelement, file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.parquet\")))\n", + "write.csv(reporting_rate_dataelement, file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataelement.csv\")), 
row.names = FALSE)\n", + "\n", + "# log\n", + "log_msg(out_msg)\n" ] } ], diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index 478fb3f..a59c868 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "30bf8dfc", "metadata": {}, "source": [ "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", @@ -48,11 +49,11 @@ "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. Choose \"Routine data (Raw)\" to use raw routine data.\n", " \n", "- **Use routine with outliers removed**: Toggle this on to use the routine data after outliers have been removed (using the outliers detection method selected above). Else, this pipeline will use either the imputed routine data (to replace the outlier values removed) or the raw routine data if you selected \"Routine data (Raw)\" as your choice of “Outlier processing method”." - ], - "id": "30bf8dfc" + ] }, { "cell_type": "markdown", + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", "metadata": { "papermill": { "duration": 0.000092, @@ -65,11 +66,12 @@ }, "source": [ "## 1. 
Setup" - ], - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "35ede7cf-257f-439c-a514-26a7290f881d", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:21:50.332786Z", @@ -89,17 +91,15 @@ "languageId": "r" } }, + "outputs": [], "source": [ "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r\")\n", - "snt_environment <- get_setup_variables()\n", - "" - ], - "execution_count": null, - "outputs": [], - "id": "35ede7cf-257f-439c-a514-26a7290f881d" + "snt_environment <- get_setup_variables()\n" + ] }, { "cell_type": "markdown", + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", "metadata": { "papermill": { "duration": 0.00017, @@ -112,11 +112,12 @@ }, "source": [ "#### 1.1. Load and check `config_json` file" - ], - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:03.351367Z", @@ -136,15 +137,15 @@ "languageId": "r" } }, + "outputs": [], "source": [ "config_json <- load_snt_config(file.path(snt_environment$CONFIG_PATH, \"SNT_config.json\"))" - ], - "execution_count": null, - "outputs": [], - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:03.987632Z", @@ -164,6 +165,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "stop_if_dataset_reporting_papermill_params_missing()\n", "\n", @@ -171,15 +173,12 @@ "ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)\n", "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID\n", - "fixed_cols_rr <- c(\"YEAR\", \"MONTH\", \"ADM2_ID\", \"REPORTING_RATE\")\n", 
- "" - ], - "execution_count": null, - "outputs": [], - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82" + "fixed_cols_rr <- c(\"YEAR\", \"MONTH\", \"ADM2_ID\", \"REPORTING_RATE\")\n" + ] }, { "cell_type": "markdown", + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", "metadata": { "papermill": { "duration": 0.00015, @@ -193,11 +192,11 @@ "source": [ "#### 1.2. Config + Papermill\n", "Validate Papermill inputs, then assign country/admin/product and fixed reporting-rate columns explicitly from `config_json`." - ], - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" + ] }, { "cell_type": "markdown", + "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650", "metadata": { "papermill": { "duration": 0.000144, @@ -210,19 +209,20 @@ }, "source": [ "#### 1.3. 🔍 Check REPORTING_RATE_PRODUCT_ID is configured" - ], - "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650" + ] }, { "cell_type": "markdown", + "id": "682a62d5", "metadata": {}, "source": [ "### 🐍 This probably to be moved to pipeline.py code?" - ], - "id": "682a62d5" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "7469898d", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:04.047782Z", @@ -242,19 +242,18 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", "if (is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. \n", " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "7469898d" + ] }, { "cell_type": "markdown", + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", "metadata": { "papermill": { "duration": 0.000139, @@ -267,11 +266,11 @@ }, "source": [ "## 2. 
Load Data" - ], - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b" + ] }, { "cell_type": "markdown", + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", "metadata": { "papermill": { "duration": 0.000152, @@ -286,11 +285,12 @@ "### 2.1. Load routine data (DHIS2) \n", "Already formatted routine data, we use this as the master table
    \n", "(only used at the very end before exporting the table)" - ], - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "a1213723-f7e2-4238-9f37-f1795b187232", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:04.554212Z", @@ -310,6 +310,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "dhis2_routine <- load_dataset_file(DATASET_ID, ROUTINE_FILE)\n", "dhis2_routine <- dhis2_routine %>%\n", @@ -317,15 +318,12 @@ " dplyr::select(dplyr::any_of(fixed_cols_rr)) %>%\n", " dplyr::distinct()\n", "dim(dhis2_routine)\n", - "head(dhis2_routine, 3)\n", - "" - ], - "execution_count": null, - "outputs": [], - "id": "a1213723-f7e2-4238-9f37-f1795b187232" + "head(dhis2_routine, 3)\n" + ] }, { "cell_type": "markdown", + "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452", "metadata": { "papermill": { "duration": 0.000155, @@ -338,11 +336,12 @@ }, "source": [ "### 2.2. Load Reporting Rate data (DHIS2)" - ], - "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "0e352c76-f2fb-43ba-b85d-391d808057a8", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:05.780487Z", @@ -362,6 +361,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "formatting_dataset_id <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "reporting_parquet_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", @@ -369,15 +369,12 @@ "dhis2_reporting <- load_dataset_file(formatting_dataset_id, reporting_parquet_name)\n", "dhis2_reporting <- dhis2_reporting %>%\n", " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", - "head(dhis2_reporting, 3)\n", - "" - ], - "execution_count": null, - "outputs": [], - "id": "0e352c76-f2fb-43ba-b85d-391d808057a8" + "head(dhis2_reporting, 3)\n" + ] }, { "cell_type": "markdown", + "id": "4d5f398b", "metadata": { "papermill": { "duration": 0.000151, @@ -390,11 +387,11 @@ }, "source": [ 
"## 3. Transform reporting data" - ], - "id": "4d5f398b" + ] }, { "cell_type": "markdown", + "id": "adcbee0b", "metadata": { "papermill": { "duration": 0.0001, @@ -410,11 +407,12 @@ "Logic:\n", "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", "* If none provided (**empty** field) skip filtering and **keep everything**" - ], - "id": "adcbee0b" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "795a5e74", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:07.104617Z", @@ -434,6 +432,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 3.1 Filter Reporting Rate data by selected dataset PRODUCT_UID(s)\n", "if (length(REPORTING_RATE_PRODUCT_ID) > 0 && all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", @@ -468,6 +467,16 @@ " dplyr::select(-ACTUAL_REPORTS_deduplicated)\n", "}\n", "\n", + "# Country-specific normalization for Niger where reports can exceed 1\n", + "if (COUNTRY_CODE == \"NER\") {\n", + " log_msg(\"Special handling for NER: capping ACTUAL_REPORTS and EXPECTED_REPORTS values above 1.\")\n", + " dhis2_reporting_wide <- dhis2_reporting_wide %>%\n", + " dplyr::mutate(\n", + " ACTUAL_REPORTS = ifelse(ACTUAL_REPORTS > 1, 1, ACTUAL_REPORTS),\n", + " EXPECTED_REPORTS = ifelse(EXPECTED_REPORTS > 1, 1, EXPECTED_REPORTS)\n", + " )\n", + "}\n", + "\n", "# 3.4 Aggregate at ADM2 and compute reporting rate\n", "reporting_rate_results <- dhis2_reporting_wide %>%\n", " dplyr::group_by(PERIOD, YEAR, MONTH, ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID) %>%\n", @@ -477,13 +486,11 @@ " .groups = \"drop\"\n", " ) %>%\n", " dplyr::mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n" - ], - "execution_count": null, - "outputs": [], - "id": "795a5e74" + ] }, { "cell_type": "markdown", + "id": "4237408a", "metadata": { "papermill": { "duration": 0.000133, @@ -496,11 +503,12 @@ }, "source": [ "### 3.2. 
Pivot wider" - ], - "id": "4237408a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "5c3b9a65", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:08.413415Z", @@ -520,17 +528,16 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 3.2 Quick check after pivot\n", "dim(dhis2_reporting_wide)\n", "head(dhis2_reporting_wide, 3)\n" - ], - "execution_count": null, - "outputs": [], - "id": "5c3b9a65" + ] }, { "cell_type": "markdown", + "id": "0f485148", "metadata": { "papermill": { "duration": 0.000186, @@ -544,11 +551,11 @@ "source": [ "### 👯 Handle **duplicated** values (`OU_ID`)\n", "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. Namely, there should be no \"dupliacted\" orgunits that are counted in more than one dataset! Else, we would be **double counting**." - ], - "id": "0f485148" + ] }, { "cell_type": "markdown", + "id": "55dececa", "metadata": { "papermill": { "duration": 0.000122, @@ -561,11 +568,12 @@ }, "source": [ "#### Check for duplicated values (`OU_ID`)" - ], - "id": "55dececa" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "d761bd15", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:08.899486Z", @@ -585,17 +593,16 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Count duplicated OU_ID/PERIOD combinations found\n", "cat(glue::glue(\"Duplicated OU_ID-PERIOD rows detected: {nrow(dupl_ou_period)}\"))\n", "head(dupl_ou_period, 5)\n" - ], - "execution_count": null, - "outputs": [], - "id": "d761bd15" + ] }, { "cell_type": "markdown", + "id": "805ed555", "metadata": { "papermill": { "duration": 0.000139, @@ -616,11 +623,12 @@ "2. 
For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", " * if both same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", " * if else if different values, meaning that one dataset say 1 and the other 0 => keep 1 (facility _did_ report)" - ], - "id": "805ed555" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "593b013a", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:09.488856Z", @@ -640,6 +648,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Verify deduplication effect at OU_ID/PERIOD level\n", "dupl_after_cleaning <- dhis2_reporting_wide %>%\n", @@ -647,13 +656,12 @@ " dplyr::filter(dplyr::n() > 1) %>%\n", " dplyr::ungroup()\n", "cat(glue::glue(\"Remaining duplicated OU_ID-PERIOD rows after cleaning: {nrow(dupl_after_cleaning)}\"))\n" - ], - "execution_count": null, - "outputs": [], - "id": "593b013a" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "c72bd93a", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:13.581200Z", @@ -673,16 +681,15 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Optional inspection of cleaned rows\n", "head(dhis2_reporting_wide, 5)\n" - ], - "execution_count": null, - "outputs": [], - "id": "c72bd93a" + ] }, { "cell_type": "markdown", + "id": "2f26c614", "metadata": { "papermill": { "duration": 0.000236, @@ -695,12 +702,13 @@ }, "source": [ "### 3.3. ACTUAL / EXPECTED summaries after cleaning\n", - "Niger-specific capping (values > 1 set to 1) is applied in **step 3.1** above when `COUNTRY_CODE == \"NER\"`. This cell only prints `summary()` for quick QC on all countries." - ], - "id": "2f26c614" + "Niger-specific capping (values > 1 set to 1) is applied above when `COUNTRY_CODE == \"NER\"`. This cell only prints `summary()` for quick QC on all countries." 
+ ] }, { "cell_type": "code", + "execution_count": null, + "id": "4118991c", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:18.924306Z", @@ -720,16 +728,15 @@ "languageId": "r" } }, + "outputs": [], "source": [ "summary(dhis2_reporting_wide$ACTUAL_REPORTS)\n", "summary(dhis2_reporting_wide$EXPECTED_REPORTS)\n" - ], - "execution_count": null, - "outputs": [], - "id": "4118991c" + ] }, { "cell_type": "markdown", + "id": "066319a3", "metadata": { "papermill": { "duration": 0.000172, @@ -742,11 +749,12 @@ }, "source": [ "### 3.4. Aggregate at AMD2 level" - ], - "id": "066319a3" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "e94eeddd", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:19.494212Z", @@ -766,17 +774,16 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 3.4 Aggregate table preview\n", "dim(reporting_rate_results)\n", "head(reporting_rate_results, 5)\n" - ], - "execution_count": null, - "outputs": [], - "id": "e94eeddd" + ] }, { "cell_type": "markdown", + "id": "eb181891", "metadata": { "papermill": { "duration": 0.000151, @@ -791,11 +798,12 @@ "### 3.5. Calculate REPORTING_RATE\n", "**numerator**: `ACTUAL_REPORTS`
    \n", "**denominator**: `EXPECTED_REPORTS`" - ], - "id": "eb181891" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "e90a1c20", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:19.803233Z", @@ -815,16 +823,15 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# 3.5 Reporting rate range check\n", "summary(reporting_rate_results$REPORTING_RATE)\n" - ], - "execution_count": null, - "outputs": [], - "id": "e90a1c20" + ] }, { "cell_type": "markdown", + "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99", "metadata": { "papermill": { "duration": 0.000123, @@ -839,11 +846,12 @@ "### 3.6. Ensure consistency of table (probably can skip because all data comes from the same source!)\n", "Left join reporting indicators with DHIS2 routine data.\n", "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." - ], - "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:20.001909Z", @@ -863,6 +871,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "reporting_rate_dataset <- left_join(dhis2_routine, \n", " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", @@ -870,13 +879,11 @@ "\n", "print(dim(reporting_rate_dataset))\n", "head(reporting_rate_dataset, 3)" - ], - "execution_count": null, - "outputs": [], - "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f" + ] }, { "cell_type": "markdown", + "id": "6b19e88d", "metadata": { "papermill": { "duration": 0.000173, @@ -889,16 +896,18 @@ }, "source": [ "### 3.7. 
Final visual check on REPORTING_RATE values" - ], - "id": "6b19e88d" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "fbfec60f", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", @@ -910,13 +919,12 @@ " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. \n", " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", "}" - ], - "execution_count": null, - "outputs": [], - "id": "fbfec60f" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "8878192f", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:20.080475Z", @@ -936,6 +944,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "# Simple plot to visualize distribution of REPORTING_RATE\n", "ggplot(reporting_rate_dataset, aes(x=REPORTING_RATE)) +\n", @@ -945,13 +954,11 @@ " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", " ) +\n", " theme_minimal()" - ], - "execution_count": null, - "outputs": [], - "id": "8878192f" + ] }, { "cell_type": "markdown", + "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60", "metadata": { "papermill": { "duration": 0.000104, @@ -965,11 +972,12 @@ "source": [ "## 4. 📁 Export to `data/` folder\n", "Export as both .csv and .parquet file formats." 
- ], - "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "9adc033d-18d6-4786-8f96-21337b3e005f", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:21.467337Z", @@ -989,20 +997,20 @@ "languageId": "r" } }, + "outputs": [], "source": [ "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", - "csv_path <- file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\"))\n", - "parquet_path <- file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\"))\n", - "utils::write.csv(reporting_rate_dataset, csv_path, row.names = FALSE)\n", - "log_msg(glue::glue(\"Exported: {csv_path}\"))\n", - "arrow::write_parquet(reporting_rate_dataset, parquet_path)\n", - "log_msg(glue::glue(\"Exported: {parquet_path}\"))\n", - "" - ], - "execution_count": null, - "outputs": [], - "id": "9adc033d-18d6-4786-8f96-21337b3e005f" + "\n", + "out_msg <- paste0(\"Reporting rate dataset saved under: \", file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\")))\n", + "\n", + "# write parquet and csv files\n", + "write_parquet(reporting_rate_dataset, file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.parquet\")))\n", + "write.csv(reporting_rate_dataset, file.path(output_dir, paste0(COUNTRY_CODE, \"_reporting_rate_dataset.csv\")), row.names = FALSE)\n", + "\n", + "# log\n", + "log_msg(out_msg)\n" + ] } ], "metadata": { @@ -1037,4 +1045,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From 8a8bc8e45931233bff96454a7a466a64465e0504 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 21 Apr 2026 12:23:24 +0200 Subject: [PATCH 13/18] useless sort --- .../code/snt_dhis2_quality_of_care.ipynb | 40 ++----------------- 1 file changed, 4 insertions(+), 36 deletions(-) diff --git 
a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb index 3d650ea..f60ec58 100644 --- a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb @@ -79,46 +79,14 @@ " stop(glue::glue(\"Invalid data_action: {data_action}. Allowed: {paste(allowed_actions, collapse=', ')}\"))\n", "}\n", "\n", - "# Automatically find the latest routine outliers-imputed file in the dataset\n", - "# Pattern: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", - "log_msg(glue::glue(\"Searching for latest routine outliers-imputed file in dataset (data_action: {data_action})...\"))\n", + "routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers_{data_action}.parquet\")\n", + "log_msg(glue::glue(\"Loading routine file from latest dataset version: {routine_filename}\"))\n", "\n", - "dataset_last_version <- openhexa$workspace$get_dataset(OUTLIERS_DATASET)$latest_version\n", - "if (is.null(dataset_last_version)) {\n", - " stop(glue::glue(\"[ERROR] No version available in dataset `{OUTLIERS_DATASET}`. 
Process stopped.\"))\n", - "}\n", - "\n", - "# Pattern to match: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", - "pattern_prefix <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-\")\n", - "pattern_suffix <- glue::glue(\"_{data_action}.parquet\")\n", - "routine_filename <- NULL\n", - "files_list <- reticulate::iterate(dataset_last_version$files)\n", - "\n", - "# Find all matching files and select the latest one\n", - "matching_files <- c()\n", - "for (file in files_list) {\n", - " filename <- file$filename\n", - " if (startsWith(filename, pattern_prefix) && endsWith(filename, pattern_suffix)) {\n", - " matching_files <- c(matching_files, filename)\n", - " }\n", - "}\n", - "\n", - "if (length(matching_files) == 0) {\n", - " stop(glue::glue(\"[ERROR] No file matching pattern `{pattern_prefix}*{pattern_suffix}` found in dataset `{OUTLIERS_DATASET}`. \",\n", - " \"Please run an outlier imputation pipeline first (e.g., snt_dhis2_outliers_imputation_mean) with `data_action=\\\"{data_action}\\\"`.\"))\n", - "}\n", - "\n", - "# Select the latest file (alphabetically sorted, which should correspond to most recent method)\n", - "routine_filename <- sort(matching_files, decreasing = TRUE)[1]\n", - "\n", - "log_msg(glue::glue(\"Found {length(matching_files)} matching file(s). Using latest: {routine_filename}\"))\n", - "\n", - "# Load the routine file\n", "routine <- tryCatch({\n", " get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", "}, error = function(e) {\n", - " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file `\", routine_filename, \n", - " \"` from `\", OUTLIERS_DATASET, \"`. 
[ERROR DETAILS] \", conditionMessage(e))\n", + " msg <- paste0(\"[ERROR] Error while loading DHIS2 routine data file `\", routine_filename,\n", + " \"` from `\", OUTLIERS_DATASET, \"`: \", conditionMessage(e))\n", " stop(msg)\n", "})\n", "\n", From 8fc838ad77ae08f671522bdd4bcd5633bcb5ed47 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 22 Apr 2026 13:32:00 +0200 Subject: [PATCH 14/18] fix and move somestuff --- .../code/snt_dhis2_quality_of_care.ipynb | 546 +++-------- .../snt_dhis2_quality_of_care_report.ipynb | 892 +++++------------- .../utils/snt_dhis2_quality_of_care.r | 247 +++++ .../utils/snt_dhis2_quality_of_care_report.r | 125 +++ 4 files changed, 772 insertions(+), 1038 deletions(-) create mode 100644 pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r create mode 100644 pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r diff --git a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb index f60ec58..0a13620 100644 --- a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb @@ -1,398 +1,156 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "fad6c24e", - "metadata": {}, - "source": [ - "## Quality of Care Indicators\n", - "\n", - "Compute district-year quality-of-care indicators from DHIS2 outliers-imputed routine data.\n", - "\n", - "Indicators:\n", - "- testing_rate = TEST / SUSP\n", - "- treatment_rate = MALTREAT / CONF\n", - "- case_fatality_rate = MALDTH / MALADM\n", - "- prop_adm_malaria = MALADM / ALLADM\n", - "- prop_malaria_deaths = MALDTH / ALLDTH\n", - "- non_malaria_all_cause_outpatients = ALLOUT (absolute)\n", - "- presumed_cases = PRES (absolute)\n", - "\n", - "Stock-out indicators are not implemented yet (on hold, NMDR data pending)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "317c4085", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Preliminaries\n", - "options(scipen=999)\n", - "\n", - "ROOT_PATH <- \"~/workspace\"\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, \"dhis2\", \"quality_of_care\")\n", - "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", - "\n", - "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "required_packages <- c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"dplyr\")\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", - "\n", - "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98b78bf7", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quality of Care Indicators\n", + "\n", + "Compute district-year quality-of-care indicators from DHIS2 outliers-imputed routine data.\n", + "\n", + "Indicators:\n", + "- testing_rate = TEST / SUSP\n", + "- treatment_rate = MALTREAT / CONF\n", + "- case_fatality_rate = 
MALDTH / MALADM\n", + "- prop_adm_malaria = MALADM / ALLADM\n", + "- prop_malaria_deaths = MALDTH / ALLDTH\n", + "- non_malaria_all_cause_outpatients = ALLOUT (absolute)\n", + "- presumed_cases = PRES (absolute)\n", + "\n", + "Stock-out indicators are not implemented yet (on hold, NMDR data pending)." + ], + "id": "fad6c24e" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Preliminaries\n", + "options(scipen = 999)\n", + "\n", + "ROOT_PATH <- \"~/workspace\"\n", + "source(file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"utils\", \"snt_dhis2_quality_of_care.r\"))\n", + "\n", + "snt_environment <- get_setup_variables(\n", + " SNT_ROOT_PATH = ROOT_PATH,\n", + " packages = c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"dplyr\", \"writexl\", \"knitr\", \"scales\", \"gridExtra\")\n", + ")\n", + "config_json <- load_snt_config(snt_environment$CONFIG_PATH, \"SNT_config.json\")\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", + "\n", + "PIPELINE_PATH <- file.path(snt_environment$PIPELINES_PATH, \"snt_dhis2_quality_of_care\")\n", + "OUTPUT_DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"quality_of_care\")\n", + "REPORT_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", + "FIGURES_PATH <- file.path(REPORT_OUTPUTS_PATH, \"figures\")\n", + "\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)" + ], + "execution_count": null, + "outputs": [], + "id": "317c4085" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + 
"languageId": "r" + } + }, + "source": [ + "# Load and prepare inputs\n", + "if (!exists(\"data_action\")) {\n", + " data_action <- \"imputed\"\n", + "}\n", + "data_action <- validate_quality_of_care_action(data_action)\n", + "\n", + "log_msg(glue::glue(\"Using outliers dataset id: {OUTLIERS_DATASET}\"))\n", + "\n", + "routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers_{data_action}.parquet\")\n", + "log_msg(glue::glue(\"Using routine file: {routine_filename}\"))\n", + "\n", + "routine <- load_dataset_file(\n", + " dataset_id = OUTLIERS_DATASET,\n", + " filename = routine_filename\n", + ")\n", + "\n", + "shapes_filename <- glue::glue(\"{COUNTRY_CODE}_shapes.geojson\")\n", + "shapes <- load_dataset_file(\n", + " dataset_id = DHIS2_FORMATTED_DATASET,\n", + " filename = shapes_filename\n", + ")\n", + "\n", + "core_cols <- c(\"ADM2_ID\", \"YEAR\")\n", + "core_missing <- setdiff(core_cols, names(routine))\n", + "if (length(core_missing) > 0) {\n", + " stop(glue::glue(\"[ERROR] Missing core columns: {paste(core_missing, collapse = ', ')}\"))\n", + "}\n", + "\n", + "# Keep indicator list in notebook (pipeline logic), not hardcoded in utils functions.\n", + "indicator_cols <- c(\"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", + "missing_cols <- setdiff(indicator_cols, names(routine))\n", + "if (length(missing_cols) > 0) {\n", + " log_msg(glue::glue(\"[WARNING] Missing indicator columns: {paste(missing_cols, collapse = ', ')}\"), level = \"warning\")\n", + "}\n", + "\n", + "routine <- normalize_qoc_routine_types(routine, indicator_cols = indicator_cols)\n", + "qoc <- aggregate_qoc_district_year(routine, indicator_cols = indicator_cols)\n", + "qoc <- add_quality_of_care_derived_indicators(qoc)\n", + "qoc <- attach_quality_of_care_shapes(qoc, shapes)\n", + "\n", + "save_quality_of_care_outputs(\n", + " qoc_dt = qoc,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " country_code = 
COUNTRY_CODE,\n", + " data_action = data_action\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "98b78bf7" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Build yearly maps (saved as PNG)\n", + "save_quality_of_care_maps(\n", + " qoc_dt = qoc,\n", + " shapes_sf = shapes,\n", + " figures_path = FIGURES_PATH\n", + ")" + ], + "execution_count": null, + "outputs": [], + "id": "984689b0" } - }, - "outputs": [], - "source": [ - "# Validate data_action parameter\n", - "if (!exists(\"data_action\")) {\n", - " data_action <- \"imputed\"\n", - "}\n", - "\n", - "allowed_actions <- c(\"imputed\", \"removed\")\n", - "if (!(data_action %in% allowed_actions)) {\n", - " stop(glue::glue(\"Invalid data_action: {data_action}. Allowed: {paste(allowed_actions, collapse=', ')}\"))\n", - "}\n", - "\n", - "routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers_{data_action}.parquet\")\n", - "log_msg(glue::glue(\"Loading routine file from latest dataset version: {routine_filename}\"))\n", - "\n", - "routine <- tryCatch({\n", - " get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", - "}, error = function(e) {\n", - " msg <- paste0(\"[ERROR] Error while loading DHIS2 routine data file `\", routine_filename,\n", - " \"` from `\", OUTLIERS_DATASET, \"`: \", conditionMessage(e))\n", - " stop(msg)\n", - "})\n", - "\n", - "shapes <- get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", - "\n", - "setDT(routine)\n", - "\n", - "# Core required columns (must exist)\n", - "core_cols <- c(\"ADM2_ID\", \"YEAR\")\n", - "core_missing <- setdiff(core_cols, names(routine))\n", - "if (length(core_missing) > 0) {\n", - " stop(glue::glue(\"Missing core required columns in routine data: {paste(core_missing, collapse=', ')}\"))\n", - "}\n", - "\n", - "# Optional indicator columns (will be checked and handled gracefully)\n", - "indicator_cols <- 
c(\"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", - "available_cols <- intersect(indicator_cols, names(routine))\n", - "missing_cols <- setdiff(indicator_cols, names(routine))\n", - "\n", - "if (length(missing_cols) > 0) {\n", - " log_msg(glue::glue(\"[WARNING] Some indicator columns are missing: {paste(missing_cols, collapse=', ')}. These indicators will not be calculated.\"), level = \"warning\")\n", - "}\n", - "\n", - "# Convert available numeric columns\n", - "# Handle \"-\" and other non-numeric values by converting them to NA first\n", - "num_cols <- intersect(available_cols, names(routine))\n", - "if (length(num_cols) > 0) {\n", - " for (col in num_cols) {\n", - " # First convert to character to handle \"-\" strings, then replace with NA, then convert to numeric\n", - " col_vals <- as.character(routine[[col]])\n", - " col_vals[is.na(col_vals) | col_vals == \"\" | col_vals == \"-\"] <- NA_character_\n", - " routine[, (col) := as.numeric(col_vals)]\n", - " }\n", - "}\n", - "routine[, YEAR := as.integer(YEAR)]\n", - "routine[, ADM2_ID := as.character(ADM2_ID)]\n", - "\n", - "# Aggregate available columns only using lapply\n", - "if (length(available_cols) > 0) {\n", - " qoc <- routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), \n", - " .SDcols = available_cols, \n", - " by = .(ADM2_ID, YEAR)]\n", - "} else {\n", - " # If no indicator columns available, create empty structure\n", - " qoc <- routine[, .(ADM2_ID, YEAR)]\n", - " qoc <- unique(qoc)\n", - "}\n", - "\n", - "# Calculate indicators only if required columns are available\n", - "if (\"TEST\" %in% names(qoc) && \"SUSP\" %in% names(qoc)) {\n", - " qoc[, testing_rate := fifelse(SUSP > 0, TEST / SUSP, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate testing_rate: missing TEST or SUSP columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALTREAT\" %in% names(qoc) && \"CONF\" %in% names(qoc)) {\n", - " 
qoc[, treatment_rate := fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate treatment_rate: missing MALTREAT or CONF columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALDTH\" %in% names(qoc) && \"MALADM\" %in% names(qoc)) {\n", - " qoc[, case_fatality_rate := fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate case_fatality_rate: missing MALDTH or MALADM columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALADM\" %in% names(qoc) && \"ALLADM\" %in% names(qoc)) {\n", - " qoc[, prop_adm_malaria := fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate prop_adm_malaria: missing MALADM or ALLADM columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALDTH\" %in% names(qoc) && \"ALLDTH\" %in% names(qoc)) {\n", - " qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", - " # Compatibility alias to match historical notebook export naming\n", - " qoc[, prop_deaths_malaria := prop_malaria_deaths]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate prop_malaria_deaths: missing MALDTH or ALLDTH columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"ALLOUT\" %in% names(qoc)) {\n", - " qoc[, non_malaria_all_cause_outpatients := ALLOUT]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate non_malaria_all_cause_outpatients: missing ALLOUT column\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"PRES\" %in% names(qoc)) {\n", - " qoc[, presumed_cases := PRES]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate presumed_cases: missing PRES column\", level = \"warning\")\n", - "}\n", - "\n", - "shapes_dt <- as.data.table(sf::st_drop_geometry(shapes))\n", - "if (\"ADM2_ID\" %in% names(shapes_dt) && \"ADM2_NAME\" %in% names(shapes_dt)) {\n", - " shapes_dt[, ADM2_ID := as.character(ADM2_ID)]\n", - " qoc <- merge(qoc, 
unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = \"ADM2_ID\", all.x = TRUE)\n", - "}\n", - "\n", - "# Persist only district-year outputs (requested)\n", - "out_district_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.parquet\"))\n", - "out_district_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.csv\"))\n", - "\n", - "arrow::write_parquet(qoc, out_district_parquet)\n", - "data.table::fwrite(qoc, out_district_csv)\n", - "\n", - "log_msg(glue::glue(\"Saved outputs: {out_district_parquet}, {out_district_csv}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "984689b0", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [ - "# Yearly maps by ADM2\n", - "# Ensure ADM2_ID is character in both objects (do this once before the function)\n", - "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", - "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", - "\n", - "plot_yearly_map <- function(df, sf_shapes, value_col, title_prefix, filename_prefix, is_rate = TRUE) {\n", - " # Check if value_col exists in df\n", - " if (!(value_col %in% names(df))) {\n", - " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found in data. 
Skipping map generation.\"), level = \"warning\")\n", - " return(invisible(NULL))\n", - " }\n", - " \n", - " # Create a local copy of sf_shapes to avoid modifying the original\n", - " sf_shapes_local <- sf_shapes\n", - " if (!is.character(sf_shapes_local$ADM2_ID)) {\n", - " sf_shapes_local$ADM2_ID <- as.character(sf_shapes_local$ADM2_ID)\n", - " }\n", - " \n", - " years <- sort(unique(df$YEAR))\n", - " for (yr in years) {\n", - " df_y <- df[YEAR == yr]\n", - " \n", - " # Check if df_y has any rows\n", - " if (nrow(df_y) == 0) {\n", - " log_msg(glue::glue(\"[WARNING] No data for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Ensure ADM2_ID is character in df_y\n", - " df_y$ADM2_ID <- as.character(df_y$ADM2_ID)\n", - " \n", - " # Use dplyr::left_join for sf objects to preserve geometry (use local copy)\n", - " map_df <- dplyr::left_join(sf_shapes_local, df_y, by = \"ADM2_ID\")\n", - "\n", - " # Check if value_col exists in map_df after merge\n", - " if (!(value_col %in% names(map_df))) {\n", - " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found after merge for year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - "\n", - " vals <- map_df[[value_col]]\n", - " finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n", - " \n", - " # If no valid values, skip this map\n", - " if (length(finite_vals) == 0) {\n", - " log_msg(glue::glue(\"[WARNING] No valid values for '{value_col}' in year {yr}. 
Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - "\n", - " # Create cat column BEFORE creating the plot\n", - " cat_vals <- NULL\n", - " fill_palette <- NULL\n", - " \n", - " if (is_rate) {\n", - " # Create cat column with proper handling of NA values\n", - " cat_result <- tryCatch({\n", - " cat_vals <- cut(\n", - " vals,\n", - " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", - " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", - " include.lowest = TRUE\n", - " )\n", - " fill_palette <- \"YlOrRd\"\n", - " TRUE # Success\n", - " }, error = function(e) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", - " FALSE # Failure\n", - " })\n", - " if (!cat_result) {\n", - " next\n", - " }\n", - " } else {\n", - " cat_result <- tryCatch({\n", - " if (length(finite_vals) > 4) {\n", - " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE)))\n", - " if (length(br) < 2) {\n", - " cat_vals <- as.factor(rep(\"all\", nrow(map_df)))\n", - " } else {\n", - " cat_vals <- cut(vals, breaks = br, include.lowest = TRUE)\n", - " }\n", - " } else {\n", - " cat_vals <- as.factor(vals)\n", - " }\n", - " fill_palette <- \"Blues\"\n", - " TRUE # Success\n", - " }, error = function(e) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", - " FALSE # Failure\n", - " })\n", - " if (!cat_result) {\n", - " next\n", - " }\n", - " }\n", - " \n", - " # Check if cat_vals was created successfully\n", - " if (is.null(cat_vals) || length(cat_vals) != nrow(map_df)) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to create 'cat' column for '{value_col}' in year {yr}. 
Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Check if all values are NA (cut failed) - but allow some NA values\n", - " if (all(is.na(cat_vals))) {\n", - " log_msg(glue::glue(\"[WARNING] All 'cat' values are NA for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Add cat column using dplyr::mutate to ensure it's properly added to sf object\n", - " map_df <- dplyr::mutate(map_df, cat = as.factor(cat_vals))\n", - " \n", - " # Verify cat column exists before creating plot\n", - " if (!(\"cat\" %in% names(map_df))) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to add 'cat' column to map_df for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Create plot AFTER cat column is added\n", - " p <- ggplot(map_df) +\n", - " geom_sf(aes(fill = cat), color = \"grey60\", size = 0.1) +\n", - " scale_fill_brewer(palette = fill_palette, na.value = \"white\", drop = FALSE)\n", - "\n", - " p <- p +\n", - " theme_void() +\n", - " labs(\n", - " title = paste0(title_prefix, \" - \", yr),\n", - " fill = value_col,\n", - " caption = \"Source: SNT DHIS2 outliers-imputed routine data\"\n", - " ) +\n", - " theme(\n", - " legend.position = \"bottom\",\n", - " plot.title = element_text(face = \"bold\", size = 12)\n", - " )\n", - "\n", - " out_png <- file.path(FIGURES_PATH, glue::glue(\"{filename_prefix}_{yr}.png\"))\n", - " \n", - " # Try to save the plot, catch any errors\n", - " tryCatch({\n", - " ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = \"white\")\n", - " log_msg(glue::glue(\"Saved map: {out_png}\"))\n", - " }, error = function(e) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to save map for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", - " })\n", - " }\n", - "}\n", - "\n", - "# Plot only indicators that were calculated (columns exist)\n", - "if (\"testing_rate\" %in% names(qoc)) 
{\n", - " plot_yearly_map(qoc, shapes, \"testing_rate\", \"Testing rate (TEST / SUSP)\", \"testing_rate\", TRUE)\n", - "}\n", - "if (\"treatment_rate\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", \"treatment_rate\", TRUE)\n", - "}\n", - "if (\"case_fatality_rate\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", \"case_fatality_rate\", TRUE)\n", - "}\n", - "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", \"prop_adm_malaria\", TRUE)\n", - "}\n", - "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"prop_malaria_deaths\", \"Proportion of malaria deaths (MALDTH / ALLDTH)\", \"prop_malaria_deaths\", TRUE)\n", - "}\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", \"allout\", FALSE)\n", - "}\n", - "if (\"presumed_cases\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"presumed_cases\", \"Presumed cases (PRES)\", \"presumed_cases\", FALSE)\n", - "}\n", - "\n", - "log_msg(glue::glue(\"Saved yearly maps in: {FIGURES_PATH}\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb index 045eb65..5a96aa6 100644 --- 
a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb @@ -1,648 +1,252 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "7d246ae9", - "metadata": {}, - "source": [ - "## Quality of Care Report\n", - "\n", - "This report displays a compact year-level summary of quality-of-care indicators and points to generated map outputs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5eaa5bab", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "ROOT_PATH <- \"~/workspace\"\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\", \"dhis2\", \"quality_of_care\")\n", - "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", - "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "install_and_load(c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\", \"writexl\", \"ggplot2\", \"scales\", \"gridExtra\", \"sf\"))\n", - "\n", - "# Create output directories\n", - "REPORT_OUTPUTS_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\")\n", - "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", - "\n", - "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a8320f8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - 
"source": [ - "# Use district-year output file (latest action)\n", - "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_district_year_(imputed|removed)\\\\.parquet$\"), full.names = TRUE)\n", - "if (length(files) == 0) {\n", - " stop(glue::glue(\"No quality_of_care parquet found in {DATA_PATH}\"))\n", - "}\n", - "\n", - "latest_file <- files[which.max(file.info(files)$mtime)]\n", - "qoc <- as.data.table(arrow::read_parquet(latest_file))\n", - "\n", - "# Build summary table with only available columns\n", - "# Start with unique YEAR values\n", - "summary_tbl <- unique(qoc[, .(YEAR)])\n", - "\n", - "# Add rate indicators (mean) - merge one by one\n", - "if (\"testing_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(testing_rate = mean(testing_rate, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"treatment_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(treatment_rate = mean(treatment_rate, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"case_fatality_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "\n", - "# Add absolute indicators (sum)\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", 
- " qoc[, .(non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"presumed_cases\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(presumed_cases = sum(presumed_cases, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "\n", - "summary_tbl <- summary_tbl[order(YEAR)]\n", - "\n", - "# Explicitly list missing indicators so report is self-explanatory\n", - "expected_indicators <- c(\n", - " \"testing_rate\",\n", - " \"treatment_rate\",\n", - " \"case_fatality_rate\",\n", - " \"prop_adm_malaria\",\n", - " \"prop_malaria_deaths\",\n", - " \"non_malaria_all_cause_outpatients\",\n", - " \"presumed_cases\"\n", - ")\n", - "missing_indicators <- setdiff(expected_indicators, names(qoc))\n", - "if (length(missing_indicators) > 0) {\n", - " log_msg(glue::glue(\"[WARNING] Missing indicators in input file: {paste(missing_indicators, collapse=', ')}\"), level = \"warning\")\n", - " cat(glue::glue(\"\\nMissing indicators in this run: {paste(missing_indicators, collapse=', ')}\\n\"))\n", - " cat(\"Reason: required source columns are absent in the selected outliers file.\\n\")\n", - "}\n", - "\n", - "# Save summary data (parquet, csv, xlsx) - following other pipelines pattern\n", - "summary_parquet <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.parquet\"))\n", - "summary_csv <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.csv\"))\n", - "summary_xlsx <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.xlsx\"))\n", - "\n", - "# Save as parquet (primary format, like other pipelines)\n", - "arrow::write_parquet(summary_tbl, summary_parquet)\n", - "\n", - "# Save as csv and xlsx for compatibility\n", - "data.table::fwrite(summary_tbl, summary_csv)\n", - "writexl::write_xlsx(list(summary = 
as.data.frame(summary_tbl)), summary_xlsx)\n", - "\n", - "log_msg(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\"))\n", - "\n", - "knitr::kable(summary_tbl, caption = \"Quality of Care - Year-level summary\")\n", - "\n", - "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", - "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))\n", - "cat(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\\n\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3dc318ac", - "metadata": {}, - "source": [ - "## Graphs by Year" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e86bb0a", - "metadata": { - "vscode": { - "languageId": "r" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quality of Care Report\n", + "\n", + "This report displays a compact year-level summary of quality-of-care indicators and points to generated map outputs." + ], + "id": "7d246ae9" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "ROOT_PATH <- \"~/workspace\"\n", + "source(file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"utils\", \"snt_dhis2_quality_of_care_report.r\"))\n", + "\n", + "snt_environment <- get_setup_variables(\n", + " SNT_ROOT_PATH = ROOT_PATH,\n", + " packages = c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\", \"writexl\", \"ggplot2\", \"scales\", \"gridExtra\", \"sf\")\n", + ")\n", + "config_json <- load_snt_config(snt_environment$CONFIG_PATH, \"SNT_config.json\")\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "PIPELINE_PATH <- file.path(snt_environment$PIPELINES_PATH, \"snt_dhis2_quality_of_care\")\n", + "DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"quality_of_care\")\n", + "REPORT_OUTPUTS_PATH 
<- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", + "FIGURES_PATH <- file.path(REPORT_OUTPUTS_PATH, \"figures\")\n", + "dir.create(DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)" + ], + "execution_count": null, + "outputs": [], + "id": "5eaa5bab" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load latest district-year output and build summary\n", + "qoc_ctx <- load_latest_quality_of_care_output(DATA_PATH, COUNTRY_CODE)\n", + "qoc <- qoc_ctx$qoc\n", + "latest_file <- qoc_ctx$latest_file\n", + "\n", + "summary_tbl <- build_quality_of_care_summary(qoc)\n", + "summary_paths <- save_quality_of_care_summary_outputs(\n", + " summary_tbl = summary_tbl,\n", + " report_outputs_path = REPORT_OUTPUTS_PATH,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "\n", + "knitr::kable(summary_tbl, caption = \"Quality of Care - Year-level summary\")\n", + "\n", + "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", + "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))\n", + "cat(glue::glue(\"Summary data saved to: {summary_paths$summary_parquet}, {summary_paths$summary_csv}, {summary_paths$summary_xlsx}\\n\"))" + ], + "execution_count": null, + "outputs": [], + "id": "1a8320f8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Graphs by Year" + ], + "id": "3dc318ac" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Build and save year-level indicator charts\n", + "charts_file <- save_quality_of_care_summary_charts(\n", + " summary_tbl = summary_tbl,\n", + " figures_path = FIGURES_PATH,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "\n", + "if (!is.null(charts_file)) {\n", + " cat(glue::glue(\"Combined charts saved: {charts_file}\\n\"))\n", + "} else {\n", + " 
cat(\"No chart produced (no indicator columns available).\\n\")\n", + "}" + ], + "execution_count": null, + "outputs": [], + "id": "0e86bb0a" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Maps by District and Year\n", + "\n", + "Maps are generated directly from the quality-of-care data and district shapes." + ], + "id": "3b625d36" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [ + "# Load shapes and regenerate yearly maps through shared utils\n", + "shapes_filename <- glue::glue(\"{COUNTRY_CODE}_shapes.geojson\")\n", + "shapes <- load_dataset_file(\n", + " dataset_id = DHIS2_FORMATTED_DATASET,\n", + " filename = shapes_filename\n", + ")\n", + "\n", + "save_quality_of_care_maps(\n", + " qoc_dt = qoc,\n", + " shapes_sf = shapes,\n", + " figures_path = FIGURES_PATH\n", + ")\n", + "\n", + "cat(glue::glue(\"Yearly maps saved in: {FIGURES_PATH}\\n\"))" + ], + "execution_count": null, + "outputs": [], + "id": "6056a979" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "5b31e4c8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [], + "id": "8229c37e" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "07324c1c" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [], + "id": "7c084da7" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "c9f52975" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "006866ce" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [], + "id": "f7225165" + }, + { + "cell_type": "code", + 
"metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "420ed27f" + }, + { + "cell_type": "code", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "source": [], + "execution_count": null, + "outputs": [], + "id": "67ddb838" } - }, - "outputs": [], - "source": [ - "# Create bar charts by year (same as original notebook - 4x2 grid layout)\n", - "# Prepare data - convert rates to percentages\n", - "plot_data <- copy(summary_tbl)\n", - "\n", - "# Create the same 4x2 subplot layout as original notebook\n", - "if (nrow(plot_data) > 0) {\n", - " # Create a list to store individual plots (in order: 4x2 grid)\n", - " plots_list <- list()\n", - " \n", - " # Row 0, Col 0: Testing rate\n", - " if (\"testing_rate\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = testing_rate * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(testing_rate * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Testing rate (TEST / SUSP)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"testing_rate\"]] <- p\n", - " }\n", - " \n", - " # Row 0, Col 1: Treatment rate\n", - " if (\"treatment_rate\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = treatment_rate * 100)) +\n", - " geom_bar(stat = 
\"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(treatment_rate * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Treatment rate (MALTREAT / CONF)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"treatment_rate\"]] <- p\n", - " }\n", - " \n", - " # Row 1, Col 0: Case fatality rate\n", - " if (\"case_fatality_rate\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = case_fatality_rate * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(case_fatality_rate * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Case fatality rate (MALDTH / MALADM)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"case_fatality_rate\"]] <- p\n", - " }\n", - " \n", - " # Row 1, Col 
1: Proportion admissions malaria\n", - " if (\"prop_adm_malaria\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = prop_adm_malaria * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(prop_adm_malaria * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Prop. admissions paludisme (MALADM / ALLADM)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"prop_adm_malaria\"]] <- p\n", - " }\n", - " \n", - " # Row 2, Col 0: Proportion deaths malaria\n", - " if (\"prop_malaria_deaths\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = prop_malaria_deaths * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(prop_malaria_deaths * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Prop. 
décès paludisme (MALDTH / ALLDTH)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"prop_malaria_deaths\"]] <- p\n", - " }\n", - " \n", - " # Row 2, Col 1: Presumed cases (absolute)\n", - " if (\"presumed_cases\" %in% names(plot_data)) {\n", - " format_label <- function(v) {\n", - " ifelse(is.na(v) | v == 0, \"0\",\n", - " ifelse(v >= 1e6, paste0(round(v/1e6, 2), \"M\"),\n", - " format(round(v), big.mark = \" \", scientific = FALSE)\n", - " )\n", - " )\n", - " }\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = presumed_cases)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = format_label(presumed_cases)), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Cas présumés (PRES)\", x = \"Année\", y = \"Nombre\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"presumed_cases\"]] <- p\n", - " }\n", - " \n", - " # Row 
3, Col 0: Non-malaria all-cause outpatients (absolute)\n", - " if (\"non_malaria_all_cause_outpatients\" %in% names(plot_data)) {\n", - " format_label <- function(v) {\n", - " ifelse(is.na(v) | v == 0, \"0\",\n", - " ifelse(v >= 1e6, paste0(round(v/1e6, 2), \"M\"),\n", - " format(round(v), big.mark = \" \", scientific = FALSE)\n", - " )\n", - " )\n", - " }\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = non_malaria_all_cause_outpatients)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = format_label(non_malaria_all_cause_outpatients)), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Consultations externes non-paludisme (ALLOUT)\", x = \"Année\", y = \"Nombre\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"non_malaria_all_cause_outpatients\"]] <- p\n", - " }\n", - " \n", - " # Create and display combined plot (dynamic grid for readability)\n", - " if (length(plots_list) > 0) {\n", - " # Order plots as in original\n", - " plot_order <- c(\"testing_rate\", \"treatment_rate\", \"case_fatality_rate\", \"prop_adm_malaria\", \n", - " \"prop_malaria_deaths\", \"presumed_cases\", \"non_malaria_all_cause_outpatients\")\n", - " available_plots <- plots_list[intersect(plot_order, names(plots_list))]\n", - "\n", - " if (length(available_plots) > 0) {\n", - " n_plots <- length(available_plots)\n", - " ncol_layout <- 2\n", - " nrow_layout <- 
ceiling(n_plots / ncol_layout)\n", - "\n", - " # Bigger display in report so labels are readable\n", - " options(repr.plot.width = 14, repr.plot.height = max(7, 4.8 * nrow_layout))\n", - "\n", - " combined_plot <- do.call(grid.arrange, c(available_plots, ncol = ncol_layout, nrow = nrow_layout))\n", - " print(combined_plot)\n", - "\n", - " # Save at larger size for presentation readability\n", - " combined_file <- file.path(FIGURES_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_by_year.png\"))\n", - " ggsave(\n", - " combined_file,\n", - " plot = combined_plot,\n", - " width = 18,\n", - " height = max(8, 5.2 * nrow_layout),\n", - " dpi = 300,\n", - " bg = \"white\",\n", - " units = \"in\"\n", - " )\n", - " log_msg(glue::glue(\"Combined bar charts saved: {combined_file}\"))\n", - " }\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3b625d36", - "metadata": {}, - "source": [ - "## Maps by District and Year\n", - "\n", - "Maps are generated directly from the quality-of-care data and district shapes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6056a979", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load shapes geojson from dataset (like seasonality pipeline)\n", - "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "shapes <- tryCatch({\n", - " get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", - "}, error = function(e) {\n", - " msg <- paste0(\"Error while loading DHIS2 Shapes data for: \", COUNTRY_CODE, \". 
\", conditionMessage(e))\n", - " log_msg(msg, level = \"error\")\n", - " stop(msg)\n", - "})\n", - "\n", - "# Ensure ADM2_ID is character in both datasets\n", - "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", - "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", - "\n", - "# Merge shapes with quality-of-care data\n", - "qoc_sf <- shapes %>%\n", - " dplyr::left_join(qoc, by = \"ADM2_ID\")\n", - "\n", - "# Helper to build readable interval labels for legends\n", - "format_interval_labels <- function(breaks_vec) {\n", - " labels <- c()\n", - " for (i in seq_len(length(breaks_vec) - 1)) {\n", - " a <- breaks_vec[i]\n", - " b <- breaks_vec[i + 1]\n", - " labels <- c(labels, paste0(scales::comma(round(a)), \" - \", scales::comma(round(b))))\n", - " }\n", - " labels\n", - "}\n", - "\n", - "# Function to plot yearly maps (similar to code notebook but inline in report)\n", - "plot_yearly_map_report <- function(sf_data, value_col, title_prefix, is_rate = TRUE) {\n", - " if (!(value_col %in% names(sf_data))) {\n", - " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found. Skipping map generation.\"), level = \"warning\")\n", - " return(invisible(NULL))\n", - " }\n", - " \n", - " years <- sort(unique(sf_data$YEAR[!is.na(sf_data$YEAR)]))\n", - " if (length(years) == 0) {\n", - " log_msg(glue::glue(\"[WARNING] No valid years for '{value_col}'. 
Skipping map.\"), level = \"warning\")\n", - " return(invisible(NULL))\n", - " }\n", - " \n", - " # Create plots for each year\n", - " plot_list <- list()\n", - " base_shapes <- sf_data %>% dplyr::select(ADM2_ID, geometry) %>% dplyr::distinct()\n", - "\n", - " for (yr in years) {\n", - " # Keep all districts on map, then join year values\n", - " year_vals <- sf_data[sf_data$YEAR == yr, c(\"ADM2_ID\", value_col), drop = FALSE]\n", - " year_vals <- sf::st_drop_geometry(year_vals)\n", - " year_vals <- year_vals[!duplicated(year_vals$ADM2_ID), , drop = FALSE]\n", - " sf_y <- dplyr::left_join(base_shapes, year_vals, by = \"ADM2_ID\")\n", - "\n", - " vals <- sf_y[[value_col]]\n", - " finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n", - "\n", - " if (length(finite_vals) == 0) {\n", - " next\n", - " }\n", - "\n", - " # Create categories\n", - " if (is_rate) {\n", - " cat_vals <- cut(\n", - " vals,\n", - " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", - " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", - " include.lowest = TRUE\n", - " )\n", - " fill_palette <- \"YlOrRd\"\n", - " } else {\n", - " # Use readable fixed-count classes for absolute values\n", - " n_classes <- 5\n", - " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, length.out = n_classes + 1), na.rm = TRUE)))\n", - " br <- sort(br)\n", - " if (length(br) < 2) {\n", - " br <- c(min(finite_vals, na.rm = TRUE), max(finite_vals, na.rm = TRUE) + 1)\n", - " }\n", - " if (length(unique(br)) < 2) {\n", - " cat_vals <- as.factor(rep(\"single value\", nrow(sf_y)))\n", - " } else {\n", - " labels_abs <- format_interval_labels(br)\n", - " cat_vals <- cut(vals, breaks = br, include.lowest = TRUE, labels = labels_abs)\n", - " }\n", - " fill_palette <- \"Blues\"\n", - " }\n", - "\n", - " sf_y$cat <- as.factor(cat_vals)\n", - "\n", - " p <- ggplot(sf_y) +\n", - " geom_sf(aes(fill = cat), color = \"grey60\", size = 0.12) +\n", - " 
scale_fill_brewer(palette = fill_palette, na.value = \"#f3f4f6\", drop = FALSE) +\n", - " theme_void() +\n", - " labs(\n", - " title = paste0(title_prefix, \" - \", yr),\n", - " fill = ifelse(is_rate, \"Rate class\", \"Value class\")\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 2, byrow = TRUE)) +\n", - " theme(\n", - " legend.position = \"bottom\",\n", - " legend.text = element_text(size = 9),\n", - " legend.title = element_text(size = 10, face = \"bold\"),\n", - " plot.title = element_text(face = \"bold\", size = 13)\n", - " )\n", - "\n", - " plot_list[[as.character(yr)]] <- p\n", - " }\n", - " \n", - " # Display all plots\n", - " if (length(plot_list) > 0) {\n", - " options(repr.plot.width = 10, repr.plot.height = 8)\n", - " for (yr_name in names(plot_list)) {\n", - " print(plot_list[[yr_name]])\n", - " }\n", - " }\n", - "}\n", - "\n", - "# Generate maps for each available indicator\n", - "cat(\"### Testing Rate\\n\")\n", - "if (\"testing_rate\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"testing_rate\", \"Testing rate (TEST / SUSP)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Treatment Rate\\n\")\n", - "if (\"treatment_rate\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Case Fatality Rate\\n\")\n", - "if (\"case_fatality_rate\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Proportion Admissions Malaria\\n\")\n", - "if (\"prop_adm_malaria\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Proportion Malaria Deaths\\n\")\n", - "if (\"prop_malaria_deaths\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"prop_malaria_deaths\", \"Proportion of 
malaria deaths (MALDTH / ALLDTH)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Non-malaria All-cause Outpatients\\n\")\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", FALSE)\n", - "}\n", - "\n", - "cat(\"\\n### Presumed Cases\\n\")\n", - "if (\"presumed_cases\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"presumed_cases\", \"Presumed cases (PRES)\", FALSE)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b31e4c8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "8229c37e", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "07324c1c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "7c084da7", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9f52975", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "006866ce", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "f7225165", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "420ed27f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67ddb838", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + 
"name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r new file mode 100644 index 0000000..647b8fd --- /dev/null +++ b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r @@ -0,0 +1,247 @@ +# Quality of Care shared R helpers. +# +# This file is the module used by the QoC *pipeline code* notebook: +# - bootstrap paths + install/load packages +# - config + dataset loading +# - map export utility used by both the pipeline and reporting notebooks +# - compute helpers for district-year outputs +# +# Reporting-only helpers live in: +# - snt_dhis2_quality_of_care_report.r + +# Load shared SNT helpers. +source(file.path("~/workspace", "code", "snt_utils.r")) + + +#' Bootstrap context for Quality of Care notebooks. +#' +#' Returns base workspace paths only. 
+get_setup_variables <- function(
+    SNT_ROOT_PATH = "~/workspace",
+    packages = c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate")
+) {
+  install_and_load(packages)
+
+  # Point reticulate at the conda interpreter and expose the OpenHEXA SDK as
+  # the global `openhexa` object (checked later by `load_dataset_file()`).
+  Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
+  # NOTE(review): the returned path is discarded; presumably this call is only
+  # here to force Python initialisation -- confirm.
+  reticulate::py_config()$python
+  assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv)
+
+  paths_to_check <- list(
+    CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"),
+    UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"),
+    DATA_PATH = file.path(SNT_ROOT_PATH, "data"),
+    PIPELINES_PATH = file.path(SNT_ROOT_PATH, "pipelines")
+  )
+
+  # Create any missing workspace folder; existing folders are left untouched.
+  for (target_path in paths_to_check) {
+    dir.create(target_path, recursive = TRUE, showWarnings = FALSE)
+  }
+
+  # Expose the paths both nested (`$paths_to_check`) and as top-level entries.
+  setup_variable <- c(
+    list(paths_to_check = paths_to_check),
+    paths_to_check
+  )
+
+  return(setup_variable)
+}
+
+
+#' Load SNT configuration file.
+#'
+#' @param config_path Folder that contains the configuration file.
+#' @param config_file_name File name of the JSON configuration.
+#' @return The parsed configuration as returned by `jsonlite::fromJSON()`.
+#'   Stops with an `[ERROR]` message (including the parser message) when the
+#'   file cannot be read.
+load_snt_config <- function(config_path, config_file_name = "SNT_config.json") {
+  config_file <- file.path(config_path, config_file_name)
+  config_json <- tryCatch(
+    {
+      jsonlite::fromJSON(config_file)
+    },
+    error = function(e) {
+      stop(glue::glue("[ERROR] Error while loading configuration from `{config_file}`: {conditionMessage(e)}"))
+    }
+  )
+  log_msg(paste0("SNT configuration loaded from: ", config_file))
+  return(config_json)
+}
+
+
+#' Load dataset file from OpenHEXA.
+#'
+#' @param dataset_id Identifier of the OpenHEXA dataset.
+#' @param filename Name of the file to read from the dataset.
+#' @param verbose When `TRUE`, log the dimensions of the loaded data.
+#' @return The object returned by `get_latest_dataset_file_in_memory()`.
+load_dataset_file <- function(dataset_id, filename, verbose = TRUE) {
+  if (!exists("openhexa", inherits = TRUE) || is.null(get("openhexa", inherits = TRUE))) {
+    stop("[ERROR] OpenHEXA SDK is not available. Run `get_setup_variables()` before loading dataset files.")
+  }
+
+  data <- tryCatch(
+    {
+      get_latest_dataset_file_in_memory(dataset_id, filename)
+    },
+    error = function(e) {
+      # Keep the underlying cause in the message; without it a missing file,
+      # a permission problem and a parsing failure all look the same.
+      stop(glue::glue("[ERROR] Error while loading {filename} file from dataset: {dataset_id}: {conditionMessage(e)}"))
+    }
+  )
+
+  if (verbose) {
+    log_msg(glue::glue(
+      "{filename} data loaded from dataset : {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]"
+    ))
+  }
+
+  return(data)
+}
+
+
+#' Generate and save yearly district maps for QoC indicators.
+#'
+#' This is used by both the pipeline notebook and the reporting notebook.
+#'
+#' One PNG per indicator and year is written to `figures_path` as
+#' `<filename_prefix>_<year>.png`. A failure on one year is logged as a warning
+#' and does not stop the remaining years or indicators.
+#'
+#' @param qoc_dt District-year table with `ADM2_ID`, `YEAR` and indicator columns.
+#' @param shapes_sf `sf` object with an `ADM2_ID` column.
+#' @param figures_path Folder where the PNG files are written.
+#' @return `TRUE`, invisibly.
+save_quality_of_care_maps <- function(qoc_dt, shapes_sf, figures_path) {
+  shapes_sf$ADM2_ID <- as.character(shapes_sf$ADM2_ID)
+  qoc_dt$ADM2_ID <- as.character(qoc_dt$ADM2_ID)
+
+  # Turn the raw values into the legend categories used for the fill scale.
+  categorise_map_values <- function(vals, is_rate) {
+    if (is_rate) {
+      cat_vals <- cut(
+        vals,
+        breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),
+        labels = c("<0", "0-0.2", "0.2-0.4", "0.4-0.6", "0.6-0.8", "0.8-1.0", ">1.0"),
+        include.lowest = TRUE
+      )
+      # `cut()` builds right-closed bins, so an exact 0 falls into (-Inf, 0]
+      # and would be shown as "<0". Move it to the first non-negative bin.
+      cat_vals[!is.na(vals) & vals == 0] <- "0-0.2"
+      return(cat_vals)
+    }
+
+    finite_vals <- vals[is.finite(vals) & !is.na(vals)]
+    if (length(finite_vals) <= 4) {
+      # Too few values for quintiles: one category per distinct value.
+      return(as.factor(vals))
+    }
+    br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE)))
+    if (length(br) < 2) {
+      # All finite values are identical, so no interval can be built.
+      return(as.factor(rep("all", length(vals))))
+    }
+    cut(vals, breaks = br, include.lowest = TRUE)
+  }
+
+  # Build and save the map of one indicator for one year. Kept as a separate
+  # function so that the early `return()`s only skip this year.
+  plot_single_year <- function(df, sf_shapes, yr, value_col, title_prefix, filename_prefix, is_rate) {
+    # `which()` keeps this working for data.frame, tibble and data.table input.
+    df_y <- df[which(df$YEAR == yr), ]
+    if (nrow(df_y) == 0) return(invisible(NULL))
+    df_y$ADM2_ID <- as.character(df_y$ADM2_ID)
+    map_df <- dplyr::left_join(sf_shapes, df_y, by = "ADM2_ID")
+    if (!(value_col %in% names(map_df))) return(invisible(NULL))
+
+    vals <- map_df[[value_col]]
+    # Nothing to draw when every value is NA/NaN/Inf.
+    if (!any(is.finite(vals))) return(invisible(NULL))
+
+    cat_vals <- categorise_map_values(vals, is_rate)
+    fill_palette <- if (is_rate) "YlOrRd" else "Blues"
+
+    map_df <- dplyr::mutate(map_df, cat = as.factor(cat_vals))
+    p <- ggplot2::ggplot(map_df) +
+      ggplot2::geom_sf(ggplot2::aes(fill = cat), color = "grey60", size = 0.1) +
+      ggplot2::scale_fill_brewer(palette = fill_palette, na.value = "white", drop = FALSE) +
+      ggplot2::theme_void() +
+      ggplot2::labs(title = paste0(title_prefix, " - ", yr), fill = value_col, caption = "Source: SNT DHIS2 outliers-imputed routine data") +
+      ggplot2::theme(legend.position = "bottom", plot.title = ggplot2::element_text(face = "bold", size = 12))
+
+    out_png <- file.path(figures_path, glue::glue("{filename_prefix}_{yr}.png"))
+    ggplot2::ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = "white")
+    log_msg(glue::glue("Saved map: {out_png}"))
+    invisible(out_png)
+  }
+
+  plot_yearly_map <- function(df, sf_shapes, value_col, title_prefix, filename_prefix, is_rate = TRUE) {
+    if (!(value_col %in% names(df))) return(invisible(NULL))
+
+    for (yr in sort(unique(df$YEAR))) {
+      # A `return()` written directly inside the `tryCatch()` expression would
+      # leave `plot_yearly_map()` itself and drop all later years; delegating
+      # to `plot_single_year()` confines the skip to the current year.
+      tryCatch(
+        plot_single_year(df, sf_shapes, yr, value_col, title_prefix, filename_prefix, is_rate),
+        error = function(e) {
+          log_msg(glue::glue("[WARNING] Failed to build/save map for `{value_col}` year `{yr}`: {conditionMessage(e)}"), level = "warning")
+        }
+      )
+    }
+    invisible(NULL)
+  }
+
+  # Indicator column, map title, output file prefix, and whether the values
+  # are rates (fixed bins) or counts (quantile bins).
+  map_specs <- list(
+    list(col = "testing_rate", title = "Testing rate (TEST / SUSP)", file = "testing_rate", is_rate = TRUE),
+    list(col = "treatment_rate", title = "Treatment rate (MALTREAT / CONF)", file = "treatment_rate", is_rate = TRUE),
+    list(col = "case_fatality_rate", title = "In-hospital case fatality rate (MALDTH / MALADM)", file = "case_fatality_rate", is_rate = TRUE),
+    list(col = "prop_adm_malaria", title = "Proportion admitted for malaria (MALADM / ALLADM)", file = "prop_adm_malaria", is_rate = TRUE),
+    list(col = "prop_malaria_deaths", title = "Proportion of malaria deaths (MALDTH / ALLDTH)", file = "prop_malaria_deaths", is_rate = TRUE),
+    list(col = "non_malaria_all_cause_outpatients", title = "Non-malaria all-cause outpatients (ALLOUT)", file = "allout", is_rate = FALSE),
+    list(col = "presumed_cases", title = "Presumed cases (PRES)", file = "presumed_cases", is_rate = FALSE)
+  )
+  # Indicators missing from `qoc_dt` are skipped inside `plot_yearly_map()`.
+  for (spec in map_specs) {
+    plot_yearly_map(qoc_dt, shapes_sf, spec$col, spec$title, spec$file, spec$is_rate)
+  }
+
+  log_msg(glue::glue("Saved yearly maps in: {figures_path}"))
+  invisible(TRUE)
+}
+
+
+#' Validate quality-of-care action parameter.
+#'
+#' @param data_action `"imputed"` or `"removed"`; `NULL` or `""` falls back to
+#'   `"imputed"`.
+#' @return The validated action string. Stops on any other value.
+validate_quality_of_care_action <- function(data_action) {
+  if (is.null(data_action) || !nzchar(data_action)) {
+    return("imputed")
+  }
+  allowed_actions <- c("imputed", "removed")
+  if (!(data_action %in% allowed_actions)) {
+    stop(glue::glue("[ERROR] Invalid data_action `{data_action}`. Allowed: {paste(allowed_actions, collapse = ', ')}"))
+  }
+  data_action
+}
+
+
+#' Normalize target indicator columns and keys in routine table.
+#'
+#' Converts `routine` to a data.table by reference, coerces the indicator
+#' columns that are present to numeric (empty strings and "-" become `NA`),
+#' and casts `YEAR` to integer and `ADM2_ID` to character when they exist.
+#'
+#' @param routine Routine data table (modified by reference).
+#' @param indicator_cols Names of the indicator columns to coerce.
+#' @return `routine`, as a data.table.
+normalize_qoc_routine_types <- function(routine, indicator_cols) {
+  data.table::setDT(routine)
+  indicator_cols <- as.character(indicator_cols)
+  available_cols <- intersect(indicator_cols, names(routine))
+
+  for (col in available_cols) {
+    col_vals <- as.character(routine[[col]])
+    col_vals[is.na(col_vals) | col_vals == "" | col_vals == "-"] <- NA_character_
+    # Anything that is still not parseable becomes NA without a warning.
+    routine[, (col) := suppressWarnings(as.numeric(col_vals))]
+  }
+
+  if ("YEAR" %in% names(routine)) routine[, YEAR := as.integer(YEAR)]
+  if ("ADM2_ID" %in% names(routine)) routine[, ADM2_ID := as.character(ADM2_ID)]
+  routine
+}
+
+
+#' Aggregate QoC routine indicators by district and year.
+#'
+#' @param routine data.table of routine data.
+#' @param indicator_cols Indicator columns to sum (missing ones are ignored).
+#' @param group_cols Grouping columns.
+#' @return One row per group with the summed indicators (`NA`s ignored), or the
+#'   distinct groups when none of the indicator columns is present.
+aggregate_qoc_district_year <- function(routine, indicator_cols, group_cols = c("ADM2_ID", "YEAR")) {
+  group_cols <- as.character(group_cols)
+  indicator_cols <- as.character(indicator_cols)
+  available_cols <- intersect(indicator_cols, names(routine))
+
+  if (length(available_cols) > 0) {
+    routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), .SDcols = available_cols, by = group_cols]
+  } else {
+    unique(routine[, ..group_cols])
+  }
+}
+
+
+#' Add derived quality-of-care indicators to aggregated district-year data.
+add_quality_of_care_derived_indicators <- function(qoc) {
+  # TRUE when every column named in `cols` exists in `qoc`.
+  has_cols <- function(cols) all(cols %in% names(qoc))
+
+  # Each ratio is added by reference and is NA wherever its denominator is
+  # not strictly positive.
+  if (has_cols(c("TEST", "SUSP"))) {
+    qoc[, testing_rate := data.table::fifelse(SUSP > 0, TEST / SUSP, NA_real_)]
+  }
+  if (has_cols(c("MALTREAT", "CONF"))) {
+    qoc[, treatment_rate := data.table::fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]
+  }
+  if (has_cols(c("MALDTH", "MALADM"))) {
+    qoc[, case_fatality_rate := data.table::fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)]
+  }
+  if (has_cols(c("MALADM", "ALLADM"))) {
+    qoc[, prop_adm_malaria := data.table::fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]
+  }
+  if (has_cols(c("MALDTH", "ALLDTH"))) {
+    qoc[, prop_malaria_deaths := data.table::fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]
+    # Same values under a second column name.
+    qoc[, prop_deaths_malaria := prop_malaria_deaths]
+  }
+
+  # Plain copies of the raw counts under descriptive names.
+  if (has_cols("ALLOUT")) {
+    qoc[, non_malaria_all_cause_outpatients := ALLOUT]
+  }
+  if (has_cols("PRES")) {
+    qoc[, presumed_cases := PRES]
+  }
+
+  qoc
+}
+
+
+#' Merge ADM2 labels into Quality of Care outputs.
+#'
+#' @param qoc_dt District-year table keyed by `ADM2_ID`.
+#' @param shapes_sf `sf` object; its attribute table is used as label source.
+#' @return `qoc_dt` left-joined with `ADM2_NAME`, or `qoc_dt` unchanged when the
+#'   shapes lack `ADM2_ID` or `ADM2_NAME`.
+attach_quality_of_care_shapes <- function(qoc_dt, shapes_sf) {
+  shape_attrs <- data.table::as.data.table(sf::st_drop_geometry(shapes_sf))
+
+  # Without both label columns there is nothing to attach.
+  if (!all(c("ADM2_ID", "ADM2_NAME") %in% names(shape_attrs))) {
+    return(qoc_dt)
+  }
+
+  shape_attrs[, ADM2_ID := as.character(ADM2_ID)]
+  adm2_labels <- unique(shape_attrs[, .(ADM2_ID, ADM2_NAME)])
+  merge(qoc_dt, adm2_labels, by = "ADM2_ID", all.x = TRUE)
+}
+
+
+#' Save district-year Quality of Care outputs.
+save_quality_of_care_outputs <- function(qoc_dt, output_data_path, country_code, data_action) {
+  # Both files share the stem
+  # `<country_code>_quality_of_care_district_year_<data_action>`.
+  out_district_parquet <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.parquet"))
+  out_district_csv <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.csv"))
+
+  arrow::write_parquet(qoc_dt, out_district_parquet)
+  data.table::fwrite(qoc_dt, out_district_csv)
+  log_msg(glue::glue("Saved outputs: {out_district_parquet}, {out_district_csv}"))
+
+  # Return the two paths that were written.
+  list(parquet = out_district_parquet, csv = out_district_csv)
+}
diff --git a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r
new file mode 100644
index 0000000..c8dd0e2
--- /dev/null
+++ b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r
@@ -0,0 +1,125 @@
+# Quality of Care **reporting** helpers (used by the reporting notebook).
+
+# Load the pipeline helper module from the workspace before defining the
+# reporting helpers below.
+source(
+  file.path(
+    "~/workspace",
+    "pipelines",
+    "snt_dhis2_quality_of_care",
+    "utils",
+    "snt_dhis2_quality_of_care.r"
+  )
+)
+
+
+#' Load latest Quality of Care district-year output.
+#'
+#' Looks in `output_data_path` for
+#' `<country_code>_quality_of_care_district_year_(imputed|removed).parquet`
+#' and reads the file with the most recent modification time.
+#'
+#' @param output_data_path Folder that holds the pipeline outputs.
+#' @param country_code Country prefix used in the file name.
+#' @return A list with `qoc` (the data as a data.table) and `latest_file`
+#'   (path of the file that was read). Stops when no matching file exists.
+load_latest_quality_of_care_output <- function(output_data_path, country_code) {
+  files <- list.files(output_data_path, pattern = paste0("^", country_code, "_quality_of_care_district_year_(imputed|removed)\\.parquet$"), full.names = TRUE)
+  if (length(files) == 0) {
+    stop(glue::glue("[ERROR] No quality_of_care parquet found in {output_data_path}"))
+  }
+  # When both the imputed and the removed variant exist, the newer file wins.
+  latest_file <- files[which.max(file.info(files)$mtime)]
+  qoc <- data.table::as.data.table(arrow::read_parquet(latest_file))
+  list(qoc = qoc, latest_file = latest_file)
+}
+
+
+#' Build year-level Quality of Care summary table.
+build_quality_of_care_summary <- function(qoc_dt) {
+  has_col <- function(col) col %in% names(qoc_dt)
+  add_part <- function(parts, yearly_dt) c(parts, list(yearly_dt))
+
+  # The first element is the YEAR skeleton; every indicator that exists in
+  # `qoc_dt` contributes one per-year table. Rates are averaged across
+  # districts, counts are summed; NAs are ignored in both cases.
+  yearly_parts <- list(unique(qoc_dt[, .(YEAR)]))
+
+  if (has_col("testing_rate")) {
+    yearly_parts <- add_part(yearly_parts, qoc_dt[, .(testing_rate = mean(testing_rate, na.rm = TRUE)), by = .(YEAR)])
+  }
+  if (has_col("treatment_rate")) {
+    yearly_parts <- add_part(yearly_parts, qoc_dt[, .(treatment_rate = mean(treatment_rate, na.rm = TRUE)), by = .(YEAR)])
+  }
+  if (has_col("case_fatality_rate")) {
+    yearly_parts <- add_part(yearly_parts, qoc_dt[, .(case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE)), by = .(YEAR)])
+  }
+  if (has_col("prop_adm_malaria")) {
+    yearly_parts <- add_part(yearly_parts, qoc_dt[, .(prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE)), by = .(YEAR)])
+  }
+  if (has_col("prop_malaria_deaths")) {
+    yearly_parts <- add_part(yearly_parts, qoc_dt[, .(prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE)), by = .(YEAR)])
+  }
+  if (has_col("non_malaria_all_cause_outpatients")) {
+    yearly_parts <- add_part(yearly_parts, qoc_dt[, .(non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE)), by = .(YEAR)])
+  }
+  if (has_col("presumed_cases")) {
+    yearly_parts <- add_part(yearly_parts, qoc_dt[, .(presumed_cases = sum(presumed_cases, na.rm = TRUE)), by = .(YEAR)])
+  }
+
+  # Left-join the parts onto the skeleton one after the other.
+  summary_tbl <- Reduce(
+    function(acc, part) merge(acc, part, by = "YEAR", all.x = TRUE),
+    yearly_parts
+  )
+
+  summary_tbl[order(YEAR)]
+}
+
+
+#' Save year-level summary outputs for report consumption.
+save_quality_of_care_summary_outputs <- function(summary_tbl, report_outputs_path, country_code) {
+  # One file per format, all named `<country_code>_quality_of_care_summary.*`.
+  summary_parquet <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.parquet"))
+  summary_csv <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.csv"))
+  summary_xlsx <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.xlsx"))
+  written <- list(summary_parquet = summary_parquet, summary_csv = summary_csv, summary_xlsx = summary_xlsx)
+
+  arrow::write_parquet(summary_tbl, summary_parquet)
+  data.table::fwrite(summary_tbl, summary_csv)
+  writexl::write_xlsx(list(summary = as.data.frame(summary_tbl)), summary_xlsx)
+
+  log_msg(glue::glue("Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}"))
+  written
+}
+
+
+#' Build and save year-level bar chart panel for QoC indicators.
+#'
+#' @param summary_tbl Year-level summary table (one row per `YEAR`).
+#' @param figures_path Folder where the combined PNG is written.
+#' @param country_code Country prefix used in the file name.
+#' @return Path of the saved PNG, or `NULL` when the table is empty or holds
+#'   none of the known indicator columns.
+save_quality_of_care_summary_charts <- function(summary_tbl, figures_path, country_code) {
+  plot_data <- data.table::copy(summary_tbl)
+  if (nrow(plot_data) == 0) return(NULL)
+
+  # Bar chart shared by both panel types; only the mappings, the y-axis title
+  # and the y scale differ between percentage and count panels.
+  bar_panel <- function(bar_mapping, label_mapping, title_name, y_title, y_scale) {
+    ggplot2::ggplot(plot_data, bar_mapping) +
+      ggplot2::geom_bar(stat = "identity", fill = "#2563eb", color = "#1e40af", width = 0.7) +
+      ggplot2::geom_text(label_mapping, vjust = -0.5, size = 2.5) +
+      ggplot2::labs(title = title_name, x = "Annee", y = y_title) +
+      ggplot2::theme_minimal() +
+      ggplot2::theme(
+        plot.title = ggplot2::element_text(face = "bold", size = 10),
+        axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 9),
+        panel.grid.major.y = ggplot2::element_line(linetype = "dashed", color = scales::alpha("grey", 0.7)),
+        plot.background = ggplot2::element_rect(fill = "#fafafa", color = NA),
+        panel.background = ggplot2::element_rect(fill = "#fafafa", color = NA),
+        plot.margin = ggplot2::margin(5, 5, 5, 5)
+      ) +
+      y_scale
+  }
+
+  # Rates are stored as fractions and displayed as percentages.
+  make_pct_plot <- function(col_name, title_name) {
+    bar_panel(
+      ggplot2::aes(x = factor(YEAR), y = .data[[col_name]] * 100),
+      ggplot2::aes(label = paste0(round(.data[[col_name]] * 100, 1), "%")),
+      title_name,
+      "%",
+      ggplot2::scale_y_continuous(expand = ggplot2::expansion(mult = c(0, 0.1)))
+    )
+  }
+
+  # Counts are displayed as-is, with compact labels above one million.
+  make_abs_plot <- function(col_name, title_name) {
+    format_label <- function(v) {
+      ifelse(
+        is.na(v) | v == 0,
+        "0",
+        ifelse(v >= 1e6, paste0(round(v / 1e6, 2), "M"), format(round(v), big.mark = " ", scientific = FALSE))
+      )
+    }
+    bar_panel(
+      ggplot2::aes(x = factor(YEAR), y = .data[[col_name]]),
+      ggplot2::aes(label = format_label(.data[[col_name]])),
+      title_name,
+      "Nombre",
+      ggplot2::scale_y_continuous(labels = scales::comma, expand = ggplot2::expansion(mult = c(0, 0.1)))
+    )
+  }
+
+  # Panels in display order: column name -> builder and title.
+  panel_specs <- list(
+    testing_rate = list(build = make_pct_plot, title = "Testing rate (TEST / SUSP)"),
+    treatment_rate = list(build = make_pct_plot, title = "Treatment rate (MALTREAT / CONF)"),
+    case_fatality_rate = list(build = make_pct_plot, title = "Case fatality rate (MALDTH / MALADM)"),
+    prop_adm_malaria = list(build = make_pct_plot, title = "Prop. admissions paludisme (MALADM / ALLADM)"),
+    prop_malaria_deaths = list(build = make_pct_plot, title = "Prop. deces paludisme (MALDTH / ALLDTH)"),
+    presumed_cases = list(build = make_abs_plot, title = "Cas presumes (PRES)"),
+    non_malaria_all_cause_outpatients = list(build = make_abs_plot, title = "Consultations externes non-paludisme (ALLOUT)")
+  )
+
+  available_plots <- list()
+  for (col_name in names(panel_specs)) {
+    if (col_name %in% names(plot_data)) {
+      spec <- panel_specs[[col_name]]
+      available_plots[[col_name]] <- spec$build(col_name, spec$title)
+    }
+  }
+  if (length(available_plots) == 0) return(NULL)
+
+  # Two panels per row; the figure height grows with the number of rows.
+  ncol_layout <- 2
+  nrow_layout <- ceiling(length(available_plots) / ncol_layout)
+
+  combined_plot <- do.call(gridExtra::grid.arrange, c(available_plots, ncol = ncol_layout, nrow = nrow_layout))
+  out_file <- file.path(figures_path, glue::glue("{country_code}_quality_of_care_by_year.png"))
+  ggplot2::ggsave(out_file, plot = combined_plot, width = 18, height = max(8, 5.2 * nrow_layout), dpi = 300, bg = "white", units = "in")
+  log_msg(glue::glue("Combined bar charts saved: {out_file}"))
+  out_file
+}
From 74fc21a923e7984a7ebf9717f901b683f8ff7fd1 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 22 Apr 2026 13:39:43 +0200 Subject: [PATCH 15/18] comment --- .../utils/snt_dhis2_quality_of_care.r | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r index 647b8fd..9fba138 100644 --- a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r +++ b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r @@ -174,7 +174,7 @@ validate_quality_of_care_action <- function(data_action) { } -#' Normalize target indicator columns and keys in routine table.
+#' Coerce `indicator_cols` to numeric; YEAR and ADM2_ID types if those columns exist. normalize_qoc_routine_types <- function(routine, indicator_cols) { data.table::setDT(routine) indicator_cols <- as.character(indicator_cols) @@ -192,7 +192,7 @@ normalize_qoc_routine_types <- function(routine, indicator_cols) { } -#' Aggregate QoC routine indicators by district and year. +#' Sum `indicator_cols` by `group_cols` (default ADM2_ID, YEAR). aggregate_qoc_district_year <- function(routine, indicator_cols, group_cols = c("ADM2_ID", "YEAR")) { group_cols <- as.character(group_cols) indicator_cols <- as.character(indicator_cols) From 1b9faae1acb93f66b63bedbcd5e9be65450f6180 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 22 Apr 2026 14:11:23 +0200 Subject: [PATCH 16/18] docs(qoc): clarify pipeline notebook sources this module only Made-with: Cursor --- .../snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r index 9fba138..639f62b 100644 --- a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r +++ b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r @@ -8,6 +8,8 @@ # # Reporting-only helpers live in: # - snt_dhis2_quality_of_care_report.r +# +# Pipeline code notebook sources this file only (not a separate *_code.r). # Load shared SNT helpers. 
source(file.path("~/workspace", "code", "snt_utils.r")) From bae8d52c5ee735971fce082eb866dc460b95588f Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 28 Apr 2026 11:02:59 +0200 Subject: [PATCH 17/18] fix parameters --- ...snt_dhis2_reporting_rate_dataelement.ipynb | 307 +++++++++--------- .../snt_dhis2_reporting_rate_dataelement.r | 19 +- .../snt_dhis2_reporting_rate_dataset.ipynb | 234 ++++++------- .../utils/snt_dhis2_reporting_rate_dataset.r | 9 +- .../pipeline.py | 62 +++- snt_dhis2_reporting_rate_dataset/pipeline.py | 1 + 6 files changed, 337 insertions(+), 295 deletions(-) diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb index 3aa4e02..fca21e6 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65", "metadata": { "papermill": { "duration": 0.000173, @@ -36,11 +35,11 @@ "* **Output**: Reporting rate table aggregated at administrative level 2 with extensions csv and parquet saved to dataset **SNT_DHIS2_REPORTING_RATE**:\n", " * cols: YEAR, MONTH, ADM2_ID, REPORTING_RATE\n", " * Filename: `XXX_reporting_rate_dataelement.`" - ] + ], + "id": "6e8d006c-fd3d-4186-bc8f-b83fdf234e65" }, { "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", "metadata": { "papermill": { "duration": 0.000228, @@ -53,12 +52,11 @@ }, "source": [ "## 1. 
Setup" - ] + ], + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a" }, { "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", "metadata": { "papermill": { "duration": 63.150489, @@ -72,16 +70,17 @@ "languageId": "r" } }, - "outputs": [], "source": [ "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r\")\n", "snt_environment <- get_setup_variables()\n", "config_json <- load_snt_config(file.path(snt_environment$CONFIG_PATH, \"SNT_config.json\"))\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "35ede7cf-257f-439c-a514-26a7290f881d" }, { "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", "metadata": { "papermill": { "duration": 0.000095, @@ -94,12 +93,13 @@ }, "source": [ "### 1.1. Pipeline parameters\n", - "`ROUTINE_FILE` and `DATASET_ID` are injected by Papermill. Same layout as `snt_dhis2_population_transformation.ipynb`: define and assign variables explicitly in the notebook setup.\n" - ] + "Required parameters are injected by Papermill and validated in the notebook setup.\n", + "" + ], + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be" }, { "cell_type": "markdown", - "id": "a1b2c3d4-save-vars-md", "metadata": { "vscode": { "languageId": "markdown" @@ -108,12 +108,11 @@ "source": [ "#### Save variables\n", "Indicator lists and remaining fields used downstream (mirrors the population notebook block that assigns `COUNTRY_CODE`, `ADMIN_1`, … from `config_json`).\n" - ] + ], + "id": "a1b2c3d4-save-vars-md" }, { "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", "metadata": { "papermill": { "duration": 0.521572, @@ -127,7 +126,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "assert_papermill_dataelement_params()\n", "\n", @@ -136,32 +134,19 @@ "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", "DHIS2_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\", \"TEST\")\n", "\n", 
- "if (!exists(\"DATAELEMENT_METHOD_DENOMINATOR\", inherits = TRUE)) {\n", - " DATAELEMENT_METHOD_DENOMINATOR <- \"ROUTINE_ACTIVE_FACILITIES\"\n", - "}\n", - "if (!exists(\"USE_WEIGHTED_REPORTING_RATES\", inherits = TRUE)) {\n", - " USE_WEIGHTED_REPORTING_RATES <- FALSE\n", - "}\n", - "\n", - "if (exists(\"AVAILABILITY_INDICATORS\", inherits = TRUE)) {\n", - " ACTIVITY_INDICATORS <- as.character(unlist(AVAILABILITY_INDICATORS, use.names = FALSE))\n", - "} else {\n", - " ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\", \"SUSP\")\n", - "}\n", - "\n", - "if (exists(\"VOLUME_ACTIVITY_INDICATORS\", inherits = TRUE)) {\n", - " VOLUME_ACTIVITY_INDICATORS <- as.character(unlist(VOLUME_ACTIVITY_INDICATORS, use.names = FALSE))\n", - "} else {\n", - " VOLUME_ACTIVITY_INDICATORS <- c(\"CONF\", \"PRES\")\n", - "}\n", + "ACTIVITY_INDICATORS <- as.character(unlist(ACTIVITY_INDICATORS, use.names = FALSE))\n", + "VOLUME_ACTIVITY_INDICATORS <- as.character(unlist(VOLUME_ACTIVITY_INDICATORS, use.names = FALSE))\n", "\n", "fixed_cols <- c(\"PERIOD\", \"YEAR\", \"MONTH\", \"ADM1_ID\", \"ADM2_ID\", \"OU_ID\")\n", - "fixed_cols_rr <- c(\"YEAR\", \"MONTH\", \"ADM2_ID\", \"REPORTING_RATE\")\n" - ] + "fixed_cols_rr <- c(\"YEAR\", \"MONTH\", \"ADM2_ID\", \"REPORTING_RATE\")\n", + "" + ], + "execution_count": null, + "outputs": [], + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f" }, { "cell_type": "markdown", - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", "metadata": { "papermill": { "duration": 0.033003, @@ -178,25 +163,25 @@ "source": [ "### 1.2. 
Checks\n", "Validate activity-indicator selection before heavy joins.\n" - ] + ], + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82" }, { "cell_type": "code", - "execution_count": null, - "id": "8bf4a8bb", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "stop_if_activity_indicators_empty(ACTIVITY_INDICATORS)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "8bf4a8bb" }, { "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", "metadata": { "papermill": { "duration": 0.000093, @@ -209,11 +194,11 @@ }, "source": [ "## 2. Load Data" - ] + ], + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b" }, { "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", "metadata": { "papermill": { "duration": 0.000069, @@ -228,12 +213,11 @@ "### 2.1. Routine data (DHIS2) \n", "**Note on pipeline behaviour**:
    \n", "The value of `ROUTINE_FILE` is resolved within the pipeline.py code and injected into the notebook as parameter." - ] + ], + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532" }, { "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", "metadata": { "papermill": { "duration": 2.018878, @@ -247,18 +231,19 @@ "languageId": "r" } }, - "outputs": [], "source": [ "dhis2_routine <- load_dataset_file(DATASET_ID, ROUTINE_FILE)\n", "dhis2_routine <- dhis2_routine %>%\n", " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH), as.numeric))\n", "dim(dhis2_routine)\n", "head(dhis2_routine, 2)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "a1213723-f7e2-4238-9f37-f1795b187232" }, { "cell_type": "markdown", - "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a", "metadata": { "papermill": { "duration": 0.000138, @@ -271,12 +256,11 @@ }, "source": [ "### 2.2. Organisation units (DHIS2 pyramid)" - ] + ], + "id": "a8b91360-1a4e-4fc4-9883-602bc0ab2a2a" }, { "cell_type": "code", - "execution_count": null, - "id": "2fd92901-901e-4019-be78-a7718050c1c4", "metadata": { "papermill": { "duration": 0.992899, @@ -290,7 +274,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "dhis2_pyramid_formatted <- load_dataset_file(\n", " config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED,\n", @@ -298,11 +281,13 @@ ")\n", "dim(dhis2_pyramid_formatted)\n", "head(dhis2_pyramid_formatted, 2)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "2fd92901-901e-4019-be78-a7718050c1c4" }, { "cell_type": "markdown", - "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1", "metadata": { "papermill": { "duration": 0.000106, @@ -318,12 +303,11 @@ "Extra precaution measure to avoid breaks downstream.
    \n", "\n", "Note: This logic should be moved to pipeline.py 🐍" - ] + ], + "id": "2b7f4e50-3731-46bc-b7a7-2ef5317da9d1" }, { "cell_type": "code", - "execution_count": null, - "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c", "metadata": { "papermill": { "duration": 0.024863, @@ -337,7 +321,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "validate_required_columns(\n", " data = dhis2_routine,\n", @@ -351,11 +334,13 @@ " data_label = \"`dhis2_routine` (volume activity indicators)\",\n", " on_missing = \"error\"\n", ")\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "19ff7e56-2397-4ca1-b072-bca4ba1b3d0c" }, { "cell_type": "markdown", - "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295", "metadata": { "papermill": { "duration": 0.000091, @@ -368,20 +353,19 @@ }, "source": [ "## 3. Reporting rates computations" - ] + ], + "id": "bcbd3a9f-5e45-4ae5-8671-e23155236295" }, { "cell_type": "markdown", - "id": "7d62cdb6", "metadata": {}, "source": [ "#### 3.0. Define start and end period based on routine data " - ] + ], + "id": "7d62cdb6" }, { "cell_type": "code", - "execution_count": null, - "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a", "metadata": { "papermill": { "duration": 0.044172, @@ -395,18 +379,19 @@ "languageId": "r" } }, - "outputs": [], "source": [ "pv <- summarize_routine_period_range_as_month_vector(dhis2_routine)\n", "PERIOD_START <- pv$PERIOD_START\n", "PERIOD_END <- pv$PERIOD_END\n", "period_vector <- pv$period_vector\n", "log_msg(glue::glue(\"Routine period range: {PERIOD_START} to {PERIOD_END} ({length(period_vector)} months)\"))\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "3bc2e76a-b5c7-4c71-90f2-c66926ca560a" }, { "cell_type": "markdown", - "id": "526bc3af-01c1-4ddc-b3b9-077354e57559", "metadata": { "papermill": { "duration": 0.000109, @@ -420,12 +405,11 @@ "source": [ "#### 3.1. 
Build master table (all PERIOD x OU)\n", "The master table contains all combinations of period x organisation unit " - ] + ], + "id": "526bc3af-01c1-4ddc-b3b9-077354e57559" }, { "cell_type": "code", - "execution_count": null, - "id": "9308197a-0852-4d34-8888-cf5564f35a9d", "metadata": { "papermill": { "duration": 0.289128, @@ -439,7 +423,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "log_msg(glue(\"Building master table with periods from {PERIOD_START} to {PERIOD_END}. Periods count: {length(period_vector)}\"))\n", "facility_master <- build_facilities_crossed_with_monthly_periods(\n", @@ -449,11 +432,13 @@ " ADMIN_1 = ADMIN_1,\n", " ADMIN_2 = ADMIN_2\n", ")\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "9308197a-0852-4d34-8888-cf5564f35a9d" }, { "cell_type": "markdown", - "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558", "metadata": { "papermill": { "duration": 0.000114, @@ -468,18 +453,16 @@ "#### 3.2. Identify \"Active\" facilities\n", "\n", "Facilities **reporting** zero or positive values on any of the selected indicators (**\"Activity indicators\"**) are considered to be **active**. Note that this method only counts **non-null** (not `NA`s) to prevent counting empty submissions as valid reporting.\n" - ] + ], + "id": "d5af25ad-f17c-4cdc-ac96-908af49fe558" }, { "cell_type": "code", - "execution_count": null, - "id": "7b279d27", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Join routine values to the facility master and define monthly activity\n", "facility_master_routine <- dplyr::left_join(\n", @@ -495,11 +478,13 @@ " ),\n", " COUNT = 1\n", " )\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "7b279d27" }, { "cell_type": "markdown", - "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619", "metadata": { "papermill": { "duration": 0.000107, @@ -520,12 +505,11 @@ "2. The period falls within the facility’s opening and closing dates. 
The opening date is not after the reporting period, and the closing date is not before or equal to the reporting period.\n", "\n", "If either of these conditions is not met, the facility is considered not open (OPEN = 0) for that period." - ] + ], + "id": "89c3e5c8-4a4e-497d-9d75-2aed2e8fe619" }, { "cell_type": "code", - "execution_count": null, - "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89", "metadata": { "papermill": { "duration": 1.317923, @@ -539,7 +523,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 3.3 Identify OPEN facilities from naming and opening/closing dates\n", "facility_master_routine <- facility_master_routine %>%\n", @@ -550,19 +533,21 @@ " (!is.na(CLOSED_DATE) & as.Date(CLOSED_DATE) <= period_date)),\n", " OPEN = ifelse(!NAME_CLOSED & OPEN_BY_DATE, 1, 0)\n", " )\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "0b71f1d8-2048-4b62-865c-9acfe61b5b89" }, { "cell_type": "markdown", - "id": "657fd6ca", "metadata": {}, "source": [ "#### 3.4. Identify \"Active\" facilities for each YEAR (denominator)" - ] + ], + "id": "657fd6ca" }, { "cell_type": "markdown", - "id": "a598e4b7", "metadata": {}, "source": [ "
    \n", @@ -573,12 +558,11 @@ "
  • Without YEAR → “ever active over the entire extracted period”
  • \n", " \n", "
    " - ] + ], + "id": "a598e4b7" }, { "cell_type": "code", - "execution_count": null, - "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d", "metadata": { "papermill": { "duration": 0.173961, @@ -592,18 +576,19 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 3.4 Mark facilities active at least once per year\n", "facility_master_routine <- facility_master_routine %>%\n", " dplyr::group_by(OU_ID, YEAR) %>%\n", " dplyr::mutate(ACTIVE_THIS_YEAR = max(ACTIVE_THIS_PERIOD, na.rm = TRUE)) %>%\n", " dplyr::ungroup()\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "002e7fbf-1f68-4419-be2d-f16d8c72936d" }, { "cell_type": "markdown", - "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d", "metadata": { "papermill": { "duration": 0.000098, @@ -616,12 +601,11 @@ }, "source": [ "#### 3.5. Compute Weighting factor based on \"volume of activity\"" - ] + ], + "id": "160c08ec-cc9a-4e1a-99ec-f703db83a71d" }, { "cell_type": "code", - "execution_count": null, - "id": "4420e559-4134-4fc3-8950-9972ebede00e", "metadata": { "papermill": { "duration": 0.520673, @@ -635,7 +619,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 3.5 Compute facility weights from volume of activity\n", "mean_monthly_cases <- dhis2_routine %>%\n", @@ -661,11 +644,13 @@ "hf_weights <- mean_monthly_cases %>%\n", " dplyr::left_join(mean_monthly_cases_adm2, by = \"ADM2_ID\") %>%\n", " dplyr::mutate(WEIGHT = MEAN_REPORTED_CASES_BY_HF / SUMMED_MEAN_REPORTED_CASES_BY_ADM2 * NR_OF_HF)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "4420e559-4134-4fc3-8950-9972ebede00e" }, { "cell_type": "markdown", - "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3", "metadata": { "papermill": { "duration": 0.000108, @@ -678,12 +663,11 @@ }, "source": [ "#### 3.6. 
Compute Weighted variables" - ] + ], + "id": "2fed8529-70e9-4e2e-a498-fe3dd7499bb3" }, { "cell_type": "code", - "execution_count": null, - "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259", "metadata": { "papermill": { "duration": 0.483413, @@ -697,7 +681,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 3.6 Apply weights to monthly status variables\n", "facility_master_routine_02 <- facility_master_routine %>%\n", @@ -707,11 +690,13 @@ "facility_master_routine_02$COUNT_W <- facility_master_routine_02$COUNT * facility_master_routine_02$WEIGHT\n", "facility_master_routine_02$OPEN_W <- facility_master_routine_02$OPEN * facility_master_routine_02$WEIGHT\n", "facility_master_routine_02$ACTIVE_THIS_YEAR_W <- facility_master_routine_02$ACTIVE_THIS_YEAR * facility_master_routine_02$WEIGHT\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "216f7658-c1da-44e4-9f4f-fdb44fd40259" }, { "cell_type": "markdown", - "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02", "metadata": { "papermill": { "duration": 0.000172, @@ -724,18 +709,16 @@ }, "source": [ "#### 3.7. Aggregate data at ADM2 level" - ] + ], + "id": "9c0367f7-91cd-4524-abe4-11adf2fcea02" }, { "cell_type": "code", - "execution_count": null, - "id": "af13191e", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# 3.7 Aggregate monthly counts at ADM2 level\n", "reporting_rate_adm2 <- facility_master_routine_02 %>%\n", @@ -759,26 +742,26 @@ " RR_OPEN_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / NR_OF_OPEN_HF_BY_ADM2_WEIGHTED,\n", " RR_ACTIVE_HF_W = HF_ACTIVE_THIS_PERIOD_BY_ADM2_WEIGHTED / HF_ACTIVE_THIS_YEAR_BY_ADM2_WEIGHTED\n", " )\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "af13191e" }, { "cell_type": "markdown", - "id": "7d381937", "metadata": {}, "source": [ "#### 3.8. 
Calculate Reporting Rates (all methods)" - ] + ], + "id": "7d381937" }, { "cell_type": "code", - "execution_count": null, - "id": "b41263f8", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# 3.8 Select final reporting-rate definition for export\n", "rr_column_selection <- if (DATAELEMENT_METHOD_DENOMINATOR == \"ROUTINE_ACTIVE_FACILITIES\") \"RR_ACTIVE_HF\" else \"RR_OPEN_HF\"\n", @@ -790,11 +773,13 @@ " dplyr::mutate(MONTH = PERIOD %% 100) %>%\n", " dplyr::rename(REPORTING_RATE = !!rlang::sym(rr_column_selection)) %>%\n", " dplyr::select(YEAR, MONTH, ADM2_ID, REPORTING_RATE)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "b41263f8" }, { "cell_type": "markdown", - "id": "5e593659", "metadata": { "papermill": { "duration": 0.000108, @@ -807,11 +792,11 @@ }, "source": [ "## 4. Select correct col for `REPORTING_RATE` based on denominator method" - ] + ], + "id": "5e593659" }, { "cell_type": "markdown", - "id": "c75f2249", "metadata": { "papermill": { "duration": 0.000057, @@ -824,12 +809,11 @@ }, "source": [ "### 4.1. 
Select results and format" - ] + ], + "id": "c75f2249" }, { "cell_type": "code", - "execution_count": null, - "id": "75e71b38", "metadata": { "papermill": { "duration": 0.020644, @@ -843,18 +827,18 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 4.1 Confirm which denominator/weighting option was selected\n", "cat(glue::glue(\n", " \"Selected denominator method: {DATAELEMENT_METHOD_DENOMINATOR} | Weighted reporting rates: {USE_WEIGHTED_REPORTING_RATES}\"\n", "))\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "75e71b38" }, { "cell_type": "code", - "execution_count": null, - "id": "3df36abb", "metadata": { "papermill": { "duration": 0.140976, @@ -868,17 +852,17 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Output preview\n", "dim(reporting_rate_dataelement)\n", "head(reporting_rate_dataelement, 5)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "3df36abb" }, { "cell_type": "code", - "execution_count": null, - "id": "0ccc272c", "metadata": { "papermill": { "duration": 0.182574, @@ -892,16 +876,17 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Basic data quality checks\n", "summary(reporting_rate_dataelement$REPORTING_RATE)\n", "sum(is.na(reporting_rate_dataelement$REPORTING_RATE))\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "0ccc272c" }, { "cell_type": "markdown", - "id": "ca66e785", "metadata": { "papermill": { "duration": 0.000109, @@ -914,12 +899,11 @@ }, "source": [ "## 5. 
Inspect reporting rate values" - ] + ], + "id": "ca66e785" }, { "cell_type": "code", - "execution_count": null, - "id": "31535459", "metadata": { "papermill": { "duration": 0.160299, @@ -933,17 +917,17 @@ "languageId": "r" } }, - "outputs": [], "source": [ "hist(reporting_rate_dataelement$REPORTING_RATE, breaks=50, \n", "main=paste0(\"Histogram of REPORTING_RATE\\n(\", DATAELEMENT_METHOD_DENOMINATOR, \",\\n\", ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted\", \"Unweighted\"), \")\"), \n", "xlab=\"REPORTING_RATE\")" - ] + ], + "execution_count": null, + "outputs": [], + "id": "31535459" }, { "cell_type": "code", - "execution_count": null, - "id": "6778f17d", "metadata": { "papermill": { "duration": 0.896382, @@ -957,7 +941,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Boxplot\n", "ggplot(reporting_rate_dataelement,\n", @@ -970,12 +953,13 @@ " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", " ) +\n", " theme_minimal()" - ] + ], + "execution_count": null, + "outputs": [], + "id": "6778f17d" }, { "cell_type": "code", - "execution_count": null, - "id": "a7f013fd", "metadata": { "papermill": { "duration": 0.859448, @@ -989,7 +973,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "ggplot(reporting_rate_dataelement,\n", " aes(x = factor(YEAR), y = REPORTING_RATE)) +\n", @@ -1003,11 +986,13 @@ " subtitle = ifelse(USE_WEIGHTED_REPORTING_RATES, \"Weighted Reporting Rates\", \"Unweighted Reporting Rates\")\n", " ) +\n", " theme_minimal()" - ] + ], + "execution_count": null, + "outputs": [], + "id": "a7f013fd" }, { "cell_type": "markdown", - "id": "2866816a-7015-4c5c-b904-f553f3b4790d", "metadata": { "papermill": { "duration": 0.000088, @@ -1020,12 +1005,11 @@ }, "source": [ "## 5. 
📁 Export to `data/` folder" - ] + ], + "id": "2866816a-7015-4c5c-b904-f553f3b4790d" }, { "cell_type": "code", - "execution_count": null, - "id": "bbf27852-8ec5-4370-aae2-49e082928fe1", "metadata": { "papermill": { "duration": 0.919937, @@ -1039,7 +1023,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", @@ -1052,7 +1035,10 @@ "\n", "# log\n", "log_msg(out_msg)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "bbf27852-8ec5-4370-aae2-49e082928fe1" } ], "metadata": { @@ -1078,12 +1064,11 @@ "input_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/code/snt_dhis2_reporting_rate_dataelement.ipynb", "output_path": "/home/hexa/workspace/pipelines/snt_dhis2_reporting_rate_dataelement/papermill_outputs/snt_dhis2_reporting_rate_dataelement_OUTPUT_2026-01-16_102249.ipynb", "parameters": { - "AVAILABILITY_INDICATORS": [ + "ACTIVITY_INDICATORS": [ "CONF", - "PRES", - "SUSP", - "TEST" + "PRES" ], + "DATASET_ID": "DHIS2_OUTLIERS_IMPUTATION", "DATAELEMENT_METHOD_DENOMINATOR": "ROUTINE_ACTIVE_FACILITIES", "ROUTINE_FILE": "XXX_routine_outliers_removed.parquet", "SNT_ROOT_PATH": "/home/hexa/workspace", @@ -1099,4 +1084,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r index 21381a7..c75f279 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r +++ b/pipelines/snt_dhis2_reporting_rate_dataelement/utils/snt_dhis2_reporting_rate_dataelement.r @@ -1,11 +1,8 @@ # Load base utils -# Bootstrap matches `snt_dhis2_population_transformation`: fixed-path `source()` of this -# file, `snt_environment <- get_setup_variables()`, 
then `load_snt_config()`. -# Keep helpers small and reusable; pipeline-specific assignments stay in notebooks. source(file.path("~/workspace/code", "snt_utils.r")) -# JSON reader for this pipeline only (`snt_utils.r` must stay untouched per project rules). +# JSON reader for this pipeline. read_workspace_json_file <- function(json_path, resource_label = "JSON file") { json_path <- as.character(json_path)[[1L]] tryCatch( @@ -125,15 +122,21 @@ configure_conda_r_spatial_env <- function() { } -#' Fail if Papermill did not inject `ROUTINE_FILE` and `DATASET_ID`. +#' Fail if Papermill did not inject the required pipeline parameters. assert_papermill_dataelement_params <- function() { - required_pm <- c("ROUTINE_FILE", "DATASET_ID") + required_pm <- c( + "ROUTINE_FILE", + "DATASET_ID", + "DATAELEMENT_METHOD_DENOMINATOR", + "ACTIVITY_INDICATORS", + "VOLUME_ACTIVITY_INDICATORS", + "USE_WEIGHTED_REPORTING_RATES" + ) missing_pm <- required_pm[!vapply(required_pm, exists, logical(1), inherits = TRUE)] if (length(missing_pm) > 0) { stop( "[ERROR] Missing pipeline parameters (Papermill): ", - paste(missing_pm, collapse = ", "), - ". Expected only ROUTINE_FILE and DATASET_ID from `snt_dhis2_reporting_rate_dataelement`." + paste(missing_pm, collapse = ", ") ) } } diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb index a59c868..9450d74 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb +++ b/pipelines/snt_dhis2_reporting_rate_dataset/code/snt_dhis2_reporting_rate_dataset.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "30bf8dfc", "metadata": {}, "source": [ "# **Dataset Reporting Rate: Calculation Based on DHIS2 Extracted Data**\n", @@ -49,11 +48,11 @@ "- **Outliers detection method**: Specify which method was used to detect outliers in routine data. 
Choose \"Routine data (Raw)\" to use raw routine data.\n", " \n", "- **Use routine with outliers removed**: Toggle this on to use the routine data after outliers have been removed (using the outliers detection method selected above). Else, this pipeline will use either the imputed routine data (to replace the outlier values removed) or the raw routine data if you selected \"Routine data (Raw)\" as your choice of “Outlier processing method”." - ] + ], + "id": "30bf8dfc" }, { "cell_type": "markdown", - "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a", "metadata": { "papermill": { "duration": 0.000092, @@ -66,12 +65,11 @@ }, "source": [ "## 1. Setup" - ] + ], + "id": "064495be-24e5-4b76-a91f-7ac3d1a27a5a" }, { "cell_type": "code", - "execution_count": null, - "id": "35ede7cf-257f-439c-a514-26a7290f881d", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:21:50.332786Z", @@ -91,15 +89,16 @@ "languageId": "r" } }, - "outputs": [], "source": [ "source(\"~/workspace/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r\")\n", "snt_environment <- get_setup_variables()\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "35ede7cf-257f-439c-a514-26a7290f881d" }, { "cell_type": "markdown", - "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be", "metadata": { "papermill": { "duration": 0.00017, @@ -112,12 +111,11 @@ }, "source": [ "#### 1.1. 
Load and check `config_json` file" - ] + ], + "id": "7dedcc32-c531-498d-90b9-89b0ee9fb9be" }, { "cell_type": "code", - "execution_count": null, - "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:03.351367Z", @@ -137,15 +135,15 @@ "languageId": "r" } }, - "outputs": [], "source": [ "config_json <- load_snt_config(file.path(snt_environment$CONFIG_PATH, \"SNT_config.json\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "5b6d29ea-91f3-4c53-b95e-4b485f88383f" }, { "cell_type": "code", - "execution_count": null, - "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:03.987632Z", @@ -165,7 +163,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "stop_if_dataset_reporting_papermill_params_missing()\n", "\n", @@ -174,11 +171,13 @@ "ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)\n", "REPORTING_RATE_PRODUCT_ID <- config_json$SNT_CONFIG$REPORTING_RATE_PRODUCT_UID\n", "fixed_cols_rr <- c(\"YEAR\", \"MONTH\", \"ADM2_ID\", \"REPORTING_RATE\")\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c26c981c-dadd-48ac-ae35-613b8ba61a82" }, { "cell_type": "markdown", - "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e", "metadata": { "papermill": { "duration": 0.00015, @@ -192,11 +191,11 @@ "source": [ "#### 1.2. Config + Papermill\n", "Validate Papermill inputs, then assign country/admin/product and fixed reporting-rate columns explicitly from `config_json`." - ] + ], + "id": "a7a15634-4623-40f2-8e2d-3fa47203aa6e" }, { "cell_type": "markdown", - "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650", "metadata": { "papermill": { "duration": 0.000144, @@ -209,20 +208,19 @@ }, "source": [ "#### 1.3. 
🔍 Check REPORTING_RATE_PRODUCT_ID is configured" - ] + ], + "id": "8d8b20f5-901b-46c7-a0ef-9850cba6e650" }, { "cell_type": "markdown", - "id": "682a62d5", "metadata": {}, "source": [ - "### 🐍 This probably to be moved to pipeline.py code?" - ] + "### 1.3. Validate reporting-rate product configuration" + ], + "id": "682a62d5" }, { "cell_type": "code", - "execution_count": null, - "id": "7469898d", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:04.047782Z", @@ -242,18 +240,19 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Check if REPORTING_RATE_PRODUCT_ID is configured\n", "if (is.null(REPORTING_RATE_PRODUCT_ID) || length(REPORTING_RATE_PRODUCT_ID) == 0) {\n", " log_msg(\"🚨 Warning: REPORTING_RATE_PRODUCT_ID is not configured properly in 'SNT_config.json'. \n", " This will prevent filtering by reporting dataset, and all values will be retained.\", level = \"warning\" )\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "7469898d" }, { "cell_type": "markdown", - "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b", "metadata": { "papermill": { "duration": 0.000139, @@ -266,11 +265,11 @@ }, "source": [ "## 2. Load Data" - ] + ], + "id": "e44ae2ab-4af7-475a-8cbe-6d669895a18b" }, { "cell_type": "markdown", - "id": "39e2add7-bbc7-4312-9a6f-9886d675f532", "metadata": { "papermill": { "duration": 0.000152, @@ -285,12 +284,11 @@ "### 2.1. Load routine data (DHIS2) \n", "Already formatted routine data, we use this as the master table
    \n", "(only used at the very end before exporting the table)" - ] + ], + "id": "39e2add7-bbc7-4312-9a6f-9886d675f532" }, { "cell_type": "code", - "execution_count": null, - "id": "a1213723-f7e2-4238-9f37-f1795b187232", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:04.554212Z", @@ -310,7 +308,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "dhis2_routine <- load_dataset_file(DATASET_ID, ROUTINE_FILE)\n", "dhis2_routine <- dhis2_routine %>%\n", @@ -319,11 +316,13 @@ " dplyr::distinct()\n", "dim(dhis2_routine)\n", "head(dhis2_routine, 3)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "a1213723-f7e2-4238-9f37-f1795b187232" }, { "cell_type": "markdown", - "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452", "metadata": { "papermill": { "duration": 0.000155, @@ -336,12 +335,11 @@ }, "source": [ "### 2.2. Load Reporting Rate data (DHIS2)" - ] + ], + "id": "dccc8626-7798-4bcd-ae5f-d7502dfdc452" }, { "cell_type": "code", - "execution_count": null, - "id": "0e352c76-f2fb-43ba-b85d-391d808057a8", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:05.780487Z", @@ -361,7 +359,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "formatting_dataset_id <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "reporting_parquet_name <- paste0(COUNTRY_CODE, \"_reporting.parquet\")\n", @@ -370,11 +367,13 @@ "dhis2_reporting <- dhis2_reporting %>%\n", " dplyr::mutate(dplyr::across(c(PERIOD, YEAR, MONTH, VALUE), as.numeric))\n", "head(dhis2_reporting, 3)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "0e352c76-f2fb-43ba-b85d-391d808057a8" }, { "cell_type": "markdown", - "id": "4d5f398b", "metadata": { "papermill": { "duration": 0.000151, @@ -387,11 +386,11 @@ }, "source": [ "## 3. 
Transform reporting data" - ] + ], + "id": "4d5f398b" }, { "cell_type": "markdown", - "id": "adcbee0b", "metadata": { "papermill": { "duration": 0.0001, @@ -407,12 +406,11 @@ "Logic:\n", "* Value(s) (string) for `PRODUCT_UID` defined in the config.json file\n", "* If none provided (**empty** field) skip filtering and **keep everything**" - ] + ], + "id": "adcbee0b" }, { "cell_type": "code", - "execution_count": null, - "id": "795a5e74", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:07.104617Z", @@ -432,7 +430,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 3.1 Filter Reporting Rate data by selected dataset PRODUCT_UID(s)\n", "if (length(REPORTING_RATE_PRODUCT_ID) > 0 && all(REPORTING_RATE_PRODUCT_ID %in% unique(dhis2_reporting$PRODUCT_UID))) {\n", @@ -486,11 +483,13 @@ " .groups = \"drop\"\n", " ) %>%\n", " dplyr::mutate(REPORTING_RATE = ACTUAL_REPORTS / EXPECTED_REPORTS)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "795a5e74" }, { "cell_type": "markdown", - "id": "4237408a", "metadata": { "papermill": { "duration": 0.000133, @@ -503,12 +502,11 @@ }, "source": [ "### 3.2. Pivot wider" - ] + ], + "id": "4237408a" }, { "cell_type": "code", - "execution_count": null, - "id": "5c3b9a65", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:08.413415Z", @@ -528,16 +526,17 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 3.2 Quick check after pivot\n", "dim(dhis2_reporting_wide)\n", "head(dhis2_reporting_wide, 3)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "5c3b9a65" }, { "cell_type": "markdown", - "id": "0f485148", "metadata": { "papermill": { "duration": 0.000186, @@ -551,11 +550,11 @@ "source": [ "### 👯 Handle **duplicated** values (`OU_ID`)\n", "Using multiple datasets relies on the **assumption** that **each dataset is complementary to the other(s)**. Namely, there should be no \"dupliacted\" orgunits that are counted in more than one dataset! 
Else, we would be **double counting**." - ] + ], + "id": "0f485148" }, { "cell_type": "markdown", - "id": "55dececa", "metadata": { "papermill": { "duration": 0.000122, @@ -568,12 +567,11 @@ }, "source": [ "#### Check for duplicated values (`OU_ID`)" - ] + ], + "id": "55dececa" }, { "cell_type": "code", - "execution_count": null, - "id": "d761bd15", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:08.899486Z", @@ -593,16 +591,17 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Count duplicated OU_ID/PERIOD combinations found\n", "cat(glue::glue(\"Duplicated OU_ID-PERIOD rows detected: {nrow(dupl_ou_period)}\"))\n", "head(dupl_ou_period, 5)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "d761bd15" }, { "cell_type": "markdown", - "id": "805ed555", "metadata": { "papermill": { "duration": 0.000139, @@ -623,12 +622,11 @@ "2. For these, keep `max(ACTUAL_REPORTS)` (since `EXPECTED_REPORTS` is always == 1) because: \n", " * if both same value (either both 0 or both 1) => simply deduplicate (`distinct()`)\n", " * if else if different values, meaning that one dataset say 1 and the other 0 => keep 1 (facility _did_ report)" - ] + ], + "id": "805ed555" }, { "cell_type": "code", - "execution_count": null, - "id": "593b013a", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:09.488856Z", @@ -648,7 +646,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Verify deduplication effect at OU_ID/PERIOD level\n", "dupl_after_cleaning <- dhis2_reporting_wide %>%\n", @@ -656,12 +653,13 @@ " dplyr::filter(dplyr::n() > 1) %>%\n", " dplyr::ungroup()\n", "cat(glue::glue(\"Remaining duplicated OU_ID-PERIOD rows after cleaning: {nrow(dupl_after_cleaning)}\"))\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "593b013a" }, { "cell_type": "code", - "execution_count": null, - "id": "c72bd93a", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:13.581200Z", @@ -681,15 +679,16 @@ 
"languageId": "r" } }, - "outputs": [], "source": [ "# Optional inspection of cleaned rows\n", "head(dhis2_reporting_wide, 5)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "c72bd93a" }, { "cell_type": "markdown", - "id": "2f26c614", "metadata": { "papermill": { "duration": 0.000236, @@ -703,12 +702,11 @@ "source": [ "### 3.3. ACTUAL / EXPECTED summaries after cleaning\n", "Niger-specific capping (values > 1 set to 1) is applied above when `COUNTRY_CODE == \"NER\"`. This cell only prints `summary()` for quick QC on all countries." - ] + ], + "id": "2f26c614" }, { "cell_type": "code", - "execution_count": null, - "id": "4118991c", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:18.924306Z", @@ -728,15 +726,16 @@ "languageId": "r" } }, - "outputs": [], "source": [ "summary(dhis2_reporting_wide$ACTUAL_REPORTS)\n", "summary(dhis2_reporting_wide$EXPECTED_REPORTS)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "4118991c" }, { "cell_type": "markdown", - "id": "066319a3", "metadata": { "papermill": { "duration": 0.000172, @@ -749,12 +748,11 @@ }, "source": [ "### 3.4. Aggregate at AMD2 level" - ] + ], + "id": "066319a3" }, { "cell_type": "code", - "execution_count": null, - "id": "e94eeddd", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:19.494212Z", @@ -774,16 +772,17 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 3.4 Aggregate table preview\n", "dim(reporting_rate_results)\n", "head(reporting_rate_results, 5)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "e94eeddd" }, { "cell_type": "markdown", - "id": "eb181891", "metadata": { "papermill": { "duration": 0.000151, @@ -798,12 +797,11 @@ "### 3.5. Calculate REPORTING_RATE\n", "**numerator**: `ACTUAL_REPORTS`
    \n", "**denominator**: `EXPECTED_REPORTS`" - ] + ], + "id": "eb181891" }, { "cell_type": "code", - "execution_count": null, - "id": "e90a1c20", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:19.803233Z", @@ -823,15 +821,16 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# 3.5 Reporting rate range check\n", "summary(reporting_rate_results$REPORTING_RATE)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "e90a1c20" }, { "cell_type": "markdown", - "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99", "metadata": { "papermill": { "duration": 0.000123, @@ -843,15 +842,14 @@ "tags": [] }, "source": [ - "### 3.6. Ensure consistency of table (probably can skip because all data comes from the same source!)\n", + "### 3.6. Ensure table consistency\n", "Left join reporting indicators with DHIS2 routine data.\n", "Make sure we have a consistent reporting rates table matching periods x org units (safety measure only)." - ] + ], + "id": "0556eba8-3d6a-45b1-af02-9bdf7da6fc99" }, { "cell_type": "code", - "execution_count": null, - "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:20.001909Z", @@ -871,7 +869,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "reporting_rate_dataset <- left_join(dhis2_routine, \n", " reporting_rate_results %>% select(all_of(fixed_cols_rr)), \n", @@ -879,11 +876,13 @@ "\n", "print(dim(reporting_rate_dataset))\n", "head(reporting_rate_dataset, 3)" - ] + ], + "execution_count": null, + "outputs": [], + "id": "51e5b97a-e9b9-42d4-b991-0cee4fd5041f" }, { "cell_type": "markdown", - "id": "6b19e88d", "metadata": { "papermill": { "duration": 0.000173, @@ -896,18 +895,16 @@ }, "source": [ "### 3.7. 
Final visual check on REPORTING_RATE values" - ] + ], + "id": "6b19e88d" }, { "cell_type": "code", - "execution_count": null, - "id": "fbfec60f", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Add log message to communicate range of REPORTING_RATE values and warn if any values are outside [0,1]\n", "min_rr <- min(reporting_rate_dataset$REPORTING_RATE, na.rm = TRUE)\n", @@ -919,12 +916,13 @@ " log_msg(glue::glue(\"✅ REPORTING_RATE values are within the expected range [0,1]. \n", " Minimum REPORTING_RATE: {round(min_rr, 4)}, Maximum REPORTING_RATE: {round(max_rr, 4)}\"))\n", "}" - ] + ], + "execution_count": null, + "outputs": [], + "id": "fbfec60f" }, { "cell_type": "code", - "execution_count": null, - "id": "8878192f", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:20.080475Z", @@ -944,7 +942,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "# Simple plot to visualize distribution of REPORTING_RATE\n", "ggplot(reporting_rate_dataset, aes(x=REPORTING_RATE)) +\n", @@ -954,11 +951,13 @@ " title = glue::glue(\"Reporting rate values range from {round(min(reporting_rate_dataset$REPORTING_RATE), 2)} to {round(max(reporting_rate_dataset$REPORTING_RATE), 2)}\")\n", " ) +\n", " theme_minimal()" - ] + ], + "execution_count": null, + "outputs": [], + "id": "8878192f" }, { "cell_type": "markdown", - "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60", "metadata": { "papermill": { "duration": 0.000104, @@ -972,12 +971,11 @@ "source": [ "## 4. 📁 Export to `data/` folder\n", "Export as both .csv and .parquet file formats." 
- ] + ], + "id": "ad181b27-bf7b-4eb5-9200-fda8c2b8eb60" }, { "cell_type": "code", - "execution_count": null, - "id": "9adc033d-18d6-4786-8f96-21337b3e005f", "metadata": { "execution": { "iopub.execute_input": "2025-12-19T10:23:21.467337Z", @@ -997,7 +995,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "output_dir <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"reporting_rate\")\n", "dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)\n", @@ -1010,7 +1007,10 @@ "\n", "# log\n", "log_msg(out_msg)\n" - ] + ], + "execution_count": null, + "outputs": [], + "id": "9adc033d-18d6-4786-8f96-21337b3e005f" } ], "metadata": { @@ -1045,4 +1045,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r index 5b11b62..175e817 100644 --- a/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r +++ b/pipelines/snt_dhis2_reporting_rate_dataset/utils/snt_dhis2_reporting_rate_dataset.r @@ -1,9 +1,8 @@ # Load base utils -# Keep helpers small and reusable; pipeline-specific assignments stay in notebook code. source(file.path("~/workspace/code", "snt_utils.r")) -# JSON reader for this pipeline only (`snt_utils.r` unchanged). +# JSON reader for this pipeline. read_workspace_json_file <- function(json_path, resource_label = "JSON file") { json_path <- as.character(json_path)[[1L]] tryCatch( @@ -27,8 +26,7 @@ read_workspace_json_file <- function(json_path, resource_label = "JSON file") { #' #' @param SNT_ROOT_PATH Character. Root path of the SNT workspace. Default: '~/workspace' #' @param packages Character vector. R packages to install and load. -#' @return List with `paths_to_check` (CONFIG_PATH, UPLOADS_PATH, DATA_PATH) and the -#' same three paths at the top level for backward compatibility (`setup$CONFIG_PATH`, …). 
+#' @return List with `paths_to_check`, `CONFIG_PATH`, `UPLOADS_PATH`, and `DATA_PATH`. #' #' @export get_setup_variables <- function( @@ -82,8 +80,7 @@ stop_if_dataset_reporting_papermill_params_missing <- function() { if (length(missing) > 0) { stop( "[ERROR] Missing pipeline parameters (Papermill): ", - paste(missing, collapse = ", "), - ". Expected only ROUTINE_FILE and DATASET_ID from `snt_dhis2_reporting_rate_dataset`." + paste(missing, collapse = ", ") ) } } diff --git a/snt_dhis2_reporting_rate_dataelement/pipeline.py b/snt_dhis2_reporting_rate_dataelement/pipeline.py index 7d8ea36..b81909f 100644 --- a/snt_dhis2_reporting_rate_dataelement/pipeline.py +++ b/snt_dhis2_reporting_rate_dataelement/pipeline.py @@ -27,6 +27,48 @@ default="imputed", required=True, ) +@parameter( + "activity_indicators", + name="Facility Activity indicators", + help="Define which data elements will be used to determine the activity of a facility." + " A facility is considered 'active' if at least one of these indicators has a non-missing value" + " greater than zero.", + multiple=True, + choices=["CONF", "SUSP", "TEST", "PRES"], + type=str, + default=["CONF", "PRES"], + required=True, +) +@parameter( + "volume_activity_indicators", + name="Volume activity indicators", + help="Define which data elements will be used to determine the volume of activity at a facility." 
+ " Volume of activity is used to calculate WEIGHTED reporting rates.", + multiple=True, + choices=["CONF", "SUSP", "TEST", "PRES"], + type=str, + default=["CONF", "PRES"], + required=True, +) +@parameter( + "dataelement_method_denominator", + name="Denominator method", + help="How to calculate the total nr of facilities expected to report.", + type=str, + choices=["ROUTINE_ACTIVE_FACILITIES", "PYRAMID_OPEN_FACILITIES"], + default="ROUTINE_ACTIVE_FACILITIES", + required=True, +) +@parameter( + "use_weighted_reporting_rates", + name="Use weighted reporting rates", + help="Weighted reporting rates are calculated using the volume of activity. " + "If TRUE, these values will populate the REPORTING_RATE column of the exported data. " + "If FALSE, unweighted reporting rates will be used instead.", + type=bool, + default=False, + required=False, +) @parameter( "run_report_only", name="Run reporting only", @@ -46,6 +88,10 @@ ) def snt_dhis2_reporting_rate_dataelement( routine_data_choice: str, + activity_indicators: str, + volume_activity_indicators: str, + dataelement_method_denominator: str, + use_weighted_reporting_rates: bool, run_report_only: bool, pull_scripts: bool, ): @@ -70,6 +116,7 @@ def snt_dhis2_reporting_rate_dataelement( validate_config(snt_config) country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] + # Build parameters dict and save to JSON in all cases (like other pipelines) routine_file = resolve_routine_filename( country_code=country_code, routine_data_choice=routine_data_choice ) @@ -78,10 +125,13 @@ def snt_dhis2_reporting_rate_dataelement( else: ds_outliers_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] - # Build parameters dict and save to JSON in all cases (like other pipelines) nb_parameters = { "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, + "DATAELEMENT_METHOD_DENOMINATOR": dataelement_method_denominator, + "ACTIVITY_INDICATORS": activity_indicators, + "VOLUME_ACTIVITY_INDICATORS": 
volume_activity_indicators, + "USE_WEIGHTED_REPORTING_RATES": use_weighted_reporting_rates, "DATASET_ID": ds_outliers_id, } parameters_file = save_pipeline_parameters( @@ -123,7 +173,6 @@ def snt_dhis2_reporting_rate_dataelement( else: current_run.log_info("Skipping calculations, running only the reporting.") - # Compatible with snt_lib (snt_utils): do not pass nb_parameters run_report_notebook( nb_file=pipeline_path / "reporting" / "snt_dhis2_reporting_rate_dataelement_report.ipynb", nb_output_path=pipeline_path / "reporting" / "outputs", @@ -139,7 +188,14 @@ def snt_dhis2_reporting_rate_dataelement( def resolve_routine_filename(country_code: str, routine_data_choice: str) -> str: - """Returns the canonical routine filename for a routine data choice.""" + """Return the canonical routine Parquet filename for a routine data choice. + + Returns: + Filename string (e.g. ``{country_code}_routine_outliers_imputed.parquet``). + + Raises: + ValueError: If ``routine_data_choice`` is not one of the supported values. 
+ """ if routine_data_choice == "raw": return f"{country_code}_routine.parquet" diff --git a/snt_dhis2_reporting_rate_dataset/pipeline.py b/snt_dhis2_reporting_rate_dataset/pipeline.py index 8d15ce4..ff440f2 100644 --- a/snt_dhis2_reporting_rate_dataset/pipeline.py +++ b/snt_dhis2_reporting_rate_dataset/pipeline.py @@ -88,6 +88,7 @@ def snt_dhis2_reporting_rate_dataset( return nb_parameters = { + "SNT_ROOT_PATH": root_path.as_posix(), "ROUTINE_FILE": routine_file, "DATASET_ID": ds_outliers_id, } From c52e00347eaab7a4210766b61b0ba3866b9a5bb8 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 28 Apr 2026 12:27:14 +0200 Subject: [PATCH 18/18] merge conlfict --- .../code/snt_dhis2_quality_of_care.ipynb | 288 +++++----- .../snt_dhis2_quality_of_care_report.ipynb | 499 +++++++++--------- .../utils/snt_dhis2_quality_of_care.r | 264 +++++---- .../utils/snt_dhis2_quality_of_care_report.r | 65 ++- 4 files changed, 546 insertions(+), 570 deletions(-) diff --git a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb index 0a13620..3e3d55c 100644 --- a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb @@ -1,156 +1,140 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quality of Care Indicators\n", - "\n", - "Compute district-year quality-of-care indicators from DHIS2 outliers-imputed routine data.\n", - "\n", - "Indicators:\n", - "- testing_rate = TEST / SUSP\n", - "- treatment_rate = MALTREAT / CONF\n", - "- case_fatality_rate = MALDTH / MALADM\n", - "- prop_adm_malaria = MALADM / ALLADM\n", - "- prop_malaria_deaths = MALDTH / ALLDTH\n", - "- non_malaria_all_cause_outpatients = ALLOUT (absolute)\n", - "- presumed_cases = PRES (absolute)\n", - "\n", - "Stock-out indicators are not implemented yet (on hold, NMDR data pending)." 
- ], - "id": "fad6c24e" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Preliminaries\n", - "options(scipen = 999)\n", - "\n", - "ROOT_PATH <- \"~/workspace\"\n", - "source(file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"utils\", \"snt_dhis2_quality_of_care.r\"))\n", - "\n", - "snt_environment <- get_setup_variables(\n", - " SNT_ROOT_PATH = ROOT_PATH,\n", - " packages = c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"dplyr\", \"writexl\", \"knitr\", \"scales\", \"gridExtra\")\n", - ")\n", - "config_json <- load_snt_config(snt_environment$CONFIG_PATH, \"SNT_config.json\")\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", - "\n", - "PIPELINE_PATH <- file.path(snt_environment$PIPELINES_PATH, \"snt_dhis2_quality_of_care\")\n", - "OUTPUT_DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"quality_of_care\")\n", - "REPORT_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", - "FIGURES_PATH <- file.path(REPORT_OUTPUTS_PATH, \"figures\")\n", - "\n", - "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)" - ], - "execution_count": null, - "outputs": [], - "id": "317c4085" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Load and prepare inputs\n", - "if (!exists(\"data_action\")) {\n", - " data_action <- \"imputed\"\n", - "}\n", - "data_action <- validate_quality_of_care_action(data_action)\n", - "\n", - "log_msg(glue::glue(\"Using outliers dataset id: {OUTLIERS_DATASET}\"))\n", - 
"\n", - "routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers_{data_action}.parquet\")\n", - "log_msg(glue::glue(\"Using routine file: {routine_filename}\"))\n", - "\n", - "routine <- load_dataset_file(\n", - " dataset_id = OUTLIERS_DATASET,\n", - " filename = routine_filename\n", - ")\n", - "\n", - "shapes_filename <- glue::glue(\"{COUNTRY_CODE}_shapes.geojson\")\n", - "shapes <- load_dataset_file(\n", - " dataset_id = DHIS2_FORMATTED_DATASET,\n", - " filename = shapes_filename\n", - ")\n", - "\n", - "core_cols <- c(\"ADM2_ID\", \"YEAR\")\n", - "core_missing <- setdiff(core_cols, names(routine))\n", - "if (length(core_missing) > 0) {\n", - " stop(glue::glue(\"[ERROR] Missing core columns: {paste(core_missing, collapse = ', ')}\"))\n", - "}\n", - "\n", - "# Keep indicator list in notebook (pipeline logic), not hardcoded in utils functions.\n", - "indicator_cols <- c(\"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", - "missing_cols <- setdiff(indicator_cols, names(routine))\n", - "if (length(missing_cols) > 0) {\n", - " log_msg(glue::glue(\"[WARNING] Missing indicator columns: {paste(missing_cols, collapse = ', ')}\"), level = \"warning\")\n", - "}\n", - "\n", - "routine <- normalize_qoc_routine_types(routine, indicator_cols = indicator_cols)\n", - "qoc <- aggregate_qoc_district_year(routine, indicator_cols = indicator_cols)\n", - "qoc <- add_quality_of_care_derived_indicators(qoc)\n", - "qoc <- attach_quality_of_care_shapes(qoc, shapes)\n", - "\n", - "save_quality_of_care_outputs(\n", - " qoc_dt = qoc,\n", - " output_data_path = OUTPUT_DATA_PATH,\n", - " country_code = COUNTRY_CODE,\n", - " data_action = data_action\n", - ")" - ], - "execution_count": null, - "outputs": [], - "id": "98b78bf7" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Build yearly maps (saved as PNG)\n", - "save_quality_of_care_maps(\n", - " qoc_dt = 
qoc,\n", - " shapes_sf = shapes,\n", - " figures_path = FIGURES_PATH\n", - ")" - ], - "execution_count": null, - "outputs": [], - "id": "984689b0" + "cells": [ + { + "cell_type": "markdown", + "id": "fad6c24e", + "metadata": {}, + "source": [ + "## Quality of Care Indicators\n", + "\n", + "Compute district-year quality-of-care indicators from DHIS2 routine data produced by outliers pipelines (`imputed` or `removed`).\n", + "\n", + "Indicators:\n", + "- testing_rate = TEST / SUSP\n", + "- treatment_rate = MALTREAT / CONF\n", + "- case_fatality_rate = MALDTH / MALADM\n", + "- prop_adm_malaria = MALADM / ALLADM\n", + "- prop_malaria_deaths = MALDTH / ALLDTH\n", + "- non_malaria_all_cause_outpatients = ALLOUT (absolute)\n", + "- presumed_cases = PRES (absolute)\n", + "\n", + "Stock-out indicators are not implemented yet (on hold, NMDR data pending)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "317c4085", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Preliminaries\n", + "options(scipen = 999)\n", + "\n", + "ROOT_PATH <- \"~/workspace\"\n", + "source(file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"utils\", \"snt_dhis2_quality_of_care.r\"))\n", + "\n", + "snt_environment <- get_setup_variables(SNT_ROOT_PATH = ROOT_PATH, packages = c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"dplyr\", \"knitr\", \"scales\", \"gridExtra\"))\n", + "config_json <- load_snt_config(snt_environment$CONFIG_PATH, \"SNT_config.json\")\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION\n", + "\n", + "PIPELINE_PATH <- file.path(snt_environment$PIPELINES_PATH, \"snt_dhis2_quality_of_care\")\n", + "OUTPUT_DATA_PATH <- file.path(snt_environment$DATA_PATH, 
\"dhis2\", \"quality_of_care\")\n", + "REPORT_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", + "FIGURES_PATH <- file.path(REPORT_OUTPUTS_PATH, \"figures\")\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98b78bf7", + "metadata": { + "vscode": { + "languageId": "r" } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + }, + "outputs": [], + "source": [ + "if (!exists(\"data_action\")) data_action <- \"imputed\"\n", + "data_action <- validate_quality_of_care_action(data_action)\n", + "\n", + "indicator_cols <- c(\"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", + "\n", + "routine <- load_dataset_file(\n", + " dataset_id = OUTLIERS_DATASET,\n", + " filename = glue::glue(\"{COUNTRY_CODE}_routine_outliers_{data_action}.parquet\")\n", + ")\n", + "shapes <- load_dataset_file(\n", + " dataset_id = DHIS2_FORMATTED_DATASET,\n", + " filename = glue::glue(\"{COUNTRY_CODE}_shapes.geojson\")\n", + ")\n", + "\n", + "routine <- normalize_qoc_routine_types(routine, indicator_cols = indicator_cols)\n", + "qoc <- aggregate_qoc_district_year(routine, indicator_cols = indicator_cols)\n", + "\n", + "# Derived indicators — edit here to add / remove / modify\n", + "if (\"TEST\" %in% names(qoc) && \"SUSP\" %in% names(qoc)) qoc[, testing_rate := fifelse(SUSP > 0, TEST / SUSP, NA_real_)]\n", + "if (\"MALTREAT\" %in% names(qoc) && \"CONF\" %in% names(qoc)) qoc[, treatment_rate := fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]\n", + "if 
(\"MALDTH\" %in% names(qoc) && \"MALADM\" %in% names(qoc)) qoc[, case_fatality_rate := fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)]\n", + "if (\"MALADM\" %in% names(qoc) && \"ALLADM\" %in% names(qoc)) qoc[, prop_adm_malaria := fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]\n", + "if (\"MALDTH\" %in% names(qoc) && \"ALLDTH\" %in% names(qoc)) qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", + "if (\"ALLOUT\" %in% names(qoc)) qoc[, non_malaria_all_cause_outpatients := ALLOUT]\n", + "if (\"PRES\" %in% names(qoc)) qoc[, presumed_cases := PRES]\n", + "\n", + "qoc <- attach_quality_of_care_shapes(qoc, shapes)\n", + "\n", + "save_quality_of_care_outputs(\n", + " qoc_dt = qoc,\n", + " output_data_path = OUTPUT_DATA_PATH,\n", + " country_code = COUNTRY_CODE,\n", + " data_action = data_action\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "984689b0", + "metadata": { + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "# Build yearly maps (saved as PNG)\n", + "save_quality_of_care_maps(\n", + " qoc_dt = qoc,\n", + " shapes_sf = shapes,\n", + " figures_path = FIGURES_PATH\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb index 5a96aa6..696fa7b 100644 --- a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb @@ -1,252 +1,255 @@ { - 
"cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quality of Care Report\n", - "\n", - "This report displays a compact year-level summary of quality-of-care indicators and points to generated map outputs." - ], - "id": "7d246ae9" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "ROOT_PATH <- \"~/workspace\"\n", - "source(file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"utils\", \"snt_dhis2_quality_of_care_report.r\"))\n", - "\n", - "snt_environment <- get_setup_variables(\n", - " SNT_ROOT_PATH = ROOT_PATH,\n", - " packages = c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\", \"writexl\", \"ggplot2\", \"scales\", \"gridExtra\", \"sf\")\n", - ")\n", - "config_json <- load_snt_config(snt_environment$CONFIG_PATH, \"SNT_config.json\")\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "PIPELINE_PATH <- file.path(snt_environment$PIPELINES_PATH, \"snt_dhis2_quality_of_care\")\n", - "DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"quality_of_care\")\n", - "REPORT_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", - "FIGURES_PATH <- file.path(REPORT_OUTPUTS_PATH, \"figures\")\n", - "dir.create(DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)" - ], - "execution_count": null, - "outputs": [], - "id": "5eaa5bab" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Load latest district-year output and build summary\n", - "qoc_ctx <- load_latest_quality_of_care_output(DATA_PATH, COUNTRY_CODE)\n", - "qoc <- qoc_ctx$qoc\n", - "latest_file <- qoc_ctx$latest_file\n", - "\n", - 
"summary_tbl <- build_quality_of_care_summary(qoc)\n", - "summary_paths <- save_quality_of_care_summary_outputs(\n", - " summary_tbl = summary_tbl,\n", - " report_outputs_path = REPORT_OUTPUTS_PATH,\n", - " country_code = COUNTRY_CODE\n", - ")\n", - "\n", - "knitr::kable(summary_tbl, caption = \"Quality of Care - Year-level summary\")\n", - "\n", - "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", - "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))\n", - "cat(glue::glue(\"Summary data saved to: {summary_paths$summary_parquet}, {summary_paths$summary_csv}, {summary_paths$summary_xlsx}\\n\"))" - ], - "execution_count": null, - "outputs": [], - "id": "1a8320f8" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Graphs by Year" - ], - "id": "3dc318ac" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Build and save year-level indicator charts\n", - "charts_file <- save_quality_of_care_summary_charts(\n", - " summary_tbl = summary_tbl,\n", - " figures_path = FIGURES_PATH,\n", - " country_code = COUNTRY_CODE\n", - ")\n", - "\n", - "if (!is.null(charts_file)) {\n", - " cat(glue::glue(\"Combined charts saved: {charts_file}\\n\"))\n", - "} else {\n", - " cat(\"No chart produced (no indicator columns available).\\n\")\n", - "}" - ], - "execution_count": null, - "outputs": [], - "id": "0e86bb0a" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Maps by District and Year\n", - "\n", - "Maps are generated directly from the quality-of-care data and district shapes." 
- ], - "id": "3b625d36" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [ - "# Load shapes and regenerate yearly maps through shared utils\n", - "shapes_filename <- glue::glue(\"{COUNTRY_CODE}_shapes.geojson\")\n", - "shapes <- load_dataset_file(\n", - " dataset_id = DHIS2_FORMATTED_DATASET,\n", - " filename = shapes_filename\n", - ")\n", - "\n", - "save_quality_of_care_maps(\n", - " qoc_dt = qoc,\n", - " shapes_sf = shapes,\n", - " figures_path = FIGURES_PATH\n", - ")\n", - "\n", - "cat(glue::glue(\"Yearly maps saved in: {FIGURES_PATH}\\n\"))" - ], - "execution_count": null, - "outputs": [], - "id": "6056a979" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [], - "execution_count": null, - "outputs": [], - "id": "5b31e4c8" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [], - "id": "8229c37e" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [], - "execution_count": null, - "outputs": [], - "id": "07324c1c" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [], - "id": "7c084da7" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [], - "execution_count": null, - "outputs": [], - "id": "c9f52975" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [], - "execution_count": null, - "outputs": [], - "id": "006866ce" - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [], - "id": "f7225165" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [], - "execution_count": null, - "outputs": [], - "id": "420ed27f" - }, - { - "cell_type": "code", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "source": [], - "execution_count": null, - "outputs": [], - "id": "67ddb838" + "cells": [ + { + "cell_type": 
"markdown", + "id": "7d246ae9", + "metadata": {}, + "source": [ + "## Quality of Care Report\n", + "\n", + "This report displays a compact year-level summary of quality-of-care indicators and points to generated map outputs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eaa5bab", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ROOT_PATH <- \"~/workspace\"\n", + "source(file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"utils\", \"snt_dhis2_quality_of_care_report.r\"))\n", + "\n", + "snt_environment <- get_setup_variables(SNT_ROOT_PATH = ROOT_PATH, packages = c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"dplyr\", \"knitr\", \"scales\", \"gridExtra\", \"IRdisplay\"))\n", + "config_json <- load_snt_config(snt_environment$CONFIG_PATH, \"SNT_config.json\")\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "PIPELINE_PATH <- file.path(snt_environment$PIPELINES_PATH, \"snt_dhis2_quality_of_care\")\n", + "OUTPUT_DATA_PATH <- file.path(snt_environment$DATA_PATH, \"dhis2\", \"quality_of_care\")\n", + "REPORT_OUTPUTS_PATH <- file.path(PIPELINE_PATH, \"reporting\", \"outputs\")\n", + "FIGURES_PATH <- file.path(REPORT_OUTPUTS_PATH, \"figures\")\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a8320f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load latest district-year output and build summary\n", + "qoc_ctx <- load_latest_quality_of_care_output(OUTPUT_DATA_PATH, COUNTRY_CODE)\n", + "qoc <- qoc_ctx$qoc\n", + "latest_file 
<- qoc_ctx$latest_file\n", + "\n", + "summary_tbl <- build_quality_of_care_summary(qoc)\n", + "summary_paths <- save_quality_of_care_summary_outputs(\n", + " summary_tbl = summary_tbl,\n", + " report_outputs_path = REPORT_OUTPUTS_PATH,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "\n", + "knitr::kable(summary_tbl, caption = \"Quality of Care - Year-level summary\")\n", + "\n", + "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", + "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))\n", + "cat(glue::glue(\"Summary data saved to: {summary_paths$summary_parquet}, {summary_paths$summary_csv}\\n\"))" + ] + }, + { + "cell_type": "markdown", + "id": "3dc318ac", + "metadata": {}, + "source": [ + "## Graphs by Year" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e86bb0a", + "metadata": { + "vscode": { + "languageId": "r" } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + }, + "outputs": [], + "source": [ + "# Build, save, and display year-level indicator chart\n", + "charts_file <- save_quality_of_care_summary_charts(\n", + " summary_tbl = summary_tbl,\n", + " figures_path = FIGURES_PATH,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "\n", + "IRdisplay::display_png(file = normalizePath(path.expand(charts_file)))" + ] + }, + { + "cell_type": "markdown", + "id": "3b625d36", + "metadata": {}, + "source": [ + "## Maps by District and Year\n", + "\n", + "Maps are generated directly from the quality-of-care data and district shapes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6056a979", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load shapes, regenerate yearly maps, and display them\n", + "shapes_filename <- glue::glue(\"{COUNTRY_CODE}_shapes.geojson\")\n", + "shapes <- load_dataset_file(\n", + " dataset_id = DHIS2_FORMATTED_DATASET,\n", + " filename = shapes_filename\n", + ")\n", + "\n", + "save_quality_of_care_maps(\n", + " qoc_dt = qoc,\n", + " shapes_sf = shapes,\n", + " figures_path = FIGURES_PATH\n", + ")\n", + "\n", + "years <- sort(unique(qoc$YEAR))\n", + "years_regex <- paste(years, collapse = \"|\")\n", + "map_files <- list.files(\n", + " FIGURES_PATH,\n", + " pattern = glue::glue(\"^(testing_rate|treatment_rate|case_fatality_rate|prop_adm_malaria|prop_malaria_deaths|allout|presumed_cases)_({years_regex})[.]png$\"),\n", + " full.names = TRUE\n", + ")\n", + "map_files <- sort(map_files)\n", + "\n", + "for (map_file in map_files) {\n", + " IRdisplay::display_png(file = normalizePath(path.expand(map_file)))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b31e4c8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "8229c37e", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07324c1c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "7c084da7", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9f52975", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "006866ce", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": 
"f7225165", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "420ed27f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67ddb838", + "metadata": { + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r index 639f62b..4d40380 100644 --- a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r +++ b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r @@ -1,70 +1,43 @@ -# Quality of Care shared R helpers. -# -# This file is the module used by the QoC *pipeline code* notebook: -# - bootstrap paths + install/load packages -# - config + dataset loading -# - map export utility used by both the pipeline and reporting notebooks -# - compute helpers for district-year outputs -# -# Reporting-only helpers live in: -# - snt_dhis2_quality_of_care_report.r -# -# Pipeline code notebook sources this file only (not a separate *_code.r). - # Load shared SNT helpers. source(file.path("~/workspace", "code", "snt_utils.r")) -#' Bootstrap context for Quality of Care notebooks. -#' -#' Returns base workspace paths only. +#' Load packages, OpenHEXA, and return base workspace paths (one list, four names). +#' @param SNT_ROOT_PATH Workspace root. Default `~/workspace`. +#' @param packages R packages to install/load. 
+#' @return Named list: `CONFIG_PATH`, `UPLOADS_PATH`, `DATA_PATH`, `PIPELINES_PATH`. get_setup_variables <- function( SNT_ROOT_PATH = "~/workspace", packages = c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate") ) { - install_and_load(packages) - - Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") - reticulate::py_config()$python - assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv) - - paths_to_check <- list( - CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"), - UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"), - DATA_PATH = file.path(SNT_ROOT_PATH, "data"), + base_paths <- list( + CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"), + UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"), + DATA_PATH = file.path(SNT_ROOT_PATH, "data"), PIPELINES_PATH = file.path(SNT_ROOT_PATH, "pipelines") ) - for (target_path in paths_to_check) { - dir.create(target_path, recursive = TRUE, showWarnings = FALSE) + for (p in base_paths) { + if (!dir.exists(p)) { + dir.create(p, recursive = TRUE, showWarnings = FALSE) + } } - setup_variable <- c( - list(paths_to_check = paths_to_check), - paths_to_check - ) - - return(setup_variable) -} + install_and_load(packages) + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + reticulate::py_config()$python + assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv) -#' Load SNT configuration file. -load_snt_config <- function(config_path, config_file_name = "SNT_config.json") { - config_file <- file.path(config_path, config_file_name) - config_json <- tryCatch( - { - jsonlite::fromJSON(config_file) - }, - error = function(e) { - stop(glue::glue("[ERROR] Error while loading configuration from `{config_file}`: {conditionMessage(e)}")) - } - ) - log_msg(paste0("SNT configuration loaded from: ", config_file)) - return(config_json) + return(base_paths) } - #' Load dataset file from OpenHEXA. +#' +#' @param dataset_id Character. 
OpenHEXA dataset identifier. +#' @param filename Character. Name of file to load. +#' @param verbose Logical. If TRUE, log dataframe dimensions after a successful load. +#' @return Dataframe containing the loaded data. load_dataset_file <- function(dataset_id, filename, verbose = TRUE) { if (!exists("openhexa", inherits = TRUE) || is.null(get("openhexa", inherits = TRUE))) { stop("[ERROR] OpenHEXA SDK is not available. Run `get_setup_variables()` before loading dataset files.") @@ -88,10 +61,96 @@ load_dataset_file <- function(dataset_id, filename, verbose = TRUE) { return(data) } +#' Validate quality-of-care action parameter. +#' +#' @param data_action Action string expected to be `imputed` or `removed`. +#' @return Validated action string. +validate_quality_of_care_action <- function(data_action) { + if (is.null(data_action) || !nzchar(data_action)) { + return("imputed") + } + allowed_actions <- c("imputed", "removed") + if (!(data_action %in% allowed_actions)) { + stop(glue::glue("[ERROR] Invalid data_action `{data_action}`. Allowed: {paste(allowed_actions, collapse = ', ')}")) + } + data_action +} + +#' Compute district-year Quality of Care indicators. +#' +#' @param routine Routine dataframe loaded from outliers dataset. +#' @param indicator_cols Character vector of routine indicator column names to coerce to numeric +#' (define in the notebook or config, not hardcoded here). +#' @return Data table with district-year indicators. +normalize_qoc_routine_types <- function(routine, indicator_cols) { + data.table::setDT(routine) + available_cols <- intersect(indicator_cols, names(routine)) + + for (col in available_cols) { + col_vals <- as.character(routine[[col]]) + col_vals[is.na(col_vals) | col_vals == "" | col_vals == "-"] <- NA_character_ + routine[, (col) := as.numeric(col_vals)] + } + + routine[, YEAR := as.integer(YEAR)] + routine[, ADM2_ID := as.character(ADM2_ID)] + routine +} + +#' Aggregate QoC routine indicators by district and year. 
+#' +#' @param routine Routine data table with normalized types. +#' @param indicator_cols Character vector of column names to sum (must match the vector used +#' in [normalize_qoc_routine_types()]). +#' @return Aggregated district-year data table. +aggregate_qoc_district_year <- function(routine, indicator_cols) { + available_cols <- intersect(indicator_cols, names(routine)) + + if (length(available_cols) > 0) { + routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), .SDcols = available_cols, by = .(ADM2_ID, YEAR)] + } else { + unique(routine[, .(ADM2_ID, YEAR)]) + } +} + +#' Merge ADM2 labels into Quality of Care outputs. +#' +#' @param qoc_dt Quality-of-care data table. +#' @param shapes_sf Shapes sf table. +#' @return Data table with optional ADM2_NAME. +attach_quality_of_care_shapes <- function(qoc_dt, shapes_sf) { + shapes_dt <- data.table::as.data.table(sf::st_drop_geometry(shapes_sf)) + if ("ADM2_ID" %in% names(shapes_dt) && "ADM2_NAME" %in% names(shapes_dt)) { + shapes_dt[, ADM2_ID := as.character(ADM2_ID)] + qoc_dt <- merge(qoc_dt, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = "ADM2_ID", all.x = TRUE) + } + qoc_dt +} + +#' Save district-year Quality of Care outputs. +#' +#' @param qoc_dt Computed quality-of-care data table. +#' @param output_data_path Output directory path. +#' @param country_code Country code. +#' @param data_action Action suffix for output naming. +#' @return Named list with `parquet` and `csv` output file paths. 
+save_quality_of_care_outputs <- function(qoc_dt, output_data_path, country_code, data_action) { + out_district_parquet <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.parquet")) + out_district_csv <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.csv")) + + arrow::write_parquet(qoc_dt, out_district_parquet) + data.table::fwrite(qoc_dt, out_district_csv) + log_msg(glue::glue("Saved outputs: {out_district_parquet}, {out_district_csv}")) + + list(parquet = out_district_parquet, csv = out_district_csv) +} #' Generate and save yearly district maps for QoC indicators. #' -#' This is used by both the pipeline notebook and the reporting notebook. +#' @param qoc_dt Quality-of-care data table. +#' @param shapes_sf District shapes sf. +#' @param figures_path Folder where PNG maps are written. +#' @return Invisibly returns `TRUE`. save_quality_of_care_maps <- function(qoc_dt, shapes_sf, figures_path) { shapes_sf$ADM2_ID <- as.character(shapes_sf$ADM2_ID) qoc_dt$ADM2_ID <- as.character(qoc_dt$ADM2_ID) @@ -105,14 +164,14 @@ save_quality_of_care_maps <- function(qoc_dt, shapes_sf, figures_path) { tryCatch( { df_y <- df[YEAR == yr] - if (nrow(df_y) == 0) return(invisible(NULL)) + if (nrow(df_y) == 0) next df_y$ADM2_ID <- as.character(df_y$ADM2_ID) map_df <- dplyr::left_join(sf_shapes_local, df_y, by = "ADM2_ID") - if (!(value_col %in% names(map_df))) return(invisible(NULL)) + if (!(value_col %in% names(map_df))) next vals <- map_df[[value_col]] finite_vals <- vals[is.finite(vals) & !is.na(vals)] - if (length(finite_vals) == 0) return(invisible(NULL)) + if (length(finite_vals) == 0) next if (is_rate) { cat_vals <- cut(vals, breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf), labels = c("<0", "0-0.2", "0.2-0.4", "0.4-0.6", "0.6-0.8", "0.8-1.0", ">1.0"), include.lowest = TRUE) @@ -150,100 +209,15 @@ save_quality_of_care_maps <- function(qoc_dt, shapes_sf, figures_path) { } } - if 
("testing_rate" %in% names(qoc_dt)) plot_yearly_map(qoc_dt, shapes_sf, "testing_rate", "Testing rate (TEST / SUSP)", "testing_rate", TRUE) - if ("treatment_rate" %in% names(qoc_dt)) plot_yearly_map(qoc_dt, shapes_sf, "treatment_rate", "Treatment rate (MALTREAT / CONF)", "treatment_rate", TRUE) - if ("case_fatality_rate" %in% names(qoc_dt)) plot_yearly_map(qoc_dt, shapes_sf, "case_fatality_rate", "In-hospital case fatality rate (MALDTH / MALADM)", "case_fatality_rate", TRUE) - if ("prop_adm_malaria" %in% names(qoc_dt)) plot_yearly_map(qoc_dt, shapes_sf, "prop_adm_malaria", "Proportion admitted for malaria (MALADM / ALLADM)", "prop_adm_malaria", TRUE) - if ("prop_malaria_deaths" %in% names(qoc_dt)) plot_yearly_map(qoc_dt, shapes_sf, "prop_malaria_deaths", "Proportion of malaria deaths (MALDTH / ALLDTH)", "prop_malaria_deaths", TRUE) - if ("non_malaria_all_cause_outpatients" %in% names(qoc_dt)) plot_yearly_map(qoc_dt, shapes_sf, "non_malaria_all_cause_outpatients", "Non-malaria all-cause outpatients (ALLOUT)", "allout", FALSE) - if ("presumed_cases" %in% names(qoc_dt)) plot_yearly_map(qoc_dt, shapes_sf, "presumed_cases", "Presumed cases (PRES)", "presumed_cases", FALSE) + plot_yearly_map(qoc_dt, shapes_sf, "testing_rate","Testing rate (TEST / SUSP)","testing_rate",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "treatment_rate","Treatment rate (MALTREAT / CONF)","treatment_rate",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "case_fatality_rate","In-hospital case fatality rate (MALDTH / MALADM)","case_fatality_rate",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "prop_adm_malaria","Proportion admitted for malaria (MALADM / ALLADM)","prop_adm_malaria",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "prop_malaria_deaths","Proportion of malaria deaths (MALDTH / ALLDTH)","prop_malaria_deaths",TRUE) + plot_yearly_map(qoc_dt, shapes_sf, "non_malaria_all_cause_outpatients","Non-malaria all-cause outpatients (ALLOUT)","allout",FALSE) + plot_yearly_map(qoc_dt, shapes_sf, 
"presumed_cases","Presumed cases (PRES)","presumed_cases",FALSE) log_msg(glue::glue("Saved yearly maps in: {figures_path}")) invisible(TRUE) } - -#' Validate quality-of-care action parameter. -validate_quality_of_care_action <- function(data_action) { - if (is.null(data_action) || !nzchar(data_action)) { - return("imputed") - } - allowed_actions <- c("imputed", "removed") - if (!(data_action %in% allowed_actions)) { - stop(glue::glue("[ERROR] Invalid data_action `{data_action}`. Allowed: {paste(allowed_actions, collapse = ', ')}")) - } - data_action -} - - -#' Coerce `indicator_cols` to numeric; YEAR and ADM2_ID types if those columns exist. -normalize_qoc_routine_types <- function(routine, indicator_cols) { - data.table::setDT(routine) - indicator_cols <- as.character(indicator_cols) - available_cols <- intersect(indicator_cols, names(routine)) - - for (col in available_cols) { - col_vals <- as.character(routine[[col]]) - col_vals[is.na(col_vals) | col_vals == "" | col_vals == "-"] <- NA_character_ - routine[, (col) := suppressWarnings(as.numeric(col_vals))] - } - - if ("YEAR" %in% names(routine)) routine[, YEAR := as.integer(YEAR)] - if ("ADM2_ID" %in% names(routine)) routine[, ADM2_ID := as.character(ADM2_ID)] - routine -} - - -#' Sum `indicator_cols` by `group_cols` (default ADM2_ID, YEAR). -aggregate_qoc_district_year <- function(routine, indicator_cols, group_cols = c("ADM2_ID", "YEAR")) { - group_cols <- as.character(group_cols) - indicator_cols <- as.character(indicator_cols) - available_cols <- intersect(indicator_cols, names(routine)) - - if (length(available_cols) > 0) { - routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), .SDcols = available_cols, by = group_cols] - } else { - unique(routine[, ..group_cols]) - } -} - - -#' Add derived quality-of-care indicators to aggregated district-year data. 
-add_quality_of_care_derived_indicators <- function(qoc) { - if ("TEST" %in% names(qoc) && "SUSP" %in% names(qoc)) qoc[, testing_rate := data.table::fifelse(SUSP > 0, TEST / SUSP, NA_real_)] - if ("MALTREAT" %in% names(qoc) && "CONF" %in% names(qoc)) qoc[, treatment_rate := data.table::fifelse(CONF > 0, MALTREAT / CONF, NA_real_)] - if ("MALDTH" %in% names(qoc) && "MALADM" %in% names(qoc)) qoc[, case_fatality_rate := data.table::fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)] - if ("MALADM" %in% names(qoc) && "ALLADM" %in% names(qoc)) qoc[, prop_adm_malaria := data.table::fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)] - if ("MALDTH" %in% names(qoc) && "ALLDTH" %in% names(qoc)) { - qoc[, prop_malaria_deaths := data.table::fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)] - qoc[, prop_deaths_malaria := prop_malaria_deaths] - } - if ("ALLOUT" %in% names(qoc)) qoc[, non_malaria_all_cause_outpatients := ALLOUT] - if ("PRES" %in% names(qoc)) qoc[, presumed_cases := PRES] - - qoc -} - - -#' Merge ADM2 labels into Quality of Care outputs. -attach_quality_of_care_shapes <- function(qoc_dt, shapes_sf) { - shapes_dt <- data.table::as.data.table(sf::st_drop_geometry(shapes_sf)) - if ("ADM2_ID" %in% names(shapes_dt) && "ADM2_NAME" %in% names(shapes_dt)) { - shapes_dt[, ADM2_ID := as.character(ADM2_ID)] - qoc_dt <- merge(qoc_dt, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = "ADM2_ID", all.x = TRUE) - } - qoc_dt -} - - -#' Save district-year Quality of Care outputs. 
-save_quality_of_care_outputs <- function(qoc_dt, output_data_path, country_code, data_action) { - out_district_parquet <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.parquet")) - out_district_csv <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.csv")) - - arrow::write_parquet(qoc_dt, out_district_parquet) - data.table::fwrite(qoc_dt, out_district_csv) - log_msg(glue::glue("Saved outputs: {out_district_parquet}, {out_district_csv}")) - - list(parquet = out_district_parquet, csv = out_district_csv) -} diff --git a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r index c8dd0e2..dfb20ec 100644 --- a/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r +++ b/pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care_report.r @@ -1,19 +1,18 @@ -# Quality of Care **reporting** helpers (used by the reporting notebook). - -source( - file.path( - "~/workspace", - "pipelines", - "snt_dhis2_quality_of_care", - "utils", - "snt_dhis2_quality_of_care.r" - ) -) +# Load pipeline helpers (common + code-specific functions). +source(file.path("~/workspace", "pipelines", "snt_dhis2_quality_of_care", "utils", "snt_dhis2_quality_of_care.r")) #' Load latest Quality of Care district-year output. +#' +#' @param output_data_path Path to quality-of-care data outputs. +#' @param country_code Country code. +#' @return Named list with `qoc` (data table) and `latest_file` (path). 
load_latest_quality_of_care_output <- function(output_data_path, country_code) { - files <- list.files(output_data_path, pattern = paste0("^", country_code, "_quality_of_care_district_year_(imputed|removed)\\.parquet$"), full.names = TRUE) + files <- list.files( + output_data_path, + pattern = paste0("^", country_code, "_quality_of_care_district_year_(imputed|removed)\\.parquet$"), + full.names = TRUE + ) if (length(files) == 0) { stop(glue::glue("[ERROR] No quality_of_care parquet found in {output_data_path}")) } @@ -24,37 +23,53 @@ load_latest_quality_of_care_output <- function(output_data_path, country_code) { #' Build year-level Quality of Care summary table. +#' +#' @param qoc_dt Quality-of-care district-year data table. +#' @return Year-level summary table ordered by YEAR. build_quality_of_care_summary <- function(qoc_dt) { + mean_cols <- c("testing_rate", "treatment_rate", "case_fatality_rate", "prop_adm_malaria", "prop_malaria_deaths") + sum_cols <- c("non_malaria_all_cause_outpatients", "presumed_cases") + summary_tbl <- unique(qoc_dt[, .(YEAR)]) - if ("testing_rate" %in% names(qoc_dt)) summary_tbl <- merge(summary_tbl, qoc_dt[, .(testing_rate = mean(testing_rate, na.rm = TRUE)), by = .(YEAR)], by = "YEAR", all.x = TRUE) - if ("treatment_rate" %in% names(qoc_dt)) summary_tbl <- merge(summary_tbl, qoc_dt[, .(treatment_rate = mean(treatment_rate, na.rm = TRUE)), by = .(YEAR)], by = "YEAR", all.x = TRUE) - if ("case_fatality_rate" %in% names(qoc_dt)) summary_tbl <- merge(summary_tbl, qoc_dt[, .(case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE)), by = .(YEAR)], by = "YEAR", all.x = TRUE) - if ("prop_adm_malaria" %in% names(qoc_dt)) summary_tbl <- merge(summary_tbl, qoc_dt[, .(prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE)), by = .(YEAR)], by = "YEAR", all.x = TRUE) - if ("prop_malaria_deaths" %in% names(qoc_dt)) summary_tbl <- merge(summary_tbl, qoc_dt[, .(prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE)), by = .(YEAR)], by = 
"YEAR", all.x = TRUE) - if ("non_malaria_all_cause_outpatients" %in% names(qoc_dt)) summary_tbl <- merge(summary_tbl, qoc_dt[, .(non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE)), by = .(YEAR)], by = "YEAR", all.x = TRUE) - if ("presumed_cases" %in% names(qoc_dt)) summary_tbl <- merge(summary_tbl, qoc_dt[, .(presumed_cases = sum(presumed_cases, na.rm = TRUE)), by = .(YEAR)], by = "YEAR", all.x = TRUE) + for (col in intersect(mean_cols, names(qoc_dt))) { + agg <- qoc_dt[, setNames(list(mean(get(col), na.rm = TRUE)), col), by = .(YEAR)] + summary_tbl <- merge(summary_tbl, agg, by = "YEAR", all.x = TRUE) + } + + for (col in intersect(sum_cols, names(qoc_dt))) { + agg <- qoc_dt[, setNames(list(sum(get(col), na.rm = TRUE)), col), by = .(YEAR)] + summary_tbl <- merge(summary_tbl, agg, by = "YEAR", all.x = TRUE) + } summary_tbl[order(YEAR)] } -#' Save year-level summary outputs for report consumption. +#' Save year-level summary outputs (parquet and csv only; no Excel — avoids extra deps). +#' +#' @param summary_tbl Summary table. +#' @param report_outputs_path Reporting outputs folder. +#' @param country_code Country code. +#' @return Named list with `summary_parquet` and `summary_csv` paths. 
save_quality_of_care_summary_outputs <- function(summary_tbl, report_outputs_path, country_code) { summary_parquet <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.parquet")) - summary_csv <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.csv")) - summary_xlsx <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.xlsx")) + summary_csv <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.csv")) arrow::write_parquet(summary_tbl, summary_parquet) data.table::fwrite(summary_tbl, summary_csv) - writexl::write_xlsx(list(summary = as.data.frame(summary_tbl)), summary_xlsx) - log_msg(glue::glue("Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}")) - list(summary_parquet = summary_parquet, summary_csv = summary_csv, summary_xlsx = summary_xlsx) + log_msg(glue::glue("Summary data saved to: {summary_parquet}, {summary_csv}")) + list(summary_parquet = summary_parquet, summary_csv = summary_csv) } #' Build and save year-level bar chart panel for QoC indicators. +#' +#' @param summary_tbl Year-level summary table. +#' @param figures_path Folder where the combined chart is saved. +#' @param country_code Country code used in output file name. +#' @return Path to saved chart, or NULL if no indicator columns are available. save_quality_of_care_summary_charts <- function(summary_tbl, figures_path, country_code) { plot_data <- data.table::copy(summary_tbl) if (nrow(plot_data) == 0) return(NULL)