Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
392 changes: 51 additions & 341 deletions pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

223 changes: 223 additions & 0 deletions pipelines/snt_dhis2_quality_of_care/utils/snt_dhis2_quality_of_care.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
# Load shared SNT helpers.
source(file.path("~/workspace", "code", "snt_utils.r"))


#' Load packages, OpenHEXA, and return base workspace paths (one list, four names).
#' @param SNT_ROOT_PATH Workspace root. Default `~/workspace`.
#' @param packages R packages to install/load.
#' @return Named list: `CONFIG_PATH`, `UPLOADS_PATH`, `DATA_PATH`, `PIPELINES_PATH`.
get_setup_variables <- function(
SNT_ROOT_PATH = "~/workspace",
packages = c("arrow", "dplyr", "tidyr", "stringr", "stringi", "jsonlite", "httr", "glue", "reticulate")
) {
base_paths <- list(
CONFIG_PATH = file.path(SNT_ROOT_PATH, "configuration"),
UPLOADS_PATH = file.path(SNT_ROOT_PATH, "uploads"),
DATA_PATH = file.path(SNT_ROOT_PATH, "data"),
PIPELINES_PATH = file.path(SNT_ROOT_PATH, "pipelines")
)

for (p in base_paths) {
if (!dir.exists(p)) {
dir.create(p, recursive = TRUE, showWarnings = FALSE)
}
}

install_and_load(packages)

Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python
assign("openhexa", reticulate::import("openhexa.sdk"), envir = .GlobalEnv)

return(base_paths)
}

#' Load dataset file from OpenHEXA.
#'
#' @param dataset_id Character. OpenHEXA dataset identifier.
#' @param filename Character. Name of file to load.
#' @param verbose Logical. If TRUE, log dataframe dimensions after a successful load.
#' @return Dataframe containing the loaded data.
load_dataset_file <- function(dataset_id, filename, verbose = TRUE) {
if (!exists("openhexa", inherits = TRUE) || is.null(get("openhexa", inherits = TRUE))) {
stop("[ERROR] OpenHEXA SDK is not available. Run `get_setup_variables()` before loading dataset files.")
}

data <- tryCatch(
{
get_latest_dataset_file_in_memory(dataset_id, filename)
},
error = function(e) {
stop(glue::glue("[ERROR] Error while loading {filename} file from dataset: {dataset_id}"))
}
)

if (verbose) {
log_msg(glue::glue(
"{filename} data loaded from dataset : {dataset_id} dataframe dimensions: [{paste(dim(data), collapse = ', ')}]"
))
}

return(data)
}

#' Validate quality-of-care action parameter.
#'
#' @param data_action Action string expected to be `imputed` or `removed`.
#' @return Validated action string.
validate_quality_of_care_action <- function(data_action) {
if (is.null(data_action) || !nzchar(data_action)) {
return("imputed")
}
allowed_actions <- c("imputed", "removed")
if (!(data_action %in% allowed_actions)) {
stop(glue::glue("[ERROR] Invalid data_action `{data_action}`. Allowed: {paste(allowed_actions, collapse = ', ')}"))
}
data_action
}

#' Compute district-year Quality of Care indicators.
#'
#' @param routine Routine dataframe loaded from outliers dataset.
#' @param indicator_cols Character vector of routine indicator column names to coerce to numeric
#' (define in the notebook or config, not hardcoded here).
#' @return Data table with district-year indicators.
normalize_qoc_routine_types <- function(routine, indicator_cols) {
data.table::setDT(routine)
available_cols <- intersect(indicator_cols, names(routine))

for (col in available_cols) {
col_vals <- as.character(routine[[col]])
col_vals[is.na(col_vals) | col_vals == "" | col_vals == "-"] <- NA_character_
routine[, (col) := as.numeric(col_vals)]
}

routine[, YEAR := as.integer(YEAR)]
routine[, ADM2_ID := as.character(ADM2_ID)]
routine
}

#' Aggregate QoC routine indicators by district and year.
#'
#' @param routine Routine data table with normalized types.
#' @param indicator_cols Character vector of column names to sum (must match the vector used
#' in [normalize_qoc_routine_types()]).
#' @return Aggregated district-year data table.
aggregate_qoc_district_year <- function(routine, indicator_cols) {
available_cols <- intersect(indicator_cols, names(routine))

if (length(available_cols) > 0) {
routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), .SDcols = available_cols, by = .(ADM2_ID, YEAR)]
} else {
unique(routine[, .(ADM2_ID, YEAR)])
}
}

#' Merge ADM2 labels into Quality of Care outputs.
#'
#' @param qoc_dt Quality-of-care data table.
#' @param shapes_sf Shapes sf table.
#' @return Data table with optional ADM2_NAME.
attach_quality_of_care_shapes <- function(qoc_dt, shapes_sf) {
shapes_dt <- data.table::as.data.table(sf::st_drop_geometry(shapes_sf))
if ("ADM2_ID" %in% names(shapes_dt) && "ADM2_NAME" %in% names(shapes_dt)) {
shapes_dt[, ADM2_ID := as.character(ADM2_ID)]
qoc_dt <- merge(qoc_dt, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = "ADM2_ID", all.x = TRUE)
}
qoc_dt
}

#' Save district-year Quality of Care outputs.
#'
#' @param qoc_dt Computed quality-of-care data table.
#' @param output_data_path Output directory path.
#' @param country_code Country code.
#' @param data_action Action suffix for output naming.
#' @return Named list with `parquet` and `csv` output file paths.
save_quality_of_care_outputs <- function(qoc_dt, output_data_path, country_code, data_action) {
out_district_parquet <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.parquet"))
out_district_csv <- file.path(output_data_path, glue::glue("{country_code}_quality_of_care_district_year_{data_action}.csv"))

arrow::write_parquet(qoc_dt, out_district_parquet)
data.table::fwrite(qoc_dt, out_district_csv)
log_msg(glue::glue("Saved outputs: {out_district_parquet}, {out_district_csv}"))

list(parquet = out_district_parquet, csv = out_district_csv)
}

#' Generate and save yearly district maps for QoC indicators.
#'
#' @param qoc_dt Quality-of-care data table.
#' @param shapes_sf District shapes sf.
#' @param figures_path Folder where PNG maps are written.
#' @return Invisibly returns `TRUE`.
save_quality_of_care_maps <- function(qoc_dt, shapes_sf, figures_path) {
shapes_sf$ADM2_ID <- as.character(shapes_sf$ADM2_ID)
qoc_dt$ADM2_ID <- as.character(qoc_dt$ADM2_ID)

plot_yearly_map <- function(df, sf_shapes, value_col, title_prefix, filename_prefix, is_rate = TRUE) {
if (!(value_col %in% names(df))) return(invisible(NULL))
sf_shapes_local <- sf_shapes
years <- sort(unique(df$YEAR))

for (yr in years) {
tryCatch(
{
df_y <- df[YEAR == yr]
if (nrow(df_y) == 0) next
df_y$ADM2_ID <- as.character(df_y$ADM2_ID)
map_df <- dplyr::left_join(sf_shapes_local, df_y, by = "ADM2_ID")
if (!(value_col %in% names(map_df))) next

vals <- map_df[[value_col]]
finite_vals <- vals[is.finite(vals) & !is.na(vals)]
if (length(finite_vals) == 0) next

if (is_rate) {
cat_vals <- cut(vals, breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf), labels = c("<0", "0-0.2", "0.2-0.4", "0.4-0.6", "0.6-0.8", "0.8-1.0", ">1.0"), include.lowest = TRUE)
fill_palette <- "YlOrRd"
} else {
if (length(finite_vals) > 4) {
br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE)))
if (length(br) < 2) {
cat_vals <- as.factor(rep("all", nrow(map_df)))
} else {
cat_vals <- cut(vals, breaks = br, include.lowest = TRUE)
}
} else {
cat_vals <- as.factor(vals)
}
fill_palette <- "Blues"
}

map_df <- dplyr::mutate(map_df, cat = as.factor(cat_vals))
p <- ggplot2::ggplot(map_df) +
ggplot2::geom_sf(ggplot2::aes(fill = cat), color = "grey60", size = 0.1) +
ggplot2::scale_fill_brewer(palette = fill_palette, na.value = "white", drop = FALSE) +
ggplot2::theme_void() +
ggplot2::labs(title = paste0(title_prefix, " - ", yr), fill = value_col, caption = "Source: SNT DHIS2 outliers-imputed routine data") +
ggplot2::theme(legend.position = "bottom", plot.title = ggplot2::element_text(face = "bold", size = 12))

out_png <- file.path(figures_path, glue::glue("{filename_prefix}_{yr}.png"))
ggplot2::ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = "white")
log_msg(glue::glue("Saved map: {out_png}"))
},
error = function(e) {
log_msg(glue::glue("[WARNING] Failed to build/save map for `{value_col}` year `{yr}`: {conditionMessage(e)}"), level = "warning")
}
)
}
}

plot_yearly_map(qoc_dt, shapes_sf, "testing_rate","Testing rate (TEST / SUSP)","testing_rate",TRUE)
plot_yearly_map(qoc_dt, shapes_sf, "treatment_rate","Treatment rate (MALTREAT / CONF)","treatment_rate",TRUE)
plot_yearly_map(qoc_dt, shapes_sf, "case_fatality_rate","In-hospital case fatality rate (MALDTH / MALADM)","case_fatality_rate",TRUE)
plot_yearly_map(qoc_dt, shapes_sf, "prop_adm_malaria","Proportion admitted for malaria (MALADM / ALLADM)","prop_adm_malaria",TRUE)
plot_yearly_map(qoc_dt, shapes_sf, "prop_malaria_deaths","Proportion of malaria deaths (MALDTH / ALLDTH)","prop_malaria_deaths",TRUE)
plot_yearly_map(qoc_dt, shapes_sf, "non_malaria_all_cause_outpatients","Non-malaria all-cause outpatients (ALLOUT)","allout",FALSE)
plot_yearly_map(qoc_dt, shapes_sf, "presumed_cases","Presumed cases (PRES)","presumed_cases",FALSE)

log_msg(glue::glue("Saved yearly maps in: {figures_path}"))
invisible(TRUE)
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# Load pipeline helpers (common + code-specific functions).
source(file.path("~/workspace", "pipelines", "snt_dhis2_quality_of_care", "utils", "snt_dhis2_quality_of_care.r"))


#' Load latest Quality of Care district-year output.
#'
#' @param output_data_path Path to quality-of-care data outputs.
#' @param country_code Country code.
#' @return Named list with `qoc` (data table) and `latest_file` (path).
load_latest_quality_of_care_output <- function(output_data_path, country_code) {
files <- list.files(
output_data_path,
pattern = paste0("^", country_code, "_quality_of_care_district_year_(imputed|removed)\\.parquet$"),
full.names = TRUE
)
if (length(files) == 0) {
stop(glue::glue("[ERROR] No quality_of_care parquet found in {output_data_path}"))
}
latest_file <- files[which.max(file.info(files)$mtime)]
qoc <- data.table::as.data.table(arrow::read_parquet(latest_file))
list(qoc = qoc, latest_file = latest_file)
}


#' Build year-level Quality of Care summary table.
#'
#' @param qoc_dt Quality-of-care district-year data table.
#' @return Year-level summary table ordered by YEAR.
build_quality_of_care_summary <- function(qoc_dt) {
mean_cols <- c("testing_rate", "treatment_rate", "case_fatality_rate", "prop_adm_malaria", "prop_malaria_deaths")
sum_cols <- c("non_malaria_all_cause_outpatients", "presumed_cases")

summary_tbl <- unique(qoc_dt[, .(YEAR)])

for (col in intersect(mean_cols, names(qoc_dt))) {
agg <- qoc_dt[, setNames(list(mean(get(col), na.rm = TRUE)), col), by = .(YEAR)]
summary_tbl <- merge(summary_tbl, agg, by = "YEAR", all.x = TRUE)
}

for (col in intersect(sum_cols, names(qoc_dt))) {
agg <- qoc_dt[, setNames(list(sum(get(col), na.rm = TRUE)), col), by = .(YEAR)]
summary_tbl <- merge(summary_tbl, agg, by = "YEAR", all.x = TRUE)
}

summary_tbl[order(YEAR)]
}


#' Save year-level summary outputs (parquet and csv only; no Excel — avoids extra deps).
#'
#' @param summary_tbl Summary table.
#' @param report_outputs_path Reporting outputs folder.
#' @param country_code Country code.
#' @return Named list with `summary_parquet` and `summary_csv` paths.
save_quality_of_care_summary_outputs <- function(summary_tbl, report_outputs_path, country_code) {
summary_parquet <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.parquet"))
summary_csv <- file.path(report_outputs_path, glue::glue("{country_code}_quality_of_care_summary.csv"))

arrow::write_parquet(summary_tbl, summary_parquet)
data.table::fwrite(summary_tbl, summary_csv)

log_msg(glue::glue("Summary data saved to: {summary_parquet}, {summary_csv}"))
list(summary_parquet = summary_parquet, summary_csv = summary_csv)
}


#' Build and save year-level bar chart panel for QoC indicators.
#'
#' @param summary_tbl Year-level summary table.
#' @param figures_path Folder where the combined chart is saved.
#' @param country_code Country code used in output file name.
#' @return Path to saved chart, or NULL if no indicator columns are available.
save_quality_of_care_summary_charts <- function(summary_tbl, figures_path, country_code) {
plot_data <- data.table::copy(summary_tbl)
if (nrow(plot_data) == 0) return(NULL)

make_pct_plot <- function(col_name, title_name) {
ggplot2::ggplot(plot_data, ggplot2::aes(x = factor(YEAR), y = .data[[col_name]] * 100)) +
ggplot2::geom_bar(stat = "identity", fill = "#2563eb", color = "#1e40af", width = 0.7) +
ggplot2::geom_text(ggplot2::aes(label = paste0(round(.data[[col_name]] * 100, 1), "%")), vjust = -0.5, size = 2.5) +
ggplot2::labs(title = title_name, x = "Annee", y = "%") +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(face = "bold", size = 10),
axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 9),
panel.grid.major.y = ggplot2::element_line(linetype = "dashed", color = scales::alpha("grey", 0.7)),
plot.background = ggplot2::element_rect(fill = "#fafafa", color = NA),
panel.background = ggplot2::element_rect(fill = "#fafafa", color = NA),
plot.margin = ggplot2::margin(5, 5, 5, 5)
) +
ggplot2::scale_y_continuous(expand = ggplot2::expansion(mult = c(0, 0.1)))
}

make_abs_plot <- function(col_name, title_name) {
format_label <- function(v) {
ifelse(
is.na(v) | v == 0,
"0",
ifelse(v >= 1e6, paste0(round(v / 1e6, 2), "M"), format(round(v), big.mark = " ", scientific = FALSE))
)
}
ggplot2::ggplot(plot_data, ggplot2::aes(x = factor(YEAR), y = .data[[col_name]])) +
ggplot2::geom_bar(stat = "identity", fill = "#2563eb", color = "#1e40af", width = 0.7) +
ggplot2::geom_text(ggplot2::aes(label = format_label(.data[[col_name]])), vjust = -0.5, size = 2.5) +
ggplot2::labs(title = title_name, x = "Annee", y = "Nombre") +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(face = "bold", size = 10),
axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 9),
panel.grid.major.y = ggplot2::element_line(linetype = "dashed", color = scales::alpha("grey", 0.7)),
plot.background = ggplot2::element_rect(fill = "#fafafa", color = NA),
panel.background = ggplot2::element_rect(fill = "#fafafa", color = NA),
plot.margin = ggplot2::margin(5, 5, 5, 5)
) +
ggplot2::scale_y_continuous(labels = scales::comma, expand = ggplot2::expansion(mult = c(0, 0.1)))
}

plots_list <- list()
if ("testing_rate" %in% names(plot_data)) plots_list[["testing_rate"]] <- make_pct_plot("testing_rate", "Testing rate (TEST / SUSP)")
if ("treatment_rate" %in% names(plot_data)) plots_list[["treatment_rate"]] <- make_pct_plot("treatment_rate", "Treatment rate (MALTREAT / CONF)")
if ("case_fatality_rate" %in% names(plot_data)) plots_list[["case_fatality_rate"]] <- make_pct_plot("case_fatality_rate", "Case fatality rate (MALDTH / MALADM)")
if ("prop_adm_malaria" %in% names(plot_data)) plots_list[["prop_adm_malaria"]] <- make_pct_plot("prop_adm_malaria", "Prop. admissions paludisme (MALADM / ALLADM)")
if ("prop_malaria_deaths" %in% names(plot_data)) plots_list[["prop_malaria_deaths"]] <- make_pct_plot("prop_malaria_deaths", "Prop. deces paludisme (MALDTH / ALLDTH)")
if ("presumed_cases" %in% names(plot_data)) plots_list[["presumed_cases"]] <- make_abs_plot("presumed_cases", "Cas presumes (PRES)")
if ("non_malaria_all_cause_outpatients" %in% names(plot_data)) plots_list[["non_malaria_all_cause_outpatients"]] <- make_abs_plot("non_malaria_all_cause_outpatients", "Consultations externes non-paludisme (ALLOUT)")

if (length(plots_list) == 0) return(NULL)

plot_order <- c("testing_rate", "treatment_rate", "case_fatality_rate", "prop_adm_malaria", "prop_malaria_deaths", "presumed_cases", "non_malaria_all_cause_outpatients")
available_plots <- plots_list[intersect(plot_order, names(plots_list))]
n_plots <- length(available_plots)
ncol_layout <- 2
nrow_layout <- ceiling(n_plots / ncol_layout)

combined_plot <- do.call(gridExtra::grid.arrange, c(available_plots, ncol = ncol_layout, nrow = nrow_layout))
out_file <- file.path(figures_path, glue::glue("{country_code}_quality_of_care_by_year.png"))
ggplot2::ggsave(out_file, plot = combined_plot, width = 18, height = max(8, 5.2 * nrow_layout), dpi = 300, bg = "white", units = "in")
log_msg(glue::glue("Combined bar charts saved: {out_file}"))
out_file
}
Loading