diff --git a/DESCRIPTION b/DESCRIPTION index fabf52d8..5dcee62a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,7 +5,7 @@ Description: Base 'DataSHIELD' functions for the server side. 'DataSHIELD' is a been designed to only share non disclosive summary statistics, with built in automated output checking based on statistical disclosure control. With data sites setting the threshold values for the automated output checks. For more details, see 'citation("dsBase")'. -Version: 6.3.5 +Version: 6.3.6 Authors@R: c(person(given = "Paul", family = "Burton", role = c("aut"), @@ -65,6 +65,9 @@ Imports: stringr, lme4, dplyr, + tibble, + purrr, + tidyselect, reshape2, polycor (>= 0.8), splines, diff --git a/NAMESPACE b/NAMESPACE index e52e5d10..6793ba57 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,7 +41,12 @@ export(dmtC2SDS) export(elsplineDS) export(extractQuantilesDS1) export(extractQuantilesDS2) +export(fixClassDS) +export(fixColsDS) +export(fixLevelsDS) export(gamlssDS) +export(getAllLevelsDS) +export(getClassAllColsDS) export(getWGSRDS) export(glmDS1) export(glmDS2) @@ -139,5 +144,15 @@ import(dplyr) import(gamlss) import(gamlss.dist) import(mice) +importFrom(dplyr,"%>%") +importFrom(dplyr,across) +importFrom(dplyr,mutate) +importFrom(dplyr,select) importFrom(gamlss.dist,pST3) importFrom(gamlss.dist,qST3) +importFrom(purrr,imap) +importFrom(purrr,map) +importFrom(purrr,set_names) +importFrom(tibble,as_tibble) +importFrom(tidyselect,all_of) +importFrom(tidyselect,peek_vars) diff --git a/R/standardiseDfDS.R b/R/standardiseDfDS.R new file mode 100644 index 00000000..24e3eecf --- /dev/null +++ b/R/standardiseDfDS.R @@ -0,0 +1,133 @@ +#' Get the Class of All Columns in a Data Frame +#' @param df.name A string representing the name of the data frame. +#' @return A tibble with the class of each column in the data frame. +#' @importFrom dplyr %>% +#' @importFrom tibble as_tibble +#' @importFrom purrr map +#' @export +getClassAllColsDS <- function(df.name){ + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df.name <- eval(parse(text = df.name), envir = parent.frame()) + all_classes <- map(df.name, class) %>% as_tibble() + return(all_classes) +} + +#' Change Class of Target Variables in a Data Frame +#' @param df.name A string representing the name of the data frame. +#' @param target_vars A character vector specifying the columns to be modified. +#' @param target_class A character vector specifying the new classes for each column (1 = factor, +#' 2 = integer, 3 = numeric, 4 = character, 5 = logical). +#' @return A modified data frame with the specified columns converted to the target classes. +#' @importFrom dplyr mutate across +#' @importFrom tidyselect all_of +#' @export +fixClassDS <- function(df.name, target_vars, target_class) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df <- eval(parse(text = df.name), envir = parent.frame()) + df_transformed <- df %>% + mutate( + across(all_of(target_vars), + ~ .convertClass(.x, target_class[which(target_vars == cur_column())]))) + return(df_transformed) +} + +#' Convert a Vector to a Specified Class +#' @param x The vector to be converted. +#' @param class_name A string indicating the target class (1 = factor, 2 = integer, 3 = numeric, +#' 4 = character, 5 = logical). +#' @return The converted vector. +#' @noRd +.convertClass <- function(target_var, target_class_code) { + switch(target_class_code, + "1" = as.factor(target_var), + "2" = as.integer(target_var), + "3" = as.numeric(target_var), + "4" = as.character(target_var), + "5" = as.logical(target_var) + ) +} + +#' Add Missing Columns with NA Values +#' @param .data A string representing the name of the data frame. +#' @param cols A character vector specifying the columns to be added if missing. +#' @return A modified data frame with missing columns added and filled with NA. +#' @importFrom dplyr mutate select +#' @importFrom tidyselect peek_vars +#' @importFrom purrr set_names +#' @export +fixColsDS <- function(.data, cols) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + .data <- eval(parse(text = .data), envir = parent.frame()) + missing <- setdiff(cols, colnames(.data)) + out <- .data %>% + mutate(!!!set_names(rep(list(NA), length(missing)), missing)) %>% + select(sort(peek_vars())) + return(out) +} + +#' Retrieve Factor Levels for Specific Columns +#' @param df.name A string representing the name of the data frame. +#' @param factor_vars A character vector specifying the factor columns. +#' @return A list of factor levels for the specified columns. +#' @importFrom tidyselect all_of +#' @importFrom purrr map imap +#' @export +getAllLevelsDS <- function(df.name, factor_vars) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df <- eval(parse(text = df.name), envir = parent.frame()) + factor_vars_split <- strsplit(factor_vars, ",\\s*")[[1]] + levels <- purrr::map(df[factor_vars_split], base::levels) + + disclosure_check <- imap(levels, function(lvls, var) { + .checkLevelsDisclosure(df = df, var = var, levels = lvls) + }) + + failed_vars <- names(disclosure_check)[unlist(disclosure_check)] + + if(length(failed_vars) > 0) { + stop("Based on the value of nfilter.levels.density, these factor variables", " {", failed_vars, "} ", "have too many levels compared to the length of the variable. Please reduce the numnber of levels or change the variable type and try again") + } else { + return(levels) + } +} + +#' Check variable levels against disclosure thresholds +#' +#' Internal helper function to verify whether the number of levels in a variable +#' exceeds the allowed density threshold defined by `dsBase::listDisclosureSettingsDS()`. +#' +#' @param df A data frame containing the variable. +#' @param var Character string. Name of the variable to check. +#' @param levels Character vector. Levels of the variable. +#' +#' @return Logical. `TRUE` if the check fails (i.e., disclosure threshold is violated), +#' otherwise `FALSE`. +#' +#' @keywords internal +#' @noRd +.checkLevelsDisclosure <- function(df, var, levels) { + thr <- dsBase::listDisclosureSettingsDS() + nfilter.levels.density <- as.numeric(thr$nfilter.levels.density) + n_levels <- length(levels) + length_var <- length(df[[var]]) + fail <- (length_var * nfilter.levels.density) < n_levels + return(fail) +} + +#' Set Factor Levels for Specific Columns in a Data Frame +#' @param df.name A string representing the name of the data frame to modify. +#' @param vars A character vector specifying the columns to be modified. +#' @param levels A named list where each element contains the levels for the corresponding factor variable. +#' @return A modified data frame with the specified columns converted to factors with the provided levels. +#' @export +fixLevelsDS <- function(df.name, vars, levels) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df.name <- eval(parse(text = df.name), envir = parent.frame()) + out <- df.name %>% + mutate(across(all_of(vars), ~factor(., levels = levels[[dplyr::cur_column()]]))) +} diff --git a/inst/DATASHIELD b/inst/DATASHIELD index 0c59f0c2..a7acb62a 100644 --- a/inst/DATASHIELD +++ b/inst/DATASHIELD @@ -70,7 +70,9 @@ AggregateMethods: is.null=base::is.null, is.numeric=base::is.numeric, NROW=base::NROW, - t.test=stats::t.test + t.test=stats::t.test, + getClassAllColsDS, + getAllLevelsDS AssignMethods: absDS, asCharacterDS, @@ -161,7 +163,10 @@ AssignMethods: acos=base::acos, atan=base::atan, sum=base::sum, - unlist=base::unlist + unlist=base::unlist, + fixClassDS, + fixColsDS, + fixLevelsDS Options: datashield.privacyLevel=5, default.datashield.privacyControlLevel="banana", diff --git a/man/fixClassDS.Rd b/man/fixClassDS.Rd new file mode 100644 index 00000000..d7b6bf17 --- /dev/null +++ b/man/fixClassDS.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{fixClassDS} +\alias{fixClassDS} +\title{Change Class of Target Variables in a Data Frame} +\usage{ +fixClassDS(df.name, target_vars, target_class) +} +\arguments{ +\item{df.name}{A string representing the name of the data frame.} + +\item{target_vars}{A character vector specifying the columns to be modified.} + +\item{target_class}{A character vector specifying the new classes for each column (1 = factor, +2 = integer, 3 = numeric, 4 = character, 5 = logical).} +} +\value{ +A modified data frame with the specified columns converted to the target classes. +} +\description{ +Change Class of Target Variables in a Data Frame +} diff --git a/man/fixColsDS.Rd b/man/fixColsDS.Rd new file mode 100644 index 00000000..709d9472 --- /dev/null +++ b/man/fixColsDS.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{fixColsDS} +\alias{fixColsDS} +\title{Add Missing Columns with NA Values} +\usage{ +fixColsDS(.data, cols) +} +\arguments{ +\item{.data}{A string representing the name of the data frame.} + +\item{cols}{A character vector specifying the columns to be added if missing.} +} +\value{ +A modified data frame with missing columns added and filled with NA. +} +\description{ +Add Missing Columns with NA Values +} diff --git a/man/fixLevelsDS.Rd b/man/fixLevelsDS.Rd new file mode 100644 index 00000000..096757a9 --- /dev/null +++ b/man/fixLevelsDS.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{fixLevelsDS} +\alias{fixLevelsDS} +\title{Set Factor Levels for Specific Columns in a Data Frame} +\usage{ +fixLevelsDS(df.name, vars, levels) +} +\arguments{ +\item{df.name}{A string representing the name of the data frame to modify.} + +\item{vars}{A character vector specifying the columns to be modified.} + +\item{levels}{A named list where each element contains the levels for the corresponding factor variable.} +} +\value{ +A modified data frame with the specified columns converted to factors with the provided levels. +} +\description{ +Set Factor Levels for Specific Columns in a Data Frame +} diff --git a/man/getAllLevelsDS.Rd b/man/getAllLevelsDS.Rd new file mode 100644 index 00000000..e5030725 --- /dev/null +++ b/man/getAllLevelsDS.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{getAllLevelsDS} +\alias{getAllLevelsDS} +\title{Retrieve Factor Levels for Specific Columns} +\usage{ +getAllLevelsDS(df.name, factor_vars) +} +\arguments{ +\item{df.name}{A string representing the name of the data frame.} + +\item{factor_vars}{A character vector specifying the factor columns.} +} +\value{ +A list of factor levels for the specified columns. +} +\description{ +Retrieve Factor Levels for Specific Columns +} diff --git a/man/getClassAllColsDS.Rd b/man/getClassAllColsDS.Rd new file mode 100644 index 00000000..cb2de0e7 --- /dev/null +++ b/man/getClassAllColsDS.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{getClassAllColsDS} +\alias{getClassAllColsDS} +\title{Get the Class of All Columns in a Data Frame} +\usage{ +getClassAllColsDS(df.name) +} +\arguments{ +\item{df.name}{A string representing the name of the data frame.} +} +\value{ +A tibble with the class of each column in the data frame. +} +\description{ +Get the Class of All Columns in a Data Frame +}