diff --git a/DESCRIPTION b/DESCRIPTION index 18c70ac7..4f87c1f1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -65,6 +65,9 @@ Imports: stringr, lme4, dplyr, + tibble, + purrr, + tidyselect, reshape2, polycor (>= 0.8), splines, diff --git a/NAMESPACE b/NAMESPACE index e52e5d10..6793ba57 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,7 +41,12 @@ export(dmtC2SDS) export(elsplineDS) export(extractQuantilesDS1) export(extractQuantilesDS2) +export(fixClassDS) +export(fixColsDS) +export(fixLevelsDS) export(gamlssDS) +export(getAllLevelsDS) +export(getClassAllColsDS) export(getWGSRDS) export(glmDS1) export(glmDS2) @@ -139,5 +144,15 @@ import(dplyr) import(gamlss) import(gamlss.dist) import(mice) +importFrom(dplyr,"%>%") +importFrom(dplyr,across) +importFrom(dplyr,mutate) +importFrom(dplyr,select) importFrom(gamlss.dist,pST3) importFrom(gamlss.dist,qST3) +importFrom(purrr,imap) +importFrom(purrr,map) +importFrom(purrr,set_names) +importFrom(tibble,as_tibble) +importFrom(tidyselect,all_of) +importFrom(tidyselect,peek_vars) diff --git a/R/standardiseDfDS.R b/R/standardiseDfDS.R new file mode 100644 index 00000000..24e3eecf --- /dev/null +++ b/R/standardiseDfDS.R @@ -0,0 +1,133 @@ +#' Get the Class of All Columns in a Data Frame +#' @param df.name A string representing the name of the data frame. +#' @return A tibble with the class of each column in the data frame. +#' @importFrom dplyr %>% +#' @importFrom tibble as_tibble +#' @importFrom purrr map +#' @export +getClassAllColsDS <- function(df.name){ + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df.name <- eval(parse(text = df.name), envir = parent.frame()) + all_classes <- map(df.name, class) %>% as_tibble() + return(all_classes) +} + +#' Change Class of Target Variables in a Data Frame +#' @param df.name A string representing the name of the data frame. +#' @param target_vars A character vector specifying the columns to be modified. +#' @param target_class A character vector specifying the new classes for each column (1 = factor, +#' 2 = integer, 3 = numeric, 4 = character, 5 = logical). +#' @return A modified data frame with the specified columns converted to the target classes. +#' @importFrom dplyr mutate across +#' @importFrom tidyselect all_of +#' @export +fixClassDS <- function(df.name, target_vars, target_class) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df <- eval(parse(text = df.name), envir = parent.frame()) + df_transformed <- df %>% + mutate( + across(all_of(target_vars), + ~ .convertClass(.x, target_class[which(target_vars == cur_column())]))) + return(df_transformed) +} + +#' Convert a Vector to a Specified Class +#' @param x The vector to be converted. +#' @param class_name A string indicating the target class (1 = factor, 2 = integer, 3 = numeric, +#' 4 = character, 5 = logical). +#' @return The converted vector. +#' @noRd +.convertClass <- function(target_var, target_class_code) { + switch(target_class_code, + "1" = as.factor(target_var), + "2" = as.integer(target_var), + "3" = as.numeric(target_var), + "4" = as.character(target_var), + "5" = as.logical(target_var) + ) +} + +#' Add Missing Columns with NA Values +#' @param .data A string representing the name of the data frame. +#' @param cols A character vector specifying the columns to be added if missing. +#' @return A modified data frame with missing columns added and filled with NA. +#' @importFrom dplyr mutate select +#' @importFrom tidyselect peek_vars +#' @importFrom purrr set_names +#' @export +fixColsDS <- function(.data, cols) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + .data <- eval(parse(text = .data), envir = parent.frame()) + missing <- setdiff(cols, colnames(.data)) + out <- .data %>% + mutate(!!!set_names(rep(list(NA), length(missing)), missing)) %>% + select(sort(peek_vars())) + return(out) +} + +#' Retrieve Factor Levels for Specific Columns +#' @param df.name A string representing the name of the data frame. +#' @param factor_vars A character vector specifying the factor columns. +#' @return A list of factor levels for the specified columns. +#' @importFrom tidyselect all_of +#' @importFrom purrr map imap +#' @export +getAllLevelsDS <- function(df.name, factor_vars) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df <- eval(parse(text = df.name), envir = parent.frame()) + factor_vars_split <- strsplit(factor_vars, ",\\s*")[[1]] + levels <- purrr::map(df[factor_vars_split], base::levels) + + disclosure_check <- imap(levels, function(lvls, var) { + .checkLevelsDisclosure(df = df, var = var, levels = lvls) + }) + + failed_vars <- names(disclosure_check)[unlist(disclosure_check)] + + if(length(failed_vars) > 0) { + stop("Based on the value of nfilter.levels.density, these factor variables", " {", failed_vars, "} ", "have too many levels compared to the length of the variable. Please reduce the numnber of levels or change the variable type and try again") + } else { + return(levels) + } +} + +#' Check variable levels against disclosure thresholds +#' +#' Internal helper function to verify whether the number of levels in a variable +#' exceeds the allowed density threshold defined by `dsBase::listDisclosureSettingsDS()`. +#' +#' @param df A data frame containing the variable. +#' @param var Character string. Name of the variable to check. +#' @param levels Character vector. Levels of the variable. +#' +#' @return Logical. `TRUE` if the check fails (i.e., disclosure threshold is violated), +#' otherwise `FALSE`. +#' +#' @keywords internal +#' @noRd +.checkLevelsDisclosure <- function(df, var, levels) { + thr <- dsBase::listDisclosureSettingsDS() + nfilter.levels.density <- as.numeric(thr$nfilter.levels.density) + n_levels <- length(levels) + length_var <- length(df[[var]]) + fail <- (length_var * nfilter.levels.density) < n_levels + return(fail) +} + +#' Set Factor Levels for Specific Columns in a Data Frame +#' @param df.name A string representing the name of the data frame to modify. +#' @param vars A character vector specifying the columns to be modified. +#' @param levels A named list where each element contains the levels for the corresponding factor variable. +#' @return A modified data frame with the specified columns converted to factors with the provided levels. +#' @export +fixLevelsDS <- function(df.name, vars, levels) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df.name <- eval(parse(text = df.name), envir = parent.frame()) + out <- df.name %>% + mutate(across(all_of(vars), ~factor(., levels = levels[[dplyr::cur_column()]]))) +} diff --git a/inst/DATASHIELD b/inst/DATASHIELD index 0c59f0c2..a7acb62a 100644 --- a/inst/DATASHIELD +++ b/inst/DATASHIELD @@ -70,7 +70,9 @@ AggregateMethods: is.null=base::is.null, is.numeric=base::is.numeric, NROW=base::NROW, - t.test=stats::t.test + t.test=stats::t.test, + getClassAllColsDS, + getAllLevelsDS AssignMethods: absDS, asCharacterDS, @@ -161,7 +163,10 @@ AssignMethods: acos=base::acos, atan=base::atan, sum=base::sum, - unlist=base::unlist + unlist=base::unlist, + fixClassDS, + fixColsDS, + fixLevelsDS Options: datashield.privacyLevel=5, default.datashield.privacyControlLevel="banana", diff --git a/man/fixClassDS.Rd b/man/fixClassDS.Rd new file mode 100644 index 00000000..d7b6bf17 --- /dev/null +++ b/man/fixClassDS.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{fixClassDS} +\alias{fixClassDS} +\title{Change Class of Target Variables in a Data Frame} +\usage{ +fixClassDS(df.name, target_vars, target_class) +} +\arguments{ +\item{df.name}{A string representing the name of the data frame.} + +\item{target_vars}{A character vector specifying the columns to be modified.} + +\item{target_class}{A character vector specifying the new classes for each column (1 = factor, +2 = integer, 3 = numeric, 4 = character, 5 = logical).} +} +\value{ +A modified data frame with the specified columns converted to the target classes. +} +\description{ +Change Class of Target Variables in a Data Frame +} diff --git a/man/fixColsDS.Rd b/man/fixColsDS.Rd new file mode 100644 index 00000000..709d9472 --- /dev/null +++ b/man/fixColsDS.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{fixColsDS} +\alias{fixColsDS} +\title{Add Missing Columns with NA Values} +\usage{ +fixColsDS(.data, cols) +} +\arguments{ +\item{.data}{A string representing the name of the data frame.} + +\item{cols}{A character vector specifying the columns to be added if missing.} +} +\value{ +A modified data frame with missing columns added and filled with NA. +} +\description{ +Add Missing Columns with NA Values +} diff --git a/man/fixLevelsDS.Rd b/man/fixLevelsDS.Rd new file mode 100644 index 00000000..096757a9 --- /dev/null +++ b/man/fixLevelsDS.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{fixLevelsDS} +\alias{fixLevelsDS} +\title{Set Factor Levels for Specific Columns in a Data Frame} +\usage{ +fixLevelsDS(df.name, vars, levels) +} +\arguments{ +\item{df.name}{A string representing the name of the data frame to modify.} + +\item{vars}{A character vector specifying the columns to be modified.} + +\item{levels}{A named list where each element contains the levels for the corresponding factor variable.} +} +\value{ +A modified data frame with the specified columns converted to factors with the provided levels. +} +\description{ +Set Factor Levels for Specific Columns in a Data Frame +} diff --git a/man/getAllLevelsDS.Rd b/man/getAllLevelsDS.Rd new file mode 100644 index 00000000..e5030725 --- /dev/null +++ b/man/getAllLevelsDS.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{getAllLevelsDS} +\alias{getAllLevelsDS} +\title{Retrieve Factor Levels for Specific Columns} +\usage{ +getAllLevelsDS(df.name, factor_vars) +} +\arguments{ +\item{df.name}{A string representing the name of the data frame.} + +\item{factor_vars}{A character vector specifying the factor columns.} +} +\value{ +A list of factor levels for the specified columns. +} +\description{ +Retrieve Factor Levels for Specific Columns +} diff --git a/man/getClassAllColsDS.Rd b/man/getClassAllColsDS.Rd new file mode 100644 index 00000000..cb2de0e7 --- /dev/null +++ b/man/getClassAllColsDS.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardiseDfDS.R +\name{getClassAllColsDS} +\alias{getClassAllColsDS} +\title{Get the Class of All Columns in a Data Frame} +\usage{ +getClassAllColsDS(df.name) +} +\arguments{ +\item{df.name}{A string representing the name of the data frame.} +} +\value{ +A tibble with the class of each column in the data frame. +} +\description{ +Get the Class of All Columns in a Data Frame +}