From 46f367fc16259fdb436bc0bd3fbd1ea6f9f644a8 Mon Sep 17 00:00:00 2001 From: "Sow, Haby" Date: Fri, 29 May 2026 10:10:40 -0400 Subject: [PATCH 1/4] n environment fix --- .Rbuildignore | 2 ++ .gitignore | 1 + R/allocate.R | 11 +++++++---- R/srs.R | 18 ++++++++++++------ R/sys.R | 23 ++++++++++++----------- R/sys_pps.R | 9 +++++---- R/util.R | 5 +++++ man/SampleSelectR-package.Rd | 1 + 8 files changed, 45 insertions(+), 25 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 6022675..a799dca 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,5 @@ ^_pkgdown\.yml$ ^docs$ ^pkgdown$ +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index 667204b..51d5219 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ .DS_Store .quarto docs +.positai diff --git a/R/allocate.R b/R/allocate.R index e21684b..1821f70 100644 --- a/R/allocate.R +++ b/R/allocate.R @@ -279,7 +279,10 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos } } # Calculate the total (raw) sample size - n <- max(ceiling(sum(allocations)), length(N.h) * lbound) + + + final_n <- max(ceiling(sum(allocations)), length(N.h) * lbound) + sizes <- allocations @@ -292,7 +295,7 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos total_allocated <- sum(adjusted_allocations) # Calculate the difference from the total n - difference <- n - total_allocated + difference <- final_n - total_allocated # If difference is positive, distribute it proportionally if (difference != 0) { @@ -314,7 +317,7 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos # Step 5: Adjust the rounded allocations to ensure the sum equals n total_allocated <- sum(rounded_allocations) - difference <- n - total_allocated + difference <- final_n - total_allocated # Adjust the allocations by adding/subtracting the difference while (difference != 0) { i <- sample(1:num_groups, 1) # Randomly select an index @@ -345,7 +348,7 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos } outputs <- as.integer(rounded_allocations) if (allocation == "optimal") { - n.print <- n + n.print <- final_n } else { n.print <- n.samp } diff --git a/R/srs.R b/R/srs.R index 677c5e7..d3467ac 100644 --- a/R/srs.R +++ b/R/srs.R @@ -28,6 +28,7 @@ #' @export + srs <- function(frame, n, outall = FALSE, curstrat = NULL) { check_frame_type(frame) check_n(n, frame, curstrat, n_le_N = TRUE) @@ -35,18 +36,22 @@ srs <- function(frame, n, outall = FALSE, curstrat = NULL) { N <- nrow(frame) - # Take the srs and create sampling-related columns + # Take the SRS and create sampling-related columns selectedVector <- sample(x = N, size = n, replace = FALSE) + frame <- frame |> tidytable::mutate( rowNum = tidytable::row_number(), - SelectionProbability = n / N, - SamplingWeight = ifelse(.data$rowNum %in% selectedVector, N / n, NA), - SelectionIndicator = ifelse(.data$rowNum %in% selectedVector, TRUE, FALSE) + SelectionProbability = .env$n / .env$N, + SamplingWeight = tidytable::if_else( + .data$rowNum %in% .env$selectedVector, + .env$N / .env$n, + NA_real_ + ), + SelectionIndicator = .data$rowNum %in% .env$selectedVector ) |> tidytable::select(-tidytable::all_of("rowNum")) - # Output to screen Sampling_Output(n, N, curstrat = curstrat) @@ -55,8 +60,9 @@ srs <- function(frame, n, outall = FALSE, curstrat = NULL) { sample <- frame |> tidytable::filter(.data$SelectionIndicator) |> tidytable::select(-tidytable::all_of("SelectionIndicator")) + return(sample) - } else if (outall) { + } else { return(frame) } } diff --git a/R/sys.R b/R/sys.R index 5fb4083..ec50f95 100644 --- a/R/sys.R +++ b/R/sys.R @@ -34,6 +34,7 @@ #' #' @export + sys <- function(frame, n, curstrat = NULL, outall = FALSE) { check_frame_type(frame) check_n(n, frame, curstrat, n_le_N = TRUE) @@ -42,32 +43,32 @@ sys <- function(frame, n, curstrat = NULL, outall = FALSE) { N <- nrow(frame) # Sampling method - k <- N / n # Sampling interval - r <- runif(1, 1, k) # We use a random start between 1 and k + r <- runif(1, 1, k) # Random start between 1 and k selectedVector <- floor(r + k * (0:(n - 1))) # Selected row indices - # We make sure that selected indices are within frame range - + # Make sure selected indices are within frame range selectedVector <- selectedVector[selectedVector <= N] - # Creating variables accordingly + # Create variables frame <- frame |> tidytable::mutate( numrow = tidytable::row_number(), - SelectionIndicator = .data$numrow %in% selectedVector, - SelectionProbability = n / N, - SamplingWeight = ifelse(.data$SelectionIndicator, N / n, NA) + SelectionIndicator = .data$numrow %in% .env$selectedVector, + SelectionProbability = .env$n / .env$N, + SamplingWeight = tidytable::if_else( + .data$SelectionIndicator, + .env$N / .env$n, + NA_real_ + ) ) |> tidytable::select(-tidytable::all_of("numrow")) # Output to screen Sampling_Output(n, N, k = k, r = r, curstrat = curstrat) - - # Return only selected rows and make sure the selected sample is a data.frame, tibble, or data.table - + # Return only the sample or the frame with selection indicator based on value of outall if (outall) { return(frame) } else { diff --git a/R/sys_pps.R b/R/sys_pps.R index ca0ff0c..4e5d3df 100644 --- a/R/sys_pps.R +++ b/R/sys_pps.R @@ -47,7 +47,7 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) { tbd_data_1 <- frame |> tidytable::mutate( rowNum = tidytable::row_number(), - ExpectedHits = n * (!!(symbol_mos) / totalSize), + ExpectedHits = .env$n * (!!(symbol_mos) / totalSize), SamplingWeight = .data$ExpectedHits^-1 ) @@ -64,11 +64,12 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) { selectedVector <- findInterval(selectedSizePoints, sizeIntervals) # Using selectedVector, get the total counts of each index + selectedVector_counts <- selectedVector |> as.data.frame() |> - tidytable::count(selectedVector) |> - # Rename to NumberHits - tidytable::rename(NumberHits = n) + tidytable::count(selectedVector, name = "NumberHits") + + tbd_data_2 <- tbd_data_1 |> tidytable::left_join( diff --git a/R/util.R b/R/util.R index 6b1f9e8..606e7fb 100644 --- a/R/util.R +++ b/R/util.R @@ -1,3 +1,8 @@ + + +# Declare tidy-evaluation pronouns used in package code +utils::globalVariables(c(".env")) + #' Check if the frame is a valid data structure #' #' Ensures that the input frame is a data.frame, data.table, or tibble. diff --git a/man/SampleSelectR-package.Rd b/man/SampleSelectR-package.Rd index ec80e39..de3ab40 100644 --- a/man/SampleSelectR-package.Rd +++ b/man/SampleSelectR-package.Rd @@ -12,6 +12,7 @@ Randomly select samples with SRS, systematic, and various PPS methods. Also incl Useful links: \itemize{ \item \url{https://github.com/RTIInternational/SampleSelectR} + \item \url{https://rtiinternational.github.io/SampleSelectR/} \item Report bugs at \url{https://github.com/RTIInternational/SampleSelectR/issues} } From dc39467b07613f17039e6b9ac084f593d62bdf95 Mon Sep 17 00:00:00 2001 From: "Sow, Haby" Date: Fri, 29 May 2026 10:59:59 -0400 Subject: [PATCH 2/4] Rlang fix --- NAMESPACE | 1 + R/SampleSelectR-package.R | 3 ++- R/util.R | 4 ---- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7fffe22..37e108c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,4 +17,5 @@ importFrom(data.table,.NGRP) importFrom(data.table,.SD) importFrom(data.table,data.table) importFrom(rlang,.data) +importFrom(rlang,.env) importFrom(stats,runif) diff --git a/R/SampleSelectR-package.R b/R/SampleSelectR-package.R index 50f5a29..1a615a1 100644 --- a/R/SampleSelectR-package.R +++ b/R/SampleSelectR-package.R @@ -11,7 +11,8 @@ #' @importFrom data.table .SD #' @importFrom data.table := #' @importFrom data.table data.table -#' @importFrom stats runif #' @importFrom rlang .data +#' @importFrom rlang .env +#' @importFrom stats runif ## usethis namespace: end NULL diff --git a/R/util.R b/R/util.R index 606e7fb..5c34668 100644 --- a/R/util.R +++ b/R/util.R @@ -1,8 +1,4 @@ - -# Declare tidy-evaluation pronouns used in package code -utils::globalVariables(c(".env")) - #' Check if the frame is a valid data structure #' #' Ensures that the input frame is a data.frame, data.table, or tibble. From c9ac4a11b436d7fa8f88dbc63320ba3a7f21f850 Mon Sep 17 00:00:00 2001 From: "Zimmer, Stephanie" Date: Fri, 29 May 2026 14:46:23 -0400 Subject: [PATCH 3/4] Add a few more .env pronouns for clarity --- R/chromy_pps.R | 4 ++-- R/sys_pps.R | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/chromy_pps.R b/R/chromy_pps.R index 0692e76..e70dc6a 100644 --- a/R/chromy_pps.R +++ b/R/chromy_pps.R @@ -53,8 +53,8 @@ chromy_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) { frame_hits <- frame |> tidytable::mutate( - ExpectedHits = exphits, - NumberHits = chromy_inner(exphits), + ExpectedHits = .env$exphits, + NumberHits = chromy_inner(.env$exphits), SelectionIndicator = .data$NumberHits > 0, SamplingWeight = ifelse(.data$SelectionIndicator, 1 / .data$ExpectedHits, NA), ) diff --git a/R/sys_pps.R b/R/sys_pps.R index 4e5d3df..53619fd 100644 --- a/R/sys_pps.R +++ b/R/sys_pps.R @@ -47,7 +47,7 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) { tbd_data_1 <- frame |> tidytable::mutate( rowNum = tidytable::row_number(), - ExpectedHits = .env$n * (!!(symbol_mos) / totalSize), + ExpectedHits = .env$n * (!!(symbol_mos) / .env$totalSize), SamplingWeight = .data$ExpectedHits^-1 ) @@ -79,7 +79,7 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) { # Need to zero filled NumberHits tidytable::mutate( NumberHits = tidytable::replace_na(.data$NumberHits, replace = 0), - SelectionIndicator = .data$rowNum %in% selectedVector, + SelectionIndicator = .data$rowNum %in% .env$selectedVector, # Make SamplingWeight to be NA if not selected SamplingWeight = tidytable::case_when( SelectionIndicator == TRUE ~ .data$SamplingWeight, From c64fde0feaf68aab4d516f03cd964dfe9ffb81dd Mon Sep 17 00:00:00 2001 From: "Zimmer, Stephanie" Date: Fri, 29 May 2026 15:08:20 -0400 Subject: [PATCH 4/4] Add test for sample frame with column --- tests/testthat/test-select_sample.R | 147 ++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 tests/testthat/test-select_sample.R diff --git a/tests/testthat/test-select_sample.R b/tests/testthat/test-select_sample.R new file mode 100644 index 0000000..a593c31 --- /dev/null +++ b/tests/testthat/test-select_sample.R @@ -0,0 +1,147 @@ +test_that("n on frame does not cause issue - chromy_pps", { + county_2023_slim_n <- county_2023 |> + tidytable::select(GEOID, Region, Pop_Tot) |> + tidytable::mutate( + n = 50, + ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot), + .by = "Region" + ) + + sampsizes <- county_2023_slim_n |> + tidytable::distinct(Region) |> + tidytable::mutate(sample_size = 10) + + set.seed(12345) + samp1 <- county_2023_slim_n |> + select_sample( + "chromy_pps", + n = sampsizes, + strata = "Region", + mos = "Pop_Tot", + outall = TRUE + ) + set.seed(12345) + samp2 <- county_2023_slim_n |> + tidytable::select(-n) |> + select_sample( + "chromy_pps", + n = sampsizes, + strata = "Region", + mos = "Pop_Tot", + outall = TRUE + ) + expect_equal( + samp1 |> tidytable::select(-n), + samp2 + ) +}) + +test_that("n on frame does not cause issue - srs", { + county_2023_slim_n <- county_2023 |> + tidytable::select(GEOID, Region, Pop_Tot) |> + tidytable::mutate( + n = 50, + ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot), + .by = "Region" + ) + + sampsizes <- county_2023_slim_n |> + tidytable::distinct(Region) |> + tidytable::mutate(sample_size = 10) + + set.seed(12345) + samp1 <- county_2023_slim_n |> + select_sample( + "srs", + n = sampsizes, + strata = "Region", + outall = TRUE + ) + set.seed(12345) + samp2 <- county_2023_slim_n |> + tidytable::select(-n) |> + select_sample( + "srs", + n = sampsizes, + strata = "Region", + outall = TRUE + ) + expect_equal( + samp1 |> tidytable::select(-n), + samp2 + ) +}) + +test_that("n on frame does not cause issue - sys_pps", { + county_2023_slim_n <- county_2023 |> + tidytable::select(GEOID, Region, Pop_Tot) |> + tidytable::mutate( + n = 50, + ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot), + .by = "Region" + ) + + sampsizes <- county_2023_slim_n |> + tidytable::distinct(Region) |> + tidytable::mutate(sample_size = 10) + + set.seed(12345) + samp1 <- county_2023_slim_n |> + select_sample( + "sys_pps", + n = sampsizes, + strata = "Region", + mos = "Pop_Tot", + outall = TRUE + ) + set.seed(12345) + samp2 <- county_2023_slim_n |> + tidytable::select(-n) |> + select_sample( + "sys_pps", + n = sampsizes, + strata = "Region", + mos = "Pop_Tot", + outall = TRUE + ) + expect_equal( + samp1 |> tidytable::select(-n), + samp2 + ) +}) + +test_that("n on frame does not cause issue - sys", { + county_2023_slim_n <- county_2023 |> + tidytable::select(GEOID, Region, Pop_Tot) |> + tidytable::mutate( + n = 50, + ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot), + .by = "Region" + ) + + sampsizes <- county_2023_slim_n |> + tidytable::distinct(Region) |> + tidytable::mutate(sample_size = 10) + + set.seed(12345) + samp1 <- county_2023_slim_n |> + select_sample( + "sys", + n = sampsizes, + strata = "Region", + outall = TRUE + ) + set.seed(12345) + samp2 <- county_2023_slim_n |> + tidytable::select(-n) |> + select_sample( + "sys", + n = sampsizes, + strata = "Region", + outall = TRUE + ) + expect_equal( + samp1 |> tidytable::select(-n), + samp2 + ) +})