diff --git a/.Rbuildignore b/.Rbuildignore index 6022675..a799dca 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,5 @@ ^_pkgdown\.yml$ ^docs$ ^pkgdown$ +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index 667204b..51d5219 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ .DS_Store .quarto docs +.positai diff --git a/NAMESPACE b/NAMESPACE index 7fffe22..37e108c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,4 +17,5 @@ importFrom(data.table,.NGRP) importFrom(data.table,.SD) importFrom(data.table,data.table) importFrom(rlang,.data) +importFrom(rlang,.env) importFrom(stats,runif) diff --git a/R/SampleSelectR-package.R b/R/SampleSelectR-package.R index 50f5a29..1a615a1 100644 --- a/R/SampleSelectR-package.R +++ b/R/SampleSelectR-package.R @@ -11,7 +11,8 @@ #' @importFrom data.table .SD #' @importFrom data.table := #' @importFrom data.table data.table -#' @importFrom stats runif #' @importFrom rlang .data +#' @importFrom rlang .env +#' @importFrom stats runif ## usethis namespace: end NULL diff --git a/R/allocate.R b/R/allocate.R index e21684b..1821f70 100644 --- a/R/allocate.R +++ b/R/allocate.R @@ -279,7 +279,10 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos } } # Calculate the total (raw) sample size - n <- max(ceiling(sum(allocations)), length(N.h) * lbound) + + + final_n <- max(ceiling(sum(allocations)), length(N.h) * lbound) + sizes <- allocations @@ -292,7 +295,7 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos total_allocated <- sum(adjusted_allocations) # Calculate the difference from the total n - difference <- n - total_allocated + difference <- final_n - total_allocated # If difference is positive, distribute it proportionally if (difference != 0) { @@ -314,7 +317,7 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos # Step 5: Adjust the rounded allocations to ensure the sum equals n total_allocated <- sum(rounded_allocations) - difference <- n - total_allocated + difference <- final_n - total_allocated # Adjust the allocations by adding/subtracting the difference while (difference != 0) { i <- sample(1:num_groups, 1) # Randomly select an index @@ -345,7 +348,7 @@ allocate <- function(allocation, N.h, n.samp = NULL, S.h = NULL, c.h = NULL, cos } outputs <- as.integer(rounded_allocations) if (allocation == "optimal") { - n.print <- n + n.print <- final_n } else { n.print <- n.samp } diff --git a/R/chromy_pps.R b/R/chromy_pps.R index 0692e76..e70dc6a 100644 --- a/R/chromy_pps.R +++ b/R/chromy_pps.R @@ -53,8 +53,8 @@ chromy_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) { frame_hits <- frame |> tidytable::mutate( - ExpectedHits = exphits, - NumberHits = chromy_inner(exphits), + ExpectedHits = .env$exphits, + NumberHits = chromy_inner(.env$exphits), SelectionIndicator = .data$NumberHits > 0, SamplingWeight = ifelse(.data$SelectionIndicator, 1 / .data$ExpectedHits, NA), ) diff --git a/R/srs.R b/R/srs.R index 677c5e7..d3467ac 100644 --- a/R/srs.R +++ b/R/srs.R @@ -28,6 +28,7 @@ #' @export + srs <- function(frame, n, outall = FALSE, curstrat = NULL) { check_frame_type(frame) check_n(n, frame, curstrat, n_le_N = TRUE) @@ -35,18 +36,22 @@ srs <- function(frame, n, outall = FALSE, curstrat = NULL) { N <- nrow(frame) - # Take the srs and create sampling-related columns + # Take the SRS and create sampling-related columns selectedVector <- sample(x = N, size = n, replace = FALSE) + frame <- frame |> tidytable::mutate( rowNum = tidytable::row_number(), - SelectionProbability = n / N, - SamplingWeight = ifelse(.data$rowNum %in% selectedVector, N / n, NA), - SelectionIndicator = ifelse(.data$rowNum %in% selectedVector, TRUE, FALSE) + SelectionProbability = .env$n / .env$N, + SamplingWeight = tidytable::if_else( + .data$rowNum %in% .env$selectedVector, + .env$N / .env$n, + NA_real_ + ), + SelectionIndicator = .data$rowNum %in% .env$selectedVector ) |> tidytable::select(-tidytable::all_of("rowNum")) - # Output to screen Sampling_Output(n, N, curstrat = curstrat) @@ -55,8 +60,9 @@ srs <- function(frame, n, outall = FALSE, curstrat = NULL) { sample <- frame |> tidytable::filter(.data$SelectionIndicator) |> tidytable::select(-tidytable::all_of("SelectionIndicator")) + return(sample) - } else if (outall) { + } else { return(frame) } } diff --git a/R/sys.R b/R/sys.R index 5fb4083..ec50f95 100644 --- a/R/sys.R +++ b/R/sys.R @@ -34,6 +34,7 @@ #' #' @export + sys <- function(frame, n, curstrat = NULL, outall = FALSE) { check_frame_type(frame) check_n(n, frame, curstrat, n_le_N = TRUE) @@ -42,32 +43,32 @@ sys <- function(frame, n, curstrat = NULL, outall = FALSE) { N <- nrow(frame) # Sampling method - k <- N / n # Sampling interval - r <- runif(1, 1, k) # We use a random start between 1 and k + r <- runif(1, 1, k) # Random start between 1 and k selectedVector <- floor(r + k * (0:(n - 1))) # Selected row indices - # We make sure that selected indices are within frame range - + # Make sure selected indices are within frame range selectedVector <- selectedVector[selectedVector <= N] - # Creating variables accordingly + # Create variables frame <- frame |> tidytable::mutate( numrow = tidytable::row_number(), - SelectionIndicator = .data$numrow %in% selectedVector, - SelectionProbability = n / N, - SamplingWeight = ifelse(.data$SelectionIndicator, N / n, NA) + SelectionIndicator = .data$numrow %in% .env$selectedVector, + SelectionProbability = .env$n / .env$N, + SamplingWeight = tidytable::if_else( + .data$SelectionIndicator, + .env$N / .env$n, + NA_real_ + ) ) |> tidytable::select(-tidytable::all_of("numrow")) # Output to screen Sampling_Output(n, N, k = k, r = r, curstrat = curstrat) - - # Return only selected rows and make sure the selected sample is a data.frame, tibble, or data.table - + # Return only the sample or the frame with selection indicator based on value of outall if (outall) { return(frame) } else { diff --git a/R/sys_pps.R b/R/sys_pps.R index ca0ff0c..53619fd 100644 --- a/R/sys_pps.R +++ b/R/sys_pps.R @@ -47,7 +47,7 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) { tbd_data_1 <- frame |> tidytable::mutate( rowNum = tidytable::row_number(), - ExpectedHits = n * (!!(symbol_mos) / totalSize), + ExpectedHits = .env$n * (!!(symbol_mos) / .env$totalSize), SamplingWeight = .data$ExpectedHits^-1 ) @@ -64,11 +64,12 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) { selectedVector <- findInterval(selectedSizePoints, sizeIntervals) # Using selectedVector, get the total counts of each index + selectedVector_counts <- selectedVector |> as.data.frame() |> - tidytable::count(selectedVector) |> - # Rename to NumberHits - tidytable::rename(NumberHits = n) + tidytable::count(selectedVector, name = "NumberHits") + + tbd_data_2 <- tbd_data_1 |> tidytable::left_join( @@ -78,7 +79,7 @@ sys_pps <- function(frame, n, mos, outall = FALSE, curstrat = NULL) { # Need to zero filled NumberHits tidytable::mutate( NumberHits = tidytable::replace_na(.data$NumberHits, replace = 0), - SelectionIndicator = .data$rowNum %in% selectedVector, + SelectionIndicator = .data$rowNum %in% .env$selectedVector, # Make SamplingWeight to be NA if not selected SamplingWeight = tidytable::case_when( SelectionIndicator == TRUE ~ .data$SamplingWeight, diff --git a/R/util.R b/R/util.R index 6b1f9e8..5c34668 100644 --- a/R/util.R +++ b/R/util.R @@ -1,3 +1,4 @@ + #' Check if the frame is a valid data structure #' #' Ensures that the input frame is a data.frame, data.table, or tibble. diff --git a/man/SampleSelectR-package.Rd b/man/SampleSelectR-package.Rd index ec80e39..de3ab40 100644 --- a/man/SampleSelectR-package.Rd +++ b/man/SampleSelectR-package.Rd @@ -12,6 +12,7 @@ Randomly select samples with SRS, systematic, and various PPS methods. Also incl Useful links: \itemize{ \item \url{https://github.com/RTIInternational/SampleSelectR} + \item \url{https://rtiinternational.github.io/SampleSelectR/} \item Report bugs at \url{https://github.com/RTIInternational/SampleSelectR/issues} } diff --git a/tests/testthat/test-select_sample.R b/tests/testthat/test-select_sample.R new file mode 100644 index 0000000..a593c31 --- /dev/null +++ b/tests/testthat/test-select_sample.R @@ -0,0 +1,147 @@ +test_that("n on frame does not cause issue - chromy_pps", { + county_2023_slim_n <- county_2023 |> + tidytable::select(GEOID, Region, Pop_Tot) |> + tidytable::mutate( + n = 50, + ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot), + .by = "Region" + ) + + sampsizes <- county_2023_slim_n |> + tidytable::distinct(Region) |> + tidytable::mutate(sample_size = 10) + + set.seed(12345) + samp1 <- county_2023_slim_n |> + select_sample( + "chromy_pps", + n = sampsizes, + strata = "Region", + mos = "Pop_Tot", + outall = TRUE + ) + set.seed(12345) + samp2 <- county_2023_slim_n |> + tidytable::select(-n) |> + select_sample( + "chromy_pps", + n = sampsizes, + strata = "Region", + mos = "Pop_Tot", + outall = TRUE + ) + expect_equal( + samp1 |> tidytable::select(-n), + samp2 + ) +}) + +test_that("n on frame does not cause issue - srs", { + county_2023_slim_n <- county_2023 |> + tidytable::select(GEOID, Region, Pop_Tot) |> + tidytable::mutate( + n = 50, + ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot), + .by = "Region" + ) + + sampsizes <- county_2023_slim_n |> + tidytable::distinct(Region) |> + tidytable::mutate(sample_size = 10) + + set.seed(12345) + samp1 <- county_2023_slim_n |> + select_sample( + "srs", + n = sampsizes, + strata = "Region", + outall = TRUE + ) + set.seed(12345) + samp2 <- county_2023_slim_n |> + tidytable::select(-n) |> + select_sample( + "srs", + n = sampsizes, + strata = "Region", + outall = TRUE + ) + expect_equal( + samp1 |> tidytable::select(-n), + samp2 + ) +}) + +test_that("n on frame does not cause issue - sys_pps", { + county_2023_slim_n <- county_2023 |> + tidytable::select(GEOID, Region, Pop_Tot) |> + tidytable::mutate( + n = 50, + ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot), + .by = "Region" + ) + + sampsizes <- county_2023_slim_n |> + tidytable::distinct(Region) |> + tidytable::mutate(sample_size = 10) + + set.seed(12345) + samp1 <- county_2023_slim_n |> + select_sample( + "sys_pps", + n = sampsizes, + strata = "Region", + mos = "Pop_Tot", + outall = TRUE + ) + set.seed(12345) + samp2 <- county_2023_slim_n |> + tidytable::select(-n) |> + select_sample( + "sys_pps", + n = sampsizes, + strata = "Region", + mos = "Pop_Tot", + outall = TRUE + ) + expect_equal( + samp1 |> tidytable::select(-n), + samp2 + ) +}) + +test_that("n on frame does not cause issue - sys", { + county_2023_slim_n <- county_2023 |> + tidytable::select(GEOID, Region, Pop_Tot) |> + tidytable::mutate( + n = 50, + ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot), + .by = "Region" + ) + + sampsizes <- county_2023_slim_n |> + tidytable::distinct(Region) |> + tidytable::mutate(sample_size = 10) + + set.seed(12345) + samp1 <- county_2023_slim_n |> + select_sample( + "sys", + n = sampsizes, + strata = "Region", + outall = TRUE + ) + set.seed(12345) + samp2 <- county_2023_slim_n |> + tidytable::select(-n) |> + select_sample( + "sys", + n = sampsizes, + strata = "Region", + outall = TRUE + ) + expect_equal( + samp1 |> tidytable::select(-n), + samp2 + ) +})