PennLINC · mattcieslak · Apr 22, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 22, 2026
diff --git a/R/analyse-helpers.R b/R/analyse-helpers.R
@@ -11,6 +11,46 @@
 }
 
 
+#' Resolve the response scalar for formula-based analyses
+#'
+#' Uses the left-hand side of the formula when `scalar` is omitted.
+#' If `scalar` is supplied, validates that it exists and matches the formula
+#' response when that response can be mapped to a scalar name.
+#' @noRd
+.resolve_formula_scalar <- function(formula, data, scalar = NULL) {
+  scalar_names <- names(scalars(data))
+  lhs_expr <- tryCatch(formula[[2]], error = function(e) NULL)
+  lhs_vars <- if (is.null(lhs_expr)) character(0) else all.vars(lhs_expr)
+  lhs_scalar <- intersect(lhs_vars, scalar_names)
+
+  if (is.null(scalar)) {
+    if (length(lhs_scalar) == 1L) {
+      return(lhs_scalar)
+    }
+    stop(
+      "Could not infer the response scalar from the formula. ",
+      "Please supply `scalar=` explicitly."
+    )
+  }
+
+  if (!(scalar %in% scalar_names)) {
+    stop(
+      "scalar '", scalar, "' not found in data. Available scalars: ",
+      paste(scalar_names, collapse = ", ")
+    )
+  }
+
+  if (length(lhs_scalar) == 1L && !identical(lhs_scalar, scalar)) {
+    stop(
+      "The formula response refers to scalar '", lhs_scalar,
+      "' but `scalar=` was set to '", scalar, "'."
+    )
+  }
+
+  scalar
+}
+
+
 #' Validate and default element.subset
 #' @return Integer vector of element indices
 #' @noRd

diff --git a/R/analyse.R b/R/analyse.R
@@ -132,7 +132,7 @@
 #' @rdname ModelArray.lm
 #' @export
 
-ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NULL, full.outputs = FALSE,
+ModelArray.lm <- function(formula, data, phenotypes, scalar = NULL, element.subset = NULL, full.outputs = FALSE,
                           var.terms = c("estimate", "statistic", "p.value"),
                           var.model = c("adj.r.squared", "p.value"),
                           correct.p.value.terms = c("fdr"), correct.p.value.model = c("fdr"),
@@ -147,6 +147,7 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU
                           return_output = TRUE,
                           ...) {
   .validate_modelarray_input(data)
+  scalar <- .resolve_formula_scalar(formula, data, scalar)
   element.subset <- .validate_element_subset(element.subset, data, scalar)
   phenotypes <- .align_phenotypes(data, phenotypes, scalar)
 
@@ -414,7 +415,7 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU
 #' @rdname ModelArray.gam
 #' @export
 
-ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = NULL, full.outputs = FALSE,
+ModelArray.gam <- function(formula, data, phenotypes, scalar = NULL, element.subset = NULL, full.outputs = FALSE,
                            var.smoothTerms = c("statistic", "p.value"),
                            var.parametricTerms = c("estimate", "statistic", "p.value"),
                            var.model = c("dev.expl"),
@@ -432,6 +433,7 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N
                            return_output = TRUE,
                            ...) {
   .validate_modelarray_input(data)
+  scalar <- .resolve_formula_scalar(formula, data, scalar)
   element.subset <- .validate_element_subset(element.subset, data, scalar)
 
   # check if the formula is valid in terms of mgcv::gam()

diff --git a/README.Rmd b/README.Rmd
@@ -36,7 +36,7 @@ library(ModelArray)
 modelarray <- ModelArray("data.h5", scalar_types = c("FDC"))
 phenotypes  <- read.csv("cohort.csv")
 
-results <- ModelArray.lm(FDC ~ Age + sex + motion, modelarray, phenotypes, "FDC",
+results <- ModelArray.lm(FDC ~ Age + sex + motion, modelarray, phenotypes,
   n_cores = 4
 )
 ```

diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ library(ModelArray)
 modelarray <- ModelArray("data.h5", scalar_types = c("FDC"))
 phenotypes  <- read.csv("cohort.csv")
 
-results <- ModelArray.lm(FDC ~ Age + sex + motion, modelarray, phenotypes, "FDC",
+results <- ModelArray.lm(FDC ~ Age + sex + motion, modelarray, phenotypes,
   n_cores = 4
 )
 ```

diff --git a/tests/testthat/test-analyse_helpers.R b/tests/testthat/test-analyse_helpers.R
@@ -3,6 +3,24 @@ test_that(".validate_modelarray_input rejects non-ModelArray", {
   expect_error(.validate_modelarray_input(data.frame()), "not ModelArray")
 })
 
+test_that(".resolve_formula_scalar infers and validates scalar names", {
+  h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray")
+  ma <- ModelArray(h5_path, scalar_types = c("FD"))
+
+  expect_identical(.resolve_formula_scalar(FD ~ age, ma), "FD")
+  expect_identical(.resolve_formula_scalar(log(FD) ~ age, ma), "FD")
+  expect_identical(.resolve_formula_scalar(FD ~ age, ma, scalar = "FD"), "FD")
+
+  expect_error(
+    .resolve_formula_scalar(unknown_response ~ age, ma),
+    "Could not infer the response scalar"
+  )
+  expect_error(
+    .resolve_formula_scalar(FD ~ age, ma, scalar = "FA"),
+    "not found in data"
+  )
+})
+
 test_that(".validate_element_subset defaults and validates", {
   h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray")
   ma <- ModelArray(h5_path, scalar_types = c("FD"))

diff --git a/tests/testthat/test-merge.R b/tests/testthat/test-merge.R
@@ -137,3 +137,133 @@ test_that("mergeModelArrays returns proper structure", {
   expect_equal(nInputFiles(merged$data, "FD2"), length(src))
   expect_equal(nElements(merged$data, "FD"), nElements(ma_fd, "FD"))
 })
+
+test_that("merged ModelArrays work with analyseOneElement helpers", {
+  h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray")
+  ma_fd <- ModelArray(h5_path, scalar_types = c("FD"))
+  src <- sources(ma_fd)[["FD"]]
+  base <- scalars(ma_fd)[["FD"]]
+
+  ma2 <- new("ModelArray",
+    scalars = list(
+      FD2 = base * 1.5 + 2,
+      FD3 = log(base + 1)
+    ),
+    sources = list(FD2 = src, FD3 = src),
+    results = list(),
+    path = c(FD2 = h5_path, FD3 = h5_path)
+  )
+
+  phen1 <- data.frame(
+    subject_id = paste0("subj_", seq_along(src)),
+    source_file = src,
+    age = seq_along(src),
+    stringsAsFactors = FALSE
+  )
+  phen2 <- data.frame(
+    subject_id = rev(paste0("subj_", seq_along(src))),
+    source_file = rev(src),
+    stringsAsFactors = FALSE
+  )
+
+  merged <- mergeModelArrays(
+    list(ma_fd, ma2),
+    list(phen1, phen2),
+    merge_on = "subject_id"
+  )
+
+  lm_init <- suppressWarnings(analyseOneElement.lm(
+    1L,
+    FD ~ FD2 + FD3 + age,
+    merged$data,
+    merged$phenotypes,
+    scalar = "FD",
+    var.terms = c("estimate", "statistic", "p.value"),
+    var.model = c("adj.r.squared", "p.value"),
+    num.subj.lthr = 1,
+    num.stat.output = 12,
+    flag_initiate = TRUE,
+    on_error = "stop"
+  ))
+  expect_true("FD2.estimate" %in% lm_init$column_names)
+  expect_true("FD3.estimate" %in% lm_init$column_names)
+
+  lm_fit <- suppressWarnings(ModelArray.lm(
+    FD ~ FD2 + FD3 + age,
+    merged$data,
+    merged$phenotypes,
+    element.subset = as.integer(1:2),
+    verbose = FALSE,
+    pbar = FALSE,
+    n_cores = 1,
+    on_error = "stop"
+  ))
+  expect_true(all(is.finite(lm_fit$FD2.estimate)))
+  expect_true(all(is.finite(lm_fit$FD3.estimate)))
+
+  expect_error(
+    ModelArray.lm(
+      FD ~ FD2 + FD3 + age,
+      merged$data,
+      merged$phenotypes,
+      scalar = "FD2",
+      element.subset = as.integer(1:2),
+      verbose = FALSE,
+      pbar = FALSE,
+      n_cores = 1,
+      on_error = "stop"
+    ),
+    "formula response refers to scalar 'FD'"
+  )
+
+  gam_init <- analyseOneElement.gam(
+    1L,
+    FD ~ s(age) + FD2 + FD3,
+    merged$data,
+    merged$phenotypes,
+    scalar = "FD",
+    var.smoothTerms = c("edf", "statistic", "p.value"),
+    var.parametricTerms = c("estimate", "statistic", "p.value"),
+    var.model = c("adj.r.squared", "dev.expl"),
+    num.subj.lthr = 1,
+    num.stat.output = 12,
+    flag_initiate = TRUE,
+    flag_sse = FALSE,
+    on_error = "stop"
+  )
+  expect_true("FD2.estimate" %in% gam_init$column_names)
+  expect_true("FD3.estimate" %in% gam_init$column_names)
+
+  gam_fit <- ModelArray.gam(
+    FD ~ s(age) + FD2 + FD3,
+    merged$data,
+    merged$phenotypes,
+    element.subset = as.integer(1:2),
+    verbose = FALSE,
+    pbar = FALSE,
+    n_cores = 1,
+    on_error = "stop"
+  )
+  expect_true("FD2.estimate" %in% colnames(gam_fit))
+  expect_true("FD3.estimate" %in% colnames(gam_fit))
+
+  wrap_out <- analyseOneElement.wrap(
+    1L,
+    function(data) {
+      data.frame(
+        mean_fd = mean(data$FD, na.rm = TRUE),
+        cor_fd2 = cor(data$FD, data$FD2, use = "complete.obs"),
+        cor_fd3 = cor(data$FD, data$FD3, use = "complete.obs")
+      )
+    },
+    merged$data,
+    merged$phenotypes,
+    scalar = "FD",
+    num.subj.lthr = 1,
+    num.stat.output = 4,
+    flag_initiate = FALSE,
+    on_error = "stop"
+  )
+  expect_true(is.finite(unname(wrap_out[[3]])))
+  expect_true(is.finite(unname(wrap_out[[4]])))
+})