From 8957e800ff3f75ec561378900563b9e41482a352 Mon Sep 17 00:00:00 2001
From: Eric Weine <ericw456@mit.edu>
Date: Thu, 16 Apr 2026 22:18:10 -0400
Subject: [PATCH 1/2] improved speed through multiple parallel operations

---
 R/RcppExports.R     |  16 ++++++
 R/likelihood.R      |  10 ++--
 R/misc.R            |   2 +-
 src/RcppExports.cpp |  64 ++++++++++++++++++++++++
 src/cost.cpp        | 101 ++++++++++++++++++++++++++++++++++++++
 src/misc.cpp        | 117 +++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 302 insertions(+), 8 deletions(-)

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 99b83a2c..69937a42 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -25,6 +25,14 @@ cost_sparse_rcpp <- function(X, A, B, e, poisson) {
     .Call('_fastTopics_cost_sparse_rcpp', PACKAGE = 'fastTopics', X, A, B, e, poisson)
 }
 
+cost_parallel_rcpp <- function(X, A, B, e, poisson) {
+    .Call('_fastTopics_cost_parallel_rcpp', PACKAGE = 'fastTopics', X, A, B, e, poisson)
+}
+
+cost_sparse_parallel_rcpp <- function(X, A, B, e, poisson) {
+    .Call('_fastTopics_cost_sparse_parallel_rcpp', PACKAGE = 'fastTopics', X, A, B, e, poisson)
+}
+
 le_diff_rcpp <- function(X) {
     .Call('_fastTopics_le_diff_rcpp', PACKAGE = 'fastTopics', X)
 }
@@ -33,6 +41,14 @@ x_over_crossprod_rcpp <- function(i, j, x, A, B, e) {
     .Call('_fastTopics_x_over_crossprod_rcpp', PACKAGE = 'fastTopics', i, j, x, A, B, e)
 }
 
+x_over_crossprod_parallel_rcpp <- function(i, j, x, A, B, e) {
+    .Call('_fastTopics_x_over_crossprod_parallel_rcpp', PACKAGE = 'fastTopics', i, j, x, A, B, e)
+}
+
+poisson_nmf_kkt_sparse_parallel_rcpp <- function(X, L, F, e) {
+    .Call('_fastTopics_poisson_nmf_kkt_sparse_parallel_rcpp', PACKAGE = 'fastTopics', X, L, F, e)
+}
+
 mixem_rcpp <- function(L, w, x0, numiter) {
     .Call('_fastTopics_mixem_rcpp', PACKAGE = 'fastTopics', L, w, x0, numiter)
 }
diff --git a/R/likelihood.R b/R/likelihood.R
index 8c0a66e2..677be845 100644
--- a/R/likelihood.R
+++ b/R/likelihood.R
@@ -110,9 +110,9 @@ cost <- function (X, A, B, e = 1e-8, family = c("poisson","multinom"),
     f  <- rowSums(poisson*AB - X*log(AB + e))
   } else if (version == "Rcpp") {
     if (is.matrix(X))
-      f <- drop(cost_rcpp(X,A,B,e,poisson))
+      f <- drop(cost_parallel_rcpp(X,A,B,e,poisson))
     else
-      f <- drop(cost_sparse_rcpp(X,A,B,e,poisson))
+      f <- drop(cost_sparse_parallel_rcpp(X,A,B,e,poisson))
   }
   names(f) <- rownames(X)
   return(f)
@@ -185,9 +185,9 @@ poisson_nmf_kkt <- function (X, F, L, e = 1e-8) {
     return(list(F = F*(t(1 - A) %*% L),
                 L = L*((1 - A) %*% F)))
   } else {
-    A <- x_over_tcrossprod(X,t(L),t(F),e)
-    return(list(F = F*(repmat(colSums(L),ncol(X)) - as.matrix(t(A) %*% L)),
-                L = L*(repmat(colSums(F),nrow(X)) - as.matrix(A %*% F))))
+    result <- poisson_nmf_kkt_sparse_parallel_rcpp(X,L,F,e)
+    return(list(F = F*(repmat(colSums(L),ncol(X)) - result$tAL),
+                L = L*(repmat(colSums(F),nrow(X)) - result$AF)))
   }
 }
 
diff --git a/R/misc.R b/R/misc.R
index d84bf8c4..6ccbf05e 100644
--- a/R/misc.R
+++ b/R/misc.R
@@ -55,7 +55,7 @@ apply.nonzeros <- function (X, f) {
 #' 
 x_over_tcrossprod <- function (X, A, B, e) {
   d <- summary(X)
-  y <- drop(x_over_crossprod_rcpp(d$i - 1,d$j - 1,d$x,A,B,e))
+  y <- drop(x_over_crossprod_parallel_rcpp(d$i - 1,d$j - 1,d$x,A,B,e))
   return(sparseMatrix(i = d$i,j = d$j,x = y,dims = dim(X)))
 }
 
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index ce923eab..b9bf5b97 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -97,6 +97,36 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// cost_parallel_rcpp
+arma::vec cost_parallel_rcpp(const arma::mat& X, const arma::mat& A, const arma::mat& B, double e, bool poisson);
+RcppExport SEXP _fastTopics_cost_parallel_rcpp(SEXP XSEXP, SEXP ASEXP, SEXP BSEXP, SEXP eSEXP, SEXP poissonSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const arma::mat& >::type X(XSEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type A(ASEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type B(BSEXP);
+    Rcpp::traits::input_parameter< double >::type e(eSEXP);
+    Rcpp::traits::input_parameter< bool >::type poisson(poissonSEXP);
+    rcpp_result_gen = Rcpp::wrap(cost_parallel_rcpp(X, A, B, e, poisson));
+    return rcpp_result_gen;
+END_RCPP
+}
+// cost_sparse_parallel_rcpp
+arma::vec cost_sparse_parallel_rcpp(const arma::sp_mat& X, const arma::mat& A, const arma::mat& B, double e, bool poisson);
+RcppExport SEXP _fastTopics_cost_sparse_parallel_rcpp(SEXP XSEXP, SEXP ASEXP, SEXP BSEXP, SEXP eSEXP, SEXP poissonSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const arma::sp_mat& >::type X(XSEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type A(ASEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type B(BSEXP);
+    Rcpp::traits::input_parameter< double >::type e(eSEXP);
+    Rcpp::traits::input_parameter< bool >::type poisson(poissonSEXP);
+    rcpp_result_gen = Rcpp::wrap(cost_sparse_parallel_rcpp(X, A, B, e, poisson));
+    return rcpp_result_gen;
+END_RCPP
+}
 // le_diff_rcpp
 arma::mat le_diff_rcpp(const arma::mat& X);
 RcppExport SEXP _fastTopics_le_diff_rcpp(SEXP XSEXP) {
@@ -124,6 +154,36 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// x_over_crossprod_parallel_rcpp
+arma::vec x_over_crossprod_parallel_rcpp(const arma::vec& i, const arma::vec& j, const arma::vec& x, const arma::mat& A, const arma::mat& B, double e);
+RcppExport SEXP _fastTopics_x_over_crossprod_parallel_rcpp(SEXP iSEXP, SEXP jSEXP, SEXP xSEXP, SEXP ASEXP, SEXP BSEXP, SEXP eSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const arma::vec& >::type i(iSEXP);
+    Rcpp::traits::input_parameter< const arma::vec& >::type j(jSEXP);
+    Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type A(ASEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type B(BSEXP);
+    Rcpp::traits::input_parameter< double >::type e(eSEXP);
+    rcpp_result_gen = Rcpp::wrap(x_over_crossprod_parallel_rcpp(i, j, x, A, B, e));
+    return rcpp_result_gen;
+END_RCPP
+}
+// poisson_nmf_kkt_sparse_parallel_rcpp
+Rcpp::List poisson_nmf_kkt_sparse_parallel_rcpp(const arma::sp_mat& X, const arma::mat& L, const arma::mat& F, double e);
+RcppExport SEXP _fastTopics_poisson_nmf_kkt_sparse_parallel_rcpp(SEXP XSEXP, SEXP LSEXP, SEXP FSEXP, SEXP eSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const arma::sp_mat& >::type X(XSEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type L(LSEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type F(FSEXP);
+    Rcpp::traits::input_parameter< double >::type e(eSEXP);
+    rcpp_result_gen = Rcpp::wrap(poisson_nmf_kkt_sparse_parallel_rcpp(X, L, F, e));
+    return rcpp_result_gen;
+END_RCPP
+}
 // mixem_rcpp
 arma::vec mixem_rcpp(const arma::mat& L, const arma::vec& w, const arma::vec& x0, unsigned int numiter);
 RcppExport SEXP _fastTopics_mixem_rcpp(SEXP LSEXP, SEXP wSEXP, SEXP x0SEXP, SEXP numiterSEXP) {
@@ -414,8 +474,12 @@ static const R_CallMethodDef CallEntries[] = {
     {"_fastTopics_ccd_update_factors_sparse_parallel_rcpp", (DL_FUNC) &_fastTopics_ccd_update_factors_sparse_parallel_rcpp, 4},
     {"_fastTopics_cost_rcpp", (DL_FUNC) &_fastTopics_cost_rcpp, 5},
     {"_fastTopics_cost_sparse_rcpp", (DL_FUNC) &_fastTopics_cost_sparse_rcpp, 5},
+    {"_fastTopics_cost_parallel_rcpp", (DL_FUNC) &_fastTopics_cost_parallel_rcpp, 5},
+    {"_fastTopics_cost_sparse_parallel_rcpp", (DL_FUNC) &_fastTopics_cost_sparse_parallel_rcpp, 5},
     {"_fastTopics_le_diff_rcpp", (DL_FUNC) &_fastTopics_le_diff_rcpp, 1},
     {"_fastTopics_x_over_crossprod_rcpp", (DL_FUNC) &_fastTopics_x_over_crossprod_rcpp, 6},
+    {"_fastTopics_x_over_crossprod_parallel_rcpp", (DL_FUNC) &_fastTopics_x_over_crossprod_parallel_rcpp, 6},
+    {"_fastTopics_poisson_nmf_kkt_sparse_parallel_rcpp", (DL_FUNC) &_fastTopics_poisson_nmf_kkt_sparse_parallel_rcpp, 4},
     {"_fastTopics_mixem_rcpp", (DL_FUNC) &_fastTopics_mixem_rcpp, 4},
     {"_fastTopics_pnmfem_update_factors_rcpp", (DL_FUNC) &_fastTopics_pnmfem_update_factors_rcpp, 5},
     {"_fastTopics_pnmfem_update_factors_sparse_rcpp", (DL_FUNC) &_fastTopics_pnmfem_update_factors_sparse_rcpp, 5},
diff --git a/src/cost.cpp b/src/cost.cpp
index a7cd1f05..3d3f6f90 100644
--- a/src/cost.cpp
+++ b/src/cost.cpp
@@ -1,8 +1,81 @@
+#include <RcppParallel.h>
 #include "misc.h"
 #include "cost.h"
 
 using namespace arma;
 
+// CLASS DEFINITIONS
+// -----------------
+// Worker for parallel computation of the dense cost function using
+// RcppParallel reduce. Each thread accumulates its own partial sum
+// over a range of columns of X, and the join() method combines them.
+struct cost_worker : public RcppParallel::Worker {
+  const mat&  X;
+  const mat&  A;
+  const mat&  B;
+  double      e;
+  bool        poisson;
+  vec         f;
+
+  cost_worker (const mat& X, const mat& A, const mat& B, double e,
+               bool poisson) :
+    X(X), A(A), B(B), e(e), poisson(poisson), f(X.n_rows, fill::zeros) { };
+
+  // Split constructor: creates a new worker with a zeroed accumulator.
+  cost_worker (const cost_worker& other, RcppParallel::Split) :
+    X(other.X), A(other.A), B(other.B), e(other.e), poisson(other.poisson),
+    f(other.X.n_rows, fill::zeros) { };
+
+  // Accumulate contributions from columns [begin, end).
+  void operator() (std::size_t begin, std::size_t end) {
+    vec y(A.n_rows);
+    for (unsigned int j = begin; j < end; j++) {
+      y  = A * B.col(j);
+      f -= X.col(j) % log(y + e);
+      if (poisson)
+        f += y;
+    }
+  }
+
+  void join (const cost_worker& other) { f += other.f; }
+};
+
+// Worker for parallel computation of the sparse cost function.
+struct cost_sparse_worker : public RcppParallel::Worker {
+  const sp_mat& X;
+  const mat&    A;
+  const mat&    B;
+  double        e;
+  bool          poisson;
+  vec           f;
+
+  cost_sparse_worker (const sp_mat& X, const mat& A, const mat& B, double e,
+                      bool poisson) :
+    X(X), A(A), B(B), e(e), poisson(poisson), f(X.n_rows, fill::zeros) { };
+
+  cost_sparse_worker (const cost_sparse_worker& other, RcppParallel::Split) :
+    X(other.X), A(other.A), B(other.B), e(other.e), poisson(other.poisson),
+    f(other.X.n_rows, fill::zeros) { };
+
+  void operator() (std::size_t begin, std::size_t end) {
+    unsigned int i;
+    vec y(A.n_rows);
+    for (unsigned int j = begin; j < end; j++) {
+      sp_mat::const_col_iterator xj = X.begin_col(j);
+      sp_mat::const_col_iterator xm = X.end_col(j);
+      y = A * B.col(j);
+      for (; xj != xm; ++xj) {
+        i     = xj.row();
+        f(i) -= (*xj) * log(y(i) + e);
+      }
+      if (poisson)
+        f += y;
+    }
+  }
+
+  void join (const cost_sparse_worker& other) { f += other.f; }
+};
+
 // FUNCTION DEFINITIONS
 // --------------------
 // Compute negative log-likelihoods for assessing a topic model fit or
@@ -52,6 +125,34 @@ arma::vec cost (const mat& X, const mat& A, const mat& B, double e,
   return f;
 }
 
+// This is the same as cost_rcpp, except that Intel Threading Building
+// Blocks (TBB) are used to compute the contributions from each column
+// of X in parallel.
+//
+// [[Rcpp::depends(RcppArmadillo)]]
+// [[Rcpp::depends(RcppParallel)]]
+// [[Rcpp::export]]
+arma::vec cost_parallel_rcpp (const arma::mat& X, const arma::mat& A,
+			      const arma::mat& B, double e, bool poisson) {
+  cost_worker worker(X,A,B,e,poisson);
+  parallelReduce(0,X.n_cols,worker);
+  return worker.f;
+}
+
+// This is the same as cost_sparse_rcpp, except that Intel Threading
+// Building Blocks (TBB) are used to compute the contributions from
+// each column of X in parallel.
+//
+// [[Rcpp::depends(RcppArmadillo)]]
+// [[Rcpp::depends(RcppParallel)]]
+// [[Rcpp::export]]
+arma::vec cost_sparse_parallel_rcpp (const arma::sp_mat& X, const arma::mat& A,
+				     const arma::mat& B, double e, bool poisson) {
+  cost_sparse_worker worker(X,A,B,e,poisson);
+  parallelReduce(0,X.n_cols,worker);
+  return worker.f;
+}
+
 // Helper function for cost_sparse_rcpp.
 arma::vec cost_sparse (const sp_mat& X, const mat& A, const mat& B,
 		       double e, bool poisson) {
diff --git a/src/misc.cpp b/src/misc.cpp
index e5384a2d..9370ab57 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -1,7 +1,82 @@
+#include <RcppParallel.h>
 #include "misc.h"
 
 using namespace arma;
 
+// CLASS DEFINITIONS
+// -----------------
+// Worker for computing t(A) %*% L and A %*% F simultaneously in one
+// pass over the nonzeros of X, where A[i,j] = X[i,j] / (dot(L[i,:],
+// F[j,:]) + e). L is n x k, F is m x k.
+//
+// t(A) %*% L is m x k: row j = sum_{i: X[i,j]!=0} A[i,j] * L[i,:]
+//   Each column j of X writes only to row j -> parallelFor-safe.
+//
+// A %*% F is n x k: row i = sum_{j: X[i,j]!=0} A[i,j] * F[j,:]
+//   Multiple columns can write to the same row -> needs reduction.
+//
+// Both are accumulated in a single parallelReduce over columns of X.
+struct kkt_sparse_worker : public RcppParallel::Worker {
+  const sp_mat& X;
+  const mat&    L;
+  const mat&    F;
+  double        e;
+  mat           tAL;  // thread-local m x k accumulator for t(A) %*% L
+  mat           AF;   // thread-local n x k accumulator for A %*% F
+
+  kkt_sparse_worker (const sp_mat& X, const mat& L, const mat& F, double e) :
+    X(X), L(L), F(F), e(e),
+    tAL(X.n_cols, L.n_cols, fill::zeros),
+    AF(X.n_rows,  F.n_cols, fill::zeros) { };
+
+  kkt_sparse_worker (const kkt_sparse_worker& other, RcppParallel::Split) :
+    X(other.X), L(other.L), F(other.F), e(other.e),
+    tAL(other.X.n_cols, other.L.n_cols, fill::zeros),
+    AF(other.X.n_rows,  other.F.n_cols, fill::zeros) { };
+
+  void operator() (std::size_t begin, std::size_t end) {
+    for (unsigned int j = begin; j < end; j++) {
+      sp_mat::const_col_iterator xi = X.begin_col(j);
+      sp_mat::const_col_iterator xm = X.end_col(j);
+      rowvec Fj = F.row(j);
+      for (; xi != xm; ++xi) {
+        unsigned int i = xi.row();
+        double       a = (*xi) / (dot(L.row(i), Fj) + e);
+        tAL.row(j) += a * L.row(i);
+        AF.row(i)  += a * Fj;
+      }
+    }
+  }
+
+  void join (const kkt_sparse_worker& other) {
+    tAL += other.tAL;
+    AF  += other.AF;
+  }
+};
+
+// Worker for parallel computation of x_over_crossprod. Each element
+// t of the output is independent: y(t) = x(t) / (dot(A.col(i(t)),
+// B.col(j(t))) + e). parallelFor is safe here because each thread
+// writes to a disjoint range of y.
+struct x_over_crossprod_worker : public RcppParallel::Worker {
+  const vec&  i_idx;
+  const vec&  j_idx;
+  const mat&  A;
+  const mat&  B;
+  double      e;
+  vec&        y;
+
+  x_over_crossprod_worker (const vec& i_idx, const vec& j_idx, const mat& A,
+                            const mat& B, double e, vec& y) :
+    i_idx(i_idx), j_idx(j_idx), A(A), B(B), e(e), y(y) { };
+
+  void operator() (std::size_t begin, std::size_t end) {
+    for (unsigned int t = begin; t < end; t++)
+      y(t) /= (dot(A.col((unsigned int) i_idx(t)),
+                   B.col((unsigned int) j_idx(t))) + e);
+  }
+};
+
 // FUNCTION DECLARATIONS
 // ---------------------
 void le_diff (const vec& x, vec& y);
@@ -28,10 +103,10 @@ arma::mat le_diff_rcpp (const arma::mat& X) {
 }
 
 // This is used to implement x_over_tcrossprod.
-// 
+//
 // [[Rcpp::export]]
 arma::vec x_over_crossprod_rcpp (const arma::vec& i, const arma::vec& j,
-				 const arma::vec& x, const arma::mat& A, 
+				 const arma::vec& x, const arma::mat& A,
 				 const arma::mat& B, double e) {
   unsigned int n = x.n_elem;
   vec y = x;
@@ -40,6 +115,44 @@ arma::vec x_over_crossprod_rcpp (const arma::vec& i, const arma::vec& j,
   return y;
 }
 
+// This is the same as x_over_crossprod_rcpp, except that Intel
+// Threading Building Blocks (TBB) are used to process the nonzero
+// elements in parallel.
+//
+// [[Rcpp::depends(RcppArmadillo)]]
+// [[Rcpp::depends(RcppParallel)]]
+// [[Rcpp::export]]
+arma::vec x_over_crossprod_parallel_rcpp (const arma::vec& i,
+					  const arma::vec& j,
+					  const arma::vec& x,
+					  const arma::mat& A,
+					  const arma::mat& B, double e) {
+  vec y = x;
+  x_over_crossprod_worker worker(i,j,A,B,e,y);
+  parallelFor(0,x.n_elem,worker);
+  return y;
+}
+
+// Compute KKT residuals for the sparse Poisson NMF problem in
+// parallel. Returns a list with two matrices:
+//   tAL: t(A) %*% L (m x k), where A[i,j] = X[i,j]/(dot(L[i,:],F[j,:])+e)
+//   AF:  A %*% F    (n x k)
+// This fuses the x_over_tcrossprod, t(A)%*%L, and A%*%F computations
+// into a single parallel pass over the nonzeros of X.
+//
+// [[Rcpp::depends(RcppArmadillo)]]
+// [[Rcpp::depends(RcppParallel)]]
+// [[Rcpp::export]]
+Rcpp::List poisson_nmf_kkt_sparse_parallel_rcpp (const arma::sp_mat& X,
+						  const arma::mat& L,
+						  const arma::mat& F,
+						  double e) {
+  kkt_sparse_worker worker(X,L,F,e);
+  parallelReduce(0,X.n_cols,worker);
+  return Rcpp::List::create(Rcpp::Named("tAL") = worker.tAL,
+			    Rcpp::Named("AF")  = worker.AF);
+}
+
 // For vector x, return a vector of the same length y containing the
 // "least extreme" differences y(i) = x(i) - x(j), in which j is the
 // index not equal to i such that abs(x(i) - x(j)) is the smallest

From 65fb7230c0cb564f5bddf84c66af0f086437001c Mon Sep 17 00:00:00 2001
From: Eric Weine <ericw456@mit.edu>
Date: Fri, 17 Apr 2026 09:48:57 -0400
Subject: [PATCH 2/2] added tests to ensure that Rcpp and Rcpp_parallel
 versions give the same results

---
 R/likelihood.R                   | 20 ++++++++++++++++---
 tests/testthat/test_likelihood.R | 33 +++++++++++++++++++++++++++-----
 2 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/R/likelihood.R b/R/likelihood.R
index 677be845..2a155eeb 100644
--- a/R/likelihood.R
+++ b/R/likelihood.R
@@ -93,7 +93,7 @@ cost <- function (X, A, B, e = 1e-8, family = c("poisson","multinom"),
     A <- matrix(A)
   if (!is.matrix(B))
     B <- matrix(B,1,ncol(X))
-    
+
   # Check and process "model" and "version" input arguments.
   family  <- match.arg(family)
   poisson <- family == "poisson"
@@ -101,7 +101,7 @@ cost <- function (X, A, B, e = 1e-8, family = c("poisson","multinom"),
     if (is.matrix(X))
       version <- "R"
     else
-      version <- "Rcpp"
+      version <- "Rcpp_parallel"
   }
 
   # Compute the terms in the log-likelihoods that depend on A or B.
@@ -109,6 +109,11 @@ cost <- function (X, A, B, e = 1e-8, family = c("poisson","multinom"),
     AB <- A %*% B
     f  <- rowSums(poisson*AB - X*log(AB + e))
   } else if (version == "Rcpp") {
+    if (is.matrix(X))
+      f <- drop(cost_rcpp(X,A,B,e,poisson))
+    else
+      f <- drop(cost_sparse_rcpp(X,A,B,e,poisson))
+  } else if (version == "Rcpp_parallel") {
     if (is.matrix(X))
       f <- drop(cost_parallel_rcpp(X,A,B,e,poisson))
     else
@@ -179,11 +184,20 @@ deviance_poisson_const <- function (X) {
 # Compute the residuals of the first-order Karush-Kuhn-Tucker (KKT)
 # conditions for Poisson non-negative matrix factorization at solution
 # estimate (F,L).
-poisson_nmf_kkt <- function (X, F, L, e = 1e-8) {
+#' @importFrom Matrix sparseMatrix
+poisson_nmf_kkt <- function (X, F, L, e = 1e-8,
+                             version = c("Rcpp_parallel","Rcpp")) {
+  version <- match.arg(version)
   if (is.matrix(X)) {
     A <- X/(tcrossprod(L,F) + e)
     return(list(F = F*(t(1 - A) %*% L),
                 L = L*((1 - A) %*% F)))
+  } else if (version == "Rcpp") {
+    d <- summary(X)
+    y <- drop(x_over_crossprod_rcpp(d$i - 1,d$j - 1,d$x,t(L),t(F),e))
+    A <- sparseMatrix(i = d$i,j = d$j,x = y,dims = dim(X))
+    return(list(F = F*(repmat(colSums(L),ncol(X)) - as.matrix(t(A) %*% L)),
+                L = L*(repmat(colSums(F),nrow(X)) - as.matrix(A %*% F))))
   } else {
     result <- poisson_nmf_kkt_sparse_parallel_rcpp(X,L,F,e)
     return(list(F = F*(repmat(colSums(L),ncol(X)) - result$tAL),
diff --git a/tests/testthat/test_likelihood.R b/tests/testthat/test_likelihood.R
index 3f244887..8aa848a6 100644
--- a/tests/testthat/test_likelihood.R
+++ b/tests/testthat/test_likelihood.R
@@ -1,12 +1,12 @@
 context("likelihood")
 
-test_that(paste("R and Rcpp versions of cost function return same result",
-                "for sparse and dense matrix"),{
+test_that(paste("R, Rcpp and Rcpp_parallel versions of cost function return",
+                "same result for sparse and dense matrix"),{
   set.seed(1)
 
   # Repeat the tests for a few different choices of k.
   for (k in 1:4)  {
-      
+
     # Generate a data set.
     out <- simulate_count_data(10,8,k)
     X   <- out$X
@@ -19,11 +19,15 @@ test_that(paste("R and Rcpp versions of cost function return same result",
     f2 <- cost(X,L,t(F),version = "Rcpp")
     f3 <- cost(Y,L,t(F),version = "R")
     f4 <- cost(Y,L,t(F),version = "Rcpp")
-  
+    f5 <- cost(X,L,t(F),version = "Rcpp_parallel")
+    f6 <- cost(Y,L,t(F),version = "Rcpp_parallel")
+
     # The cost function calculations should all give the same result.
     expect_equal(f1,f2)
     expect_equal(f1,f3)
     expect_equal(f1,f4)
+    expect_equal(f1,f5)
+    expect_equal(f1,f6)
   }
 })
 
@@ -108,7 +112,7 @@ test_that(paste("deviance_poisson_topic_nmf gives correct result for sparse",
 
 test_that("poisson_nmf_kkt gives same result for sparse and dense matrices",{
 
-  # Generate a data set.  
+  # Generate a data set.
   set.seed(1)
   out <- simulate_count_data(10,8,3)
   X   <- out$X
@@ -121,6 +125,25 @@ test_that("poisson_nmf_kkt gives same result for sparse and dense matrices",{
   expect_equal(out1,out2,tolerance = 1e-15,scale = 1)
 })
 
+test_that(paste("Rcpp and Rcpp_parallel versions of poisson_nmf_kkt give",
+                "same result for sparse matrix"),{
+
+  # Generate a data set.
+  set.seed(1)
+  for (k in 1:4) {
+    out <- simulate_count_data(10,8,k)
+    Y   <- as(out$X,"CsparseMatrix")
+    F   <- out$F
+    L   <- out$L
+
+    # Compute KKT residuals with both versions and confirm they agree.
+    out1 <- poisson_nmf_kkt(Y,F,L,version = "Rcpp")
+    out2 <- poisson_nmf_kkt(Y,F,L,version = "Rcpp_parallel")
+    expect_equal(out1$F,out2$F,tolerance = 1e-14,scale = 1)
+    expect_equal(out1$L,out2$L,tolerance = 1e-14,scale = 1)
+  }
+})
+
 test_that(paste("loglik_poisson_nmf and loglik_multinom_topic_model give",
                 "the same result when provided with either a Poisson NMF",
                 "or multinomial topic model fit"),{