From 0acd9b9cc9bc929f91162d8f63f7084ede7a5ade Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 25 Apr 2026 22:53:17 +0000 Subject: [PATCH 01/19] update to remove implicit thrust vector host to device copy --- quest/src/gpu/gpu_kernels.cuh | 19 ++++++---- quest/src/gpu/gpu_subroutines.cpp | 62 +++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 4f2a737e..74cc3436 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -49,6 +49,9 @@ const int NUM_THREADS_PER_BLOCK = 128; +__device__ __constant__ int ctrl_device[30]; + + __forceinline__ __device__ qindex getThreadInd() { return blockIdx.x*blockDim.x + threadIdx.x; @@ -198,7 +201,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subC( template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( cu_qcomp* amps, qindex numThreads, - int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ, + int numCtrls, qindex ctrlStateMask, int targ, cu_qcomp m00, cu_qcomp m01, cu_qcomp m10, cu_qcomp m11 ) { GET_THREAD_IND(n, numThreads); @@ -207,7 +210,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // i0 = nth local index where ctrls are active and targ is 0 - qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTarg, numCtrlBits + 1, ctrlStateMask); + qindex i0 = insertBitsWithMaskedValues(n, ctrls_device, numCtrlBits + 1, ctrlStateMask); qindex i1 = flipBit(i0, targ); // note amps are strided by 2^targ @@ -436,7 +439,7 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( template __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* ctrls, int numCtrls, qindex ctrlStateMask, int targ, + int numCtrls, qindex ctrlStateMask, int targ, cu_qcomp m1, cu_qcomp m2 ) { GET_THREAD_IND(n, numThreads); @@ -456,7 +459,7 
@@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); @@ -475,7 +478,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* ctrls, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, + int numCtrls, qindex ctrlStateMask, int targ1, int targ2, cu_qcomp m1, cu_qcomp m2, cu_qcomp m3, cu_qcomp m4 ) { GET_THREAD_IND(n, numThreads); @@ -495,7 +498,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); @@ -516,7 +519,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* ctrls, int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, + int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, cu_qcomp* elems, cu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); @@ -537,7 +540,7 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, numTargs); // j = nth local index where ctrls are active (in the 
specified states) - qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 5e18048f..c21cc258 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -301,14 +301,22 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); qindex numBlocks = getNumBlocks(numThreads); - devints sortedQubits = util_getSorted(ctrls, {targ}); + //devints sortedQubits = util_getSorted(ctrls, {targ}); + + vector sortedQubits = util_getSorted(ctrls, {targ}); + qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr); + + int ctrl_device[sortedQubits.size()]; + + cudaMemcpyToSymbol(ctrl_device, sortedQubits.data(), ctrls.size()*sizeof(int)); + kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, - getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, + ctrls.size(), qubitStateMask, targ, m00, m01, m10, m11 ); @@ -568,13 +576,29 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints deviceCtrls = util_getSorted(ctrls); + + // removed implicit thrust mem copy + vector sortedCtrls = util_getSorted(ctrls); + + + // Assume size of ctls is at most one per qubit so small enough for device contant memory + int ctrl_device[ctrls.size()]; + + cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); + +// cudaMemcpyToSymbol (const char * symbol, +// const void * src, +// size_t count, +// size_t offset = 0, +// 
enum cudaMemcpyKind kind = cudaMemcpyHostToDevice +// ) + qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = unpackMatrixToCuQcomps(matr); kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] + ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] ); // explicitly return to avoid runtime error below @@ -636,13 +660,24 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints deviceCtrls = util_getSorted(ctrls); + // devints deviceCtrls = util_getSorted(ctrls); + + // removed implicit thrust mem copy + vector sortedCtrls = util_getSorted(ctrls); + + + // Assume size of ctls is at most one per qubit so small enough for device contant memory + int ctrl_device[ctrls.size()]; + + cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); + + qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = unpackMatrixToCuQcomps(matr); kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2, + ctrls.size(), ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] ); @@ -705,12 +740,23 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex numBlocks = getNumBlocks(numThreads); devints deviceTargs = targs; - devints deviceCtrls = util_getSorted(ctrls); + // devints deviceCtrls = util_getSorted(ctrls); + + // removed implicit thrust mem copy + vector sortedCtrls = util_getSorted(ctrls); + + + // Assume size of ctls is at most one per qubit so small enough for device contant memory + int ctrl_device[ctrls.size()]; + + cudaMemcpyToSymbol(ctrl_device, 
sortedCtrls.data(), ctrls.size()*sizeof(int)); + + qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), + ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent) ); From d7bf558494fa5281af16d22225b0672737265d7a Mon Sep 17 00:00:00 2001 From: JPRichings Date: Tue, 28 Apr 2026 20:50:00 +0000 Subject: [PATCH 02/19] remove thrust devic_vector --- quest/src/gpu/gpu_kernels.cuh | 4 ++-- quest/src/gpu/gpu_subroutines.cpp | 10 +++++----- quest/src/gpu/gpu_thrust.cuh | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 74cc3436..fe2ea80c 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -47,7 +47,7 @@ */ -const int NUM_THREADS_PER_BLOCK = 128; +const int NUM_THREADS_PER_BLOCK =128; __device__ __constant__ int ctrl_device[30]; @@ -210,7 +210,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // i0 = nth local index where ctrls are active and targ is 0 - qindex i0 = insertBitsWithMaskedValues(n, ctrls_device, numCtrlBits + 1, ctrlStateMask); + qindex i0 = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits + 1, ctrlStateMask); qindex i1 = flipBit(i0, targ); // note amps are strided by 2^targ diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index c21cc258..a92cf021 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -310,9 +310,9 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr); - int ctrl_device[sortedQubits.size()]; + //int 
ctrl_device[sortedQubits.size()]; - cudaMemcpyToSymbol(ctrl_device, sortedQubits.data(), ctrls.size()*sizeof(int)); + cudaMemcpyToSymbol(ctrl_device, sortedQubits.data(), sortedQubits.size()*sizeof(int)); kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, @@ -582,7 +582,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // Assume size of ctls is at most one per qubit so small enough for device contant memory - int ctrl_device[ctrls.size()]; + //int ctrl_device[ctrls.size()]; cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); @@ -667,7 +667,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // Assume size of ctls is at most one per qubit so small enough for device contant memory - int ctrl_device[ctrls.size()]; + // int ctrl_device[ctrls.size()]; cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); @@ -747,7 +747,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // Assume size of ctls is at most one per qubit so small enough for device contant memory - int ctrl_device[ctrls.size()]; + //int ctrl_device[ctrls.size()]; cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index 8c11188e..ce708694 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -791,7 +791,7 @@ qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector q auto indFunctor = functor_insertBits(getPtr(sortedQubits), valueMask, qubits.size()); auto probFunctor = functor_getAmpNorm(); - auto rawIter = thrust::make_counting_iterator(0); + auto rawIter = thrust::make_counting_iterator(0LL); auto indIter = thrust::make_transform_iterator(rawIter, indFunctor); auto ampIter = thrust::make_permutation_iterator(getStartPtr(qureg), indIter); auto probIter = 
thrust::make_transform_iterator(ampIter, probFunctor); @@ -1016,7 +1016,7 @@ void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, ve auto projFunctor = functor_projectStateVec( getPtr(devQubits), qubits.size(), retainValue, renorm); - auto indIter = thrust::make_counting_iterator(0); + auto indIter = thrust::make_counting_iterator(0LL); auto ampIter = getStartPtr(qureg); qindex numIts = qureg.numAmpsPerNode; @@ -1082,4 +1082,4 @@ void thrust_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qure -#endif // GPU_THRUST_HPP \ No newline at end of file +#endif // GPU_THRUST_HPP From 324e5bef4be8fe91c09dcbe2aa8ed15b16faa3cf Mon Sep 17 00:00:00 2001 From: JPRichings Date: Tue, 5 May 2026 21:48:30 +0000 Subject: [PATCH 03/19] alternative approach where capture array in struct to remove explict memory copy --- quest/src/core/bitwise.hpp | 17 ++++++++++++++++- quest/src/gpu/gpu_kernels.cuh | 22 +++++++++++++--------- quest/src/gpu/gpu_subroutines.cpp | 30 ++++++++++++++++++++++-------- 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/quest/src/core/bitwise.hpp b/quest/src/core/bitwise.hpp index 5479c4ee..a797a1fa 100644 --- a/quest/src/core/bitwise.hpp +++ b/quest/src/core/bitwise.hpp @@ -170,6 +170,15 @@ INLINE qindex insertBits(qindex number, int* bitIndices, int numIndices, int bit return number; } +INLINE qindex insertBits(qindex number, const int* bitIndices, int numIndices, int bitValue) { + + // bitIndices must be strictly increasing + for (int i=0; i __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( - cu_qcomp* amps, qindex numThreads, + cu_qcomp* amps, qindex numThreads, __grid_constant__ const ctrl_device_t ctrl, int numCtrls, qindex ctrlStateMask, int targ, cu_qcomp m00, cu_qcomp m01, cu_qcomp m10, cu_qcomp m11 ) { @@ -210,7 +214,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // i0 = nth local index where ctrls are active 
and targ is 0 - qindex i0 = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits + 1, ctrlStateMask); + qindex i0 = insertBitsWithMaskedValues(n, ctrl.ctrl_device, numCtrlBits + 1, ctrlStateMask); qindex i1 = flipBit(i0, targ); // note amps are strided by 2^targ @@ -438,7 +442,7 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( template __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, __grid_constant__ const ctrl_device_t ctrl, int numCtrls, qindex ctrlStateMask, int targ, cu_qcomp m1, cu_qcomp m2 ) { @@ -459,7 +463,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrl.ctrl_device, numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); @@ -477,7 +481,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, __grid_constant__ const ctrl_device_t ctrl, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, cu_qcomp m1, cu_qcomp m2, cu_qcomp m3, cu_qcomp m4 ) { @@ -498,7 +502,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrl.ctrl_device, numCtrlBits, 
ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); @@ -518,7 +522,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, __grid_constant__ const ctrl_device_t ctrl, int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, cu_qcomp* elems, cu_qcomp exponent ) { @@ -540,7 +544,7 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, numTargs); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrl.ctrl_device, numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index a92cf021..9bdc99e4 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -309,13 +309,17 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr); + ctrl_device_t ctrl; + + std::copy(sortedQubits.begin(), sortedQubits.end(), ctrl.ctrl_device); + //int ctrl_device[sortedQubits.size()]; - cudaMemcpyToSymbol(ctrl_device, sortedQubits.data(), sortedQubits.size()*sizeof(int)); + //cudaMemcpyToSymbol(ctrl_device, sortedQubits.data(), sortedQubits.size()*sizeof(int)); kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toCuQcomps(qureg.gpuAmps), numThreads, ctrl, ctrls.size(), qubitStateMask, targ, m00, m01, m10, m11 ); @@ -580,11 +584,15 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, 
vec // removed implicit thrust mem copy vector sortedCtrls = util_getSorted(ctrls); + ctrl_device_t ctrl; + + std::copy(sortedCtrls.begin(), sortedCtrls.end(), ctrl.ctrl_device); + // Assume size of ctls is at most one per qubit so small enough for device contant memory //int ctrl_device[ctrls.size()]; - cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); + // cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); // cudaMemcpyToSymbol (const char * symbol, // const void * src, @@ -597,7 +605,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec auto elems = unpackMatrixToCuQcomps(matr); kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, ctrl, ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] ); @@ -665,18 +673,21 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // removed implicit thrust mem copy vector sortedCtrls = util_getSorted(ctrls); + ctrl_device_t ctrl; + + std::copy(sortedCtrls.begin(), sortedCtrls.end(), ctrl.ctrl_device); // Assume size of ctls is at most one per qubit so small enough for device contant memory // int ctrl_device[ctrls.size()]; - cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); + //cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = unpackMatrixToCuQcomps(matr); kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, ctrl, ctrls.size(), ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] ); @@ -745,17 +756,20 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec 
// removed implicit thrust mem copy vector sortedCtrls = util_getSorted(ctrls); + ctrl_device_t ctrl; + + std::copy(sortedCtrls.begin(), sortedCtrls.end(), ctrl.ctrl_device); // Assume size of ctls is at most one per qubit so small enough for device contant memory //int ctrl_device[ctrls.size()]; - cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); + //cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, ctrl, ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent) ); From e04dc1f9bf62e8ac88a57e93494e3a267f861cfe Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 20:31:55 +0000 Subject: [PATCH 04/19] identify opportutnities for performance improvement --- quest/src/gpu/gpu_kernels.cuh | 11 +++++-- quest/src/gpu/gpu_subroutines.cpp | 48 +++++++++++++++++-------------- quest/src/gpu/gpu_thrust.cuh | 10 +++---- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index c7345c2a..eb86d91e 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -56,6 +56,11 @@ typedef struct { int ctrl_device[64]; } ctrl_device_t; +struct QubitList_t { + int indices[64]; + int length; +}; + __forceinline__ __device__ qindex getThreadInd() { return blockIdx.x*blockDim.x + threadIdx.x; @@ -101,15 +106,15 @@ __forceinline__ __device__ int cudaGetBitMaskParity(qindex mask) { template __global__ void kernel_statevec_packAmpsIntoBuffer( cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, - int* qubits, int numQubits, qindex qubitStateMask + __grid_constant__ const QubitList_t qubits, qindex qubitStateMask 
) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numBits, NumCtrls, numQubits); + SET_VAR_AT_COMPILE_TIME(int, numBits, NumCtrls, qubits.length); // i = nth local index where qubits are active - qindex i = insertBitsWithMaskedValues(n, qubits, numBits, qubitStateMask); + qindex i = insertBitsWithMaskedValues(n, qubits.indices, numBits, qubitStateMask); // caller offsets buffer by sub-buffer send-index buffer[n] = amps[i]; diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 9bdc99e4..3d24cd94 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -144,12 +144,16 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector qubits, vector sortedQubits = util_getSorted(qubits); // change qindex qubitStateMask = util_getBitMask(qubits, qubitStates); + QubitList_t qubits_dev; + std::copy(sortedQubits.begin(), sortedQubits.end(), qubits_dev.indices); + qubits_dev.length = sortedQubits.size(); + kernel_statevec_packAmpsIntoBuffer <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, - getPtr(sortedQubits), qubits.size(), qubitStateMask + qubits_dev, qubitStateMask ); // return the number of packed amps @@ -210,7 +214,7 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints sortedQubits = util_getSorted(ctrls, {targ2, targ1}); + devints sortedQubits = util_getSorted(ctrls, {targ2, targ1}); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); kernel_statevec_anyCtrlSwap_subA <<>> ( @@ -235,7 +239,7 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c qindex numBlocks = getNumBlocks(numThreads); qindex recvInd = getBufferRecvInd(); - devints sortedCtrls = 
util_getSorted(ctrls); + devints sortedCtrls = util_getSorted(ctrls); // change for performance qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlSwap_subB <<>> ( @@ -260,7 +264,7 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c qindex numBlocks = getNumBlocks(numThreads); qindex recvInd = getBufferRecvInd(); - devints sortedQubits = util_getSorted(ctrls, {targ}); + devints sortedQubits = util_getSorted(ctrls, {targ}); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); kernel_statevec_anyCtrlSwap_subC <<>> ( @@ -303,13 +307,13 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v //devints sortedQubits = util_getSorted(ctrls, {targ}); - vector sortedQubits = util_getSorted(ctrls, {targ}); + vector sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr); - ctrl_device_t ctrl; + ctrl_device_t ctrl; // change for performance Standardise std::copy(sortedQubits.begin(), sortedQubits.end(), ctrl.ctrl_device); @@ -341,7 +345,7 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v qindex numBlocks = getNumBlocks(numThreads); qindex recvInd = getBufferRecvInd(); - devints sortedCtrls = util_getSorted(ctrls); + devints sortedCtrls = util_getSorted(ctrls); // change for performance qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( @@ -382,7 +386,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2); qindex numBlocks = getNumBlocks(numThreads); - devints sortedQubits = util_getSorted(ctrls, {targ1,targ2}); + devints sortedQubits = util_getSorted(ctrls, {targ1,targ2}); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, 
ctrlStates, {targ1,targ2}, {0,0}); // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...) @@ -445,7 +449,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex numBatches = qureg.numAmpsPerNode / powerOf2(ctrls.size() + targs.size()); devints deviceTargs = targs; - devints deviceQubits = util_getSorted(ctrls, targs); + devints deviceQubits = util_getSorted(ctrls, targs); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector(targs.size(),0)); // unpacking args (to better distinguish below signatures) @@ -582,7 +586,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // removed implicit thrust mem copy - vector sortedCtrls = util_getSorted(ctrls); + vector sortedCtrls = util_getSorted(ctrls); // change for performance and standarisation ctrl_device_t ctrl; @@ -671,7 +675,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // devints deviceCtrls = util_getSorted(ctrls); // removed implicit thrust mem copy - vector sortedCtrls = util_getSorted(ctrls); + vector sortedCtrls = util_getSorted(ctrls); // change for performance and standardisationn ctrl_device_t ctrl; @@ -750,7 +754,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints deviceTargs = targs; + devints deviceTargs = targs; // change for performance and standardise // devints deviceCtrls = util_getSorted(ctrls); // removed implicit thrust mem copy @@ -871,8 +875,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct auto maskXY = util_getBitMask(targsXY); auto maskYZ = util_getBitMask(util_getConcatenated(y, z)); - devints deviceTargs = targsXY; - devints deviceQubits = util_getSorted(ctrls, targsXY); + devints deviceTargs = targsXY; + devints 
deviceQubits = util_getSorted(ctrls, targsXY); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targsXY, vector(targsXY.size(),0)); // unlike the analogous cpu routine, this function has only a single parallelisation @@ -910,7 +914,7 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct auto maskXY = util_getBitMask(util_getConcatenated(x, y)); auto maskYZ = util_getBitMask(util_getConcatenated(y, z)); - devints sortedCtrls = util_getSorted(ctrls); + devints sortedCtrls = util_getSorted(ctrls); // change for performance qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( @@ -946,7 +950,7 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector c qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints sortedCtrls = util_getSorted(ctrls); + devints sortedCtrls = util_getSorted(ctrls); // change for performance qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); qindex targMask = util_getBitMask(targs); @@ -986,8 +990,8 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs ptrs.push_back(toCuQcomps(qureg.gpuAmps)); // copy coeff and qureg lists into GPU memory - devcuqcompptrs devQuregAmps = ptrs; - devcomps devCoeffs = coeffs; + devcuqcompptrs devQuregAmps = ptrs; // review performance + devcomps devCoeffs = coeffs; // review performance kernel_statevec_setQuregToWeightedSum_sub <<>> ( toCuQcomps(outQureg.gpuAmps), numThreads, @@ -1499,7 +1503,7 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta qindex numThreads = outQureg.numAmpsPerNode; qindex numBlocks = getNumBlocks(numThreads); - devints devTargs = targs; + devints devTargs = targs; // change for performance devints devPairTargs = pairTargs; devints devAllTargs = util_getSorted(targs, pairTargs); @@ -1620,7 +1624,7 @@ void 
gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu qindex numBlocks = getNumBlocks(numThreads); // allocate exponentially-big temporary memory (error if failed) - devints devQubits = qubits; + devints devQubits = qubits; // change for performance devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( @@ -1657,7 +1661,7 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu qindex numAmpsPerCol = powerOf2(qureg.numQubits); // allocate exponentially-big temporary memory (error if failed) - devints devQubits = qubits; + devints devQubits = qubits; // change for performance devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index ce708694..ce859860 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -72,7 +72,7 @@ */ -using devints = thrust::device_vector; +using devints = thrust::device_vector; // remove for performance int* getPtr(devints& qubits) { @@ -785,7 +785,7 @@ qreal thrust_densmatr_calcTotalProb_sub(Qureg qureg) { template qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubits, vector outcomes) { - devints sortedQubits = util_getSorted(qubits); + devints sortedQubits = util_getSorted(qubits); // change for performance qindex valueMask = util_getBitMask(qubits, outcomes); auto indFunctor = functor_insertBits(getPtr(sortedQubits), valueMask, qubits.size()); @@ -807,7 +807,7 @@ qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector q // cannot move these into functor_insertBits constructor, since the memory // would dangle - and we cannot bind deviceints as an attribute - it's host-only! 
- devints sortedQubits = util_getSorted(qubits); + devints sortedQubits = util_getSorted(qubits); // change for performance qindex valueMask = util_getBitMask(qubits, outcomes); auto basisIndFunctor = functor_insertBits(getPtr(sortedQubits), valueMask, qubits.size()); @@ -1011,7 +1011,7 @@ cu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDi template void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, vector outcomes, qreal renorm) { - devints devQubits = qubits; + devints devQubits = qubits; // change for performance qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); auto projFunctor = functor_projectStateVec( getPtr(devQubits), qubits.size(), retainValue, renorm); @@ -1027,7 +1027,7 @@ void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, ve template void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, vector outcomes, qreal renorm) { - devints devQubits = qubits; + devints devQubits = qubits; // change for performance qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); auto projFunctor = functor_projectDensMatr( getPtr(devQubits), qubits.size(), qureg.rank, qureg.numQubits, From abf038726ea7b7029cbc7e1478a321412859fc90 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 20:39:13 +0000 Subject: [PATCH 05/19] kernel_statevec_anyCtrlSwap_subA performance change --- quest/src/gpu/gpu_kernels.cuh | 6 +++--- quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index eb86d91e..3639840e 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -145,16 +145,16 @@ __global__ void kernel_statevec_packPairSummedAmpsIntoBuffer( template __global__ void kernel_statevec_anyCtrlSwap_subA( cu_qcomp* amps, qindex numThreads, - int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int targ1, 
int targ2 + __grid_constant__ const QubitList_t ctrlsAndTargs, qindex ctrlsAndTargsMask, int targ1, int targ2 ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, ctrlsAndTargs.length, numCtrls); int numQubitBits = 2 + numCtrlBits; // i01 = nth local index where ctrls are active, targ2=0 and targ1=1 - qindex i01 = insertBitsWithMaskedValues(n, ctrlsAndTargs, numQubitBits, ctrlsAndTargsMask); + qindex i01 = insertBitsWithMaskedValues(n, ctrlsAndTargs.indices, numQubitBits, ctrlsAndTargsMask); qindex i10 = flipTwoBits(i01, targ2, targ1); // swap amps diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 3d24cd94..ac393b1a 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -214,12 +214,16 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints sortedQubits = util_getSorted(ctrls, {targ2, targ1}); // change for performance + vector sortedQubits = util_getSorted(ctrls, {targ2, targ1}); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); + QubitList_t qubits_dev; + std::copy(sortedQubits.begin(), sortedQubits.end(), qubits_dev.indices); + qubits_dev.length = sortedQubits.size(); + kernel_statevec_anyCtrlSwap_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, - getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2 + qubits_dev, qubitStateMask, targ1, targ2 ); #else From a3cd184021ee9ab9f3750e4ccf79f3e4c75cffc1 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 20:47:23 +0000 Subject: [PATCH 06/19] update kernel_statevec_anyCtrlSwap_subB for performance --- quest/src/gpu/gpu_kernels.cuh | 8 ++++---- 
quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 3639840e..bf127bd4 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -150,7 +150,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subA( GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, ctrlsAndTargs.length, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTargs.length); int numQubitBits = 2 + numCtrlBits; // i01 = nth local index where ctrls are active, targ2=0 and targ1=1 @@ -167,15 +167,15 @@ __global__ void kernel_statevec_anyCtrlSwap_subA( template __global__ void kernel_statevec_anyCtrlSwap_subB( cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, - int* ctrls, int numCtrls, qindex ctrlStateMask + __grid_constant__ const QubitList_t ctrls, qindex ctrlStateMask ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.length); // i = nth local index where ctrls are active - qindex i = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex i = insertBitsWithMaskedValues(n, ctrls.indices, numCtrlBits, ctrlStateMask); // caller offsets buffer if necessary amps[i] = buffer[n]; diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index ac393b1a..a91d4723 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -243,12 +243,16 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c qindex numBlocks = getNumBlocks(numThreads); qindex recvInd = getBufferRecvInd(); - devints sortedCtrls = util_getSorted(ctrls); // change for performance + vector sortedCtrls = util_getSorted(ctrls); // 
change for performance qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); + QubitList_t Ctrls_dev; + std::copy(sortedCtrls.begin(), sortedCtrls.end(), Ctrls_dev.indices); + Ctrls_dev.length = sortedCtrls.size(); + kernel_statevec_anyCtrlSwap_subB <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, - getPtr(sortedCtrls), ctrls.size(), ctrlStateMask + Ctrls_dev, ctrlStateMask ); #else From 5dd472d8aee43f42796b6d3c68a4b802186af818 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 20:52:17 +0000 Subject: [PATCH 07/19] update kernel_statevec_anyCtrlSwap_subC for performance --- quest/src/gpu/gpu_kernels.cuh | 6 +++--- quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index bf127bd4..b4171365 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -185,16 +185,16 @@ __global__ void kernel_statevec_anyCtrlSwap_subB( template __global__ void kernel_statevec_anyCtrlSwap_subC( cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, - int* ctrlsAndTarg, int numCtrls, qindex ctrlsAndTargMask + __grid_constant__ const QubitList_t ctrlsAndTarg, qindex ctrlsAndTargMask ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTarg.length); int numQubitBits = numCtrlBits + 1; // i = nth local index where ctrls and targ are in specified states - qindex i = insertBitsWithMaskedValues(n, ctrlsAndTarg, numQubitBits, ctrlsAndTargMask); + qindex i = insertBitsWithMaskedValues(n, ctrlsAndTarg.indices, numQubitBits, ctrlsAndTargMask); // caller offsets buffer if necessary amps[i] = buffer[n]; diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index a91d4723..dbc3aea6 100644 --- 
a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -272,12 +272,16 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c qindex numBlocks = getNumBlocks(numThreads); qindex recvInd = getBufferRecvInd(); - devints sortedQubits = util_getSorted(ctrls, {targ}); // change for performance + vector sortedQubits = util_getSorted(ctrls, {targ}); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); + QubitList_t Qubits_dev; + std::copy(sortedQubits.begin(), sortedQubits.end(), Qubits_dev.indices); + Qubits_dev.length = sortedQubits.size(); + kernel_statevec_anyCtrlSwap_subC <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, - getPtr(sortedQubits), ctrls.size(), qubitStateMask + Qubits_dev, qubitStateMask ); #else From a23b231038fd2c949e6404db82ae5d3ae0649f46 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 20:58:06 +0000 Subject: [PATCH 08/19] update kernel_statevec_anyCtrlOneTargDenseMatr_subB for performance --- quest/src/gpu/gpu_kernels.cuh | 6 +++--- quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index b4171365..0e448dff 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -234,16 +234,16 @@ template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, - int* ctrls, int numCtrls, qindex ctrlStateMask, + __grid_constant__ const QubitList_t ctrls, qindex ctrlStateMask, cu_qcomp fac0, cu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.length); // i = nth local 
index where ctrl bits are active - qindex i = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex i = insertBitsWithMaskedValues(n, ctrls.indices, numCtrlBits, ctrlStateMask); // caller offsets buffer by receive-index amps[i] = fac0*amps[i] + fac1*buffer[n]; diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index dbc3aea6..17131064 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -357,12 +357,16 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v qindex numBlocks = getNumBlocks(numThreads); qindex recvInd = getBufferRecvInd(); - devints sortedCtrls = util_getSorted(ctrls); // change for performance + vector sortedCtrls = util_getSorted(ctrls); // change for performance qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); + QubitList_t Ctrls_dev; + std::copy(sortedCtrls.begin(), sortedCtrls.end(), Ctrls_dev.indices); + Ctrls_dev.length = sortedCtrls.size(); + kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, - getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, + Ctrls_dev, ctrlStateMask, toCuQcomp(fac0), toCuQcomp(fac1) ); From 6ea9d760bc6790c89b0aa6758c26dc221a5585f5 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 21:03:45 +0000 Subject: [PATCH 09/19] update kernel_statevec_anyCtrlTwoTargDenseMatr_sub for performance --- quest/src/gpu/gpu_kernels.cuh | 6 +++--- quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 0e448dff..10bbf2e1 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -259,7 +259,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( template __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( cu_qcomp* amps, qindex numThreads, - int* ctrlsAndTarg, int 
numCtrls, qindex ctrlStateMask, int targ1, int targ2, + __grid_constant__ const QubitList_t ctrlsAndTarg, qindex ctrlStateMask, int targ1, int targ2, cu_qcomp m00, cu_qcomp m01, cu_qcomp m02, cu_qcomp m03, cu_qcomp m10, cu_qcomp m11, cu_qcomp m12, cu_qcomp m13, cu_qcomp m20, cu_qcomp m21, cu_qcomp m22, cu_qcomp m23, @@ -268,10 +268,10 @@ __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTarg.length); // i00 = nth local index where ctrls are active and both targs are 0 - qindex i00 = insertBitsWithMaskedValues(n, ctrlsAndTarg, numCtrlBits + 2, ctrlStateMask); + qindex i00 = insertBitsWithMaskedValues(n, ctrlsAndTarg.indices, numCtrlBits + 2, ctrlStateMask); qindex i01 = flipBit(i00, targ1); qindex i10 = flipBit(i00, targ2); qindex i11 = flipBit(i01, targ2); diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 17131064..18c83451 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -402,15 +402,19 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2); qindex numBlocks = getNumBlocks(numThreads); - devints sortedQubits = util_getSorted(ctrls, {targ1,targ2}); // change for performance + vector sortedQubits = util_getSorted(ctrls, {targ1,targ2}); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0}); + QubitList_t qubits_dev; + std::copy(sortedQubits.begin(), sortedQubits.end(), qubits_dev.indices); + qubits_dev.length = sortedQubits.size(); + // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...) 
auto m = unpackMatrixToCuQcomps(matr); kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, - getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2, + qubits_dev, qubitStateMask, targ1, targ2, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15] ); From 5ef2d27ad83e4b017662edc0decdcb8928eefa42 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 21:14:51 +0000 Subject: [PATCH 10/19] update kernel_statevec_anyCtrlFewTargDenseMatr for performance --- quest/src/gpu/gpu_kernels.cuh | 8 ++++---- quest/src/gpu/gpu_subroutines.cpp | 22 +++++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 10bbf2e1..53714595 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -325,18 +325,18 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( REGISTER cu_qcomp privateCache[1 << NumTargs]; // we know NumTargs <= 5, though NumCtrls is permitted anything (including -1) - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTargs.length); constexpr qindex numTargAmps = (1 << NumTargs); // explicit, in lieu of powerOf2 // i0 = nth local index where ctrls are active and targs are all zero - qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTargs, numCtrlBits + NumTargs, ctrlsAndTargsMask); // loop may be unrolled + qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTargs.indices, numCtrlBits + NumTargs, ctrlsAndTargsMask); // loop may be unrolled // populate cache (force unroll to ensure compile-time cache indices) #pragma unroll for (qindex k=0; k ctrls, ve // task each thread with processing more than a single batch qindex numBatches = qureg.numAmpsPerNode / powerOf2(ctrls.size() + targs.size()); - devints deviceTargs = targs; - devints deviceQubits = util_getSorted(ctrls, targs); 
// change for performance + vector deviceTargs = targs; + vector deviceQubits = util_getSorted(ctrls, targs); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector(targs.size(),0)); + QubitList_t qubits_dev; + std::copy(deviceQubits.begin(), deviceQubits.end(), qubits_dev.indices); + qubits_dev.length = deviceQubits.size(); + + QubitList_t targs_dev; + std::copy(deviceTargs.begin(), deviceTargs.end(), targs_dev.indices); + targs_dev.length = deviceTargs.size(); + // unpacking args (to better distinguish below signatures) auto ampsPtr = toCuQcomps(qureg.gpuAmps); auto matrPtr = toCuQcomps(matr.gpuElemsFlat); - auto qubitsPtr = getPtr(deviceQubits); - auto targsPtr = getPtr(deviceTargs); - auto nCtrls = ctrls.size(); + // auto qubitsPtr = getPtr(deviceQubits); + // auto targsPtr = getPtr(deviceTargs); + // auto nCtrls = ctrls.size(); // this function updates amplitudes in batches of 2^NumTargs, where each is // determined by distinct mixtures of the existing 2^NumTargs values, which @@ -505,8 +513,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve <<>> ( ampsPtr, numThreads, - qubitsPtr, nCtrls, qubitStateMask, - targsPtr, matrPtr + qubits_dev, qubitStateMask, + targs_dev, matrPtr ); } else { From a353450a3a7bb3383735d2d0c3aa6f81896d79e9 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 21:25:00 +0000 Subject: [PATCH 11/19] update kernel_statevector_anyCtrlPauliTensorOrGadget_subA for performance --- quest/src/gpu/gpu_kernels.cuh | 14 +++++++------- quest/src/gpu/gpu_subroutines.cpp | 16 ++++++++++++---- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 53714595..da88b7ba 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -308,7 +308,7 @@ __forceinline__ __device__ qindex getThreadsNthGlobalArrInd(qindex n, qindex thr template __global__ void 
kernel_statevec_anyCtrlFewTargDenseMatr( cu_qcomp* amps, qindex numThreads, - int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int* targs, + __grid_constant__ const QubitList_t ctrlsAndTargs, qindex ctrlsAndTargsMask, __grid_constant__ const QubitList_t targs, cu_qcomp* flatMatrElems ) { GET_THREAD_IND(n, numThreads); @@ -623,15 +623,15 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( cu_qcomp* amps, qindex numThreads, - int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsStateMask, - int* targsXY, int numXY, qindex maskXY, qindex maskYZ, + __grid_constant__ const QubitList_t ctrlsAndTargs, qindex ctrlsAndTargsStateMask, + __grid_constant__ const QubitList_t targsXY, qindex maskXY, qindex maskYZ, cu_qcomp powI, cu_qcomp ampFac, cu_qcomp pairAmpFac ) { GET_THREAD_IND(t, numThreads); // use template params to compile-time unroll loops in insertBits() and setBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); - SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, numXY); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTargs.length); + SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, targsXY.length); // n = local index of amp sub-batch with common i0, v = value of target bits qindex numInnerIts = powerOf2(numTargBits) / 2; @@ -639,10 +639,10 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( qindex v = t % numInnerIts; // i0 = nth local index where ctrls are active and targs are all zero (loop therein may be unrolled) - qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTargs, numCtrlBits + numTargBits, ctrlsAndTargsStateMask); + qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTargs.indices, numCtrlBits + numTargBits, ctrlsAndTargsStateMask); // iA = nth local index where targs have value v, iB = (last - nth) such index - qindex iA = setBits(i0, targsXY, numTargBits, v); // may be unrolled + qindex iA = setBits(i0, 
targsXY.indices, numTargBits, v); // may be unrolled qindex iB = flipBits(iA, maskXY); // determine whether to multiply amps by +-1 or +-i diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index a5afa8e3..68a6e226 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -903,10 +903,18 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct auto maskXY = util_getBitMask(targsXY); auto maskYZ = util_getBitMask(util_getConcatenated(y, z)); - devints deviceTargs = targsXY; - devints deviceQubits = util_getSorted(ctrls, targsXY); // change for performance + vector deviceTargs = targsXY; + vector deviceQubits = util_getSorted(ctrls, targsXY); // change for performance qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targsXY, vector(targsXY.size(),0)); + QubitList_t qubits_dev; + std::copy(deviceQubits.begin(), deviceQubits.end(), qubits_dev.indices); + qubits_dev.length = deviceQubits.size(); + + QubitList_t targs_dev; + std::copy(deviceTargs.begin(), deviceTargs.end(), targs_dev.indices); + targs_dev.length = deviceTargs.size(); + // unlike the analogous cpu routine, this function has only a single parallelisation // granularity; where every pair-of-amps is modified by an independent thread, despite // that many threads share a common i0 value (appearing in the kernel). 
This turns out @@ -916,8 +924,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct qindex numBlocks = getNumBlocks(numThreads); kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, - getPtr(deviceQubits), ctrls.size(), qubitStateMask, - getPtr(deviceTargs), deviceTargs.size(), + qubits_dev, qubitStateMask, + targs_dev, maskXY, maskYZ, toCuQcomp(powI), toCuQcomp(ampFac), toCuQcomp(pairAmpFac) ); From 1f572748a58b9962730b71d2025c077fcd08ca37 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 21:43:39 +0000 Subject: [PATCH 12/19] update kernel_statevector_anyCtrlPauliTensorOrGadget_subA for performance --- quest/src/gpu/gpu_kernels.cuh | 6 +++--- quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index da88b7ba..b3898293 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -663,17 +663,17 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, - int* ctrls, int numCtrls, qindex ctrlStateMask, + __grid_constant__ const QubitList_t ctrls, qindex ctrlStateMask, qindex maskXY, qindex maskYZ, qindex bufferMaskXY, cu_qcomp powI, cu_qcomp thisAmpFac, cu_qcomp otherAmpFac ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.length); // i = nth local index where ctrl bits are in specified states - qindex i = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex i = insertBitsWithMaskedValues(n, ctrls.indices, numCtrlBits, ctrlStateMask); // j = buffer index of amp to be mixed with i qindex j = flipBits(n, 
bufferMaskXY); diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 68a6e226..66247767 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -950,12 +950,16 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct auto maskXY = util_getBitMask(util_getConcatenated(x, y)); auto maskYZ = util_getBitMask(util_getConcatenated(y, z)); - devints sortedCtrls = util_getSorted(ctrls); // change for performance + vector sortedCtrls = util_getSorted(ctrls); // change for performance qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); + QubitList_t Ctrls_dev; + std::copy(sortedCtrls.begin(), sortedCtrls.end(), Ctrls_dev.indices); + Ctrls_dev.length = sortedCtrls.size(); + kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, - getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, + Ctrls_dev, ctrlStateMask, maskXY, maskYZ, bufferMaskXY, toCuQcomp(powI), toCuQcomp(ampFac), toCuQcomp(pairAmpFac) ); From 6604823bd31260e3696cb523aa63ab0f160dbd11 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 21:47:15 +0000 Subject: [PATCH 13/19] update kernel_statevector_anyCtrlPauliTensorOrGadget_subB for performance --- quest/src/gpu/gpu_kernels.cuh | 6 +++--- quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index b3898293..fe626db2 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -698,16 +698,16 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( template __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( cu_qcomp* amps, qindex numThreads, - int* ctrls, int numCtrls, qindex ctrlStateMask, qindex targMask, + __grid_constant__ const QubitList_t ctrls, qindex ctrlStateMask, qindex targMask, cu_qcomp fac0, 
cu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.length); // i = nth local index where ctrl bits are in specified states - qindex i = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex i = insertBitsWithMaskedValues(n, ctrls.indices, numCtrlBits, ctrlStateMask); // apply phase to amp depending on parity of targets in global index int p = cudaGetBitMaskParity(i & targMask); diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 66247767..991b43e2 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -990,13 +990,17 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector c qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints sortedCtrls = util_getSorted(ctrls); // change for performance + vector sortedCtrls = util_getSorted(ctrls); // change for performance qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); qindex targMask = util_getBitMask(targs); + QubitList_t Ctrls_dev; + std::copy(sortedCtrls.begin(), sortedCtrls.end(), Ctrls_dev.indices); + Ctrls_dev.length = sortedCtrls.size(); + kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, - getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask, + Ctrls_dev, ctrlStateMask, targMask, toCuQcomp(fac0), toCuQcomp(fac1) ); From ca95173baa77d1e6598e28ba2de5581dc8621ef1 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 21:56:11 +0000 Subject: [PATCH 14/19] update kernel_densmatr_partialTrace_sub for performance --- quest/src/gpu/gpu_kernels.cuh | 12 +++++++----- quest/src/gpu/gpu_subroutines.cpp | 20 ++++++++++++++++---- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git 
a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index fe626db2..2d3c6801 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -1146,12 +1146,14 @@ __global__ void kernel_densmatr_oneQubitDamping_subD( template __global__ void kernel_densmatr_partialTrace_sub( cu_qcomp* ampsIn, cu_qcomp* ampsOut, qindex numThreads, - int* ketTargs, int* pairTargs, int* allTargs, int numKetTargs + __grid_constant__ const QubitList_t ketTargs, + __grid_constant__ const QubitList_t pairTargs, + __grid_constant__ const QubitList_t allTargs ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll below loops - SET_VAR_AT_COMPILE_TIME(int, numTargPairs, NumTargs, numKetTargs); + SET_VAR_AT_COMPILE_TIME(int, numTargPairs, NumTargs, ketTargs.length); // may be inferred at compile-time int numAllTargs = 2*numTargPairs; @@ -1163,7 +1165,7 @@ __global__ void kernel_densmatr_partialTrace_sub( /// should change the parallelisation axis in this scenario, or preclude it with validation! 
// k = nth local index of inQureg where all targs and pairs are zero - qindex k = insertBits(n, allTargs, numAllTargs, 0); // loop may be unrolled + qindex k = insertBits(n, allTargs.indices, numAllTargs, 0); // loop may be unrolled // each outQureg amp results from summing 2^targs inQureg amps cu_qcomp outAmp = getCuQcomp(0, 0); @@ -1173,8 +1175,8 @@ __global__ void kernel_densmatr_partialTrace_sub( // i = nth local index of inQureg where targs=j and pairTargs=j qindex i = k; - i = setBits(i, ketTargs, numTargPairs, j); // loops may be unrolled - i = setBits(i, pairTargs, numTargPairs, j); + i = setBits(i, ketTargs.indices, numTargPairs, j); // loops may be unrolled + i = setBits(i, pairTargs.indices, numTargPairs, j); outAmp = outAmp + ampsIn[i]; } diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 991b43e2..f47eedba 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -1547,13 +1547,25 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta qindex numThreads = outQureg.numAmpsPerNode; qindex numBlocks = getNumBlocks(numThreads); - devints devTargs = targs; // change for performance - devints devPairTargs = pairTargs; - devints devAllTargs = util_getSorted(targs, pairTargs); + vector devTargs = targs; // change for performance + vector devPairTargs = pairTargs; + vector devAllTargs = util_getSorted(targs, pairTargs); + + QubitList_t targs_dev; + std::copy(devTargs.begin(), devTargs.end(), targs_dev.indices); + targs_dev.length = devTargs.size(); + + QubitList_t pairTargs_dev; + std::copy(devPairTargs.begin(), devPairTargs.end(), pairTargs_dev.indices); + pairTargs_dev.length = devPairTargs.size(); + + QubitList_t allTargs_dev; + std::copy(devAllTargs.begin(), devAllTargs.end(), allTargs_dev.indices); + allTargs_dev.length = devAllTargs.size(); kernel_densmatr_partialTrace_sub <<>> ( toCuQcomps(inQureg.gpuAmps), toCuQcomps(outQureg.gpuAmps), numThreads, - 
getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size() + targs_dev, pairTargs_dev, allTargs_dev ); #else From ec71af6339e3cb3631d5a8200286ceb3c73869b9 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 22:02:37 +0000 Subject: [PATCH 15/19] update kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub for performance --- quest/src/gpu/gpu_kernels.cuh | 6 +++--- quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 2d3c6801..caf41599 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -1195,7 +1195,7 @@ template __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( qreal* outProbs, cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* qubits, int numQubits + __grid_constant__ const QubitList_t qubits ) { GET_THREAD_IND(n, numThreads); @@ -1206,7 +1206,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( /// whether this is worthwhile and faster! 
// use template param to compile-time unroll below loops - SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, numQubits); + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.length); qreal prob = getCompNorm(amps[n]); @@ -1214,7 +1214,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( qindex i = concatenateBits(rank, n, logNumAmpsPerNode); // j = outcome index corresponding to prob - qindex j = getValueOfBits(i, qubits, numBits); // loop therein may be unrolled + qindex j = getValueOfBits(i, qubits.indices, numBits); // loop therein may be unrolled atomicAdd(&outProbs[j], prob); } diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index f47eedba..c08fadef 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -1680,12 +1680,16 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu qindex numBlocks = getNumBlocks(numThreads); // allocate exponentially-big temporary memory (error if failed) - devints devQubits = qubits; // change for performance + vector devQubits = qubits; // change for performance devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws + QubitList_t qubits_dev; + std::copy(devQubits.begin(), devQubits.end(), qubits_dev.indices); + qubits_dev.length = devQubits.size(); + kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads, - qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() + qureg.rank, qureg.logNumAmpsPerNode, qubits_dev ); // overwrite outProbs with GPU memory From 7e5e92d6a77e15bde399a5fda5bbaf8b9bbbd0a9 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 22:06:22 +0000 Subject: [PATCH 16/19] update kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub for performance --- quest/src/gpu/gpu_kernels.cuh | 6 +++--- quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) 
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index caf41599..bf4f0bcc 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -1225,12 +1225,12 @@ __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( qreal* outProbs, cu_qcomp* amps, qindex numThreads, qindex firstDiagInd, qindex numAmpsPerCol, int rank, qindex logNumAmpsPerNode, - int* qubits, int numQubits + __grid_constant__ const QubitList_t qubits ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, numQubits); + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.length); // i = index of nth local diagonal elem qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); @@ -1240,7 +1240,7 @@ __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( qindex j = concatenateBits(rank, i, logNumAmpsPerNode); // k = outcome index corresponding to - qindex k = getValueOfBits(j, qubits, numBits); // loop therein may be unrolled + qindex k = getValueOfBits(j, qubits.indices, numBits); // loop therein may be unrolled atomicAdd(&outProbs[k], prob); } diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index c08fadef..38193178 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -1721,14 +1721,18 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu qindex numAmpsPerCol = powerOf2(qureg.numQubits); // allocate exponentially-big temporary memory (error if failed) - devints devQubits = qubits; // change for performance + vector devQubits = qubits; // change for performance devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws + QubitList_t qubits_dev; + std::copy(devQubits.begin(), devQubits.end(), qubits_dev.indices); + qubits_dev.length = devQubits.size(); + 
kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads, firstDiagInd, numAmpsPerCol, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(devQubits), devQubits.size() + qubits_dev ); // overwrite outProbs with GPU memory From 89b6951f746b4cc76eb54283450096384352dc63 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 22:11:24 +0000 Subject: [PATCH 17/19] update kernel_statevec_anyCtrlAnyTargDiagMatr_sub for performance --- quest/src/gpu/gpu_kernels.cuh | 6 +++--- quest/src/gpu/gpu_subroutines.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index bf4f0bcc..1d1b302b 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -528,7 +528,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, __grid_constant__ const ctrl_device_t ctrl, - int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, + int numCtrls, qindex ctrlStateMask, __grid_constant__ const QubitList_t targs, cu_qcomp* elems, cu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); @@ -546,7 +546,7 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( // use template params to compile-time unroll loops in insertBits() and getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); - SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, numTargs); + SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, targs.length); // j = nth local index where ctrls are active (in the specified states) qindex j = insertBitsWithMaskedValues(n, ctrl.ctrl_device, numCtrlBits, ctrlStateMask); @@ -555,7 +555,7 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( qindex i = concatenateBits(rank, j, logNumAmpsPerNode); // t = value of targeted bits, which may be in 
the prefix substate - qindex t = getValueOfBits(i, targs, numTargBits); + qindex t = getValueOfBits(i, targs.indices, numTargBits); cu_qcomp elem = elems[t]; diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 38193178..99359ae6 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -782,9 +782,13 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints deviceTargs = targs; // change for performance and standardise + vector deviceTargs = targs; // change for performance and standardise // devints deviceCtrls = util_getSorted(ctrls); + QubitList_t targs_dev; + std::copy(deviceTargs.begin(), deviceTargs.end(), targs_dev.indices); + targs_dev.length = deviceTargs.size(); + // removed implicit thrust mem copy vector sortedCtrls = util_getSorted(ctrls); @@ -802,7 +806,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, ctrl, - ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), + ctrls.size(), ctrlStateMask, targs_dev, toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent) ); From 8e24362820021895e4e8a822eb14aaec30e46c11 Mon Sep 17 00:00:00 2001 From: JPRichings Date: Sat, 9 May 2026 22:25:19 +0000 Subject: [PATCH 18/19] add const arg versions of bitwise functions --- quest/src/core/bitwise.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/quest/src/core/bitwise.hpp b/quest/src/core/bitwise.hpp index a797a1fa..9f567371 100644 --- a/quest/src/core/bitwise.hpp +++ b/quest/src/core/bitwise.hpp @@ -191,6 +191,17 @@ INLINE qindex setBits(qindex number, int* bitIndices, int numIndices, qindex bit return number; } +INLINE qindex setBits(qindex number, const int* 
bitIndices, int numIndices, qindex bitsValue) { + + // bitIndices are arbitrarily ordered, which does not affect number + for (int i=0; i Date: Sat, 9 May 2026 22:25:48 +0000 Subject: [PATCH 19/19] debug performance changes --- quest/src/gpu/gpu_kernels.cuh | 14 +++++++------- quest/src/gpu/gpu_subroutines.cpp | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 1d1b302b..02cfcb38 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -376,14 +376,14 @@ template __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( cu_qcomp* globalCache, cu_qcomp* amps, qindex numThreads, qindex numBatchesPerThread, - int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, - int* targs, int numTargBits, qindex numTargAmps, + __grid_constant__ const QubitList_t ctrlsAndTargs, qindex ctrlsAndTargsMask, + __grid_constant__ const QubitList_t targs, qindex numTargAmps, cu_qcomp* flatMatrElems ) { GET_THREAD_IND(t, numThreads); // NumCtrls might be compile-time known, but numTargBits>5 is always unknown/runtime - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTargs.length - targs.length); // unlike all other kernels, each thread modifies multiple batches of amplitudes for (qindex b=0; b ctrls, vector c qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); QubitList_t Qubits_dev; - std::copy(sorteQubits.begin(), sortedQubits.end(), Qubits_dev.indices); + std::copy(sortedQubits.begin(), sortedQubits.end(), Qubits_dev.indices); Qubits_dev.length = sortedQubits.size(); kernel_statevec_anyCtrlSwap_subC <<>> ( @@ -552,8 +552,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve <<>> ( toCuQcomps(cache), ampsPtr, numThreads, numBatchesPerThread, - qubits_dev, 
qubitStateMask, + targs_dev, powerOf2(targs.size()), matrPtr ); }