diff --git a/CMakeLists.txt b/CMakeLists.txt index 186971a2b3721614b85e0c86bd5ac20410aaf448..43526d8534cefb67b8ae7dfcfb41f571845b98a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,9 +70,9 @@ if (NEKTAR_USE_CUDA) Operators/DirBndCond/DirBndCondCUDA.cu Operators/NeuBndCond/NeuBndCondCUDA.cu #Operators/RobBndCond/RobBndCondCUDA.cu - #Operators/ConjGrad/ConjGradCUDA.cu - #Operators/FwdTrans/FwdTransCUDA.cu - #Operators/HelmSolve/HelmSolveCUDA.cu + Operators/ConjGrad/ConjGradCUDA.cu + Operators/FwdTrans/FwdTransCUDA.cu + Operators/HelmSolve/HelmSolveCUDA.cu Operators/Matrix/MatrixCUDA.cu ) endif() diff --git a/Operators/AssmbScatr/AssmbScatrCUDA.hpp b/Operators/AssmbScatr/AssmbScatrCUDA.hpp index 2aeee61909da1f9b4f8add1517e1003830c7be82..944e178d5495d39759f35a114e81ac25db924db9 100644 --- a/Operators/AssmbScatr/AssmbScatrCUDA.hpp +++ b/Operators/AssmbScatr/AssmbScatrCUDA.hpp @@ -3,6 +3,7 @@ #include "MemoryRegionCUDA.hpp" #include "Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh" #include "Operators/OperatorAssmbScatr.hpp" +#include "Operators/OperatorHelper.cuh" #include <MultiRegions/AssemblyMap/AssemblyMapCG.h> #include <MultiRegions/ContField.h> @@ -100,9 +101,8 @@ public: auto nElmts = in.GetBlocks()[block_idx].num_elements; auto ncoeff = expPtr->GetNcoeffs(); - // Deterime CUDA grid parameters. - m_gridSize = nElmts / m_blockSize; - m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); if (m_signChange) { @@ -144,9 +144,8 @@ public: auto nElmts = out.GetBlocks()[block_idx].num_elements; auto ncoeff = expPtr->GetNcoeffs(); - // Deterime CUDA grid parameters. - m_gridSize = nElmts / m_blockSize; - m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size.
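// The GetCUDAGridSize call below uses the ceiling-division helper added to
// Operators/OperatorHelper.cuh later in this diff:
//
//     static size_t GetCUDAGridSize(size_t ndata, size_t blockSize)
//     {
//         return (ndata + blockSize - 1) / blockSize; // ceil(ndata / blockSize)
//     }
//
// Because every kernel in this diff is rewritten as a grid-stride loop, any
// positive grid size is correct (hence the new m_gridSize = 1024 default);
// the helper merely sizes the launch so each thread covers one element. The
// pattern, with illustrative names that are not part of this change:
//
//     template <typename TData>
//     __global__ void scaleKernel(size_t n, TData alpha, TData *x)
//     {
//         for (size_t i = blockDim.x * blockIdx.x + threadIdx.x; i < n;
//              i += blockDim.x * gridDim.x)
//         {
//             x[i] *= alpha;
//         }
//     }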
+ m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); if (m_signChange) { @@ -188,7 +187,7 @@ protected: size_t m_nloc; size_t m_nglo; size_t m_ndir; - size_t m_gridSize; + size_t m_gridSize = 1024; size_t m_blockSize = 32; }; diff --git a/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh b/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh index e1cfacd8862a7dd0e7896bece9362dbf0b87db31..5112cf96d0006ea032619181a15beae1616e7b79 100644 --- a/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh +++ b/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh @@ -9,17 +9,16 @@ __global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt, { size_t e = blockDim.x * blockIdx.x + threadIdx.x; - if (e >= nelmt) + while (e < nelmt) { - return; - } - - size_t index = offset + e * ncoeff; - - for (size_t i = 0; i < ncoeff; i++) - { - atomicAdd(outptr + assmbptr[index + i], - signptr[index + i] * inptr[index + i]); + size_t index = offset + e * ncoeff; + + for (size_t i = 0; i < ncoeff; i++) + { + atomicAdd(outptr + assmbptr[index + i], + signptr[index + i] * inptr[index + i]); + } + e += blockDim.x * gridDim.x; } } @@ -31,16 +30,15 @@ __global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt, { size_t e = blockDim.x * blockIdx.x + threadIdx.x; - if (e >= nelmt) + while (e < nelmt) { - return; - } + size_t index = offset + e * ncoeff; - size_t index = offset + e * ncoeff; - - for (size_t i = 0; i < ncoeff; i++) - { - atomicAdd(outptr + assmbptr[index + i], sign * inptr[index + i]); + for (size_t i = 0; i < ncoeff; i++) + { + atomicAdd(outptr + assmbptr[index + i], sign * inptr[index + i]); + } + e += blockDim.x * gridDim.x; } } @@ -51,16 +49,15 @@ __global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt, { size_t e = blockDim.x * blockIdx.x + threadIdx.x; - if (e >= nelmt) + while (e < nelmt) { - return; - } + size_t index = offset + e * ncoeff; - size_t index = offset + e * ncoeff; - - for (size_t i = 0; i < ncoeff; i++) - { - atomicAdd(outptr + assmbptr[index + i], inptr[index + i]); + for (size_t i = 0; i < ncoeff; i++) + { + atomicAdd(outptr + assmbptr[index + i], inptr[index + i]); + } + e += blockDim.x * gridDim.x; } } @@ -72,16 +69,15 @@ __global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt, { size_t e = blockDim.x * blockIdx.x + threadIdx.x; - if (e >= nelmt) + while (e < nelmt) { - return; - } - - size_t index = offset + e * ncoeff; + size_t index = offset + e * ncoeff; - for (size_t i = 0; i < ncoeff; i++) - { - outptr[index + i] = signptr[index + i] * inptr[assmbptr[index + i]]; + for (size_t i = 0; i < ncoeff; i++) + { + outptr[index + i] = signptr[index + i] * inptr[assmbptr[index + i]]; + } + e += blockDim.x * gridDim.x; } } @@ -93,16 +89,15 @@ __global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt, { size_t e = blockDim.x * blockIdx.x + threadIdx.x; - if (e >= nelmt) + while (e < nelmt) { - return; - } - - size_t index = offset + e * ncoeff; + size_t index = offset + e * ncoeff; - for (size_t i = 0; i < ncoeff; i++) - { - outptr[index + i] = sign * inptr[assmbptr[index + i]]; + for (size_t i = 0; i < ncoeff; i++) + { + outptr[index + i] = sign * inptr[assmbptr[index + i]]; + } + e += blockDim.x * gridDim.x; } } @@ -113,16 +108,15 @@ __global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt, { size_t e = blockDim.x * blockIdx.x + threadIdx.x; - if (e >= nelmt) + while (e < nelmt) { - return; - } + size_t index = offset + e * ncoeff; - size_t index = offset + e * ncoeff; - - for (size_t i = 0; i < ncoeff; i++) - { - 
outptr[index + i] = inptr[assmbptr[index + i]]; + for (size_t i = 0; i < ncoeff; i++) + { + outptr[index + i] = inptr[assmbptr[index + i]]; + } + e += blockDim.x * gridDim.x; } } diff --git a/Operators/BwdTrans/BwdTransCUDA.hpp b/Operators/BwdTrans/BwdTransCUDA.hpp index cd25b8155752447ead6ccf56fc2a13c060932165..4cf7c9d69f0229abb7f8011695ed8804ed7c4b10 100644 --- a/Operators/BwdTrans/BwdTransCUDA.hpp +++ b/Operators/BwdTrans/BwdTransCUDA.hpp @@ -1,3 +1,5 @@ +#pragma once + #include "MemoryRegionCUDA.hpp" #include "Operators/BwdTrans/BwdTransCUDAKernels.cuh" #include "Operators/OperatorBwdTrans.hpp" @@ -90,9 +92,8 @@ public: auto nqTot = expPtr->GetTotPoints(); auto shape = expPtr->DetShapeType(); - // Deterime CUDA grid parameters. - m_gridSize = nElmts / m_blockSize; - m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); // Flag for collapsed coordinate correction. bool correct = expPtr->GetBasis(0)->GetBasisType() == diff --git a/Operators/ConjGrad/ConjGradCUDA.cu b/Operators/ConjGrad/ConjGradCUDA.cu new file mode 100644 index 0000000000000000000000000000000000000000..6d96e5b4722a00ea2d2ba0389212d6c071c363cc --- /dev/null +++ b/Operators/ConjGrad/ConjGradCUDA.cu @@ -0,0 +1,13 @@ +#include "ConjGradCUDA.hpp" + +namespace Nektar::Operators::detail +{ + +// Register implementation with Operator Factory +template <> +std::string OperatorConjGradImpl<double, ImplCUDA>::className = + GetOperatorFactory<double>().RegisterCreatorFunction( + "ConjGradCUDA", OperatorConjGradImpl<double, ImplCUDA>::instantiate, + "..."); + +} // namespace Nektar::Operators::detail diff --git a/Operators/ConjGrad/ConjGradCUDA.hpp b/Operators/ConjGrad/ConjGradCUDA.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cc679f280bc5c40a7d205d21b2003986895f7862 --- /dev/null +++ b/Operators/ConjGrad/ConjGradCUDA.hpp @@ -0,0 +1,253 @@ +#pragma once + +#include <algorithm> +#include <array> +#include <assert.h> +#include <cmath> +#include <memory> + +#include <LibUtilities/BasicUtils/SessionReader.h> +#include <LibUtilities/BasicUtils/Vmath.hpp> +#include <MultiRegions/ContField.h> + +#include "MemoryRegionCUDA.hpp" +#include "Operators/CUDAMathKernels.cuh" +#include "Operators/OperatorAssmbScatr.hpp" +#include "Operators/OperatorConjGrad.hpp" +#include "Operators/OperatorHelper.cuh" +#include "Operators/OperatorRobBndCond.hpp" + +using namespace Nektar; +using namespace Nektar::MultiRegions; + +namespace Nektar::Operators::detail +{ + +template <typename TData> +class OperatorConjGradImpl<TData, ImplCUDA> : public OperatorConjGrad<TData> +{ +public: + OperatorConjGradImpl(const MultiRegions::ExpListSharedPtr &expansionList) + : OperatorConjGrad<TData>(expansionList), + m_w_A(Field<TData, FieldState::Coeff>::template create< + MemoryRegionCUDA>( + GetBlockAttributes(FieldState::Coeff, expansionList))), + m_s_A(Field<TData, FieldState::Coeff>::template create< + MemoryRegionCUDA>( + GetBlockAttributes(FieldState::Coeff, expansionList))), + m_r_A(Field<TData, FieldState::Coeff>::template create< + MemoryRegionCUDA>( + GetBlockAttributes(FieldState::Coeff, expansionList))), + m_wk(Field<TData, FieldState::Coeff>::template create< + MemoryRegionCUDA>( + GetBlockAttributes(FieldState::Coeff, expansionList))) + { + auto contfield = + std::dynamic_pointer_cast<ContField>(this->m_expansionList); + contfield->GetSession()->LoadParameter("NekLinSysMaxIterations", + m_maxIter, 5000); +
contfield->GetSession()->LoadParameter("IterativeSolverTolerance", + m_tol, 1.0E-09); + m_nloc = contfield->GetLocalToGlobalMap()->GetNumLocalCoeffs(); + m_assmbScatr = AssmbScatr<TData>::create(this->m_expansionList, "CUDA"); + // m_robBndCond = RobBndCond<TData>::create(this->m_expansionList, + // "CUDA"); + cudaMalloc((void **)&m_p_A, sizeof(TData) * m_nloc); + cudaMalloc((void **)&m_q_A, sizeof(TData) * m_nloc); + cudaMalloc((void **)&m_vExchange, sizeof(TData) * 4); + + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(m_nloc, m_blockSize); + } + + ~OperatorConjGradImpl(void) + { + cudaFree(m_p_A); + cudaFree(m_q_A); + cudaFree(m_vExchange); + } + + void apply(Field<TData, FieldState::Coeff> &in, + Field<TData, FieldState::Coeff> &out) override + { + // Store pointers to temporary fields (device) + auto *p_in = in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *p_out = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *p_w_A = m_w_A.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *p_s_A = m_s_A.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *p_r_A = m_r_A.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *p_wk = m_wk.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto p_p_A = m_p_A; + auto p_q_A = m_q_A; + + // Set the fields to zero (device) + cudaMemset(p_out, 0, sizeof(TData) * m_nloc); + cudaMemset(p_w_A, 0, sizeof(TData) * m_nloc); + cudaMemset(p_s_A, 0, sizeof(TData) * m_nloc); + cudaMemset(p_wk, 0, sizeof(TData) * m_nloc); + cudaMemset(p_p_A, 0, sizeof(TData) * m_nloc); + cudaMemset(p_q_A, 0, sizeof(TData) * m_nloc); + + // Convergence parameters (host) + size_t totalIterations = 0; + TData rhsMagnitude; + TData alpha; + TData beta; + TData rho; + TData rho_new; + TData mu; + TData eps; + + // Temporary (device) + cudaMemset(m_vExchange, 0, sizeof(TData) * 4); + + // Copy RHS into initial residual + cudaMemcpy(p_r_A, p_in, sizeof(TData) * m_nloc, + cudaMemcpyDeviceToDevice); + + // Assembly (communication) + m_assmbScatr->apply(m_r_A, m_wk, true); + dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>( + m_nloc, p_wk, p_r_A, m_vExchange + 2); + cudaMemcpy(&eps, m_vExchange + 2, sizeof(TData), + cudaMemcpyDeviceToHost); + + // Calculate rhs magnitude + m_assmbScatr->apply(m_r_A, m_wk); + dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>( + m_nloc, p_in, p_wk, m_vExchange + 3); + cudaMemcpy(&rhsMagnitude, m_vExchange + 3, sizeof(TData), + cudaMemcpyDeviceToHost); + rhsMagnitude = (rhsMagnitude > 1.0e-6) ? rhsMagnitude : 1.0; + + // If input residual is less than tolerance skip solve. + if (eps < m_tol * m_tol * rhsMagnitude) + { + return; + } + + // Apply preconditioner + this->m_precon->apply(m_r_A, m_w_A); + + // Perform the method-specific matrix-vector multiply operation.
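// dotKernel and daxpyKernel come from Operators/CUDAMathKernels.cuh, which is
// not part of this diff. Minimal sketches consistent with the launches in this
// file -- dotKernel receives blockSize * sizeof(TData) of dynamic shared memory
// and accumulates into a device scalar that is zeroed beforehand, so a
// shared-memory block reduction finished by atomicAdd fits; daxpyKernel
// computes out = a * x + y elementwise -- are given below. These are
// assumptions, not the actual implementations.
//
//     template <typename TData>
//     __global__ void dotKernel(size_t n, const TData *x, const TData *y,
//                               TData *out)
//     {
//         extern __shared__ unsigned char smem[];
//         TData *cache = reinterpret_cast<TData *>(smem);
//         TData sum = 0;
//         for (size_t i = blockDim.x * blockIdx.x + threadIdx.x; i < n;
//              i += blockDim.x * gridDim.x)
//         {
//             sum += x[i] * y[i];
//         }
//         cache[threadIdx.x] = sum;
//         __syncthreads();
//         // Tree reduction; assumes a power-of-two block size (32 here).
//         for (size_t s = blockDim.x / 2; s > 0; s >>= 1)
//         {
//             if (threadIdx.x < s)
//             {
//                 cache[threadIdx.x] += cache[threadIdx.x + s];
//             }
//             __syncthreads();
//         }
//         if (threadIdx.x == 0)
//         {
//             atomicAdd(out, cache[0]); // double atomicAdd needs sm_60+
//         }
//     }
//
//     template <typename TData>
//     __global__ void daxpyKernel(size_t n, TData a, const TData *x,
//                                 const TData *y, TData *out)
//     {
//         for (size_t i = blockDim.x * blockIdx.x + threadIdx.x; i < n;
//              i += blockDim.x * gridDim.x)
//         {
//             out[i] = a * x[i] + y[i]; // matches the call sites: p = beta*p + w, etc.
//         }
//     }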
+ this->m_LHS->apply(m_w_A, m_s_A); + + // Apply Robin BCs + // m_robBndCond->apply(m_w_A, m_s_A); + + cudaMemset(m_vExchange, 0, sizeof(TData) * 3); + + dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>( + m_nloc, p_r_A, p_w_A, m_vExchange + 0); + + dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>( + m_nloc, p_s_A, p_w_A, m_vExchange + 1); + + cudaMemcpy(&rho, m_vExchange + 0, sizeof(TData), + cudaMemcpyDeviceToHost); + cudaMemcpy(&mu, m_vExchange + 1, sizeof(TData), cudaMemcpyDeviceToHost); + beta = 0.0; + alpha = rho / mu; + totalIterations = 1; + while (true) + { + if (totalIterations > m_maxIter) + { + std::cout << "Exceeded max iterations\n"; + return; + } + + // Compute new search direction p_k + daxpyKernel<<<m_gridSize, m_blockSize>>>(m_nloc, beta, p_p_A, p_w_A, + p_p_A); + + // Compute new search direction q_k + daxpyKernel<<<m_gridSize, m_blockSize>>>(m_nloc, beta, p_q_A, p_s_A, + p_q_A); + + // Update solution x_{k+1} + daxpyKernel<<<m_gridSize, m_blockSize>>>(m_nloc, alpha, p_p_A, + p_out, p_out); + + // Update residual vector r_{k+1} + daxpyKernel<<<m_gridSize, m_blockSize>>>(m_nloc, -alpha, p_q_A, + p_r_A, p_r_A); + + // Apply preconditioner + this->m_precon->apply(m_r_A, m_w_A); + + // Perform the method-specific matrix-vector multiply operation. + this->m_LHS->apply(m_w_A, m_s_A); + + // Apply Robin BCs + // m_robBndCond->apply(m_w_A, m_s_A); + + cudaMemset(m_vExchange, 0, sizeof(TData) * 3); + + // <r_{k+1}, w_{k+1}> + dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>( + m_nloc, p_r_A, p_w_A, m_vExchange + 0); + + // <s_{k+1}, w_{k+1}> + dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>( + m_nloc, p_s_A, p_w_A, m_vExchange + 1); + + // <r_{k+1}, r_{k+1}> + m_assmbScatr->apply(m_r_A, m_wk, true); + dotKernel<<<m_gridSize, m_blockSize, sizeof(TData) * m_blockSize>>>( + m_nloc, p_wk, p_r_A, m_vExchange + 2); + + cudaMemcpy(&rho_new, m_vExchange + 0, sizeof(TData), + cudaMemcpyDeviceToHost); + cudaMemcpy(&mu, m_vExchange + 1, sizeof(TData), + cudaMemcpyDeviceToHost); + cudaMemcpy(&eps, m_vExchange + 2, sizeof(TData), + cudaMemcpyDeviceToHost); + + std::cout << "Iteration " << totalIterations << " -- eps = " << eps + << "\n"; + + totalIterations++; + + // Test if norm is within tolerance + if (eps < m_tol * m_tol * rhsMagnitude) + { + break; + } + + // Compute search direction and solution coefficients + beta = rho_new / rho; + alpha = rho_new / (mu - rho_new * beta / alpha); + rho = rho_new; + } + } + + // instantiation function for CreatorFunction in Operator Factory + static std::unique_ptr<Operator<TData>> instantiate( + const MultiRegions::ExpListSharedPtr &expansionList) + { + return std::make_unique<OperatorConjGradImpl<TData, ImplCUDA>>( + expansionList); + } + + // className - for OperatorFactory + static std::string className; + +protected: + std::shared_ptr<OperatorAssmbScatr<TData>> m_assmbScatr; + std::shared_ptr<OperatorRobBndCond<TData>> m_robBndCond; + Field<TData, FieldState::Coeff> m_w_A; + Field<TData, FieldState::Coeff> m_s_A; + Field<TData, FieldState::Coeff> m_r_A; + Field<TData, FieldState::Coeff> m_wk; + TData *m_q_A; + TData *m_p_A; + TData *m_vExchange; + TData m_tol; + size_t m_nloc; + size_t m_maxIter; + size_t m_gridSize = 1024; + size_t m_blockSize = 32; +}; + +} // namespace Nektar::Operators::detail diff --git a/Operators/ConjGrad/ConjGradStdMat.hpp b/Operators/ConjGrad/ConjGradStdMat.hpp index 
b2eac770e2d0bd7179522aa324b4f18cb3353049..d99870f1d7a81143367d0c0ed316ae25bdd00f79 100644 --- a/Operators/ConjGrad/ConjGradStdMat.hpp +++ b/Operators/ConjGrad/ConjGradStdMat.hpp @@ -1,4 +1,4 @@ -#include "Operators/OperatorConjGrad.hpp" +#pragma once #include <algorithm> #include <array> @@ -11,6 +11,7 @@ #include <MultiRegions/ContField.h> #include "Operators/OperatorAssmbScatr.hpp" +#include "Operators/OperatorConjGrad.hpp" #include "Operators/OperatorRobBndCond.hpp" using namespace Nektar; @@ -40,22 +41,19 @@ public: m_maxIter, 5000); contfield->GetSession()->LoadParameter("IterativeSolverTolerance", m_tol, 1.0E-09); - m_assmbMap = contfield->GetLocalToGlobalMap(); + m_nloc = contfield->GetLocalToGlobalMap()->GetNumLocalCoeffs(); m_assmbScatr = AssmbScatr<TData>::create(this->m_expansionList); m_robBndCond = RobBndCond<TData>::create(this->m_expansionList); - m_p_A = std::make_unique<TData[]>(m_assmbMap->GetNumLocalCoeffs()); - m_q_A = std::make_unique<TData[]>(m_assmbMap->GetNumLocalCoeffs()); + m_p_A = std::make_unique<TData[]>(m_nloc); + m_q_A = std::make_unique<TData[]>(m_nloc); } void apply(Field<TData, FieldState::Coeff> &in, Field<TData, FieldState::Coeff> &out) override { - // get number of local coeffs (=size of in/out fields) - size_t nloc = m_assmbMap->GetNumLocalCoeffs(); - - // store pointers to temporary fields - Array<OneD, TData> inArr(nloc, 0.0); - Array<OneD, TData> outArr(nloc, 0.0); + // Store pointers to temporary fields + Array<OneD, TData> inArr(m_nloc, 0.0); + Array<OneD, TData> outArr(m_nloc, 0.0); auto *p_in = inArr.get(); auto *p_out = outArr.get(); auto *p_w_A = m_w_A.GetStorage().GetCPUPtr(); @@ -65,13 +63,14 @@ public: auto *p_p_A = m_p_A.get(); auto *p_q_A = m_q_A.get(); - // set the fields to zero - std::fill(p_w_A, p_w_A + nloc, 0.0); - std::fill(p_s_A, p_s_A + nloc, 0.0); - std::fill(p_wk, p_wk + nloc, 0.0); - std::fill(p_p_A, p_p_A + nloc, 0.0); - std::fill(p_q_A, p_q_A + nloc, 0.0); + // Set the fields to zero + std::fill(p_w_A, p_w_A + m_nloc, 0.0); + std::fill(p_s_A, p_s_A + m_nloc, 0.0); + std::fill(p_wk, p_wk + m_nloc, 0.0); + std::fill(p_p_A, p_p_A + m_nloc, 0.0); + std::fill(p_q_A, p_q_A + m_nloc, 0.0); + // Convergence parameters size_t totalIterations = 0; TData rhsMagnitude; TData alpha; @@ -99,21 +98,21 @@ public: inptr += nSize; } - // copy RHS into initial residual - std::copy(p_in, p_in + nloc, p_r_A); + // Copy RHS into initial residual + std::copy(p_in, p_in + m_nloc, p_r_A); // Assembly (communication) m_assmbScatr->apply(m_r_A, m_wk, true); - vExchange[2] = std::inner_product(p_wk, p_wk + nloc, p_r_A, 0.0); + vExchange[2] = std::inner_product(p_wk, p_wk + m_nloc, p_r_A, 0.0); // Perform inner-product exchanges // m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum); eps = vExchange[2]; - // calculate rhs magnitude + // Calculate rhs magnitude m_assmbScatr->apply(m_r_A, m_wk); - rhsMagnitude = std::inner_product(p_in, p_in + nloc, p_wk, 0.0); + rhsMagnitude = std::inner_product(p_in, p_in + m_nloc, p_wk, 0.0); // m_rowComm->AllReduce(rhsMagnitude, Nektar::LibUtilities::ReduceSum); rhsMagnitude = (rhsMagnitude > 1.0e-6) ? rhsMagnitude : 1.0; @@ -124,16 +123,16 @@ public: } // Apply preconditioner - m_precon->apply(m_r_A, m_w_A); + this->m_precon->apply(m_r_A, m_w_A); // Perform the method-specific matrix-vector multiply operation. 
- m_LHS->apply(m_w_A, m_s_A); + this->m_LHS->apply(m_w_A, m_s_A); // Apply Robin BCs m_robBndCond->apply(m_w_A, m_s_A); - vExchange[0] = std::inner_product(p_r_A, p_r_A + nloc, p_w_A, 0.0); - vExchange[1] = std::inner_product(p_s_A, p_s_A + nloc, p_w_A, 0.0); + vExchange[0] = std::inner_product(p_r_A, p_r_A + m_nloc, p_w_A, 0.0); + vExchange[1] = std::inner_product(p_s_A, p_s_A + m_nloc, p_w_A, 0.0); // m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum); @@ -142,7 +141,6 @@ public: beta = 0.0; alpha = rho / mu; totalIterations = 1; - while (true) { if (totalIterations > m_maxIter) @@ -151,46 +149,46 @@ public: return; } - // Compute new search direction p_k, q_k - std::transform(p_p_A, p_p_A + nloc, p_w_A, p_p_A, - [&beta](const TData &pElem, const TData &wElem) { - return beta * pElem + wElem; - }); - std::transform(p_q_A, p_q_A + nloc, p_s_A, p_q_A, - [&beta](const TData &qElem, const TData &sElem) { - return beta * qElem + sElem; - }); + // Compute new search direction p_k + std::transform(p_p_A, p_p_A + m_nloc, p_w_A, p_p_A, + [&beta](const TData &pElem, const TData &wElem) + { return beta * pElem + wElem; }); + + // Compute new search direction q_k + std::transform(p_q_A, p_q_A + m_nloc, p_s_A, p_q_A, + [&beta](const TData &qElem, const TData &sElem) + { return beta * qElem + sElem; }); // Update solution x_{k+1} - std::transform(p_p_A, p_p_A + nloc, p_out, p_out, - [&alpha](const TData &pElem, const TData &xElem) { - return alpha * pElem + xElem; - }); + std::transform(p_p_A, p_p_A + m_nloc, p_out, p_out, + [&alpha](const TData &pElem, const TData &xElem) + { return alpha * pElem + xElem; }); // Update residual vector r_{k+1} - std::transform(p_q_A, p_q_A + nloc, p_r_A, p_r_A, - [&alpha](const TData &qElem, const TData &rElem) { - return -alpha * qElem + rElem; - }); + std::transform(p_q_A, p_q_A + m_nloc, p_r_A, p_r_A, + [&alpha](const TData &qElem, const TData &rElem) + { return -alpha * qElem + rElem; }); // Apply preconditioner - m_precon->apply(m_r_A, m_w_A); + this->m_precon->apply(m_r_A, m_w_A); // Perform the method-specific matrix-vector multiply operation. 
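// For reference, the fused coefficient update used by both this and the CUDA
// implementation, alpha = rho_new / (mu - rho_new * beta / alpha), follows in
// exact arithmetic from the PCG recurrences. With w = M^{-1} r, s = A w,
// rho = <r, w>, mu = <s, w> and p' = w' + beta * p where beta = rho' / rho:
//
//     <p', A p'> = <w', A w'> + 2 beta <w', A p> + beta^2 <p, A p>
//                = mu' - 2 beta rho' / alpha + beta rho' / alpha
//                = mu' - rho' * beta / alpha,
//
// using A p = (r - r') / alpha, <w', r'> = rho', <w', r> = 0 and
// <p, A p> = rho / alpha. Hence alpha' = rho' / (mu' - rho' * beta / alpha),
// which avoids a third inner product per iteration.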
- m_LHS->apply(m_w_A, m_s_A); + this->m_LHS->apply(m_w_A, m_s_A); // Apply Robin BCs m_robBndCond->apply(m_w_A, m_s_A); // <r_{k+1}, w_{k+1}> - vExchange[0] = std::inner_product(p_r_A, p_r_A + nloc, p_w_A, 0.0); + vExchange[0] = + std::inner_product(p_r_A, p_r_A + m_nloc, p_w_A, 0.0); // <s_{k+1}, w_{k+1}> - vExchange[1] = std::inner_product(p_s_A, p_s_A + nloc, p_w_A, 0.0); + vExchange[1] = + std::inner_product(p_s_A, p_s_A + m_nloc, p_w_A, 0.0); // <r_{k+1}, r_{k+1}> m_assmbScatr->apply(m_r_A, m_wk, true); - vExchange[2] = std::inner_product(p_wk, p_wk + nloc, p_r_A, 0.0); + vExchange[2] = std::inner_product(p_wk, p_wk + m_nloc, p_r_A, 0.0); // Perform inner-product exchanges // m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum); @@ -234,19 +232,6 @@ } } - void setLHS(const std::shared_ptr<OperatorLinear<TData, FieldState::Coeff, - FieldState::Coeff>> &ptr) - { - m_LHS = ptr; - } - - void setPrecon( - const std::shared_ptr< - OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> &ptr) - { - m_precon = ptr; - } - // instantiation function for CreatorFunction in Operator Factory static std::unique_ptr<Operator<TData>> instantiate( const MultiRegions::ExpListSharedPtr &expansionList) @@ -261,22 +246,14 @@ public: protected: std::shared_ptr<OperatorAssmbScatr<TData>> m_assmbScatr; std::shared_ptr<OperatorRobBndCond<TData>> m_robBndCond; - std::shared_ptr<OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> - m_LHS; - std::shared_ptr<OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> - m_precon; - AssemblyMapCGSharedPtr m_assmbMap; Field<TData, FieldState::Coeff> m_w_A; Field<TData, FieldState::Coeff> m_s_A; Field<TData, FieldState::Coeff> m_r_A; Field<TData, FieldState::Coeff> m_wk; - std::unique_ptr<TData[]> m_w_A_glo; - std::unique_ptr<TData[]> m_r_A_glo; std::unique_ptr<TData[]> m_q_A; std::unique_ptr<TData[]> m_p_A; - Array<OneD, TData> m_local; - Array<OneD, TData> m_global; TData m_tol; + size_t m_nloc; size_t m_maxIter; }; diff --git a/Operators/DiagPrecon/DiagPreconCUDA.hpp b/Operators/DiagPrecon/DiagPreconCUDA.hpp index 1475153ab1bdd8db3c0ff374059017c1aa9bf0f0..d952f9b659668ab9b6f0abf32a498213ca5363d2 100644 --- a/Operators/DiagPrecon/DiagPreconCUDA.hpp +++ b/Operators/DiagPrecon/DiagPreconCUDA.hpp @@ -2,7 +2,7 @@ #include "Field.hpp" #include "Operators/AssmbScatr/AssmbScatrCUDA.hpp" -#include "Operators/DiagPrecon/DiagPreconCUDAKernels.cuh" +#include "Operators/CUDAMathKernels.cuh" #include "Operators/OperatorDiagPrecon.hpp" #include "Operators/OperatorRobBndCond.hpp" @@ -41,9 +41,8 @@ public: std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplCUDA>>( AssmbScatr<TData>::create(this->m_expansionList, "CUDA")); - // Determine CUDA grid parameters. - m_gridSize = m_nGlobal / m_blockSize; - m_gridSize += (m_nGlobal % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size.
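// vdivKernel replaces the DiagPreconKernel deleted below; it lives in
// Operators/CUDAMathKernels.cuh, which is not shown in this diff. A sketch
// consistent with its call site (elementwise out[i] = in1[i] / in2[i] over the
// non-Dirichlet range, grid-stride) might be:
//
//     template <typename TData>
//     __global__ void vdivKernel(size_t n, const TData *in1, const TData *in2,
//                                TData *out)
//     {
//         for (size_t i = blockDim.x * blockIdx.x + threadIdx.x; i < n;
//              i += blockDim.x * gridDim.x)
//         {
//             out[i] = in1[i] / in2[i];
//         }
//     }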
+ m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize); } ~OperatorDiagPreconImpl(void) @@ -57,8 +56,8 @@ public: { m_assmbScatr->Assemble(in, m_wk); - DiagPreconKernel<<<m_gridSize, m_blockSize>>>(m_nGlobal, m_nDir, m_diag, - m_wk); + vdivKernel<<<m_gridSize, m_blockSize>>>( + m_nGlobal - m_nDir, m_wk + m_nDir, m_diag + m_nDir, m_wk + m_nDir); cudaMemset(m_wk, 0, sizeof(TData) * m_nDir); @@ -69,38 +68,43 @@ public: const std::shared_ptr< OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> &op) { - auto robBCOp = RobBndCond<TData>::create(this->m_expansionList); + // auto robBCOp = RobBndCond<TData>::create(this->m_expansionList, + // "CUDA"); Array<OneD, TData> diag(m_nLocal); // create unit vector field to extract diagonal Field<TData, FieldState::Coeff> unit_vec = - Field<TData, FieldState::Coeff>::create( + Field<TData, FieldState::Coeff>::template create<MemoryRegionCUDA>( GetBlockAttributes(FieldState::Coeff, this->m_expansionList)); // create action field to receive column action from unit vector Field<TData, FieldState::Coeff> action = - Field<TData, FieldState::Coeff>::create( + Field<TData, FieldState::Coeff>::template create<MemoryRegionCUDA>( GetBlockAttributes(FieldState::Coeff, this->m_expansionList)); - auto *uvec_ptr = unit_vec.GetStorage().GetCPUPtr(); - auto *actn_ptr = action.GetStorage().GetCPUPtr(); - auto *diag_ptr = diag.get(); + auto *uvec_ptr = + unit_vec.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *actn_ptr = + action.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + TData init = 1.0; for (size_t i = 0; i < m_nLocal; ++i) { // set ith term in unit vector to be 1 - *uvec_ptr = 1.0; + cudaMemcpy(uvec_ptr + i, &init, sizeof(TData), + cudaMemcpyHostToDevice); // apply operator to unit vector and store in action field op->apply(unit_vec, action); - robBCOp->apply(unit_vec, action); + // robBCOp->apply(unit_vec, action); // copy ith row term from the action field to get ith diagonal - *(diag_ptr++) = *(actn_ptr++); + cudaMemcpy(diag.get() + i, actn_ptr + i, sizeof(TData), + cudaMemcpyDeviceToHost); // reset ith term in unit vector to be 0 - *(uvec_ptr++) = 0.0; + cudaMemset(uvec_ptr + i, 0, sizeof(TData)); } // Assembly @@ -135,7 +139,7 @@ protected: size_t m_nGlobal; size_t m_nLocal; size_t m_nDir; - size_t m_gridSize; + size_t m_gridSize = 1024; size_t m_blockSize = 32; }; diff --git a/Operators/DiagPrecon/DiagPreconCUDAKernels.cuh b/Operators/DiagPrecon/DiagPreconCUDAKernels.cuh deleted file mode 100644 index 05b9fbe3a47295de27670790f9820ec7527e295b..0000000000000000000000000000000000000000 --- a/Operators/DiagPrecon/DiagPreconCUDAKernels.cuh +++ /dev/null @@ -1,18 +0,0 @@ -namespace Nektar::Operators::detail -{ - -template <typename TData> -__global__ void DiagPreconKernel(const size_t nGlobal, const size_t nDir, - const TData *diagptr, TData *inoutptr) -{ - size_t i = blockDim.x * blockIdx.x + threadIdx.x; - - if (i < nDir || i >= nGlobal) - { - return; - } - - inoutptr[i] /= diagptr[i]; -} - -} // namespace Nektar::Operators::detail diff --git a/Operators/DirBndCond/DirBndCondCUDA.hpp b/Operators/DirBndCond/DirBndCondCUDA.hpp index 8cd9ee0be4ad595ea1608c3254169bcda46d68c2..a57d3ed4c1aefa8e062c3355b46c82053aa003f4 100644 --- a/Operators/DirBndCond/DirBndCondCUDA.hpp +++ b/Operators/DirBndCond/DirBndCondCUDA.hpp @@ -3,6 +3,7 @@ #include "MemoryRegionCUDA.hpp" #include "Operators/DirBndCond/DirBndCondCUDAKernels.cuh" #include "Operators/OperatorDirBndCond.hpp" +#include "Operators/OperatorHelper.cuh" #include 
<MultiRegions/AssemblyMap/AssemblyMapCG.h> #include <MultiRegions/ContField.h> @@ -133,9 +134,8 @@ public: // Copy memory to GPU, if necessary and get raw pointers. auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - // Deterime CUDA grid parameters. - m_gridSize = m_bndExpSize / m_blockSize; - m_gridSize += (m_bndExpSize % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(m_bndExpSize, m_blockSize); if (m_signChange) { @@ -168,9 +168,9 @@ // outarr[it] *= -1; // } - // Deterime CUDA grid parameters. - m_gridSize = m_localDirSize / m_blockSize; - m_gridSize += (m_localDirSize % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(m_localDirSize, m_blockSize); + LocalDirBndCondKernel<TData><<<m_gridSize, m_blockSize>>>( m_localDirSize, m_locid0, m_locid1, m_locsign, outptr); } @@ -199,7 +199,7 @@ protected: int *m_locid0; int *m_locid1; TData *m_locsign; - size_t m_gridSize; + size_t m_gridSize = 1024; size_t m_blockSize = 32; }; diff --git a/Operators/DirBndCond/DirBndCondCUDAKernels.cuh b/Operators/DirBndCond/DirBndCondCUDAKernels.cuh index 879444cb9e5380a484c7047011faa6801d9563ad..be9ac0a9a933fb6664f073d1e6a94367ff6100c2 100644 --- a/Operators/DirBndCond/DirBndCondCUDAKernels.cuh +++ b/Operators/DirBndCond/DirBndCondCUDAKernels.cuh @@ -14,19 +14,18 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr, { size_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= nsize) + while (i < nsize) { - return; - } - - if (bctypeptr[i] == eDirichlet) - { - size_t offset = offsetptr[i]; - size_t ncoeff = ncoeffptr[i]; - for (size_t j = 0; j < ncoeff; j++) + if (bctypeptr[i] == eDirichlet) { - outptr[mapptr[offset + j]] = inptr[offset + j]; + size_t offset = offsetptr[i]; + size_t ncoeff = ncoeffptr[i]; + for (size_t j = 0; j < ncoeff; j++) + { + outptr[mapptr[offset + j]] = inptr[offset + j]; + } } + i += blockDim.x * gridDim.x; } } @@ -39,20 +38,19 @@ __global__ void DirBndCondKernel(const size_t nsize, const int *offsetptr, { size_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= nsize) + while (i < nsize) { - return; - } - - if (bctypeptr[i] == eDirichlet) - { - size_t offset = offsetptr[i]; - size_t ncoeff = ncoeffptr[i]; - for (size_t j = 0; j < ncoeff; j++) + if (bctypeptr[i] == eDirichlet) { - outptr[mapptr[offset + j]] = - signptr[offset + j] * inptr[offset + j]; + size_t offset = offsetptr[i]; + size_t ncoeff = ncoeffptr[i]; + for (size_t j = 0; j < ncoeff; j++) + { + outptr[mapptr[offset + j]] = + signptr[offset + j] * inptr[offset + j]; + } } + i += blockDim.x * gridDim.x; } } @@ -63,12 +61,11 @@ __global__ void LocalDirBndCondKernel(const size_t nsize, const int *id0ptr, { size_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= nsize) + while (i < nsize) { - return; + outptr[id0ptr[i]] = outptr[id1ptr[i]] * signptr[i]; + i += blockDim.x * gridDim.x; } - - outptr[id0ptr[i]] = outptr[id1ptr[i]] * signptr[i]; } } // namespace Nektar::Operators::detail diff --git a/Operators/FwdTrans/FwdTransCUDA.cu b/Operators/FwdTrans/FwdTransCUDA.cu new file mode 100644 index 0000000000000000000000000000000000000000..163fc9a5dc02c835e84d1545418635dddf216d69 --- /dev/null +++ b/Operators/FwdTrans/FwdTransCUDA.cu @@ -0,0 +1,12 @@ +#include "FwdTransCUDA.hpp" + +namespace Nektar::Operators::detail +{ + +// Register implementation with Operator Factory +template <> +std::string OperatorFwdTransImpl<double, ImplCUDA>::className = +
GetOperatorFactory<double>().RegisterCreatorFunction( + "FwdTransCUDA", OperatorFwdTransImpl<double, ImplCUDA>::instantiate, + "..."); +} // namespace Nektar::Operators::detail diff --git a/Operators/FwdTrans/FwdTransCUDA.hpp b/Operators/FwdTrans/FwdTransCUDA.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7b583c7fc879f85439455331b0d593eda6c599c7 --- /dev/null +++ b/Operators/FwdTrans/FwdTransCUDA.hpp @@ -0,0 +1,103 @@ +#pragma once + +#include "MemoryRegionCUDA.hpp" +#include "Operators/CUDAMathKernels.cuh" +#include "Operators/OperatorConjGrad.hpp" +#include "Operators/OperatorDirBndCond.hpp" +#include "Operators/OperatorFwdTrans.hpp" +#include "Operators/OperatorHelper.cuh" +#include "Operators/OperatorIProductWRTBase.hpp" +#include "Operators/OperatorMass.hpp" +#include "Operators/OperatorPrecon.hpp" +#include "Operators/OperatorRobBndCond.hpp" + +using vec_t = tinysimd::simd<double>; + +namespace Nektar::Operators::detail +{ + +template <typename TData> +class OperatorFwdTransImpl<TData, ImplCUDA> : public OperatorFwdTrans<TData> +{ +public: + OperatorFwdTransImpl(const MultiRegions::ExpListSharedPtr &expansionList) + : OperatorFwdTrans<TData>(expansionList), + m_rhs(Field<TData, FieldState::Coeff>::template create< + MemoryRegionCUDA>( + GetBlockAttributes(FieldState::Coeff, expansionList))), + m_tmp(Field<TData, FieldState::Coeff>::template create< + MemoryRegionCUDA>( + GetBlockAttributes(FieldState::Coeff, expansionList))) + { + m_MassOp = Mass<TData>::create(this->m_expansionList, "CUDA"); + m_DirBCOp = DirBndCond<TData>::create(this->m_expansionList, "CUDA"); + // m_RobBCOp = RobBndCond<TData>::create(this->m_expansionList, "CUDA"); + m_IProdOp = + IProductWRTBase<TData>::create(this->m_expansionList, "CUDA"); + m_CGOp = ConjGrad<TData>::create(this->m_expansionList, "CUDA"); + m_CGOp->setLHS(m_MassOp); + } + + void apply(Field<TData, FieldState::Phys> &in, + Field<TData, FieldState::Coeff> &out) override + { + size_t nloc = out.template GetStorage<MemoryRegionCUDA>().size(); + auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *rhsptr = + m_rhs.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *tmpptr = + m_tmp.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + + // Determine CUDA grid size.
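// subKernel and addKernel (and negKernel in HelmSolveCUDA.hpp below) are
// further elementwise helpers from Operators/CUDAMathKernels.cuh, not shown in
// this diff. A sketch consistent with their call sites, where the output may
// alias an input, might be:
//
//     template <typename TData>
//     __global__ void subKernel(size_t n, const TData *x, const TData *y,
//                               TData *out)
//     {
//         for (size_t i = blockDim.x * blockIdx.x + threadIdx.x; i < n;
//              i += blockDim.x * gridDim.x)
//         {
//             out[i] = x[i] - y[i];
//         }
//     }
//
// with addKernel (out[i] = x[i] + y[i]) and negKernel (out[i] = -x[i])
// following the same pattern.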
+ m_gridSize = GetCUDAGridSize(nloc, m_blockSize); + + // IProductWRT of RHS + m_IProdOp->apply(in, m_rhs); + + // Handle Dirichlet BCs + m_DirBCOp->apply(out); + m_MassOp->apply(out, m_tmp); + subKernel<<<m_gridSize, m_blockSize>>>(nloc, rhsptr, tmpptr, rhsptr); + + // Handle Robin BCs + // m_RobBCOp->apply(out, m_rhs, true); + + // Solve for u_hat using Conjugate Gradient + m_CGOp->apply(m_rhs, m_tmp); + + // Add Dirichlet BCs + addKernel<<<m_gridSize, m_blockSize>>>(nloc, outptr, tmpptr, outptr); + } + + void setPrecon( + const std::shared_ptr<OperatorPrecon<TData>> &precon) override + { + precon->configure(m_MassOp); + + m_CGOp->setPrecon(precon); + } + + // instantiation function for CreatorFunction in OperatorFactory + static std::unique_ptr<Operator<TData>> instantiate( + const MultiRegions::ExpListSharedPtr &expansionList) + { + return std::make_unique<OperatorFwdTransImpl<TData, ImplCUDA>>( + expansionList); + } + + // className - for OperatorFactory + static std::string className; + +protected: + std::shared_ptr<OperatorIProductWRTBase<TData>> m_IProdOp; + std::shared_ptr<OperatorDirBndCond<TData>> m_DirBCOp; + std::shared_ptr<OperatorRobBndCond<TData>> m_RobBCOp; + std::shared_ptr<OperatorMass<TData>> m_MassOp; + std::shared_ptr<OperatorConjGrad<TData>> m_CGOp; + Field<TData, FieldState::Coeff> m_rhs; + Field<TData, FieldState::Coeff> m_tmp; + size_t m_gridSize = 1024; + size_t m_blockSize = 32; +}; + +} // namespace Nektar::Operators::detail diff --git a/Operators/FwdTrans/FwdTransStdMat.hpp b/Operators/FwdTrans/FwdTransStdMat.hpp index dc43cffc1d82bd4ad6543c481b7a0e68dc581eab..37a8a2bdf0e4708e81f78a6773d3d96f27e69d30 100644 --- a/Operators/FwdTrans/FwdTransStdMat.hpp +++ b/Operators/FwdTrans/FwdTransStdMat.hpp @@ -7,7 +7,6 @@ #include "Operators/OperatorMass.hpp" #include "Operators/OperatorPrecon.hpp" #include "Operators/OperatorRobBndCond.hpp" -#include <StdRegions/StdExpansion.h> using vec_t = tinysimd::simd<double>; @@ -24,7 +23,7 @@ public: GetBlockAttributes(FieldState::Coeff, expansionList, vec_t::width), 1, vec_t::alignment)), - m_dir(Field<TData, FieldState::Coeff>::create( + m_tmp(Field<TData, FieldState::Coeff>::create( GetBlockAttributes(FieldState::Coeff, expansionList, vec_t::width), 1, vec_t::alignment)) @@ -43,36 +42,36 @@ public: size_t nloc = out.GetStorage().size(); auto *outptr = out.GetStorage().GetCPUPtr(); auto *rhsptr = m_rhs.GetStorage().GetCPUPtr(); - auto *dirptr = m_dir.GetStorage().GetCPUPtr(); + auto *tmpptr = m_tmp.GetStorage().GetCPUPtr(); // IProductWRT of RHS m_IProdOp->apply(in, m_rhs); // Handle Dirichlet BCs - m_DirBCOp->apply(m_dir); - m_MassOp->apply(m_dir, out); // use out as temporary storage - std::transform( - rhsptr, rhsptr + nloc, outptr, rhsptr, - [](const TData &rhs, const TData &dir) { return rhs - dir; }); + m_DirBCOp->apply(out); + m_MassOp->apply(out, m_tmp); + std::transform(rhsptr, rhsptr + nloc, tmpptr, rhsptr, + [](const TData &rhs, const TData &dir) + { return rhs - dir; }); // Handle Robin BCs - m_RobBCOp->apply(m_dir, m_rhs, true); + m_RobBCOp->apply(out, m_rhs, true); // Solve for u_hat using Conjugate Gradient - m_CGOp->apply(m_rhs, out); + m_CGOp->apply(m_rhs, m_tmp); // Add Dirichlet BCs - std::transform( - outptr, outptr + nloc, dirptr, outptr, - [](const TData &x, const TData &dir) { return x + dir; }); + std::transform(outptr, outptr + nloc, tmpptr, outptr, + [](const TData &x, const TData &diff) + { return x + diff; }); } void setPrecon( const std::shared_ptr<OperatorPrecon<TData>> &precon) override { - 
m_CGOp->setPrecon(precon); - precon->configure(m_MassOp); + precon->configure(m_MassOp); + + m_CGOp->setPrecon(precon); } // instantiation function for CreatorFunction in OperatorFactory @@ -93,7 +92,7 @@ protected: std::shared_ptr<OperatorIProductWRTBase<TData>> m_IProdOp; std::shared_ptr<OperatorDirBndCond<TData>> m_DirBCOp; std::shared_ptr<OperatorRobBndCond<TData>> m_RobBCOp; std::shared_ptr<OperatorMass<TData>> m_MassOp; std::shared_ptr<OperatorConjGrad<TData>> m_CGOp; Field<TData, FieldState::Coeff> m_rhs; - Field<TData, FieldState::Coeff> m_dir; + Field<TData, FieldState::Coeff> m_tmp; }; } // namespace Nektar::Operators::detail diff --git a/Operators/HelmSolve/HelmSolveCUDA.cu b/Operators/HelmSolve/HelmSolveCUDA.cu new file mode 100644 index 0000000000000000000000000000000000000000..1911225229379b2d112e3885a52bba969436378d --- /dev/null +++ b/Operators/HelmSolve/HelmSolveCUDA.cu @@ -0,0 +1,13 @@ +#include "HelmSolveCUDA.hpp" + +namespace Nektar::Operators::detail +{ + +// Register implementation with Operator Factory +template <> +std::string OperatorHelmSolveImpl<double, ImplCUDA>::className = + GetOperatorFactory<double>().RegisterCreatorFunction( + "HelmSolveCUDA", + OperatorHelmSolveImpl<double, ImplCUDA>::instantiate, "..."); + +} // namespace Nektar::Operators::detail diff --git a/Operators/HelmSolve/HelmSolveCUDA.hpp b/Operators/HelmSolve/HelmSolveCUDA.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fabde0bb96ce05962dbd93f46ce8f0a37f1db7e4 --- /dev/null +++ b/Operators/HelmSolve/HelmSolveCUDA.hpp @@ -0,0 +1,116 @@ +#pragma once + +#include "MemoryRegionCUDA.hpp" +#include "Operators/CUDAMathKernels.cuh" +#include "Operators/OperatorConjGrad.hpp" +#include "Operators/OperatorDirBndCond.hpp" +#include "Operators/OperatorHelmSolve.hpp" +#include "Operators/OperatorHelmholtz.hpp" +#include "Operators/OperatorHelper.cuh" +#include "Operators/OperatorIProductWRTBase.hpp" +#include "Operators/OperatorLinear.hpp" +#include "Operators/OperatorNeuBndCond.hpp" +#include "Operators/OperatorPrecon.hpp" +#include "Operators/OperatorRobBndCond.hpp" + +using vec_t = tinysimd::simd<double>; + +namespace Nektar::Operators::detail +{ + +template <typename TData> +class OperatorHelmSolveImpl<TData, ImplCUDA> : public OperatorHelmSolve<TData> +{ +public: + OperatorHelmSolveImpl(const MultiRegions::ExpListSharedPtr &expansionList) + : OperatorHelmSolve<TData>(expansionList), + m_rhs(Field<TData, FieldState::Coeff>::template create< + MemoryRegionCUDA>( + GetBlockAttributes(FieldState::Coeff, expansionList))), + m_tmp(Field<TData, FieldState::Coeff>::template create< + MemoryRegionCUDA>( + GetBlockAttributes(FieldState::Coeff, expansionList))) + { + m_IProdOp = + IProductWRTBase<TData>::create(this->m_expansionList, "CUDA"); + m_DirBCOp = DirBndCond<TData>::create(this->m_expansionList, "CUDA"); + m_NeuBCOp = NeuBndCond<TData>::create(this->m_expansionList, "CUDA"); + // m_RobBCOp = RobBndCond<TData>::create(this->m_expansionList, "CUDA"); + m_HelmOp = Helmholtz<TData>::create(this->m_expansionList, "CUDA"); + m_CGOp = ConjGrad<TData>::create(this->m_expansionList, "CUDA"); + m_CGOp->setLHS(m_HelmOp); + } + + void apply(Field<TData, FieldState::Phys> &in, + Field<TData, FieldState::Coeff> &out) override + { + size_t nloc = out.template GetStorage<MemoryRegionCUDA>().size(); + auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *rhsptr = + m_rhs.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto *tmpptr = + m_tmp.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + + // Determine CUDA grid size.
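// The solve below uses standard Dirichlet lifting: write u = u_D + u_H, where
// u_D carries the Dirichlet values (filled in by m_DirBCOp) and u_H vanishes
// on the Dirichlet boundary. Substituting into A u = f gives A u_H = f - A u_D,
// so the code forms rhs -= A u_D (the subKernel call), solves for the
// homogeneous correction u_H with CG, and adds u_D back at the end (the
// addKernel call).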
+ m_gridSize = GetCUDAGridSize(nloc, m_blockSize); + + // IProductWRT of RHS + m_IProdOp->apply(in, m_rhs); + negKernel<<<m_gridSize, m_blockSize>>>(nloc, rhsptr, rhsptr); + + // Handle Neumann BCs on RHS + m_NeuBCOp->apply(m_rhs); + + // Handle Dirichlet BCs + m_DirBCOp->apply(out); + m_HelmOp->apply(out, m_tmp); + subKernel<<<m_gridSize, m_blockSize>>>(nloc, rhsptr, tmpptr, rhsptr); + + // Handle Robin BCs + // m_RobBCOp->apply(out, m_rhs, true); + + // Solve for u_hat using Conjugate Gradient + m_CGOp->apply(m_rhs, m_tmp); + + // Add Dirichlet BCs + addKernel<<<m_gridSize, m_blockSize>>>(nloc, outptr, tmpptr, outptr); + } + + void setLambda(const TData &lambda) + { + m_HelmOp->setLambda(lambda); + } + + void setPrecon( + const std::shared_ptr<OperatorPrecon<TData>> &precon) override + { + precon->configure(m_HelmOp); + + m_CGOp->setPrecon(precon); + } + + // instantiation function for CreatorFunction in OperatorFactory + static std::unique_ptr<Operator<TData>> instantiate( + const MultiRegions::ExpListSharedPtr &expansionList) + { + return std::make_unique<OperatorHelmSolveImpl<TData, ImplCUDA>>( + expansionList); + } + + // className - for OperatorFactory + static std::string className; + +protected: + std::shared_ptr<OperatorIProductWRTBase<TData>> m_IProdOp; + std::shared_ptr<OperatorDirBndCond<TData>> m_DirBCOp; + std::shared_ptr<OperatorNeuBndCond<TData>> m_NeuBCOp; + std::shared_ptr<OperatorRobBndCond<TData>> m_RobBCOp; + std::shared_ptr<OperatorHelmholtz<TData>> m_HelmOp; + std::shared_ptr<OperatorConjGrad<TData>> m_CGOp; + Field<TData, FieldState::Coeff> m_rhs; + Field<TData, FieldState::Coeff> m_tmp; + size_t m_gridSize = 1024; + size_t m_blockSize = 32; +}; + +} // namespace Nektar::Operators::detail diff --git a/Operators/HelmSolve/HelmSolveStdMat.hpp b/Operators/HelmSolve/HelmSolveStdMat.hpp index be1b56224afb92f61605654e8509455c40eb79e1..30e8fe5f233a708c6212c1866aa3e029eeb83500 100644 --- a/Operators/HelmSolve/HelmSolveStdMat.hpp +++ b/Operators/HelmSolve/HelmSolveStdMat.hpp @@ -1,9 +1,8 @@ #pragma once -#include "Operators/OperatorHelmSolve.hpp" - #include "Operators/OperatorConjGrad.hpp" #include "Operators/OperatorDirBndCond.hpp" +#include "Operators/OperatorHelmSolve.hpp" #include "Operators/OperatorHelmholtz.hpp" #include "Operators/OperatorIProductWRTBase.hpp" #include "Operators/OperatorLinear.hpp" @@ -11,10 +10,6 @@ #include "Operators/OperatorPrecon.hpp" #include "Operators/OperatorRobBndCond.hpp" -using namespace Nektar; -using namespace Nektar::Operators; -using namespace Nektar::MultiRegions; - using vec_t = tinysimd::simd<double>; namespace Nektar::Operators::detail @@ -30,7 +25,7 @@ public: GetBlockAttributes(FieldState::Coeff, expansionList, vec_t::width), 1, vec_t::alignment)), - m_dir(Field<TData, FieldState::Coeff>::create( + m_tmp(Field<TData, FieldState::Coeff>::create( GetBlockAttributes(FieldState::Coeff, expansionList, vec_t::width), 1, vec_t::alignment)) @@ -50,7 +45,7 @@ public: size_t nloc = out.GetStorage().size(); auto *outptr = out.GetStorage().GetCPUPtr(); auto *rhsptr = m_rhs.GetStorage().GetCPUPtr(); - auto *dirptr = m_dir.GetStorage().GetCPUPtr(); + auto *tmpptr = m_tmp.GetStorage().GetCPUPtr(); // IProductWRT of RHS m_IProdOp->apply(in, m_rhs); @@ -60,22 +55,22 @@ public: m_NeuBCOp->apply(m_rhs); // Handle Dirichlet BCs - m_DirBCOp->apply(m_dir); - m_HelmOp->apply(m_dir, out); // use out as temporary storage - std::transform( - rhsptr, rhsptr + nloc, outptr, rhsptr, - [](const TData &rhs, const TData &dir) { return rhs - dir; }); + 
m_DirBCOp->apply(out); + m_HelmOp->apply(out, m_tmp); + std::transform(rhsptr, rhsptr + nloc, tmpptr, rhsptr, + [](const TData &rhs, const TData &dir) + { return rhs - dir; }); // Handle Robin BCs - m_RobBCOp->apply(m_dir, m_rhs, true); + m_RobBCOp->apply(out, m_rhs, true); // Solve for u_hat using Conjugate Gradient - m_CGOp->apply(m_rhs, out); + m_CGOp->apply(m_rhs, m_tmp); // Add Dirichlet BCs - std::transform( - outptr, outptr + nloc, dirptr, outptr, - [](const TData &x, const TData &dir) { return x + dir; }); + std::transform(outptr, outptr + nloc, tmpptr, outptr, + [](const TData &x, const TData &diff) + { return x + diff; }); } void setLambda(const TData &lambda) @@ -110,7 +105,7 @@ protected: std::shared_ptr<OperatorHelmholtz<TData>> m_HelmOp; std::shared_ptr<OperatorConjGrad<TData>> m_CGOp; Field<TData, FieldState::Coeff> m_rhs; - Field<TData, FieldState::Coeff> m_dir; + Field<TData, FieldState::Coeff> m_tmp; }; } // namespace Nektar::Operators::detail diff --git a/Operators/Helmholtz/HelmholtzCUDA.hpp b/Operators/Helmholtz/HelmholtzCUDA.hpp index 17a293ec0a8f761b58e95efa3750bbd6eae11a2d..9294b23d8e4d8ef21c654929d8108fa0d7c4c6e3 100644 --- a/Operators/Helmholtz/HelmholtzCUDA.hpp +++ b/Operators/Helmholtz/HelmholtzCUDA.hpp @@ -88,9 +88,8 @@ public: auto nCoord = expPtr->GetCoordim(); auto nqTot = expPtr->GetTotPoints(); - // Determine CUDA grid parameters. - m_gridSize = (nElmts * nqTot) / m_blockSize; - m_gridSize += ((nElmts * nqTot) % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(nqTot * nElmts, m_blockSize); // Multiply by diffusion coefficient. if (nCoord == 1) @@ -138,8 +137,8 @@ private: Field<TData, FieldState::Phys> m_bwd; Field<TData, FieldState::Phys> m_deriv; TData *m_diffCoeff; + size_t m_gridSize = 1024; size_t m_blockSize = 32; - size_t m_gridSize; }; } // namespace Nektar::Operators::detail diff --git a/Operators/Helmholtz/HelmholtzCUDAKernels.cuh b/Operators/Helmholtz/HelmholtzCUDAKernels.cuh index 02f9154de0b855a79c302334ff04a2b9dd2f41a1..65b379058d3ea98949abcb56f8ec14ef5d65602e 100644 --- a/Operators/Helmholtz/HelmholtzCUDAKernels.cuh +++ b/Operators/Helmholtz/HelmholtzCUDAKernels.cuh @@ -7,12 +7,11 @@ __global__ void DiffusionCoeff1DKernel(const size_t nsize, { size_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= nsize) + while (i < nsize) { - return; + deriv0[i] *= diffCoeff[0]; + i += blockDim.x * gridDim.x; } - - deriv0[i] *= diffCoeff[0]; } template <typename TData> @@ -32,15 +31,14 @@ __global__ void DiffusionCoeff2DKernel(const size_t nsize, size_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= nsize) + while (i < nsize) { - return; - } - - TData deriv[2] = {deriv0[i], deriv1[i]}; + TData deriv[2] = {deriv0[i], deriv1[i]}; - deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1]; - deriv1[i] = s_diffCoeff[2] * deriv[0] + s_diffCoeff[3] * deriv[1]; + deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1]; + deriv1[i] = s_diffCoeff[2] * deriv[0] + s_diffCoeff[3] * deriv[1]; + i += blockDim.x * gridDim.x; + } } template <typename TData> @@ -60,19 +58,18 @@ __global__ void DiffusionCoeff3DKernel(const size_t nsize, TData *diffCoeff, size_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= nsize) + while (i < nsize) { - return; + TData deriv[3] = {deriv0[i], deriv1[i], deriv2[i]}; + + deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1] + + s_diffCoeff[2] * deriv[2]; + deriv1[i] = s_diffCoeff[3] * deriv[0] + s_diffCoeff[4] * deriv[1] + + s_diffCoeff[5] * deriv[2]; +
deriv2[i] = s_diffCoeff[6] * deriv[0] + s_diffCoeff[7] * deriv[1] + + s_diffCoeff[8] * deriv[2]; + i += blockDim.x * gridDim.x; } - - TData deriv[3] = {deriv0[i], deriv1[i], deriv2[i]}; - - deriv0[i] = s_diffCoeff[0] * deriv[0] + s_diffCoeff[1] * deriv[1] + - s_diffCoeff[2] * deriv[2]; - deriv1[i] = s_diffCoeff[3] * deriv[0] + s_diffCoeff[4] * deriv[1] + - s_diffCoeff[5] * deriv[2]; - deriv2[i] = s_diffCoeff[6] * deriv[0] + s_diffCoeff[7] * deriv[1] + - s_diffCoeff[8] * deriv[2]; } } // namespace Nektar::Operators::detail diff --git a/Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp b/Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp index ec4d03beba0b190e9c274168946befab788e3ae3..dc2632c0a84bac8f8d461b8fff85c6bf4b332300 100644 --- a/Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp +++ b/Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp @@ -100,9 +100,8 @@ public: auto deformed = expPtr->GetMetricInfo()->GetGtype() == SpatialDomains::eDeformed; - // Deterime CUDA grid parameters. - m_gridSize = nElmts / m_blockSize; - m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); // Flag for collapsed coordinate correction. bool correct = expPtr->GetBasis(0)->GetBasisType() == diff --git a/Operators/IProductWRTBase/IProductWRTBaseMatFreeKernels.hpp b/Operators/IProductWRTBase/IProductWRTBaseMatFreeKernels.hpp index 2f765d790f12a54522fbaba0f63506d8ac8ab6be..a2a5beb3dd4421a8729e90cce3f720505ff384d0 100644 --- a/Operators/IProductWRTBase/IProductWRTBaseMatFreeKernels.hpp +++ b/Operators/IProductWRTBase/IProductWRTBaseMatFreeKernels.hpp @@ -9,7 +9,7 @@ using vec_t = simd<NekDouble>; template <bool SCALE, bool APPEND> NEK_FORCE_INLINE static void ScaleAppend(vec_t &store, vec_t &pos, - NekDouble scale) + [[maybe_unused]] NekDouble scale) { if constexpr (SCALE && APPEND) { diff --git a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp index 7b34b6edd3a142c35561ccc0035d886e83589a54..bb475d9d9fd8f77dec7f42786c047feed4effafa 100644 --- a/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp +++ b/Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp @@ -1,3 +1,5 @@ +#pragma once + #include "MemoryRegionCUDA.hpp" #include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp" #include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh" @@ -164,9 +166,8 @@ public: auto deformed = expPtr->GetMetricInfo()->GetGtype() == SpatialDomains::eDeformed; - // Deterime CUDA grid parameters. - m_gridSize = nElmts / m_blockSize; - m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); // Flag for collapsed coordinate correction.
bool correct = expPtr->GetBasis(0)->GetBasisType() == diff --git a/Operators/Identity/IdentityCUDA.cu b/Operators/Identity/IdentityCUDA.cu index 395628871b333361aa9b13965470b53246faa1ff..e497137a69d83b179d9d271a60e99fd207d63a39 100644 --- a/Operators/Identity/IdentityCUDA.cu +++ b/Operators/Identity/IdentityCUDA.cu @@ -2,6 +2,8 @@ namespace Nektar::Operators::detail { + +// Register implementation with Operator Factory template <> std::string OperatorIdentityImpl<double, FieldState::Coeff, ImplCUDA>::className = @@ -9,10 +11,7 @@ std::string OperatorIdentityImpl<double, FieldState::Coeff, "IdentityCoeffCUDA", OperatorIdentityImpl<double, FieldState::Coeff, ImplCUDA>::instantiate, "..."); -} -namespace Nektar::Operators::detail -{ template <> std::string OperatorIdentityImpl<double, FieldState::Phys, ImplCUDA>::className = @@ -20,4 +19,4 @@ std::string OperatorIdentityImpl<double, FieldState::Phys, "IdentityPhysCUDA", OperatorIdentityImpl<double, FieldState::Phys, ImplCUDA>::instantiate, "..."); -} \ No newline at end of file +} // namespace Nektar::Operators::detail diff --git a/Operators/Identity/IdentityCUDA.hpp b/Operators/Identity/IdentityCUDA.hpp index bb9dc75fa37038737d6feafd14fa8e3ae4638871..c64d5c3f90ea42630a06728679b69ab03fb6c927 100644 --- a/Operators/Identity/IdentityCUDA.hpp +++ b/Operators/Identity/IdentityCUDA.hpp @@ -37,10 +37,6 @@ public: // className - for OperatorFactory static std::string className; - -protected: - size_t m_gridSize; - size_t m_blockSize = 32; }; } // namespace Nektar::Operators::detail diff --git a/Operators/Identity/IdentityStdMat.hpp b/Operators/Identity/IdentityStdMat.hpp index 8bc6d552b69862303e36b9b761c2c4eb83e2c601..e067df69ecd94002b2b16d2e59e4c063e1a69e38 100644 --- a/Operators/Identity/IdentityStdMat.hpp +++ b/Operators/Identity/IdentityStdMat.hpp @@ -19,10 +19,10 @@ public: void apply(Field<TData, TFieldState> &in, Field<TData, TFieldState> &out) override { - size_t N = in.GetStorage().size(); - auto pIn = in.GetStorage().GetCPUPtr(); - auto pOut = out.GetStorage().GetCPUPtr(); - std::copy(pIn, pIn + N, pOut); + size_t N = in.GetStorage().size(); + auto inptr = in.GetStorage().GetCPUPtr(); + auto outptr = out.GetStorage().GetCPUPtr(); + std::copy(inptr, inptr + N, outptr); } // instantiation function for CreatorFunction in OperatorFactory diff --git a/Operators/Mass/MassCUDA.cu b/Operators/Mass/MassCUDA.cu index 68aafa9b8e150233ff8f35ecb5692bdbde9c93d4..d19aefeb32605b9dd99bf4b4afe9126c73173549 100644 --- a/Operators/Mass/MassCUDA.cu +++ b/Operators/Mass/MassCUDA.cu @@ -2,8 +2,11 @@ namespace Nektar::Operators::detail { + +// Register implementation with Operator Factory
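// For context, registrations such as the one below are what back the
// string-keyed creation used throughout this diff, e.g. (taken from
// FwdTransCUDA.hpp above):
//
//     auto massOp = Mass<double>::create(expansionList, "CUDA");
//     auto cgOp = ConjGrad<double>::create(expansionList, "CUDA");
//     cgOp->setLHS(massOp);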
template <> std::string OperatorMassImpl<double, ImplCUDA>::className = GetOperatorFactory<double>().RegisterCreatorFunction( "MassCUDA", OperatorMassImpl<double, ImplCUDA>::instantiate, "..."); -} + +} // namespace Nektar::Operators::detail diff --git a/Operators/Mass/MassCUDA.hpp b/Operators/Mass/MassCUDA.hpp index 8575a961dbf55afd673776eaf05c2eda31f8191f..784f09ef4f214b9a7a1556af0c0f932ebee81e5c 100644 --- a/Operators/Mass/MassCUDA.hpp +++ b/Operators/Mass/MassCUDA.hpp @@ -7,7 +7,6 @@ namespace Nektar::Operators::detail { -// standard matrix implementation template <typename TData> class OperatorMassImpl<TData, ImplCUDA> : public OperatorMass<TData> { diff --git a/Operators/Matrix/MatrixCUDA.hpp b/Operators/Matrix/MatrixCUDA.hpp index 3ab7e5987197a1212ad6525833433a3d27a3be84..0ab87111e827246be4dd9f7ac603b27b37f19ad1 100644 --- a/Operators/Matrix/MatrixCUDA.hpp +++ b/Operators/Matrix/MatrixCUDA.hpp @@ -91,9 +91,8 @@ public: ? expPtr->GetNcoeffs() : expPtr->GetTotPoints(); - // Deterime CUDA grid parameters. - m_gridSize = nElmts / m_blockSize; - m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); MatrixKernel<<<m_gridSize, m_blockSize>>>(numPts, nElmts, m_size, m_matrix, inptr, outptr); @@ -118,7 +117,7 @@ public: private: size_t m_size; TData *m_matrix; - size_t m_gridSize; + size_t m_gridSize = 1024; size_t m_blockSize = 32; }; diff --git a/Operators/Matrix/MatrixCUDAKernels.cuh b/Operators/Matrix/MatrixCUDAKernels.cuh index ca9358bcaf3e2f22b609610252e17305c19abfd6..7fb94c9959dcd5dd7813269842077ad4aaa741b4 100644 --- a/Operators/Matrix/MatrixCUDAKernels.cuh +++ b/Operators/Matrix/MatrixCUDAKernels.cuh @@ -7,21 +7,20 @@ __global__ void MatrixKernel(const size_t numPts, const size_t nelmt, { size_t e = blockDim.x * blockIdx.x + threadIdx.x; - if (e >= nelmt) + while (e < nelmt) { - return; - } - - const TData *matrix = mat + e * numPts; - const TData *inptr = in + e * numPts; - TData *outptr = out + e * numPts; + const TData *matrix = mat + e * numPts; + const TData *inptr = in + e * numPts; + TData *outptr = out + e * numPts; - for (size_t j = 0; j < size * size; j += size) - { - for (size_t i = 0; i < numPts; ++i) + for (size_t j = 0; j < size * size; j += size) { - outptr[i] += inptr[i] * matrix[j + i]; + for (size_t i = 0; i < numPts; ++i) + { + outptr[i] += inptr[i] * matrix[j + i]; + } } + e += blockDim.x * gridDim.x; } } diff --git a/Operators/NeuBndCond/NeuBndCondCUDA.hpp b/Operators/NeuBndCond/NeuBndCondCUDA.hpp index 86543eb80a37423ebc9337085a6fb32472158e2f..2b3d7db2cd88546abccf8075b86161bd0567f2b5 100644 --- a/Operators/NeuBndCond/NeuBndCondCUDA.hpp +++ b/Operators/NeuBndCond/NeuBndCondCUDA.hpp @@ -2,6 +2,7 @@ #include "MemoryRegionCUDA.hpp" #include "Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh" +#include "Operators/OperatorHelper.cuh" #include "Operators/OperatorNeuBndCond.hpp" #include <MultiRegions/AssemblyMap/AssemblyMapCG.h> @@ -85,6 +86,9 @@ public: cudaMalloc((void **)&m_coeff, sizeof(TData) * coeff.size()); cudaMemcpy(m_coeff, coeff.get(), sizeof(TData) * coeff.size(), cudaMemcpyHostToDevice); + + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(m_bndExpSize, m_blockSize); } ~OperatorNeuBndCondImpl(void) { @@ -106,10 +110,6 @@ public: auto *inoutptr = inout.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - // Deterime CUDA grid parameters. - m_gridSize = m_bndExpSize / m_blockSize; - m_gridSize += (m_bndExpSize % m_blockSize == 0) ?
0 : 1; - if (m_signChange) { NeuBndCondKernel<TData><<<m_gridSize, m_blockSize>>>( @@ -144,7 +144,7 @@ protected: int *m_map; int *m_ncoeff; int *m_offset; - size_t m_gridSize; + size_t m_gridSize = 1024; size_t m_blockSize = 32; }; diff --git a/Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh b/Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh index 1ad7f1fdaf31db8335995738d5919f78a2e6b0a8..86019358d7020a29add9ee6fbb002b49d51b65ba 100644 --- a/Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh +++ b/Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh @@ -14,20 +14,18 @@ __global__ void NeuBndCondKernel(const size_t nsize, const int *offsetptr, { size_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= nsize) + while (i < nsize) { - return; - } - - if (bctypeptr[i] == eNeumann || bctypeptr[i] == eRobin) - { - size_t offset = offsetptr[i]; - size_t ncoeff = ncoeffptr[i]; - for (size_t j = 0; j < ncoeff; j++) + if (bctypeptr[i] == eNeumann || bctypeptr[i] == eRobin) { - // atomicAdd(outptr + mapptr[offset + j], inptr[offset + j]); - outptr[mapptr[offset + j]] += inptr[offset + j]; + size_t offset = offsetptr[i]; + size_t ncoeff = ncoeffptr[i]; + for (size_t j = 0; j < ncoeff; j++) + { + outptr[mapptr[offset + j]] += inptr[offset + j]; + } } + i += blockDim.x * gridDim.x; } } @@ -40,22 +38,19 @@ __global__ void NeuBndCondKernel(const size_t nsize, const int *offsetptr, { size_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= nsize) - { - return; - } - - if (bctypeptr[i] == eNeumann || bctypeptr[i] == eRobin) + while (i < nsize) { - size_t offset = offsetptr[i]; - size_t ncoeff = ncoeffptr[i]; - for (size_t j = 0; j < ncoeff; j++) + if (bctypeptr[i] == eNeumann || bctypeptr[i] == eRobin) { - // atomicAdd(outptr + mapptr[offset + j], signptr[offset + j] * - // inptr[offset + j]); - outptr[mapptr[offset + j]] += - signptr[offset + j] * inptr[offset + j]; + size_t offset = offsetptr[i]; + size_t ncoeff = ncoeffptr[i]; + for (size_t j = 0; j < ncoeff; j++) + { + outptr[mapptr[offset + j]] += + signptr[offset + j] * inptr[offset + j]; + } } + i += blockDim.x * gridDim.x; } } diff --git a/Operators/NullPrecon/NullPreconCUDA.hpp b/Operators/NullPrecon/NullPreconCUDA.hpp index abfcba733e285f79f6f3bea8509d55c3820ebe91..a9561267251db02c89699c432eb1e21626c9cc3b 100644 --- a/Operators/NullPrecon/NullPreconCUDA.hpp +++ b/Operators/NullPrecon/NullPreconCUDA.hpp @@ -1,7 +1,5 @@ #pragma once -#include "Field.hpp" - #include "Operators/OperatorAssmbScatr.hpp" #include "Operators/OperatorNullPrecon.hpp" diff --git a/Operators/NullPrecon/NullPreconStdMat.hpp b/Operators/NullPrecon/NullPreconStdMat.hpp index d7454be300751681cb4b467f5b8cbdb50c0ee3d2..bf3531dc3973dccca717fe2c7e5815c593e6c657 100644 --- a/Operators/NullPrecon/NullPreconStdMat.hpp +++ b/Operators/NullPrecon/NullPreconStdMat.hpp @@ -1,7 +1,5 @@ #pragma once -#include "Field.hpp" - #include "Operators/OperatorAssmbScatr.hpp" #include "Operators/OperatorNullPrecon.hpp" diff --git a/Operators/OperatorConjGrad.hpp b/Operators/OperatorConjGrad.hpp index cd849f19b848cf4d18c18e461efdf2ef1b5397d4..cd52fb51e6857d6adc38bce19d158db185a2192f 100644 --- a/Operators/OperatorConjGrad.hpp +++ b/Operators/OperatorConjGrad.hpp @@ -27,13 +27,25 @@ public: apply(in, out); } - virtual void setLHS( - const std::shared_ptr<OperatorLinear<TData, FieldState::Coeff, - FieldState::Coeff>> &ptr) = 0; - virtual void setPrecon( - const std::shared_ptr<OperatorLinear<TData, FieldState::Coeff, - FieldState::Coeff>> &ptr) = 0; + void setLHS(const std::shared_ptr<OperatorLinear<TData, 
FieldState::Coeff, + FieldState::Coeff>> &ptr) + { + m_LHS = ptr; + } + + void setPrecon( + const std::shared_ptr< + OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> &ptr) + { + m_precon = ptr; + } + +protected: + std::shared_ptr<OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> + m_LHS; + std::shared_ptr<OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> + m_precon; }; // Descriptor / traits class for ConjGrad to be used by Operator create function diff --git a/Operators/OperatorHelper.cuh b/Operators/OperatorHelper.cuh index 63379da91dd4e6de1e3df603c98b2e1411838e9f..c470db7c780658fd4201609aabe88fab53c597fc 100644 --- a/Operators/OperatorHelper.cuh +++ b/Operators/OperatorHelper.cuh @@ -8,6 +8,11 @@ namespace Nektar::Operators { +static size_t GetCUDAGridSize(size_t ndata, size_t blockSize) +{ + return (ndata + blockSize - 1) / blockSize; +} + template <typename TData> using DataMap = std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>; diff --git a/Operators/PhysDeriv/PhysDerivCUDA.hpp b/Operators/PhysDeriv/PhysDerivCUDA.hpp index 55718baef5d454decb2605e6690a676fc7b201af..197c1e06db4ad0b49ee6675bdc8c6d2152af4d81 100644 --- a/Operators/PhysDeriv/PhysDerivCUDA.hpp +++ b/Operators/PhysDeriv/PhysDerivCUDA.hpp @@ -1,3 +1,5 @@ +#pragma once + #include "MemoryRegionCUDA.hpp" #include "Operators/OperatorHelper.cuh" #include "Operators/OperatorPhysDeriv.hpp" @@ -101,9 +103,8 @@ public: auto deformed = expPtr->GetMetricInfo()->GetGtype() == SpatialDomains::eDeformed; - // Determine CUDA grid parameters. - m_gridSize = nElmts / m_blockSize; - m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1; + // Determine CUDA grid size. + m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); // Fetch basis key for the current element type.
for (size_t d = 0; d < expPtr->GetShapeDimension(); d++) diff --git a/run/Helmholtz2D_varP.xml b/run/Helmholtz2D_varP.xml new file mode 100644 index 0000000000000000000000000000000000000000..321213844432ceb285f6e4a4d6423acf89445ecc --- /dev/null +++ b/run/Helmholtz2D_varP.xml @@ -0,0 +1,346 @@ +<?xml version="1.0" encoding="utf-8"?> + +<NEKTAR xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:noNamespaceSchemaLocation="http://www.nektar.info/nektar.xsd"> + + <GEOMETRY DIM="2" SPACE="2"> + + <VERTEX> + <V ID="0"> -1.000000000000000 3.500000000000000 0.0 </V> + <V ID="1"> -1.000000000000000 0.500000000000000 0.0 </V> + <V ID="2"> -1.000000000000000 2.500000000000000 0.0 </V> + <V ID="3"> -1.000000000000000 1.500000000000000 0.0 </V> + <V ID="4"> 3.800000000000000 4.500000000000000 0.0 </V> + <V ID="5"> 0.200000000000000 4.500000000000000 0.0 </V> + <V ID="6"> 2.900000000000000 4.500000000000000 0.0 </V> + <V ID="7"> 2.000000000000000 4.500000000000000 0.0 </V> + <V ID="8"> 1.100000000000000 4.500000000000000 0.0 </V> + <V ID="9"> 5.000000000000000 0.500000000000000 0.0 </V> + <V ID="10"> 5.000000000000000 3.500000000000000 0.0 </V> + <V ID="11"> 5.000000000000000 1.500000000000000 0.0 </V> + <V ID="12"> 5.000000000000000 2.500000000000000 0.0 </V> + <V ID="13"> 0.200000000000000 -0.500000000000000 0.0 </V> + <V ID="14"> 3.800000000000000 -0.500000000000000 0.0 </V> + <V ID="15"> 1.100000000000000 -0.500000000000000 0.0 </V> + <V ID="16"> 2.000000000000000 -0.500000000000000 0.0 </V> + <V ID="17"> 2.900000000000000 -0.500000000000000 0.0 </V> + <V ID="18"> -0.400000000000000 4.000000000000000 0.0 </V> + <V ID="19"> 4.400000000000000 4.000000000000000 0.0 </V> + <V ID="20"> 4.400000000000000 0.0 0.0 </V> + <V ID="21"> -0.400000000000000 0.0 0.0 </V> + <V ID="22"> -0.040000000000000 2.700000000000000 0.0 </V> + <V ID="23"> 0.920000000000000 1.900000000000000 0.0 </V> + <V ID="24"> 1.880000000000000 1.100000000000000 0.0 </V> + <V ID="25"> 2.840000000000000 0.300000000000000 0.0 </V> + <V ID="26"> -0.119314370713000 1.785562178050000 0.0 </V> + <V ID="27"> 1.713659159880000 0.169773145192000 0.0 </V> + <V ID="28"> 0.677713625957000 1.180143861910000 0.0 </V> + <V ID="29"> -0.188491169544000 0.869785275722000 0.0 </V> + <V ID="30"> 0.715978655871000 0.336366878402000 0.0 </V> + <V ID="31"> 3.289084972900000 0.882472796343000 0.0 </V> + <V ID="32"> 3.682080702820000 1.501561993610000 0.0 </V> + <V ID="33"> 3.972007805980000 2.179240079990000 0.0 </V> + <V ID="34"> 4.218223358320000 2.817871076440000 0.0 </V> + <V ID="35"> 4.379282019760000 3.315489806910000 0.0 </V> + <V ID="36"> 3.668892462910000 3.970508359810000 0.0 </V> + <V ID="37"> 3.155413025260000 3.802413365130000 0.0 </V> + <V ID="38"> 2.269709494930000 3.615953163480000 0.0 </V> + <V ID="39"> 1.341308209110000 3.906056835480000 0.0 </V> + <V ID="40"> 0.934565531168000 3.565834210030000 0.0 </V> + <V ID="41"> 0.461690231830000 3.150972034220000 0.0 </V> + <V ID="42"> 1.383772742880000 2.431995597660000 0.0 </V> + <V ID="43"> 2.324656705230000 1.661732140030000 0.0 </V> + <V ID="44"> 3.642425028600000 3.246403386770000 0.0 </V> + <V ID="45"> 4.022057777820000 3.634175940310000 0.0 </V> + <V ID="46"> 1.833157069900000 2.985307743670000 0.0 </V> + <V ID="47"> 2.755342831120000 2.226765404480000 0.0 </V> + <V ID="48"> 3.179196984040000 2.779077508740000 0.0 </V> + </VERTEX> + + <EDGE> + <E ID="0"> 1 21 </E> + <E ID="1"> 13 21 </E> + <E ID="2"> 13 15 </E> + <E ID="3"> 15 16 </E> + <E ID="4"> 16 17 </E> + <E ID="5"> 14 17 </E> + <E 
ID="6"> 1 3 </E> + <E ID="7"> 1 29 </E> + <E ID="8"> 21 29 </E> + <E ID="9"> 21 30 </E> + <E ID="10"> 13 30 </E> + <E ID="11"> 15 30 </E> + <E ID="12"> 15 27 </E> + <E ID="13"> 16 27 </E> + <E ID="14"> 16 25 </E> + <E ID="15"> 17 25 </E> + <E ID="16"> 3 29 </E> + <E ID="17"> 29 30 </E> + <E ID="18"> 27 30 </E> + <E ID="19"> 25 27 </E> + <E ID="20"> 2 3 </E> + <E ID="21"> 3 26 </E> + <E ID="22"> 26 29 </E> + <E ID="23"> 28 29 </E> + <E ID="24"> 28 30 </E> + <E ID="25"> 24 30 </E> + <E ID="26"> 24 27 </E> + <E ID="27"> 2 26 </E> + <E ID="28"> 26 28 </E> + <E ID="29"> 24 28 </E> + <E ID="30"> 0 2 </E> + <E ID="31"> 2 22 </E> + <E ID="32"> 22 26 </E> + <E ID="33"> 23 26 </E> + <E ID="34"> 23 28 </E> + <E ID="35"> 0 22 </E> + <E ID="36"> 22 23 </E> + <E ID="37"> 23 24 </E> + <E ID="38"> 24 25 </E> + <E ID="39"> 14 25 </E> + <E ID="40"> 0 18 </E> + <E ID="41"> 22 41 </E> + <E ID="42"> 23 42 </E> + <E ID="43"> 24 43 </E> + <E ID="44"> 25 31 </E> + <E ID="45"> 14 20 </E> + <E ID="46"> 18 41 </E> + <E ID="47"> 41 42 </E> + <E ID="48"> 42 43 </E> + <E ID="49"> 31 43 </E> + <E ID="50"> 20 31 </E> + <E ID="51"> 5 18 </E> + <E ID="52"> 40 41 </E> + <E ID="53"> 42 46 </E> + <E ID="54"> 43 47 </E> + <E ID="55"> 31 32 </E> + <E ID="56"> 9 20 </E> + <E ID="57"> 5 40 </E> + <E ID="58"> 40 46 </E> + <E ID="59"> 46 47 </E> + <E ID="60"> 32 47 </E> + <E ID="61"> 9 32 </E> + <E ID="62"> 5 8 </E> + <E ID="63"> 39 40 </E> + <E ID="64"> 38 46 </E> + <E ID="65"> 47 48 </E> + <E ID="66"> 32 33 </E> + <E ID="67"> 9 11 </E> + <E ID="68"> 8 39 </E> + <E ID="69"> 7 8 </E> + <E ID="70"> 38 39 </E> + <E ID="71"> 7 38 </E> + <E ID="72"> 38 48 </E> + <E ID="73"> 33 48 </E> + <E ID="74"> 11 33 </E> + <E ID="75"> 6 7 </E> + <E ID="76"> 37 38 </E> + <E ID="77"> 44 48 </E> + <E ID="78"> 33 34 </E> + <E ID="79"> 11 12 </E> + <E ID="80"> 6 37 </E> + <E ID="81"> 37 44 </E> + <E ID="82"> 34 44 </E> + <E ID="83"> 12 34 </E> + <E ID="84"> 4 6 </E> + <E ID="85"> 36 37 </E> + <E ID="86"> 44 45 </E> + <E ID="87"> 34 35 </E> + <E ID="88"> 10 12 </E> + <E ID="89"> 4 36 </E> + <E ID="90"> 36 45 </E> + <E ID="91"> 35 45 </E> + <E ID="92"> 10 35 </E> + <E ID="93"> 19 45 </E> + <E ID="94"> 4 19 </E> + <E ID="95"> 10 19 </E> + </EDGE> + + <ELEMENT> + <T ID="0"> 6 7 16 </T> + <T ID="1"> 0 8 7 </T> + <T ID="2"> 9 17 8 </T> + <T ID="3"> 1 10 9 </T> + <T ID="4"> 2 11 10 </T> + <T ID="5"> 11 12 18 </T> + <T ID="6"> 3 13 12 </T> + <T ID="7"> 14 19 13 </T> + <T ID="8"> 4 15 14 </T> + <T ID="9"> 5 39 15 </T> + <T ID="10"> 21 27 20 </T> + <T ID="11"> 16 22 21 </T> + <T ID="12"> 23 28 22 </T> + <T ID="13"> 17 24 23 </T> + <T ID="14"> 24 25 29 </T> + <T ID="15"> 18 26 25 </T> + <T ID="16"> 19 38 26 </T> + <T ID="17"> 30 31 35 </T> + <T ID="18"> 27 32 31 </T> + <T ID="19"> 32 33 36 </T> + <T ID="20"> 28 34 33 </T> + <T ID="21"> 29 37 34 </T> + <Q ID="22"> 35 41 46 40 </Q> + <Q ID="23"> 36 42 47 41 </Q> + <Q ID="24"> 37 43 48 42 </Q> + <Q ID="25"> 38 44 49 43 </Q> + <Q ID="26"> 39 45 50 44 </Q> + <Q ID="27"> 46 52 57 51 </Q> + <Q ID="28"> 47 53 58 52 </Q> + <Q ID="29"> 48 54 59 53 </Q> + <Q ID="30"> 49 55 60 54 </Q> + <Q ID="31"> 50 56 61 55 </Q> + <Q ID="32"> 57 63 68 62 </Q> + <Q ID="33"> 58 64 70 63 </Q> + <Q ID="34"> 59 65 72 64 </Q> + <Q ID="35"> 60 66 73 65 </Q> + <Q ID="36"> 61 67 74 66 </Q> + <Q ID="37"> 68 70 71 69 </Q> + <Q ID="38"> 71 76 80 75 </Q> + <Q ID="39"> 72 77 81 76 </Q> + <Q ID="40"> 73 78 82 77 </Q> + <Q ID="41"> 74 79 83 78 </Q> + <Q ID="42"> 80 85 89 84 </Q> + <Q ID="43"> 81 86 90 85 </Q> + <Q ID="44"> 82 87 91 86 </Q> + <Q ID="45"> 
83 88 92 87 </Q> + <Q ID="46"> 89 90 93 94 </Q> + <Q ID="47"> 91 92 95 93 </Q> + </ELEMENT> + + <COMPOSITE> + <C ID="2"> E[0-1] </C> + <C ID="3"> E[2-5] </C> + <C ID="4"> E[45] </C> + <C ID="5"> E[56] </C> + <C ID="6"> E[67] </C> + <C ID="7"> E[79] </C> + <C ID="8"> E[88] </C> + <C ID="9"> E[94-95] </C> + <C ID="10"> E[84] </C> + <C ID="11"> E[75] </C> + <C ID="12"> E[69] </C> + <C ID="13"> E[62] </C> + <C ID="14"> E[51] </C> + <C ID="15"> E[40] </C> + <C ID="16"> E[30] </C> + <C ID="17"> E[20] </C> + <C ID="18"> E[6] </C> + <C ID="19"> Q[22-30] </C> + <C ID="20"> Q[31-47] </C> + <C ID="21"> T[0-7] </C> + <C ID="22"> T[8-21] </C> + </COMPOSITE> + + <DOMAIN> C[19-22] </DOMAIN> + </GEOMETRY> + + <EXPANSIONS> + <E COMPOSITE="C[19]" NUMMODES="7" FIELDS="u" TYPE="MODIFIED" /> + <E COMPOSITE="C[20]" NUMMODES="9" FIELDS="u" TYPE="MODIFIED" /> + <E COMPOSITE="C[21]" NUMMODES="6" FIELDS="u" TYPE="MODIFIED" /> + <E COMPOSITE="C[22]" NUMMODES="8" FIELDS="u" TYPE="MODIFIED" /> + </EXPANSIONS> + + <CONDITIONS> + <SOLVERINFO> + <I PROPERTY="GLOBALSYSSOLN" VALUE="IterativeFull" /> + <I PROPERTY="LinSysIterSolver" VALUE="ConjugateGradientLoc"/> + <I PROPERTY="Preconditioner" VALUE="Diagonal"/> + </SOLVERINFO> + + <PARAMETERS> + <P> Lambda = 1 </P> + <P> IterativeSolverTolerance = 1e-14 </P> + </PARAMETERS> + + <VARIABLES> + <V ID="0"> u </V> + </VARIABLES> + + <BOUNDARYREGIONS> + <B ID="0"> C[2] </B> + <B ID="1"> C[3] </B> + <B ID="2"> C[4] </B> + <B ID="3"> C[5] </B> + <B ID="4"> C[6] </B> + <B ID="5"> C[7] </B> + <B ID="6"> C[8] </B> + <B ID="7"> C[9] </B> + <B ID="8"> C[10] </B> + <B ID="9"> C[11] </B> + <B ID="10"> C[12] </B> + <B ID="11"> C[13] </B> + <B ID="12"> C[14] </B> + <B ID="13"> C[15] </B> + <B ID="14"> C[16] </B> + <B ID="15"> C[17] </B> + <B ID="16"> C[18] </B> + </BOUNDARYREGIONS> + + <BOUNDARYCONDITIONS> + <REGION REF="0"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="1"> + <N VAR="u" VALUE="-PI*sin(PI*x)*cos(PI*y)" /> + </REGION> + <REGION REF="2"> + <N VAR="u" + VALUE="(5/sqrt(61))*PI*cos(PI*x)*sin(PI*y)-(6/sqrt(61))*PI*sin(PI*x)*cos(PI*y)" /> + </REGION> + <REGION REF="3"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="4"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="5"> + <N VAR="u" VALUE="PI*cos(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="6"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="7"> + <N VAR="u" + VALUE="(5/sqrt(61))*PI*cos(PI*x)*sin(PI*y)+(6/sqrt(61))*PI*sin(PI*x)*cos(PI*y)" /> + </REGION> + <REGION REF="8"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="9"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="10"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="11"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="12"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="13"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="14"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="15"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + <REGION REF="16"> + <D VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </REGION> + </BOUNDARYCONDITIONS> + + <FUNCTION NAME="Forcing"> + <E VAR="u" VALUE="-(Lambda + 2*PI*PI)*sin(PI*x)*sin(PI*y)" /> + </FUNCTION> + + <FUNCTION NAME="ExactSolution"> + <E VAR="u" VALUE="sin(PI*x)*sin(PI*y)" /> + </FUNCTION> + + </CONDITIONS> + +</NEKTAR> diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 
47ddaf843384de5a6d218a5c16cebc649c61da95..be911b26ad232d10c0abd982747ecf4467dd42f9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -289,6 +289,17 @@ target_include_directories(test_fwdtrans PRIVATE ${NEKTAR++_INCLUDE_DIRS}) target_compile_definitions(test_fwdtrans PRIVATE -DTEST_PATH="${CMAKE_SOURCE_DIR}") +IF (NEKTAR_USE_CUDA) + add_executable(test_fwdtranscuda test_fwdtranscuda.cpp) + target_link_libraries(test_fwdtranscuda PRIVATE Operators) + target_link_libraries(test_fwdtranscuda PRIVATE ${NEKTAR++_LIBRARIES}) + target_link_libraries(test_fwdtranscuda PRIVATE Boost::unit_test_framework) + target_include_directories(test_fwdtranscuda PRIVATE "${CMAKE_SOURCE_DIR}") + target_include_directories(test_fwdtranscuda PRIVATE ${NEKTAR++_INCLUDE_DIRS}) + target_compile_definitions(test_fwdtranscuda PRIVATE + -DTEST_PATH="${CMAKE_SOURCE_DIR}") +ENDIF() + add_executable(test_helmholtz test_helmholtz.cpp) target_link_libraries(test_helmholtz PRIVATE Operators) target_link_libraries(test_helmholtz PRIVATE ${NEKTAR++_LIBRARIES}) @@ -318,6 +329,17 @@ target_include_directories(test_helmsolve PRIVATE ${NEKTAR++_INCLUDE_DIRS}) target_compile_definitions(test_helmsolve PRIVATE -DTEST_PATH="${CMAKE_SOURCE_DIR}") +IF (NEKTAR_USE_CUDA) + add_executable(test_helmsolvecuda test_helmsolvecuda.cpp) + target_link_libraries(test_helmsolvecuda PRIVATE Operators) + target_link_libraries(test_helmsolvecuda PRIVATE ${NEKTAR++_LIBRARIES}) + target_link_libraries(test_helmsolvecuda PRIVATE Boost::unit_test_framework) + target_include_directories(test_helmsolvecuda PRIVATE "${CMAKE_SOURCE_DIR}") + target_include_directories(test_helmsolvecuda PRIVATE ${NEKTAR++_INCLUDE_DIRS}) + target_compile_definitions(test_helmsolvecuda PRIVATE + -DTEST_PATH="${CMAKE_SOURCE_DIR}") +ENDIF() + IF (NEKTAR_USE_CUDA) add_test( NAME CUDAKernels @@ -516,12 +538,28 @@ add_test( WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" ) +IF (NEKTAR_USE_CUDA) + add_test( + NAME FwdTransCUDA + COMMAND test_fwdtranscuda + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + ) +ENDIF() + add_test( NAME Helmholtz COMMAND test_helmholtz WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" ) +IF (NEKTAR_USE_CUDA) + add_test( + NAME HelmholtzCUDA + COMMAND test_helmholtzcuda + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + ) +ENDIF() + add_test( NAME HelmSolve COMMAND test_helmsolve @@ -530,8 +568,8 @@ add_test( IF (NEKTAR_USE_CUDA) add_test( - NAME HelmholtzCUDA - COMMAND test_helmholtzcuda + NAME HelmSolveCUDA + COMMAND test_helmsolvecuda WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" ) ENDIF() diff --git a/tests/init_diagpreconfields.hpp b/tests/init_diagpreconfields.hpp index 6a96e8d1bb9d60793ae2589cda8d643e3739f90b..913a6d551ec8d80f8e549b5354e90fb6fa0b777d 100644 --- a/tests/init_diagpreconfields.hpp +++ b/tests/init_diagpreconfields.hpp @@ -104,7 +104,8 @@ class Helmholtz2D_Tri_Quad : public DiagPreconField public: Helmholtz2D_Tri_Quad() { - meshName = "run/Helmholtz2D_P7_AllBCs.xml"; + //meshName = "run/Helmholtz2D_P7_AllBCs.xml"; + meshName = "run/Helmholtz2D_varP.xml"; } }; diff --git a/tests/init_fwdtransfields.hpp b/tests/init_fwdtransfields.hpp index ba151f2bf1c15556489b3338b0c5012a31f61cf8..f723eea7b7a2d5ad74dbb45b850d46aa8f1dbc2d 100644 --- a/tests/init_fwdtransfields.hpp +++ b/tests/init_fwdtransfields.hpp @@ -144,7 +144,8 @@ class Helmholtz2D_Tri_Quad : public FwdTransField public: Helmholtz2D_Tri_Quad() { - meshName = "run/Helmholtz2D_P7_AllBCs.xml"; + //meshName = "run/Helmholtz2D_P7_AllBCs.xml"; + meshName = "run/Helmholtz2D_varP.xml"; } }; diff --git 
a/tests/init_helmsolvefields.hpp b/tests/init_helmsolvefields.hpp index 33034087eddee5f0f7e3bf1d618fd213ec72d74f..53f5aec0a37ea4c4a69612f5c07fc28fe2e986b9 100644 --- a/tests/init_helmsolvefields.hpp +++ b/tests/init_helmsolvefields.hpp @@ -148,7 +148,8 @@ class Helmholtz2D_Tri_Quad : public HelmSolveField public: Helmholtz2D_Tri_Quad() { - meshName = "run/Helmholtz2D_P7_AllBCs.xml"; + //meshName = "run/Helmholtz2D_P7_AllBCs.xml"; + meshName = "run/Helmholtz2D_varP.xml"; } }; diff --git a/tests/test_cudakernels.cu b/tests/test_cudakernels.cu index 858b5b23007e35a1d7fa9b928730877dc91a6879..7dea0f6ab3d9a93754c74821901ebdced1ceebb8 100644 --- a/tests/test_cudakernels.cu +++ b/tests/test_cudakernels.cu @@ -40,7 +40,7 @@ BOOST_FIXTURE_TEST_CASE(cuda_dotkernel, CUDAKernels) cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost); // Check results - BOOST_TEST(fabs(h_out - out) < 1.0E-12); + BOOST_TEST(fabs(h_out - out) < 5.0E-12); boost::test_tools::output_test_stream output; { std::cout << "CUDA = " << h_out << " Vmath = " << out << std::endl; diff --git a/tests/test_diagpreconcuda.cpp b/tests/test_diagpreconcuda.cpp index f48d74876a4ac5f623d505f7eaa6039cd2ff934a..fcf183fba9f892a9c059f970b1a25fac23ce789e 100644 --- a/tests/test_diagpreconcuda.cpp +++ b/tests/test_diagpreconcuda.cpp @@ -122,7 +122,7 @@ BOOST_FIXTURE_TEST_CASE(diagpreconcuda_pyr, Helmholtz3D_Pyr) } } -/*BOOST_FIXTURE_TEST_CASE(diagpreconcuda_tet, Helmholtz3D_Tet) +BOOST_FIXTURE_TEST_CASE(diagpreconcuda_tet, Helmholtz3D_Tet) { Configure(); SetTestCase( @@ -142,6 +142,6 @@ BOOST_FIXTURE_TEST_CASE(diagpreconcuda_pyr, Helmholtz3D_Pyr) fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), fixt_expected->GetStorage().GetCPUPtr(), 1.0E-10); } -}*/ +} BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/test_fwdtranscuda.cpp b/tests/test_fwdtranscuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..289b808dc0dc3c36b9953595ddd578350546249c --- /dev/null +++ b/tests/test_fwdtranscuda.cpp @@ -0,0 +1,141 @@ +#define BOOST_TEST_DYN_LINK +#define BOOST_TEST_MODULE TestFwdTransCUDA +#include <boost/test/tools/output_test_stream.hpp> +#include <boost/test/unit_test.hpp> + +#include <iostream> +#include <memory> + +#include "Operators/OperatorDiagPrecon.hpp" +#include "Operators/OperatorFwdTrans.hpp" +#include "init_fwdtransfields.hpp" + +BOOST_AUTO_TEST_SUITE(TestFwdTransCUDA) + +BOOST_FIXTURE_TEST_CASE(fwdtranscuda_seg, Helmholtz1D_Seg) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto FwdTransOp = FwdTrans<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + FwdTransOp->setPrecon(DiagPreconOp); + FwdTransOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-12); + } +} + +BOOST_FIXTURE_TEST_CASE(fwdtranscuda_tri_quad, Helmholtz2D_Tri_Quad) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto FwdTransOp = FwdTrans<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + FwdTransOp->setPrecon(DiagPreconOp); + 
FwdTransOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-08)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-08); + } +} + +BOOST_FIXTURE_TEST_CASE(fwdtranscuda_hex, Helmholtz3D_Hex) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto FwdTransOp = FwdTrans<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + FwdTransOp->setPrecon(DiagPreconOp); + FwdTransOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-08)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-08); + } +} + +BOOST_FIXTURE_TEST_CASE(fwdtranscuda_prism, Helmholtz3D_Prism) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto FwdTransOp = FwdTrans<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + FwdTransOp->setPrecon(DiagPreconOp); + FwdTransOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-08)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-08); + } +} + +BOOST_FIXTURE_TEST_CASE(fwdtranscuda_pyr, Helmholtz3D_Pyr) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto FwdTransOp = FwdTrans<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + FwdTransOp->setPrecon(DiagPreconOp); + FwdTransOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-09)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-09); + } +} + +BOOST_FIXTURE_TEST_CASE(fwdtranscuda_tet, Helmholtz3D_Tet) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto FwdTransOp = FwdTrans<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + FwdTransOp->setPrecon(DiagPreconOp); + FwdTransOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-08)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-08); + } +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/test_helmsolvecuda.cpp 
b/tests/test_helmsolvecuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3cb87e51c104e2cb06d1b873b56caa53e3a22dca --- /dev/null +++ b/tests/test_helmsolvecuda.cpp @@ -0,0 +1,147 @@ +#define BOOST_TEST_DYN_LINK +#define BOOST_TEST_MODULE TestHelmSolveCUDA +#include <boost/test/tools/output_test_stream.hpp> +#include <boost/test/unit_test.hpp> + +#include <iostream> +#include <memory> + +#include "Operators/OperatorDiagPrecon.hpp" +#include "Operators/OperatorHelmSolve.hpp" +#include "init_helmsolvefields.hpp" + +BOOST_AUTO_TEST_SUITE(TestHelmSolveCUDA) + +BOOST_FIXTURE_TEST_CASE(helmsolvecuda_seg, Helmholtz1D_Seg) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto HelmSolveOp = HelmSolve<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + HelmSolveOp->setPrecon(DiagPreconOp); + HelmSolveOp->setLambda(1.0); + HelmSolveOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-12); + } +} + +BOOST_FIXTURE_TEST_CASE(helmsolvecuda_tri_quad, Helmholtz2D_Tri_Quad) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto HelmSolveOp = HelmSolve<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + HelmSolveOp->setPrecon(DiagPreconOp); + HelmSolveOp->setLambda(1.0); + HelmSolveOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-12); + } +} + +BOOST_FIXTURE_TEST_CASE(helmsolvecuda_hex, Helmholtz3D_Hex) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto HelmSolveOp = HelmSolve<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + HelmSolveOp->setPrecon(DiagPreconOp); + HelmSolveOp->setLambda(1.0); + HelmSolveOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-10)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-10); + } +} + +BOOST_FIXTURE_TEST_CASE(helmsolvecuda_prism, Helmholtz3D_Prism) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto HelmSolveOp = HelmSolve<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + HelmSolveOp->setPrecon(DiagPreconOp); + HelmSolveOp->setLambda(1.0); + HelmSolveOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + 
BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-10)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-10); + } +} + +BOOST_FIXTURE_TEST_CASE(helmsolvecuda_pyr, Helmholtz3D_Pyr) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto HelmSolveOp = HelmSolve<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + HelmSolveOp->setPrecon(DiagPreconOp); + HelmSolveOp->setLambda(1.0); + HelmSolveOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-10)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-10); + } +} + +BOOST_FIXTURE_TEST_CASE(helmsolvecuda_tet, Helmholtz3D_Tet) +{ + Configure(); + SetTestCase( + fixtcuda_in->GetBlocks(), + fixtcuda_in->template GetStorage<MemoryRegionCUDA>().GetCPUPtr()); + auto HelmSolveOp = HelmSolve<>::create(fixt_explist, "CUDA"); + auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA"); + HelmSolveOp->setPrecon(DiagPreconOp); + HelmSolveOp->setLambda(1.0); + HelmSolveOp->apply(*fixtcuda_in, *fixtcuda_out); + ExpectedSolution(fixt_expected->GetBlocks(), + fixt_expected->GetStorage().GetCPUPtr()); + BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-10)); + boost::test_tools::output_test_stream output; + { + OutputIfNotMatch( + fixtcuda_out->template GetStorage<MemoryRegionCUDA>().GetCPUPtr(), + fixt_expected->GetStorage().GetCPUPtr(), 1.0E-10); + } +} + +BOOST_AUTO_TEST_SUITE_END()
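Note on the grid-stride pattern used throughout this patch: the kernels no longer assume one thread per element. Each thread advances through the element range in steps of the total launched thread count, so correctness no longer depends on the grid covering all elements, and the fixed fallback m_gridSize = 1024 is safe for any problem size. The sketch below is a minimal, self-contained illustration of that pattern under these assumptions; the helper mirrors GetCUDAGridSize from Operators/OperatorHelper.cuh, while ScaleKernel, its data, and main are hypothetical and not part of the patch.

#include <cstdio>
#include <cuda_runtime.h>

// Mirrors GetCUDAGridSize from Operators/OperatorHelper.cuh: round up so
// that ndata elements are covered by full blocks of blockSize threads.
static size_t GetCUDAGridSize(size_t ndata, size_t blockSize)
{
    return (ndata + blockSize - 1) / blockSize;
}

// Hypothetical kernel using the same grid-stride loop as the patched
// kernels: each thread strides by blockDim.x * gridDim.x, so every
// element is visited regardless of how many blocks were launched.
__global__ void ScaleKernel(const size_t n, const double alpha, double *data)
{
    size_t i = blockDim.x * blockIdx.x + threadIdx.x;

    while (i < n)
    {
        data[i] *= alpha;
        i += blockDim.x * gridDim.x;
    }
}

int main()
{
    const size_t n = 1000000, blockSize = 32;
    double *d_data;
    cudaMalloc((void **)&d_data, sizeof(double) * n);
    cudaMemset(d_data, 0, sizeof(double) * n);

    // Launch 1: grid sized to the data, one element per thread per pass.
    ScaleKernel<<<GetCUDAGridSize(n, blockSize), blockSize>>>(n, 2.0, d_data);

    // Launch 2: fixed grid (cf. the m_gridSize = 1024 default); the
    // stride loop still visits all n elements, just over several passes.
    ScaleKernel<<<1024, blockSize>>>(n, 2.0, d_data);

    cudaDeviceSynchronize();
    cudaFree(d_data);
    return 0;
}

The same reasoning explains why the constructors can precompute m_gridSize once (as NeuBndCondCUDA.hpp now does) instead of recomputing it on every apply(): the rounded-up grid is an efficiency choice, not a correctness requirement.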