diff --git a/Operators/AssmbScatr/AssmbScatrCUDA.hpp b/Operators/AssmbScatr/AssmbScatrCUDA.hpp index 1aa2a3097ea1d0a7c7902b9baf37dc47358a5d52..2aeee61909da1f9b4f8add1517e1003830c7be82 100644 --- a/Operators/AssmbScatr/AssmbScatrCUDA.hpp +++ b/Operators/AssmbScatr/AssmbScatrCUDA.hpp @@ -1,7 +1,8 @@ +#pragma once + #include "MemoryRegionCUDA.hpp" #include "Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh" #include "Operators/OperatorAssmbScatr.hpp" -#include "Operators/OperatorHelper.cuh" #include <MultiRegions/AssemblyMap/AssemblyMapCG.h> #include <MultiRegions/ContField.h> @@ -24,21 +25,20 @@ public: std::dynamic_pointer_cast<ContField>(this->m_expansionList); m_assmbMap = contfield->GetLocalToGlobalMap(); - // Get the solution type - GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType(); - auto nloc = m_assmbMap->GetNumLocalCoeffs(); - auto nglo = (solnType == eIterativeFull) - ? m_assmbMap->GetNumGlobalCoeffs() - : m_assmbMap->GetNumGlobalBndCoeffs(); + m_solnType = m_assmbMap->GetGlobalSysSolnType(); + m_nloc = m_assmbMap->GetNumLocalCoeffs(); + m_nglo = (m_solnType == eIterativeFull) + ? m_assmbMap->GetNumGlobalCoeffs() + : m_assmbMap->GetNumGlobalBndCoeffs(); + m_ndir = m_assmbMap->GetNumGlobalDirBndCoeffs(); - // Memory allocation for tmp pointer - cudaMalloc((void **)&m_tmpptr, sizeof(TData) * nglo); - cudaMemset(m_tmpptr, 0.0, sizeof(TData) * nglo); + // Memory allocation for global space pointer + cudaMalloc((void **)&m_gloptr, sizeof(TData) * m_nglo); // Memory allocation for assemble pointer - cudaMalloc((void **)&m_assmbptr, sizeof(int) * nloc); + cudaMalloc((void **)&m_assmbptr, sizeof(int) * m_nloc); auto assmbptr = m_assmbMap->GetLocalToGlobalMap().get(); - cudaMemcpy(m_assmbptr, assmbptr, sizeof(int) * nloc, + cudaMemcpy(m_assmbptr, assmbptr, sizeof(int) * m_nloc, cudaMemcpyHostToDevice); m_signChange = m_assmbMap->AssemblyMap::GetSignChange(); @@ -46,23 +46,20 @@ public: if (m_signChange) { // Memory allocation for sign pointer - cudaMalloc((void **)&m_signptr, sizeof(TData) * nloc); + cudaMalloc((void **)&m_signptr, sizeof(TData) * m_nloc); auto signptr = m_assmbMap->GetLocalToGlobalSign().get(); - cudaMemcpy(m_signptr, signptr, sizeof(TData) * nloc, + cudaMemcpy(m_signptr, signptr, sizeof(TData) * m_nloc, cudaMemcpyHostToDevice); } } ~OperatorAssmbScatrImpl() { - cudaFree(m_tmpptr); + cudaFree(m_gloptr); cudaFree(m_assmbptr); - m_tmpptr = nullptr; - m_assmbptr = nullptr; if (m_signChange) { cudaFree(m_signptr); - m_signptr = nullptr; } } @@ -70,16 +67,15 @@ public: Field<TData, FieldState::Coeff> &out, const bool &zeroDir = false) { - Assemble(in, m_tmpptr); + Assemble(in, m_gloptr); // Zeroing Dirichlet BC if (zeroDir) { - size_t nDir = m_assmbMap->GetNumGlobalDirBndCoeffs(); - cudaMemset(m_tmpptr, 0.0, sizeof(TData) * nDir); + cudaMemset(m_gloptr, 0, sizeof(TData) * m_ndir); } - GlobalToLocal(m_tmpptr, out); + GlobalToLocal(m_gloptr, out); } void Assemble(Field<TData, FieldState::Coeff> &in, TData *outptr) @@ -87,10 +83,10 @@ public: auto const *inptr = in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - // Get the solution type - GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType(); + // Zero output + cudaMemset(outptr, 0, sizeof(TData) * m_nglo); - if (solnType == eIterativeFull) + if (m_solnType == eIterativeFull) { // Initialise index size_t expIdx = 0; @@ -102,7 +98,7 @@ public: // Determine shape and type of the element. auto const expPtr = this->m_expansionList->GetExp(expIdx); auto nElmts = in.GetBlocks()[block_idx].num_elements; - auto numPts = expPtr->GetNcoeffs(); + auto ncoeff = expPtr->GetNcoeffs(); // Deterime CUDA grid parameters. m_gridSize = nElmts / m_blockSize; @@ -111,17 +107,17 @@ public: if (m_signChange) { AssembleKernel<<<m_gridSize, m_blockSize>>>( - numPts, nElmts, offset, m_assmbptr, m_signptr, inptr, + ncoeff, nElmts, offset, m_assmbptr, m_signptr, inptr, outptr); } else { AssembleKernel<<<m_gridSize, m_blockSize>>>( - numPts, nElmts, offset, m_assmbptr, inptr, outptr); + ncoeff, nElmts, offset, m_assmbptr, inptr, outptr); } // Increment pointer and index for next element type. - offset += numPts * nElmts; + offset += ncoeff * nElmts; expIdx += nElmts; } } @@ -131,10 +127,10 @@ public: { auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); - // Get the solution type - GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType(); + // Zero output + cudaMemset(outptr, 0, sizeof(TData) * m_nloc); - if (solnType == eIterativeFull) + if (m_solnType == eIterativeFull) { // Initialise index size_t expIdx = 0; @@ -146,7 +142,7 @@ public: // Determine shape and type of the element. auto const expPtr = this->m_expansionList->GetExp(expIdx); auto nElmts = out.GetBlocks()[block_idx].num_elements; - auto numPts = expPtr->GetNcoeffs(); + auto ncoeff = expPtr->GetNcoeffs(); // Deterime CUDA grid parameters. m_gridSize = nElmts / m_blockSize; @@ -155,17 +151,17 @@ public: if (m_signChange) { GlobalToLocalKernel<<<m_gridSize, m_blockSize>>>( - numPts, nElmts, offset, m_assmbptr, m_signptr, inptr, + ncoeff, nElmts, offset, m_assmbptr, m_signptr, inptr, outptr); } else { GlobalToLocalKernel<<<m_gridSize, m_blockSize>>>( - numPts, nElmts, offset, m_assmbptr, inptr, outptr); + ncoeff, nElmts, offset, m_assmbptr, inptr, outptr); } // Increment pointer and index for next element type. - offset += numPts * nElmts; + offset += ncoeff * nElmts; expIdx += nElmts; } } @@ -184,10 +180,14 @@ public: protected: AssemblyMapCGSharedPtr m_assmbMap; - TData *m_tmpptr; + GlobalSysSolnType m_solnType; + TData *m_gloptr; bool m_signChange; TData *m_signptr; int *m_assmbptr; + size_t m_nloc; + size_t m_nglo; + size_t m_ndir; size_t m_gridSize; size_t m_blockSize = 32; }; diff --git a/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh b/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh index ae8c2c20e7afdd4117cd855054034b4fa42d48d7..e1cfacd8862a7dd0e7896bece9362dbf0b87db31 100644 --- a/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh +++ b/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh @@ -2,7 +2,7 @@ namespace Nektar::Operators::detail { template <typename TData> -__global__ void AssembleKernel(const size_t numPts, const size_t nelmt, +__global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt, const size_t offset, const int *assmbptr, const TData *signptr, const TData *inptr, TData *outptr) @@ -14,9 +14,9 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt, return; } - size_t index = offset + e * numPts; + size_t index = offset + e * ncoeff; - for (size_t i = 0; i < numPts; i++) + for (size_t i = 0; i < ncoeff; i++) { atomicAdd(outptr + assmbptr[index + i], signptr[index + i] * inptr[index + i]); @@ -24,7 +24,7 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt, } template <typename TData> -__global__ void AssembleKernel(const size_t numPts, const size_t nelmt, +__global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt, const size_t offset, const int *assmbptr, const TData sign, const TData *inptr, TData *outptr) @@ -36,16 +36,16 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt, return; } - size_t index = offset + e * numPts; + size_t index = offset + e * ncoeff; - for (size_t i = 0; i < numPts; i++) + for (size_t i = 0; i < ncoeff; i++) { atomicAdd(outptr + assmbptr[index + i], sign * inptr[index + i]); } } template <typename TData> -__global__ void AssembleKernel(const size_t numPts, const size_t nelmt, +__global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt, const size_t offset, const int *assmbptr, const TData *inptr, TData *outptr) { @@ -56,16 +56,16 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt, return; } - size_t index = offset + e * numPts; + size_t index = offset + e * ncoeff; - for (size_t i = 0; i < numPts; i++) + for (size_t i = 0; i < ncoeff; i++) { atomicAdd(outptr + assmbptr[index + i], inptr[index + i]); } } template <typename TData> -__global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt, +__global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt, const size_t offset, const int *assmbptr, const TData *signptr, const TData *inptr, TData *outptr) @@ -77,16 +77,16 @@ __global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt, return; } - size_t index = offset + e * numPts; + size_t index = offset + e * ncoeff; - for (size_t i = 0; i < numPts; i++) + for (size_t i = 0; i < ncoeff; i++) { outptr[index + i] = signptr[index + i] * inptr[assmbptr[index + i]]; } } template <typename TData> -__global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt, +__global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt, const size_t offset, const int *assmbptr, const TData sign, const TData *inptr, TData *outptr) @@ -98,16 +98,16 @@ __global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt, return; } - size_t index = offset + e * numPts; + size_t index = offset + e * ncoeff; - for (size_t i = 0; i < numPts; i++) + for (size_t i = 0; i < ncoeff; i++) { outptr[index + i] = sign * inptr[assmbptr[index + i]]; } } template <typename TData> -__global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt, +__global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt, const size_t offset, const int *assmbptr, const TData *inptr, TData *outptr) { @@ -118,9 +118,9 @@ __global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt, return; } - size_t index = offset + e * numPts; + size_t index = offset + e * ncoeff; - for (size_t i = 0; i < numPts; i++) + for (size_t i = 0; i < ncoeff; i++) { outptr[index + i] = inptr[assmbptr[index + i]]; } diff --git a/Operators/AssmbScatr/AssmbScatrStdMat.hpp b/Operators/AssmbScatr/AssmbScatrStdMat.hpp index 45cc9f67f8460701675504769113b8c883a8f53f..7f51eb941cfcf532b774ec1d19fde57ea166856d 100644 --- a/Operators/AssmbScatr/AssmbScatrStdMat.hpp +++ b/Operators/AssmbScatr/AssmbScatrStdMat.hpp @@ -2,7 +2,6 @@ #include "Operators/OperatorAssmbScatr.hpp" -#include <LibUtilities/BasicUtils/Vmath.hpp> #include <MultiRegions/AssemblyMap/AssemblyMapCG.h> #include <MultiRegions/ContField.h> @@ -25,37 +24,35 @@ public: std::dynamic_pointer_cast<ContField>(this->m_expansionList); m_assmbMap = contfield->GetLocalToGlobalMap(); - GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType(); - auto nglo = (solnType == eIterativeFull) - ? m_assmbMap->GetNumGlobalCoeffs() - : m_assmbMap->GetNumGlobalBndCoeffs(); + m_solnType = m_assmbMap->GetGlobalSysSolnType(); + m_nloc = m_assmbMap->GetNumLocalCoeffs(); + m_nglo = (m_solnType == eIterativeFull) + ? m_assmbMap->GetNumGlobalCoeffs() + : m_assmbMap->GetNumGlobalBndCoeffs(); + m_ndir = m_assmbMap->GetNumGlobalDirBndCoeffs(); - m_tmp = Array<OneD, TData>(nglo); + m_glo = Array<OneD, TData>(m_nglo); } void apply(Field<TData, FieldState::Coeff> &in, Field<TData, FieldState::Coeff> &out, const bool &zeroDir = false) { - Assemble(in, m_tmp); + Assemble(in, m_glo); + // Zeroing Dirichlet BC if (zeroDir) { - auto nDir = m_assmbMap->GetNumGlobalDirBndCoeffs(); - Vmath::Zero(nDir, m_tmp.get(), 1); + Vmath::Zero(m_ndir, m_glo.get(), 1); } - GlobalToLocal(m_tmp, out); + GlobalToLocal(m_glo, out); } void Assemble(Field<TData, FieldState::Coeff> &in, Array<OneD, TData> outarray) { - // Get the solution type - GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType(); - - auto nloc = m_assmbMap->GetNumLocalCoeffs(); - Array<OneD, TData> inarray(nloc); + Array<OneD, TData> inarray(m_nloc); // Copy data from input field auto *inarrptr = inarray.data(); @@ -73,7 +70,7 @@ public: inptr += nSize; } - if (solnType == eIterativeFull) + if (m_solnType == eIterativeFull) { m_assmbMap->Assemble(inarray, outarray); } @@ -86,13 +83,9 @@ public: void GlobalToLocal(Array<OneD, TData> inarray, Field<TData, FieldState::Coeff> &out) { - // Get the solution type - GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType(); - - auto nloc = m_assmbMap->GetNumLocalCoeffs(); - Array<OneD, TData> outarray(nloc); + Array<OneD, TData> outarray(m_nloc); - if (solnType == eIterativeFull) + if (m_solnType == eIterativeFull) { m_assmbMap->GlobalToLocal(inarray, outarray); } @@ -131,7 +124,11 @@ public: protected: AssemblyMapCGSharedPtr m_assmbMap; - Array<OneD, TData> m_tmp; + GlobalSysSolnType m_solnType; + Array<OneD, TData> m_glo; + size_t m_nloc; + size_t m_nglo; + size_t m_ndir; }; } // namespace Nektar::Operators::detail