diff --git a/Operators/DiagPrecon/DiagPreconCUDA.hpp b/Operators/DiagPrecon/DiagPreconCUDA.hpp index d952f9b659668ab9b6f0abf32a498213ca5363d2..9f03e23a1043bc7b12b63d2ae880698519957aae 100644 --- a/Operators/DiagPrecon/DiagPreconCUDA.hpp +++ b/Operators/DiagPrecon/DiagPreconCUDA.hpp @@ -3,7 +3,9 @@ #include "Field.hpp" #include "Operators/AssmbScatr/AssmbScatrCUDA.hpp" #include "Operators/CUDAMathKernels.cuh" +#include "Operators/DiagPrecon/DiagPreconCUDAKernel.cuh" #include "Operators/OperatorDiagPrecon.hpp" +#include "Operators/OperatorHelper.cuh" #include "Operators/OperatorRobBndCond.hpp" #include <MultiRegions/AssemblyMap/AssemblyMapCG.h> @@ -40,9 +42,6 @@ public: m_assmbScatr = std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplCUDA>>( AssmbScatr<TData>::create(this->m_expansionList, "CUDA")); - - // Deterime CUDA grid size. - m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize); } ~OperatorDiagPreconImpl(void) @@ -54,6 +53,9 @@ public: void apply(Field<TData, FieldState::Coeff> &in, Field<TData, FieldState::Coeff> &out) override { + // Deterime CUDA grid size. + m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize); + m_assmbScatr->Assemble(in, m_wk); vdivKernel<<<m_gridSize, m_blockSize>>>( @@ -71,7 +73,8 @@ public: // auto robBCOp = RobBndCond<TData>::create(this->m_expansionList, // "CUDA"); - Array<OneD, TData> diag(m_nLocal); + TData *diag; + cudaMalloc((void **)&diag, sizeof(TData) * m_nLocal); // create unit vector field to extract diagonal Field<TData, FieldState::Coeff> unit_vec = @@ -87,36 +90,58 @@ public: unit_vec.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); auto *actn_ptr = action.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); + auto diag_ptr = diag; - TData init = 1.0; - for (size_t i = 0; i < m_nLocal; ++i) + size_t expIdx = 0; + for (size_t block_idx = 0; block_idx < unit_vec.GetBlocks().size(); + ++block_idx) { - // set ith term in unit vector to be 1 - cudaMemcpy(uvec_ptr + i, &init, sizeof(TData), - cudaMemcpyHostToDevice); - - // apply operator to unit vector and store in action field - op->apply(unit_vec, action); - // robBCOp->apply(unit_vec, action); - - // copy ith row term from the action field to get ith diagonal - cudaMemcpy(diag.get() + i, actn_ptr + i, sizeof(TData), - cudaMemcpyDeviceToHost); - - // reset ith term in unit vector to be 0 - cudaMemset(uvec_ptr + i, 0, sizeof(TData)); + // Determine shape and type of the element. + auto const expPtr = this->m_expansionList->GetExp(expIdx); + auto nElmts = unit_vec.GetBlocks()[block_idx].num_elements; + auto nmTot = expPtr->GetNcoeffs(); + + // Deterime CUDA grid size. + m_gridSize = GetCUDAGridSize(nElmts, m_blockSize); + + for (size_t i = 0; i < nmTot; ++i) + { + // set ith term in unit vector to be 1 + SetDiagonalKernel<<<m_gridSize, m_blockSize>>>(nmTot, nElmts, i, + 1.0, uvec_ptr); + + // apply operator to unit vector and store in action field + op->apply(unit_vec, action); + // robBCOp->apply(unit_vec, action); + + // copy ith row term from the action field to get ith diagonal + CopyDiagonalKernel<<<m_gridSize, m_blockSize>>>( + nmTot, nElmts, i, actn_ptr, diag_ptr); + + // reset ith term in unit vector to be 0 + SetDiagonalKernel<<<m_gridSize, m_blockSize>>>(nmTot, nElmts, i, + 0.0, uvec_ptr); + } + + uvec_ptr += unit_vec.GetBlocks()[block_idx].block_size; + diag_ptr += unit_vec.GetBlocks()[block_idx].block_size; + actn_ptr += unit_vec.GetBlocks()[block_idx].block_size; + expIdx += nElmts; } // Assembly - Array<OneD, TData> tmp(m_nGlobal, 0.0); + Array<OneD, TData> locdiag(m_nLocal); + Array<OneD, TData> glodiag(m_nGlobal, 0.0); + cudaMemcpy(locdiag.get(), diag, sizeof(TData) * m_nLocal, + cudaMemcpyDeviceToHost); for (size_t i = 0; i < m_nLocal; ++i) { size_t gid1 = m_assmbMap->GetLocalToGlobalMap(i); - tmp[gid1] += diag[i]; + glodiag[gid1] += locdiag[i]; } - m_assmbMap->UniversalAssemble(tmp); + m_assmbMap->UniversalAssemble(glodiag); - cudaMemcpy(m_diag, tmp.get(), sizeof(TData) * m_nGlobal, + cudaMemcpy(m_diag, glodiag.get(), sizeof(TData) * m_nGlobal, cudaMemcpyHostToDevice); } diff --git a/Operators/DiagPrecon/DiagPreconCUDAKernel.cuh b/Operators/DiagPrecon/DiagPreconCUDAKernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..35ccedb02f6a6c715f51e28e98fdc13d72460de3 --- /dev/null +++ b/Operators/DiagPrecon/DiagPreconCUDAKernel.cuh @@ -0,0 +1,34 @@ +namespace Nektar::Operators::detail +{ + +template <typename TData> +__global__ void SetDiagonalKernel(const size_t nm, const size_t nelmt, + const size_t mode, const TData val, + TData *out) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + while (e < nelmt) + { + out[e * nm + mode] = val; + + e += blockDim.x * gridDim.x; + } +} + +template <typename TData> +__global__ void CopyDiagonalKernel(const size_t nm, const size_t nelmt, + const size_t mode, const TData *in, + TData *out) +{ + size_t e = blockDim.x * blockIdx.x + threadIdx.x; + + while (e < nelmt) + { + out[e * nm + mode] = in[e * nm + mode]; + + e += blockDim.x * gridDim.x; + } +} + +} // namespace Nektar::Operators::detail diff --git a/Operators/DiagPrecon/DiagPreconStdMat.hpp b/Operators/DiagPrecon/DiagPreconStdMat.hpp index 6f15ad269dc6acdc1524b3e760cb18f152579ae7..3865d4eb348e41df7321d4a66b95daf4b42602cf 100644 --- a/Operators/DiagPrecon/DiagPreconStdMat.hpp +++ b/Operators/DiagPrecon/DiagPreconStdMat.hpp @@ -78,20 +78,41 @@ public: auto *actn_ptr = action.GetStorage().GetCPUPtr(); auto *diag_ptr = diag.get(); - for (size_t i = 0; i < m_nLocal; ++i) + size_t expIdx = 0; + for (size_t block_idx = 0; block_idx < unit_vec.GetBlocks().size(); + ++block_idx) { - // set ith term in unit vector to be 1 - *uvec_ptr = 1.0; - - // apply operator to unit vector and store in action field - op->apply(unit_vec, action); - robBCOp->apply(unit_vec, action); - - // copy ith row term from the action field to get ith diagonal - *(diag_ptr++) = *(actn_ptr++); - - // reset ith term in unit vector to be 0 - *(uvec_ptr++) = 0.0; + // Determine shape and type of the element. + auto const expPtr = this->m_expansionList->GetExp(expIdx); + auto nElmts = unit_vec.GetBlocks()[block_idx].num_elements; + auto nmTot = expPtr->GetNcoeffs(); + + for (size_t i = 0; i < nmTot; ++i) + { + for (size_t j = 0; j < nElmts; ++j) + { + // set ith term in unit vector to be 1 + uvec_ptr[j * nmTot + i] = 1.0; + } + + // apply operator to unit vector and store in action field + op->apply(unit_vec, action); + robBCOp->apply(unit_vec, action); + + for (size_t j = 0; j < nElmts; ++j) + { + // copy ith row term from the action field to get ith + // diagonal + diag_ptr[j * nmTot + i] = actn_ptr[j * nmTot + i]; + + // reset ith term in unit vector to be 0 + uvec_ptr[j * nmTot + i] = 0.0; + } + } + uvec_ptr += unit_vec.GetBlocks()[block_idx].block_size; + diag_ptr += unit_vec.GetBlocks()[block_idx].block_size; + actn_ptr += unit_vec.GetBlocks()[block_idx].block_size; + expIdx += nElmts; } // Assembly