Skip to content
Snippets Groups Projects
Commit a2912074 authored by Jacques Xing's avatar Jacques Xing Committed by Chris Cantwell
Browse files

Improve efficiency of DiagPrecon operators

parent 47f84d4e
No related branches found
No related tags found
1 merge request!76Improve efficiency of DiagPrecon operators
...@@ -3,7 +3,9 @@ ...@@ -3,7 +3,9 @@
#include "Field.hpp" #include "Field.hpp"
#include "Operators/AssmbScatr/AssmbScatrCUDA.hpp" #include "Operators/AssmbScatr/AssmbScatrCUDA.hpp"
#include "Operators/CUDAMathKernels.cuh" #include "Operators/CUDAMathKernels.cuh"
#include "Operators/DiagPrecon/DiagPreconCUDAKernel.cuh"
#include "Operators/OperatorDiagPrecon.hpp" #include "Operators/OperatorDiagPrecon.hpp"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorRobBndCond.hpp" #include "Operators/OperatorRobBndCond.hpp"
#include <MultiRegions/AssemblyMap/AssemblyMapCG.h> #include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
...@@ -40,9 +42,6 @@ public: ...@@ -40,9 +42,6 @@ public:
m_assmbScatr = m_assmbScatr =
std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplCUDA>>( std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplCUDA>>(
AssmbScatr<TData>::create(this->m_expansionList, "CUDA")); AssmbScatr<TData>::create(this->m_expansionList, "CUDA"));
// Deterime CUDA grid size.
m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize);
} }
~OperatorDiagPreconImpl(void) ~OperatorDiagPreconImpl(void)
...@@ -54,6 +53,9 @@ public: ...@@ -54,6 +53,9 @@ public:
void apply(Field<TData, FieldState::Coeff> &in, void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) override Field<TData, FieldState::Coeff> &out) override
{ {
// Determine CUDA grid size.
m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize);
m_assmbScatr->Assemble(in, m_wk); m_assmbScatr->Assemble(in, m_wk);
vdivKernel<<<m_gridSize, m_blockSize>>>( vdivKernel<<<m_gridSize, m_blockSize>>>(
...@@ -71,7 +73,8 @@ public: ...@@ -71,7 +73,8 @@ public:
// auto robBCOp = RobBndCond<TData>::create(this->m_expansionList, // auto robBCOp = RobBndCond<TData>::create(this->m_expansionList,
// "CUDA"); // "CUDA");
Array<OneD, TData> diag(m_nLocal); TData *diag;
cudaMalloc((void **)&diag, sizeof(TData) * m_nLocal);
// create unit vector field to extract diagonal // create unit vector field to extract diagonal
Field<TData, FieldState::Coeff> unit_vec = Field<TData, FieldState::Coeff> unit_vec =
...@@ -87,36 +90,58 @@ public: ...@@ -87,36 +90,58 @@ public:
unit_vec.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); unit_vec.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *actn_ptr = auto *actn_ptr =
action.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(); action.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto diag_ptr = diag;
TData init = 1.0; size_t expIdx = 0;
for (size_t i = 0; i < m_nLocal; ++i) for (size_t block_idx = 0; block_idx < unit_vec.GetBlocks().size();
++block_idx)
{ {
// set ith term in unit vector to be 1 // Determine shape and type of the element.
cudaMemcpy(uvec_ptr + i, &init, sizeof(TData), auto const expPtr = this->m_expansionList->GetExp(expIdx);
cudaMemcpyHostToDevice); auto nElmts = unit_vec.GetBlocks()[block_idx].num_elements;
auto nmTot = expPtr->GetNcoeffs();
// apply operator to unit vector and store in action field
op->apply(unit_vec, action); // Deterime CUDA grid size.
// robBCOp->apply(unit_vec, action); m_gridSize = GetCUDAGridSize(nElmts, m_blockSize);
// copy ith row term from the action field to get ith diagonal for (size_t i = 0; i < nmTot; ++i)
cudaMemcpy(diag.get() + i, actn_ptr + i, sizeof(TData), {
cudaMemcpyDeviceToHost); // set ith term in unit vector to be 1
SetDiagonalKernel<<<m_gridSize, m_blockSize>>>(nmTot, nElmts, i,
// reset ith term in unit vector to be 0 1.0, uvec_ptr);
cudaMemset(uvec_ptr + i, 0, sizeof(TData));
// apply operator to unit vector and store in action field
op->apply(unit_vec, action);
// robBCOp->apply(unit_vec, action);
// copy ith row term from the action field to get ith diagonal
CopyDiagonalKernel<<<m_gridSize, m_blockSize>>>(
nmTot, nElmts, i, actn_ptr, diag_ptr);
// reset ith term in unit vector to be 0
SetDiagonalKernel<<<m_gridSize, m_blockSize>>>(nmTot, nElmts, i,
0.0, uvec_ptr);
}
uvec_ptr += unit_vec.GetBlocks()[block_idx].block_size;
diag_ptr += unit_vec.GetBlocks()[block_idx].block_size;
actn_ptr += unit_vec.GetBlocks()[block_idx].block_size;
expIdx += nElmts;
} }
// Assembly // Assembly
Array<OneD, TData> tmp(m_nGlobal, 0.0); Array<OneD, TData> locdiag(m_nLocal);
Array<OneD, TData> glodiag(m_nGlobal, 0.0);
cudaMemcpy(locdiag.get(), diag, sizeof(TData) * m_nLocal,
cudaMemcpyDeviceToHost);
for (size_t i = 0; i < m_nLocal; ++i) for (size_t i = 0; i < m_nLocal; ++i)
{ {
size_t gid1 = m_assmbMap->GetLocalToGlobalMap(i); size_t gid1 = m_assmbMap->GetLocalToGlobalMap(i);
tmp[gid1] += diag[i]; glodiag[gid1] += locdiag[i];
} }
m_assmbMap->UniversalAssemble(tmp); m_assmbMap->UniversalAssemble(glodiag);
cudaMemcpy(m_diag, tmp.get(), sizeof(TData) * m_nGlobal, cudaMemcpy(m_diag, glodiag.get(), sizeof(TData) * m_nGlobal,
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
} }
......
namespace Nektar::Operators::detail
{

/// Set coefficient (mode) `mode` of every element to `val`.
///
/// Layout: `out` holds `nelmt` contiguous element blocks of `nm`
/// coefficients each, so the target entry of element `e` is
/// `out[e * nm + mode]`. One thread per element; the grid-stride loop
/// makes the kernel correct for any 1-D launch configuration.
///
/// \param nm    Number of coefficients (modes) per element.
/// \param nelmt Number of elements to process.
/// \param mode  Coefficient index to write (precondition: mode < nm).
/// \param val   Value stored into the selected coefficient.
/// \param out   Device pointer to nelmt * nm coefficients (written).
template <typename TData>
__global__ void SetDiagonalKernel(const size_t nm, const size_t nelmt,
                                  const size_t mode, const TData val,
                                  TData *out)
{
    // Widen BEFORE multiplying: blockDim.x * blockIdx.x is a 32-bit
    // unsigned product and can overflow on very large grids.
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t e = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
         e < nelmt; e += stride)
    {
        out[e * nm + mode] = val;
    }
}

/// Copy coefficient (mode) `mode` of every element from `in` to `out`.
///
/// Same per-element layout as SetDiagonalKernel: entry `mode` of element
/// `e` lives at offset `e * nm + mode` in both arrays. `in` and `out`
/// must not alias (__restrict__), which also enables the read-only
/// data-cache path for `in`.
///
/// \param nm    Number of coefficients (modes) per element.
/// \param nelmt Number of elements to process.
/// \param mode  Coefficient index to copy (precondition: mode < nm).
/// \param in    Device pointer to nelmt * nm coefficients (read).
/// \param out   Device pointer to nelmt * nm coefficients (written).
template <typename TData>
__global__ void CopyDiagonalKernel(const size_t nm, const size_t nelmt,
                                   const size_t mode,
                                   const TData *__restrict__ in,
                                   TData *__restrict__ out)
{
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t e = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
         e < nelmt; e += stride)
    {
        out[e * nm + mode] = in[e * nm + mode];
    }
}

} // namespace Nektar::Operators::detail
...@@ -78,20 +78,41 @@ public: ...@@ -78,20 +78,41 @@ public:
auto *actn_ptr = action.GetStorage().GetCPUPtr(); auto *actn_ptr = action.GetStorage().GetCPUPtr();
auto *diag_ptr = diag.get(); auto *diag_ptr = diag.get();
for (size_t i = 0; i < m_nLocal; ++i) size_t expIdx = 0;
for (size_t block_idx = 0; block_idx < unit_vec.GetBlocks().size();
++block_idx)
{ {
// set ith term in unit vector to be 1 // Determine shape and type of the element.
*uvec_ptr = 1.0; auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = unit_vec.GetBlocks()[block_idx].num_elements;
// apply operator to unit vector and store in action field auto nmTot = expPtr->GetNcoeffs();
op->apply(unit_vec, action);
robBCOp->apply(unit_vec, action); for (size_t i = 0; i < nmTot; ++i)
{
// copy ith row term from the action field to get ith diagonal for (size_t j = 0; j < nElmts; ++j)
*(diag_ptr++) = *(actn_ptr++); {
// set ith term in unit vector to be 1
// reset ith term in unit vector to be 0 uvec_ptr[j * nmTot + i] = 1.0;
*(uvec_ptr++) = 0.0; }
// apply operator to unit vector and store in action field
op->apply(unit_vec, action);
robBCOp->apply(unit_vec, action);
for (size_t j = 0; j < nElmts; ++j)
{
// copy ith row term from the action field to get ith
// diagonal
diag_ptr[j * nmTot + i] = actn_ptr[j * nmTot + i];
// reset ith term in unit vector to be 0
uvec_ptr[j * nmTot + i] = 0.0;
}
}
uvec_ptr += unit_vec.GetBlocks()[block_idx].block_size;
diag_ptr += unit_vec.GetBlocks()[block_idx].block_size;
actn_ptr += unit_vec.GetBlocks()[block_idx].block_size;
expIdx += nElmts;
} }
// Assembly // Assembly
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment