From d5caaf968e940b160829213825bdb79f9871b129 Mon Sep 17 00:00:00 2001
From: Jacques Xing <jacques.xing@kcl.ac.uk>
Date: Thu, 18 Jan 2024 08:06:53 +0000
Subject: [PATCH] Implement NeuBndCond operator for CUDA

---
 CMakeLists.txt                                |   2 +-
 Operators/NeuBndCond/NeuBndCondCUDA.cu        |  13 ++
 Operators/NeuBndCond/NeuBndCondCUDA.hpp       | 151 ++++++++++++++++++
 .../NeuBndCond/NeuBndCondCUDAKernels.cuh      |  62 +++++++
 Operators/NeuBndCond/NeuBndCondStdMat.hpp     |  10 +-
 tests/CMakeLists.txt                          |  19 +++
 tests/init_neumannfields.hpp                  |  72 +++++++++
 tests/test_neumanncuda.cpp                    |  92 +++++++++++
 8 files changed, 415 insertions(+), 6 deletions(-)
 create mode 100644 Operators/NeuBndCond/NeuBndCondCUDA.cu
 create mode 100644 Operators/NeuBndCond/NeuBndCondCUDA.hpp
 create mode 100644 Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh
 create mode 100644 tests/init_neumannfields.hpp
 create mode 100644 tests/test_neumanncuda.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ecfd7de..1fed5c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,7 @@ if (NEKTAR_USE_CUDA)
                    Operators/NullPrecon/NullPreconCUDA.cu 
                    #Operators/DiagPrecon/DiagPreconCUDA.cu
                    Operators/DirBndCond/DirBndCondCUDA.cu
-                   #Operators/NeuBndCond/NeuBndCondCUDA.cu
+                   Operators/NeuBndCond/NeuBndCondCUDA.cu
                    #Operators/RobBndCond/RobBndCondCUDA.cu
                    #Operators/ConjGrad/ConjGradCUDA.cu
                    #Operators/FwdTrans/FwdTransCUDA.cu
diff --git a/Operators/NeuBndCond/NeuBndCondCUDA.cu b/Operators/NeuBndCond/NeuBndCondCUDA.cu
new file mode 100644
index 0000000..7968af2
--- /dev/null
+++ b/Operators/NeuBndCond/NeuBndCondCUDA.cu
@@ -0,0 +1,13 @@
+#include "NeuBndCondCUDA.hpp"
+
+namespace Nektar::Operators::detail
+{
+
+// Register implementation with Operator Factory
+template <>
+std::string OperatorNeuBndCondImpl<double, ImplCUDA>::className =
+    GetOperatorFactory<default_fp_type>().RegisterCreatorFunction(
+        "NeuBndCondCUDA", OperatorNeuBndCondImpl<double, ImplCUDA>::instantiate,
+        "...");
+
+} // namespace Nektar::Operators::detail
diff --git a/Operators/NeuBndCond/NeuBndCondCUDA.hpp b/Operators/NeuBndCond/NeuBndCondCUDA.hpp
new file mode 100644
index 0000000..86543eb
--- /dev/null
+++ b/Operators/NeuBndCond/NeuBndCondCUDA.hpp
@@ -0,0 +1,151 @@
+#pragma once
+
+#include "MemoryRegionCUDA.hpp"
+#include "Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh"
+#include "Operators/OperatorNeuBndCond.hpp"
+
+#include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
+#include <MultiRegions/ContField.h>
+#include <SpatialDomains/Conditions.h>
+
+using namespace Nektar;
+using namespace Nektar::MultiRegions;
+
+namespace Nektar::Operators::detail
+{
+template <typename TData>
+class OperatorNeuBndCondImpl<TData, ImplCUDA> : public OperatorNeuBndCond<TData>
+{
+public:
+    OperatorNeuBndCondImpl(const MultiRegions::ExpListSharedPtr &expansionList)
+        : OperatorNeuBndCond<TData>(expansionList)
+    {
+        auto contfield =
+            std::dynamic_pointer_cast<ContField>(this->m_expansionList);
+        auto assmbMap = contfield->GetLocalToGlobalMap();
+        m_bndExpSize  = contfield->GetBndCondExpansions().size();
+        m_signChange  = assmbMap->GetSignChange();
+
+        // Memory allocation
+        if (m_signChange)
+        {
+            auto &sign = assmbMap->GetBndCondCoeffsToLocalCoeffsSign();
+            cudaMalloc((void **)&m_sign, sizeof(TData) * sign.size());
+            cudaMemcpy(m_sign, sign.get(), sizeof(TData) * sign.size(),
+                       cudaMemcpyHostToDevice);
+        }
+
+        auto &map = assmbMap->GetBndCondCoeffsToLocalCoeffsMap();
+        cudaMalloc((void **)&m_map, sizeof(int) * map.size());
+        cudaMemcpy(m_map, map.get(), sizeof(int) * map.size(),
+                   cudaMemcpyHostToDevice);
+
+        Array<OneD, int> offset(m_bndExpSize, 0);
+        for (size_t i = 1; i < offset.size(); ++i)
+        {
+            offset[i] = offset[i - 1] +
+                        contfield->GetBndCondExpansions()[i - 1]->GetNcoeffs();
+        }
+        cudaMalloc((void **)&m_offset, sizeof(int) * offset.size());
+        cudaMemcpy(m_offset, offset.get(), sizeof(int) * offset.size(),
+                   cudaMemcpyHostToDevice);
+
+        Array<OneD, int> ncoeff(m_bndExpSize);
+        size_t ntotcoeff = 0;
+        for (size_t i = 0; i < ncoeff.size(); ++i)
+        {
+            ncoeff[i] = contfield->GetBndCondExpansions()[i]->GetNcoeffs();
+            ntotcoeff += contfield->GetBndCondExpansions()[i]->GetNcoeffs();
+        }
+        cudaMalloc((void **)&m_ncoeff, sizeof(int) * ncoeff.size());
+        cudaMemcpy(m_ncoeff, ncoeff.get(), sizeof(int) * ncoeff.size(),
+                   cudaMemcpyHostToDevice);
+
+        Array<OneD, BoundaryConditionType> bctype(m_bndExpSize);
+        for (size_t i = 0; i < bctype.size(); ++i)
+        {
+            bctype[i] =
+                contfield->GetBndConditions()[i]->GetBoundaryConditionType();
+        }
+        cudaMalloc((void **)&m_bctype,
+                   sizeof(BoundaryConditionType) * bctype.size());
+        cudaMemcpy(m_bctype, bctype.get(),
+                   sizeof(BoundaryConditionType) * bctype.size(),
+                   cudaMemcpyHostToDevice);
+
+        Array<OneD, TData> coeff(ntotcoeff);
+        for (size_t i = 0; i < offset.size(); ++i)
+        {
+            for (size_t j = 0; j < ncoeff[i]; ++j)
+            {
+                coeff[offset[i] + j] =
+                    contfield->GetBndCondExpansions()[i]->GetCoeffs()[j];
+            }
+        }
+        cudaMalloc((void **)&m_coeff, sizeof(TData) * coeff.size());
+        cudaMemcpy(m_coeff, coeff.get(), sizeof(TData) * coeff.size(),
+                   cudaMemcpyHostToDevice);
+    }
+
+    ~OperatorNeuBndCondImpl(void)
+    {
+        if (m_signChange)
+        {
+            cudaFree(m_sign);
+        }
+        cudaFree(m_map);
+        cudaFree(m_offset);
+        cudaFree(m_ncoeff);
+        cudaFree(m_bctype);
+        cudaFree(m_coeff);
+    }
+
+    void apply(Field<TData, FieldState::Coeff> &inout) override
+    {
+        // Copy memory to GPU, if necessary and get raw pointers.
+        auto *inoutptr =
+            inout.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+
+        // Deterime CUDA grid parameters.
+        m_gridSize = m_bndExpSize / m_blockSize;
+        m_gridSize += (m_bndExpSize % m_blockSize == 0) ? 0 : 1;
+
+        if (m_signChange)
+        {
+            NeuBndCondKernel<TData><<<m_gridSize, m_blockSize>>>(
+                m_bndExpSize, m_offset, m_bctype, m_ncoeff, m_sign, m_map,
+                m_coeff, inoutptr);
+        }
+        else
+        {
+            NeuBndCondKernel<TData><<<m_gridSize, m_blockSize>>>(
+                m_bndExpSize, m_offset, m_bctype, m_ncoeff, m_map, m_coeff,
+                inoutptr);
+        }
+    }
+
+    // instantiation function for CreatorFunction in OperatorFactory
+    static std::unique_ptr<Operator<TData>> instantiate(
+        const MultiRegions::ExpListSharedPtr &expansionList)
+    {
+        return std::make_unique<OperatorNeuBndCondImpl<TData, ImplCUDA>>(
+            expansionList);
+    }
+
+    // className - for OperatorFactory
+    static std::string className;
+
+protected:
+    bool m_signChange;
+    size_t m_bndExpSize;
+    BoundaryConditionType *m_bctype;
+    TData *m_coeff;
+    TData *m_sign;
+    int *m_map;
+    int *m_ncoeff;
+    int *m_offset;
+    size_t m_gridSize;
+    size_t m_blockSize = 32;
+};
+
+} // namespace Nektar::Operators::detail
diff --git a/Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh b/Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh
new file mode 100644
index 0000000..1ad7f1f
--- /dev/null
+++ b/Operators/NeuBndCond/NeuBndCondCUDAKernels.cuh
@@ -0,0 +1,62 @@
+#include <SpatialDomains/Conditions.h>
+
+using namespace Nektar;
+using namespace Nektar::SpatialDomains;
+
+namespace Nektar::Operators::detail
+{
+
+template <typename TData>
+__global__ void NeuBndCondKernel(const size_t nsize, const int *offsetptr,
+                                 const BoundaryConditionType *bctypeptr,
+                                 const int *ncoeffptr, const int *mapptr,
+                                 const TData *inptr, TData *outptr)
+{
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i >= nsize)
+    {
+        return;
+    }
+
+    if (bctypeptr[i] == eNeumann || bctypeptr[i] == eRobin)
+    {
+        size_t offset = offsetptr[i];
+        size_t ncoeff = ncoeffptr[i];
+        for (size_t j = 0; j < ncoeff; j++)
+        {
+            // atomicAdd(outptr + mapptr[offset + j], inptr[offset + j]);
+            outptr[mapptr[offset + j]] += inptr[offset + j];
+        }
+    }
+}
+
+template <typename TData>
+__global__ void NeuBndCondKernel(const size_t nsize, const int *offsetptr,
+                                 const BoundaryConditionType *bctypeptr,
+                                 const int *ncoeffptr, const TData *signptr,
+                                 const int *mapptr, const TData *inptr,
+                                 TData *outptr)
+{
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i >= nsize)
+    {
+        return;
+    }
+
+    if (bctypeptr[i] == eNeumann || bctypeptr[i] == eRobin)
+    {
+        size_t offset = offsetptr[i];
+        size_t ncoeff = ncoeffptr[i];
+        for (size_t j = 0; j < ncoeff; j++)
+        {
+            // atomicAdd(outptr + mapptr[offset + j], signptr[offset + j] *
+            // inptr[offset + j]);
+            outptr[mapptr[offset + j]] +=
+                signptr[offset + j] * inptr[offset + j];
+        }
+    }
+}
+
+} // namespace Nektar::Operators::detail
diff --git a/Operators/NeuBndCond/NeuBndCondStdMat.hpp b/Operators/NeuBndCond/NeuBndCondStdMat.hpp
index 91ac430..792c0af 100644
--- a/Operators/NeuBndCond/NeuBndCondStdMat.hpp
+++ b/Operators/NeuBndCond/NeuBndCondStdMat.hpp
@@ -34,8 +34,8 @@ public:
         auto ncoeffs = m_assmbMap->GetNumLocalCoeffs();
 
         // Copy data from input field
-        Array<OneD, NekDouble> outarr(ncoeffs, 0.0);
-        auto *inarrptr = outarr.data();
+        Array<OneD, NekDouble> inoutarr(ncoeffs);
+        auto *inarrptr = inoutarr.data();
         auto *inptr    = inout.GetStorage().GetCPUPtr();
 
         for (size_t block_idx = 0; block_idx < inout.GetBlocks().size();
@@ -66,7 +66,7 @@ public:
                     for (size_t j = 0; j < bndCondExpansions[i]->GetNcoeffs();
                          j++)
                     {
-                        outarr[map[bndcnt + j]] +=
+                        inoutarr[map[bndcnt + j]] +=
                             sign[bndcnt + j] * bndcoeff[j];
                     }
                 }
@@ -75,7 +75,7 @@ public:
                     for (size_t j = 0; j < bndCondExpansions[i]->GetNcoeffs();
                          j++)
                     {
-                        outarr[map[bndcnt + j]] += bndcoeff[j];
+                        inoutarr[map[bndcnt + j]] += bndcoeff[j];
                     }
                 }
             }
@@ -83,7 +83,7 @@ public:
         }
 
         // Copy data to output field
-        auto *outarrptr = outarr.data();
+        auto *outarrptr = inoutarr.data();
         auto *outptr    = inout.GetStorage().GetCPUPtr();
 
         for (size_t block_idx = 0; block_idx < inout.GetBlocks().size();
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6981fc0..0dbf180 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -216,6 +216,17 @@ IF (NEKTAR_USE_CUDA)
         -DTEST_PATH="${CMAKE_SOURCE_DIR}")
 ENDIF()
 
+IF (NEKTAR_USE_CUDA)
+    add_executable(test_neumanncuda test_neumanncuda.cpp)
+    target_link_libraries(test_neumanncuda PRIVATE Operators)
+    target_link_libraries(test_neumanncuda PRIVATE ${NEKTAR++_LIBRARIES})
+    target_link_libraries(test_neumanncuda PRIVATE Boost::unit_test_framework)
+    target_include_directories(test_neumanncuda PRIVATE "${CMAKE_SOURCE_DIR}")
+    target_include_directories(test_neumanncuda PRIVATE ${NEKTAR++_INCLUDE_DIRS})
+    target_compile_definitions(test_neumanncuda PRIVATE
+        -DTEST_PATH="${CMAKE_SOURCE_DIR}")
+ENDIF()
+
 add_executable(test_nullprecon test_nullprecon.cpp)
 target_link_libraries(test_nullprecon PRIVATE Operators)
 target_link_libraries(test_nullprecon PRIVATE ${NEKTAR++_LIBRARIES})
@@ -431,6 +442,14 @@ IF (NEKTAR_USE_CUDA)
     )
 ENDIF()
 
+IF (NEKTAR_USE_CUDA)
+    add_test(
+      NAME NeumannCUDA
+      COMMAND test_neumanncuda
+      WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    )
+ENDIF()
+
 add_test(
   NAME NullPrecon
   COMMAND test_nullprecon
diff --git a/tests/init_neumannfields.hpp b/tests/init_neumannfields.hpp
new file mode 100644
index 0000000..ac18052
--- /dev/null
+++ b/tests/init_neumannfields.hpp
@@ -0,0 +1,72 @@
+#include "init_fields.hpp"
+
+using namespace std;
+using namespace Nektar::Operators;
+using namespace Nektar::LibUtilities;
+using namespace Nektar;
+
+class NeumannField
+    : public InitFields<double, FieldState::Coeff, FieldState::Coeff,
+                        MultiRegions::ContField>
+{
+public:
+    NeumannField()
+        : InitFields<double, FieldState::Coeff, FieldState::Coeff,
+                     MultiRegions::ContField>()
+    {
+    }
+};
+
+class Helmholtz1D_Seg : public NeumannField
+{
+public:
+    Helmholtz1D_Seg()
+    {
+        meshName = "run/Helmholtz1D_P8.xml";
+    }
+};
+
+class Helmholtz2D_Tri_Quad : public NeumannField
+{
+public:
+    Helmholtz2D_Tri_Quad()
+    {
+        meshName = "run/Helmholtz2D_P7_AllBCs.xml";
+    }
+};
+
+class Helmholtz3D_Hex : public NeumannField
+{
+public:
+    Helmholtz3D_Hex()
+    {
+        meshName = "run/Helmholtz3D_Hex_Heterogeneous.xml";
+    }
+};
+
+class Helmholtz3D_Prism : public NeumannField
+{
+public:
+    Helmholtz3D_Prism()
+    {
+        meshName = "run/Helmholtz3D_Prism_VarP.xml";
+    }
+};
+
+class Helmholtz3D_Pyr : public NeumannField
+{
+public:
+    Helmholtz3D_Pyr()
+    {
+        meshName = "run/Helmholtz3D_Pyr_VarP.xml";
+    }
+};
+
+class Helmholtz3D_Tet : public NeumannField
+{
+public:
+    Helmholtz3D_Tet()
+    {
+        meshName = "run/Helmholtz3D_Tet_VarP.xml";
+    }
+};
diff --git a/tests/test_neumanncuda.cpp b/tests/test_neumanncuda.cpp
new file mode 100644
index 0000000..3305146
--- /dev/null
+++ b/tests/test_neumanncuda.cpp
@@ -0,0 +1,92 @@
+#define BOOST_TEST_DYN_LINK
+#define BOOST_TEST_MODULE TestNeumannCUDA
+#include <boost/test/tools/output_test_stream.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <iostream>
+#include <memory>
+
+#include "Operators/OperatorNeuBndCond.hpp"
+#include "init_dirichletfields.hpp"
+
+BOOST_AUTO_TEST_SUITE(TestNeumann)
+
+BOOST_FIXTURE_TEST_CASE(dirichlet1dcuda_seg, Helmholtz1D_Seg)
+{
+    Configure();
+    NeuBndCond<>::create(fixt_explist)->apply(*fixt_out);
+    NeuBndCond<>::create(fixt_explist, "CUDA")->apply(*fixtcuda_out);
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-12));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_out->GetStorage().GetCPUPtr(), 1.0E-12);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(dirichlet2dcuda_tri_quad, Helmholtz2D_Tri_Quad)
+{
+    Configure();
+    NeuBndCond<>::create(fixt_explist)->apply(*fixt_out);
+    NeuBndCond<>::create(fixt_explist, "CUDA")->apply(*fixtcuda_out);
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-12));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_out->GetStorage().GetCPUPtr(), 1.0E-12);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(dirichlet3dcuda_hex, Helmholtz3D_Hex)
+{
+    Configure();
+    NeuBndCond<>::create(fixt_explist)->apply(*fixt_out);
+    NeuBndCond<>::create(fixt_explist, "CUDA")->apply(*fixtcuda_out);
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-12));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_out->GetStorage().GetCPUPtr(), 1.0E-12);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(dirichlet3dcuda_prism, Helmholtz3D_Prism)
+{
+    Configure();
+    NeuBndCond<>::create(fixt_explist)->apply(*fixt_out);
+    NeuBndCond<>::create(fixt_explist, "CUDA")->apply(*fixtcuda_out);
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-12));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_out->GetStorage().GetCPUPtr(), 1.0E-12);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(dirichlet3dcuda_pyr, Helmholtz3D_Pyr)
+{
+    Configure();
+    NeuBndCond<>::create(fixt_explist)->apply(*fixt_out);
+    NeuBndCond<>::create(fixt_explist, "CUDA")->apply(*fixtcuda_out);
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-12));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_out->GetStorage().GetCPUPtr(), 1.0E-12);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(dirichlet3dcuda_tet, Helmholtz3D_Tet)
+{
+    Configure();
+    NeuBndCond<>::create(fixt_explist)->apply(*fixt_out);
+    NeuBndCond<>::create(fixt_explist, "CUDA")->apply(*fixtcuda_out);
+    BOOST_TEST(fixtcuda_out->compare(*fixt_out, 1.0E-12));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_out->GetStorage().GetCPUPtr(), 1.0E-12);
+    }
+}
+
+BOOST_AUTO_TEST_SUITE_END()
-- 
GitLab