From 1a28303af19cc1725eb56a90d745782b356ddb10 Mon Sep 17 00:00:00 2001
From: Jacques Xing <jacques.xing@kcl.ac.uk>
Date: Tue, 30 Jan 2024 14:08:02 +0000
Subject: [PATCH] Update ConjGrad StdMat operator implementation for MPI

---
 Field.hpp                               | 10 ++++---
 MemoryRegionCPU.hpp                     |  4 +--
 MemoryRegionCUDA.hpp                    |  8 ++---
 Operators/ConjGrad/ConjGradStdMat.hpp   | 38 +++++++++++++++--------
 Operators/FwdTrans/FwdTransStdMat.hpp   | 12 ++++----
 Operators/HelmSolve/HelmSolveStdMat.hpp | 12 ++++----
 Operators/Matrix/MatrixCUDA.hpp         |  5 ++--
 Operators/Matrix/MatrixStdMat.hpp       |  7 +++--
 Operators/NullPrecon/NullPreconCUDA.hpp |  3 +-
 Operators/Operator.hpp                  |  2 +-
 Operators/OperatorConjGrad.hpp          |  1 -
 tests/CMakeLists.txt                    | 40 ++++++++++++++++++-------
 tests/init_diagpreconfields.hpp         |  2 +-
 tests/init_fields.hpp                   |  1 -
 tests/init_fwdtransfields.hpp           |  2 +-
 tests/init_helmsolvefields.hpp          |  3 +-
 tests/init_nullpreconfields.hpp         |  1 -
 tests/test_nullprecon.cpp               |  1 -
 18 files changed, 91 insertions(+), 61 deletions(-)

diff --git a/Field.hpp b/Field.hpp
index 613694aa..ea51fc98 100644
--- a/Field.hpp
+++ b/Field.hpp
@@ -222,8 +222,9 @@ public:
 
         size_t storage_size = std::accumulate(
             field.block_attributes.begin(), field.block_attributes.end(), 0,
-            [](size_t acc, const BlockAttributes &block)
-            { return acc + block.block_size; });
+            [](size_t acc, const BlockAttributes &block) {
+                return acc + block.block_size;
+            });
 
         // Create new TMemoryRegion and polymorphically store as MemoryRegionCPU
         field.m_storage = std::make_unique<TMemoryRegion<TData>>(
@@ -252,8 +253,9 @@ public:
 
         size_t storage_size = std::accumulate(
             field.block_attributes.begin(), field.block_attributes.end(), 0,
-            [](size_t acc, const BlockAttributes &block)
-            { return acc + block.block_size; });
+            [](size_t acc, const BlockAttributes &block) {
+                return acc + block.block_size;
+            });
 
         // Create new TMemoryRegion and polymorphically store as MemoryRegionCPU
         field.m_storage = std::make_unique<TMemoryRegion<TData>>(
diff --git a/MemoryRegionCPU.hpp b/MemoryRegionCPU.hpp
index b7d762db..7dcd1c2b 100644
--- a/MemoryRegionCPU.hpp
+++ b/MemoryRegionCPU.hpp
@@ -13,8 +13,8 @@
 template <typename TData> class MemoryRegionCPU
 {
 public:
-    MemoryRegionCPU()                                      = delete;
-    MemoryRegionCPU(const MemoryRegionCPU &rhs)            = delete;
+    MemoryRegionCPU()                           = delete;
+    MemoryRegionCPU(const MemoryRegionCPU &rhs) = delete;
     MemoryRegionCPU &operator=(const MemoryRegionCPU &rhs) = delete;
 
     MemoryRegionCPU(MemoryRegionCPU &&rhs)
diff --git a/MemoryRegionCUDA.hpp b/MemoryRegionCUDA.hpp
index d77cfce5..ce5469dd 100644
--- a/MemoryRegionCUDA.hpp
+++ b/MemoryRegionCUDA.hpp
@@ -42,10 +42,10 @@ public:
     void operator=(MemoryRegionCUDA &&rhs)
     {
         MemoryRegionCPU<TData>::operator=(std::move(rhs));
-        m_device     = rhs.m_device;
-        m_size       = rhs.m_size;
-        rhs.m_device = nullptr;
-        rhs.m_size   = 0;
+        m_device                        = rhs.m_device;
+        m_size                          = rhs.m_size;
+        rhs.m_device                    = nullptr;
+        rhs.m_size                      = 0;
     }
 
     virtual TData *GetCPUPtr() override
diff --git a/Operators/ConjGrad/ConjGradStdMat.hpp b/Operators/ConjGrad/ConjGradStdMat.hpp
index d99870f1..d3f9ac7a 100644
--- a/Operators/ConjGrad/ConjGradStdMat.hpp
+++ b/Operators/ConjGrad/ConjGradStdMat.hpp
@@ -41,6 +41,7 @@ public:
                                                m_maxIter, 5000);
         contfield->GetSession()->LoadParameter("IterativeSolverTolerance",
                                                m_tol, 1.0E-09);
+        m_rowComm    = contfield->GetSession()->GetComm()->GetRowComm();
         m_nloc       = contfield->GetLocalToGlobalMap()->GetNumLocalCoeffs();
         m_assmbScatr = AssmbScatr<TData>::create(this->m_expansionList);
         m_robBndCond = RobBndCond<TData>::create(this->m_expansionList);
@@ -51,6 +52,12 @@ public:
     void apply(Field<TData, FieldState::Coeff> &in,
                Field<TData, FieldState::Coeff> &out) override
     {
+        if (m_rowComm->GetSize() > 1 && m_rowComm->GetRank() == 0)
+        {
+            std::cout << "Solve ConjGrad with " << m_rowComm->GetSize()
+                      << " processors" << std::endl;
+        }
+
         // Store pointers to temporary fields
         Array<OneD, TData> inArr(m_nloc, 0.0);
         Array<OneD, TData> outArr(m_nloc, 0.0);
@@ -79,7 +86,7 @@ public:
         TData rho_new;
         TData mu;
         TData eps;
-        std::array<TData, 3> vExchange{0.0, 0.0, 0.0};
+        Array<OneD, TData> vExchange(3, 0.0);
 
         // Copy data from input field
         auto *inarrptr = inArr.data();
@@ -106,14 +113,14 @@ public:
         vExchange[2] = std::inner_product(p_wk, p_wk + m_nloc, p_r_A, 0.0);
 
         // Perform inner-product exchanges
-        // m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum);
+        m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum);
 
         eps = vExchange[2];
 
         // Calculate rhs magnitude
         m_assmbScatr->apply(m_r_A, m_wk);
         rhsMagnitude = std::inner_product(p_in, p_in + m_nloc, p_wk, 0.0);
-        // m_rowComm->AllReduce(rhsMagnitude, Nektar::LibUtilities::ReduceSum);
+        m_rowComm->AllReduce(rhsMagnitude, Nektar::LibUtilities::ReduceSum);
         rhsMagnitude = (rhsMagnitude > 1.0e-6) ? rhsMagnitude : 1.0;
 
         // If input residual is less than tolerance skip solve.
@@ -134,7 +141,7 @@ public:
         vExchange[0] = std::inner_product(p_r_A, p_r_A + m_nloc, p_w_A, 0.0);
         vExchange[1] = std::inner_product(p_s_A, p_s_A + m_nloc, p_w_A, 0.0);
 
-        // m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum);
+        m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum);
 
         rho             = vExchange[0];
         mu              = vExchange[1];
@@ -151,23 +158,27 @@ public:
 
             // Compute new search direction p_k
             std::transform(p_p_A, p_p_A + m_nloc, p_w_A, p_p_A,
-                           [&beta](const TData &pElem, const TData &wElem)
-                           { return beta * pElem + wElem; });
+                           [&beta](const TData &pElem, const TData &wElem) {
+                               return beta * pElem + wElem;
+                           });
 
             // Compute new search direction q_k
             std::transform(p_q_A, p_q_A + m_nloc, p_s_A, p_q_A,
-                           [&beta](const TData &qElem, const TData &sElem)
-                           { return beta * qElem + sElem; });
+                           [&beta](const TData &qElem, const TData &sElem) {
+                               return beta * qElem + sElem;
+                           });
 
             // Update solution x_{k+1}
             std::transform(p_p_A, p_p_A + m_nloc, p_out, p_out,
-                           [&alpha](const TData &pElem, const TData &xElem)
-                           { return alpha * pElem + xElem; });
+                           [&alpha](const TData &pElem, const TData &xElem) {
+                               return alpha * pElem + xElem;
+                           });
 
             // Update residual vector r_{k+1}
             std::transform(p_q_A, p_q_A + m_nloc, p_r_A, p_r_A,
-                           [&alpha](const TData &qElem, const TData &rElem)
-                           { return -alpha * qElem + rElem; });
+                           [&alpha](const TData &qElem, const TData &rElem) {
+                               return -alpha * qElem + rElem;
+                           });
 
             // Apply preconditioner
             this->m_precon->apply(m_r_A, m_w_A);
@@ -191,7 +202,7 @@ public:
             vExchange[2] = std::inner_product(p_wk, p_wk + m_nloc, p_r_A, 0.0);
 
             // Perform inner-product exchanges
-            // m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum);
+            m_rowComm->AllReduce(vExchange, Nektar::LibUtilities::ReduceSum);
 
             rho_new = vExchange[0];
             mu      = vExchange[1];
@@ -244,6 +255,7 @@ public:
     static std::string className;
 
 protected:
+    LibUtilities::CommSharedPtr m_rowComm;
     std::shared_ptr<OperatorAssmbScatr<TData>> m_assmbScatr;
     std::shared_ptr<OperatorRobBndCond<TData>> m_robBndCond;
     Field<TData, FieldState::Coeff> m_w_A;
diff --git a/Operators/FwdTrans/FwdTransStdMat.hpp b/Operators/FwdTrans/FwdTransStdMat.hpp
index 37a8a2bd..d15ccd6e 100644
--- a/Operators/FwdTrans/FwdTransStdMat.hpp
+++ b/Operators/FwdTrans/FwdTransStdMat.hpp
@@ -50,9 +50,9 @@ public:
         // Handle Dirichlet BCs
         m_DirBCOp->apply(out);
         m_MassOp->apply(out, m_tmp);
-        std::transform(rhsptr, rhsptr + nloc, tmpptr, rhsptr,
-                       [](const TData &rhs, const TData &dir)
-                       { return rhs - dir; });
+        std::transform(
+            rhsptr, rhsptr + nloc, tmpptr, rhsptr,
+            [](const TData &rhs, const TData &dir) { return rhs - dir; });
 
         // Handle Robin BCs
         m_RobBCOp->apply(out, m_rhs, true);
@@ -61,9 +61,9 @@ public:
         m_CGOp->apply(m_rhs, m_tmp);
 
         // Add Dirichlet BCs
-        std::transform(outptr, outptr + nloc, tmpptr, outptr,
-                       [](const TData &x, const TData &diff)
-                       { return x + diff; });
+        std::transform(
+            outptr, outptr + nloc, tmpptr, outptr,
+            [](const TData &x, const TData &diff) { return x + diff; });
     }
 
     void setPrecon(
diff --git a/Operators/HelmSolve/HelmSolveStdMat.hpp b/Operators/HelmSolve/HelmSolveStdMat.hpp
index 30e8fe5f..1d15b369 100644
--- a/Operators/HelmSolve/HelmSolveStdMat.hpp
+++ b/Operators/HelmSolve/HelmSolveStdMat.hpp
@@ -57,9 +57,9 @@ public:
         // Handle Dirichlet BCs
         m_DirBCOp->apply(out);
         m_HelmOp->apply(out, m_tmp);
-        std::transform(rhsptr, rhsptr + nloc, tmpptr, rhsptr,
-                       [](const TData &rhs, const TData &dir)
-                       { return rhs - dir; });
+        std::transform(
+            rhsptr, rhsptr + nloc, tmpptr, rhsptr,
+            [](const TData &rhs, const TData &dir) { return rhs - dir; });
 
         // Handle Robin BCs
         m_RobBCOp->apply(out, m_rhs, true);
@@ -68,9 +68,9 @@ public:
         m_CGOp->apply(m_rhs, m_tmp);
 
         // Add Dirichlet BCs
-        std::transform(outptr, outptr + nloc, tmpptr, outptr,
-                       [](const TData &x, const TData &diff)
-                       { return x + diff; });
+        std::transform(
+            outptr, outptr + nloc, tmpptr, outptr,
+            [](const TData &x, const TData &diff) { return x + diff; });
     }
 
     void setLambda(const TData &lambda)
diff --git a/Operators/Matrix/MatrixCUDA.hpp b/Operators/Matrix/MatrixCUDA.hpp
index 0ab87111..eb4575d3 100644
--- a/Operators/Matrix/MatrixCUDA.hpp
+++ b/Operators/Matrix/MatrixCUDA.hpp
@@ -21,8 +21,9 @@ public:
         // expansionlist
         auto blocks = GetBlockAttributes(TFieldState, expansionList);
         m_size      = std::accumulate(blocks.begin(), blocks.end(), 0,
-                                      [](size_t acc, const BlockAttributes &block)
-                                      { return acc + block.block_size; });
+                                 [](size_t acc, const BlockAttributes &block) {
+                                     return acc + block.block_size;
+                                 });
         // create memory for square matrix of given size
         // allocate memory device
         cudaMalloc((void **)&m_matrix, sizeof(TData) * m_size * m_size);
diff --git a/Operators/Matrix/MatrixStdMat.hpp b/Operators/Matrix/MatrixStdMat.hpp
index fd045a13..bf823137 100644
--- a/Operators/Matrix/MatrixStdMat.hpp
+++ b/Operators/Matrix/MatrixStdMat.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <numeric>
 #include "Operators/OperatorMatrix.hpp"
+#include <numeric>
 
 namespace Nektar::Operators::detail
 {
@@ -18,8 +18,9 @@ public:
         // expansionlist
         auto blocks = GetBlockAttributes(TFieldState, expansionList);
         m_size      = std::accumulate(blocks.begin(), blocks.end(), 0,
-                                      [](size_t acc, const BlockAttributes &block)
-                                      { return acc + block.block_size; });
+                                 [](size_t acc, const BlockAttributes &block) {
+                                     return acc + block.block_size;
+                                 });
 
         // create memory for square matrix of given size
         m_matrix = std::vector<TData>(m_size * m_size);
diff --git a/Operators/NullPrecon/NullPreconCUDA.hpp b/Operators/NullPrecon/NullPreconCUDA.hpp
index a9561267..bb93c12d 100644
--- a/Operators/NullPrecon/NullPreconCUDA.hpp
+++ b/Operators/NullPrecon/NullPreconCUDA.hpp
@@ -9,8 +9,7 @@ namespace Nektar::Operators::detail
 {
 
 template <typename TData>
-class OperatorNullPreconImpl<TData, ImplCUDA>
-    : public OperatorNullPrecon<TData>
+class OperatorNullPreconImpl<TData, ImplCUDA> : public OperatorNullPrecon<TData>
 {
 public:
     OperatorNullPreconImpl(const MultiRegions::ExpListSharedPtr &expansionList)
diff --git a/Operators/Operator.hpp b/Operators/Operator.hpp
index 2af377b0..d03e3ca1 100644
--- a/Operators/Operator.hpp
+++ b/Operators/Operator.hpp
@@ -134,7 +134,7 @@ protected:
         {
             auto expPtr = this->m_expansionList->GetExp(e);
             auto &df    = expPtr->GetMetricInfo()->GetDerivFactors(
-                   expPtr->GetPointsKeys());
+                expPtr->GetPointsKeys());
             size_t nqTot = expPtr->GetTotPoints();
             if (expPtr->GetMetricInfo()->GetGtype() ==
                 SpatialDomains::eDeformed)
diff --git a/Operators/OperatorConjGrad.hpp b/Operators/OperatorConjGrad.hpp
index cd52fb51..ad1bf8ee 100644
--- a/Operators/OperatorConjGrad.hpp
+++ b/Operators/OperatorConjGrad.hpp
@@ -27,7 +27,6 @@ public:
         apply(in, out);
     }
 
-
     void setLHS(const std::shared_ptr<OperatorLinear<TData, FieldState::Coeff,
                                                      FieldState::Coeff>> &ptr)
     {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a37c18c3..eeeba79f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,3 +1,7 @@
+find_package(
+  MPI
+)
+
 find_package(
   Boost
   REQUIRED
@@ -541,11 +545,19 @@ IF (NEKTAR_USE_CUDA)
     )
 ENDIF()
 
-add_test(
-  NAME FwdTrans
-  COMMAND test_fwdtrans
-  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-)
+IF (${MPI_FOUND})
+  add_test(
+    NAME FwdTrans
+    COMMAND mpirun -np 2 "${CMAKE_BINARY_DIR}/tests/test_fwdtrans"
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  )
+ELSE()
+    add_test(
+    NAME FwdTrans
+    COMMAND test_fwdtrans
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  )
+ENDIF()
 
 IF (NEKTAR_USE_CUDA)
     add_test(
@@ -569,11 +581,19 @@ IF (NEKTAR_USE_CUDA)
     )
 ENDIF()
 
-add_test(
-  NAME HelmSolve
-  COMMAND test_helmsolve
-  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-)
+IF (${MPI_FOUND})
+  add_test(
+    NAME HelmSolve
+    COMMAND mpirun -np 2 "${CMAKE_BINARY_DIR}/tests/test_helmsolve"
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  )
+ELSE()
+  add_test(
+    NAME HelmSolve
+    COMMAND test_helmsolve
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  )
+ENDIF()
 
 IF (NEKTAR_USE_CUDA)
     add_test(
diff --git a/tests/init_diagpreconfields.hpp b/tests/init_diagpreconfields.hpp
index 913a6d55..2be53bb5 100644
--- a/tests/init_diagpreconfields.hpp
+++ b/tests/init_diagpreconfields.hpp
@@ -104,7 +104,7 @@ class Helmholtz2D_Tri_Quad : public DiagPreconField
 public:
     Helmholtz2D_Tri_Quad()
     {
-        //meshName = "run/Helmholtz2D_P7_AllBCs.xml";
+        // meshName = "run/Helmholtz2D_P7_AllBCs.xml";
         meshName = "run/Helmholtz2D_varP.xml";
     }
 };
diff --git a/tests/init_fields.hpp b/tests/init_fields.hpp
index 97a752a9..ea357ff9 100644
--- a/tests/init_fields.hpp
+++ b/tests/init_fields.hpp
@@ -96,7 +96,6 @@ public:
                     session, graph, "u", true, false,
                     Collections::eNoCollection);
         }
-
         else if constexpr (std::is_same_v<TExpList, MultiRegions::DisContField>)
         {
             fixt_explist =
diff --git a/tests/init_fwdtransfields.hpp b/tests/init_fwdtransfields.hpp
index f723eea7..ed8f7180 100644
--- a/tests/init_fwdtransfields.hpp
+++ b/tests/init_fwdtransfields.hpp
@@ -144,7 +144,7 @@ class Helmholtz2D_Tri_Quad : public FwdTransField
 public:
     Helmholtz2D_Tri_Quad()
     {
-        //meshName = "run/Helmholtz2D_P7_AllBCs.xml";
+        // meshName = "run/Helmholtz2D_P7_AllBCs.xml";
         meshName = "run/Helmholtz2D_varP.xml";
     }
 };
diff --git a/tests/init_helmsolvefields.hpp b/tests/init_helmsolvefields.hpp
index 53f5aec0..b6584565 100644
--- a/tests/init_helmsolvefields.hpp
+++ b/tests/init_helmsolvefields.hpp
@@ -148,7 +148,7 @@ class Helmholtz2D_Tri_Quad : public HelmSolveField
 public:
     Helmholtz2D_Tri_Quad()
     {
-        //meshName = "run/Helmholtz2D_P7_AllBCs.xml";
+        // meshName = "run/Helmholtz2D_P7_AllBCs.xml";
         meshName = "run/Helmholtz2D_varP.xml";
     }
 };
@@ -188,4 +188,3 @@ public:
         meshName = "run/Helmholtz3D_Tet_VarP.xml";
     }
 };
-
diff --git a/tests/init_nullpreconfields.hpp b/tests/init_nullpreconfields.hpp
index 80a6633d..44ab2154 100644
--- a/tests/init_nullpreconfields.hpp
+++ b/tests/init_nullpreconfields.hpp
@@ -137,4 +137,3 @@ public:
         meshName = "run/Helmholtz3D_Tet_VarP.xml";
     }
 };
-
diff --git a/tests/test_nullprecon.cpp b/tests/test_nullprecon.cpp
index 05b567d0..b814f775 100644
--- a/tests/test_nullprecon.cpp
+++ b/tests/test_nullprecon.cpp
@@ -11,7 +11,6 @@
 
 BOOST_AUTO_TEST_SUITE(TestNullPrecon)
 
-
 BOOST_FIXTURE_TEST_CASE(nullprecon_seg, Helmholtz1D_Seg)
 {
     Configure();
-- 
GitLab