Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: nektar/redesign-prototypes
Commits on Source (12)
Showing 2749 additions and 85 deletions
@@ -26,12 +26,15 @@ message(STATUS "Found Nektar++: version ${NEKTAR++_VERSION}")
 set(CMAKE_INSTALL_RPATH "${NEKTAR++_LIBRARY_DIRS}")

-set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp)
+set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp Operators/OperatorIProductWRTDerivBase.cpp Operators/IProductWRTDerivBase/IProductWRTDerivBaseImpl.cpp Operators/OperatorPhysDeriv.cpp Operators/PhysDeriv/PhysDerivImpl.cpp Operators/OperatorHelmholtz.cpp Operators/Helmholtz/HelmholtzImpl.cpp)
+
+if (NEKTAR_USE_CUDA AND NEKTAR_USE_SIMD)
+    MESSAGE(FATAL_ERROR "Cannot use both SIMD and CUDA")
+endif()

 if (NEKTAR_USE_CUDA)
     enable_language(CUDA)
     add_definitions(-DNEKTAR_USE_CUDA)
-    set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu)
+    set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu Operators/IProductWRTBase/IProductWRTBaseCUDA.cu Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.cu Operators/PhysDeriv/PhysDerivCUDA.cu Operators/Helmholtz/HelmholtzCUDA.cu)
 endif()

 if (NEKTAR_USE_SIMD)
...
@@ -99,38 +99,45 @@ public:
        return *this;
    }

    /**
     * @brief Compare this field to another field, with absolute
     * tolerance tol.
     * @return bool
     */
    bool compare(Field<TType, TState> &rhs, double tol)
    {
        const std::vector<BlockAttributes> &rhs_blocks = rhs.GetBlocks();
        TType *store = GetStorage().GetCPUPtr();
        TType *rhs_store = rhs.GetStorage().GetCPUPtr();

        if (rhs_blocks.size() != block_attributes.size())
            return false;

        size_t i{0};
        for (size_t bl = 0; bl < block_attributes.size(); ++bl)
        {
            size_t num_elements = block_attributes[bl].num_elements;
            size_t num_pts = block_attributes[bl].num_pts;

            // Check that each block has the same structure
            if (num_elements != rhs_blocks[bl].num_elements)
                return false;
            if (num_pts != rhs_blocks[bl].num_pts)
                return false;

            // Compare elements in the blocks
            for (size_t el = 0; el < num_elements; ++el)
            {
                for (size_t coeff = 0; coeff < num_pts; ++coeff)
                {
                    i = coeff + el * num_pts + bl * num_pts * num_elements;
                    if (std::abs(store[i] - rhs_store[i]) > tol)
                        return false;
                }
            }
        }
        return true;
    }

    /**
     * @brief Static templated creation method.
...
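The compare() method above is the absolute-tolerance equality check between two fields. A minimal usage sketch, assuming coefficient-space fields built with the create()/GetBlockAttributes() helpers used elsewhere in this merge request (the field names are illustrative only):

// Hypothetical example: check a computed field against a reference field.
auto expected = Field<double, FieldState::Coeff>::create(
    GetBlockAttributes(FieldState::Coeff, expansionList));
auto computed = Field<double, FieldState::Coeff>::create(
    GetBlockAttributes(FieldState::Coeff, expansionList));
// ... fill both fields ...
bool match = computed.compare(expected, 1e-12); // absolute tolerance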
@@ -61,13 +61,7 @@ public:
     ~OperatorBwdTransImpl()
     {
-        for (auto &basis : m_basis)
-        {
-            for (size_t i = 0; i < basis.second.size(); i++)
-            {
-                cudaFree(basis.second[i]);
-            }
-        }
+        DeallocateDataCUDA<TData>(m_basis);
     }

     void apply(Field<TData, FieldState::Coeff> &in,
...
#include "HelmholtzCUDA.hpp"
namespace Nektar::Operators::detail
{
template <>
std::string OperatorHelmholtzImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"HelmholtzCUDA", OperatorHelmholtzImpl<double, ImplCUDA>::instantiate,
"...");
}
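This translation unit registers the CUDA Helmholtz implementation with the operator factory under the key "HelmholtzCUDA". A sketch of how a registered operator might then be obtained and applied, assuming a Helmholtz<>::create() helper analogous to the BwdTrans<>::create(expList, "CUDA") calls used inside the implementation below; the handle type and the SetLambda() access are assumptions, not taken from this merge request:

// Hypothetical usage of the factory-registered implementation.
auto helmholtz = Helmholtz<>::create(expansionList, "CUDA");
helmholtz->SetLambda(1.5); // Helmholtz constant, defaults to 1.0
helmholtz->apply(inCoeffs, outCoeffs);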
#include "Operators/BwdTrans/BwdTransCUDA.hpp"
#include "Operators/Helmholtz/HelmholtzCUDAKernels.cuh"
#include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp"
#include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDA.hpp"
#include "Operators/OperatorHelmholtz.hpp"
#include "Operators/PhysDeriv/PhysDerivCUDA.hpp"
namespace Nektar::Operators::detail
{
// CUDA implementation
template <typename TData>
class OperatorHelmholtzImpl<TData, ImplCUDA> : public OperatorHelmholtz<TData>
{
public:
OperatorHelmholtzImpl(const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorHelmholtz<TData>(expansionList),
m_bwd(
Field<TData, FieldState::Phys>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv0(
Field<TData, FieldState::Phys>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv1(
Field<TData, FieldState::Phys>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv2(
Field<TData, FieldState::Phys>::template create<MemoryRegionCUDA>(
GetBlockAttributes(FieldState::Phys, expansionList)))
{
auto nCoord = this->m_expansionList->GetCoordim(0);
m_BwdTransOp = BwdTrans<>::create(this->m_expansionList, "CUDA");
m_PhysDerivOp = PhysDeriv<>::create(this->m_expansionList, "CUDA");
m_IProductWRTBaseOp =
IProductWRTBase<>::create(this->m_expansionList, "CUDA");
m_IProductWRTDerivBaseOp =
IProductWRTDerivBase<>::create(this->m_expansionList, "CUDA");
Array<OneD, TData> diffCoeff(nCoord * nCoord, 0.0);
for (size_t d = 0; d < nCoord; d++)
{
diffCoeff[d * nCoord + d] = 1.0; // Temporary solution
}
cudaMalloc((void **)&m_diffCoeff, sizeof(TData) * nCoord * nCoord);
cudaMemcpy(m_diffCoeff, diffCoeff.get(),
sizeof(TData) * nCoord * nCoord, cudaMemcpyHostToDevice);
}
void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) override
{
// Step 1: BwdTrans
m_BwdTransOp->apply(in, m_bwd);
// Step 2: PhysDeriv
m_PhysDerivOp->apply(m_bwd, m_deriv0, m_deriv1, m_deriv2);
// Step 3: Inner product for mass matrix operation
m_IProductWRTBaseOp->apply(m_bwd, out, m_lambda);
// Step 4: Multiply by diffusion coefficient
DiffusionCoeff(m_deriv0, m_deriv1, m_deriv2);
// Step 5: Inner product
m_IProductWRTDerivBaseOp->apply(m_deriv0, m_deriv1, m_deriv2, out,
true);
}
void DiffusionCoeff(Field<TData, FieldState::Phys> &deriv0,
Field<TData, FieldState::Phys> &deriv1,
Field<TData, FieldState::Phys> &deriv2)
{
// Initialize pointers.
std::vector<TData *> derivptr{deriv0.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
deriv1.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
deriv2.template GetStorage<MemoryRegionCUDA>().GetGPUPtr()};
// Initialize index.
size_t expIdx = 0;
for (size_t block_idx = 0; block_idx < deriv0.GetBlocks().size();
++block_idx)
{
// Determine shape and type of the element.
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = deriv0.GetBlocks()[block_idx].num_elements;
auto nCoord = expPtr->GetCoordim();
auto nqTot = expPtr->GetTotPoints();
// Determine CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Multiply by diffusion coefficient.
if (nCoord == 1)
{
auto nq0 = expPtr->GetNumPoints(0);
DiffusionCoeff1DKernel<<<m_gridSize, m_blockSize>>>(
nq0, nElmts, m_diffCoeff, derivptr[0]);
}
else if (nCoord == 2)
{
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
DiffusionCoeff2DKernel<<<m_gridSize, m_blockSize>>>(
nq0, nq1, nElmts, m_diffCoeff, derivptr[0], derivptr[1]);
}
else
{
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
auto nq2 = expPtr->GetNumPoints(2);
DiffusionCoeff3DKernel<<<m_gridSize, m_blockSize>>>(
nq0, nq1, nq2, nElmts, m_diffCoeff, derivptr[0],
derivptr[1], derivptr[2]);
}
// Increment pointer and index for next element type.
for (size_t d = 0; d < nCoord; d++)
{
derivptr[d] += nqTot * nElmts;
}
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<OperatorHelmholtzImpl<TData, ImplCUDA>>(
expansionList);
}
void SetLambda(TData lambda)
{
m_lambda = lambda;
}
static std::string className;
private:
std::shared_ptr<OperatorBwdTrans<TData>> m_BwdTransOp;
std::shared_ptr<OperatorPhysDeriv<TData>> m_PhysDerivOp;
std::shared_ptr<OperatorIProductWRTBase<TData>> m_IProductWRTBaseOp;
std::shared_ptr<OperatorIProductWRTDerivBase<TData>>
m_IProductWRTDerivBaseOp;
Field<TData, FieldState::Phys> m_bwd;
Field<TData, FieldState::Phys> m_deriv0;
Field<TData, FieldState::Phys> m_deriv1;
Field<TData, FieldState::Phys> m_deriv2;
TData m_lambda = 1.0;
TData *m_diffCoeff;
size_t m_blockSize = 32;
size_t m_gridSize;
};
} // namespace Nektar::Operators::detail
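For reference, the five steps in apply() above assemble the weak-form Helmholtz operator: BwdTrans and PhysDeriv evaluate u and its gradient at the quadrature points, the inner product with respect to the basis (scaled by m_lambda) gives the mass term, and the inner product with respect to the derivative basis (appended through the final true argument) gives the diffusion term. With the diffusion tensor currently hard-coded to the identity (the "temporary solution" in the constructor), the assembled coefficients are, schematically,

    % Sketch of the quantity evaluated by apply(), with D = I for now:
    (H \hat{u})_n = \lambda \int_\Omega \phi_n \, u \, d\mathbf{x}
                  + \int_\Omega \nabla\phi_n \cdot D \, \nabla u \, d\mathbf{x}.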
namespace Nektar::Operators::detail
{
template <typename TData>
__global__ void DiffusionCoeff1DKernel(const size_t nq0, const size_t nelmt,
const TData *diffCoeff, TData *deriv0)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
// Assign pointers.
TData *derivptr = deriv0 + nq0 * e;
for (size_t i = 0; i < nq0; i++)
{
derivptr[i] *= diffCoeff[0];
}
}
template <typename TData>
__global__ void DiffusionCoeff2DKernel(const size_t nq0, const size_t nq1,
const size_t nelmt,
const TData *diffCoeff, TData *deriv0,
TData *deriv1)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
constexpr size_t ncoord = 2;
// Assign pointers.
TData **derivptr = new TData *[ncoord];
derivptr[0] = deriv0 + nq0 * nq1 * e;
derivptr[1] = deriv1 + nq0 * nq1 * e;
for (size_t j = 0, cnt = 0; j < nq1; j++)
{
for (size_t i = 0; i < nq0; i++)
{
TData deriv[2] = {derivptr[0][cnt], derivptr[1][cnt]};
for (size_t d = 0; d < ncoord; d++)
{
derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] +
diffCoeff[d * ncoord + 1] * deriv[1];
}
cnt++;
}
    }
    delete[] derivptr;
}
template <typename TData>
__global__ void DiffusionCoeff3DKernel(const size_t nq0, const size_t nq1,
const size_t nq2, const size_t nelmt,
const TData *diffCoeff, TData *deriv0,
TData *deriv1, TData *deriv2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
constexpr size_t ncoord = 3;
// Assign pointers.
TData **derivptr = new TData *[ncoord];
derivptr[0] = deriv0 + nq0 * nq1 * nq2 * e;
derivptr[1] = deriv1 + nq0 * nq1 * nq2 * e;
derivptr[2] = deriv2 + nq0 * nq1 * nq2 * e;
for (size_t k = 0, cnt = 0; k < nq2; k++)
{
for (size_t j = 0; j < nq1; j++)
{
for (size_t i = 0; i < nq0; i++)
{
TData deriv[3] = {derivptr[0][cnt], derivptr[1][cnt],
derivptr[2][cnt]};
for (size_t d = 0; d < ncoord; d++)
{
derivptr[d][cnt] = diffCoeff[d * ncoord + 0] * deriv[0] +
diffCoeff[d * ncoord + 1] * deriv[1] +
diffCoeff[d * ncoord + 2] * deriv[2];
}
cnt++;
}
}
    }
    delete[] derivptr;
}
} // namespace Nektar::Operators::detail
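The kernels in this file, like the other CUDA operators in this merge request, are launched with one thread per element; the host code sizes the grid so that m_gridSize * m_blockSize covers all elements of the current block. A minimal sketch of the equivalent ceiling-division idiom (the helper name is illustrative only):

// Hypothetical helper: smallest grid size with gridSize * blockSize >= nElmts.
inline size_t LaunchGridSize(size_t nElmts, size_t blockSize = 32)
{
    return (nElmts + blockSize - 1) / blockSize;
}
// Equivalent to the pattern used in the operator classes:
//     m_gridSize  = nElmts / m_blockSize;
//     m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;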
#include "HelmholtzStdMat.hpp"
namespace Nektar::Operators::detail
{
// Add different Helmholtz implementations to the factory.
template <>
std::string OperatorHelmholtzImpl<double, ImplStdMat>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"HelmholtzStdMat",
OperatorHelmholtzImpl<double, ImplStdMat>::instantiate, "...");
} // namespace Nektar::Operators::detail
#include "Operators/BwdTrans/BwdTransStdMat.hpp"
#include "Operators/IProductWRTBase/IProductWRTBaseStdMat.hpp"
#include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseStdMat.hpp"
#include "Operators/OperatorHelmholtz.hpp"
#include "Operators/PhysDeriv/PhysDerivStdMat.hpp"
namespace Nektar::Operators::detail
{
// standard matrix implementation
template <typename TData>
class OperatorHelmholtzImpl<TData, ImplStdMat> : public OperatorHelmholtz<TData>
{
public:
OperatorHelmholtzImpl(const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorHelmholtz<TData>(expansionList),
m_bwd(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv0(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv1(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_deriv2(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_derivcoeff0(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_derivcoeff1(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList))),
m_derivcoeff2(Field<TData, FieldState::Phys>::create(
GetBlockAttributes(FieldState::Phys, expansionList)))
{
auto nCoord = this->m_expansionList->GetCoordim(0);
m_BwdTransOp = BwdTrans<>::create(this->m_expansionList, "StdMat");
m_PhysDerivOp = PhysDeriv<>::create(this->m_expansionList, "StdMat");
m_IProductWRTBaseOp =
IProductWRTBase<>::create(this->m_expansionList, "StdMat");
m_IProductWRTDerivBaseOp =
IProductWRTDerivBase<>::create(this->m_expansionList, "StdMat");
m_diffCoeff = Array<OneD, TData>(nCoord * nCoord, 0.0);
for (size_t d = 0; d < nCoord; d++)
{
m_diffCoeff[d * nCoord + d] = 1.0; // temporary solution
}
}
void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) override
{
// Step 1: BwdTrans
m_BwdTransOp->apply(in, m_bwd);
// Step 2: PhysDeriv
m_PhysDerivOp->apply(m_bwd, m_deriv0, m_deriv1, m_deriv2);
// Step 3: Inner product for mass matrix operation
m_IProductWRTBaseOp->apply(m_bwd, out, m_lambda);
// Step 4: Multiply by diffusion coefficient
DiffusionCoeff(m_deriv0, m_deriv1, m_deriv2, m_derivcoeff0,
m_derivcoeff1, m_derivcoeff2);
// Step 5: Inner product
m_IProductWRTDerivBaseOp->apply(m_derivcoeff0, m_derivcoeff1,
m_derivcoeff2, out, true);
}
void DiffusionCoeff(Field<TData, FieldState::Phys> &deriv0,
Field<TData, FieldState::Phys> &deriv1,
Field<TData, FieldState::Phys> &deriv2,
Field<TData, FieldState::Phys> &derivcoeff0,
Field<TData, FieldState::Phys> &derivcoeff1,
Field<TData, FieldState::Phys> &derivcoeff2)
{
// Initialize pointers.
std::vector<TData *> derivptr{deriv0.GetStorage().GetCPUPtr(),
deriv1.GetStorage().GetCPUPtr(),
deriv2.GetStorage().GetCPUPtr()};
std::vector<TData *> derivcoeffptr{
derivcoeff0.GetStorage().GetCPUPtr(),
derivcoeff1.GetStorage().GetCPUPtr(),
derivcoeff2.GetStorage().GetCPUPtr()};
// Initialize index.
size_t expIdx = 0;
for (size_t block_idx = 0; block_idx < deriv0.GetBlocks().size();
++block_idx)
{
// Determine shape and type of the element.
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = deriv0.GetBlocks()[block_idx].num_elements;
auto nCoord = expPtr->GetCoordim();
auto nqTot = expPtr->GetTotPoints();
// Multiply by diffusion coefficient.
for (size_t d = 0; d < nCoord; d++)
{
Vmath::Smul(nqTot * nElmts, m_diffCoeff[d * nCoord], derivptr[0], 1,
derivcoeffptr[d], 1);
for (size_t l = 1; l < nCoord; l++)
{
Vmath::Svtvp(nqTot * nElmts, m_diffCoeff[d * nCoord + l],
derivptr[l], 1, derivcoeffptr[d], 1,
derivcoeffptr[d], 1);
}
}
// Increment pointer and index for next element type.
for (size_t d = 0; d < nCoord; d++)
{
derivptr[d] += nqTot * nElmts;
derivcoeffptr[d] += nqTot * nElmts;
}
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<OperatorHelmholtzImpl<TData, ImplStdMat>>(
expansionList);
}
void SetLambda(TData lambda)
{
m_lambda = lambda;
}
static std::string className;
private:
std::shared_ptr<OperatorBwdTrans<TData>> m_BwdTransOp;
std::shared_ptr<OperatorPhysDeriv<TData>> m_PhysDerivOp;
std::shared_ptr<OperatorIProductWRTBase<TData>> m_IProductWRTBaseOp;
std::shared_ptr<OperatorIProductWRTDerivBase<TData>>
m_IProductWRTDerivBaseOp;
Field<TData, FieldState::Phys> m_bwd;
Field<TData, FieldState::Phys> m_deriv0;
Field<TData, FieldState::Phys> m_deriv1;
Field<TData, FieldState::Phys> m_deriv2;
Field<TData, FieldState::Phys> m_derivcoeff0;
Field<TData, FieldState::Phys> m_derivcoeff1;
Field<TData, FieldState::Phys> m_derivcoeff2;
TData m_lambda = 1.0;
Array<OneD, TData> m_diffCoeff;
};
} // namespace Nektar::Operators::detail
#include "IProductWRTBaseCUDA.hpp"
namespace Nektar::Operators::detail
{
template <>
std::string OperatorIProductWRTBaseImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"IProductWRTBaseCUDA",
OperatorIProductWRTBaseImpl<double, ImplCUDA>::instantiate, "...");
}
#pragma once
#include "MemoryRegionCUDA.hpp"
#include "Operators/IProductWRTBase/IProductWRTBaseCUDAKernels.cuh"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorIProductWRTBase.hpp"
namespace Nektar::Operators::detail
{
template <typename TData, bool SCALE = false, bool APPEND = false,
bool DEFORMED = false>
void IProductWRTBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nm0, const size_t nq0,
const size_t nElmts, const TData *basis0,
const TData *w0, const TData *jac, const TData *in,
TData *out, TData scale = 1.0);
template <typename TData, bool SCALE = false, bool APPEND = false,
bool DEFORMED = false>
void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nm0, const size_t nm1,
const size_t nq0, const size_t nq1,
const size_t nElmts, const bool correct,
const TData *basis0, const TData *basis1,
const TData *w0, const TData *w1, const TData *jac,
const TData *in, TData *out, TData scale = 1.0);
template <typename TData, bool SCALE = false, bool APPEND = false,
bool DEFORMED = false>
void IProductWRTBase3DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nm0, const size_t nm1,
const size_t nm2, const size_t nq0,
const size_t nq1, const size_t nq2,
const size_t nElmts, const bool correct,
const TData *basis0, const TData *basis1,
const TData *basis2, const TData *w0,
const TData *w1, const TData *w2, const TData *jac,
const TData *in, TData *out, TData scale = 1.0);
// IProductWRTBase implementation
template <typename TData>
class OperatorIProductWRTBaseImpl<TData, ImplCUDA>
: public OperatorIProductWRTBase<TData>
{
public:
OperatorIProductWRTBaseImpl(
const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorIProductWRTBase<TData>(expansionList)
{
// Initialise jacobian.
size_t jacSize = Operator<TData>::GetGeometricFactorSize();
auto jac = Operator<TData>::SetJacobian(jacSize);
cudaMalloc((void **)&m_jac, sizeof(TData) * jacSize);
cudaMemcpy(m_jac, jac.get(), sizeof(TData) * jacSize,
cudaMemcpyHostToDevice);
// Initialize basis.
m_basis = GetBasisDataCUDA<TData>(expansionList);
// Initialize weight.
m_weight = GetWeightDataCUDA<TData>(expansionList);
}
~OperatorIProductWRTBaseImpl(void)
{
DeallocateDataCUDA<TData>(m_basis);
DeallocateDataCUDA<TData>(m_weight);
cudaFree(m_jac);
}
void apply(Field<TData, FieldState::Phys> &in,
Field<TData, FieldState::Coeff> &out,
const TData lambda = 1.0) override
{
// Copy memory to GPU, if necessary, and get raw pointers.
auto const *inptr =
in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
// Initialize index.
size_t expIdx = 0;
auto jacptr = m_jac;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the blocks.
for (size_t block_idx = 0; block_idx < in.GetBlocks().size();
++block_idx)
{
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = in.GetBlocks()[block_idx].num_elements;
auto nqTot = expPtr->GetTotPoints();
auto nmTot = expPtr->GetNcoeffs();
auto ptsKeys = expPtr->GetPointsKeys();
auto deformed = expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed;
// Determine CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Flag for collapsed coordinate correction.
bool correct = expPtr->GetBasis(0)->GetBasisType() ==
LibUtilities::eModified_A;
// Fetch basis key for the current element type.
for (size_t d = 0; d < expPtr->GetShapeDimension(); d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Function call to kernel functions.
if (expPtr->GetShapeDimension() == 1)
{
auto basis0 = m_basis[basisKeys][0];
auto w0 = m_weight[basisKeys][0];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nq0 = expPtr->GetNumPoints(0);
if (deformed)
{
if (lambda == 1.0)
{
IProductWRTBase1DKernel<TData, false, false, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr);
}
else
{
IProductWRTBase1DKernel<TData, true, false, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr, lambda);
}
}
else
{
if (lambda == 1.0)
{
IProductWRTBase1DKernel<TData, false, false, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr);
}
else
{
IProductWRTBase1DKernel<TData, true, false, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, basis0,
w0, jacptr, inptr, outptr, lambda);
}
}
}
else if (expPtr->GetShapeDimension() == 2)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto shape = expPtr->DetShapeType();
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
if (deformed)
{
if (lambda == 1.0)
{
IProductWRTBase2DKernel<TData, false, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr);
}
else
{
IProductWRTBase2DKernel<TData, true, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr, lambda);
}
}
else
{
if (lambda == 1.0)
{
IProductWRTBase2DKernel<TData, false, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr);
}
else
{
IProductWRTBase2DKernel<TData, true, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, basis1, w0, w1, jacptr,
inptr, outptr, lambda);
}
}
}
else if (expPtr->GetShapeDimension() == 3)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto basis2 = m_basis[basisKeys][2];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto w2 = m_weight[basisKeys][2];
auto shape = expPtr->DetShapeType();
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nm2 = expPtr->GetBasisNumModes(2);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
auto nq2 = expPtr->GetNumPoints(2);
if (deformed)
{
if (lambda == 1.0)
{
IProductWRTBase3DKernel<TData, false, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr);
}
else
{
IProductWRTBase3DKernel<TData, true, false, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr, lambda);
}
}
else
{
if (lambda == 1.0)
{
IProductWRTBase3DKernel<TData, false, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr);
}
else
{
IProductWRTBase3DKernel<TData, true, false, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0,
nq1, nq2, nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jacptr, inptr, outptr, lambda);
}
}
}
// Increment pointer and index for next element type.
jacptr += deformed ? nqTot * nElmts : nElmts;
inptr += nqTot * nElmts;
outptr += nmTot * nElmts;
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
MultiRegions::ExpListSharedPtr expansionList)
{
return std::make_unique<OperatorIProductWRTBaseImpl<TData, ImplCUDA>>(
expansionList);
}
static std::string className;
private:
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_basis;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>
m_weight;
TData *m_jac;
size_t m_blockSize = 32;
size_t m_gridSize;
};
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
void IProductWRTBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nm0, const size_t nq0,
const size_t nElmts, const TData *basis0,
const TData *w0, const TData *jac, const TData *in,
TData *out, TData scale)
{
IProductWRTBaseSegKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nq0, nElmts, basis0, w0, jac, in, out,
scale);
}
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
void IProductWRTBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nm0, const size_t nm1,
const size_t nq0, const size_t nq1,
const size_t nElmts, const bool correct,
const TData *basis0, const TData *basis1,
const TData *w0, const TData *w1, const TData *jac,
const TData *in, TData *out, TData scale)
{
if (shapetype == LibUtilities::Quad)
{
IProductWRTBaseQuadKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nq0, nq1, nElmts, basis0,
basis1, w0, w1, jac, in, out, scale);
}
else if (shapetype == LibUtilities::Tri)
{
size_t nmTot =
LibUtilities::StdTriData::getNumberOfCoefficients(nm0, nm1);
IProductWRTBaseTriKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nmTot, nq0, nq1, nElmts,
correct, basis0, basis1, w0, w1, jac, in,
out, scale);
}
}
template <typename TData, bool SCALE, bool APPEND, bool DEFORMED>
void IProductWRTBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nm0, const size_t nm1,
const size_t nm2, const size_t nq0, const size_t nq1, const size_t nq2,
const size_t nElmts, const bool correct, const TData *basis0,
const TData *basis1, const TData *basis2, const TData *w0, const TData *w1,
const TData *w2, const TData *jac, const TData *in, TData *out, TData scale)
{
if (shapetype == LibUtilities::Hex)
{
IProductWRTBaseHexKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nq0, nq1, nq2, nElmts,
basis0, basis1, basis2, w0, w1, w2, jac,
in, out, scale);
}
else if (shapetype == LibUtilities::Tet)
{
size_t nmTot =
LibUtilities::StdTetData::getNumberOfCoefficients(nm0, nm1, nm2);
IProductWRTBaseTetKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2,
nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jac, in, out, scale);
}
else if (shapetype == LibUtilities::Pyr)
{
size_t nmTot =
LibUtilities::StdPyrData::getNumberOfCoefficients(nm0, nm1, nm2);
IProductWRTBasePyrKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2,
nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jac, in, out, scale);
}
else if (shapetype == LibUtilities::Prism)
{
size_t nmTot =
LibUtilities::StdPrismData::getNumberOfCoefficients(nm0, nm1, nm2);
IProductWRTBasePrismKernel<TData, SCALE, APPEND, DEFORMED>
<<<gridSize, blockSize>>>(nm0, nm1, nm2, nmTot, nq0, nq1, nq2,
nElmts, correct, basis0, basis1, basis2,
w0, w1, w2, jac, in, out, scale);
}
}
} // namespace Nektar::Operators::detail
[This diff is collapsed; contents not shown.]
@@ -14,52 +14,14 @@ public:
         const MultiRegions::ExpListSharedPtr &expansionList)
         : OperatorIProductWRTBase<TData>(std::move(expansionList))
     {
-        size_t jacSize = 0;
-        size_t nTotElmts = this->m_expansionList->GetNumElmts();
-
-        // Calculate the jacobian array size
-        for (size_t e = 0; e < nTotElmts; ++e)
-        {
-            // Determine shape and type of the element
-            auto const expPtr = this->m_expansionList->GetExp(e);
-            if (expPtr->GetMetricInfo()->GetGtype() ==
-                SpatialDomains::eDeformed)
-            {
-                jacSize += expPtr->GetTotPoints();
-            }
-            else
-            {
-                jacSize++;
-            }
-        }
-
-        // Allocate memory for the jacobian
-        m_jac = {jacSize, 0.0};
-
         // Initialise jacobian.
-        size_t index = 0;
-        for (size_t e = 0; e < nTotElmts; ++e)
-        {
-            auto expPtr = this->m_expansionList->GetExp(e);
-            auto &auxJac =
-                expPtr->GetMetricInfo()->GetJac(expPtr->GetPointsKeys());
-            if (expPtr->GetMetricInfo()->GetGtype() ==
-                SpatialDomains::eDeformed)
-            {
-                size_t nqe = expPtr->GetTotPoints();
-                for (size_t i = 0; i < nqe; ++i)
-                {
-                    m_jac[index++] = auxJac[i];
-                }
-            }
-            else
-            {
-                m_jac[index++] = auxJac[0];
-            }
-        }
+        size_t jacSize = Operator<TData>::GetGeometricFactorSize();
+        m_jac = Operator<TData>::SetJacobian(jacSize);
     }

     void apply(Field<TData, FieldState::Phys> &in,
-               Field<TData, FieldState::Coeff> &out) override
+               Field<TData, FieldState::Coeff> &out,
+               const TData lambda = 1.0) override
     {
         auto const *inptr = in.GetStorage().GetCPUPtr();
         auto *outptr = out.GetStorage().GetCPUPtr();

@@ -80,7 +42,7 @@ public:
             // This is the B^{T} matrix
             auto const matPtr = expPtr->GetStdMatrix(key);

-            Array<OneD, NekDouble> wsp(nqTot * nElmts, 0.0);
+            Array<OneD, TData> wsp(nqTot * nElmts, 0.0);

             if (expPtr->GetMetricInfo()->GetGtype() ==
                 SpatialDomains::eDeformed)
             {

@@ -103,7 +65,7 @@ public:
             }

             Blas::Dgemm('N', 'N', matPtr->GetRows(), nElmts,
-                        matPtr->GetColumns(), 1.0, matPtr->GetRawPtr(),
+                        matPtr->GetColumns(), lambda, matPtr->GetRawPtr(),
                         matPtr->GetRows(), wsp.get(), nqTot, 0.0, outptr,
                         nmTot);

@@ -123,7 +85,7 @@ public:
     static std::string className;

 private:
-    Array<OneD, NekDouble> m_jac;
+    Array<OneD, TData> m_jac;
 };

 } // namespace Nektar::Operators::detail
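The new lambda argument in the StdMat IProductWRTBase above is forwarded as the alpha factor of the Dgemm call, so the operator now returns the lambda-scaled inner product; the Helmholtz implementations rely on this for their mass-matrix term (Step 3). Schematically (a sketch, not taken from the source),

    \hat{u}_n = \lambda \int_\Omega \phi_n \, u \, d\mathbf{x}.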
#include "IProductWRTDerivBaseCUDA.hpp"
namespace Nektar::Operators::detail
{
template <>
std::string OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"IProductWRTDerivBaseCUDA",
OperatorIProductWRTDerivBaseImpl<double, ImplCUDA>::instantiate, "...");
}
#include "MemoryRegionCUDA.hpp"
#include "Operators/IProductWRTBase/IProductWRTBaseCUDA.hpp"
#include "Operators/IProductWRTDerivBase/IProductWRTDerivBaseCUDAKernels.cuh"
#include "Operators/OperatorHelper.cuh"
#include "Operators/OperatorIProductWRTDerivBase.hpp"
namespace Nektar::Operators::detail
{
template <typename TData, bool DEFORMED = false>
void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nq0, const size_t nCoord,
const size_t nElmts, const size_t dfSize,
TData *df, TData *in0, TData *in1, TData *in2,
TData *out0);
template <typename TData, bool DEFORMED = false>
void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nq0, const size_t nq1,
const size_t nCoord, const size_t nElmts,
const TData *Z0, const TData *Z1,
const size_t dfSize, TData *df, TData *in0,
TData *in1, TData *in2, TData *out0,
TData *out1);
template <typename TData, bool DEFORMED = false>
void IProductWRTDerivBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1,
const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0,
const TData *Z1, const TData *Z2, const size_t dfSize, TData *df,
TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2);
// IProductWRTDerivBase implementation
template <typename TData>
class OperatorIProductWRTDerivBaseImpl<TData, ImplCUDA>
: public OperatorIProductWRTDerivBase<TData>
{
public:
OperatorIProductWRTDerivBaseImpl(
const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorIProductWRTDerivBase<TData>(expansionList)
{
size_t nDim = this->m_expansionList->GetShapeDimension();
size_t nCoord = this->m_expansionList->GetCoordim(0);
m_dfSize = Operator<TData>::GetGeometricFactorSize();
// Initialise jacobian.
auto jac = Operator<TData>::SetJacobian(m_dfSize);
cudaMalloc((void **)&m_jac, sizeof(TData) * m_dfSize);
cudaMemcpy(m_jac, jac.get(), sizeof(TData) * m_dfSize,
cudaMemcpyHostToDevice);
// Initialise derivative factor.
auto derivFac = Operator<TData>::SetDerivativeFactor(m_dfSize);
cudaMalloc((void **)&m_derivFac,
sizeof(TData) * nDim * nCoord * m_dfSize);
for (size_t d = 0; d < nDim * nCoord; d++)
{
auto hostPtr = derivFac[d].get();
auto devicePtr = m_derivFac + d * m_dfSize;
cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * m_dfSize,
cudaMemcpyHostToDevice);
}
// Initialize basis.
m_basis = GetBasisDataCUDA<TData>(expansionList);
// Initialize basis derivative.
m_dbasis = GetDeriveBasisDataCUDA<TData>(expansionList);
// Initialize weight.
m_weight = GetWeightDataCUDA<TData>(expansionList);
// Initialize points.
m_Z = GetPointDataCUDA<TData>(expansionList);
// Initialize derivative matrix.
m_D = GetDerivativeDataCUDA<TData>(expansionList);
// Initialize workspace memory.
auto ndata = this->m_expansionList->GetTotPoints();
cudaMalloc((void **)&m_wsp0, sizeof(TData) * ndata);
if (nCoord > 1)
{
cudaMalloc((void **)&m_wsp1, sizeof(TData) * ndata);
}
if (nCoord > 2)
{
cudaMalloc((void **)&m_wsp2, sizeof(TData) * ndata);
}
}
~OperatorIProductWRTDerivBaseImpl(void)
{
DeallocateDataCUDA<TData>(m_basis);
DeallocateDataCUDA<TData>(m_dbasis);
DeallocateDataCUDA<TData>(m_weight);
DeallocateDataCUDA<TData>(m_Z);
DeallocateDataCUDA<TData>(m_D);
cudaFree(m_jac);
cudaFree(m_derivFac);
}
void apply(Field<TData, FieldState::Phys> &in0,
Field<TData, FieldState::Phys> &in1,
Field<TData, FieldState::Phys> &in2,
Field<TData, FieldState::Coeff> &out,
bool APPEND = false) override
{
if (!APPEND) // Zero output (temporary solution)
{
auto *tmpptr =
out.template GetStorage<MemoryRegionCUDA>().GetCPUPtr();
for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
++block_idx)
{
for (size_t i = 0; i < out.GetBlocks()[block_idx].block_size;
i++)
{
*(tmpptr++) = 0.0;
}
}
}
// Copy memory to GPU, if necessary, and get raw pointers.
std::vector<TData *> inptr{
in0.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
in1.template GetStorage<MemoryRegionCUDA>().GetGPUPtr(),
in2.template GetStorage<MemoryRegionCUDA>().GetGPUPtr()};
auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
std::vector<TData *> wspptr{m_wsp0, m_wsp1, m_wsp2};
auto jacptr = m_jac;
auto dfptr = m_derivFac;
// Initialize index.
size_t expIdx = 0;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the blocks.
for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
++block_idx)
{
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = out.GetBlocks()[block_idx].num_elements;
auto nqTot = expPtr->GetTotPoints();
auto nmTot = expPtr->GetNcoeffs();
auto nDim = expPtr->GetShapeDimension();
auto nCoord = expPtr->GetCoordim();
auto shape = expPtr->DetShapeType();
auto deformed = expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed;
// Determine CUDA grid parameters.
m_gridSize = nElmts / m_blockSize;
m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
// Flag for collapsed coordinate correction.
bool correct = expPtr->GetBasis(0)->GetBasisType() ==
LibUtilities::eModified_A;
// Fetch basis key for the current element type.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Function call to kernel functions.
if (nDim == 1)
{
auto dbasis0 = m_dbasis[basisKeys][0];
auto w0 = m_weight[basisKeys][0];
auto D0 = m_D[basisKeys][0];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nq0 = expPtr->GetNumPoints(0);
if (deformed)
{
IProductWRTDerivBase1DKernel<TData, true>(
m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize,
dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]);
IProductWRTBase1DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0,
jacptr, wspptr[0], outptr);
}
else
{
IProductWRTDerivBase1DKernel<TData, false>(
m_gridSize, m_blockSize, nq0, nCoord, nElmts, m_dfSize,
dfptr, inptr[0], inptr[1], inptr[2], wspptr[0]);
IProductWRTBase1DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, nm0, nq0, nElmts, dbasis0, w0,
jacptr, wspptr[0], outptr);
}
}
else if (nDim == 2)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto dbasis0 = m_dbasis[basisKeys][0];
auto dbasis1 = m_dbasis[basisKeys][1];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto D0 = m_D[basisKeys][0];
auto D1 = m_D[basisKeys][1];
auto Z0 = m_Z[basisKeys][0];
auto Z1 = m_Z[basisKeys][1];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
if (deformed)
{
IProductWRTDerivBase2DKernel<TData, true>(
m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1]);
IProductWRTBase2DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, dbasis0, basis1, w0, w1, jacptr,
wspptr[0], outptr);
IProductWRTBase2DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, dbasis1, w0, w1, jacptr,
wspptr[1], outptr);
}
else
{
IProductWRTDerivBase2DKernel<TData, false>(
m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
nElmts, Z0, Z1, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1]);
IProductWRTBase2DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, dbasis0, basis1, w0, w1, jacptr,
wspptr[0], outptr);
IProductWRTBase2DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nq0, nq1,
nElmts, correct, basis0, dbasis1, w0, w1, jacptr,
wspptr[1], outptr);
}
}
else if (nDim == 3)
{
auto basis0 = m_basis[basisKeys][0];
auto basis1 = m_basis[basisKeys][1];
auto basis2 = m_basis[basisKeys][2];
auto dbasis0 = m_dbasis[basisKeys][0];
auto dbasis1 = m_dbasis[basisKeys][1];
auto dbasis2 = m_dbasis[basisKeys][2];
auto w0 = m_weight[basisKeys][0];
auto w1 = m_weight[basisKeys][1];
auto w2 = m_weight[basisKeys][2];
auto D0 = m_D[basisKeys][0];
auto D1 = m_D[basisKeys][1];
auto D2 = m_D[basisKeys][2];
auto Z0 = m_Z[basisKeys][0];
auto Z1 = m_Z[basisKeys][1];
auto Z2 = m_Z[basisKeys][2];
auto nm0 = expPtr->GetBasisNumModes(0);
auto nm1 = expPtr->GetBasisNumModes(1);
auto nm2 = expPtr->GetBasisNumModes(2);
auto nq0 = expPtr->GetNumPoints(0);
auto nq1 = expPtr->GetNumPoints(1);
auto nq2 = expPtr->GetNumPoints(2);
if (deformed)
{
IProductWRTDerivBase3DKernel<TData, true>(
m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1], wspptr[2]);
IProductWRTBase3DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1,
w2, jacptr, wspptr[0], outptr);
IProductWRTBase3DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1,
w2, jacptr, wspptr[1], outptr);
IProductWRTBase3DKernel<TData, false, true, true>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1,
w2, jacptr, wspptr[2], outptr);
}
else
{
IProductWRTDerivBase3DKernel<TData, false>(
m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
nElmts, Z0, Z1, Z2, m_dfSize, dfptr, inptr[0], inptr[1],
inptr[2], wspptr[0], wspptr[1], wspptr[2]);
IProductWRTBase3DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, dbasis0, basis1, basis2, w0, w1,
w2, jacptr, wspptr[0], outptr);
IProductWRTBase3DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, dbasis1, basis2, w0, w1,
w2, jacptr, wspptr[1], outptr);
IProductWRTBase3DKernel<TData, false, true, false>(
m_gridSize, m_blockSize, shape, nm0, nm1, nm2, nq0, nq1,
nq2, nElmts, correct, basis0, basis1, dbasis2, w0, w1,
w2, jacptr, wspptr[2], outptr);
}
}
// Increment pointer and index for next element type.
jacptr += deformed ? nqTot * nElmts : nElmts;
dfptr += deformed ? nqTot * nElmts : nElmts;
for (size_t d = 0; d < nDim; d++)
{
inptr[d] += nqTot * nElmts;
wspptr[d] += nqTot * nElmts;
}
outptr += nmTot * nElmts;
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
MultiRegions::ExpListSharedPtr expansionList)
{
return std::make_unique<
OperatorIProductWRTDerivBaseImpl<TData, ImplCUDA>>(expansionList);
}
static std::string className;
private:
TData *m_wsp0, *m_wsp1, *m_wsp2;
TData *m_derivFac;
TData *m_jac;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_basis;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>
m_dbasis;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>
m_weight;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_D;
std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_Z;
size_t m_dfSize;
size_t m_blockSize = 32;
size_t m_gridSize;
};
template <typename TData, bool DEFORMED>
void IProductWRTDerivBase1DKernel(const size_t gridSize, const size_t blockSize,
const size_t nq0, const size_t nCoord,
const size_t nElmts, const size_t dfSize,
TData *df, TData *in0, TData *in1, TData *in2,
TData *out0)
{
IProductWRTDerivBaseSegKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
nq0, nCoord, nElmts, dfSize, df, in0, in1, in2, out0);
}
template <typename TData, bool DEFORMED>
void IProductWRTDerivBase2DKernel(const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype,
const size_t nq0, const size_t nq1,
const size_t nCoord, const size_t nElmts,
const TData *Z0, const TData *Z1,
const size_t dfSize, TData *df, TData *in0,
TData *in1, TData *in2, TData *out0,
TData *out1)
{
if (shapetype == LibUtilities::Quad)
{
IProductWRTDerivBaseQuadKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, dfSize, df, in0,
in1, in2, out0, out1);
}
else if (shapetype == LibUtilities::Tri)
{
IProductWRTDerivBaseTriKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, Z0, Z1, dfSize,
df, in0, in1, in2, out0, out1);
}
}
template <typename TData, bool DEFORMED>
void IProductWRTDerivBase3DKernel(
const size_t gridSize, const size_t blockSize,
LibUtilities::ShapeType shapetype, const size_t nq0, const size_t nq1,
const size_t nq2, const size_t nCoord, const size_t nElmts, const TData *Z0,
const TData *Z1, const TData *Z2, const size_t dfSize, TData *df,
TData *in0, TData *in1, TData *in2, TData *out0, TData *out1, TData *out2)
{
if (shapetype == LibUtilities::Hex)
{
IProductWRTDerivBaseHexKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, dfSize, df,
in0, in1, in2, out0, out1, out2);
}
else if (shapetype == LibUtilities::Tet)
{
IProductWRTDerivBaseTetKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1,
in2, out0, out1, out2);
}
else if (shapetype == LibUtilities::Pyr)
{
IProductWRTDerivBasePyrKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2, dfSize, df, in0, in1,
in2, out0, out1, out2);
}
else if (shapetype == LibUtilities::Prism)
{
IProductWRTDerivBasePrismKernel<TData, DEFORMED>
<<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z2,
dfSize, df, in0, in1, in2, out0, out1,
out2);
}
}
} // namespace Nektar::Operators::detail
namespace Nektar::Operators::detail
{
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseSegKernel(const size_t nq0,
const size_t ncoord,
const size_t nelmt,
const size_t dfSize, TData *df,
TData *in0, TData *in1,
TData *in2, TData *out0)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
size_t ndf = ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * e);
if (ncoord > 1)
{
inptr[1] = in1 + (nq0 * e);
}
if (ncoord > 2)
{
inptr[2] = in2 + (nq0 * e);
}
TData *outptr0 = out0 + (nq0 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t i = 0; i < nq0; ++i)
{
size_t dfindex = DEFORMED ? i : 0;
outptr0[i] = dfptr[0][dfindex] * inptr[0][i];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[i] += (dfptr[d][dfindex] * inptr[d][i]);
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseQuadKernel(
const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt,
const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2,
TData *out0, TData *out1)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 2 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * e);
inptr[1] = in1 + (nq0 * nq1 * e);
if (ncoord > 2)
{
inptr[2] = in2 + (nq0 * nq1 * e);
}
TData *outptr0 = out0 + (nq0 * nq1 * e);
TData *outptr1 = out1 + (nq0 * nq1 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t i = 0; i < nq0 * nq1; ++i)
{
size_t dfindex = DEFORMED ? i : 0;
outptr0[i] = dfptr[0][dfindex] * inptr[0][i];
outptr1[i] = dfptr[1][dfindex] * inptr[0][i];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[i] += (dfptr[2 * d][dfindex] * inptr[d][i]);
outptr1[i] += (dfptr[2 * d + 1][dfindex] * inptr[d][i]);
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseTriKernel(
const size_t nq0, const size_t nq1, const size_t ncoord, const size_t nelmt,
const TData *Z0, const TData *Z1, const size_t dfSize, TData *df,
TData *in0, TData *in1, TData *in2, TData *out0, TData *out1)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 2 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * e);
inptr[1] = in1 + (nq0 * nq1 * e);
if (ncoord > 2)
{
inptr[2] = in2 + (nq0 * nq1 * e);
}
TData *outptr0 = out0 + (nq0 * nq1 * e);
TData *outptr1 = out1 + (nq0 * nq1 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t j = 0, cnt_ji = 0; j < nq1; ++j)
{
TData f0 = 2.0 / (1.0 - Z1[j]);
for (size_t i = 0; i < nq0; ++i, ++cnt_ji)
{
size_t dfindex = DEFORMED ? cnt_ji : 0;
outptr0[cnt_ji] = dfptr[0][dfindex] * inptr[0][cnt_ji];
outptr1[cnt_ji] = dfptr[1][dfindex] * inptr[0][cnt_ji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_ji] += (dfptr[2 * d][dfindex] * inptr[d][cnt_ji]);
outptr1[cnt_ji] +=
(dfptr[2 * d + 1][dfindex] * inptr[d][cnt_ji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_ji] += outptr1[cnt_ji] * f1;
outptr0[cnt_ji] *= f0;
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseHexKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const size_t dfSize, TData *df, TData *in0, TData *in1,
TData *in2, TData *out0, TData *out1, TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t i = 0; i < nq0 * nq1 * nq2; ++i)
{
size_t dfindex = DEFORMED ? i : 0;
outptr0[i] = dfptr[0][dfindex] * inptr[0][i];
outptr1[i] = dfptr[1][dfindex] * inptr[0][i];
outptr2[i] = dfptr[2][dfindex] * inptr[0][i];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[i] += (dfptr[3 * d][dfindex] * inptr[d][i]);
outptr1[i] += (dfptr[3 * d + 1][dfindex] * inptr[d][i]);
outptr2[i] += (dfptr[3 * d + 2][dfindex] * inptr[d][i]);
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBaseTetKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2,
const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2,
TData *out0, TData *out1, TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t k = 0, cnt_kji = 0; k < nq2; ++k)
{
TData f2 = 2.0 / (1.0 - Z2[k]);
for (size_t j = 0; j < nq1; ++j)
{
TData f3 = 0.5 * (1.0 + Z1[j]);
TData f0 = 2.0 * f2 / (1.0 - Z1[j]);
for (size_t i = 0; i < nq0; ++i, ++cnt_kji)
{
size_t dfindex = DEFORMED ? cnt_kji : 0;
outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_kji] +=
(dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
outptr1[cnt_kji] +=
(dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
outptr2[cnt_kji] +=
(dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_kji] += (outptr1[cnt_kji] + outptr2[cnt_kji]) * f1;
outptr0[cnt_kji] *= f0;
outptr1[cnt_kji] += outptr2[cnt_kji] * f3;
outptr1[cnt_kji] *= f2;
}
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBasePrismKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const TData *Z0, const TData *Z2, const size_t dfSize,
TData *df, TData *in0, TData *in1, TData *in2, TData *out0, TData *out1,
TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t k = 0, cnt_kji = 0; k < nq2; ++k)
{
TData f0 = 2.0 / (1.0 - Z2[k]);
for (size_t j = 0; j < nq1; ++j)
{
for (size_t i = 0; i < nq0; ++i, ++cnt_kji)
{
size_t dfindex = DEFORMED ? cnt_kji : 0;
outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_kji] +=
(dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
outptr1[cnt_kji] +=
(dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
outptr2[cnt_kji] +=
(dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_kji] += outptr2[cnt_kji] * f1;
outptr0[cnt_kji] *= f0;
}
}
}
delete[] inptr;
delete[] dfptr;
}
template <typename TData, bool DEFORMED>
__global__ void IProductWRTDerivBasePyrKernel(
const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2,
const size_t dfSize, TData *df, TData *in0, TData *in1, TData *in2,
TData *out0, TData *out1, TData *out2)
{
size_t e = blockDim.x * blockIdx.x + threadIdx.x;
if (e >= nelmt)
{
return;
}
const auto ndf = 3 * ncoord;
// Assign pointers.
TData **inptr = new TData *[ncoord];
inptr[0] = in0 + (nq0 * nq1 * nq2 * e);
inptr[1] = in1 + (nq0 * nq1 * nq2 * e);
inptr[2] = in2 + (nq0 * nq1 * nq2 * e);
TData *outptr0 = out0 + (nq0 * nq1 * nq2 * e);
TData *outptr1 = out1 + (nq0 * nq1 * nq2 * e);
TData *outptr2 = out2 + (nq0 * nq1 * nq2 * e);
TData **dfptr = new TData *[ndf];
for (size_t d = 0; d < ndf; d++)
{
dfptr[d] = df + d * dfSize;
dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
}
// Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
for (size_t k = 0, cnt_kji = 0; k < nq2; ++k)
{
TData f0 = 2.0 / (1.0 - Z2[k]);
for (size_t j = 0; j < nq1; ++j)
{
TData f2 = 0.5 * (1.0 + Z1[j]);
for (size_t i = 0; i < nq0; ++i, ++cnt_kji)
{
size_t dfindex = DEFORMED ? cnt_kji : 0;
outptr0[cnt_kji] = dfptr[0][dfindex] * inptr[0][cnt_kji];
outptr1[cnt_kji] = dfptr[1][dfindex] * inptr[0][cnt_kji];
outptr2[cnt_kji] = dfptr[2][dfindex] * inptr[0][cnt_kji];
for (size_t d = 1; d < ncoord; ++d)
{
outptr0[cnt_kji] +=
(dfptr[3 * d][dfindex] * inptr[d][cnt_kji]);
outptr1[cnt_kji] +=
(dfptr[3 * d + 1][dfindex] * inptr[d][cnt_kji]);
outptr2[cnt_kji] +=
(dfptr[3 * d + 2][dfindex] * inptr[d][cnt_kji]);
}
// Multiply by geometric factors
TData f1 = 0.5 * (1.0 + Z0[i]);
outptr0[cnt_kji] += outptr2[cnt_kji] * f1;
outptr0[cnt_kji] *= f0;
outptr1[cnt_kji] += outptr2[cnt_kji] * f2;
outptr1[cnt_kji] *= f0;
}
}
}
delete[] inptr;
delete[] dfptr;
}
} // namespace Nektar::Operators::detail
#include "IProductWRTDerivBaseStdMat.hpp"
namespace Nektar::Operators::detail
{
// Add different IProductWRTDerivBase implementations to the factory.
template <>
std::string OperatorIProductWRTDerivBaseImpl<double, ImplStdMat>::className =
GetOperatorFactory<double>().RegisterCreatorFunction(
"IProductWRTDerivBaseStdMat",
OperatorIProductWRTDerivBaseImpl<double, ImplStdMat>::instantiate,
"...");
} // namespace Nektar::Operators::detail
#include "Operators/OperatorIProductWRTDerivBase.hpp"
#include <StdRegions/StdExpansion.h>
namespace Nektar::Operators::detail
{
// standard matrix implementation
template <typename TData>
class OperatorIProductWRTDerivBaseImpl<TData, ImplStdMat>
: public OperatorIProductWRTDerivBase<TData>
{
public:
OperatorIProductWRTDerivBaseImpl(
const MultiRegions::ExpListSharedPtr &expansionList)
: OperatorIProductWRTDerivBase<TData>(std::move(expansionList))
{
size_t nTotElmts = this->m_expansionList->GetNumElmts();
size_t nDim = this->m_expansionList->GetShapeDimension();
// Initialise jacobian.
size_t jacSize = Operator<TData>::GetGeometricFactorSize();
m_jac = Operator<TData>::SetJacobian(jacSize);
m_derivFac = Operator<TData>::SetDerivativeFactor(jacSize);
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the elements of expansionList.
for (size_t e = 0; e < nTotElmts; ++e)
{
auto const expPtr = this->m_expansionList->GetExp(e);
// Fetch basiskeys of current element.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Copy data to m_matPtr, if necessary.
if (m_matPtr.find(basisKeys) == m_matPtr.end())
{
size_t nqTot = expPtr->GetTotPoints();
size_t nmTot = expPtr->GetNcoeffs();
auto &matPtr = m_matPtr[basisKeys];
matPtr = Array<OneD, Array<OneD, TData>>(nDim);
Array<OneD, NekDouble> tmp(nqTot), t;
for (size_t d = 0; d < nDim; ++d)
{
// Get IProductWRTDerivBase matrix.
matPtr[d] = Array<OneD, TData>(nqTot * nmTot);
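                    // Build the (nmTot x nqTot) standard-region matrix column
                    // by column: a unit impulse at quadrature point i yields
                    // column i of the IProductWRTDerivBase operator.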
for (size_t i = 0; i < nqTot; ++i)
{
Vmath::Zero(nqTot, tmp, 1);
tmp[i] = 1.0;
expPtr->GetStdExp()->IProductWRTDerivBase(
d, tmp, t = matPtr[d] + i * nmTot);
}
}
}
}
// Initialize workspace memory.
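        // The workspace holds, for each reference direction, the physical
        // values premultiplied by the derivative factors and the Jacobian
        // before the matrix products in apply().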
auto ndata = this->m_expansionList->GetTotPoints();
m_wsp = Array<OneD, Array<OneD, NekDouble>>(3);
for (size_t i = 0; i < nDim; ++i)
{
m_wsp[i] = Array<OneD, NekDouble>(ndata);
}
}
void apply(Field<TData, FieldState::Phys> &in0,
Field<TData, FieldState::Phys> &in1,
Field<TData, FieldState::Phys> &in2,
Field<TData, FieldState::Coeff> &out,
bool APPEND = false) override
{
        // Get raw CPU pointers to the data (synchronising from GPU memory if
        // necessary).
std::vector<TData *> inptr{in0.GetStorage().GetCPUPtr(),
in1.GetStorage().GetCPUPtr(),
in2.GetStorage().GetCPUPtr()};
auto *outptr = out.GetStorage().GetCPUPtr();
std::vector<TData *> wspptr{m_wsp[0].get(), m_wsp[1].get(),
m_wsp[2].get()};
// Initialize index.
size_t expIdx = 0;
size_t jacIdx = 0;
size_t dfIdx = 0;
// Initialize basiskey.
std::vector<LibUtilities::BasisKey> basisKeys(
3, LibUtilities::NullBasisKey);
// Loop over the blocks.
for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
++block_idx)
{
// Determine shape and type of the element.
auto const expPtr = this->m_expansionList->GetExp(expIdx);
auto nElmts = out.GetBlocks()[block_idx].num_elements;
auto nDim = expPtr->GetShapeDimension();
auto nCoord = expPtr->GetCoordim();
auto nqTot = expPtr->GetTotPoints();
auto nmTot = expPtr->GetNcoeffs();
auto deformed = expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed;
            // Calculate dxi/dx in[0] + dxi/dy in[1] + dxi/dz in[2]
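            // For deformed elements the derivative factors vary per quadrature
            // point, while regular (affine) elements store a single factor per
            // element, hence the two branches below.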
if (deformed)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Vmul(nqTot * nElmts, m_derivFac[d].get() + dfIdx, 1,
inptr[0], 1, wspptr[d], 1);
for (size_t i = 1; i < nCoord; ++i)
{
Vmath::Vvtvp(nqTot * nElmts,
m_derivFac[d + i * nDim].get() + dfIdx, 1,
inptr[i], 1, wspptr[d], 1, wspptr[d], 1);
}
}
}
else
{
for (size_t e = 0; e < nElmts; ++e)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Smul(nqTot, m_derivFac[d][dfIdx + e],
inptr[0] + e * nqTot, 1,
wspptr[d] + e * nqTot, 1);
for (size_t i = 1; i < nCoord; ++i)
{
Vmath::Svtvp(
nqTot, m_derivFac[d + i * nDim][dfIdx + e],
inptr[i] + e * nqTot, 1, wspptr[d] + e * nqTot,
1, wspptr[d] + e * nqTot, 1);
}
}
}
}
// Multiply by jacobian.
if (deformed)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Vmul(nqTot * nElmts, m_jac.get() + jacIdx, 1,
wspptr[d], 1, wspptr[d], 1);
}
}
else
{
for (size_t e = 0; e < nElmts; ++e)
{
for (size_t d = 0; d < nDim; ++d)
{
Vmath::Smul(nqTot, m_jac[jacIdx + e],
wspptr[d] + e * nqTot, 1,
wspptr[d] + e * nqTot, 1);
}
}
}
// Fetch basis key for the current element type.
for (size_t d = 0; d < nDim; d++)
{
basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
}
// Fetch matrix.
auto matPtr = m_matPtr[basisKeys];
// Matrix products.
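            // Each GEMM computes out += M_d * wsp_d, with M_d of size
            // nmTot x nqTot and wsp_d viewed as an nqTot x nElmts matrix;
            // beta selects between overwriting and accumulating into out.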
for (size_t d = 0; d < nDim; d++)
{
                TData beta = (d == 0 && !APPEND) ? 0.0 : 1.0;
                Blas::Dgemm('N', 'N', nmTot, nElmts, nqTot, 1.0,
                            matPtr[d].get(), nmTot, wspptr[d], nqTot, beta,
                            outptr, nmTot);
                // Advance the input and workspace pointers for this direction
                // to the next block.
                inptr[d] += nqTot * nElmts;
                wspptr[d] += nqTot * nElmts;
}
jacIdx += deformed ? nqTot * nElmts : nElmts;
dfIdx += deformed ? nqTot * nElmts : nElmts;
outptr += nmTot * nElmts;
expIdx += nElmts;
}
}
static std::unique_ptr<Operator<TData>> instantiate(
const MultiRegions::ExpListSharedPtr &expansionList)
{
return std::make_unique<
OperatorIProductWRTDerivBaseImpl<TData, ImplStdMat>>(
std::move(expansionList));
}
static std::string className;
private:
Array<OneD, TData> m_jac;
Array<OneD, Array<OneD, TData>> m_derivFac;
std::map<std::vector<LibUtilities::BasisKey>,
Array<OneD, Array<OneD, TData>>>
m_matPtr;
Array<OneD, Array<OneD, TData>> m_wsp;
};
} // namespace Nektar::Operators::detail
...@@ -60,6 +60,104 @@ public: ...@@ -60,6 +60,104 @@ public:
} }
protected: protected:
    size_t GetGeometricFactorSize()
{
size_t gfSize = 0;
size_t nTotElmts = this->m_expansionList->GetNumElmts();
// Calculate the jacobian array size
for (size_t e = 0; e < nTotElmts; ++e)
{
// Determine shape and type of the element
auto const expPtr = this->m_expansionList->GetExp(e);
if (expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed)
{
gfSize += expPtr->GetTotPoints();
}
else
{
gfSize++;
}
}
return gfSize;
}
Array<OneD, TData> SetJacobian(size_t jacSize)
{
// Allocate memory for the jacobian
Array<OneD, TData> jac(jacSize, 0.0);
// Initialise jacobian.
size_t index = 0;
size_t nTotElmts = this->m_expansionList->GetNumElmts();
for (size_t e = 0; e < nTotElmts; ++e)
{
auto expPtr = this->m_expansionList->GetExp(e);
auto &auxJac =
expPtr->GetMetricInfo()->GetJac(expPtr->GetPointsKeys());
if (expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed)
{
size_t nqe = expPtr->GetTotPoints();
for (size_t i = 0; i < nqe; ++i)
{
jac[index++] = auxJac[i];
}
}
else
{
jac[index++] = auxJac[0];
}
}
return jac;
}
Array<OneD, Array<OneD, TData>> SetDerivativeFactor(size_t dfSize)
{
// Allocate memory for the derivative factor
size_t nDim = this->m_expansionList->GetShapeDimension();
size_t nCoord = this->m_expansionList->GetCoordim(0);
Array<OneD, Array<OneD, TData>> derivFac(nDim * nCoord);
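        // derivFac[d + i * nDim] holds dxi_d/dx_i (reference direction d,
        // Cartesian coordinate i), matching the ordering of GetDerivFactors
        // and the indexing used by the operator implementations.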
for (size_t d = 0; d < nDim * nCoord; d++)
{
derivFac[d] = Array<OneD, TData>(dfSize);
}
// Initialise derivative factor.
size_t dfindex = 0;
size_t nTotElmts = this->m_expansionList->GetNumElmts();
for (size_t e = 0; e < nTotElmts; ++e)
{
auto expPtr = this->m_expansionList->GetExp(e);
auto &df = expPtr->GetMetricInfo()->GetDerivFactors(
expPtr->GetPointsKeys());
size_t nqTot = expPtr->GetTotPoints();
if (expPtr->GetMetricInfo()->GetGtype() ==
SpatialDomains::eDeformed)
{
for (size_t d = 0; d < nDim * nCoord; d++)
{
for (size_t i = 0; i < nqTot; ++i)
{
derivFac[d][dfindex + i] = df[d][i];
}
}
dfindex += nqTot;
}
else
{
for (size_t d = 0; d < nDim * nCoord; d++)
{
derivFac[d][dfindex] = df[d][0];
}
dfindex += 1;
}
}
return derivFac;
}
MultiRegions::ExpListSharedPtr m_expansionList; MultiRegions::ExpListSharedPtr m_expansionList;
}; };
......
#include "OperatorHelmholtz.hpp"
namespace Nektar::Operators
{
template <> const std::string Helmholtz<>::key = "Helmholtz";
template <> const std::string Helmholtz<>::default_impl = "MatFree";
} // namespace Nektar::Operators
#pragma once
#include "Field.hpp"
#include "Operator.hpp"
namespace Nektar::Operators
{
// Helmholtz base class
// Defines the apply operator to enforce apply parameter types
template <typename TData> class OperatorHelmholtz : public Operator<TData>
{
public:
virtual ~OperatorHelmholtz() = default;
OperatorHelmholtz(const MultiRegions::ExpListSharedPtr &expansionList)
: Operator<TData>(expansionList)
{
}
virtual void apply(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out) = 0;
virtual void operator()(Field<TData, FieldState::Coeff> &in,
Field<TData, FieldState::Coeff> &out)
{
apply(in, out);
}
};
// Descriptor / traits class for Helmholtz
template <typename TData = default_fp_type> struct Helmholtz
{
using class_name = OperatorHelmholtz<TData>;
using FieldIn = Field<TData, FieldState::Coeff>;
using FieldOut = Field<TData, FieldState::Coeff>;
static const std::string key;
static const std::string default_impl;
static std::shared_ptr<class_name> create(
const MultiRegions::ExpListSharedPtr &expansionList,
std::string pKey = "")
{
return Operator<TData>::template create<Helmholtz>(expansionList, pKey);
}
};
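// Minimal usage sketch (assuming an ExpListSharedPtr expList and two
// coefficient-space fields in/out have already been set up):
//   auto helm = Helmholtz<>::create(expList); // uses default_impl
//   (*helm)(in, out);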
namespace detail
{
// Template for Helmholtz implementations
template <typename TData, typename Op> class OperatorHelmholtzImpl;
} // namespace detail
} // namespace Nektar::Operators