diff --git a/CMakeLists.txt b/CMakeLists.txt
index ecfd7de3891ea837aef2fc09a8328cfda9f516c6..3dc3a533bfac63970319f7ded23b6177b842b4c6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,7 +66,7 @@ if (NEKTAR_USE_CUDA)
                    Operators/Helmholtz/HelmholtzCUDA.cu
                    Operators/AssmbScatr/AssmbScatrCUDA.cu 
                    Operators/NullPrecon/NullPreconCUDA.cu 
-                   #Operators/DiagPrecon/DiagPreconCUDA.cu
+                   Operators/DiagPrecon/DiagPreconCUDA.cu
                    Operators/DirBndCond/DirBndCondCUDA.cu
                    #Operators/NeuBndCond/NeuBndCondCUDA.cu
                    #Operators/RobBndCond/RobBndCondCUDA.cu
diff --git a/Operators/DiagPrecon/DiagPreconCUDA.cu b/Operators/DiagPrecon/DiagPreconCUDA.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ae592b096fc30479ae2304b28d5a4a11ac1d7e22
--- /dev/null
+++ b/Operators/DiagPrecon/DiagPreconCUDA.cu
@@ -0,0 +1,13 @@
+#include "DiagPreconCUDA.hpp"
+
+namespace Nektar::Operators::detail
+{
+
+// Register implementation with Operator Factory
+template <>
+std::string OperatorDiagPreconImpl<double, ImplCUDA>::className =
+    GetOperatorFactory<double>().RegisterCreatorFunction(
+        "DiagPreconCUDA", OperatorDiagPreconImpl<double, ImplCUDA>::instantiate,
+        "");
+
+} // namespace Nektar::Operators::detail
diff --git a/Operators/DiagPrecon/DiagPreconCUDA.hpp b/Operators/DiagPrecon/DiagPreconCUDA.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1475153ab1bdd8db3c0ff374059017c1aa9bf0f0
--- /dev/null
+++ b/Operators/DiagPrecon/DiagPreconCUDA.hpp
@@ -0,0 +1,142 @@
+#pragma once
+
+#include "Field.hpp"
+#include "Operators/AssmbScatr/AssmbScatrCUDA.hpp"
+#include "Operators/DiagPrecon/DiagPreconCUDAKernels.cuh"
+#include "Operators/OperatorDiagPrecon.hpp"
+#include "Operators/OperatorRobBndCond.hpp"
+
+#include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
+#include <MultiRegions/ContField.h>
+
+using namespace Nektar;
+using namespace Nektar::MultiRegions;
+using namespace Nektar::SpatialDomains;
+
+namespace Nektar::Operators::detail
+{
+
+template <typename TData>
+class OperatorDiagPreconImpl<TData, ImplCUDA> : public OperatorDiagPrecon<TData>
+{
+public:
+    OperatorDiagPreconImpl(const MultiRegions::ExpListSharedPtr &expansionList)
+        : OperatorDiagPrecon<TData>(expansionList)
+    {
+        auto contfield =
+            std::dynamic_pointer_cast<ContField>(this->m_expansionList);
+        m_assmbMap = contfield->GetLocalToGlobalMap();
+
+        GlobalSysSolnType solvertype = m_assmbMap->GetGlobalSysSolnType();
+        bool isFull = solvertype == eIterativeFull ? true : false;
+        m_nGlobal   = (isFull) ? m_assmbMap->GetNumGlobalCoeffs()
+                               : m_assmbMap->GetNumGlobalBndCoeffs();
+        m_nLocal    = m_assmbMap->GetNumLocalCoeffs();
+        m_nDir      = m_assmbMap->GetNumGlobalDirBndCoeffs();
+
+        cudaMalloc((void **)&m_wk, sizeof(TData) * m_nGlobal);
+        cudaMalloc((void **)&m_diag, sizeof(TData) * m_nGlobal);
+
+        m_assmbScatr =
+            std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplCUDA>>(
+                AssmbScatr<TData>::create(this->m_expansionList, "CUDA"));
+
+        // Determine CUDA grid parameters.
+        m_gridSize = m_nGlobal / m_blockSize;
+        m_gridSize += (m_nGlobal % m_blockSize == 0) ? 0 : 1;
+    }
+
+    ~OperatorDiagPreconImpl(void)
+    {
+        cudaFree(m_wk);
+        cudaFree(m_diag);
+    }
+
+    void apply(Field<TData, FieldState::Coeff> &in,
+               Field<TData, FieldState::Coeff> &out) override
+    {
+        m_assmbScatr->Assemble(in, m_wk);
+
+        DiagPreconKernel<<<m_gridSize, m_blockSize>>>(m_nGlobal, m_nDir, m_diag,
+                                                      m_wk);
+
+        cudaMemset(m_wk, 0, sizeof(TData) * m_nDir);
+
+        m_assmbScatr->GlobalToLocal(m_wk, out);
+    }
+
+    void configure(
+        const std::shared_ptr<
+            OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> &op)
+    {
+        auto robBCOp = RobBndCond<TData>::create(this->m_expansionList);
+
+        Array<OneD, TData> diag(m_nLocal);
+
+        // create unit vector field to extract diagonal
+        Field<TData, FieldState::Coeff> unit_vec =
+            Field<TData, FieldState::Coeff>::create(
+                GetBlockAttributes(FieldState::Coeff, this->m_expansionList));
+
+        // create action field to receive column action from unit vector
+        Field<TData, FieldState::Coeff> action =
+            Field<TData, FieldState::Coeff>::create(
+                GetBlockAttributes(FieldState::Coeff, this->m_expansionList));
+
+        auto *uvec_ptr = unit_vec.GetStorage().GetCPUPtr();
+        auto *actn_ptr = action.GetStorage().GetCPUPtr();
+        auto *diag_ptr = diag.get();
+
+        for (size_t i = 0; i < m_nLocal; ++i)
+        {
+            // set ith term in unit vector to be 1
+            *uvec_ptr = 1.0;
+
+            // apply operator to unit vector and store in action field
+            op->apply(unit_vec, action);
+            robBCOp->apply(unit_vec, action);
+
+            // copy ith row term from the action field to get ith diagonal
+            *(diag_ptr++) = *(actn_ptr++);
+
+            // reset ith term in unit vector to be 0
+            *(uvec_ptr++) = 0.0;
+        }
+
+        // Assembly
+        Array<OneD, TData> tmp(m_nGlobal, 0.0);
+        for (size_t i = 0; i < m_nLocal; ++i)
+        {
+            size_t gid1 = m_assmbMap->GetLocalToGlobalMap(i);
+            tmp[gid1] += diag[i];
+        }
+        m_assmbMap->UniversalAssemble(tmp);
+
+        cudaMemcpy(m_diag, tmp.get(), sizeof(TData) * m_nGlobal,
+                   cudaMemcpyHostToDevice);
+    }
+
+    // instantiation function for CreatorFunction in OperatorFactory
+    static std::unique_ptr<Operator<TData>> instantiate(
+        const MultiRegions::ExpListSharedPtr &expansionList)
+    {
+        return std::make_unique<OperatorDiagPreconImpl<TData, ImplCUDA>>(
+            expansionList);
+    }
+
+    // className - for OperatorFactory
+    static std::string className;
+
+protected:
+    std::shared_ptr<OperatorAssmbScatrImpl<TData, ImplCUDA>> m_assmbScatr;
+    AssemblyMapCGSharedPtr m_assmbMap;
+    TData *m_diag;
+    TData *m_wk;
+    size_t m_nGlobal;
+    size_t m_nLocal;
+    size_t m_nDir;
+    size_t m_gridSize;
+    size_t m_blockSize = 32;
+};
+
+} // namespace Nektar::Operators::detail
diff --git a/Operators/DiagPrecon/DiagPreconCUDAKernels.cuh b/Operators/DiagPrecon/DiagPreconCUDAKernels.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..05b9fbe3a47295de27670790f9820ec7527e295b
--- /dev/null
+++ b/Operators/DiagPrecon/DiagPreconCUDAKernels.cuh
@@ -0,0 +1,18 @@
+namespace Nektar::Operators::detail
+{
+
+template <typename TData>
+__global__ void DiagPreconKernel(const size_t nGlobal, const size_t nDir,
+                                 const TData *diagptr, TData *inoutptr)
+{
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < nDir || i >= nGlobal)
+    {
+        return;
+    }
+
+    inoutptr[i] /= diagptr[i];
+}
+
+} // namespace Nektar::Operators::detail
diff --git a/Operators/DiagPrecon/DiagPreconStdMat.hpp b/Operators/DiagPrecon/DiagPreconStdMat.hpp
index afde629154ef8e5fe60143b44bc6772f954dba28..6f15ad269dc6acdc1524b3e760cb18f152579ae7 100644
--- a/Operators/DiagPrecon/DiagPreconStdMat.hpp
+++ b/Operators/DiagPrecon/DiagPreconStdMat.hpp
@@ -1,8 +1,10 @@
 #pragma once
 
 #include "Field.hpp"
+#include "Operators/AssmbScatr/AssmbScatrStdMat.hpp"
 #include "Operators/OperatorDiagPrecon.hpp"
 #include "Operators/OperatorRobBndCond.hpp"
+
 #include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
 #include <MultiRegions/ContField.h>
 
@@ -24,85 +26,43 @@ public:
         auto contfield =
             std::dynamic_pointer_cast<ContField>(this->m_expansionList);
         m_assmbMap = contfield->GetLocalToGlobalMap();
-        m_robBCOp  = RobBndCond<TData>::create(this->m_expansionList);
-    }
 
-    void apply(Field<TData, FieldState::Coeff> &in,
-               Field<TData, FieldState::Coeff> &out) override
-    {
         GlobalSysSolnType solvertype = m_assmbMap->GetGlobalSysSolnType();
-
         bool isFull = solvertype == eIterativeFull ? true : false;
+        m_nGlobal   = (isFull) ? m_assmbMap->GetNumGlobalCoeffs()
+                               : m_assmbMap->GetNumGlobalBndCoeffs();
+        m_nLocal    = m_assmbMap->GetNumLocalCoeffs();
+        m_nDir      = m_assmbMap->GetNumGlobalDirBndCoeffs();
 
-        size_t nGlobal = (isFull) ? m_assmbMap->GetNumGlobalCoeffs()
-                                  : m_assmbMap->GetNumGlobalBndCoeffs();
-        size_t nDir    = m_assmbMap->GetNumGlobalDirBndCoeffs();
-        size_t nLocal  = m_assmbMap->GetNumLocalCoeffs();
+        m_wk   = Array<OneD, TData>(m_nGlobal, 0.0);
+        m_diag = Array<OneD, TData>(m_nGlobal, 0.0);
 
-        auto *diag_ptr = m_diag.get();
-
-        // Copy data from input field
-        Array<OneD, TData> pIn(nLocal, 0.0);
-        auto *inarrptr = pIn.data();
-        auto *inptr    = in.GetStorage().GetCPUPtr();
-
-        for (size_t block_idx = 0; block_idx < in.GetBlocks().size();
-             ++block_idx)
-        {
-            auto nSize  = in.GetBlocks()[block_idx].block_size;
-            auto nElmts = in.GetBlocks()[block_idx].num_elements;
-            auto nmTot  = in.GetBlocks()[block_idx].num_pts;
-
-            std::copy(inptr, inptr + nElmts * nmTot, inarrptr);
+        m_assmbScatr =
+            std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplStdMat>>(
+                AssmbScatr<TData>::create(this->m_expansionList));
+    }
 
-            inarrptr += nElmts * nmTot;
-            inptr += nSize;
-        }
+    void apply(Field<TData, FieldState::Coeff> &in,
+               Field<TData, FieldState::Coeff> &out) override
+    {
+        m_assmbScatr->Assemble(in, m_wk);
 
-        Array<OneD, TData> wk(nGlobal, 0.0);
-        Array<OneD, TData> pOut(nLocal, 0.0);
-        (isFull) ? m_assmbMap->Assemble(pIn, wk)
-                 : m_assmbMap->AssembleBnd(pIn, wk);
-        std::transform(wk.get() + nDir, wk.get() + nGlobal, diag_ptr + nDir,
-                       wk.get() + nDir,
+        std::transform(m_wk.get() + m_nDir, m_wk.get() + m_nGlobal,
+                       m_diag.get() + m_nDir, m_wk.get() + m_nDir,
                        [](TData in, TData diag) { return in / diag; });
-        std::fill(wk.get(), wk.get() + nDir, 0.0);
-        (isFull) ? m_assmbMap->GlobalToLocal(wk, pOut)
-                 : m_assmbMap->GlobalToLocalBnd(wk, pOut);
-
-        // Copy data to output field
-        auto *outarrptr = pOut.data();
-        auto *outptr    = out.GetStorage().GetCPUPtr();
-
-        for (size_t block_idx = 0; block_idx < out.GetBlocks().size();
-             ++block_idx)
-        {
-            auto nSize  = out.GetBlocks()[block_idx].block_size;
-            auto nElmts = out.GetBlocks()[block_idx].num_elements;
-            auto nmTot  = out.GetBlocks()[block_idx].num_pts;
 
-            std::copy(outarrptr, outarrptr + nElmts * nmTot, outptr);
+        std::fill(m_wk.get(), m_wk.get() + m_nDir, 0.0);
 
-            outarrptr += nElmts * nmTot;
-            outptr += nSize;
-        }
+        m_assmbScatr->GlobalToLocal(m_wk, out);
     }
 
     void configure(
         const std::shared_ptr<
             OperatorLinear<TData, FieldState::Coeff, FieldState::Coeff>> &op)
     {
-        GlobalSysSolnType solvertype = m_assmbMap->GetGlobalSysSolnType();
-
-        bool isFull = solvertype == eIterativeFull ? true : false;
-
-        size_t nLocal  = m_assmbMap->GetNumLocalCoeffs();
-        size_t nGlobal = (isFull) ? m_assmbMap->GetNumGlobalCoeffs()
-                                  : m_assmbMap->GetNumGlobalBndCoeffs();
-
-        Array<OneD, TData> diag(nLocal, 0.0);
+        auto robBCOp = RobBndCond<TData>::create(this->m_expansionList);
 
-        m_diag = Array<OneD, TData>(nGlobal, 0.0);
+        Array<OneD, TData> diag(m_nLocal);
 
         // create unit vector field to extract diagonal
         Field<TData, FieldState::Coeff> unit_vec =
@@ -118,14 +78,14 @@ public:
         auto *actn_ptr = action.GetStorage().GetCPUPtr();
         auto *diag_ptr = diag.get();
 
-        for (size_t i = 0; i < nLocal; ++i)
+        for (size_t i = 0; i < m_nLocal; ++i)
         {
             // set ith term in unit vector to be 1
             *uvec_ptr = 1.0;
 
             // apply operator to unit vector and store in action field
             op->apply(unit_vec, action);
-            m_robBCOp->apply(unit_vec, action);
+            robBCOp->apply(unit_vec, action);
 
             // copy ith row term from the action field to get ith diagonal
             *(diag_ptr++) = *(actn_ptr++);
@@ -135,7 +95,7 @@ public:
         }
 
         // Assembly
-        for (size_t i = 0; i < nLocal; ++i)
+        for (size_t i = 0; i < m_nLocal; ++i)
         {
             size_t gid1 = m_assmbMap->GetLocalToGlobalMap(i);
             m_diag[gid1] += diag[i];
@@ -155,9 +115,13 @@ public:
     static std::string className;
 
 protected:
-    std::shared_ptr<OperatorRobBndCond<TData>> m_robBCOp;
+    std::shared_ptr<OperatorAssmbScatrImpl<TData, ImplStdMat>> m_assmbScatr;
     AssemblyMapCGSharedPtr m_assmbMap;
     Array<OneD, TData> m_diag;
+    Array<OneD, TData> m_wk;
+    size_t m_nGlobal;
+    size_t m_nLocal;
+    size_t m_nDir;
 };
 
 } // namespace Nektar::Operators::detail
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6981fc0ba90a5668540a0a80b6adc32d62096ae9..57483308d32199e13431bce599b2e97f2f9bf68a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -245,6 +245,17 @@ target_include_directories(test_diagprecon PRIVATE ${NEKTAR++_INCLUDE_DIRS})
 target_compile_definitions(test_diagprecon PRIVATE
     -DTEST_PATH="${CMAKE_SOURCE_DIR}")
 
+IF (NEKTAR_USE_CUDA)
+    add_executable(test_diagpreconcuda test_diagpreconcuda.cpp)
+    target_link_libraries(test_diagpreconcuda PRIVATE Operators)
+    target_link_libraries(test_diagpreconcuda PRIVATE ${NEKTAR++_LIBRARIES})
+    target_link_libraries(test_diagpreconcuda PRIVATE Boost::unit_test_framework)
+    target_include_directories(test_diagpreconcuda PRIVATE "${CMAKE_SOURCE_DIR}")
+    target_include_directories(test_diagpreconcuda PRIVATE ${NEKTAR++_INCLUDE_DIRS})
+    target_compile_definitions(test_diagpreconcuda PRIVATE
+        -DTEST_PATH="${CMAKE_SOURCE_DIR}")
+ENDIF()
+
 add_executable(test_fwdtrans test_fwdtrans.cpp)
 target_link_libraries(test_fwdtrans PRIVATE Operators)
 target_link_libraries(test_fwdtrans PRIVATE ${NEKTAR++_LIBRARIES})
@@ -451,6 +462,14 @@ add_test(
   WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
 )
 
+IF (NEKTAR_USE_CUDA)
+    add_test(
+      NAME DiagPreconCUDA
+      COMMAND test_diagpreconcuda
+      WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    )
+ENDIF()
+
 add_test(
   NAME FwdTrans
   COMMAND test_fwdtrans
diff --git a/tests/test_diagprecon.cpp b/tests/test_diagprecon.cpp
index 14a65d0138d4d706140cd97db4f9f7a8e4a0540d..a9c49b8a659936776ce38296193a469fc11f619c 100644
--- a/tests/test_diagprecon.cpp
+++ b/tests/test_diagprecon.cpp
@@ -12,11 +12,6 @@
 
 BOOST_AUTO_TEST_SUITE(TestDiagPrecon)
 
-using namespace std;
-using namespace Nektar::Operators;
-using namespace Nektar::LibUtilities;
-using namespace Nektar;
-
 BOOST_FIXTURE_TEST_CASE(diagprecon_seg, Helmholtz1D_Seg)
 {
     Configure();
diff --git a/tests/test_diagpreconcuda.cpp b/tests/test_diagpreconcuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..df15cfa8dd7baa769854a6d23645411475ad3401
--- /dev/null
+++ b/tests/test_diagpreconcuda.cpp
@@ -0,0 +1,129 @@
+#define BOOST_TEST_DYN_LINK
+#define BOOST_TEST_MODULE TestDiagPreconCUDA
+#include <boost/test/tools/output_test_stream.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <iostream>
+#include <memory>
+
+#include "Operators/OperatorDiagPrecon.hpp"
+#include "Operators/OperatorHelmholtz.hpp"
+#include "init_diagpreconfields.hpp"
+
+BOOST_AUTO_TEST_SUITE(TestDiagPreconCUDA)
+
+BOOST_FIXTURE_TEST_CASE(diagpreconcuda_seg, Helmholtz1D_Seg)
+{
+    Configure();
+    SetTestCase(fixtcuda_in->GetBlocks(), fixtcuda_in->GetStorage().GetCPUPtr());
+    auto HelmholtzOp  = Helmholtz<>::create(fixt_explist, "StdMat");
+    auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA");
+    HelmholtzOp->setLambda(1.0);
+    DiagPreconOp->configure(HelmholtzOp);
+    DiagPreconOp->apply(*fixtcuda_in, *fixtcuda_out);
+    ExpectedSolution(fixt_expected->GetBlocks(),
+                     fixt_expected->GetStorage().GetCPUPtr());
+    BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_expected->GetStorage().GetCPUPtr(), 1.0E-12);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(diagpreconcuda_tri_quad, Helmholtz2D_Tri_Quad)
+{
+    Configure();
+    SetTestCase(fixtcuda_in->GetBlocks(), fixtcuda_in->GetStorage().GetCPUPtr());
+    auto HelmholtzOp  = Helmholtz<>::create(fixt_explist, "StdMat");
+    auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA");
+    HelmholtzOp->setLambda(1.0);
+    DiagPreconOp->configure(HelmholtzOp);
+    DiagPreconOp->apply(*fixtcuda_in, *fixtcuda_out);
+    ExpectedSolution(fixt_expected->GetBlocks(),
+                     fixt_expected->GetStorage().GetCPUPtr());
+    BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-12));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_expected->GetStorage().GetCPUPtr(), 1.0E-12);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(diagpreconcuda_hex, Helmholtz3D_Hex)
+{
+    Configure();
+    SetTestCase(fixtcuda_in->GetBlocks(), fixtcuda_in->GetStorage().GetCPUPtr());
+    auto HelmholtzOp  = Helmholtz<>::create(fixt_explist, "StdMat");
+    auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA");
+    HelmholtzOp->setLambda(1.0);
+    DiagPreconOp->configure(HelmholtzOp);
+    DiagPreconOp->apply(*fixtcuda_in, *fixtcuda_out);
+    ExpectedSolution(fixt_expected->GetBlocks(),
+                     fixt_expected->GetStorage().GetCPUPtr());
+    BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-10));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_expected->GetStorage().GetCPUPtr(), 1.0E-10);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(diagpreconcuda_prism, Helmholtz3D_Prism)
+{
+    Configure();
+    SetTestCase(fixtcuda_in->GetBlocks(), fixtcuda_in->GetStorage().GetCPUPtr());
+    auto HelmholtzOp  = Helmholtz<>::create(fixt_explist, "StdMat");
+    auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA");
+    HelmholtzOp->setLambda(1.0);
+    DiagPreconOp->configure(HelmholtzOp);
+    DiagPreconOp->apply(*fixtcuda_in, *fixtcuda_out);
+    ExpectedSolution(fixt_expected->GetBlocks(),
+                     fixt_expected->GetStorage().GetCPUPtr());
+    BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-10));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_expected->GetStorage().GetCPUPtr(), 1.0E-10);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(diagpreconcuda_pyr, Helmholtz3D_Pyr)
+{
+    Configure();
+    SetTestCase(fixtcuda_in->GetBlocks(), fixtcuda_in->GetStorage().GetCPUPtr());
+    auto HelmholtzOp  = Helmholtz<>::create(fixt_explist, "StdMat");
+    auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA");
+    HelmholtzOp->setLambda(1.0);
+    DiagPreconOp->configure(HelmholtzOp);
+    DiagPreconOp->apply(*fixtcuda_in, *fixtcuda_out);
+    ExpectedSolution(fixt_expected->GetBlocks(),
+                     fixt_expected->GetStorage().GetCPUPtr());
+    BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-10));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_expected->GetStorage().GetCPUPtr(), 1.0E-10);
+    }
+}
+
+BOOST_FIXTURE_TEST_CASE(diagpreconcuda_tet, Helmholtz3D_Tet)
+{
+    Configure();
+    SetTestCase(fixtcuda_in->GetBlocks(), fixtcuda_in->GetStorage().GetCPUPtr());
+    auto HelmholtzOp  = Helmholtz<>::create(fixt_explist, "StdMat");
+    auto DiagPreconOp = DiagPrecon<>::create(fixt_explist, "CUDA");
+    HelmholtzOp->setLambda(1.0);
+    DiagPreconOp->configure(HelmholtzOp);
+    DiagPreconOp->apply(*fixtcuda_in, *fixtcuda_out);
+    ExpectedSolution(fixt_expected->GetBlocks(),
+                     fixt_expected->GetStorage().GetCPUPtr());
+    BOOST_TEST(fixtcuda_out->compare(*fixt_expected, 1.0E-10));
+    boost::test_tools::output_test_stream output;
+    {
+        OutputIfNotMatch(fixtcuda_out->GetStorage().GetCPUPtr(),
+                         fixt_expected->GetStorage().GetCPUPtr(), 1.0E-10);
+    }
+}
+
+BOOST_AUTO_TEST_SUITE_END()