diff --git a/Operators/DiagPrecon/DiagPreconCUDA.hpp b/Operators/DiagPrecon/DiagPreconCUDA.hpp
index d952f9b659668ab9b6f0abf32a498213ca5363d2..9f03e23a1043bc7b12b63d2ae880698519957aae 100644
--- a/Operators/DiagPrecon/DiagPreconCUDA.hpp
+++ b/Operators/DiagPrecon/DiagPreconCUDA.hpp
@@ -3,7 +3,9 @@
 #include "Field.hpp"
 #include "Operators/AssmbScatr/AssmbScatrCUDA.hpp"
 #include "Operators/CUDAMathKernels.cuh"
+#include "Operators/DiagPrecon/DiagPreconCUDAKernel.cuh"
 #include "Operators/OperatorDiagPrecon.hpp"
+#include "Operators/OperatorHelper.cuh"
 #include "Operators/OperatorRobBndCond.hpp"
 
 #include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
@@ -40,9 +42,6 @@ public:
         m_assmbScatr =
             std::static_pointer_cast<OperatorAssmbScatrImpl<TData, ImplCUDA>>(
                 AssmbScatr<TData>::create(this->m_expansionList, "CUDA"));
-
-        // Deterime CUDA grid size.
-        m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize);
     }
 
     ~OperatorDiagPreconImpl(void)
@@ -54,6 +53,9 @@ public:
     void apply(Field<TData, FieldState::Coeff> &in,
                Field<TData, FieldState::Coeff> &out) override
     {
+        // Deterime CUDA grid size.
+        m_gridSize = GetCUDAGridSize(m_nGlobal - m_nDir, m_blockSize);
+
         m_assmbScatr->Assemble(in, m_wk);
 
         vdivKernel<<<m_gridSize, m_blockSize>>>(
@@ -71,7 +73,8 @@ public:
         // auto robBCOp = RobBndCond<TData>::create(this->m_expansionList,
         // "CUDA");
 
-        Array<OneD, TData> diag(m_nLocal);
+        TData *diag;
+        cudaMalloc((void **)&diag, sizeof(TData) * m_nLocal);
 
         // create unit vector field to extract diagonal
         Field<TData, FieldState::Coeff> unit_vec =
@@ -87,36 +90,58 @@ public:
             unit_vec.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
         auto *actn_ptr =
             action.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+        auto diag_ptr = diag;
 
-        TData init = 1.0;
-        for (size_t i = 0; i < m_nLocal; ++i)
+        size_t expIdx = 0;
+        for (size_t block_idx = 0; block_idx < unit_vec.GetBlocks().size();
+             ++block_idx)
         {
-            // set ith term in unit vector to be 1
-            cudaMemcpy(uvec_ptr + i, &init, sizeof(TData),
-                       cudaMemcpyHostToDevice);
-
-            // apply operator to unit vector and store in action field
-            op->apply(unit_vec, action);
-            // robBCOp->apply(unit_vec, action);
-
-            // copy ith row term from the action field to get ith diagonal
-            cudaMemcpy(diag.get() + i, actn_ptr + i, sizeof(TData),
-                       cudaMemcpyDeviceToHost);
-
-            // reset ith term in unit vector to be 0
-            cudaMemset(uvec_ptr + i, 0, sizeof(TData));
+            // Determine shape and type of the element.
+            auto const expPtr = this->m_expansionList->GetExp(expIdx);
+            auto nElmts       = unit_vec.GetBlocks()[block_idx].num_elements;
+            auto nmTot        = expPtr->GetNcoeffs();
+
+            // Deterime CUDA grid size.
+            m_gridSize = GetCUDAGridSize(nElmts, m_blockSize);
+
+            for (size_t i = 0; i < nmTot; ++i)
+            {
+                // set ith term in unit vector to be 1
+                SetDiagonalKernel<<<m_gridSize, m_blockSize>>>(nmTot, nElmts, i,
+                                                               1.0, uvec_ptr);
+
+                // apply operator to unit vector and store in action field
+                op->apply(unit_vec, action);
+                // robBCOp->apply(unit_vec, action);
+
+                // copy ith row term from the action field to get ith diagonal
+                CopyDiagonalKernel<<<m_gridSize, m_blockSize>>>(
+                    nmTot, nElmts, i, actn_ptr, diag_ptr);
+
+                // reset ith term in unit vector to be 0
+                SetDiagonalKernel<<<m_gridSize, m_blockSize>>>(nmTot, nElmts, i,
+                                                               0.0, uvec_ptr);
+            }
+
+            uvec_ptr += unit_vec.GetBlocks()[block_idx].block_size;
+            diag_ptr += unit_vec.GetBlocks()[block_idx].block_size;
+            actn_ptr += unit_vec.GetBlocks()[block_idx].block_size;
+            expIdx += nElmts;
         }
 
         // Assembly
-        Array<OneD, TData> tmp(m_nGlobal, 0.0);
+        Array<OneD, TData> locdiag(m_nLocal);
+        Array<OneD, TData> glodiag(m_nGlobal, 0.0);
+        cudaMemcpy(locdiag.get(), diag, sizeof(TData) * m_nLocal,
+                   cudaMemcpyDeviceToHost);
         for (size_t i = 0; i < m_nLocal; ++i)
         {
             size_t gid1 = m_assmbMap->GetLocalToGlobalMap(i);
-            tmp[gid1] += diag[i];
+            glodiag[gid1] += locdiag[i];
         }
-        m_assmbMap->UniversalAssemble(tmp);
+        m_assmbMap->UniversalAssemble(glodiag);
 
-        cudaMemcpy(m_diag, tmp.get(), sizeof(TData) * m_nGlobal,
+        cudaMemcpy(m_diag, glodiag.get(), sizeof(TData) * m_nGlobal,
                    cudaMemcpyHostToDevice);
     }
 
diff --git a/Operators/DiagPrecon/DiagPreconCUDAKernel.cuh b/Operators/DiagPrecon/DiagPreconCUDAKernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..35ccedb02f6a6c715f51e28e98fdc13d72460de3
--- /dev/null
+++ b/Operators/DiagPrecon/DiagPreconCUDAKernel.cuh
@@ -0,0 +1,34 @@
+namespace Nektar::Operators::detail
+{
+
+template <typename TData>
+__global__ void SetDiagonalKernel(const size_t nm, const size_t nelmt,
+                                  const size_t mode, const TData val,
+                                  TData *out)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (e < nelmt)
+    {
+        out[e * nm + mode] = val;
+
+        e += blockDim.x * gridDim.x;
+    }
+}
+
+template <typename TData>
+__global__ void CopyDiagonalKernel(const size_t nm, const size_t nelmt,
+                                   const size_t mode, const TData *in,
+                                   TData *out)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (e < nelmt)
+    {
+        out[e * nm + mode] = in[e * nm + mode];
+
+        e += blockDim.x * gridDim.x;
+    }
+}
+
+} // namespace Nektar::Operators::detail
diff --git a/Operators/DiagPrecon/DiagPreconStdMat.hpp b/Operators/DiagPrecon/DiagPreconStdMat.hpp
index 6f15ad269dc6acdc1524b3e760cb18f152579ae7..3865d4eb348e41df7321d4a66b95daf4b42602cf 100644
--- a/Operators/DiagPrecon/DiagPreconStdMat.hpp
+++ b/Operators/DiagPrecon/DiagPreconStdMat.hpp
@@ -78,20 +78,41 @@ public:
         auto *actn_ptr = action.GetStorage().GetCPUPtr();
         auto *diag_ptr = diag.get();
 
-        for (size_t i = 0; i < m_nLocal; ++i)
+        size_t expIdx = 0;
+        for (size_t block_idx = 0; block_idx < unit_vec.GetBlocks().size();
+             ++block_idx)
         {
-            // set ith term in unit vector to be 1
-            *uvec_ptr = 1.0;
-
-            // apply operator to unit vector and store in action field
-            op->apply(unit_vec, action);
-            robBCOp->apply(unit_vec, action);
-
-            // copy ith row term from the action field to get ith diagonal
-            *(diag_ptr++) = *(actn_ptr++);
-
-            // reset ith term in unit vector to be 0
-            *(uvec_ptr++) = 0.0;
+            // Determine shape and type of the element.
+            auto const expPtr = this->m_expansionList->GetExp(expIdx);
+            auto nElmts       = unit_vec.GetBlocks()[block_idx].num_elements;
+            auto nmTot        = expPtr->GetNcoeffs();
+
+            for (size_t i = 0; i < nmTot; ++i)
+            {
+                for (size_t j = 0; j < nElmts; ++j)
+                {
+                    // set ith term in unit vector to be 1
+                    uvec_ptr[j * nmTot + i] = 1.0;
+                }
+
+                // apply operator to unit vector and store in action field
+                op->apply(unit_vec, action);
+                robBCOp->apply(unit_vec, action);
+
+                for (size_t j = 0; j < nElmts; ++j)
+                {
+                    // copy ith row term from the action field to get ith
+                    // diagonal
+                    diag_ptr[j * nmTot + i] = actn_ptr[j * nmTot + i];
+
+                    // reset ith term in unit vector to be 0
+                    uvec_ptr[j * nmTot + i] = 0.0;
+                }
+            }
+            uvec_ptr += unit_vec.GetBlocks()[block_idx].block_size;
+            diag_ptr += unit_vec.GetBlocks()[block_idx].block_size;
+            actn_ptr += unit_vec.GetBlocks()[block_idx].block_size;
+            expIdx += nElmts;
         }
 
         // Assembly