diff --git a/Operators/AssmbScatr/AssmbScatrCUDA.hpp b/Operators/AssmbScatr/AssmbScatrCUDA.hpp
index 1aa2a3097ea1d0a7c7902b9baf37dc47358a5d52..2aeee61909da1f9b4f8add1517e1003830c7be82 100644
--- a/Operators/AssmbScatr/AssmbScatrCUDA.hpp
+++ b/Operators/AssmbScatr/AssmbScatrCUDA.hpp
@@ -1,7 +1,8 @@
+#pragma once
+
 #include "MemoryRegionCUDA.hpp"
 #include "Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh"
 #include "Operators/OperatorAssmbScatr.hpp"
-#include "Operators/OperatorHelper.cuh"
 
 #include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
 #include <MultiRegions/ContField.h>
@@ -24,21 +25,20 @@ public:
             std::dynamic_pointer_cast<ContField>(this->m_expansionList);
         m_assmbMap = contfield->GetLocalToGlobalMap();
 
-        // Get the solution type
-        GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
-        auto nloc                  = m_assmbMap->GetNumLocalCoeffs();
-        auto nglo                  = (solnType == eIterativeFull)
-                                         ? m_assmbMap->GetNumGlobalCoeffs()
-                                         : m_assmbMap->GetNumGlobalBndCoeffs();
+        m_solnType = m_assmbMap->GetGlobalSysSolnType();
+        m_nloc     = m_assmbMap->GetNumLocalCoeffs();
+        m_nglo     = (m_solnType == eIterativeFull)
+                         ? m_assmbMap->GetNumGlobalCoeffs()
+                         : m_assmbMap->GetNumGlobalBndCoeffs();
+        m_ndir     = m_assmbMap->GetNumGlobalDirBndCoeffs();
 
-        // Memory allocation for tmp pointer
-        cudaMalloc((void **)&m_tmpptr, sizeof(TData) * nglo);
-        cudaMemset(m_tmpptr, 0.0, sizeof(TData) * nglo);
+        // Memory allocation for global space pointer
+        cudaMalloc((void **)&m_gloptr, sizeof(TData) * m_nglo);
 
         // Memory allocation for assemble pointer
-        cudaMalloc((void **)&m_assmbptr, sizeof(int) * nloc);
+        cudaMalloc((void **)&m_assmbptr, sizeof(int) * m_nloc);
         auto assmbptr = m_assmbMap->GetLocalToGlobalMap().get();
-        cudaMemcpy(m_assmbptr, assmbptr, sizeof(int) * nloc,
+        cudaMemcpy(m_assmbptr, assmbptr, sizeof(int) * m_nloc,
                    cudaMemcpyHostToDevice);
 
         m_signChange = m_assmbMap->AssemblyMap::GetSignChange();
@@ -46,23 +46,20 @@ public:
         if (m_signChange)
         {
             // Memory allocation for sign pointer
-            cudaMalloc((void **)&m_signptr, sizeof(TData) * nloc);
+            cudaMalloc((void **)&m_signptr, sizeof(TData) * m_nloc);
             auto signptr = m_assmbMap->GetLocalToGlobalSign().get();
-            cudaMemcpy(m_signptr, signptr, sizeof(TData) * nloc,
+            cudaMemcpy(m_signptr, signptr, sizeof(TData) * m_nloc,
                        cudaMemcpyHostToDevice);
         }
     }
 
     ~OperatorAssmbScatrImpl()
     {
-        cudaFree(m_tmpptr);
+        cudaFree(m_gloptr);
         cudaFree(m_assmbptr);
-        m_tmpptr   = nullptr;
-        m_assmbptr = nullptr;
         if (m_signChange)
         {
             cudaFree(m_signptr);
-            m_signptr = nullptr;
         }
     }
 
@@ -70,16 +67,15 @@ public:
                Field<TData, FieldState::Coeff> &out,
                const bool &zeroDir = false)
     {
-        Assemble(in, m_tmpptr);
+        Assemble(in, m_gloptr);
 
         // Zeroing Dirichlet BC
         if (zeroDir)
         {
-            size_t nDir = m_assmbMap->GetNumGlobalDirBndCoeffs();
-            cudaMemset(m_tmpptr, 0.0, sizeof(TData) * nDir);
+            cudaMemset(m_gloptr, 0, sizeof(TData) * m_ndir);
         }
 
-        GlobalToLocal(m_tmpptr, out);
+        GlobalToLocal(m_gloptr, out);
     }
 
     void Assemble(Field<TData, FieldState::Coeff> &in, TData *outptr)
@@ -87,10 +83,10 @@ public:
         auto const *inptr =
             in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
 
-        // Get the solution type
-        GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
+        // Zero output
+        cudaMemset(outptr, 0, sizeof(TData) * m_nglo);
 
-        if (solnType == eIterativeFull)
+        if (m_solnType == eIterativeFull)
         {
             // Initialise index
             size_t expIdx = 0;
@@ -102,7 +98,7 @@ public:
                 // Determine shape and type of the element.
                 auto const expPtr = this->m_expansionList->GetExp(expIdx);
                 auto nElmts       = in.GetBlocks()[block_idx].num_elements;
-                auto numPts       = expPtr->GetNcoeffs();
+                auto ncoeff       = expPtr->GetNcoeffs();
 
                // Determine CUDA grid parameters.
                 m_gridSize = nElmts / m_blockSize;
@@ -111,17 +107,17 @@ public:
                 if (m_signChange)
                 {
                     AssembleKernel<<<m_gridSize, m_blockSize>>>(
-                        numPts, nElmts, offset, m_assmbptr, m_signptr, inptr,
+                        ncoeff, nElmts, offset, m_assmbptr, m_signptr, inptr,
                         outptr);
                 }
                 else
                 {
                     AssembleKernel<<<m_gridSize, m_blockSize>>>(
-                        numPts, nElmts, offset, m_assmbptr, inptr, outptr);
+                        ncoeff, nElmts, offset, m_assmbptr, inptr, outptr);
                 }
 
                 // Increment pointer and index for next element type.
-                offset += numPts * nElmts;
+                offset += ncoeff * nElmts;
                 expIdx += nElmts;
             }
         }
@@ -131,10 +127,10 @@ public:
     {
         auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
 
-        // Get the solution type
-        GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
+        // Zero output
+        cudaMemset(outptr, 0, sizeof(TData) * m_nloc);
 
-        if (solnType == eIterativeFull)
+        if (m_solnType == eIterativeFull)
         {
             // Initialise index
             size_t expIdx = 0;
@@ -146,7 +142,7 @@ public:
                 // Determine shape and type of the element.
                 auto const expPtr = this->m_expansionList->GetExp(expIdx);
                 auto nElmts       = out.GetBlocks()[block_idx].num_elements;
-                auto numPts       = expPtr->GetNcoeffs();
+                auto ncoeff       = expPtr->GetNcoeffs();
 
                // Determine CUDA grid parameters.
                 m_gridSize = nElmts / m_blockSize;
@@ -155,17 +151,17 @@ public:
                 if (m_signChange)
                 {
                     GlobalToLocalKernel<<<m_gridSize, m_blockSize>>>(
-                        numPts, nElmts, offset, m_assmbptr, m_signptr, inptr,
+                        ncoeff, nElmts, offset, m_assmbptr, m_signptr, inptr,
                         outptr);
                 }
                 else
                 {
                     GlobalToLocalKernel<<<m_gridSize, m_blockSize>>>(
-                        numPts, nElmts, offset, m_assmbptr, inptr, outptr);
+                        ncoeff, nElmts, offset, m_assmbptr, inptr, outptr);
                 }
 
                 // Increment pointer and index for next element type.
-                offset += numPts * nElmts;
+                offset += ncoeff * nElmts;
                 expIdx += nElmts;
             }
         }
@@ -184,10 +180,14 @@ public:
 
 protected:
     AssemblyMapCGSharedPtr m_assmbMap;
-    TData *m_tmpptr;
+    GlobalSysSolnType m_solnType;
+    TData *m_gloptr;
     bool m_signChange;
     TData *m_signptr;
     int *m_assmbptr;
+    size_t m_nloc;
+    size_t m_nglo;
+    size_t m_ndir;
     size_t m_gridSize;
     size_t m_blockSize = 32;
 };
diff --git a/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh b/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh
index ae8c2c20e7afdd4117cd855054034b4fa42d48d7..e1cfacd8862a7dd0e7896bece9362dbf0b87db31 100644
--- a/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh
+++ b/Operators/AssmbScatr/AssmbScatrCUDAKernels.cuh
@@ -2,7 +2,7 @@ namespace Nektar::Operators::detail
 {
 
 template <typename TData>
-__global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
+__global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt,
                                const size_t offset, const int *assmbptr,
                                const TData *signptr, const TData *inptr,
                                TData *outptr)
@@ -14,9 +14,9 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
         return;
     }
 
-    size_t index = offset + e * numPts;
+    size_t index = offset + e * ncoeff;
 
-    for (size_t i = 0; i < numPts; i++)
+    for (size_t i = 0; i < ncoeff; i++)
     {
         atomicAdd(outptr + assmbptr[index + i],
                   signptr[index + i] * inptr[index + i]);
@@ -24,7 +24,7 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
 }
 
 template <typename TData>
-__global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
+__global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt,
                                const size_t offset, const int *assmbptr,
                                const TData sign, const TData *inptr,
                                TData *outptr)
@@ -36,16 +36,16 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
         return;
     }
 
-    size_t index = offset + e * numPts;
+    size_t index = offset + e * ncoeff;
 
-    for (size_t i = 0; i < numPts; i++)
+    for (size_t i = 0; i < ncoeff; i++)
     {
         atomicAdd(outptr + assmbptr[index + i], sign * inptr[index + i]);
     }
 }
 
 template <typename TData>
-__global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
+__global__ void AssembleKernel(const size_t ncoeff, const size_t nelmt,
                                const size_t offset, const int *assmbptr,
                                const TData *inptr, TData *outptr)
 {
@@ -56,16 +56,16 @@ __global__ void AssembleKernel(const size_t numPts, const size_t nelmt,
         return;
     }
 
-    size_t index = offset + e * numPts;
+    size_t index = offset + e * ncoeff;
 
-    for (size_t i = 0; i < numPts; i++)
+    for (size_t i = 0; i < ncoeff; i++)
     {
         atomicAdd(outptr + assmbptr[index + i], inptr[index + i]);
     }
 }
 
 template <typename TData>
-__global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
+__global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt,
                                     const size_t offset, const int *assmbptr,
                                     const TData *signptr, const TData *inptr,
                                     TData *outptr)
@@ -77,16 +77,16 @@ __global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
         return;
     }
 
-    size_t index = offset + e * numPts;
+    size_t index = offset + e * ncoeff;
 
-    for (size_t i = 0; i < numPts; i++)
+    for (size_t i = 0; i < ncoeff; i++)
     {
         outptr[index + i] = signptr[index + i] * inptr[assmbptr[index + i]];
     }
 }
 
 template <typename TData>
-__global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
+__global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt,
                                     const size_t offset, const int *assmbptr,
                                     const TData sign, const TData *inptr,
                                     TData *outptr)
@@ -98,16 +98,16 @@ __global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
         return;
     }
 
-    size_t index = offset + e * numPts;
+    size_t index = offset + e * ncoeff;
 
-    for (size_t i = 0; i < numPts; i++)
+    for (size_t i = 0; i < ncoeff; i++)
     {
         outptr[index + i] = sign * inptr[assmbptr[index + i]];
     }
 }
 
 template <typename TData>
-__global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
+__global__ void GlobalToLocalKernel(const size_t ncoeff, const size_t nelmt,
                                     const size_t offset, const int *assmbptr,
                                     const TData *inptr, TData *outptr)
 {
@@ -118,9 +118,9 @@ __global__ void GlobalToLocalKernel(const size_t numPts, const size_t nelmt,
         return;
     }
 
-    size_t index = offset + e * numPts;
+    size_t index = offset + e * ncoeff;
 
-    for (size_t i = 0; i < numPts; i++)
+    for (size_t i = 0; i < ncoeff; i++)
     {
         outptr[index + i] = inptr[assmbptr[index + i]];
     }
diff --git a/Operators/AssmbScatr/AssmbScatrStdMat.hpp b/Operators/AssmbScatr/AssmbScatrStdMat.hpp
index 45cc9f67f8460701675504769113b8c883a8f53f..7f51eb941cfcf532b774ec1d19fde57ea166856d 100644
--- a/Operators/AssmbScatr/AssmbScatrStdMat.hpp
+++ b/Operators/AssmbScatr/AssmbScatrStdMat.hpp
@@ -2,7 +2,7 @@
 
 #include "Operators/OperatorAssmbScatr.hpp"
 
 #include <LibUtilities/BasicUtils/Vmath.hpp>
 #include <MultiRegions/AssemblyMap/AssemblyMapCG.h>
 #include <MultiRegions/ContField.h>
 
@@ -25,37 +24,35 @@ public:
             std::dynamic_pointer_cast<ContField>(this->m_expansionList);
         m_assmbMap = contfield->GetLocalToGlobalMap();
 
-        GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
-        auto nglo                  = (solnType == eIterativeFull)
-                                         ? m_assmbMap->GetNumGlobalCoeffs()
-                                         : m_assmbMap->GetNumGlobalBndCoeffs();
+        m_solnType = m_assmbMap->GetGlobalSysSolnType();
+        m_nloc     = m_assmbMap->GetNumLocalCoeffs();
+        m_nglo     = (m_solnType == eIterativeFull)
+                         ? m_assmbMap->GetNumGlobalCoeffs()
+                         : m_assmbMap->GetNumGlobalBndCoeffs();
+        m_ndir     = m_assmbMap->GetNumGlobalDirBndCoeffs();
 
-        m_tmp = Array<OneD, TData>(nglo);
+        m_glo = Array<OneD, TData>(m_nglo);
     }
 
     void apply(Field<TData, FieldState::Coeff> &in,
                Field<TData, FieldState::Coeff> &out,
                const bool &zeroDir = false)
     {
-        Assemble(in, m_tmp);
+        Assemble(in, m_glo);
 
+        // Zeroing Dirichlet BC
         if (zeroDir)
         {
-            auto nDir = m_assmbMap->GetNumGlobalDirBndCoeffs();
-            Vmath::Zero(nDir, m_tmp.get(), 1);
+            Vmath::Zero(m_ndir, m_glo.get(), 1);
         }
 
-        GlobalToLocal(m_tmp, out);
+        GlobalToLocal(m_glo, out);
     }
 
     void Assemble(Field<TData, FieldState::Coeff> &in,
                   Array<OneD, TData> outarray)
     {
-        // Get the solution type
-        GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
-
-        auto nloc = m_assmbMap->GetNumLocalCoeffs();
-        Array<OneD, TData> inarray(nloc);
+        Array<OneD, TData> inarray(m_nloc);
 
         // Copy data from input field
         auto *inarrptr = inarray.data();
@@ -73,7 +70,7 @@ public:
             inptr += nSize;
         }
 
-        if (solnType == eIterativeFull)
+        if (m_solnType == eIterativeFull)
         {
             m_assmbMap->Assemble(inarray, outarray);
         }
@@ -86,13 +83,9 @@ public:
     void GlobalToLocal(Array<OneD, TData> inarray,
                        Field<TData, FieldState::Coeff> &out)
     {
-        // Get the solution type
-        GlobalSysSolnType solnType = m_assmbMap->GetGlobalSysSolnType();
-
-        auto nloc = m_assmbMap->GetNumLocalCoeffs();
-        Array<OneD, TData> outarray(nloc);
+        Array<OneD, TData> outarray(m_nloc);
 
-        if (solnType == eIterativeFull)
+        if (m_solnType == eIterativeFull)
         {
             m_assmbMap->GlobalToLocal(inarray, outarray);
         }
@@ -131,7 +124,11 @@ public:
 
 protected:
     AssemblyMapCGSharedPtr m_assmbMap;
-    Array<OneD, TData> m_tmp;
+    GlobalSysSolnType m_solnType;
+    Array<OneD, TData> m_glo;
+    size_t m_nloc;
+    size_t m_nglo;
+    size_t m_ndir;
 };
 
 } // namespace Nektar::Operators::detail