diff --git a/Operators/PhysDeriv/PhysDerivCUDA.cu b/Operators/PhysDeriv/PhysDerivCUDA.cu
index 1db3d8d9b2c119d009250c9e4b54ebafb9f7a71d..e07e45e1d0efc458a666a135babc81e3e67c6839 100644
--- a/Operators/PhysDeriv/PhysDerivCUDA.cu
+++ b/Operators/PhysDeriv/PhysDerivCUDA.cu
@@ -2,9 +2,11 @@
 
 namespace Nektar::Operators::detail
 {
+
 template <>
 std::string OperatorPhysDerivImpl<double, ImplCUDA>::className =
     GetOperatorFactory<double>().RegisterCreatorFunction(
         "PhysDerivCUDA", OperatorPhysDerivImpl<double, ImplCUDA>::instantiate,
         "...");
-}
+
+} // namespace Nektar::Operators::detail
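Note: the static className member above exists only to force
RegisterCreatorFunction to run at library load time, i.e. the factory's
self-registration pattern. A single-precision variant would mirror it
exactly; a sketch, assuming float kernels are instantiated and using a
hypothetical registration key:

    // Hypothetical sketch: mirrors the double registration above.
    template <>
    std::string OperatorPhysDerivImpl<float, ImplCUDA>::className =
        GetOperatorFactory<float>().RegisterCreatorFunction(
            "PhysDerivCUDA-float",
            OperatorPhysDerivImpl<float, ImplCUDA>::instantiate, "...");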
diff --git a/Operators/PhysDeriv/PhysDerivCUDA.hpp b/Operators/PhysDeriv/PhysDerivCUDA.hpp
index 197c1e06db4ad0b49ee6675bdc8c6d2152af4d81..1e14b29b640824c265c450e91712862a1419edf5 100644
--- a/Operators/PhysDeriv/PhysDerivCUDA.hpp
+++ b/Operators/PhysDeriv/PhysDerivCUDA.hpp
@@ -1,39 +1,46 @@
-# pragma once
+#pragma once
 
 #include "MemoryRegionCUDA.hpp"
 #include "Operators/OperatorHelper.cuh"
 #include "Operators/OperatorPhysDeriv.hpp"
 #include "Operators/PhysDeriv/PhysDerivCUDAKernels.cuh"
 
+#define FLAG_QP false
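+// FLAG_QP == false selects the one-thread-per-element kernels; true selects
+// the quadrature-point-parallel (_QP) variants.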
+
 namespace Nektar::Operators::detail
 {
 
 template <typename TData, bool DEFORMED = false>
-void PhysDeriv1DKernel(const size_t gridSize, const size_t blockSize,
-                       const size_t nq0, const size_t nCoord,
-                       const size_t nElmts, const TData *D0,
-                       const size_t dfSize, TData *df, const TData *in,
-                       TData *out0, TData *out1, TData *out2);
+void PhysDeriv1DKernel(const unsigned int gridSize,
+                       const unsigned int blockSize, const unsigned int nq0,
+                       const unsigned int nCoord, const unsigned int nElmts,
+                       const unsigned int nSize, const unsigned int dfSize,
+                       const TData *D0, const TData *df, const TData *in,
+                       TData *out);
 
 template <typename TData, bool DEFORMED = false>
-void PhysDeriv2DKernel(const size_t gridSize, const size_t blockSize,
-                       LibUtilities::ShapeType shapetype, const size_t nq0,
-                       const size_t nq1, const size_t nCoord,
-                       const size_t nElmts, const TData *D0, const TData *D1,
-                       const TData *Z0, const TData *Z1, const size_t dfSize,
-                       TData *df, const TData *in, TData *out0, TData *out1,
-                       TData *out2);
+void PhysDeriv2DKernel(const unsigned int gridSize,
+                       const unsigned int blockSize,
+                       LibUtilities::ShapeType shapetype,
+                       const unsigned int nq0, const unsigned int nq1,
+                       const unsigned int nCoord, const unsigned int nElmts,
+                       const unsigned int nSize, const unsigned int dfSize,
+                       const TData *D0, const TData *D1, const TData *Z0,
+                       const TData *Z1, const TData *df, const TData *in,
+                       TData *out);
 
 template <typename TData, bool DEFORMED = false>
-void PhysDeriv3DKernel(const size_t gridSize, const size_t blockSize,
-                       LibUtilities::ShapeType shapetype, const size_t nq0,
-                       const size_t nq1, const size_t nq2, const size_t nCoord,
-                       const size_t nElmts, const TData *D0, const TData *D1,
-                       const TData *D2, const TData *Z0, const TData *Z1,
-                       const TData *Z2, const size_t dfSize, TData *df,
-                       const TData *in, TData *out0, TData *out1, TData *out2);
+void PhysDeriv3DKernel(const unsigned int gridSize,
+                       const unsigned int blockSize,
+                       LibUtilities::ShapeType shapetype,
+                       const unsigned int nq0, const unsigned int nq1,
+                       const unsigned int nq2, const unsigned int nElmts,
+                       const unsigned int nSize, const unsigned int dfSize,
+                       const TData *D0, const TData *D1, const TData *D2,
+                       const TData *Z0, const TData *Z1, const TData *Z2,
+                       const TData *df, const TData *in, TData *out);
 
-// Matrix-free implementation
+// CUDA implementation
 template <typename TData>
 class OperatorPhysDerivImpl<TData, ImplCUDA> : public OperatorPhysDeriv<TData>
 {
@@ -78,10 +85,9 @@ public:
         // Initialize pointers.
         auto const *inptr =
             in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
-        auto *outptr0 = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
-        auto *outptr1 = outptr0 + out.GetFieldSize();
-        auto *outptr2 = outptr1 + out.GetFieldSize();
-        auto dfptr    = m_derivFac;
+        auto *outptr = out.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+        auto dfptr   = m_derivFac;
+        size_t nSize = out.GetFieldSize();
 
         // Initialize index.
         size_t expIdx = 0;
@@ -120,14 +126,14 @@ public:
                 if (deformed)
                 {
                     PhysDeriv1DKernel<TData, true>(
-                        m_gridSize, m_blockSize, nq0, nCoord, nElmts, D0,
-                        m_dfSize, dfptr, inptr, outptr0, outptr1, outptr2);
+                        m_gridSize, m_blockSize, nq0, nCoord, nElmts, nSize,
+                        m_dfSize, D0, dfptr, inptr, outptr);
                 }
                 else
                 {
                     PhysDeriv1DKernel<TData, false>(
-                        m_gridSize, m_blockSize, nq0, nCoord, nElmts, D0,
-                        m_dfSize, dfptr, inptr, outptr0, outptr1, outptr2);
+                        m_gridSize, m_blockSize, nq0, nCoord, nElmts, nSize,
+                        m_dfSize, D0, dfptr, inptr, outptr);
                 }
             }
             else if (expPtr->GetShapeDimension() == 2)
@@ -142,15 +148,15 @@ public:
                 {
                     PhysDeriv2DKernel<TData, true>(
                         m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
-                        nElmts, D0, D1, Z0, Z1, m_dfSize, dfptr, inptr, outptr0,
-                        outptr1, outptr2);
+                        nElmts, nSize, m_dfSize, D0, D1, Z0, Z1, dfptr, inptr,
+                        outptr);
                 }
                 else
                 {
                     PhysDeriv2DKernel<TData, false>(
                         m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
-                        nElmts, D0, D1, Z0, Z1, m_dfSize, dfptr, inptr, outptr0,
-                        outptr1, outptr2);
+                        nElmts, nSize, m_dfSize, D0, D1, Z0, Z1, dfptr, inptr,
+                        outptr);
                 }
             }
             else if (expPtr->GetShapeDimension() == 3)
@@ -167,24 +173,22 @@ public:
                 if (deformed)
                 {
                     PhysDeriv3DKernel<TData, true>(
-                        m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
-                        nElmts, D0, D1, D2, Z0, Z1, Z2, m_dfSize, dfptr, inptr,
-                        outptr0, outptr1, outptr2);
+                        m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nElmts,
+                        nSize, m_dfSize, D0, D1, D2, Z0, Z1, Z2, dfptr, inptr,
+                        outptr);
                 }
                 else
                 {
                     PhysDeriv3DKernel<TData, false>(
-                        m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
-                        nElmts, D0, D1, D2, Z0, Z1, Z2, m_dfSize, dfptr, inptr,
-                        outptr0, outptr1, outptr2);
+                        m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nElmts,
+                        nSize, m_dfSize, D0, D1, D2, Z0, Z1, Z2, dfptr, inptr,
+                        outptr);
                 }
             }
 
             // Increment pointer and index for next element type.
             dfptr += deformed ? nqTot * nElmts : nElmts;
-            outptr0 += (nPadElmts + nElmts) * nqTot;
-            outptr1 += (nPadElmts + nElmts) * nqTot;
-            outptr2 += (nPadElmts + nElmts) * nqTot;
+            outptr += (nPadElmts + nElmts) * nqTot;
             inptr += (nPadElmts + nElmts) * nqTot;
             expIdx += nElmts;
         }
@@ -204,88 +208,192 @@ private:
     std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_D;
     std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_Z;
     size_t m_dfSize;
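+    // Default kernel launch configuration.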
+    size_t m_gridSize  = 1024;
     size_t m_blockSize = 32;
-    size_t m_gridSize;
 };
 
 template <typename TData, bool DEFORMED>
-void PhysDeriv1DKernel(const size_t gridSize, const size_t blockSize,
-                       const size_t nq0, const size_t nCoord,
-                       const size_t nElmts, const TData *D0,
-                       const size_t dfSize, TData *df, const TData *in,
-                       TData *out0, TData *out1, TData *out2)
+void PhysDeriv1DKernel(const unsigned int gridSize,
+                       const unsigned int blockSize, const unsigned int nq0,
+                       const unsigned int nCoord, const unsigned int nElmts,
+                       const unsigned int nSize, const unsigned int dfSize,
+                       const TData *D0, const TData *df, const TData *in,
+                       TData *out)
 {
     // Compute tensorial derivative.
-    PhysDerivTensor1DKernel<TData>
-        <<<gridSize, blockSize>>>(nq0, nElmts, D0, in, out0);
+    if (!FLAG_QP)
+    {
+        unsigned int nshared = sizeof(TData) * (nq0 * nq0);
+        PhysDerivTensor1DKernel<TData>
+            <<<gridSize, blockSize, nshared>>>(nq0, nElmts, D0, in, out);
+    }
+    else
+    {
+        PhysDerivTensor1DKernel_QP<TData>
+            <<<gridSize, dim3(32)>>>(nq0, nElmts, D0, in, out);
+    }
 
     // Compute physical derivative.
-    PhysDerivSegKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
-        nq0, nCoord, nElmts, dfSize, df, out0, out1, out2);
+    if (!FLAG_QP)
+    {
+        PhysDeriv1DKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
+            nq0, nCoord, nElmts, nSize, dfSize, df, out);
+    }
+    else
+    {
+        PhysDeriv1DKernel_QP<TData, DEFORMED><<<gridSize, dim3(32)>>>(
+            nq0, nCoord, nElmts, nSize, dfSize, df, out);
+    }
 }
 
 template <typename TData, bool DEFORMED>
-void PhysDeriv2DKernel(const size_t gridSize, const size_t blockSize,
-                       LibUtilities::ShapeType shapetype, const size_t nq0,
-                       const size_t nq1, const size_t nCoord,
-                       const size_t nElmts, const TData *D0, const TData *D1,
-                       const TData *Z0, const TData *Z1, const size_t dfSize,
-                       TData *df, const TData *in, TData *out0, TData *out1,
-                       TData *out2)
+void PhysDeriv2DKernel(const unsigned int gridSize,
+                       const unsigned int blockSize,
+                       LibUtilities::ShapeType shapetype,
+                       const unsigned int nq0, const unsigned int nq1,
+                       const unsigned int nCoord, const unsigned int nElmts,
+                       const unsigned int nSize, const unsigned int dfSize,
+                       const TData *D0, const TData *D1, const TData *Z0,
+                       const TData *Z1, const TData *df, const TData *in,
+                       TData *out)
 {
     // Compute tensorial derivative.
-    PhysDerivTensor2DKernel<TData>
-        <<<gridSize, blockSize>>>(nq0, nq1, nElmts, D0, D1, in, out0, out1);
+    if (!FLAG_QP)
+    {
+        unsigned int nshared = sizeof(TData) * (nq0 * nq0 + nq1 * nq1);
+        PhysDerivTensor2DKernel<TData><<<gridSize, blockSize, nshared>>>(
+            nq0, nq1, nElmts, nSize, D0, D1, in, out);
+    }
+    else
+    {
+        PhysDerivTensor2DKernel_QP<TData><<<gridSize, dim3(8, 8)>>>(
+            nq0, nq1, nElmts, nSize, D0, D1, in, out);
+    }
 
     // Compute physical derivative.
     if (shapetype == LibUtilities::Quad)
     {
-        PhysDerivQuadKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
-            nq0, nq1, nCoord, nElmts, dfSize, df, out0, out1, out2);
+        if (!FLAG_QP)
+        {
+            PhysDeriv2DKernel<TData, LibUtilities::Quad, DEFORMED>
+                <<<gridSize, blockSize>>>(nq0, nq1, nCoord, nElmts, nSize,
+                                          dfSize, nullptr, nullptr, df, out);
+        }
+        else
+        {
+            PhysDeriv2DKernel_QP<TData, LibUtilities::Quad, DEFORMED>
+                <<<gridSize, dim3(8, 8)>>>(nq0, nq1, nCoord, nElmts, nSize,
+                                           dfSize, nullptr, nullptr, df, out);
+        }
     }
     else if (shapetype == LibUtilities::Tri)
     {
-        PhysDerivTriKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
-            nq0, nq1, nCoord, nElmts, Z0, Z1, dfSize, df, out0, out1, out2);
+        if (!FLAG_QP)
+        {
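+            // Dynamic shared memory for the two collapsed-coordinate
+            // factor arrays (nq1 + nq0 entries).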
+            unsigned int nshared = sizeof(TData) * (nq0 + nq1);
+            PhysDeriv2DKernel<TData, LibUtilities::Tri, DEFORMED>
+                <<<gridSize, blockSize, nshared>>>(
+                    nq0, nq1, nCoord, nElmts, nSize, dfSize, Z0, Z1, df, out);
+        }
+        else
+        {
+            PhysDeriv2DKernel_QP<TData, LibUtilities::Tri, DEFORMED>
+                <<<gridSize, dim3(8, 8)>>>(nq0, nq1, nCoord, nElmts, nSize,
+                                           dfSize, Z0, Z1, df, out);
+        }
     }
 }
 
 template <typename TData, bool DEFORMED>
-void PhysDeriv3DKernel(const size_t gridSize, const size_t blockSize,
-                       LibUtilities::ShapeType shapetype, const size_t nq0,
-                       const size_t nq1, const size_t nq2, const size_t nCoord,
-                       const size_t nElmts, const TData *D0, const TData *D1,
-                       const TData *D2, const TData *Z0, const TData *Z1,
-                       const TData *Z2, const size_t dfSize, TData *df,
-                       const TData *in, TData *out0, TData *out1, TData *out2)
+void PhysDeriv3DKernel(const unsigned int gridSize,
+                       const unsigned int blockSize,
+                       LibUtilities::ShapeType shapetype,
+                       const unsigned int nq0, const unsigned int nq1,
+                       const unsigned int nq2, const unsigned int nElmts,
+                       const unsigned int nSize, const unsigned int dfSize,
+                       const TData *D0, const TData *D1, const TData *D2,
+                       const TData *Z0, const TData *Z1, const TData *Z2,
+                       const TData *df, const TData *in, TData *out)
 {
     // Compute tensorial derivative.
-    PhysDerivTensor3DKernel<TData><<<gridSize, blockSize>>>(
-        nq0, nq1, nq2, nElmts, D0, D1, D2, in, out0, out1, out2);
+    if (!FLAG_QP)
+    {
+        unsigned int nshared =
+            sizeof(TData) * (nq0 * nq0 + nq1 * nq1 + nq2 * nq2);
+        PhysDerivTensor3DKernel<TData><<<gridSize, blockSize, nshared>>>(
+            nq0, nq1, nq2, nElmts, nSize, D0, D1, D2, in, out);
+    }
+    else
+    {
+        PhysDerivTensor3DKernel_QP<TData><<<gridSize, dim3(4, 4, 4)>>>(
+            nq0, nq1, nq2, nElmts, nSize, D0, D1, D2, in, out);
+    }
 
     // Compute physical derivative.
     if (shapetype == LibUtilities::Hex)
     {
-        PhysDerivHexKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
-            nq0, nq1, nq2, nCoord, nElmts, dfSize, df, out0, out1, out2);
+        if (!FLAG_QP)
+        {
+            PhysDeriv3DKernel<TData, LibUtilities::Hex, DEFORMED>
+                <<<gridSize, blockSize>>>(nq0, nq1, nq2, nElmts, nSize, dfSize,
+                                          nullptr, nullptr, nullptr, df, out);
+        }
+        else
+        {
+            PhysDeriv3DKernel_QP<TData, LibUtilities::Hex, DEFORMED>
+                <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nElmts, nSize,
+                                              dfSize, nullptr, nullptr, nullptr,
+                                              df, out);
+        }
     }
     else if (shapetype == LibUtilities::Tet)
     {
-        PhysDerivTetKernel<TData, DEFORMED>
-            <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2,
-                                      dfSize, df, out0, out1, out2);
+        if (!FLAG_QP)
+        {
+            unsigned int nshared = sizeof(TData) * (nq0 + 2 * nq1 + nq2);
+            PhysDeriv3DKernel<TData, LibUtilities::Tet, DEFORMED>
+                <<<gridSize, blockSize, nshared>>>(nq0, nq1, nq2, nElmts, nSize,
+                                                   dfSize, Z0, Z1, Z2, df, out);
+        }
+        else
+        {
+            PhysDeriv3DKernel_QP<TData, LibUtilities::Tet, DEFORMED>
+                <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nElmts, nSize,
+                                              dfSize, Z0, Z1, Z2, df, out);
+        }
     }
     else if (shapetype == LibUtilities::Prism)
     {
-        PhysDerivPrismKernel<TData, DEFORMED>
-            <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2,
-                                      dfSize, df, out0, out1, out2);
+        if (!FLAG_QP)
+        {
+            unsigned int nshared = sizeof(TData) * (nq0 + nq2);
+            PhysDeriv3DKernel<TData, LibUtilities::Prism, DEFORMED>
+                <<<gridSize, blockSize, nshared>>>(nq0, nq1, nq2, nElmts, nSize,
+                                                   dfSize, Z0, nullptr, Z2, df,
+                                                   out);
+        }
+        else
+        {
+            PhysDeriv3DKernel_QP<TData, LibUtilities::Prism, DEFORMED>
+                <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nElmts, nSize,
+                                              dfSize, Z0, nullptr, Z2, df, out);
+        }
     }
     else if (shapetype == LibUtilities::Pyr)
     {
-        PhysDerivPyrKernel<TData, DEFORMED>
-            <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2,
-                                      dfSize, df, out0, out1, out2);
+        if (!FLAG_QP)
+        {
+            unsigned int nshared = sizeof(TData) * (nq0 + nq1 + nq2);
+            PhysDeriv3DKernel<TData, LibUtilities::Pyr, DEFORMED>
+                <<<gridSize, blockSize, nshared>>>(nq0, nq1, nq2, nElmts, nSize,
+                                                   dfSize, Z0, Z1, Z2, df, out);
+        }
+        else
+        {
+            PhysDeriv3DKernel_QP<TData, LibUtilities::Pyr, DEFORMED>
+                <<<gridSize, dim3(4, 4, 4)>>>(nq0, nq1, nq2, nElmts, nSize,
+                                              dfSize, Z0, Z1, Z2, df, out);
+        }
     }
 }
 
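Note: each launch's dynamic shared-memory size must match the partitioning
the kernel applies to its extern __shared__ array. As a worked example, the
Tet launch above reserves

    // Layout carved out at the top of PhysDeriv3DKernel (see below):
    // s_xfrm_eta0 (nq0), s_xfrm_eta1 (nq1), s_xfrm_eta1m (nq1),
    // s_xfrm_eta2 (nq2).
    unsigned int nshared = sizeof(TData) * (nq0 + 2 * nq1 + nq2);

which matches the four factor arrays the kernel lays out in that order.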
diff --git a/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh b/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh
index 28482cf005179ea68ff701cc12bd769cc3515090..15268c08d629d4a4cae4a97d9794c9fd9634b6fb 100644
--- a/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh
+++ b/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh
@@ -1,602 +1,782 @@
+#pragma once
+
 namespace Nektar::Operators::detail
 {
+
 template <typename TData, bool DEFORMED>
-__global__ void PhysDerivSegKernel(const size_t nq0, const size_t ncoord,
-                                   const size_t nelmt, const size_t dfSize,
-                                   TData *df, TData *inout0, TData *inout1,
-                                   TData *inout2)
+__global__ void PhysDeriv1DKernel(
+    const unsigned int nq0, const unsigned int ncoord, const unsigned int nelmt,
+    const unsigned int nSize, const unsigned int dfSize,
+    const TData *__restrict df, TData *__restrict inout)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (e >= nelmt)
-    {
-        return;
-    }
-
-    size_t ndf = ncoord;
-
-    // Assign pointers.
-    TData **inoutptr = new TData *[ncoord];
-    inoutptr[0]      = inout0 + (nq0 * e);
-    if (ncoord > 1)
-    {
-        inoutptr[1] = inout1 + (nq0 * e);
-    }
-    if (ncoord > 2)
-    {
-        inoutptr[2] = inout2 + (nq0 * e);
-    }
+    unsigned int e = blockDim.x * blockIdx.x + threadIdx.x;
 
-    TData **dfptr = new TData *[ndf];
-    for (size_t d = 0; d < ndf; d++)
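+    // Grid-stride loop over elements.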
+    while (e < nelmt)
     {
-        dfptr[d] = df + d * dfSize;
-        dfptr[d] += DEFORMED ? nq0 * e : e;
-    }
+        unsigned int offset = nq0 * e;
 
-    // Compute derivative.
-    for (size_t j = 0; j < nq0; ++j)
-    {
-        size_t dfindex = DEFORMED ? j : 0;
-        for (size_t d = ndf; d > 0; d--)
+        for (unsigned int i = 0; i < nq0; ++i)
         {
-            inoutptr[d - 1][j] = inoutptr[0][j] * dfptr[d - 1][dfindex];
+            unsigned int index   = offset + i;
+            unsigned int dfindex = DEFORMED ? index : e;
+
+            TData d0 = inout[index];
+            for (unsigned int d = 0; d < ncoord; d++)
+            {
+                inout[index + d * nSize] = d0 * df[dfindex + d * dfSize];
+            }
         }
-    }
 
-    delete[] inoutptr;
-    delete[] dfptr;
+        e += blockDim.x * gridDim.x;
+    }
 }
 
 template <typename TData, bool DEFORMED>
-__global__ void PhysDerivQuadKernel(const size_t nq0, const size_t nq1,
-                                    const size_t ncoord, const size_t nelmt,
-                                    const size_t dfSize, TData *df,
-                                    TData *inout0, TData *inout1, TData *inout2)
+__global__ void PhysDeriv1DKernel_QP(
+    const unsigned int nq0, const unsigned int ncoord, const unsigned int nelmt,
+    const unsigned int nSize, const unsigned int dfSize,
+    const TData *__restrict df, TData *__restrict inout)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (e >= nelmt)
-    {
-        return;
-    }
-
-    auto ndf = 2 * ncoord;
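+    // One block per element; threads stride over the quadrature points.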
+    unsigned int e = blockIdx.x;
 
-    // Assign pointers.
-    TData **inoutptr = new TData *[ncoord];
-    inoutptr[0]      = inout0 + (nq0 * nq1 * e);
-    inoutptr[1]      = inout1 + (nq0 * nq1 * e);
-    if (ncoord > 2)
+    while (e < nelmt)
     {
-        inoutptr[2] = inout2 + (nq0 * nq1 * e);
-    }
+        unsigned int offset = nq0 * e;
 
-    TData **dfptr = new TData *[ndf];
-    for (size_t d = 0; d < ndf; d++)
-    {
-        dfptr[d] = df + d * dfSize;
-        dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
-    }
-
-    // Compute derivative.
-    for (size_t j = 0, cnt_ji = 0; j < nq1; ++j)
-    {
-        for (size_t i = 0; i < nq0; ++i, ++cnt_ji)
+        for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x)
         {
-            TData d0 = inoutptr[0][cnt_ji];
-            TData d1 = inoutptr[1][cnt_ji];
+            unsigned int index   = offset + i;
+            unsigned int dfindex = DEFORMED ? index : e;
 
-            size_t dfindex = DEFORMED ? cnt_ji : 0;
-            for (size_t d = 0; d < ncoord; d++)
+            TData d0 = inout[index];
+            for (unsigned int d = 0; d < ncoord; d++)
             {
-                inoutptr[d][cnt_ji] =
-                    d0 * dfptr[2 * d][dfindex] + d1 * dfptr[2 * d + 1][dfindex];
+                inout[index + d * nSize] = d0 * df[dfindex + d * dfSize];
             }
         }
-    }
 
-    delete[] inoutptr;
-    delete[] dfptr;
+        e += gridDim.x;
+    }
 }
 
-template <typename TData, bool DEFORMED>
-__global__ void PhysDerivTriKernel(const size_t nq0, const size_t nq1,
-                                   const size_t ncoord, const size_t nelmt,
-                                   const TData *Z0, const TData *Z1,
-                                   const size_t dfSize, TData *df,
-                                   TData *inout0, TData *inout1, TData *inout2)
+template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED>
+__global__ void PhysDeriv2DKernel(
+    const unsigned int nq0, const unsigned int nq1, const unsigned int ncoord,
+    const unsigned int nelmt, const unsigned int nSize,
+    const unsigned int dfSize, const TData *__restrict Z0,
+    const TData *__restrict Z1, const TData *__restrict df,
+    TData *__restrict inout)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    extern __shared__ TData shared[];
+    TData *s_xfrm0, *s_xfrm1;
 
-    if (e >= nelmt)
+    // Copy to shared memory.
+    if constexpr (SHAPETYPE == LibUtilities::Tri)
     {
-        return;
-    }
+        s_xfrm0 = shared;
+        s_xfrm1 = s_xfrm0 + nq1;
 
-    auto ndf = 2 * ncoord;
+        unsigned int sIndex = threadIdx.x;
+        while (sIndex < nq1)
+        {
+            s_xfrm0[sIndex] = 2.0 / (1.0 - Z1[sIndex]);
+            sIndex += blockDim.x;
+        }
 
-    // Assign pointers.
-    TData **inoutptr = new TData *[ncoord];
-    inoutptr[0]      = inout0 + (nq0 * nq1 * e);
-    inoutptr[1]      = inout1 + (nq0 * nq1 * e);
-    if (ncoord > 2)
-    {
-        inoutptr[2] = inout2 + (nq0 * nq1 * e);
-    }
+        sIndex = threadIdx.x;
+        while (sIndex < nq0)
+        {
+            s_xfrm1[sIndex] = 0.5 * (1.0 + Z0[sIndex]);
+            sIndex += blockDim.x;
+        }
 
-    TData **dfptr = new TData *[ndf];
-    for (size_t d = 0; d < ndf; d++)
-    {
-        dfptr[d] = df + d * dfSize;
-        dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
+        __syncthreads();
     }
 
-    // Compute derivative.
-    for (int j = 0, cnt_ji = 0; j < nq1; ++j)
+    unsigned int e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (e < nelmt)
     {
-        TData xfrm0 = 2.0 / (1.0 - Z1[j]);
-        for (int i = 0; i < nq0; ++i, ++cnt_ji)
+        unsigned int offset = nq0 * nq1 * e;
+
+        for (unsigned int j = 0, cnt_ji = 0; j < nq1; ++j)
         {
-            TData d0 = inoutptr[0][cnt_ji];
-            TData d1 = inoutptr[1][cnt_ji];
+            for (unsigned int i = 0; i < nq0; ++i, ++cnt_ji)
+            {
+                unsigned int index   = offset + cnt_ji;
+                unsigned int dfindex = DEFORMED ? index : e;
 
-            // Moving from standard to collapsed coordinates.
-            TData xfrm1 = 0.5 * (1.0 + Z0[i]);
-            d0 *= xfrm0;
-            d1 += d0 * xfrm1;
+                TData d0 = inout[index];
+                TData d1 = inout[index + nSize];
 
-            // Multiply by derivative factors.
-            size_t dfindex = DEFORMED ? cnt_ji : 0;
-            for (size_t d = 0; d < ncoord; d++)
-            {
-                inoutptr[d][cnt_ji] =
-                    d0 * dfptr[2 * d][dfindex] + d1 * dfptr[2 * d + 1][dfindex];
+                // Moving from standard to collapsed coordinates.
+                if constexpr (SHAPETYPE == LibUtilities::Tri)
+                {
+                    d0 *= s_xfrm0[j];
+                    d1 += d0 * s_xfrm1[i];
+                }
+
+                // Multiply by derivative factors.
+                for (unsigned int d = 0; d < ncoord; d++)
+                {
+                    inout[index + d * nSize] =
+                        d0 * df[dfindex + (2 * d) * dfSize] +
+                        d1 * df[dfindex + (2 * d + 1) * dfSize];
+                }
             }
         }
-    }
 
-    delete[] inoutptr;
-    delete[] dfptr;
+        e += blockDim.x * gridDim.x;
+    }
 }
 
-template <typename TData, bool DEFORMED>
-__global__ void PhysDerivHexKernel(const size_t nq0, const size_t nq1,
-                                   const size_t nq2, const size_t ncoord,
-                                   const size_t nelmt, const size_t dfSize,
-                                   TData *df, TData *inout0, TData *inout1,
-                                   TData *inout2)
+template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED>
+__global__ void PhysDeriv2DKernel_QP(
+    const unsigned int nq0, const unsigned int nq1, const unsigned int ncoord,
+    const unsigned int nelmt, const unsigned int nSize,
+    const unsigned int dfSize, const TData *__restrict Z0,
+    const TData *__restrict Z1, const TData *__restrict df,
+    TData *__restrict inout)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (e >= nelmt)
-    {
-        return;
-    }
-
-    size_t ndf = 3 * ncoord;
+    TData xfrm0, xfrm1;
 
-    // Assign pointers.
-    TData **inoutptr = new TData *[ncoord];
-    inoutptr[0]      = inout0 + (nq0 * nq1 * nq2 * e);
-    inoutptr[1]      = inout1 + (nq0 * nq1 * nq2 * e);
-    inoutptr[2]      = inout2 + (nq0 * nq1 * nq2 * e);
+    unsigned int e = blockIdx.x;
 
-    TData **dfptr = new TData *[ndf];
-    for (size_t d = 0; d < ndf; d++)
+    while (e < nelmt)
     {
-        dfptr[d] = df + d * dfSize;
-        dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
-    }
+        unsigned int offset = nq0 * nq1 * e;
 
-    // Compute derivative.
-    for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k)
-    {
-        for (size_t j = 0; j < nq1; ++j)
+        for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y)
         {
-            for (size_t i = 0; i < nq0; ++i, ++cnt_ijk)
+            if constexpr (SHAPETYPE == LibUtilities::Tri)
             {
-                TData tmp[] = {inoutptr[0][cnt_ijk], inoutptr[1][cnt_ijk],
-                               inoutptr[2][cnt_ijk]};
+                xfrm0 = 2.0 / (1.0 - Z1[j]);
+            }
+
+            for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x)
+            {
+                unsigned int cnt_ji  = nq0 * j + i;
+                unsigned int index   = offset + cnt_ji;
+                unsigned int dfindex = DEFORMED ? index : e;
+
+                TData d0 = inout[index];
+                TData d1 = inout[index + nSize];
+
+                // Moving from standard to collapsed coordinates.
+                if constexpr (SHAPETYPE == LibUtilities::Tri)
+                {
+                    xfrm1 = 0.5 * (1.0 + Z0[i]);
+                    d0 *= xfrm0;
+                    d1 += d0 * xfrm1;
+                }
 
                 // Multiply by derivative factors.
-                size_t dfindex = DEFORMED ? cnt_ijk : 0;
-                for (size_t d = 0; d < ncoord; ++d)
+                for (unsigned int d = 0; d < ncoord; d++)
                 {
-                    TData sum = 0.0;
-                    for (size_t n = 0; n < ncoord; ++n)
-                    {
-                        sum += tmp[n] * dfptr[d * ncoord + n][dfindex];
-                    }
-                    inoutptr[d][cnt_ijk] = sum;
+                    inout[index + d * nSize] =
+                        d0 * df[dfindex + (2 * d) * dfSize] +
+                        d1 * df[dfindex + (2 * d + 1) * dfSize];
                 }
             }
         }
-    }
 
-    delete[] inoutptr;
-    delete[] dfptr;
+        e += gridDim.x;
+    }
 }
 
-template <typename TData, bool DEFORMED>
-__global__ void PhysDerivTetKernel(const size_t nq0, const size_t nq1,
-                                   const size_t nq2, const size_t ncoord,
-                                   const size_t nelmt, const TData *Z0,
-                                   const TData *Z1, const TData *Z2,
-                                   const size_t dfSize, TData *df,
-                                   TData *inout0, TData *inout1, TData *inout2)
+template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED>
+__global__ void PhysDeriv3DKernel(
+    const unsigned int nq0, const unsigned int nq1, const unsigned int nq2,
+    const unsigned int nelmt, const unsigned int nSize,
+    const unsigned int dfSize, const TData *__restrict Z0,
+    const TData *__restrict Z1, const TData *__restrict Z2,
+    const TData *__restrict df, TData *__restrict inout)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    extern __shared__ TData shared[];
+    TData *s_xfrm_eta0, *s_xfrm_eta1, *s_xfrm_eta1m, *s_xfrm_eta2;
 
-    if (e >= nelmt)
+    // Copy to shared memory.
+    if constexpr (SHAPETYPE == LibUtilities::Tet)
     {
-        return;
-    }
+        s_xfrm_eta0  = shared;
+        s_xfrm_eta1  = s_xfrm_eta0 + nq0;
+        s_xfrm_eta1m = s_xfrm_eta1 + nq1;
+        s_xfrm_eta2  = s_xfrm_eta1m + nq1;
 
-    size_t ndf = 3 * ncoord;
+        unsigned int sIndex = threadIdx.x;
+        while (sIndex < nq0)
+        {
+            s_xfrm_eta0[sIndex] = 0.5 * (1.0 + Z0[sIndex]);
+            sIndex += blockDim.x;
+        }
 
-    // Allocate workspace memory.
-    TData *wsp0 = new TData[nq0 * nq1 * nq2];
-    TData *wsp1 = new TData[nq0 * nq1 * nq2];
+        sIndex = threadIdx.x;
+        while (sIndex < nq1)
+        {
+            s_xfrm_eta1[sIndex] = 0.5 * (1.0 + Z1[sIndex]);
+            sIndex += blockDim.x;
+        }
 
-    // Assign pointers.
-    TData **inoutptr = new TData *[ncoord];
-    inoutptr[0]      = inout0 + (nq0 * nq1 * nq2 * e);
-    inoutptr[1]      = inout1 + (nq0 * nq1 * nq2 * e);
-    inoutptr[2]      = inout2 + (nq0 * nq1 * nq2 * e);
+        sIndex = threadIdx.x;
+        while (sIndex < nq1)
+        {
+            s_xfrm_eta1m[sIndex] = 2.0 / (1.0 - Z1[sIndex]);
+            sIndex += blockDim.x;
+        }
 
-    TData **dfptr = new TData *[ndf];
-    for (size_t d = 0; d < ndf; d++)
-    {
-        dfptr[d] = df + d * dfSize;
-        dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
-    }
+        sIndex = threadIdx.x;
+        while (sIndex < nq2)
+        {
+            s_xfrm_eta2[sIndex] = 2.0 / (1.0 - Z2[sIndex]);
+            sIndex += blockDim.x;
+        }
 
-    // Moving from standard to collapsed coordinates.
-    for (int k = 0, eta0 = 0; k < nq2; ++k)
+        __syncthreads();
+    }
+    else if constexpr (SHAPETYPE == LibUtilities::Prism)
     {
-        TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
-        for (int j = 0; j < nq1; ++j)
+        s_xfrm_eta0 = shared;
+        s_xfrm_eta2 = s_xfrm_eta0 + nq0;
+
+        unsigned int sIndex = threadIdx.x;
+        while (sIndex < nq0)
         {
-            TData xfrm_eta1 = 2.0 / (1.0 - Z1[j]);
-            TData xfrm      = xfrm_eta1 * xfrm_eta2;
-            for (int i = 0; i < nq0; ++i, ++eta0)
-            {
-                inoutptr[0][eta0] *= xfrm;
-                wsp0[eta0] = inoutptr[0][eta0];
-            }
+            s_xfrm_eta0[sIndex] = 0.5 * (1.0 + Z0[sIndex]);
+            sIndex += blockDim.x;
         }
-    }
 
-    for (int k = 0, eta0 = 0; k < nq2; ++k)
+        sIndex = threadIdx.x;
+        while (sIndex < nq2)
+        {
+            s_xfrm_eta2[sIndex] = 2.0 / (1.0 - Z2[sIndex]);
+            sIndex += blockDim.x;
+        }
+
+        __syncthreads();
+    }
+    else if constexpr (SHAPETYPE == LibUtilities::Pyr)
     {
-        TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
-        for (int j = 0; j < nq1; ++j)
+        s_xfrm_eta0 = shared;
+        s_xfrm_eta1 = s_xfrm_eta0 + nq0;
+        s_xfrm_eta2 = s_xfrm_eta1 + nq1;
+
+        unsigned int sIndex = threadIdx.x;
+        while (sIndex < nq0)
         {
-            for (int i = 0; i < nq0; ++i, ++eta0)
-            {
-                TData xfrm_eta0   = 0.5 * (1.0 + Z0[i]);
-                wsp0[eta0]        = xfrm_eta0 * wsp0[eta0];
-                wsp1[eta0]        = xfrm_eta2 * inoutptr[1][eta0];
-                inoutptr[1][eta0] = wsp0[eta0] + wsp1[eta0];
-            }
+            s_xfrm_eta0[sIndex] = 0.5 * (1.0 + Z0[sIndex]);
+            sIndex += blockDim.x;
         }
+
+        sIndex = threadIdx.x;
+        while (sIndex < nq1)
+        {
+            s_xfrm_eta1[sIndex] = 0.5 * (1.0 + Z1[sIndex]);
+            sIndex += blockDim.x;
+        }
+
+        sIndex = threadIdx.x;
+        while (sIndex < nq2)
+        {
+            s_xfrm_eta2[sIndex] = 2.0 / (1.0 - Z2[sIndex]);
+            sIndex += blockDim.x;
+        }
+
+        __syncthreads();
     }
 
-    for (int k = 0, eta0 = 0; k < nq2; ++k)
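+    // All 3D shapes have three coordinate directions.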
+    constexpr unsigned int ncoord = 3;
+
+    unsigned int e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (e < nelmt)
     {
-        for (int j = 0; j < nq1; ++j)
+        unsigned int offset = nq0 * nq1 * nq2 * e;
+
+        // Moving from standard to collapsed coordinates.
+        if constexpr (SHAPETYPE == LibUtilities::Tet)
         {
-            TData xfrm_eta1 = 0.5 * (1.0 + Z1[j]);
-            for (int i = 0; i < nq0; ++i, ++eta0)
+            for (unsigned int k = 0, cnt_kji = 0; k < nq2; ++k)
             {
-                inoutptr[2][eta0] += wsp0[eta0] + wsp1[eta0] * xfrm_eta1;
+                for (unsigned int j = 0; j < nq1; ++j)
+                {
+                    TData xfrm = s_xfrm_eta1m[j] * s_xfrm_eta2[k];
+                    for (unsigned int i = 0; i < nq0; ++i, ++cnt_kji)
+                    {
+                        unsigned int index = offset + cnt_kji;
+
+                        TData tmp0   = xfrm * inout[index];
+                        TData tmp1   = s_xfrm_eta0[i] * tmp0;
+                        TData tmp2   = s_xfrm_eta2[k] * inout[index + nSize];
+                        inout[index] = tmp0;
+                        inout[index + nSize] = tmp1 + tmp2;
+                        inout[index + 2 * nSize] +=
+                            tmp1 + s_xfrm_eta1[j] * tmp2;
+                    }
+                }
             }
         }
-    }
 
-    // Compute derivative.
-    for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k)
-    {
-        for (size_t j = 0; j < nq1; ++j)
+        for (unsigned int k = 0, cnt_kji = 0; k < nq2; ++k)
         {
-            for (size_t i = 0; i < nq0; ++i, ++cnt_ijk)
+            for (unsigned int j = 0; j < nq1; ++j)
             {
-                TData tmp[] = {inoutptr[0][cnt_ijk], inoutptr[1][cnt_ijk],
-                               inoutptr[2][cnt_ijk]};
-
-                // Multiply by derivative factors.
-                size_t dfindex = DEFORMED ? cnt_ijk : 0;
-                for (size_t d = 0; d < ncoord; ++d)
+                for (unsigned int i = 0; i < nq0; ++i, ++cnt_kji)
                 {
-                    TData sum = 0.0;
-                    for (size_t n = 0; n < ncoord; ++n)
+                    unsigned int index   = offset + cnt_kji;
+                    unsigned int dfindex = DEFORMED ? index : e;
+
+                    TData d0 = inout[index];
+                    TData d1 = inout[index + nSize];
+                    TData d2 = inout[index + 2 * nSize];
+
+                    // Chain rule for the collapsed coordinates (Prism/Pyr).
+                    if constexpr (SHAPETYPE == LibUtilities::Prism)
+                    {
+                        d0 *= s_xfrm_eta2[k];
+                        d2 += s_xfrm_eta0[i] * d0;
+                    }
+                    else if constexpr (SHAPETYPE == LibUtilities::Pyr)
                     {
-                        sum += tmp[n] * dfptr[d * ncoord + n][dfindex];
+                        d0 *= s_xfrm_eta2[k];
+                        d1 *= s_xfrm_eta2[k];
+                        d2 += s_xfrm_eta0[i] * d0 + s_xfrm_eta1[j] * d1;
+                    }
+
+                    // Multiply by derivative factors.
+                    for (unsigned int d = 0; d < ncoord; d++)
+                    {
+                        inout[index + d * nSize] =
+                            d0 * df[dfindex + (3 * d) * dfSize] +
+                            d1 * df[dfindex + (3 * d + 1) * dfSize] +
+                            d2 * df[dfindex + (3 * d + 2) * dfSize];
                     }
-                    inoutptr[d][cnt_ijk] = sum;
                 }
             }
         }
-    }
 
-    delete[] wsp0;
-    delete[] wsp1;
-    delete[] inoutptr;
-    delete[] dfptr;
+        e += blockDim.x * gridDim.x;
+    }
 }
 
-template <typename TData, bool DEFORMED>
-__global__ void PhysDerivPrismKernel(
-    const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
-    const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2,
-    const size_t dfSize, TData *df, TData *inout0, TData *inout1, TData *inout2)
+template <typename TData, LibUtilities::ShapeType SHAPETYPE, bool DEFORMED>
+__global__ void PhysDeriv3DKernel_QP(
+    const unsigned int nq0, const unsigned int nq1, const unsigned int nq2,
+    const unsigned int nelmt, const unsigned int nSize,
+    const unsigned int dfSize, const TData *__restrict Z0,
+    const TData *__restrict Z1, const TData *__restrict Z2,
+    const TData *__restrict df, TData *__restrict inout)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    TData xfrm_eta0, xfrm_eta1, xfrm_eta1m, xfrm_eta2;
 
-    if (e >= nelmt)
-    {
-        return;
-    }
-
-    size_t ndf = 3 * ncoord;
+    constexpr unsigned int ncoord = 3;
 
-    // Assign pointers.
-    TData **inoutptr = new TData *[ncoord];
-    inoutptr[0]      = inout0 + (nq0 * nq1 * nq2 * e);
-    inoutptr[1]      = inout1 + (nq0 * nq1 * nq2 * e);
-    inoutptr[2]      = inout2 + (nq0 * nq1 * nq2 * e);
+    unsigned int e = blockIdx.x;
 
-    TData **dfptr = new TData *[ndf];
-    for (size_t d = 0; d < ndf; d++)
+    while (e < nelmt)
     {
-        dfptr[d] = df + d * dfSize;
-        dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
-    }
+        unsigned int offset = nq0 * nq1 * nq2 * e;
 
-    // Compute derivative.
-    for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k)
-    {
-        TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
-        for (size_t j = 0; j < nq1; ++j)
+        // Moving from standard to collapsed coordinates.
+        if constexpr (SHAPETYPE == LibUtilities::Tet)
         {
-            for (size_t i = 0; i < nq0; ++i, ++cnt_ijk)
+            for (unsigned int k = threadIdx.z; k < nq2; k += blockDim.z)
             {
-                TData d0 = inoutptr[0][cnt_ijk];
-                TData d1 = inoutptr[1][cnt_ijk];
-                TData d2 = inoutptr[2][cnt_ijk];
+                xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
+                for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y)
+                {
+                    xfrm_eta1m = 2.0 / (1.0 - Z1[j]);
+                    xfrm_eta1  = 0.5 * (1.0 + Z1[j]);
+                    TData xfrm = xfrm_eta1m * xfrm_eta2;
+                    for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x)
+                    {
+                        unsigned int index =
+                            offset + nq0 * nq1 * k + nq0 * j + i;
+
+                        xfrm_eta0            = 0.5 * (1.0 + Z0[i]);
+                        TData tmp0           = xfrm * inout[index];
+                        TData tmp1           = xfrm_eta0 * tmp0;
+                        TData tmp2           = xfrm_eta2 * inout[index + nSize];
+                        inout[index]         = tmp0;
+                        inout[index + nSize] = tmp1 + tmp2;
+                        inout[index + 2 * nSize] += tmp1 + xfrm_eta1 * tmp2;
+                    }
+                }
+            }
 
-                // Chain-rule for eta_0 and eta_2.
-                TData xfrm_eta0 = 0.5 * (1.0 + Z0[i]);
-                d0 *= xfrm_eta2;
-                d2 += xfrm_eta0 * d0;
+            __syncthreads();
+        }
 
-                // Multiply by derivative factors.
-                size_t dfindex       = DEFORMED ? cnt_ijk : 0;
-                inoutptr[0][cnt_ijk] = d0 * dfptr[0][dfindex] +
-                                       d1 * dfptr[1][dfindex] +
-                                       d2 * dfptr[2][dfindex];
-                inoutptr[1][cnt_ijk] = d0 * dfptr[3][dfindex] +
-                                       d1 * dfptr[4][dfindex] +
-                                       d2 * dfptr[5][dfindex];
-                inoutptr[2][cnt_ijk] = d0 * dfptr[6][dfindex] +
-                                       d1 * dfptr[7][dfindex] +
-                                       d2 * dfptr[8][dfindex];
+        for (unsigned int k = threadIdx.z; k < nq2; k += blockDim.z)
+        {
+            if constexpr (SHAPETYPE == LibUtilities::Prism ||
+                          SHAPETYPE == LibUtilities::Pyr)
+            {
+                xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
+            }
+
+            for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y)
+            {
+                if constexpr (SHAPETYPE == LibUtilities::Pyr)
+                {
+                    xfrm_eta1 = 0.5 * (1.0 + Z1[j]);
+                }
+
+                for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x)
+                {
+                    unsigned int cnt_kji = nq0 * nq1 * k + nq0 * j + i;
+                    unsigned int index   = offset + cnt_kji;
+                    unsigned int dfindex = DEFORMED ? index : e;
+
+                    TData d0 = inout[index];
+                    TData d1 = inout[index + nSize];
+                    TData d2 = inout[index + 2 * nSize];
+
+                    if constexpr (SHAPETYPE == LibUtilities::Prism ||
+                                  SHAPETYPE == LibUtilities::Pyr)
+                    {
+                        xfrm_eta0 = 0.5 * (1.0 + Z0[i]);
+                    }
+
+                    // Chain rule for the collapsed coordinates (Prism/Pyr).
+                    if constexpr (SHAPETYPE == LibUtilities::Prism)
+                    {
+                        d0 *= xfrm_eta2;
+                        d2 += xfrm_eta0 * d0;
+                    }
+                    else if constexpr (SHAPETYPE == LibUtilities::Pyr)
+                    {
+                        d0 *= xfrm_eta2;
+                        d1 *= xfrm_eta2;
+                        d2 += xfrm_eta0 * d0 + xfrm_eta1 * d1;
+                    }
+
+                    // Multiply by derivative factors.
+                    for (unsigned int d = 0; d < ncoord; d++)
+                    {
+                        inout[index + d * nSize] =
+                            d0 * df[dfindex + (3 * d) * dfSize] +
+                            d1 * df[dfindex + (3 * d + 1) * dfSize] +
+                            d2 * df[dfindex + (3 * d + 2) * dfSize];
+                    }
+                }
             }
         }
-    }
 
-    delete[] inoutptr;
-    delete[] dfptr;
+        e += gridDim.x;
+    }
 }
 
-template <typename TData, bool DEFORMED>
-__global__ void PhysDerivPyrKernel(const size_t nq0, const size_t nq1,
-                                   const size_t nq2, const size_t ncoord,
-                                   const size_t nelmt, const TData *Z0,
-                                   const TData *Z1, const TData *Z2,
-                                   const size_t dfSize, TData *df,
-                                   TData *inout0, TData *inout1, TData *inout2)
+template <typename TData>
+__global__ void PhysDerivTensor1DKernel(const unsigned int nq0,
+                                        const unsigned int nelmt,
+                                        const TData *__restrict D0,
+                                        const TData *__restrict in,
+                                        TData *__restrict out)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    extern __shared__ TData shared[];
+    TData *s_D0 = shared;
 
-    if (e >= nelmt)
+    // Copy to shared memory.
+    unsigned int sIndex = threadIdx.x;
+    while (sIndex < nq0 * nq0)
     {
-        return;
+        s_D0[sIndex] = D0[sIndex];
+        sIndex += blockDim.x;
     }
 
-    size_t ndf = 3 * ncoord;
+    __syncthreads();
 
-    // Assign pointers.
-    TData **inoutptr = new TData *[ncoord];
-    inoutptr[0]      = inout0 + (nq0 * nq1 * nq2 * e);
-    inoutptr[1]      = inout1 + (nq0 * nq1 * nq2 * e);
-    inoutptr[2]      = inout2 + (nq0 * nq1 * nq2 * e);
+    unsigned int e = blockDim.x * blockIdx.x + threadIdx.x;
 
-    TData **dfptr = new TData *[ndf];
-    for (size_t d = 0; d < ndf; d++)
+    while (e < nelmt)
     {
-        dfptr[d] = df + d * dfSize;
-        dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
-    }
+        unsigned int offset = nq0 * e;
 
-    // Compute derivative.
-    for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k)
-    {
-        TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
-        for (size_t j = 0; j < nq1; ++j)
+        for (unsigned int i = 0; i < nq0; ++i)
         {
-            TData xfrm_eta1 = 0.5 * (1.0 + Z1[j]);
-            for (size_t i = 0; i < nq0; ++i, ++cnt_ijk)
+            TData sum = 0.0;
+            for (unsigned int q = 0; q < nq0; ++q)
             {
-                TData d0 = inoutptr[0][cnt_ijk];
-                TData d1 = inoutptr[1][cnt_ijk];
-                TData d2 = inoutptr[2][cnt_ijk];
-
-                // Chain-rule for eta_0 and eta_2.
-                TData xfrm_eta0 = 0.5 * (1.0 + Z0[i]);
-                d0 *= xfrm_eta2;
-                d1 *= xfrm_eta2;
-                d2 += xfrm_eta0 * d0 + xfrm_eta1 * d1;
-
-                // Multiply by derivative factors.
-                size_t dfindex       = DEFORMED ? cnt_ijk : 0;
-                inoutptr[0][cnt_ijk] = d0 * dfptr[0][dfindex] +
-                                       d1 * dfptr[1][dfindex] +
-                                       d2 * dfptr[2][dfindex];
-                inoutptr[1][cnt_ijk] = d0 * dfptr[3][dfindex] +
-                                       d1 * dfptr[4][dfindex] +
-                                       d2 * dfptr[5][dfindex];
-                inoutptr[2][cnt_ijk] = d0 * dfptr[6][dfindex] +
-                                       d1 * dfptr[7][dfindex] +
-                                       d2 * dfptr[8][dfindex];
+                sum += s_D0[q * nq0 + i] * in[offset + q];
             }
+            out[offset + i] = sum;
         }
-    }
 
-    delete[] inoutptr;
-    delete[] dfptr;
+        e += blockDim.x * gridDim.x;
+    }
 }
 
 template <typename TData>
-__global__ void PhysDerivTensor1DKernel(const size_t nq0, const size_t nelmt,
-                                        const TData *D0, const TData *in,
-                                        TData *out0)
+__global__ void PhysDerivTensor1DKernel_QP(const unsigned int nq0,
+                                           const unsigned int nelmt,
+                                           const TData *__restrict D0,
+                                           const TData *__restrict in,
+                                           TData *__restrict out)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    unsigned int e = blockIdx.x;
 
-    if (e >= nelmt)
+    while (e < nelmt)
     {
-        return;
-    }
+        unsigned int offset = nq0 * e;
 
-    // Assign pointers.
-    const TData *inptr = in + (nq0 * e);
-    TData *outptr0     = out0 + (nq0 * e);
-
-    // Direction 1
-    for (size_t i = 0; i < nq0; ++i)
-    {
-        TData sum = 0.0;
-        for (size_t k = 0; k < nq0; ++k)
+        for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x)
         {
-            sum += D0[k * nq0 + i] * inptr[k];
+            TData sum = 0.0;
+            for (unsigned int q = 0; q < nq0; ++q)
+            {
+                sum += D0[q * nq0 + i] * in[offset + q];
+            }
+            out[offset + i] = sum;
         }
-        outptr0[i] = sum;
+
+        e += gridDim.x;
     }
 }
 
 template <typename TData>
-__global__ void PhysDerivTensor2DKernel(const size_t nq0, const size_t nq1,
-                                        const size_t nelmt, const TData *D0,
-                                        const TData *D1, const TData *in,
-                                        TData *out0, TData *out1)
+__global__ void PhysDerivTensor2DKernel(
+    const unsigned int nq0, const unsigned int nq1, const unsigned int nelmt,
+    const unsigned int nSize, const TData *__restrict D0,
+    const TData *__restrict D1, const TData *__restrict in,
+    TData *__restrict out)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    extern __shared__ TData shared[];
+    TData *s_D0 = shared;
+    TData *s_D1 = s_D0 + nq0 * nq0;
 
-    if (e >= nelmt)
+    // Copy to shared memory.
+    unsigned int sIndex = threadIdx.x;
+    while (sIndex < nq0 * nq0)
     {
-        return;
+        s_D0[sIndex] = D0[sIndex];
+        sIndex += blockDim.x;
     }
 
-    // Assign pointers.
-    const TData *inptr = in + (nq0 * nq1 * e);
-    TData *outptr0     = out0 + (nq0 * nq1 * e);
-    TData *outptr1     = out1 + (nq0 * nq1 * e);
+    sIndex = threadIdx.x;
+    while (sIndex < nq1 * nq1)
+    {
+        s_D1[sIndex] = D1[sIndex];
+        sIndex += blockDim.x;
+    }
 
-    // Direction 1
-    for (size_t i = 0; i < nq0; ++i)
+    __syncthreads();
+
+    unsigned int e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (e < nelmt)
     {
-        for (size_t j = 0; j < nq1; ++j)
+        unsigned int offset = nq0 * nq1 * e;
+
+        for (unsigned int j = 0; j < nq1; ++j)
         {
-            TData sum = 0.0;
-            for (size_t k = 0; k < nq0; ++k)
+            for (unsigned int i = 0; i < nq0; ++i)
             {
-                sum += D0[k * nq0 + i] * inptr[j * nq0 + k];
+                unsigned int cnt_ji = nq0 * j + i;
+
+                // Direction 1
+                TData sum = 0.0;
+                for (unsigned int q = 0; q < nq0; ++q)
+                {
+                    sum += s_D0[q * nq0 + i] * in[offset + nq0 * j + q];
+                }
+                out[offset + cnt_ji] = sum;
+
+                // Direction 2
+                sum = 0.0;
+                for (unsigned int q = 0; q < nq1; ++q)
+                {
+                    sum += s_D1[q * nq1 + j] * in[offset + nq0 * q + i];
+                }
+                out[offset + cnt_ji + nSize] = sum;
             }
-            outptr0[j * nq0 + i] = sum;
         }
+
+        e += blockDim.x * gridDim.x;
     }
+}
+
+template <typename TData>
+__global__ void PhysDerivTensor2DKernel_QP(
+    const unsigned int nq0, const unsigned int nq1, const unsigned int nelmt,
+    const unsigned int nSize, const TData *__restrict D0,
+    const TData *__restrict D1, const TData *__restrict in,
+    TData *__restrict out)
+{
+    unsigned int e = blockIdx.x;
 
-    // Direction 2
-    for (size_t i = 0; i < nq0; ++i)
+    while (e < nelmt)
     {
-        for (size_t j = 0; j < nq1; ++j)
+        unsigned int offset = nq0 * nq1 * e;
+
+        for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y)
         {
-            TData sum = 0.0;
-            for (size_t k = 0; k < nq1; ++k)
+            for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x)
             {
-                sum += D1[k * nq1 + j] * inptr[k * nq0 + i];
+                unsigned int cnt_ji = nq0 * j + i;
+
+                // Direction 1
+                TData sum = 0.0;
+                for (unsigned int q = 0; q < nq0; ++q)
+                {
+                    sum += D0[q * nq0 + i] * in[offset + nq0 * j + q];
+                }
+                out[offset + cnt_ji] = sum;
+
+                // Direction 2
+                sum = 0.0;
+                for (unsigned int q = 0; q < nq1; ++q)
+                {
+                    sum += D1[q * nq1 + j] * in[offset + nq0 * q + i];
+                }
+                out[offset + cnt_ji + nSize] = sum;
             }
-            outptr1[j * nq0 + i] = sum;
         }
+
+        e += gridDim.x;
     }
 }
 
 template <typename TData>
-__global__ void PhysDerivTensor3DKernel(const size_t nq0, const size_t nq1,
-                                        const size_t nq2, const size_t nelmt,
-                                        const TData *D0, const TData *D1,
-                                        const TData *D2, const TData *in,
-                                        TData *out0, TData *out1, TData *out2)
+__global__ void PhysDerivTensor3DKernel(
+    const unsigned int nq0, const unsigned int nq1, const unsigned int nq2,
+    const unsigned int nelmt, const unsigned int nSize,
+    const TData *__restrict D0, const TData *__restrict D1,
+    const TData *__restrict D2, const TData *__restrict in,
+    TData *__restrict out)
 {
-    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+    extern __shared__ TData shared[];
+    TData *s_D0 = shared;
+    TData *s_D1 = s_D0 + nq0 * nq0;
+    TData *s_D2 = s_D1 + nq1 * nq1;
 
-    if (e >= nelmt)
+    // Copy to shared memory.
+    unsigned int sIndex = threadIdx.x;
+    while (sIndex < nq0 * nq0)
     {
-        return;
+        s_D0[sIndex] = D0[sIndex];
+        sIndex += blockDim.x;
     }
 
-    // Assign pointers.
-    const TData *inptr = in + (nq0 * nq1 * nq2 * e);
-    TData *outptr0     = out0 + (nq0 * nq1 * nq2 * e);
-    TData *outptr1     = out1 + (nq0 * nq1 * nq2 * e);
-    TData *outptr2     = out2 + (nq0 * nq1 * nq2 * e);
+    sIndex = threadIdx.x;
+    while (sIndex < nq1 * nq1)
+    {
+        s_D1[sIndex] = D1[sIndex];
+        sIndex += blockDim.x;
+    }
 
-    // Direction 1
-    for (size_t i = 0; i < nq0; ++i)
+    sIndex = threadIdx.x;
+    while (sIndex < nq2 * nq2)
     {
-        for (size_t j = 0; j < nq1 * nq2; ++j)
-        {
-            TData sum = 0.0;
-            for (size_t k = 0; k < nq0; ++k)
-            {
-                sum += D0[k * nq0 + i] * inptr[j * nq0 + k];
-            }
-            outptr0[j * nq0 + i] = sum;
-        }
+        s_D2[sIndex] = D2[sIndex];
+        sIndex += blockDim.x;
     }
 
-    // Direction 2
-    for (size_t block = 0; block < nq2; ++block)
+    __syncthreads();
+
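+    // Grid-stride loop: each thread processes whole elements.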
+    unsigned int e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (e < nelmt)
     {
-        size_t start = block * nq0 * nq1;
-        for (size_t i = 0; i < nq0; ++i)
+        unsigned int offset = nq0 * nq1 * nq2 * e;
+
+        for (unsigned int k = 0; k < nq2; ++k)
         {
-            for (size_t j = 0; j < nq1; ++j)
+            unsigned int cnt_k = nq0 * nq1 * k;
+            for (unsigned int j = 0; j < nq1; ++j)
             {
-                TData sum = 0.0;
-                for (size_t k = 0; k < nq1; ++k)
+                unsigned int cnt_kj = cnt_k + nq0 * j;
+                for (unsigned int i = 0; i < nq0; ++i)
                 {
-                    sum += D1[k * nq1 + j] * inptr[start + k * nq0 + i];
+                    unsigned int cnt_ji  = nq0 * j + i;
+                    unsigned int cnt_kji = cnt_k + cnt_ji;
+
+                    // Direction 1
+                    TData sum = 0.0;
+                    for (unsigned int q = 0; q < nq0; ++q)
+                    {
+                        sum += s_D0[q * nq0 + i] * in[offset + cnt_kj + q];
+                    }
+                    out[offset + cnt_kji] = sum;
+
+                    // Direction 2
+                    sum = 0.0;
+                    for (unsigned int q = 0; q < nq1; ++q)
+                    {
+                        sum += s_D1[q * nq1 + j] *
+                               in[offset + cnt_k + nq0 * q + i];
+                    }
+                    out[offset + cnt_kji + nSize] = sum;
+
+                    // Direction 3
+                    sum = 0.0;
+                    for (unsigned int q = 0; q < nq2; ++q)
+                    {
+                        sum += s_D2[q * nq2 + k] *
+                               in[offset + nq0 * nq1 * q + cnt_ji];
+                    }
+                    out[offset + cnt_kji + 2 * nSize] = sum;
                 }
-                outptr1[start + j * nq0 + i] = sum;
             }
         }
+
+        e += blockDim.x * gridDim.x;
     }
+}
 
-    // Direction 3
-    for (size_t i = 0; i < nq0 * nq1; ++i)
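+// Quadrature-point (QP) parallel variant: one thread block per element, with
+// the threads of each block strided over the nq0 x nq1 x nq2 quadrature grid
+// via threadIdx.x/y/z.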
+template <typename TData>
+__global__ void PhysDerivTensor3DKernel_QP(
+    const unsigned int nq0, const unsigned int nq1, const unsigned int nq2,
+    const unsigned int nelmt, const unsigned int nSize,
+    const TData *__restrict D0, const TData *__restrict D1,
+    const TData *__restrict D2, const TData *__restrict in,
+    TData *__restrict out)
+{
+    unsigned int e = blockIdx.x;
+
+    while (e < nelmt)
     {
-        for (size_t j = 0; j < nq2; ++j)
+        unsigned int offset = nq0 * nq1 * nq2 * e;
+
+        for (unsigned int k = threadIdx.z; k < nq2; k += blockDim.z)
         {
-            TData sum = 0.0;
-            for (size_t k = 0; k < nq2; ++k)
+            for (unsigned int j = threadIdx.y; j < nq1; j += blockDim.y)
             {
-                sum += D2[k * nq2 + j] * inptr[k * nq0 * nq1 + i];
+                for (unsigned int i = threadIdx.x; i < nq0; i += blockDim.x)
+                {
+                    unsigned int cnt_kji = nq0 * nq1 * k + nq0 * j + i;
+
+                    // Direction 1
+                    TData sum = 0.0;
+                    for (unsigned int q = 0; q < nq0; ++q)
+                    {
+                        sum += D0[q * nq0 + i] *
+                               in[offset + nq0 * nq1 * k + nq0 * j + q];
+                    }
+                    out[offset + cnt_kji] = sum;
+
+                    // Direction 2
+                    sum = 0.0;
+                    for (unsigned int q = 0; q < nq1; ++q)
+                    {
+                        sum += D1[q * nq1 + j] *
+                               in[offset + nq0 * nq1 * k + nq0 * q + i];
+                    }
+                    out[offset + cnt_kji + nSize] = sum;
+
+                    // Direction 3
+                    sum = 0.0;
+                    for (unsigned int q = 0; q < nq2; ++q)
+                    {
+                        sum += D2[q * nq2 + k] *
+                               in[offset + nq0 * nq1 * q + nq0 * j + i];
+                    }
+                    out[offset + cnt_kji + 2 * nSize] = sum;
+                }
             }
-            outptr2[j * nq0 * nq1 + i] = sum;
         }
+
+        e += gridDim.x;
     }
 }
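+
+// Illustrative launch for the QP variant (an assumption, not part of this
+// patch): one block per element and one thread per quadrature point, provided
+// nq0 * nq1 * nq2 does not exceed the device's per-block thread limit, e.g.
+//
+//     dim3 block(nq0, nq1, nq2);
+//     PhysDerivTensor3DKernel_QP<<<nelmt, block>>>(nq0, nq1, nq2, nelmt,
+//                                                  nSize, D0, D1, D2, in, out);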
+
 } // namespace Nektar::Operators::detail
diff --git a/Operators/PhysDeriv/PhysDerivMatFree.hpp b/Operators/PhysDeriv/PhysDerivMatFree.hpp
index cd0532b74c90b1eb85b6bc46a7b6487a22dbefea..1a671a8cc4d8ac1891a6e824992d8c1e9e64aace 100644
--- a/Operators/PhysDeriv/PhysDerivMatFree.hpp
+++ b/Operators/PhysDeriv/PhysDerivMatFree.hpp
@@ -1,11 +1,12 @@
-#include <LibUtilities/Foundations/Basis.h>
+#pragma once
 
 #include <LibUtilities/BasicUtils/ShapeType.hpp>
 #include <LibUtilities/BasicUtils/SharedArray.hpp>
+#include <LibUtilities/Foundations/Basis.h>
+#include <LibUtilities/SimdLib/tinysimd.hpp>
 
 #include "Operators/OperatorPhysDeriv.hpp"
 #include "PhysDerivMatFreeKernels.hpp"
-#include <LibUtilities/SimdLib/tinysimd.hpp>
 
 namespace Nektar::Operators::detail
 {
diff --git a/Operators/PhysDeriv/PhysDerivMatFreeKernels.hpp b/Operators/PhysDeriv/PhysDerivMatFreeKernels.hpp
index 762f95de82967d368fdd75ba11def79e26fecf09..e4ef1184ddc4632146a963255f188dbf1a25985d 100644
--- a/Operators/PhysDeriv/PhysDerivMatFreeKernels.hpp
+++ b/Operators/PhysDeriv/PhysDerivMatFreeKernels.hpp
@@ -1,4 +1,5 @@
 #pragma once
+
 #include <LibUtilities/BasicUtils/NekInline.hpp>
 
 namespace Nektar::Operators::detail
diff --git a/Operators/PhysDeriv/PhysDerivStdMat.hpp b/Operators/PhysDeriv/PhysDerivStdMat.hpp
index 86baed306156a46ca161b78fe2b093e41f8f658c..a72856f956f8308a14bf1c9daec76ea9a51d8df6 100644
--- a/Operators/PhysDeriv/PhysDerivStdMat.hpp
+++ b/Operators/PhysDeriv/PhysDerivStdMat.hpp
@@ -1,6 +1,9 @@
-#include "Operators/OperatorPhysDeriv.hpp"
+#pragma once
+
 #include <StdRegions/StdExpansion.h>
 
+#include "Operators/OperatorPhysDeriv.hpp"
+
 namespace Nektar::Operators::detail
 {
 
@@ -62,10 +65,8 @@ public:
     {
         // Initialize pointers.
         auto const *inptr = in.GetStorage().GetCPUPtr();
-        auto *outptr0     = out.GetStorage().GetCPUPtr();
-        auto *outptr1     = outptr0 + out.GetFieldSize();
-        auto *outptr2     = outptr1 + out.GetFieldSize();
-        std::vector<TData *> outptr{outptr0, outptr1, outptr2};
+        auto *outptr      = out.GetStorage().GetCPUPtr();
+        auto nSize        = out.GetFieldSize();
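+        // All directions now share one base pointer: direction i of the
+        // current element block starts at outptr + i * nSize.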
 
         // Initialize index.
         size_t expIdx  = 0;
@@ -113,39 +114,38 @@ public:
                 {
                     Vmath::Vmul(nqTot * nElmts,
                                 m_derivFac[i * nDim].get() + dfindex, 1,
-                                deriv[0].get(), 1, outptr[i], 1);
+                                deriv[0].get(), 1, outptr + i * nSize, 1);
                     for (size_t d = 1; d < nDim; d++)
                     {
                         Vmath::Vvtvp(nqTot * nElmts,
                                      m_derivFac[i * nDim + d].get() + dfindex,
-                                     1, deriv[d].get(), 1, outptr[i], 1,
-                                     outptr[i], 1);
+                                     1, deriv[d].get(), 1, outptr + i * nSize,
+                                     1, outptr + i * nSize, 1);
                     }
-                    outptr[i] += (nPadElmts + nElmts) * nqTot;
                 }
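+                // Advance past this block's elements, including padding.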
+                outptr += (nPadElmts + nElmts) * nqTot;
                 dfindex += nqTot * nElmts;
             }
             else
             {
-                for (size_t i = 0; i < nCoord; i++)
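+                // Elements are now the outer loop so that the single output
+                // pointer advances by nqTot exactly once per element.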
+                for (size_t e = 0; e < nElmts; ++e)
                 {
-                    for (size_t e = 0; e < nElmts; ++e)
+                    for (size_t i = 0; i < nCoord; i++)
                     {
                         Vmath::Smul(nqTot, m_derivFac[i * nDim][dfindex + e],
-                                    deriv[0].get() + e * nqTot, 1, outptr[i],
-                                    1);
+                                    deriv[0].get() + e * nqTot, 1,
+                                    outptr + i * nSize, 1);
                         for (size_t d = 1; d < nDim; d++)
                         {
-                            Vmath::Svtvp(nqTot,
-                                         m_derivFac[i * nDim + d][dfindex + e],
-                                         deriv[d].get() + e * nqTot, 1,
-                                         outptr[i], 1, outptr[i], 1);
+                            Vmath::Svtvp(
+                                nqTot, m_derivFac[i * nDim + d][dfindex + e],
+                                deriv[d].get() + e * nqTot, 1,
+                                outptr + i * nSize, 1, outptr + i * nSize, 1);
                         }
-                        outptr[i] += nqTot;
                     }
-                    // skip padding elements
-                    outptr[i] += nPadElmts * nqTot;
+                    outptr += nqTot;
                 }
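+                // Skip padding elements.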
+                outptr += nPadElmts * nqTot;
                 dfindex += nElmts;
             }
             inptr += (nPadElmts + nElmts) * nqTot;