diff --git a/.gitlab-ci/packaging.yml b/.gitlab-ci/packaging.yml
index b734089ad8eb6c22d4041fa32c637882d9ec61a3..d6457be0272cd06088fc696ffe672bb68929b5da 100644
--- a/.gitlab-ci/packaging.yml
+++ b/.gitlab-ci/packaging.yml
@@ -33,7 +33,7 @@
   tags:
     - pkg
   variables:
-    GIT_SUBMODULE_STRATEGY: normal
+    GIT_SUBMODULE_STRATEGY: recursive
   script:
     - OS_DIST=$(echo $CI_JOB_NAME | cut -d- -f 2)
     - OS_VERSION=$(echo $CI_JOB_NAME | cut -d- -f 3)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f4dfb3c9876a09c4933f50956b18a636c30f23d..f62b8667744116e660efc505b12eeb2d0b9b0788 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,7 @@ v5.6.0
 - Fix MPI communicator for Parallel-in-Time (!1801)
 - Fix warnings issued by MSVC compiler in LibUtilities and StdRegions (!1740)
 - Fix Parareal convergence monitoring (!1802)
+- Avoid copying input/output to/from padded aligned arrays in MatrixFree operators (!1783)
 
 **CompressibleFlowSolver**
 - Complete second Frechet derivative implementation (!1761)
diff --git a/library/Collections/BwdTrans.cpp b/library/Collections/BwdTrans.cpp
index 84c5c5f03fa7f5162e1df4478b9425eb83adc644..2ee0f0a314006e7362baf730f0325cc132e98b99 100644
--- a/library/Collections/BwdTrans.cpp
+++ b/library/Collections/BwdTrans.cpp
@@ -155,7 +155,7 @@ OperatorKey BwdTrans_StdMat::m_typeArr[] = {
  * @brief Backward transform operator using matrix free operators.
  */
 class BwdTrans_MatrixFree final : virtual public Operator,
-                                  MatrixFreeOneInOneOut,
+                                  MatrixFreeBase,
                                   virtual public BwdTrans_Helper
 {
 public:
@@ -169,19 +169,7 @@ public:
                     [[maybe_unused]] Array<OneD, NekDouble> &output2,
                     [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
     {
-        if (m_isPadded)
-        {
-            // copy into padded vector
-            Vmath::Vcopy(m_nIn, input, 1, m_input, 1);
-            // call op
-            (*m_oper)(m_input, m_output);
-            // copy out of padded vector
-            Vmath::Vcopy(m_nOut, m_output, 1, output0, 1);
-        }
-        else
-        {
-            (*m_oper)(input, output0);
-        }
+        (*m_oper)(input, output0);
     }
 
     void operator()([[maybe_unused]] int dir,
@@ -200,9 +188,9 @@ private:
                         CoalescedGeomDataSharedPtr pGeomData,
                         StdRegions::FactorMap factors)
         : Operator(pCollExp, pGeomData, factors), BwdTrans_Helper(),
-          MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetNcoeffs(),
-                                pCollExp[0]->GetStdExp()->GetTotPoints(),
-                                pCollExp.size())
+          MatrixFreeBase(pCollExp[0]->GetStdExp()->GetNcoeffs(),
+                         pCollExp[0]->GetStdExp()->GetTotPoints(),
+                         pCollExp.size())
     {
         // Basis vector.
         const auto dim = pCollExp[0]->GetStdExp()->GetShapeDimension();
@@ -219,7 +207,7 @@ private:
         std::string op_string = "BwdTrans";
         op_string += MatrixFree::GetOpstring(shapeType, false);
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
-            op_string, basis, m_nElmtPad);
+            op_string, basis, pCollExp.size());
 
         m_oper = std::dynamic_pointer_cast<MatrixFree::BwdTrans>(oper);
         ASSERTL0(m_oper, "Failed to cast pointer.");
diff --git a/library/Collections/Helmholtz.cpp b/library/Collections/Helmholtz.cpp
index 93706346fa16ff61e1657002e3e1a70b08a61b0c..001b3bbdf944fbc6ae1d57a06de43d7f4918de80 100644
--- a/library/Collections/Helmholtz.cpp
+++ b/library/Collections/Helmholtz.cpp
@@ -583,7 +583,7 @@ OperatorKey Helmholtz_IterPerExp::m_typeArr[] = {
  * @brief Helmholtz operator using matrix free operators.
  */
 class Helmholtz_MatrixFree final : virtual public Operator,
-                                   MatrixFreeOneInOneOut,
+                                   MatrixFreeBase,
                                    Helmholtz_Helper
 {
 public:
@@ -597,17 +597,7 @@ public:
                     [[maybe_unused]] Array<OneD, NekDouble> &output2,
                     [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
     {
-        if (m_isPadded)
-        {
-            // copy into padded vector
-            Vmath::Vcopy(m_nmtot, input, 1, m_input, 1);
-            (*m_oper)(m_input, m_output);
-            Vmath::Vcopy(m_nmtot, m_output, 1, output0, 1);
-        }
-        else
-        {
-            (*m_oper)(input, output0);
-        }
+        (*m_oper)(input, output0);
     }
 
     void operator()([[maybe_unused]] int dir,
@@ -723,9 +713,9 @@ private:
                          CoalescedGeomDataSharedPtr pGeomData,
                          StdRegions::FactorMap factors)
         : Operator(pCollExp, pGeomData, factors),
-          MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetNcoeffs(),
-                                pCollExp[0]->GetStdExp()->GetNcoeffs(),
-                                pCollExp.size()),
+          MatrixFreeBase(pCollExp[0]->GetStdExp()->GetNcoeffs(),
+                         pCollExp[0]->GetStdExp()->GetNcoeffs(),
+                         pCollExp.size()),
           Helmholtz_Helper()
     {
 
@@ -747,7 +737,7 @@ private:
         std::string op_string = "Helmholtz";
         op_string += MatrixFree::GetOpstring(shapeType, m_isDeformed);
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
-            op_string, basis, m_nElmtPad);
+            op_string, basis, pCollExp.size());
 
         // Set Jacobian
         oper->SetJac(pGeomData->GetJacInterLeave(pCollExp, m_nElmtPad));
diff --git a/library/Collections/IProductWRTBase.cpp b/library/Collections/IProductWRTBase.cpp
index 80ccf759d482a6e9d82779539c42e54d576c7724..d50bc3972548945b83fefdafc926d85cd444f73e 100644
--- a/library/Collections/IProductWRTBase.cpp
+++ b/library/Collections/IProductWRTBase.cpp
@@ -175,7 +175,7 @@ OperatorKey IProductWRTBase_StdMat::m_typeArr[] = {
  * @brief Inner product operator using operator using matrix free operators.
  */
 class IProductWRTBase_MatrixFree final : virtual public Operator,
-                                         MatrixFreeOneInOneOut,
+                                         MatrixFreeBase,
                                          virtual public IProductWRTBase_Helper
 {
 public:
@@ -189,19 +189,7 @@ public:
                     [[maybe_unused]] Array<OneD, NekDouble> &output2,
                     [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
     {
-        if (m_isPadded)
-        {
-            // copy into padded vector
-            Vmath::Vcopy(m_nIn, input, 1, m_input, 1);
-            // call op
-            (*m_oper)(m_input, m_output);
-            // copy out of padded vector
-            Vmath::Vcopy(m_nOut, m_output, 1, output, 1);
-        }
-        else
-        {
-            (*m_oper)(input, output);
-        }
+        (*m_oper)(input, output);
     }
 
     void operator()([[maybe_unused]] int dir,
@@ -219,9 +207,9 @@ private:
         vector<StdRegions::StdExpansionSharedPtr> pCollExp,
         CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors)
         : Operator(pCollExp, pGeomData, factors), IProductWRTBase_Helper(),
-          MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetTotPoints(),
-                                pCollExp[0]->GetStdExp()->GetNcoeffs(),
-                                pCollExp.size())
+          MatrixFreeBase(pCollExp[0]->GetStdExp()->GetTotPoints(),
+                         pCollExp[0]->GetStdExp()->GetNcoeffs(),
+                         pCollExp.size())
     {
         // Basis vector
         const auto dim = pCollExp[0]->GetStdExp()->GetShapeDimension();
@@ -238,7 +226,7 @@ private:
         std::string op_string = "IProduct";
         op_string += MatrixFree::GetOpstring(shapeType, m_isDeformed);
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
-            op_string, basis, m_nElmtPad);
+            op_string, basis, pCollExp.size());
 
         // Set Jacobian
         oper->SetJac(pGeomData->GetJacInterLeave(pCollExp, m_nElmtPad));
diff --git a/library/Collections/IProductWRTDerivBase.cpp b/library/Collections/IProductWRTDerivBase.cpp
index 83244f9cd24ec2e2f28b6d8df76261c0ec81f38e..534b85eb262003f78dbf05d6439f9a6deed86705 100644
--- a/library/Collections/IProductWRTDerivBase.cpp
+++ b/library/Collections/IProductWRTDerivBase.cpp
@@ -271,7 +271,7 @@ OperatorKey IProductWRTDerivBase_StdMat::m_typeArr[] = {
  * @brief Inner product operator using operator using matrix free operators.
  */
 class IProductWRTDerivBase_MatrixFree final : virtual public Operator,
-                                              MatrixFreeMultiInOneOut,
+                                              MatrixFreeBase,
                                               IProductWRTDerivBase_Helper
 {
 public:
@@ -286,66 +286,32 @@ public:
                     [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
     {
         Array<OneD, NekDouble> output;
+        Array<OneD, Array<OneD, NekDouble>> input(m_coordim);
 
-        if (m_isPadded)
+        // wire the input/output references to the entry arrays by coordinate dimension
+        switch (m_coordim)
         {
-            // copy into padded vector
-            switch (m_coordim)
-            {
-                case 1:
-                    output = entry1;
-                    Vmath::Vcopy(m_nIn, entry0, 1, m_input[0], 1);
-                    break;
-                case 2:
-                    output = entry2;
-                    Vmath::Vcopy(m_nIn, entry0, 1, m_input[0], 1);
-                    Vmath::Vcopy(m_nIn, entry1, 1, m_input[1], 1);
-                    break;
-                case 3:
-                    output = entry3;
-                    Vmath::Vcopy(m_nIn, entry0, 1, m_input[0], 1);
-                    Vmath::Vcopy(m_nIn, entry1, 1, m_input[1], 1);
-                    Vmath::Vcopy(m_nIn, entry2, 1, m_input[2], 1);
-                    break;
-                default:
-                    NEKERROR(ErrorUtil::efatal, "m_coordim value not valid");
-                    break;
-            }
-
-            // call op
-            (*m_oper)(m_input, m_output);
-            // copy out of padded vector
-            Vmath::Vcopy(m_nOut, m_output, 1, output, 1);
+            case 1:
+                input[0] = entry0;
+                output   = entry1;
+                break;
+            case 2:
+                input[0] = entry0;
+                input[1] = entry1;
+                output   = entry2;
+                break;
+            case 3:
+                input[0] = entry0;
+                input[1] = entry1;
+                input[2] = entry2;
+                output   = entry3;
+                break;
+            default:
+                NEKERROR(ErrorUtil::efatal, "coordim not valid");
+                break;
         }
-        else
-        {
-            Array<OneD, Array<OneD, NekDouble>> input{m_coordim};
 
-            // copy into padded vector
-            switch (m_coordim)
-            {
-                case 1:
-                    output   = entry1;
-                    input[0] = entry0;
-                    break;
-                case 2:
-                    output   = entry2;
-                    input[0] = entry0;
-                    input[1] = entry1;
-                    break;
-                case 3:
-                    output   = entry3;
-                    input[0] = entry0;
-                    input[1] = entry1;
-                    input[2] = entry2;
-                    break;
-                default:
-                    NEKERROR(ErrorUtil::efatal, "coordim not valid");
-                    break;
-            }
-
-            (*m_oper)(input, output);
-        }
+        (*m_oper)(input, output);
     }
 
     void operator()([[maybe_unused]] int dir,
@@ -358,19 +324,19 @@ public:
 
 private:
     std::shared_ptr<MatrixFree::IProductWRTDerivBase> m_oper;
+    int m_coordim;
 
     IProductWRTDerivBase_MatrixFree(
         vector<StdRegions::StdExpansionSharedPtr> pCollExp,
         CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors)
         : Operator(pCollExp, pGeomData, factors),
-          MatrixFreeMultiInOneOut(pCollExp[0]->GetCoordim(),
-                                  pCollExp[0]->GetStdExp()->GetTotPoints(),
-                                  pCollExp[0]->GetStdExp()->GetNcoeffs(),
-                                  pCollExp.size()),
+          MatrixFreeBase(pCollExp[0]->GetStdExp()->GetTotPoints(),
+                         pCollExp[0]->GetStdExp()->GetNcoeffs(),
+                         pCollExp.size()),
           IProductWRTDerivBase_Helper()
     {
-        // Check if deformed
         const auto dim = pCollExp[0]->GetStdExp()->GetShapeDimension();
+        m_coordim      = pCollExp[0]->GetCoordim();
 
         // Basis vector
         std::vector<LibUtilities::BasisSharedPtr> basis(dim);
@@ -386,7 +352,7 @@ private:
         std::string op_string = "IProductWRTDerivBase";
         op_string += MatrixFree::GetOpstring(shapeType, m_isDeformed);
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
-            op_string, basis, m_nElmtPad);
+            op_string, basis, pCollExp.size());
 
         // Set Jacobian
         oper->SetJac(pGeomData->GetJacInterLeave(pCollExp, m_nElmtPad));
diff --git a/library/Collections/LinearAdvectionDiffusionReaction.cpp b/library/Collections/LinearAdvectionDiffusionReaction.cpp
index 63be926dd62d762ef300efd6d12b467a738063ea..cc917c236adf401bc9e2481cc789f71c0215c3fc 100644
--- a/library/Collections/LinearAdvectionDiffusionReaction.cpp
+++ b/library/Collections/LinearAdvectionDiffusionReaction.cpp
@@ -548,7 +548,7 @@ OperatorKey LinearAdvectionDiffusionReaction_IterPerExp::m_typeArr[] = {
  */
 class LinearAdvectionDiffusionReaction_MatrixFree final
     : virtual public Operator,
-      MatrixFreeOneInOneOut,
+      MatrixFreeBase,
       LinearAdvectionDiffusionReaction_Helper
 {
 public:
@@ -562,17 +562,7 @@ public:
                     [[maybe_unused]] Array<OneD, NekDouble> &output2,
                     [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
     {
-        if (m_isPadded)
-        {
-            // copy into padded vector
-            Vmath::Vcopy(m_nmtot, input, 1, m_input, 1);
-            (*m_oper)(m_input, m_output);
-            Vmath::Vcopy(m_nmtot, m_output, 1, output0, 1);
-        }
-        else
-        {
-            (*m_oper)(input, output0);
-        }
+        (*m_oper)(input, output0);
     }
 
     void operator()([[maybe_unused]] int dir,
@@ -746,9 +736,9 @@ private:
         vector<StdRegions::StdExpansionSharedPtr> pCollExp,
         CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors)
         : Operator(pCollExp, pGeomData, factors),
-          MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetNcoeffs(),
-                                pCollExp[0]->GetStdExp()->GetNcoeffs(),
-                                pCollExp.size()),
+          MatrixFreeBase(pCollExp[0]->GetStdExp()->GetNcoeffs(),
+                         pCollExp[0]->GetStdExp()->GetNcoeffs(),
+                         pCollExp.size()),
           LinearAdvectionDiffusionReaction_Helper()
     {
         m_nmtot = m_numElmt * pCollExp[0]->GetStdExp()->GetNcoeffs();
@@ -770,7 +760,7 @@ private:
         std::string op_string = "LinearAdvectionDiffusionReaction";
         op_string += MatrixFree::GetOpstring(shapeType, m_isDeformed);
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
-            op_string, basis, m_nElmtPad);
+            op_string, basis, pCollExp.size());
 
         // Get N quadpoints with padding
         m_nqtot = m_numElmt * pCollExp[0]->GetStdExp()->GetTotPoints();
diff --git a/library/Collections/MatrixFreeBase.h b/library/Collections/MatrixFreeBase.h
index d3031f3fd64db105e2cddb3f4d1d8e2782c1dd5b..b12299e59f7601ac5e3196113c427dc6b0648b15 100644
--- a/library/Collections/MatrixFreeBase.h
+++ b/library/Collections/MatrixFreeBase.h
@@ -50,56 +50,6 @@ public:
                    const unsigned int nCollSize)
         : m_nIn(nIn * nCollSize), m_nOut(nOut * nCollSize)
     {
-    }
-
-protected:
-    /// flag for padding
-    bool m_isPadded{false};
-    ///  size after padding
-    unsigned int m_nElmtPad;
-    unsigned int m_nIn;
-    unsigned int m_nOut;
-};
-
-class MatrixFreeOneInOneOut : protected MatrixFreeBase
-{
-public:
-    /// Constructor
-    MatrixFreeOneInOneOut(const unsigned int nIn, const unsigned int nOut,
-                          const unsigned int nCollSize)
-        : MatrixFreeBase(nIn, nOut, nCollSize)
-    {
-        // Padding if needed
-        using vec_t           = tinysimd::simd<NekDouble>;
-        const auto nElmtNoPad = nCollSize;
-        m_nElmtPad            = nElmtNoPad;
-
-        if (nElmtNoPad % vec_t::width != 0)
-        {
-            m_isPadded = true;
-            m_nElmtPad =
-                nElmtNoPad + vec_t::width - (nElmtNoPad % vec_t::width);
-            m_input  = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0};
-            m_output = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
-        }
-    }
-
-protected:
-    /// padded input/output vectors
-    Array<OneD, NekDouble> m_input, m_output;
-};
-
-class MatrixFreeMultiInOneOut : protected MatrixFreeBase
-{
-public:
-    /// Constructor
-    MatrixFreeMultiInOneOut(const unsigned int coordim, const unsigned int nIn,
-                            const unsigned int nOut,
-                            const unsigned int nCollSize)
-        : MatrixFreeBase(nIn, nOut, nCollSize)
-    {
-        m_coordim = coordim;
-
         // Padding if needed
         using vec_t           = tinysimd::simd<NekDouble>;
         const auto nElmtNoPad = nCollSize;
@@ -110,74 +60,19 @@ public:
             m_isPadded = true;
             m_nElmtPad =
                 nElmtNoPad + vec_t::width - (nElmtNoPad % vec_t::width);
-
-            m_input    = Array<OneD, Array<OneD, NekDouble>>(m_coordim);
-            m_input[0] = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0};
-            if (m_coordim == 2)
-            {
-                m_input[1] = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0};
-            }
-            else if (m_coordim == 3)
-            {
-                m_input[1] = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0};
-                m_input[2] = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0};
-            }
-            m_output = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
         }
     }
 
 protected:
-    /// coordinates dimension
-    unsigned short m_coordim;
-    /// padded input/output vectors
-    Array<OneD, Array<OneD, NekDouble>> m_input;
-    Array<OneD, NekDouble> m_output;
+    /// flag for padding
+    bool m_isPadded{false};
+    /// size after padding
+    unsigned int m_nElmtPad;
+    /// actual size of input array
+    unsigned int m_nIn;
+    /// actual size of output array
+    unsigned int m_nOut;
 };
 
-class MatrixFreeOneInMultiOut : protected MatrixFreeBase
-{
-public:
-    /// Constructor
-    MatrixFreeOneInMultiOut(const unsigned int coordim, const unsigned int nIn,
-                            const unsigned int nOut,
-                            const unsigned int nCollSize)
-        : MatrixFreeBase(nIn, nOut, nCollSize)
-    {
-        m_coordim = coordim;
-
-        // Padding if needed
-        using vec_t           = tinysimd::simd<NekDouble>;
-        const auto nElmtNoPad = nCollSize;
-        m_nElmtPad            = nElmtNoPad;
-
-        if (nElmtNoPad % vec_t::width != 0)
-        {
-            m_isPadded = true;
-            m_nElmtPad =
-                nElmtNoPad + vec_t::width - (nElmtNoPad % vec_t::width);
-
-            m_input = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0};
-
-            m_output    = Array<OneD, Array<OneD, NekDouble>>(m_coordim);
-            m_output[0] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
-            if (m_coordim == 2)
-            {
-                m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
-            }
-            else if (m_coordim == 3)
-            {
-                m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
-                m_output[2] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
-            }
-        }
-    }
-
-protected:
-    /// coordinates dimension
-    unsigned short m_coordim;
-    /// padded input/output vectors
-    Array<OneD, NekDouble> m_input;
-    Array<OneD, Array<OneD, NekDouble>> m_output;
-};
 } // namespace Nektar::Collections
 #endif // NEKTAR_LIBRARY_COLLECTIONS_MATRIXFREEBASE_H
diff --git a/library/Collections/PhysDeriv.cpp b/library/Collections/PhysDeriv.cpp
index 6d6504e1bcaf72b9dd1713ee76c9d21b93b365d2..7c06af80516ec1f2a7f8f2241601b6a8133d83c1 100644
--- a/library/Collections/PhysDeriv.cpp
+++ b/library/Collections/PhysDeriv.cpp
@@ -267,7 +267,7 @@ OperatorKey PhysDeriv_StdMat::m_typeArr[] = {
  * @brief Phys deriv operator using matrix free operators.
  */
 class PhysDeriv_MatrixFree final : virtual public Operator,
-                                   MatrixFreeOneInMultiOut,
+                                   MatrixFreeBase,
                                    virtual public PhysDeriv_Helper
 {
 public:
@@ -281,87 +281,69 @@ public:
                     Array<OneD, NekDouble> &output2,
                     [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
     {
-        if (m_isPadded)
-        {
-            // copy into padded vector
-            Vmath::Vcopy(m_nIn, input, 1, m_input, 1);
-            (*m_oper)(m_input, m_output);
-        }
-        else
-        {
-            (*m_oper)(input, m_output);
-        }
-
+        Array<OneD, Array<OneD, NekDouble>> output(m_coordim);
         // currently using temporary local temporary space for output
         // to allow for other operator call below which is
         // directionally dependent
         switch (m_coordim)
         {
             case 1:
-                Vmath::Vcopy(m_nOut, m_output[0], 1, output0, 1);
+                output[0] = output0;
                 break;
             case 2:
-                Vmath::Vcopy(m_nOut, m_output[0], 1, output0, 1);
-                Vmath::Vcopy(m_nOut, m_output[1], 1, output1, 1);
+                output[0] = output0;
+                output[1] = output1;
                 break;
             case 3:
-                Vmath::Vcopy(m_nOut, m_output[0], 1, output0, 1);
-                Vmath::Vcopy(m_nOut, m_output[1], 1, output1, 1);
-                Vmath::Vcopy(m_nOut, m_output[2], 1, output2, 1);
+                output[0] = output0;
+                output[1] = output1;
+                output[2] = output2;
                 break;
             default:
                 NEKERROR(ErrorUtil::efatal, "Unknown coordinate dimension");
                 break;
         }
+        (*m_oper)(input, output);
     }
 
     void operator()(int dir, const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output,
                     [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
     {
-        if (m_isPadded)
-        {
-            // copy into padded vector
-            Vmath::Vcopy(m_nIn, input, 1, m_input, 1);
-            (*m_oper)(m_input, m_output);
-        }
-        else
-        {
-            (*m_oper)(input, m_output);
-        }
+        (*m_oper)(input, m_output);
         Vmath::Vcopy(m_nOut, m_output[dir], 1, output, 1);
     }
 
 private:
     std::shared_ptr<MatrixFree::PhysDeriv> m_oper;
+    int m_coordim;
+    Array<OneD, Array<OneD, NekDouble>> m_output;
 
     PhysDeriv_MatrixFree(vector<StdRegions::StdExpansionSharedPtr> pCollExp,
                          CoalescedGeomDataSharedPtr pGeomData,
                          StdRegions::FactorMap factors)
         : Operator(pCollExp, pGeomData, factors), PhysDeriv_Helper(),
-          MatrixFreeOneInMultiOut(pCollExp[0]->GetCoordim(),
-                                  pCollExp[0]->GetStdExp()->GetTotPoints(),
-                                  pCollExp[0]->GetStdExp()->GetTotPoints(),
-                                  pCollExp.size())
+          MatrixFreeBase(pCollExp[0]->GetStdExp()->GetTotPoints(),
+                         pCollExp[0]->GetStdExp()->GetTotPoints(),
+                         pCollExp.size())
     {
         // Check if deformed
         bool deformed{pGeomData->IsDeformed(pCollExp)};
         const auto dim = pCollExp[0]->GetStdExp()->GetShapeDimension();
 
-        if (m_isPadded == false) // declare local space non-padded case
+        // m_output is temporary space used only by operator()(dir, in, out)
+        m_coordim   = pCollExp[0]->GetCoordim();
+        int nOut    = pCollExp[0]->GetStdExp()->GetTotPoints();
+        m_output    = Array<OneD, Array<OneD, NekDouble>>(m_coordim);
+        m_output[0] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
+        if (m_coordim == 2)
         {
-            int nOut    = pCollExp[0]->GetStdExp()->GetTotPoints();
-            m_output    = Array<OneD, Array<OneD, NekDouble>>(m_coordim);
-            m_output[0] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
-            if (m_coordim == 2)
-            {
-                m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
-            }
-            else if (m_coordim == 3)
-            {
-                m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
-                m_output[2] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
-            }
+            m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
+        }
+        else if (m_coordim == 3)
+        {
+            m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
+            m_output[2] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0};
         }
 
         // Basis vector.
@@ -378,7 +360,7 @@ private:
         std::string op_string = "PhysDeriv";
         op_string += MatrixFree::GetOpstring(shapeType, deformed);
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
-            op_string, basis, m_nElmtPad);
+            op_string, basis, pCollExp.size());
 
         // Set derivative factors
         oper->SetDF(pGeomData->GetDerivFactorsInterLeave(pCollExp, m_nElmtPad));
diff --git a/library/Collections/PhysInterp1DScaled.cpp b/library/Collections/PhysInterp1DScaled.cpp
index 3399015c4cf5dca8be7190b976fdfaa9268c4728..1f7c4bdb189ef03a7f32dfba6155dcf6bbf19175 100644
--- a/library/Collections/PhysInterp1DScaled.cpp
+++ b/library/Collections/PhysInterp1DScaled.cpp
@@ -182,7 +182,7 @@ private:
  */
 
 class PhysInterp1DScaled_MatrixFree final : virtual public Operator,
-                                            MatrixFreeOneInOneOut,
+                                            MatrixFreeBase,
                                             PhysInterp1DScaled_Helper
 {
 public:
@@ -198,19 +198,7 @@ public:
                     [[maybe_unused]] Array<OneD, NekDouble> &output2,
                     [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
     {
-        if (m_isPadded)
-        {
-            // copy into padded vector
-            Vmath::Vcopy(m_nIn, input, 1, m_input, 1);
-            // call op
-            (*m_oper)(m_input, m_output);
-            // copy out of padded vector
-            Vmath::Vcopy(m_nOut, m_output, 1, output0, 1);
-        }
-        else
-        {
-            (*m_oper)(input, output0);
-        }
+        (*m_oper)(input, output0);
     }
 
     void operator()([[maybe_unused]] int dir,
@@ -228,15 +216,15 @@ public:
     }
 
 private:
-    std::shared_ptr<MatrixFree::PhysInterp1DScaled> m_oper;
+    std::shared_ptr<MatrixFree::BwdTrans> m_oper;
 
     PhysInterp1DScaled_MatrixFree(
         vector<StdRegions::StdExpansionSharedPtr> pCollExp,
         CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors)
         : Operator(pCollExp, pGeomData, factors),
-          MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetNcoeffs(),
-                                pCollExp[0]->GetStdExp()->GetTotPoints(),
-                                pCollExp.size()),
+          MatrixFreeBase(pCollExp[0]->GetStdExp()->GetNcoeffs(),
+                         pCollExp[0]->GetStdExp()->GetTotPoints(),
+                         pCollExp.size()),
           PhysInterp1DScaled_Helper()
     {
         // Basis vector.
@@ -251,13 +239,12 @@ private:
         auto shapeType = pCollExp[0]->GetStdExp()->DetShapeType();
 
         // Generate operator string and create operator.
-        std::string op_string = "PhysInterp1DScaled";
+        std::string op_string = "BwdTrans";
         op_string += MatrixFree::GetOpstring(shapeType, false);
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
-            op_string, basis, m_nElmtPad);
+            op_string, basis, pCollExp.size());
 
-        m_oper =
-            std::dynamic_pointer_cast<MatrixFree::PhysInterp1DScaled>(oper);
+        m_oper = std::dynamic_pointer_cast<MatrixFree::BwdTrans>(oper);
         ASSERTL0(m_oper, "Failed to cast pointer.");
     }
 };
diff --git a/library/LibUtilities/SimdLib/avx2.hpp b/library/LibUtilities/SimdLib/avx2.hpp
index 25821a2073ec69aea1ad06bccfef7d221808e474..95ec832fdfeb0518563e9485093ffe10b4b2a876 100644
--- a/library/LibUtilities/SimdLib/avx2.hpp
+++ b/library/LibUtilities/SimdLib/avx2.hpp
@@ -605,6 +605,21 @@ inline avx2Double4 log(avx2Double4 in)
 #endif
 }
 
+inline void load_unalign_interleave(
+    const double *in, const std::uint32_t dataLen,
+    std::vector<avx2Double4, allocator<avx2Double4>> &out)
+{
+    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        tmp[0] = in[i];
+        tmp[1] = in[i + dataLen];
+        tmp[2] = in[i + 2 * dataLen];
+        tmp[3] = in[i + 3 * dataLen];
+        out[i].load(tmp);
+    }
+}
+
 inline void load_interleave(
     const double *in, std::uint32_t dataLen,
     std::vector<avx2Double4, allocator<avx2Double4>> &out)
@@ -640,6 +655,21 @@ inline void load_interleave(
     }
 }
 
+inline void deinterleave_unalign_store(
+    const std::vector<avx2Double4, allocator<avx2Double4>> &in,
+    const std::uint32_t dataLen, double *out)
+{
+    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        in[i].store(tmp);
+        out[i]               = tmp[0];
+        out[i + dataLen]     = tmp[1];
+        out[i + 2 * dataLen] = tmp[2];
+        out[i + 3 * dataLen] = tmp[3];
+    }
+}
+
 inline void deinterleave_store(
     const std::vector<avx2Double4, allocator<avx2Double4>> &in,
     std::uint32_t dataLen, double *out)
@@ -862,6 +892,25 @@ inline avx2Float8 log(avx2Float8 in)
     return ret;
 }
 
+inline void load_unalign_interleave(
+    const float *in, const std::uint32_t dataLen,
+    std::vector<avx2Float8, allocator<avx2Float8>> &out)
+{
+    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        tmp[0] = in[i];
+        tmp[1] = in[i + dataLen];
+        tmp[2] = in[i + 2 * dataLen];
+        tmp[3] = in[i + 3 * dataLen];
+        tmp[4] = in[i + 4 * dataLen];
+        tmp[5] = in[i + 5 * dataLen];
+        tmp[6] = in[i + 6 * dataLen];
+        tmp[7] = in[i + 7 * dataLen];
+        out[i].load(tmp);
+    }
+}
+
 inline void load_interleave(const float *in, std::uint32_t dataLen,
                             std::vector<avx2Float8, allocator<avx2Float8>> &out)
 {
@@ -898,6 +947,25 @@ inline void load_interleave(const float *in, std::uint32_t dataLen,
     }
 }
 
+inline void deinterleave_unalign_store(
+    const std::vector<avx2Float8, allocator<avx2Float8>> &in,
+    const std::uint32_t dataLen, float *out) // float*, matching deinterleave_store below
+{
+    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp; // per-vector staging buffer
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        in[i].store(tmp);
+        out[i]               = tmp[0];
+        out[i + dataLen]     = tmp[1];
+        out[i + 2 * dataLen] = tmp[2];
+        out[i + 3 * dataLen] = tmp[3];
+        out[i + 4 * dataLen] = tmp[4];
+        out[i + 5 * dataLen] = tmp[5];
+        out[i + 6 * dataLen] = tmp[6];
+        out[i + 7 * dataLen] = tmp[7];
+    }
+}
+
 inline void deinterleave_store(
     const std::vector<avx2Float8, allocator<avx2Float8>> &in,
     std::uint32_t dataLen, float *out)
diff --git a/library/LibUtilities/SimdLib/avx512.hpp b/library/LibUtilities/SimdLib/avx512.hpp
index 518681ef71b0b92631da8c86d4e6415b23cbb841..d40c8f4959786371e23c4aad38064c9e98a67ca9 100644
--- a/library/LibUtilities/SimdLib/avx512.hpp
+++ b/library/LibUtilities/SimdLib/avx512.hpp
@@ -601,6 +601,25 @@ inline avx512Double8 log(avx512Double8 in)
 #endif
 }
 
+inline void load_unalign_interleave(
+    const double *in, const std::uint32_t dataLen,
+    std::vector<avx512Double8, allocator<avx512Double8>> &out)
+{
+    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp; // per-vector staging buffer
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        tmp[0] = in[i]; // scalar gather (stride dataLen) so 'in' may be unaligned
+        tmp[1] = in[i + dataLen];
+        tmp[2] = in[i + 2 * dataLen];
+        tmp[3] = in[i + 3 * dataLen];
+        tmp[4] = in[i + 4 * dataLen];
+        tmp[5] = in[i + 5 * dataLen];
+        tmp[6] = in[i + 6 * dataLen];
+        tmp[7] = in[i + 7 * dataLen];
+        out[i].load(tmp); // aligned load from the staging buffer
+    }
+}
+
 inline void load_interleave(
     const double *in, std::uint32_t dataLen,
     std::vector<avx512Double8, allocator<avx512Double8>> &out)
@@ -640,6 +659,25 @@ inline void load_interleave(
     }
 }
 
+inline void deinterleave_unalign_store(
+    const std::vector<avx512Double8, allocator<avx512Double8>> &in,
+    const std::uint32_t dataLen, double *out)
+{
+    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp; // per-vector staging buffer
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        in[i].store(tmp); // aligned store to tmp, then scalar scatter: 'out' may be unaligned
+        out[i]               = tmp[0];
+        out[i + dataLen]     = tmp[1];
+        out[i + 2 * dataLen] = tmp[2];
+        out[i + 3 * dataLen] = tmp[3];
+        out[i + 4 * dataLen] = tmp[4];
+        out[i + 5 * dataLen] = tmp[5];
+        out[i + 6 * dataLen] = tmp[6];
+        out[i + 7 * dataLen] = tmp[7];
+    }
+}
+
 inline void deinterleave_store(
     const std::vector<avx512Double8, allocator<avx512Double8>> &in,
     std::uint32_t dataLen, double *out)
@@ -864,6 +902,33 @@ inline avx512Float16 log(avx512Float16 in)
 #endif
 }
 
+inline void load_unalign_interleave(
+    const float *in, const std::uint32_t dataLen,
+    std::vector<avx512Float16, allocator<avx512Float16>> &out) // float*, matching load_interleave below
+{
+    alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp; // per-vector staging buffer
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        tmp[0]  = in[i];
+        tmp[1]  = in[i + dataLen];
+        tmp[2]  = in[i + 2 * dataLen];
+        tmp[3]  = in[i + 3 * dataLen];
+        tmp[4]  = in[i + 4 * dataLen];
+        tmp[5]  = in[i + 5 * dataLen];
+        tmp[6]  = in[i + 6 * dataLen];
+        tmp[7]  = in[i + 7 * dataLen];
+        tmp[8]  = in[i + 8 * dataLen];
+        tmp[9]  = in[i + 9 * dataLen];
+        tmp[10] = in[i + 10 * dataLen];
+        tmp[11] = in[i + 11 * dataLen];
+        tmp[12] = in[i + 12 * dataLen];
+        tmp[13] = in[i + 13 * dataLen];
+        tmp[14] = in[i + 14 * dataLen];
+        tmp[15] = in[i + 15 * dataLen];
+        out[i].load(tmp);
+    }
+}
+
 inline void load_interleave(
     const float *in, std::uint32_t dataLen,
     std::vector<avx512Float16, allocator<avx512Float16>> &out)
@@ -871,8 +936,22 @@ inline void load_interleave(
 
     alignas(avx512Float16::alignment)
         avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
-            0,           dataLen,     2 * dataLen, 3 * dataLen,
-            4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
+            0,
+            dataLen,
+            2 * dataLen,
+            3 * dataLen,
+            4 * dataLen,
+            5 * dataLen,
+            6 * dataLen,
+            7 * dataLen,
+            8 * dataLen,
+            9 * dataLen,
+            10 * dataLen,
+            11 * dataLen,
+            12 * dataLen,
+            13 * dataLen,
+            14 * dataLen,
+            15 * dataLen};
 
     using index_t = avx512Int16<avx512Float16::scalarIndexType>;
     index_t index0(tmp);
@@ -903,6 +982,33 @@ inline void load_interleave(
     }
 }
 
+inline void deinterleave_unalign_store(
+    const std::vector<avx512Float16, allocator<avx512Float16>> &in,
+    const std::uint32_t dataLen, float *out) // float*, matching deinterleave_store below
+{
+    alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp; // per-vector staging buffer
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        in[i].store(tmp);
+        out[i]                = tmp[0];
+        out[i + dataLen]      = tmp[1];
+        out[i + 2 * dataLen]  = tmp[2];
+        out[i + 3 * dataLen]  = tmp[3];
+        out[i + 4 * dataLen]  = tmp[4];
+        out[i + 5 * dataLen]  = tmp[5];
+        out[i + 6 * dataLen]  = tmp[6];
+        out[i + 7 * dataLen]  = tmp[7];
+        out[i + 8 * dataLen]  = tmp[8];
+        out[i + 9 * dataLen]  = tmp[9];
+        out[i + 10 * dataLen] = tmp[10];
+        out[i + 11 * dataLen] = tmp[11];
+        out[i + 12 * dataLen] = tmp[12];
+        out[i + 13 * dataLen] = tmp[13];
+        out[i + 14 * dataLen] = tmp[14];
+        out[i + 15 * dataLen] = tmp[15];
+    }
+}
+
 inline void deinterleave_store(
     const std::vector<avx512Float16, allocator<avx512Float16>> &in,
     std::uint32_t dataLen, float *out)
@@ -911,8 +1017,22 @@ inline void deinterleave_store(
 
     alignas(avx512Float16::alignment)
         avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
-            0,           dataLen,     2 * dataLen, 3 * dataLen,
-            4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
+            0,
+            dataLen,
+            2 * dataLen,
+            3 * dataLen,
+            4 * dataLen,
+            5 * dataLen,
+            6 * dataLen,
+            7 * dataLen,
+            8 * dataLen,
+            9 * dataLen,
+            10 * dataLen,
+            11 * dataLen,
+            12 * dataLen,
+            13 * dataLen,
+            14 * dataLen,
+            15 * dataLen};
     using index_t = avx512Int16<avx512Float16::scalarIndexType>;
 
     index_t index0(tmp);
diff --git a/library/LibUtilities/SimdLib/scalar.hpp b/library/LibUtilities/SimdLib/scalar.hpp
index dc795a73ff707d718e307e97258765f84cdbfbc4..8eadd62216885e8a68f30193cdf80666a56b6cac 100644
--- a/library/LibUtilities/SimdLib/scalar.hpp
+++ b/library/LibUtilities/SimdLib/scalar.hpp
@@ -306,7 +306,18 @@ template <typename T> inline scalarT<T> log(scalarT<T> in)
 }
 
 template <typename T>
-inline void load_interleave(const T *in, size_t dataLen,
+inline void load_unalign_interleave(
+    const T *in, const size_t dataLen,
+    std::vector<scalarT<T>, allocator<scalarT<T>>> &out)
+{
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        out[i] = in[i]; // scalar backend: "interleaving" reduces to an element-wise copy
+    }
+}
+
+template <typename T>
+inline void load_interleave(const T *in, const size_t dataLen,
                             std::vector<scalarT<T>, allocator<scalarT<T>>> &out)
 {
     for (size_t i = 0; i < dataLen; ++i)
@@ -315,10 +326,21 @@ inline void load_interleave(const T *in, size_t dataLen,
     }
 }
 
+template <typename T>
+inline void deinterleave_unalign_store(
+    const std::vector<scalarT<T>, allocator<scalarT<T>>> &in,
+    const size_t dataLen, T *out)
+{
+    for (size_t i = 0; i < dataLen; ++i)
+    {
+        out[i] = in[i]._data; // unwrap the scalarT value; no deinterleaving needed
+    }
+}
+
 template <typename T>
 inline void deinterleave_store(
-    const std::vector<scalarT<T>, allocator<scalarT<T>>> &in, size_t dataLen,
-    T *out)
+    const std::vector<scalarT<T>, allocator<scalarT<T>>> &in,
+    const size_t dataLen, T *out)
 {
     for (size_t i = 0; i < dataLen; ++i)
     {
diff --git a/library/MatrixFreeOps/BwdTrans.h b/library/MatrixFreeOps/BwdTrans.h
index 51b1d73a3c3618633b0eda60308322a9cf210c11..e81cb44ae58091c1f1649781372ed98bdaa40082 100644
--- a/library/MatrixFreeOps/BwdTrans.h
+++ b/library/MatrixFreeOps/BwdTrans.h
@@ -126,20 +126,44 @@ struct BwdTransTemplate
 
         std::vector<vec_t, allocator<vec_t>> tmpIn(m_nmTot), tmpOut(nqTot);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
             BwdTrans1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, this->m_bdata[0],
                                          tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
+            // deinterleave_store(tmpOut, nqTot, locField);
+            // std::copy(locField, locField + nqBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, nqTot, outptr);
 
             inptr += nmBlocks;
             outptr += nqBlocks;
         }
+        // last (padded) block: stage through aligned locField, copying only
+        // the valid (un-padded) portion of input/output
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
+
+            BwdTrans1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, this->m_bdata[0],
+                                         tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nqTot, locField);
+            actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(locField, locField + actualSize, outptr);
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -159,20 +183,41 @@ struct BwdTransTemplate
 
         std::vector<vec_t, allocator<vec_t>> tmpIn(m_nmTot), tmpOut(nqTot);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
             BwdTrans1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, this->m_bdata[0],
                                          tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
+            // deinterleave_store(tmpOut, nqTot, locField);
+            // std::copy(locField, locField + nqBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, nqTot, outptr);
 
             inptr += nmBlocks;
             outptr += nqBlocks;
         }
+        // last (padded) block: stage through aligned locField, copying only
+        // the valid (un-padded) portion of input/output
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
+
+            BwdTrans1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, this->m_bdata[0],
+                                         tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nqTot, locField);
+            actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(locField, locField + actualSize, outptr);
+        }
     }
 
 #elif defined(SHAPE_DIMENSION_2D)
@@ -203,21 +248,46 @@ struct BwdTransTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(m_nmTot),
             tmpOut(nqTot);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
             BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
                                          this->m_bdata[0], this->m_bdata[1],
                                          wsp0, tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
+            // deinterleave_store(tmpOut, nqTot, locField);
+            // std::copy(locField, locField + nqBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, nqTot, outptr);
 
             inptr += nmBlocks;
             outptr += nqBlocks;
         }
+        // last (padded) block: stage through aligned locField, copying only
+        // the valid (un-padded) portion of input/output
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
+
+            BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
+                                         this->m_bdata[0], this->m_bdata[1],
+                                         wsp0, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nqTot, locField);
+            actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(locField, locField + actualSize, outptr);
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -241,21 +311,43 @@ struct BwdTransTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(m_nmTot),
             tmpOut(nqTot);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
             BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
                                          this->m_bdata[0], this->m_bdata[1],
                                          wsp0, tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
+            // deinterleave_store(tmpOut, nqTot, locField);
+            // std::copy(locField, locField + nqBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, nqTot, outptr);
 
             inptr += nmBlocks;
             outptr += nqBlocks;
         }
+        // last (padded) block: stage through aligned locField, copying only
+        // the valid (un-padded) portion of input/output
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
+
+            BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
+                                         this->m_bdata[0], this->m_bdata[1],
+                                         wsp0, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nqTot, locField);
+            actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(locField, locField + actualSize, outptr);
+        }
     }
 
 #elif defined(SHAPE_DIMENSION_3D)
@@ -290,21 +382,46 @@ struct BwdTransTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
             tmpIn(m_nmTot), tmpOut(nqTot);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
             BwdTrans3DKernel<SHAPE_TYPE>(
                 nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
                 this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
+            // deinterleave_store(tmpOut, nqTot, locField);
+            // std::copy(locField, locField + nqBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, nqTot, outptr);
 
             inptr += nmBlocks;
             outptr += nqBlocks;
         }
+        // last (padded) block: stage through aligned locField, copying only
+        // the valid (un-padded) portion of input/output
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
+
+            BwdTrans3DKernel<SHAPE_TYPE>(
+                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
+                this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nqTot, locField);
+            actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(locField, locField + actualSize, outptr);
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -329,21 +446,43 @@ struct BwdTransTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
             tmpIn(m_nmTot), tmpOut(nqTot);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
             BwdTrans3DKernel<SHAPE_TYPE>(
                 nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
                 this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
+            // deinterleave_store(tmpOut, nqTot, locField);
+            // std::copy(locField, locField + nqBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, nqTot, outptr);
 
             inptr += nmBlocks;
             outptr += nqBlocks;
         }
+        // last (padded) block: stage through aligned locField, copying only
+        // the valid (un-padded) portion of input/output
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
+
+            BwdTrans3DKernel<SHAPE_TYPE>(
+                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
+                this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nqTot, locField);
+            actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(locField, locField + actualSize, outptr);
+        }
     }
 
 #endif // SHAPE_DIMENSION
diff --git a/library/MatrixFreeOps/CMakeLists.txt b/library/MatrixFreeOps/CMakeLists.txt
index 267ece36c04cbade90bb671a1881e264591c6011..7dc170f643320e3f4cbb0a9d780d240669da22ee 100644
--- a/library/MatrixFreeOps/CMakeLists.txt
+++ b/library/MatrixFreeOps/CMakeLists.txt
@@ -16,7 +16,7 @@ CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/SwitchLimits.h.in
 # so to reduce compile time timeout failures due to the large number
 # of template impementations.
 
-SET(OPERATORS BwdTrans PhysDeriv Helmholtz LinearAdvectionDiffusionReaction IProduct IProductWRTDerivBase PhysInterp1DScaled)
+SET(OPERATORS BwdTrans PhysDeriv Helmholtz LinearAdvectionDiffusionReaction IProduct IProductWRTDerivBase)
 SET(SHAPES Seg Tri Quad Hex Tet Prism Pyr )
 SET(DIMENSIONS 1 2 2 3 3 3 3 )
 
diff --git a/library/MatrixFreeOps/Helmholtz.h b/library/MatrixFreeOps/Helmholtz.h
index 0f98083234d878839a5956ef2e5116f9060e398b..6d1ec4bfa01b64b27d0cdce83bb0570263a44e98 100644
--- a/library/MatrixFreeOps/Helmholtz.h
+++ b/library/MatrixFreeOps/Helmholtz.h
@@ -187,69 +187,58 @@ struct HelmholtzTemplate
         std::vector<vec_t, allocator<vec_t>> bwd(nqTot), deriv0(nqTot),
             deriv1(nqTot);
 
-        const vec_t *jac_ptr = {};
-        const vec_t *df_ptr  = {};
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nmBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
-        // Get size of derivative factor block
-        auto dfSize = ndf;
-        if (DEFORMED)
-        {
-            dfSize *= nqTot;
-        }
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
 
-        for (int e = 0; e < this->m_nBlocks; e++)
+        const auto dfSize = ndf * nqTot;
+
+        for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
-            df_ptr = &((*this->m_df)[e * dfSize]);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, tmpIn,
+                          tmpOut, wsp0, bwd, deriv0, deriv1);
 
-            if (!DEFORMED)
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nmBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                jac_ptr += nqTot;
+                df_ptr += dfSize;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                ++jac_ptr;
+                df_ptr += ndf;
             }
+        }
+        // last (padded) block: stage through aligned locField, copying only
+        // the valid (un-padded) portion of input/output
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
 
-            // Step 1: BwdTrans
-            BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
-                                         this->m_bdata[0], this->m_bdata[1],
-                                         wsp0, bwd);
-
-            // Step 2: inner product for mass matrix operation
-            IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut, m_lambda);
-
-            // Step 3: take derivatives in collapsed coordinate space
-            PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1],
-                                    deriv0, deriv1);
-
-            DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff,
-                this->m_isVarDiff, this->m_varD00, this->m_varD01,
-                this->m_varD11, df_ptr, this->m_h0, this->m_h1, deriv0, deriv1);
-
-            // Step 4: Apply Laplacian metrics & inner product
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
-
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
+            fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, tmpIn,
+                          tmpOut, wsp0, bwd, deriv0, deriv1);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -279,71 +268,99 @@ struct HelmholtzTemplate
         std::vector<vec_t, allocator<vec_t>> bwd(nqTot), deriv0(nqTot),
             deriv1(nqTot);
 
-        const vec_t *jac_ptr = {};
-        const vec_t *df_ptr  = {};
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width];
 
-        // Get size of derivative factor block
-        auto dfSize = ndf;
-        if (DEFORMED)
-        {
-            dfSize *= nqTot;
-        }
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
 
-        for (int e = 0; e < this->m_nBlocks; e++)
+        constexpr auto dfSize = ndf * nqTot;
+
+        for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
-            df_ptr = &((*this->m_df)[e * dfSize]);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
+
+            fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, tmpIn,
+                          tmpOut, wsp0, bwd, deriv0, deriv1);
 
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
-            if (!DEFORMED)
+            inptr += nmBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                jac_ptr += nqTot;
+                df_ptr += dfSize;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                ++jac_ptr;
+                df_ptr += ndf;
             }
+        }
+        // last (padded) block: stage through aligned locField, copying only
+        // the valid (un-padded) portion of input/output
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
 
-            // Step 1: BwdTrans
-            BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
-                                         this->m_bdata[0], this->m_bdata[1],
-                                         wsp0, bwd);
-
-            // Step 2: inner product for mass matrix operation
-            IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut, m_lambda);
-
-            // Step 3: take derivatives in collapsed coordinate space
-            PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1],
-                                    deriv0, deriv1);
-
-            DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff,
-                this->m_isVarDiff, this->m_varD00, this->m_varD01,
-                this->m_varD11, df_ptr, this->m_h0, this->m_h1, deriv0, deriv1);
-
-            // Step 4: Apply Laplacian metrics & inner product
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
-
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
+            fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, tmpIn,
+                          tmpOut, wsp0, bwd, deriv0, deriv1);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + actualSize, outptr);
         }
     }
 
+    NEK_FORCE_INLINE void fusedKernel2D( // fused per-block Helmholtz pipeline (BwdTrans + mass + Laplacian)
+        const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1,
+        const bool correct, const vec_t *jac_ptr, const vec_t *df_ptr,
+        const std::vector<vec_t, allocator<vec_t>> &tmpIn,
+        std::vector<vec_t, allocator<vec_t>> &tmpOut,
+        std::vector<vec_t, allocator<vec_t>> &wsp0,
+        std::vector<vec_t, allocator<vec_t>> &bwd,
+        std::vector<vec_t, allocator<vec_t>> &deriv0,
+        std::vector<vec_t, allocator<vec_t>> &deriv1)
+    {
+        // Step 1: backward transform modal coefficients (tmpIn) to physical values (bwd)
+        BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
+                                     this->m_bdata[0], this->m_bdata[1], wsp0,
+                                     bwd);
+
+        // Step 2: inner product for the mass-matrix term, scaled by m_lambda, into tmpOut
+        IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>(
+            nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0],
+            this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, tmpOut,
+            m_lambda);
+
+        // Step 3: take derivatives in collapsed coordinate space, then apply
+        PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1],
+                                deriv0, deriv1);
+
+        // diffusion coefficients (constant or variable) in place on deriv0/deriv1
+        DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>(
+            nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff,
+            this->m_isVarDiff, this->m_varD00, this->m_varD01, this->m_varD11,
+            df_ptr, this->m_h0, this->m_h1, deriv0, deriv1);
+
+        // Step 4: apply Laplacian metrics & inner products, added into tmpOut
+        IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0],
+            this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
+            tmpOut);
+
+        IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0],
+            this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
+            tmpOut);
+    }
+
 #elif defined(SHAPE_DIMENSION_3D)
 
     // Non-size based operator.
@@ -383,78 +400,60 @@ struct HelmholtzTemplate
         std::vector<vec_t, allocator<vec_t>> deriv0(nqTot), deriv1(nqTot),
             deriv2(nqTot);
 
-        const vec_t *jac_ptr = {};
-        const vec_t *df_ptr  = {};
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nmBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
-        // Get size of derivative factor block
-        auto dfSize = ndf;
-        if (DEFORMED)
-        {
-            dfSize *= nqTot;
-        }
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
 
-        for (int e = 0; e < this->m_nBlocks; e++)
+        const auto dfSize = ndf * nqTot;
+
+        for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
-            df_ptr = &((*this->m_df)[e * dfSize]);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2, bwd, deriv0,
+                          deriv1, deriv2);
 
-            if (!DEFORMED)
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nmBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                jac_ptr += nqTot;
+                df_ptr += dfSize;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                ++jac_ptr;
+                df_ptr += ndf;
             }
+        }
+        // last block
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);

-            // Step 1: BwdTrans
-            BwdTrans3DKernel<SHAPE_TYPE>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, bwd);
-
-            // // Step 2: inner product for mass matrix operation
-            IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda);
-
-            // Step 3: take derivatives in standard space
-            PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0],
-                                    this->m_D[1], this->m_D[2], deriv0, deriv1,
-                                    deriv2);
-
-            // Step 4: Apply Laplacian metrics & inner product
-            DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff,
-                this->m_isVarDiff, this->m_varD00, this->m_varD01,
-                this->m_varD11, this->m_varD02, this->m_varD12, this->m_varD22,
-                df_ptr, this->m_h0, this->m_h1, this->m_h2, this->m_h3, deriv0,
-                deriv1, deriv2);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0,
-                this->m_dbdata[0], this->m_bdata[1], this->m_bdata[2],
-                this->m_w[0], this->m_w[1], this->m_w[2], jac_ptr, wsp0, wsp1,
-                wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0],
-                this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2, bwd, deriv0,
+                          deriv1, deriv2);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -487,80 +486,111 @@ struct HelmholtzTemplate
         std::vector<vec_t, allocator<vec_t>> deriv0(nqTot), deriv1(nqTot),
             deriv2(nqTot);
 
-        const vec_t *jac_ptr = {};
-        const vec_t *df_ptr  = {};
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width];
 
-        // Get size of derivative factor block
-        auto dfSize = ndf;
-        if (DEFORMED)
-        {
-            dfSize *= nqTot;
-        }
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
 
-        for (int e = 0; e < this->m_nBlocks; e++)
+        constexpr auto dfSize = ndf * nqTot;
+
+        for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
-            df_ptr = &((*this->m_df)[e * dfSize]);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
+
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2, bwd, deriv0,
+                          deriv1, deriv2);
 
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
-            if (!DEFORMED)
+            inptr += nmBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                jac_ptr += nqTot;
+                df_ptr += dfSize;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                ++jac_ptr;
+                df_ptr += ndf;
             }
+        }
+        // last block
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);

-            // Step 1: BwdTrans
-            BwdTrans3DKernel<SHAPE_TYPE>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, bwd);
-
-            // // Step 2: inner product for mass matrix operation
-            IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda);
-
-            // Step 3: take derivatives in standard space
-            PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0],
-                                    this->m_D[1], this->m_D[2], deriv0, deriv1,
-                                    deriv2);
-
-            // Step 4: Apply Laplacian metrics & inner product
-            DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff,
-                this->m_isVarDiff, this->m_varD00, this->m_varD01,
-                this->m_varD11, this->m_varD02, this->m_varD12, this->m_varD22,
-                df_ptr, this->m_h0, this->m_h1, this->m_h2, this->m_h3, deriv0,
-                deriv1, deriv2);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0,
-                this->m_dbdata[0], this->m_bdata[1], this->m_bdata[2],
-                this->m_w[0], this->m_w[1], this->m_w[2], jac_ptr, wsp0, wsp1,
-                wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0],
-                this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2, bwd, deriv0,
+                          deriv1, deriv2);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + actualSize, outptr);
         }
     }
 
+    NEK_FORCE_INLINE void fusedKernel3D(
+        const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0,
+        const size_t nq1, const size_t nq2, const bool correct,
+        const vec_t *jac_ptr, const vec_t *df_ptr,
+        const std::vector<vec_t, allocator<vec_t>> &tmpIn,
+        std::vector<vec_t, allocator<vec_t>> &tmpOut,
+        std::vector<vec_t, allocator<vec_t>> &wsp0,
+        std::vector<vec_t, allocator<vec_t>> &wsp1,
+        std::vector<vec_t, allocator<vec_t>> &wsp2,
+        std::vector<vec_t, allocator<vec_t>> &bwd,
+        std::vector<vec_t, allocator<vec_t>> &deriv0,
+        std::vector<vec_t, allocator<vec_t>> &deriv1,
+        std::vector<vec_t, allocator<vec_t>> &deriv2)
+    {
+        // Step 1: BwdTrans
+        BwdTrans3DKernel<SHAPE_TYPE>(nm0, nm1, nm2, nq0, nq1, nq2, correct,
+                                     tmpIn, this->m_bdata[0], this->m_bdata[1],
+                                     this->m_bdata[2], wsp0, wsp1, bwd);
+
+        // Step 2: inner product for mass matrix operation
+        IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0],
+            this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda);
+
+        // Step 3: take derivatives in standard space
+        PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0], this->m_D[1],
+                                this->m_D[2], deriv0, deriv1, deriv2);
+
+        // Step 4: Apply Laplacian metrics & inner product
+        DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>(
+            nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff,
+            this->m_isVarDiff, this->m_varD00, this->m_varD01, this->m_varD11,
+            this->m_varD02, this->m_varD12, this->m_varD22, df_ptr, this->m_h0,
+            this->m_h1, this->m_h2, this->m_h3, deriv0, deriv1, deriv2);
+
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0, this->m_dbdata[0],
+            this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0],
+            this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0],
+            this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+    }
+
 #endif // SHAPE_DIMENSION
 
 private:
diff --git a/library/MatrixFreeOps/IProduct.h b/library/MatrixFreeOps/IProduct.h
index f0a7d61515f0cbef65e14c63c27a0bf7fc57bb6e..b22f8dba37c8c4e54e217da89d89b7e094a74571 100644
--- a/library/MatrixFreeOps/IProduct.h
+++ b/library/MatrixFreeOps/IProduct.h
@@ -126,32 +126,56 @@ struct IProductTemplate
 
         std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpOut(m_nmTot);
 
-        vec_t *jac_ptr;
+        vec_t *jac_ptr = &((*this->m_jac)[0]);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            if (DEFORMED)
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nqBlocks, locField);
+            // load_interleave(locField, nqTot, tmpIn);
+            load_unalign_interleave(inptr, nqTot, tmpIn);
+
+            IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nq0, tmpIn, this->m_bdata[0], this->m_w[0], jac_ptr,
+                tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nqBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                jac_ptr += nqTot;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                ++jac_ptr;
             }
-
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);

             IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>(
                 nm0, nq0, tmpIn, this->m_bdata[0], this->m_w[0], jac_ptr,
                 tmpOut);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nqBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -168,31 +192,52 @@ struct IProductTemplate
 
         std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpOut(m_nmTot);
 
-        vec_t *jac_ptr;
+        vec_t *jac_ptr = &((*this->m_jac)[0]);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            if (DEFORMED)
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nqBlocks, locField);
+            // load_interleave(locField, nqTot, tmpIn);
+            load_unalign_interleave(inptr, nqTot, tmpIn);
+
+            IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nq0, tmpIn, this->m_bdata[0], this->m_w[0], jac_ptr,
+                tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nqBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                jac_ptr += nqTot;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                ++jac_ptr;
             }
-
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);

             IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>(
                 nm0, nq0, tmpIn, this->m_bdata[0], this->m_w[0], jac_ptr,
                 tmpOut);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nqBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
     }
 
@@ -225,21 +270,45 @@ struct IProductTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(nqTot),
             tmpOut(m_nmTot);
 
-        vec_t *jac_ptr;
+        vec_t *jac_ptr = &((*this->m_jac)[0]);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            if (DEFORMED)
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nqBlocks, locField);
+            // load_interleave(locField, nqTot, tmpIn);
+            load_unalign_interleave(inptr, nqTot, tmpIn);
+
+            IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0],
+                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
+                tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nqBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[nqTot * e]);
+                jac_ptr += nqTot;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                ++jac_ptr;
             }
-
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);

             IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>(
                 nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0],
@@ -247,11 +316,12 @@ struct IProductTemplate
                 tmpOut);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nqBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -276,21 +346,44 @@ struct IProductTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(nqTot),
             tmpOut(m_nmTot);
 
-        vec_t *jac_ptr;
+        vec_t *jac_ptr = &((*this->m_jac)[0]);
+
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            if (DEFORMED)
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nqBlocks, locField);
+            // load_interleave(locField, nqTot, tmpIn);
+            load_unalign_interleave(inptr, nqTot, tmpIn);
+
+            IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0],
+                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
+                tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nqBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[nqTot * e]);
+                jac_ptr += nqTot;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                ++jac_ptr;
             }
-
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);

             IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>(
                 nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0],
@@ -298,10 +391,9 @@ struct IProductTemplate
                 tmpOut);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nqBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
     }
 
@@ -337,21 +429,45 @@ struct IProductTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
             wsp2(wsp2Size), tmpIn(nqTot), tmpOut(m_nmTot);
 
-        vec_t *jac_ptr;
+        vec_t *jac_ptr = &((*this->m_jac)[0]);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            if (DEFORMED)
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nqBlocks, locField);
+            // load_interleave(locField, nqTot, tmpIn);
+            load_unalign_interleave(inptr, nqTot, tmpIn);
+
+            IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
+                this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nqBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[nqTot * e]);
+                jac_ptr += nqTot;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                ++jac_ptr;
             }
-
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);

             IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>(
                 nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
@@ -359,11 +475,12 @@ struct IProductTemplate
                 this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nqBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -389,21 +506,44 @@ struct IProductTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
             wsp2(wsp2Size), tmpIn(nqTot), tmpOut(m_nmTot);
 
-        vec_t *jac_ptr;
+        vec_t *jac_ptr = &((*this->m_jac)[0]);
+
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            if (DEFORMED)
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nqBlocks, locField);
+            // load_interleave(locField, nqTot, tmpIn);
+            load_unalign_interleave(inptr, nqTot, tmpIn);
+
+            IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
+                this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nqBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[nqTot * e]);
+                jac_ptr += nqTot;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                ++jac_ptr;
             }
-
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);

             IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>(
                 nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
@@ -411,10 +551,9 @@ struct IProductTemplate
                 this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nqBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
     }
 
diff --git a/library/MatrixFreeOps/IProductWRTDerivBase.h b/library/MatrixFreeOps/IProductWRTDerivBase.h
index 699c9a7c78779349565d5dd34ef5a6977e7b70d7..9d9aebad5c0722835363c6f5763d7a0a0cef092f 100644
--- a/library/MatrixFreeOps/IProductWRTDerivBase.h
+++ b/library/MatrixFreeOps/IProductWRTDerivBase.h
@@ -150,8 +150,8 @@ struct IProductWRTDerivBaseTemplate
             tmpIn[max_ndf]; // max_ndf is a constexpr
         std::vector<vec_t, allocator<vec_t>> tmp0(nqTot), tmpOut(m_nmTot);
 
-        const vec_t *jac_ptr;
-        const vec_t *df_ptr;
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf];     // max_ndf is a constexpr
         NekDouble *inptr[max_ndf]; // max_ndf is a constexpr
 
@@ -161,18 +161,19 @@ struct IProductWRTDerivBaseTemplate
             inptr[d] = const_cast<NekDouble *>(input[d].data());
         }
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            // Jacobian
-            jac_ptr = &((*this->m_jac)[dJSize * e]);
-
-            // Derivative factor
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
             // Load and transpose data
             for (int d = 0; d < ndf; ++d)
             {
-                load_interleave(inptr[d], nqTot, tmpIn[d]);
+                // Load data to aligned storage and interleave it
+                // std::copy(inptr[d], inptr[d] + nqBlocks, locField);
+                // load_interleave(locField, nqTot, tmpIn[d]);
+                load_unalign_interleave(inptr[d], nqTot, tmpIn[d]);
             }
 
             IProductWRTDerivBase1DKernel<SHAPE_TYPE, DEFORMED>(
@@ -184,15 +185,42 @@ struct IProductWRTDerivBaseTemplate
                 tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
             for (int d = 0; d < ndf; ++d)
             {
                 inptr[d] += nqBlocks;
             }
-
             outptr += nmBlocks;
+            df_ptr += dfSize;
+            jac_ptr += dJSize;
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            for (int d = 0; d < ndf; ++d)
+            {
+                std::copy(inptr[d], inptr[d] + actualSize, locField);
+                load_interleave(locField, nqTot, tmpIn[d]);
+            }
+
+            IProductWRTDerivBase1DKernel<SHAPE_TYPE, DEFORMED>(
+                nqTot, ndf, df_ptr, df_tmp, tmpIn, tmp0);
+
+            // IP DB0
+            IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nq0, tmp0, this->m_dbdata[0], this->m_w[0], jac_ptr,
+                tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -231,8 +259,8 @@ struct IProductWRTDerivBaseTemplate
             tmpIn[max_ndf]; // max_ndf is a constexpr
         std::vector<vec_t, allocator<vec_t>> tmp0(nqTot), tmpOut(m_nmTot);
 
-        const vec_t *jac_ptr;
-        const vec_t *df_ptr;
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf];     // max_ndf is a constexpr
         NekDouble *inptr[max_ndf]; // max_ndf is a constexpr
 
@@ -242,18 +270,18 @@ struct IProductWRTDerivBaseTemplate
             inptr[d] = const_cast<NekDouble *>(input[d].data());
         }
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            // Jacobian
-            jac_ptr = &((*this->m_jac)[dJSize * e]);
-
-            // Derivative factor
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width];
 
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
             // Load and transpose data
             for (int d = 0; d < ndf; ++d)
             {
-                load_interleave(inptr[d], nqTot, tmpIn[d]);
+                // Load data to aligned storage and interleave it
+                // std::copy(inptr[d], inptr[d] + nqBlocks, locField);
+                // load_interleave(locField, nqTot, tmpIn[d]);
+                load_unalign_interleave(inptr[d], nqTot, tmpIn[d]);
             }
 
             IProductWRTDerivBase1DKernel<SHAPE_TYPE, DEFORMED>(
@@ -265,14 +293,39 @@ struct IProductWRTDerivBaseTemplate
                 tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
             for (int d = 0; d < ndf; ++d)
             {
                 inptr[d] += nqBlocks;
             }
-
             outptr += nmBlocks;
+            df_ptr += dfSize;
+            jac_ptr += dJSize;
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            for (int d = 0; d < ndf; ++d)
+            {
+                std::copy(inptr[d], inptr[d] + actualSize, locField);
+                load_interleave(locField, nqTot, tmpIn[d]);
+            }
+
+            IProductWRTDerivBase1DKernel<SHAPE_TYPE, DEFORMED>(
+                nqTot, ndf, df_ptr, df_tmp, tmpIn, tmp0);
+
+            // IP DB0
+            IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nq0, tmp0, this->m_dbdata[0], this->m_w[0], jac_ptr,
+                tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
     }
 
@@ -336,51 +389,60 @@ struct IProductWRTDerivBaseTemplate
 
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size);
 
-        const vec_t *jac_ptr;
-        const vec_t *df_ptr;
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf]; // max_ndf is a constexpr
 
-        std::vector<vec_t, allocator<vec_t>> &Z0 = this->m_Z[0];
-        std::vector<vec_t, allocator<vec_t>> &Z1 = this->m_Z[1];
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Jacobian
-            jac_ptr = &((*this->m_jac)[dJSize * e]);
-
-            // Derivative factor
-            df_ptr = &((*this->m_df)[dfSize * e]);
-
             // Load and transpose data
             for (int d = 0; d < indim; ++d)
             {
-                load_interleave(inptr[d], nqTot, tmpIn[d]);
+                // Load data to aligned storage and interleave it
+                // std::copy(inptr[d], inptr[d] + nqBlocks, locField);
+                // load_interleave(locField, nqTot, tmpIn[d]);
+                load_unalign_interleave(inptr[d], nqTot, tmpIn[d]);
             }
 
-            IProductWRTDerivBase2DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, indim, df_ptr, df_tmp, Z0, Z1, tmpIn, tmp0, tmp1);
-
-            // IP DB0 B1
-            IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, tmp0, this->m_dbdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
-
-            // IP DB1 B0
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, tmp1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
+            fusedKernel2D(nm0, nm1, nq0, nq1, indim, correct, jac_ptr, df_ptr,
+                          tmpIn, df_tmp, tmp0, tmp1, wsp0, tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
             for (int d = 0; d < indim; ++d)
             {
                 inptr[d] += nqBlocks;
             }
             outptr += nmBlocks;
+            df_ptr += dfSize;
+            jac_ptr += dJSize;
         }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            for (int d = 0; d < indim; ++d)
+            {
+                std::copy(inptr[d], inptr[d] + actualSize, locField);
+                load_interleave(locField, nqTot, tmpIn[d]);
+            }
+
+            fusedKernel2D(nm0, nm1, nq0, nq1, indim, correct, jac_ptr, df_ptr,
+                          tmpIn, df_tmp, tmp0, tmp1, wsp0, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -436,51 +498,83 @@ struct IProductWRTDerivBaseTemplate
 
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size);
 
-        const vec_t *jac_ptr;
-        const vec_t *df_ptr;
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf]; // max_ndf is a constexpr
 
-        std::vector<vec_t, allocator<vec_t>> &Z0 = this->m_Z[0];
-        std::vector<vec_t, allocator<vec_t>> &Z1 = this->m_Z[1];
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width];
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Jacobian
-            jac_ptr = &((*this->m_jac)[dJSize * e]);
-
-            // Derivative factor
-            df_ptr = &((*this->m_df)[dfSize * e]);
-
             // Load and transpose data
             for (int d = 0; d < indim; ++d)
             {
-                load_interleave(inptr[d], nqTot, tmpIn[d]);
+                // Load data to aligned storage and interleave it
+                // std::copy(inptr[d], inptr[d] + nqBlocks, locField);
+                // load_interleave(locField, nqTot, tmpIn[d]);
+                load_unalign_interleave(inptr[d], nqTot, tmpIn[d]);
             }
 
-            IProductWRTDerivBase2DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, indim, df_ptr, df_tmp, Z0, Z1, tmpIn, tmp0, tmp1);
-
-            // IP DB0 B1
-            IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, tmp0, this->m_dbdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
-
-            // IP DB1 B0
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, tmp1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
+            fusedKernel2D(nm0, nm1, nq0, nq1, indim, correct, jac_ptr, df_ptr,
+                          tmpIn, df_tmp, tmp0, tmp1, wsp0, tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
             for (int d = 0; d < indim; ++d)
             {
                 inptr[d] += nqBlocks;
             }
             outptr += nmBlocks;
+            df_ptr += dfSize;
+            jac_ptr += dJSize;
         }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            for (int d = 0; d < indim; ++d)
+            {
+                std::copy(inptr[d], inptr[d] + actualSize, locField);
+                load_interleave(locField, nqTot, tmpIn[d]);
+            }
+
+            fusedKernel2D(nm0, nm1, nq0, nq1, indim, correct, jac_ptr, df_ptr,
+                          tmpIn, df_tmp, tmp0, tmp1, wsp0, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
+        }
+    }
+
+    NEK_FORCE_INLINE void fusedKernel2D(
+        const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1,
+        const size_t indim, const bool correct, const vec_t *jac_ptr,
+        const vec_t *df_ptr, const std::vector<vec_t, allocator<vec_t>> *tmpIn,
+        vec_t *df_tmp, std::vector<vec_t, allocator<vec_t>> &tmp0,
+        std::vector<vec_t, allocator<vec_t>> &tmp1,
+        std::vector<vec_t, allocator<vec_t>> &wsp0,
+        std::vector<vec_t, allocator<vec_t>> &tmpOut)
+    {
+        IProductWRTDerivBase2DKernel<SHAPE_TYPE, DEFORMED>(
+            nq0, nq1, indim, df_ptr, df_tmp, this->m_Z[0], this->m_Z[1], tmpIn,
+            tmp0, tmp1);
+
+        // IP DB0 B1
+        IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+            nm0, nm1, nq0, nq1, correct, tmp0, this->m_dbdata[0],
+            this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
+            tmpOut);
+
+        // IP DB1 B0
+        IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nq0, nq1, correct, tmp1, this->m_bdata[0],
+            this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
+            tmpOut);
     }
 
 #elif defined(SHAPE_DIMENSION_3D)
@@ -533,56 +627,60 @@ struct IProductWRTDerivBaseTemplate
             wsp2(wsp2Size), tmpIn0(nqTot), tmpIn1(nqTot), tmpIn2(nqTot),
             tmp0(nqTot), tmp1(nqTot), tmp2(nqTot), tmpOut(m_nmTot);
 
-        const vec_t *jac_ptr;
-        const vec_t *df_ptr;
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
 
-        std::vector<vec_t, allocator<vec_t>> &Z0 = this->m_Z[0];
-        std::vector<vec_t, allocator<vec_t>> &Z1 = this->m_Z[1];
-        std::vector<vec_t, allocator<vec_t>> &Z2 = this->m_Z[2];
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Jacobian
-            jac_ptr = &((*this->m_jac)[dJSize * e]);
-
-            // Derivative factor
-            df_ptr = &((*this->m_df)[dfSize * e]);
-
-            // Load and transpose data
-            load_interleave(inptr0, nqTot, tmpIn0);
-            load_interleave(inptr1, nqTot, tmpIn1);
-            load_interleave(inptr2, nqTot, tmpIn2);
-
-            IProductWRTDerivBase3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, df_ptr, Z0, Z1, Z2, tmpIn0, tmpIn1, tmpIn2, tmp0,
-                tmp1, tmp2);
-
-            // IP DB0 B1 B2
-            IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp0, this->m_dbdata[0],
-                this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            // IP B0 DB1 B2
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            // IP B0 B1 DB2
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp2, this->m_bdata[0],
-                this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+            // Load data to aligned storage and interleave it
+            // load_interleave(inptr0, nqTot, tmpIn0);
+            // load_interleave(inptr1, nqTot, tmpIn1);
+            // load_interleave(inptr2, nqTot, tmpIn2);
+            load_unalign_interleave(inptr0, nqTot, tmpIn0);
+            load_unalign_interleave(inptr1, nqTot, tmpIn1);
+            load_unalign_interleave(inptr2, nqTot, tmpIn2);
+
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2,
+                          wsp0, wsp1, wsp2, tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
             inptr0 += nqBlocks;
             inptr1 += nqBlocks;
             inptr2 += nqBlocks;
             outptr += nmBlocks;
+            df_ptr += dfSize;
+            jac_ptr += dJSize;
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr0, inptr0 + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn0);
+            std::copy(inptr1, inptr1 + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn1);
+            std::copy(inptr2, inptr2 + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn2);
+
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2,
+                          wsp0, wsp1, wsp2, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -626,58 +724,97 @@ struct IProductWRTDerivBaseTemplate
             wsp2(wsp2Size), tmpIn0(nqTot), tmpIn1(nqTot), tmpIn2(nqTot),
             tmp0(nqTot), tmp1(nqTot), tmp2(nqTot), tmpOut(m_nmTot);
 
-        const vec_t *jac_ptr;
-        const vec_t *df_ptr;
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr  = &((*this->m_df)[0]);
 
-        std::vector<vec_t, allocator<vec_t>> &Z0 = this->m_Z[0];
-        std::vector<vec_t, allocator<vec_t>> &Z1 = this->m_Z[1];
-        std::vector<vec_t, allocator<vec_t>> &Z2 = this->m_Z[2];
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width];
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            // Jacobian
-            jac_ptr = &((*this->m_jac)[dJSize * e]);
-
-            // Derivative factor
-            df_ptr = &((*this->m_df)[dfSize * e]);
-
             // Load and transpose data
-            load_interleave(inptr0, nqTot, tmpIn0);
-            load_interleave(inptr1, nqTot, tmpIn1);
-            load_interleave(inptr2, nqTot, tmpIn2);
-
-            IProductWRTDerivBase3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, df_ptr, Z0, Z1, Z2, tmpIn0, tmpIn1, tmpIn2, tmp0,
-                tmp1, tmp2);
-
-            // IP DB0 B1 B2
-            IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp0, this->m_dbdata[0],
-                this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            // IP B0 DB1 B2
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            // IP B0 B1 DB2
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp2, this->m_bdata[0],
-                this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+            // load_interleave(inptr0, nqTot, tmpIn0);
+            // load_interleave(inptr1, nqTot, tmpIn1);
+            // load_interleave(inptr2, nqTot, tmpIn2);
+            load_unalign_interleave(inptr0, nqTot, tmpIn0);
+            load_unalign_interleave(inptr1, nqTot, tmpIn1);
+            load_unalign_interleave(inptr2, nqTot, tmpIn2);
+
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2,
+                          wsp0, wsp1, wsp2, tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
             inptr0 += nqBlocks;
             inptr1 += nqBlocks;
             inptr2 += nqBlocks;
             outptr += nmBlocks;
+            df_ptr += dfSize;
+            jac_ptr += dJSize;
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr0, inptr0 + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn0);
+            std::copy(inptr1, inptr1 + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn1);
+            std::copy(inptr2, inptr2 + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn2);
+
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2,
+                          wsp0, wsp1, wsp2, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
     }
 
+    NEK_FORCE_INLINE void fusedKernel3D(
+        const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0,
+        const size_t nq1, const size_t nq2, const bool correct,
+        const vec_t *jac_ptr, const vec_t *df_ptr,
+        const std::vector<vec_t, allocator<vec_t>> &tmpIn0,
+        const std::vector<vec_t, allocator<vec_t>> &tmpIn1,
+        const std::vector<vec_t, allocator<vec_t>> &tmpIn2,
+        std::vector<vec_t, allocator<vec_t>> &tmp0,
+        std::vector<vec_t, allocator<vec_t>> &tmp1,
+        std::vector<vec_t, allocator<vec_t>> &tmp2,
+        std::vector<vec_t, allocator<vec_t>> &wsp0,
+        std::vector<vec_t, allocator<vec_t>> &wsp1,
+        std::vector<vec_t, allocator<vec_t>> &wsp2,
+        std::vector<vec_t, allocator<vec_t>> &tmpOut)
+    {
+        IProductWRTDerivBase3DKernel<SHAPE_TYPE, DEFORMED>(
+            nq0, nq1, nq2, df_ptr, this->m_Z[0], this->m_Z[1], this->m_Z[2],
+            tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2);
+
+        // IP DB0 B1 B2
+        IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp0, this->m_dbdata[0],
+            this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+        // IP B0 DB1 B2
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp1, this->m_bdata[0],
+            this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+        // IP B0 B1 DB2
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp2, this->m_bdata[0],
+            this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+    }
+
 #endif // SHAPE_DIMENSION
 
 private:
diff --git a/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h b/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h
index ac84d0bb9175e8c164f97731e33f6bb70552385c..f8cbbc019cbe84b046b8cbae3246640400dbbb98 100644
--- a/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h
+++ b/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h
@@ -193,102 +193,62 @@ struct LinearAdvectionDiffusionReactionTemplate
         std::vector<vec_t, allocator<vec_t>> bwd(nqTot), deriv0(nqTot),
             deriv1(nqTot);
 
-        const vec_t *jac_ptr    = {};
-        const vec_t *df_ptr     = {};
-        const vec_t *advVel_ptr = {};
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nmBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
-        // Get size of derivative factor and advection velocity block
-        auto dfSize = ndf;
-        if (DEFORMED)
-        {
-            dfSize *= nqTot;
-        }
-        auto advVelSize = nvel * nqTot;
+        const vec_t *jac_ptr    = &((*this->m_jac)[0]);
+        const vec_t *df_ptr     = &((*this->m_df)[0]);
+        const vec_t *advVel_ptr = &((*this->m_advVel)[0]);
+
+        // Get size of derivative factor block
+        const auto dfSize     = ndf * nqTot;
+        const auto advVelSize = nvel * nqTot;
 
-        for (int e = 0; e < this->m_nBlocks; e++)
+        for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
-            df_ptr     = &((*this->m_df)[e * dfSize]);
-            advVel_ptr = &((*this->m_advVel)[e * advVelSize]);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr,
+                          advVel_ptr, tmpIn, tmpOut, wsp0, bwd, deriv0, deriv1);
 
-            if (!DEFORMED)
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nmBlocks;
+            outptr += nmBlocks;
+            advVel_ptr += advVelSize;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                jac_ptr += nqTot;
+                df_ptr += dfSize;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                ++jac_ptr;
+                df_ptr += ndf;
             }
+        }
+        // last block
+        {
+            int actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);

-            // Step 1: BwdTrans
-            // Input: tmpIn = uCoeff
-            // Output: bwd = uPhys xi
-            BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
-                                         this->m_bdata[0], this->m_bdata[1],
-                                         wsp0, bwd);
-
-            // Step 2: take STD derivatives in collapsed coordinate space
-            // Input: bwd = uPhys xi (const)
-            // Output: deriv0 = du/dxi_1
-            //         deriv1 = du/dxi_2
-            PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1],
-                                    deriv0, deriv1);
-
-            // Step 3: Advect solution by advection velocities (varcoeffs)
-            // Input: deriv0 = du/dxi_1,
-            //        deriv1 = du/dxi_2
-            // Output: bwd += V \cdot \nabla_x u
-            Advection2DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, advVel_ptr, df_ptr, this->m_h0, this->m_h1, deriv0,
-                deriv1, bwd);
-
-            // Step 4: Inner product for mass + advection matrix operation
-            // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const)
-            // Output: wsp0 = temporary
-            //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x
-            //         uPhys + uPhys) phi
-            IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut, m_lambda);
-
-            // Step 5: Evaluate local derivatives with laplacian metrics
-            // Input:
-            //        deriv0 = du/dxi_1,
-            //        deriv1 = du/dxi_2
-            // Output: deriv0 = (J^2 \cdot du/dxi)[0],
-            //         deriv1 = (J^2 \cdot du/dxi)[1]
-            DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff,
-                this->m_isVarDiff, this->m_varD00, this->m_varD01,
-                this->m_varD11, df_ptr, this->m_h0, this->m_h1, deriv0, deriv1);
-
-            // Inner product with derivative basis xi_1
-            // Input: deriv0 = (J^2 \cdot du/dxi)[0] (const)
-            // Output: wsp0 = temporary,
-            //         tmpOut += \int_\Omega X dphi/dx (append)
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
-
-            // Inner product with derivative basis xi_2
-            // Input: deriv1 = (J^2 \cdot du/dxi)[1] (const)
-            // Output: wsp0 = temporary,
-            //         tmpOut += \int_\Omega X dphi/dy (append)
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
+            fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr,
+                          advVel_ptr, tmpIn, tmpOut, wsp0, bwd, deriv0, deriv1);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -299,8 +259,8 @@ struct LinearAdvectionDiffusionReactionTemplate
         constexpr auto ndf  = 4;
         constexpr auto nvel = 2;
 
-        const auto nqTot    = nq0 * nq1;
-        const auto nmBlocks = m_nmTot * vec_t::width;
+        constexpr auto nqTot = nq0 * nq1;
+        const auto nmBlocks  = m_nmTot * vec_t::width;
 
         auto *inptr  = &input[0];
         auto *outptr = &output[0];
@@ -319,104 +279,134 @@ struct LinearAdvectionDiffusionReactionTemplate
         std::vector<vec_t, allocator<vec_t>> bwd(nqTot), deriv0(nqTot),
             deriv1(nqTot);
 
-        const vec_t *jac_ptr    = {};
-        const vec_t *df_ptr     = {};
-        const vec_t *advVel_ptr = {};
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width];
+
+        const vec_t *jac_ptr    = &((*this->m_jac)[0]);
+        const vec_t *df_ptr     = &((*this->m_df)[0]);
+        const vec_t *advVel_ptr = &((*this->m_advVel)[0]);
 
         // Get size of derivative factor block
-        auto dfSize = ndf;
-        if (DEFORMED)
-        {
-            dfSize *= nqTot;
-        }
-        auto advVelSize = nvel * nqTot;
+        constexpr auto dfSize     = ndf * nqTot;
+        constexpr auto advVelSize = nvel * nqTot;
 
-        for (int e = 0; e < this->m_nBlocks; e++)
+        for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
-            df_ptr     = &((*this->m_df)[e * dfSize]);
-            advVel_ptr = &((*this->m_advVel)[e * advVelSize]);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr,
+                          advVel_ptr, tmpIn, tmpOut, wsp0, bwd, deriv0, deriv1);
 
-            if (!DEFORMED)
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nmBlocks;
+            outptr += nmBlocks;
+            advVel_ptr += advVelSize;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                jac_ptr += nqTot;
+                df_ptr += dfSize;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                ++jac_ptr;
+                df_ptr += ndf;
             }
+        }
+        // last block
+        {
+            int acturalSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
 
-            // Step 1: BwdTrans
-            // Input: tmpIn = uCoeff
-            // Output: bwd = uPhys xi
-            BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
-                                         this->m_bdata[0], this->m_bdata[1],
-                                         wsp0, bwd);
-
-            // Step 2: take STD derivatives in collapsed coordinate space
-            // Input: bwd = uPhys xi (const)
-            // Output: deriv0 = du/dxi_1
-            //         deriv1 = du/dxi_2
-            PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1],
-                                    deriv0, deriv1);
-
-            // Step 3: Advect solution by advection velocities (varcoeffs)
-            // Input: deriv0 = du/dxi_1,
-            //        deriv1 = du/dxi_2
-            // Output: bwd += V \cdot \nabla_x u
-            Advection2DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, advVel_ptr, df_ptr, this->m_h0, this->m_h1, deriv0,
-                deriv1, bwd);
-
-            // Step 4: Inner product for mass + advection matrix operation
-            // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const)
-            // Output: wsp0 = temporary
-            //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x
-            //         uPhys + uPhys) phi
-            IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut, m_lambda);
-
-            // Step 5: Evaluate local derivatives with laplacian metrics
-            // Input:
-            //        deriv0 = du/dxi_1,
-            //        deriv1 = du/dxi_2
-            // Output: deriv0 = (J^2 \cdot du/dxi)[0],
-            //         deriv1 = (J^2 \cdot du/dxi)[1]
-            DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff,
-                this->m_isVarDiff, this->m_varD00, this->m_varD01,
-                this->m_varD11, df_ptr, this->m_h0, this->m_h1, deriv0, deriv1);
-
-            // Inner product with derivative basis xi_1
-            // Input: deriv0 = (J^2 \cdot du/dxi)[0] (const)
-            // Output: wsp0 = temporary,
-            //         tmpOut += \int_\Omega X dphi/dx (append)
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0],
-                this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
-
-            // Inner product with derivative basis xi_2
-            // Input: deriv1 = (J^2 \cdot du/dxi)[1] (const)
-            // Output: wsp0 = temporary,
-            //         tmpOut += \int_\Omega X dphi/dy (append)
-            IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
-                tmpOut);
+            fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr,
+                          advVel_ptr, tmpIn, tmpOut, wsp0, bwd, deriv0, deriv1);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + acturalSize, outptr);
         }
     }
 
+    NEK_FORCE_INLINE void fusedKernel2D(
+        const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1,
+        const bool correct, const vec_t *jac_ptr, const vec_t *df_ptr,
+        const vec_t *advVel_ptr,
+        const std::vector<vec_t, allocator<vec_t>> &tmpIn,
+        std::vector<vec_t, allocator<vec_t>> &tmpOut,
+        std::vector<vec_t, allocator<vec_t>> &wsp0,
+        std::vector<vec_t, allocator<vec_t>> &bwd,
+        std::vector<vec_t, allocator<vec_t>> &deriv0,
+        std::vector<vec_t, allocator<vec_t>> &deriv1)
+    {
+        // Step 1: BwdTrans
+        // Input: tmpIn = uCoeff
+        // Output: bwd = uPhys xi
+        BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn,
+                                     this->m_bdata[0], this->m_bdata[1], wsp0,
+                                     bwd);
+
+        // Step 2: take STD derivatives in collapsed coordinate space
+        // Input: bwd = uPhys xi (const)
+        // Output: deriv0 = du/dxi_1
+        //         deriv1 = du/dxi_2
+        PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1],
+                                deriv0, deriv1);
+
+        // Step 3: Advect solution by advection velocities (varcoeffs)
+        // Input: deriv0 = du/dxi_1,
+        //        deriv1 = du/dxi_2
+        // Output: bwd += V \cdot \nabla_x u
+        Advection2DKernel<SHAPE_TYPE, DEFORMED>(nq0, nq1, advVel_ptr, df_ptr,
+                                                this->m_h0, this->m_h1, deriv0,
+                                                deriv1, bwd);
+
+        // Step 4: Inner product for mass + advection matrix operation
+        // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const)
+        // Output: wsp0 = temporary
+        //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x
+        //         uPhys + uPhys) phi
+        IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>(
+            nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0],
+            this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, tmpOut,
+            m_lambda);
+
+        // Step 5: Evaluate local derivatives with laplacian metrics
+        // Input:
+        //        deriv0 = du/dxi_1,
+        //        deriv1 = du/dxi_2
+        // Output: deriv0 = (J^2 \cdot du/dxi)[0],
+        //         deriv1 = (J^2 \cdot du/dxi)[1]
+        DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>(
+            nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff,
+            this->m_isVarDiff, this->m_varD00, this->m_varD01, this->m_varD11,
+            df_ptr, this->m_h0, this->m_h1, deriv0, deriv1);
+
+        // Inner product with derivative basis xi_1
+        // Input: deriv0 = (J^2 \cdot du/dxi)[0] (const)
+        // Output: wsp0 = temporary,
+        //         tmpOut += \int_\Omega X dphi/dx (append)
+        IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0],
+            this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
+            tmpOut);
+
+        // Inner product with derivative basis xi_2
+        // Input: deriv1 = (J^2 \cdot du/dxi)[1] (const)
+        // Output: wsp0 = temporary,
+        //         tmpOut += \int_\Omega X dphi/dy (append)
+        IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0],
+            this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0,
+            tmpOut);
+    }
+
 #elif defined(SHAPE_DIMENSION_3D)
 
     // Non-size based operator.
@@ -457,94 +447,64 @@ struct LinearAdvectionDiffusionReactionTemplate
         std::vector<vec_t, allocator<vec_t>> deriv0(nqTot), deriv1(nqTot),
             deriv2(nqTot);
 
-        const vec_t *jac_ptr    = {};
-        const vec_t *df_ptr     = {};
-        const vec_t *advVel_ptr = {};
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nmBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        const vec_t *jac_ptr    = &((*this->m_jac)[0]);
+        const vec_t *df_ptr     = &((*this->m_df)[0]);
+        const vec_t *advVel_ptr = &((*this->m_advVel)[0]);
 
         // Get size of derivative factor block
-        auto dfSize = ndf;
-        if (DEFORMED)
-        {
-            dfSize *= nqTot;
-        }
-        auto advVelSize = nvel * nqTot;
+        const auto dfSize     = ndf * nqTot;
+        const auto advVelSize = nvel * nqTot;
 
-        for (int e = 0; e < this->m_nBlocks; e++)
+        for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
-            df_ptr     = &((*this->m_df)[e * dfSize]);
-            advVel_ptr = &((*this->m_advVel)[e * advVelSize]);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
+
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, advVel_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2,
+                          bwd, deriv0, deriv1, deriv2);
 
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
-            if (!DEFORMED)
+            inptr += nmBlocks;
+            outptr += nmBlocks;
+            advVel_ptr += advVelSize;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                jac_ptr += nqTot;
+                df_ptr += dfSize;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                ++jac_ptr;
+                df_ptr += ndf;
             }
+        }
+        // last block
+        {
+            int acturalSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
 
-            // Step 1: BwdTrans
-            BwdTrans3DKernel<SHAPE_TYPE>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, bwd);
-
-            // Step 2: take STD derivatives in collapsed coordinate space
-            PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0],
-                                    this->m_D[1], this->m_D[2], deriv0, deriv1,
-                                    deriv2);
-
-            // Step 3: Advect solution by advection velocities (varcoeffs)
-            // Input: deriv0 = du/dxi_1,
-            //        deriv1 = du/dxi_2
-            //        deriv2 = du/dxi_3
-            // Output: bwd += V \cdot \nabla_x u
-            Advection3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, advVel_ptr, df_ptr, this->m_h0, this->m_h1,
-                this->m_h2, this->m_h3, deriv0, deriv1, deriv2, bwd);
-
-            // Step 4: Inner product for mass + advection matrix operation
-            // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const)
-            // Output: wsp0/1/2 = temporary
-            //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x
-            //         uPhys + uPhys) phi
-            IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda);
-
-            // Step 5: Evaluate local derivatives with laplacian metrics
-            DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff,
-                this->m_isVarDiff, this->m_varD00, this->m_varD01,
-                this->m_varD11, this->m_varD02, this->m_varD12, this->m_varD22,
-                df_ptr, this->m_h0, this->m_h1, this->m_h2, this->m_h3, deriv0,
-                deriv1, deriv2);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0,
-                this->m_dbdata[0], this->m_bdata[1], this->m_bdata[2],
-                this->m_w[0], this->m_w[1], this->m_w[2], jac_ptr, wsp0, wsp1,
-                wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0],
-                this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, advVel_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2,
+                          bwd, deriv0, deriv1, deriv2);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + acturalSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -578,96 +538,128 @@ struct LinearAdvectionDiffusionReactionTemplate
         std::vector<vec_t, allocator<vec_t>> deriv0(nqTot), deriv1(nqTot),
             deriv2(nqTot);
 
-        const vec_t *jac_ptr    = {};
-        const vec_t *df_ptr     = {};
-        const vec_t *advVel_ptr = {};
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width];
+
+        const vec_t *jac_ptr    = &((*this->m_jac)[0]);
+        const vec_t *df_ptr     = &((*this->m_df)[0]);
+        const vec_t *advVel_ptr = &((*this->m_advVel)[0]);
 
         // Get size of derivative factor block
-        auto dfSize = ndf;
-        if (DEFORMED)
-        {
-            dfSize *= nqTot;
-        }
-        auto advVelSize = nvel * nqTot;
+        constexpr auto dfSize     = ndf * nqTot;
+        constexpr auto advVelSize = nvel * nqTot;
 
-        for (int e = 0; e < this->m_nBlocks; e++)
+        for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
-            df_ptr     = &((*this->m_df)[e * dfSize]);
-            advVel_ptr = &((*this->m_advVel)[e * advVelSize]);
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nmBlocks, locField);
+            // load_interleave(locField, m_nmTot, tmpIn);
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);
 
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, advVel_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2,
+                          bwd, deriv0, deriv1, deriv2);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
-            if (!DEFORMED)
+            inptr += nmBlocks;
+            outptr += nmBlocks;
+            advVel_ptr += advVelSize;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                jac_ptr += nqTot;
+                df_ptr += dfSize;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                ++jac_ptr;
+                df_ptr += ndf;
             }
+        }
+        // last block
+        {
+            int acturalSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);
 
-            // Step 1: BwdTrans
-            BwdTrans3DKernel<SHAPE_TYPE>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, bwd);
-
-            // Step 2: take STD derivatives in collapsed coordinate space
-            PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0],
-                                    this->m_D[1], this->m_D[2], deriv0, deriv1,
-                                    deriv2);
-
-            // Step 3: Advect solution by advection velocities (varcoeffs)
-            // Input: deriv0 = du/dxi_1,
-            //        deriv1 = du/dxi_2
-            //        deriv2 = du/dxi_3
-            // Output: bwd += V \cdot \nabla_x u
-            Advection3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, advVel_ptr, df_ptr, this->m_h0, this->m_h1,
-                this->m_h2, this->m_h3, deriv0, deriv1, deriv2, bwd);
-
-            // Step 4: Inner product for mass + advection matrix operation
-            // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const)
-            // Output: wsp0/1/2 = temporary
-            //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x
-            //         uPhys + uPhys) phi
-            IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda);
-
-            // Step 5: Evaluate local derivatives with laplacian metrics
-            DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff,
-                this->m_isVarDiff, this->m_varD00, this->m_varD01,
-                this->m_varD11, this->m_varD02, this->m_varD12, this->m_varD22,
-                df_ptr, this->m_h0, this->m_h1, this->m_h2, this->m_h3, deriv0,
-                deriv1, deriv2);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0,
-                this->m_dbdata[0], this->m_bdata[1], this->m_bdata[2],
-                this->m_w[0], this->m_w[1], this->m_w[2], jac_ptr, wsp0, wsp1,
-                wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0],
-                this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, advVel_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2,
+                          bwd, deriv0, deriv1, deriv2);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + acturalSize, outptr);
         }
     }
 
+    NEK_FORCE_INLINE void fusedKernel3D(
+        const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0,
+        const size_t nq1, const size_t nq2, const bool correct,
+        const vec_t *jac_ptr, const vec_t *df_ptr, const vec_t *advVel_ptr,
+        const std::vector<vec_t, allocator<vec_t>> &tmpIn,
+        std::vector<vec_t, allocator<vec_t>> &tmpOut,
+        std::vector<vec_t, allocator<vec_t>> &wsp0,
+        std::vector<vec_t, allocator<vec_t>> &wsp1,
+        std::vector<vec_t, allocator<vec_t>> &wsp2,
+        std::vector<vec_t, allocator<vec_t>> &bwd,
+        std::vector<vec_t, allocator<vec_t>> &deriv0,
+        std::vector<vec_t, allocator<vec_t>> &deriv1,
+        std::vector<vec_t, allocator<vec_t>> &deriv2)
+    {
+        // Step 1: BwdTrans
+        BwdTrans3DKernel<SHAPE_TYPE>(nm0, nm1, nm2, nq0, nq1, nq2, correct,
+                                     tmpIn, this->m_bdata[0], this->m_bdata[1],
+                                     this->m_bdata[2], wsp0, wsp1, bwd);
+
+        // Step 2: take STD derivatives in collapsed coordinate space
+        PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0], this->m_D[1],
+                                this->m_D[2], deriv0, deriv1, deriv2);
+
+        // Step 3: Advect solution by advection velocities (varcoeffs)
+        // Input: deriv0 = du/dxi_1,
+        //        deriv1 = du/dxi_2
+        //        deriv2 = du/dxi_3
+        // Output: bwd += V \cdot \nabla_x u
+        Advection3DKernel<SHAPE_TYPE, DEFORMED>(
+            nq0, nq1, nq2, advVel_ptr, df_ptr, this->m_h0, this->m_h1,
+            this->m_h2, this->m_h3, deriv0, deriv1, deriv2, bwd);
+
+        // Step 4: Inner product for mass + advection matrix operation
+        // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const)
+        // Output: wsp0/1/2 = temporary
+        //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x
+        //         uPhys + uPhys) phi
+        IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0],
+            this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda);
+
+        // Step 5: Evaluate local derivatives with laplacian metrics
+        DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>(
+            nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff,
+            this->m_isVarDiff, this->m_varD00, this->m_varD01, this->m_varD11,
+            this->m_varD02, this->m_varD12, this->m_varD22, df_ptr, this->m_h0,
+            this->m_h1, this->m_h2, this->m_h3, deriv0, deriv1, deriv2);
+
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0, this->m_dbdata[0],
+            this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0],
+            this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0],
+            this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+    }
+
 #endif // SHAPE_DIMENSION
 
 private:
diff --git a/library/MatrixFreeOps/Operator.hpp b/library/MatrixFreeOps/Operator.hpp
index 80f9b737bc0c2117cea0eaac06f86d26f665486f..22577425a0a2eda67294b079900d3526dbbe97a5 100644
--- a/library/MatrixFreeOps/Operator.hpp
+++ b/library/MatrixFreeOps/Operator.hpp
@@ -472,13 +472,18 @@ protected:
     Helper(std::vector<LibUtilities::BasisSharedPtr> basis, int nElmt)
         : Operator()
     {
-        // Sanity check: no padding yet!
-        ASSERTL1(nElmt % vec_t::width == 0,
-                 "Number of elements not divisible by vector "
-                 "width, padding not yet implemented.");
-
-        // Calculate number of 'blocks', i.e. meta-elements
-        m_nBlocks = nElmt / vec_t::width;
+        if (nElmt % vec_t::width == 0) // No padding or already padded
+        {
+            // Calculate number of 'blocks', i.e. meta-elements
+            m_nBlocks = nElmt / vec_t::width;
+            m_nPads   = 0;
+        }
+        else // Need padding internally
+        {
+            // Calculate number of 'blocks', i.e. meta-elements
+            m_nBlocks = nElmt / vec_t::width + 1;
+            m_nPads   = vec_t::width - (nElmt % vec_t::width);
+        }
 
         // Depending on element dimension, set up basis information, quadrature,
         // etc, inside vectorised environment.
@@ -552,6 +557,7 @@ protected:
     }
 
     int m_nBlocks;
+    int m_nPads;
     std::array<int, DIM> m_nm, m_nq;
     std::array<std::vector<vec_t, tinysimd::allocator<vec_t>>, DIM> m_bdata;
     std::array<std::vector<vec_t, tinysimd::allocator<vec_t>>, DIM> m_dbdata;
diff --git a/library/MatrixFreeOps/PhysDeriv.h b/library/MatrixFreeOps/PhysDeriv.h
index 75afbfafba4063f7cec9a9a48f60797b31ddb93b..49b0d16570ac01879eb6f696bf587c66ec59503c 100644
--- a/library/MatrixFreeOps/PhysDeriv.h
+++ b/library/MatrixFreeOps/PhysDeriv.h
@@ -137,7 +137,7 @@ struct PhysDerivTemplate
         }
 
         // Call 1D kernel
-        const vec_t *df_ptr   = {};
+        const vec_t *df_ptr   = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf] = {}; // max_ndf is a constexpr
 
         std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpOut[max_ndf];
@@ -149,12 +149,16 @@ struct PhysDerivTemplate
             out_ptr[d] = &output[d][0];
         }
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nqBlocks, locField);
+            // load_interleave(locField, nqTot, tmpIn);
+            load_unalign_interleave(inptr, nqTot, tmpIn);
 
             // Get the basic derivative
             PhysDerivTensor1DKernel(nq0, tmpIn, this->m_D[0], tmpOut[0]);
@@ -165,12 +169,38 @@ struct PhysDerivTemplate
             // De-interleave and store data
             for (int d = 0; d < ndf; ++d)
             {
-                deinterleave_store(tmpOut[d], nqTot, out_ptr[d]);
+                // de-interleave and store data
+                // deinterleave_store(tmpOut[d], nqTot, locField);
+                // std::copy(locField, locField + nqBlocks, out_ptr[d]);
+                deinterleave_unalign_store(tmpOut[d], nqTot, out_ptr[d]);
                 out_ptr[d] += nqBlocks;
             }
 
             inptr += nqBlocks;
+            df_ptr += dfSize;
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Get the basic derivative
+            PhysDerivTensor1DKernel(nq0, tmpIn, this->m_D[0], tmpOut[0]);
+
+            // Calculate physical derivative
+            PhysDeriv1DKernel<DEFORMED>(nq0, ndf, df_ptr, df_tmp, tmpOut);
+
+            // De-interleave and store data
+            for (int d = 0; d < ndf; ++d)
+            {
+                // de-interleave and store data
+                deinterleave_store(tmpOut[d], nqTot, locField);
+                std::copy(locField, locField + actualSize, out_ptr[d]);
+            }
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -202,7 +232,7 @@ struct PhysDerivTemplate
         PhysDeriv1DWorkspace<SHAPE_TYPE>(nq0);
 
         // Call 1D kernel
-        const vec_t *df_ptr   = {};
+        const vec_t *df_ptr   = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf] = {}; // max_ndf is a constexpr
 
         std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpOut[max_ndf];
@@ -214,12 +244,15 @@ struct PhysDerivTemplate
             out_ptr[d] = &output[d][0];
         }
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];
 
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nqBlocks, locField);
+            // load_interleave(locField, nqTot, tmpIn);
+            load_unalign_interleave(inptr, nqTot, tmpIn);
 
             // Get the basic derivative
             PhysDerivTensor1DKernel(nq0, tmpIn, this->m_D[0], tmpOut[0]);
@@ -230,11 +263,35 @@ struct PhysDerivTemplate
             // De-interleave and store data
             for (int d = 0; d < ndf; ++d)
             {
-                deinterleave_store(tmpOut[d], nqTot, out_ptr[d]);
+                // De-interleave straight into the (possibly unaligned)
+                // output array; replaces the former de-interleave into
+                // an aligned buffer followed by a copy.
+                deinterleave_unalign_store(tmpOut[d], nqTot, out_ptr[d]);
                 out_ptr[d] += nqBlocks;
             }
 
             inptr += nqBlocks;
+            df_ptr += dfSize;
+        }
+        // last (possibly padded) block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Get the basic derivative
+            PhysDerivTensor1DKernel(nq0, tmpIn, this->m_D[0], tmpOut[0]);
+
+            // Calculate physical derivative
+            PhysDeriv1DKernel<DEFORMED>(nq0, ndf, df_ptr, df_tmp, tmpOut);
+
+            // De-interleave and store data
+            for (int d = 0; d < ndf; ++d)
+            {
+                // copy back only the non-padded part of the block
+                deinterleave_store(tmpOut[d], nqTot, locField);
+                std::copy(locField, locField + actualSize, out_ptr[d]);
+            }
         }
     }
 
@@ -279,15 +336,19 @@ struct PhysDerivTemplate
             dfSize *= nqTot;
         }
 
-        const vec_t *df_ptr   = {};
+        const vec_t *df_ptr   = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf] = {}; // max_ndf is a constexpr
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Interleave directly from the (possibly unaligned) input
+            // array; replaces the former copy into an aligned buffer
+            // followed by load_interleave.
+            load_unalign_interleave(inptr, nqTot, tmpIn);
 
             // Results written to out_d0, out_d1
             PhysDerivTensor2DKernel(nq0, nq1, tmpIn, this->m_D[0], this->m_D[1],
@@ -297,14 +358,40 @@ struct PhysDerivTemplate
                                         this->m_Z[1], df_ptr, df_tmp, tmpOut);
 
             inptr += nqBlocks;
-
+            df_ptr += dfSize;
             // de-interleave and store data
             for (int d = 0; d < outdim; ++d)
             {
-                deinterleave_store(tmpOut[d], nqTot, out_ptr[d]);
+                // De-interleave straight into the (possibly unaligned)
+                // output array; replaces the former de-interleave into
+                // an aligned buffer followed by a copy.
+                deinterleave_unalign_store(tmpOut[d], nqTot, out_ptr[d]);
                 out_ptr[d] += nqBlocks;
             }
         }
+        // last (possibly padded) block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Results written to out_d0, out_d1
+            PhysDerivTensor2DKernel(nq0, nq1, tmpIn, this->m_D[0], this->m_D[1],
+                                    tmpOut[0], tmpOut[1]);
+            // Calculate physical derivative
+            PhysDeriv2DKernel<DEFORMED>(nq0, nq1, outdim, this->m_Z[0],
+                                        this->m_Z[1], df_ptr, df_tmp, tmpOut);
+
+            // De-interleave and store data
+            for (int d = 0; d < outdim; ++d)
+            {
+                // copy back only the non-padded part of the block
+                deinterleave_store(tmpOut[d], nqTot, locField);
+                std::copy(locField, locField + actualSize, out_ptr[d]);
+            }
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -344,15 +431,18 @@ struct PhysDerivTemplate
             dfSize *= nqTot;
         }
 
-        const vec_t *df_ptr   = {};
+        const vec_t *df_ptr   = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf] = {}; // max_ndf is a constexpr
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];
 
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Interleave directly from the (possibly unaligned) input
+            // array; replaces the former copy into an aligned buffer
+            // followed by load_interleave.
+            load_unalign_interleave(inptr, nqTot, tmpIn);
 
             // Results written to out_d0, out_d1
             PhysDerivTensor2DKernel(nq0, nq1, tmpIn, this->m_D[0], this->m_D[1],
@@ -362,14 +452,38 @@ struct PhysDerivTemplate
                                         this->m_Z[1], df_ptr, df_tmp, tmpOut);
 
             inptr += nqBlocks;
-
+            df_ptr += dfSize;
             // de-interleave and store data
             for (int d = 0; d < outdim; ++d)
             {
-                deinterleave_store(tmpOut[d], nqTot, out_ptr[d]);
+                // De-interleave straight into the (possibly unaligned)
+                // output array; replaces the former de-interleave into
+                // an aligned buffer followed by a copy.
+                deinterleave_unalign_store(tmpOut[d], nqTot, out_ptr[d]);
                 out_ptr[d] += nqBlocks;
             }
         }
+        // last (possibly padded) block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Results written to out_d0, out_d1
+            PhysDerivTensor2DKernel(nq0, nq1, tmpIn, this->m_D[0], this->m_D[1],
+                                    tmpOut[0], tmpOut[1]);
+            // Calculate physical derivative
+            PhysDeriv2DKernel<DEFORMED>(nq0, nq1, outdim, this->m_Z[0],
+                                        this->m_Z[1], df_ptr, df_tmp, tmpOut);
+
+            // De-interleave and store data
+            for (int d = 0; d < outdim; ++d)
+            {
+                // copy back only the non-padded part of the block
+                deinterleave_store(tmpOut[d], nqTot, locField);
+                std::copy(locField, locField + actualSize, out_ptr[d]);
+            }
+        }
     }
 
 #elif defined(SHAPE_DIMENSION_3D)
@@ -409,15 +523,19 @@ struct PhysDerivTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
             tmpIn(nqTot), tmpd0(nqTot), tmpd1(nqTot), tmpd2(nqTot);
 
-        const vec_t *df_ptr = {};
+        const vec_t *df_ptr = &((*this->m_df)[0]);
         vec_t df_tmp[ndf]   = {}; // ndf is a constexpr
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Interleave directly from the (possibly unaligned) input
+            // array; replaces the former copy into an aligned buffer
+            // followed by load_interleave.
+            load_unalign_interleave(inptr, nqTot, tmpIn);
 
             // Results written to out_d0, out_d1, out_d2
             PhysDerivTensor3DKernel(nq0, nq1, nq2, tmpIn, this->m_D[0],
@@ -429,15 +547,41 @@ struct PhysDerivTemplate
                 df_tmp, wsp0, wsp1, tmpd0, tmpd1, tmpd2);
 
             // de-interleave and store data
-            deinterleave_store(tmpd0, nqTot, outptr_d0);
-            deinterleave_store(tmpd1, nqTot, outptr_d1);
-            deinterleave_store(tmpd2, nqTot, outptr_d2);
+            deinterleave_unalign_store(tmpd0, nqTot, outptr_d0);
+            deinterleave_unalign_store(tmpd1, nqTot, outptr_d1);
+            deinterleave_unalign_store(tmpd2, nqTot, outptr_d2);
 
             inptr += nqBlocks;
+            df_ptr += dfSize;
             outptr_d0 += nqBlocks;
             outptr_d1 += nqBlocks;
             outptr_d2 += nqBlocks;
         }
+        // last (possibly padded) block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Results written to out_d0, out_d1, out_d2
+            PhysDerivTensor3DKernel(nq0, nq1, nq2, tmpIn, this->m_D[0],
+                                    this->m_D[1], this->m_D[2], tmpd0, tmpd1,
+                                    tmpd2);
+            // Calculate physical derivative
+            PhysDeriv3DKernel<DEFORMED>(
+                nq0, nq1, nq2, this->m_Z[0], this->m_Z[1], this->m_Z[2], df_ptr,
+                df_tmp, wsp0, wsp1, tmpd0, tmpd1, tmpd2);
+
+            // De-interleave and store only the non-padded part
+            deinterleave_store(tmpd0, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d0);
+            deinterleave_store(tmpd1, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d1);
+            deinterleave_store(tmpd2, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d2);
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
@@ -472,15 +616,18 @@ struct PhysDerivTemplate
         std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpd0(nqTot),
             tmpd1(nqTot), tmpd2(nqTot), wsp0(wsp0Size), wsp1(wsp1Size);
 
-        const vec_t *df_ptr = {};
+        const vec_t *df_ptr = &((*this->m_df)[0]);
         vec_t df_tmp[ndf]   = {}; // ndf is a constexpr
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];
 
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Interleave directly from the (possibly unaligned) input
+            // array; replaces the former copy into an aligned buffer
+            // followed by load_interleave.
+            load_unalign_interleave(inptr, nqTot, tmpIn);
 
             // Results written to out_d0, out_d1, out_d2
             PhysDerivTensor3DKernel(nq0, nq1, nq2, tmpIn, this->m_D[0],
@@ -492,15 +639,39 @@ struct PhysDerivTemplate
                 df_tmp, wsp0, wsp1, tmpd0, tmpd1, tmpd2);
 
             // de-interleave and store data
-            deinterleave_store(tmpd0, nqTot, outptr_d0);
-            deinterleave_store(tmpd1, nqTot, outptr_d1);
-            deinterleave_store(tmpd2, nqTot, outptr_d2);
+            deinterleave_unalign_store(tmpd0, nqTot, outptr_d0);
+            deinterleave_unalign_store(tmpd1, nqTot, outptr_d1);
+            deinterleave_unalign_store(tmpd2, nqTot, outptr_d2);
 
             inptr += nqBlocks;
+            df_ptr += dfSize;
             outptr_d0 += nqBlocks;
             outptr_d1 += nqBlocks;
             outptr_d2 += nqBlocks;
         }
+        // last (possibly padded) block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Results written to out_d0, out_d1, out_d2
+            PhysDerivTensor3DKernel(nq0, nq1, nq2, tmpIn, this->m_D[0],
+                                    this->m_D[1], this->m_D[2], tmpd0, tmpd1,
+                                    tmpd2);
+            // Calculate physical derivative
+            PhysDeriv3DKernel<DEFORMED>(
+                nq0, nq1, nq2, this->m_Z[0], this->m_Z[1], this->m_Z[2], df_ptr,
+                df_tmp, wsp0, wsp1, tmpd0, tmpd1, tmpd2);
+
+            // De-interleave and store only the non-padded part
+            deinterleave_store(tmpd0, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d0);
+            deinterleave_store(tmpd1, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d1);
+            deinterleave_store(tmpd2, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d2);
+        }
     }
 
 #endif
diff --git a/library/MatrixFreeOps/PhysInterp1DScaled.h b/library/MatrixFreeOps/PhysInterp1DScaled.h
deleted file mode 100644
index 9658bfb22729bdd2a80b5af1e1393736744d737a..0000000000000000000000000000000000000000
--- a/library/MatrixFreeOps/PhysInterp1DScaled.h
+++ /dev/null
@@ -1,359 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//
-// File: PhysInterp1DScaled.h
-//
-// For more information, please see: http://www.nektar.info
-//
-// The MIT License
-//
-// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
-// Department of Aeronautics, Imperial College London (UK), and Scientific
-// Computing and Imaging Institute, University of Utah (USA).
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included
-// in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS IN THE SOFTWARE.
-//
-// Description:
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_H
-#define NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_H
-
-#include <LibUtilities/BasicUtils/ShapeType.hpp>
-#include <LibUtilities/BasicUtils/SharedArray.hpp>
-#include <LibUtilities/Foundations/Basis.h>
-
-#include "Operator.hpp"
-#include "PhysInterp1DScaledKernels.hpp"
-
-// As each opertor has seven shapes over three dimension to get to the
-// "work" each operator uses a series of preprocessor directives based
-// on the dimension and shape type so to limit the code
-// inclusion. This keeps the library size as minimal as possible while
-// removing runtime conditionals.
-
-// The preprocessor directives, SHAPE_DIMENSION_?D and SHAPE_TYPE_*
-// are constructed by CMake in the CMakeLists.txt file which uses an
-// implementation file.  See the CMakeLists.txt files for more
-// details.
-namespace Nektar::MatrixFree
-{
-
-template <LibUtilities::ShapeType SHAPE_TYPE, bool DEFORMED = false>
-struct PhysInterp1DScaledTemplate
-    : public PhysInterp1DScaled,
-      public Helper<LibUtilities::ShapeTypeDimMap[SHAPE_TYPE]>
-{
-    PhysInterp1DScaledTemplate(std::vector<LibUtilities::BasisSharedPtr> basis,
-                               int nElmt)
-        : PhysInterp1DScaled(basis, nElmt),
-          Helper<LibUtilities::ShapeTypeDimMap[SHAPE_TYPE]>(basis, nElmt)
-    {
-        constexpr auto DIM = LibUtilities::ShapeTypeDimMap[SHAPE_TYPE];
-
-        if (DIM == 1)
-        {
-            m_nmTot = LibUtilities::GetNumberOfCoefficients(SHAPE_TYPE,
-                                                            this->m_nm[0]);
-        }
-        else if (DIM == 2)
-        {
-            m_nmTot = LibUtilities::GetNumberOfCoefficients(
-                SHAPE_TYPE, this->m_nm[0], this->m_nm[1]);
-        }
-        else if (DIM == 3)
-        {
-            m_nmTot = LibUtilities::GetNumberOfCoefficients(
-                SHAPE_TYPE, this->m_nm[0], this->m_nm[1], this->m_nm[2]);
-        }
-    }
-
-    static std::shared_ptr<Operator> Create(
-        std::vector<LibUtilities::BasisSharedPtr> basis, int nElmt)
-    {
-        return std::make_shared<PhysInterp1DScaledTemplate<SHAPE_TYPE>>(basis,
-                                                                        nElmt);
-    }
-
-    void operator()(const Array<OneD, const NekDouble> &input,
-                    Array<OneD, NekDouble> &output) final
-    {
-#include "SwitchNodesPoints.h"
-    }
-
-    // There must be separate 1D, 2D, and 3D operators because the
-    // helper base class is based on the shape type dim. Thus indices
-    // for the basis must be respected.
-
-    // Further there are duplicate 1D, 2D, and 3D operators so to
-    // allow for compile time array sizes to be part of the template
-    // and thus gain loop unrolling. The only difference is the
-    // location of the size value, in the template or the function:
-    // foo<int bar>() {} vs foo() { int bar = ... }
-
-#if defined(SHAPE_DIMENSION_1D)
-
-    // Non-size based operator.
-    void operator1D(const Array<OneD, const NekDouble> &input,
-                    Array<OneD, NekDouble> &output)
-    {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nq0 = m_basis[0]->GetNumPoints();
-
-        const auto nqTot    = nq0;
-        const auto nqBlocks = nqTot * vec_t::width;
-        const auto nmBlocks = m_nmTot * vec_t::width;
-
-        auto *inptr  = &input[0];
-        auto *outptr = &output[0];
-
-        // Workspace for kernels - also checks preconditions
-        PhysInterp1DScaled1DWorkspace<SHAPE_TYPE>(nm0, nq0);
-
-        std::vector<vec_t, allocator<vec_t>> tmpIn(m_nmTot), tmpOut(nqTot);
-
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
-
-            PhysInterp1DScaled1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn,
-                                                   this->m_bdata[0], tmpOut);
-
-            // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nqBlocks;
-        }
-    }
-
-    // Size based template version.
-    template <int nm0, int nq0>
-    void operator1D(const Array<OneD, const NekDouble> &input,
-                    Array<OneD, NekDouble> &output)
-    {
-        constexpr auto nqTot    = nq0;
-        constexpr auto nqBlocks = nqTot * vec_t::width;
-        const auto nmBlocks     = m_nmTot * vec_t::width;
-
-        auto *inptr  = &input[0];
-        auto *outptr = &output[0];
-
-        // Workspace for kernels - also checks preconditions
-        PhysInterp1DScaled1DWorkspace<SHAPE_TYPE>(nm0, nq0);
-
-        std::vector<vec_t, allocator<vec_t>> tmpIn(m_nmTot), tmpOut(nqTot);
-
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
-
-            PhysInterp1DScaled1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn,
-                                                   this->m_bdata[0], tmpOut);
-
-            // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nqBlocks;
-        }
-    }
-
-#elif defined(SHAPE_DIMENSION_2D)
-
-    // Non-size based operator.
-    void operator2D(const Array<OneD, const NekDouble> &input,
-                    Array<OneD, NekDouble> &output)
-    {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
-
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
-
-        const auto nqTot    = nq0 * nq1;
-        const auto nqBlocks = nqTot * vec_t::width;
-        const auto nmBlocks = m_nmTot * vec_t::width;
-
-        auto *inptr  = &input[0];
-        auto *outptr = &output[0];
-        const bool correct =
-            (m_basis[0]->GetBasisType() == LibUtilities::eModified_A);
-
-        // Workspace for kernels - also checks preconditions
-        size_t wsp0Size = 0;
-        PhysInterp1DScaled2DWorkspace<SHAPE_TYPE>(nm0, nm1, nq0, nq1, wsp0Size);
-
-        std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(m_nmTot),
-            tmpOut(nqTot);
-
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
-
-            PhysInterp1DScaled2DKernel<SHAPE_TYPE>(
-                nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0],
-                this->m_bdata[1], wsp0, tmpOut);
-
-            // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nqBlocks;
-        }
-    }
-
-    // Size based template version.
-    template <int nm0, int nm1, int nq0, int nq1>
-    void operator2D(const Array<OneD, const NekDouble> &input,
-                    Array<OneD, NekDouble> &output)
-    {
-        constexpr auto nqTot    = nq0 * nq1;
-        constexpr auto nqBlocks = nqTot * vec_t::width;
-        const auto nmBlocks     = m_nmTot * vec_t::width;
-
-        auto *inptr  = &input[0];
-        auto *outptr = &output[0];
-        const bool correct =
-            (m_basis[0]->GetBasisType() == LibUtilities::eModified_A);
-
-        // Workspace for kernels - also checks preconditions
-        size_t wsp0Size = 0;
-        PhysInterp1DScaled2DWorkspace<SHAPE_TYPE>(nm0, nm1, nq0, nq1, wsp0Size);
-
-        std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(m_nmTot),
-            tmpOut(nqTot);
-
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
-
-            PhysInterp1DScaled2DKernel<SHAPE_TYPE>(
-                nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0],
-                this->m_bdata[1], wsp0, tmpOut);
-
-            // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nqBlocks;
-        }
-    }
-
-#elif defined(SHAPE_DIMENSION_3D)
-
-    // Non-size based operator.
-    void operator3D(const Array<OneD, const NekDouble> &input,
-                    Array<OneD, NekDouble> &output)
-    {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
-        const auto nm2 = m_basis[2]->GetNumModes();
-
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
-        const auto nq2 = m_basis[2]->GetNumPoints();
-
-        const auto nqTot    = nq0 * nq1 * nq2;
-        const auto nqBlocks = nqTot * vec_t::width;
-        const auto nmBlocks = m_nmTot * vec_t::width;
-
-        auto *inptr  = &input[0];
-        auto *outptr = &output[0];
-
-        const bool correct =
-            (m_basis[0]->GetBasisType() == LibUtilities::eModified_A);
-
-        // Workspace for kernels - also checks preconditions
-        size_t wsp0Size = 0, wsp1Size = 0;
-        PhysInterp1DScaled3DWorkspace<SHAPE_TYPE>(nm0, nm1, nm2, nq0, nq1, nq2,
-                                                  wsp0Size, wsp1Size);
-
-        std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
-            tmpIn(m_nmTot), tmpOut(nqTot);
-
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
-
-            PhysInterp1DScaled3DKernel<SHAPE_TYPE>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut);
-
-            // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nqBlocks;
-        }
-    }
-
-    // Size based template version.
-    template <int nm0, int nm1, int nm2, int nq0, int nq1, int nq2>
-    void operator3D(const Array<OneD, const NekDouble> &input,
-                    Array<OneD, NekDouble> &output)
-    {
-        constexpr auto nqTot    = nq0 * nq1 * nq2;
-        constexpr auto nqBlocks = nqTot * vec_t::width;
-        const auto nmBlocks     = m_nmTot * vec_t::width;
-
-        auto *inptr  = &input[0];
-        auto *outptr = &output[0];
-        const bool correct =
-            (m_basis[0]->GetBasisType() == LibUtilities::eModified_A);
-
-        // Workspace for kernels - also checks preconditions
-        size_t wsp0Size = 0, wsp1Size = 0;
-        PhysInterp1DScaled3DWorkspace<SHAPE_TYPE>(nm0, nm1, nm2, nq0, nq1, nq2,
-                                                  wsp0Size, wsp1Size);
-
-        std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
-            tmpIn(m_nmTot), tmpOut(nqTot);
-
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
-
-            PhysInterp1DScaled3DKernel<SHAPE_TYPE>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut);
-
-            // de-interleave and store data
-            deinterleave_store(tmpOut, nqTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nqBlocks;
-        }
-    }
-
-#endif // SHAPE_DIMENSION
-
-private:
-    int m_nmTot;
-};
-
-} // namespace Nektar::MatrixFree
-
-#endif
diff --git a/library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp b/library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp
deleted file mode 100644
index 17708156f86fba40c31a4c4a99f55ae5b365a9d4..0000000000000000000000000000000000000000
--- a/library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp
+++ /dev/null
@@ -1,642 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//
-// File: PhysInterp1DScaledKernels.hpp
-//
-// For more information, please see: http://www.nektar.info
-//
-// The MIT License
-//
-// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
-// Department of Aeronautics, Imperial College London (UK), and Scientific
-// Computing and Imaging Institute, University of Utah (USA).
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included
-// in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS IN THE SOFTWARE.
-//
-// Description:
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_KERNELS_HPP
-#define NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_KERNELS_HPP
-
-#include <LibUtilities/BasicUtils/NekInline.hpp>
-
-namespace Nektar::MatrixFree
-{
-
-using namespace tinysimd;
-using vec_t = simd<NekDouble>;
-
-// The dimension and shape kernels. NOTE: They are NOT duplicate
-// templated version based on the array size like the
-// operators. HOWEVER, they are forced to be INLINED. The inlining is
-// critical so that when used in the templated version of the operator
-// that loop unrolling occurs.
-
-// The seven shape kernels where the work gets done.
-#if defined(SHAPE_TYPE_SEG)
-
-NEK_FORCE_INLINE static void PhysInterp1DScaledSegKernel(
-    const size_t nm0, const size_t nq0,
-    const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-    for (int i = 0; i < nq0; ++i)
-    {
-        vec_t tmp = in[0] * basis0[i]; // Load 2x
-
-        for (int p = 1; p < nm0; ++p)
-        {
-            tmp.fma(in[p], basis0[p * nq0 + i]); // Load 2x
-        }
-
-        out[i] = tmp; // Store 1x
-    }
-}
-
-#elif defined(SHAPE_TYPE_TRI)
-
-NEK_FORCE_INLINE static void PhysInterp1DScaledTriKernel(
-    const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1,
-    const bool correct, const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    const std::vector<vec_t, allocator<vec_t>> &basis1,
-    std::vector<vec_t, allocator<vec_t>> &p_sums, // nm0
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-    for (int eta1 = 0, eta_idx = 0; eta1 < nq1; ++eta1)
-    {
-        for (int p = 0, mode = 0; p < nm0; ++p)
-        {
-            vec_t p_sum = 0.0;
-
-            for (int q = 0; q < (nm1 - p); ++q, ++mode)
-            {
-                p_sum.fma(basis1[mode * nq1 + eta1], in[mode]);
-            }
-
-            p_sums[p] = p_sum; // Store 1x
-        }
-
-        // Already have q_sum at each quadrature point in eta 1 for
-        // each mode, p.  From this assemble the tensor produce of
-        // each quadrature point, eta1
-        for (int eta0 = 0; eta0 < nq0; ++eta0, ++eta_idx)
-        {
-            vec_t p_sum = 0.0;
-            for (int p = 0; p < nm0; ++p)
-            {
-                p_sum.fma(p_sums[p], basis0[p * nq0 + eta0]); // Load 2x
-            }
-
-            if (correct)
-            {
-                // p_sum += coef * basis0 * basis1
-                p_sum.fma(in[1] * basis0[nq0 + eta0], basis1[nq1 + eta1]);
-            }
-
-            out[eta_idx] = p_sum;
-        }
-    }
-}
-
-#elif defined(SHAPE_TYPE_QUAD)
-
-NEK_FORCE_INLINE static void PhysInterp1DScaledQuadKernel(
-    const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1,
-    const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    const std::vector<vec_t, allocator<vec_t>> &basis1,
-    std::vector<vec_t, allocator<vec_t>> &wsp, // nq0 * nm1
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-    for (int i = 0, cnt_iq = 0; i < nq0; ++i)
-    {
-        for (int q = 0, cnt_pq = 0; q < nm1; ++q, ++cnt_iq)
-        {
-            vec_t tmp = in[cnt_pq] * basis0[i]; // Load 2x
-            ++cnt_pq;
-            for (int p = 1; p < nm0; ++p, ++cnt_pq)
-            {
-                tmp.fma(in[cnt_pq], basis0[p * nq0 + i]); // Load 2x
-            }
-            wsp[cnt_iq] = tmp; // Store 1x
-        }
-    }
-
-    for (int j = 0, cnt_ij = 0; j < nq1; ++j)
-    {
-        for (int i = 0, cnt_iq = 0; i < nq0; ++i, ++cnt_ij)
-        {
-            vec_t tmp = wsp[cnt_iq] * basis1[j]; // Load 2x
-            ++cnt_iq;
-            for (int q = 1; q < nm1; ++q, ++cnt_iq)
-            {
-                tmp.fma(wsp[cnt_iq], basis1[q * nq1 + j]); // Load 2x
-            }
-            out[cnt_ij] = tmp; // Store 1x
-        }
-    }
-}
-
-#elif defined(SHAPE_TYPE_HEX)
-
-NEK_FORCE_INLINE static void PhysInterp1DScaledHexKernel(
-    const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0,
-    const size_t nq1, const size_t nq2,
-    const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    const std::vector<vec_t, allocator<vec_t>> &basis1,
-    const std::vector<vec_t, allocator<vec_t>> &basis2,
-    std::vector<vec_t, allocator<vec_t>> &sum_irq, // nq0 * nm2 * nm1
-    std::vector<vec_t, allocator<vec_t>> &sum_jir, // nq1 * nq0 * nm2
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-    for (int i = 0, cnt_irq = 0; i < nq0; ++i)
-    {
-        for (int r = 0, cnt_rqp = 0; r < nm2; ++r)
-        {
-            for (int q = 0; q < nm1; ++q, ++cnt_irq)
-            {
-                vec_t tmp = in[cnt_rqp] * basis0[i];
-                ++cnt_rqp;
-
-                for (int p = 1; p < nm0; ++p, ++cnt_rqp)
-                {
-                    tmp.fma(in[cnt_rqp], basis0[p * nq0 + i]);
-                }
-
-                sum_irq[cnt_irq] = tmp;
-            }
-        }
-    }
-
-    for (int j = 0, cnt_jir = 0; j < nq1; ++j)
-    {
-        for (int i = 0, cnt_irq = 0; i < nq0; ++i)
-        {
-            for (int r = 0; r < nm2; ++r, ++cnt_jir)
-            {
-                vec_t tmp = sum_irq[cnt_irq] * basis1[j];
-                ++cnt_irq;
-
-                for (int q = 1; q < nm1; ++q)
-                {
-                    tmp.fma(sum_irq[cnt_irq++], basis1[q * nq1 + j]);
-                }
-
-                sum_jir[cnt_jir] = tmp;
-            }
-        }
-    }
-
-    for (int k = 0, cnt_kji = 0; k < nq2; ++k)
-    {
-        for (int j = 0, cnt_jir = 0; j < nq1; ++j)
-        {
-            for (int i = 0; i < nq0; ++i, ++cnt_kji)
-            {
-                vec_t tmp = sum_jir[cnt_jir] * basis2[k];
-                ++cnt_jir;
-
-                for (int r = 1; r < nm2; ++r)
-                {
-                    tmp.fma(sum_jir[cnt_jir++], basis2[r * nq2 + k]);
-                }
-
-                out[cnt_kji] = tmp;
-            }
-        }
-    }
-}
-
-#elif defined(SHAPE_TYPE_TET)
-
-NEK_FORCE_INLINE static void PhysInterp1DScaledTetKernel(
-    const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0,
-    const size_t nq1, const size_t nq2, const bool correct,
-    const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    const std::vector<vec_t, allocator<vec_t>> &basis1,
-    const std::vector<vec_t, allocator<vec_t>> &basis2,
-    std::vector<vec_t, allocator<vec_t>> &fpq, // nm0 * nm1
-    std::vector<vec_t, allocator<vec_t>> &fp,  // nm0
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-    for (int k = 0, cnt_kji = 0; k < nq2; ++k)
-    {
-        int cnt_pq = 0, mode = 0;
-        for (int p = 0; p < nm0; ++p)
-        {
-            for (int q = 0; q < nm1 - p; ++q, ++cnt_pq)
-            {
-                vec_t prod = in[mode] * basis2[k + nq2 * mode]; // Load 2x
-                ++mode;
-
-                for (int r = 1; r < nm2 - p - q; ++r, ++mode)
-                {
-                    vec_t inxmm = in[mode];                  // Load 1x
-                    prod.fma(inxmm, basis2[k + nq2 * mode]); // Load 1x
-                }
-
-                fpq[cnt_pq] = prod; // Store 1x
-            }
-
-            // increment mode in case order1!=order2
-            for (int q = nm1 - p; q < nm2 - p; ++q)
-            {
-                mode += nm2 - p - q;
-            }
-        }
-
-        for (int j = 0; j < nq1; ++j)
-        {
-            mode = cnt_pq = 0;
-            for (int p = 0; p < nm0; ++p)
-            {
-                vec_t prod = fpq[cnt_pq] * basis1[mode * nq1 + j]; // Load 2x
-                ++cnt_pq;
-
-                for (int q = 1; q < nm1 - p; ++q, ++cnt_pq)
-                {
-                    prod.fma(fpq[cnt_pq],
-                             basis1[(mode + q) * nq1 + j]); // Load 2x
-                }
-
-                fp[p] = prod; // Store 1x
-                mode += nm1 - p;
-            }
-
-            for (int i = 0; i < nq0; ++i, ++cnt_kji)
-            {
-                vec_t tmp = basis0[i] * fp[0]; // Load 2x
-
-                for (int p = 1; p < nm0; ++p)
-                {
-                    tmp.fma(basis0[p * nq0 + i], fp[p]); // Load 2x
-                }
-
-                if (correct)
-                {
-                    // top vertex
-                    //
-                    // sum += inarray[1] * base2[nquad2 + k] * (
-                    //     base0[i] * base1[nquad1+j] +
-                    //     base0[nquad0+i] * base1[j] +
-                    //     base0[nquad0+i] * base1[nquad1+j]);
-
-                    vec_t tmp1 = basis0[i] * basis1[nq1 + j];   // Load 2x
-                    tmp1.fma(basis0[nq0 + i], basis1[j]);       // Load 2x
-                    tmp1.fma(basis0[nq0 + i], basis1[nq1 + j]); // Load 2x
-                    tmp1 = tmp1 * basis2[nq2 + k];              // Load 1x
-
-                    vec_t inarray1 = in[1]; // Load 1x
-                    tmp.fma(tmp1, inarray1);
-
-                    // bottom vertex
-                    //
-                    // sum += inarray[order2] * base2[k] * (
-                    //     base0[nquad0+i] * base1[nquad1+j]);
-                    tmp1     = basis0[nq0 + i] * basis1[nq1 + j]; // Load 2x
-                    tmp1     = tmp1 * basis2[k];                  // Load 1x
-                    inarray1 = in[nm2];                           // Load 1x
-                    tmp.fma(inarray1, tmp1);
-
-                    // singular edge
-                    for (int r = 1; r < nm2 - 1; ++r)
-                    {
-                        // sum += inarray[order2+r] * base2[(r+1)*nquad2+k] *
-                        //     base1[nquad1+j] * base0[nquad0+i];
-                        tmp1     = basis1[nq1 + j] * basis0[nq0 + i]; // Load 2x
-                        tmp1     = tmp1 * basis2[(r + 1) * nq2 + k];  // Load 1x
-                        inarray1 = in[nm2 + r];                       // Load 1x
-                        tmp.fma(inarray1, tmp1);
-                        // multiply by (1-a)/2
-                    }
-                }
-
-                out[cnt_kji] = tmp; // Store 1x
-            }
-        }
-    }
-}
-
-#elif defined(SHAPE_TYPE_PRISM)
-
-NEK_FORCE_INLINE static void PhysInterp1DScaledPrismKernel(
-    const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0,
-    const size_t nq1, const size_t nq2, const bool correct,
-    const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    const std::vector<vec_t, allocator<vec_t>> &basis1,
-    const std::vector<vec_t, allocator<vec_t>> &basis2,
-    std::vector<vec_t, allocator<vec_t>> &fpq, // nm0 * nm1
-    std::vector<vec_t, allocator<vec_t>> &fp,  // nm0
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-    for (int k = 0, cnt_kji = 0; k < nq2; ++k)
-    {
-        int mode_pqr = 0, mode_pq = 0, mode_pr = 0;
-        for (int p = 0; p < nm0; ++p)
-        {
-            for (int q = 0; q < nm1; ++q, ++mode_pq)
-            {
-                vec_t prod = 0.0;
-                for (int r = 0; r < nm2 - p; ++r, ++mode_pqr)
-                {
-                    vec_t coef = in[mode_pqr];                       // Load 1x
-                    prod.fma(coef, basis2[(mode_pr + r) * nq2 + k]); // Load 1x
-                }
-
-                fpq[mode_pq] = prod; // Store 1x
-            }
-
-            mode_pr += nm2 - p;
-        }
-
-        for (int j = 0; j < nq1; ++j)
-        {
-            mode_pq = 0;
-            for (int p = 0; p < nm0; ++p)
-            {
-                vec_t prod = 0.0;
-                for (int q = 0; q < nm1; ++q, ++mode_pq)
-                {
-                    prod.fma(fpq[mode_pq], basis1[q * nq1 + j]); // Load 2x
-                }
-                fp[p] = prod; // Store 1x
-            }
-
-            for (int i = 0; i < nq0; ++i, ++cnt_kji)
-            {
-
-                vec_t val_kji = 0.0;
-                for (int p = 0; p < nm0; ++p)
-                {
-                    val_kji.fma(fp[p], basis0[p * nq0 + i]); // Load 2x
-                }
-
-                if (correct)
-                {
-                    vec_t basis_2 = basis2[nq2 + k]; // Load 1x
-                    vec_t basis_0 = basis0[nq0 + i]; // Load 1x
-
-                    for (int q = 0; q < nm1; ++q)
-                    {
-                        vec_t coef_0q1 = in[q * nm2 + 1];     // Load 1x
-                        vec_t basis_1  = basis1[q * nq1 + j]; // Load 1x
-                        val_kji.fma(basis_2 * basis_1, basis_0 * coef_0q1);
-                    }
-                }
-                out[cnt_kji] = val_kji; // store 1x
-            }
-        }
-    }
-}
-
-#elif defined(SHAPE_TYPE_PYR)
-
-NEK_FORCE_INLINE static void PhysInterp1DScaledPyrKernel(
-    const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0,
-    const size_t nq1, const size_t nq2, const bool correct,
-    const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    const std::vector<vec_t, allocator<vec_t>> &basis1,
-    const std::vector<vec_t, allocator<vec_t>> &basis2,
-    std::vector<vec_t, allocator<vec_t>> &fpq, // nm0 * nm1
-    std::vector<vec_t, allocator<vec_t>> &fp,  // nm0
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-    for (int k = 0, cnt_kji = 0; k < nq2; ++k)
-    {
-        int mode_pqr = 0, mode_pq = 0;
-        for (int p = 0; p < nm0; ++p)
-        {
-            for (int q = 0; q < p; ++q, ++mode_pq)
-            {
-                vec_t prod = 0.0;
-                for (int r = 0; r < nm2 - p; ++r, ++mode_pqr)
-                {
-                    vec_t coef = in[mode_pqr];                  // Load 1x
-                    prod.fma(coef, basis2[mode_pqr * nq2 + k]); // Load 1x
-                }
-                fpq[mode_pq] = prod; // Store 1x
-            }
-
-            for (int q = p; q < nm1; ++q, ++mode_pq)
-            {
-                vec_t prod = 0.0;
-                for (int r = 0; r < nm2 - q; ++r, ++mode_pqr)
-                {
-                    vec_t coef = in[mode_pqr];                  // Load 1x
-                    prod.fma(coef, basis2[mode_pqr * nq2 + k]); // Load 1x
-                }
-
-                fpq[mode_pq] = prod; // Store 1x
-            }
-
-            // increment mode in case nm2>nm1
-            for (int q = nm1; q < nm2 - p; ++q)
-            {
-                mode_pqr += nm2 - q;
-            }
-        }
-
-        for (int j = 0; j < nq1; ++j)
-        {
-            mode_pq = 0;
-            for (int p = 0; p < nm0; ++p)
-            {
-                vec_t prod = 0.0;
-                for (int q = 0; q < nm1; ++q, ++mode_pq)
-                {
-                    prod.fma(fpq[mode_pq], basis1[q * nq1 + j]); // Load 2x
-                }
-                fp[p] = prod; // Store 1x
-            }
-
-            for (int i = 0; i < nq0; ++i, ++cnt_kji)
-            {
-                vec_t val_kji = 0.0;
-                for (int p = 0; p < nm0; ++p)
-                {
-                    val_kji.fma(fp[p], basis0[p * nq0 + i]); // Load 2x
-                }
-
-                if (correct)
-                {
-                    // top vertex
-                    //
-                    // sum += inarray[1] * base2[nquad2 + k] * (
-                    //     base0[i] * base1[nquad1+j] +
-                    //     base0[nquad0+i] * base1[j] +
-                    //     base0[nquad0+i] * base1[nquad1+j]);
-                    vec_t tmp1 = basis0[i] * basis1[nq1 + j];   // Load 2x
-                    tmp1.fma(basis0[nq0 + i], basis1[j]);       // Load 2x
-                    tmp1.fma(basis0[nq0 + i], basis1[nq1 + j]); // Load 2x
-                    tmp1 = tmp1 * basis2[nq2 + k];              // Load 1x
-
-                    vec_t inarray1 = in[1]; // Load 1x
-                    val_kji.fma(tmp1, inarray1);
-                }
-                out[cnt_kji] = val_kji; // store 1x
-            }
-        }
-    }
-}
-
-#endif // SHAPE_TYPE
-
-// Workspace - used to dynamically get the workspace size needed for
-// temporary memory.
-#if defined(SHAPE_DIMENSION_1D)
-
-template <LibUtilities::ShapeType SHAPE_TYPE>
-NEK_FORCE_INLINE static void PhysInterp1DScaled1DWorkspace(
-    [[maybe_unused]] const size_t nm0, [[maybe_unused]] const size_t nq0)
-
-{
-}
-
-#elif defined(SHAPE_DIMENSION_2D)
-
-template <LibUtilities::ShapeType SHAPE_TYPE>
-NEK_FORCE_INLINE static void PhysInterp1DScaled2DWorkspace(
-    [[maybe_unused]] const size_t nm0, [[maybe_unused]] const size_t nm1,
-    [[maybe_unused]] const size_t nq0, [[maybe_unused]] const size_t nq1,
-    size_t &wsp0Size)
-{
-    // Check preconditions
-    ASSERTL1((SHAPE_TYPE == LibUtilities::ShapeType::Tri && nm0 == nm1 &&
-              nq0 == nq1 + 1) ||
-                 (SHAPE_TYPE == LibUtilities::ShapeType::Quad && nm0 == nm1 &&
-                  nq0 == nq1),
-             "PhysInterp1DScaled2DWorkspace: Requires homogenous points.");
-
-#if defined(SHAPE_TYPE_TRI)
-    wsp0Size = std::max(wsp0Size, nm0);
-#elif defined(SHAPE_TYPE_QUAD)
-    wsp0Size = std::max(wsp0Size, nm0 * nq0);
-#endif
-}
-
-#elif defined(SHAPE_DIMENSION_3D)
-
-template <LibUtilities::ShapeType SHAPE_TYPE>
-NEK_FORCE_INLINE static void PhysInterp1DScaled3DWorkspace(
-    [[maybe_unused]] const size_t nm0, [[maybe_unused]] const size_t nm1,
-    [[maybe_unused]] const size_t nm2, [[maybe_unused]] const size_t nq0,
-    [[maybe_unused]] const size_t nq1, [[maybe_unused]] const size_t nq2,
-    size_t &wsp0Size, size_t &wsp1Size)
-{
-    // Check preconditions
-    ASSERTL1((SHAPE_TYPE == LibUtilities::ShapeType::Hex && nm0 == nm1 &&
-              nm0 == nm2 && nq0 == nq1 && nq0 == nq2) ||
-                 (SHAPE_TYPE == LibUtilities::ShapeType::Tet && nm0 == nm1 &&
-                  nm0 == nm2 && nq0 == nq1 + 1 && nq0 == nq2 + 1) ||
-                 (SHAPE_TYPE == LibUtilities::ShapeType::Pyr && nm0 == nm1 &&
-                  nm0 == nm2 && nq0 == nq1 && nq0 == nq2 + 1) ||
-                 (SHAPE_TYPE == LibUtilities::ShapeType::Prism && nm0 == nm1 &&
-                  nm0 == nm2 && nq0 == nq1 && nq0 == nq2 + 1),
-             "PhysInterp1DScaled3DWorkspace: Requires homogenous points.");
-
-#if defined(SHAPE_TYPE_HEX)
-    wsp0Size = std::max(wsp0Size, nq0 * nm1 * nm2); // nm1 == nm2
-    wsp1Size = std::max(wsp1Size, nq0 * nq1 * nm2); // nq0 == nq1
-#elif defined(SHAPE_TYPE_TET) || defined(SHAPE_TYPE_PRISM) ||                  \
-    defined(SHAPE_TYPE_PYR)
-    wsp0Size = std::max(wsp0Size, nm0 * nm1); // nm0 == nm1 == nm2
-    wsp1Size = std::max(wsp1Size, nm0);
-#endif
-}
-
-#endif // SHAPE_DIMENSION
-
-// The dimension kernels which select the shape kernel.
-#if defined(SHAPE_DIMENSION_1D)
-
-template <LibUtilities::ShapeType SHAPE_TYPE>
-NEK_FORCE_INLINE static void PhysInterp1DScaled1DKernel(
-    const size_t nm0, const size_t nq0,
-    const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-#if defined(SHAPE_TYPE_SEG)
-    PhysInterp1DScaledSegKernel(nm0, nq0, in, basis0, out);
-#endif
-}
-
-#elif defined(SHAPE_DIMENSION_2D)
-
-template <LibUtilities::ShapeType SHAPE_TYPE>
-NEK_FORCE_INLINE static void PhysInterp1DScaled2DKernel(
-    const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1,
-    [[maybe_unused]] const bool correct,
-    const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    const std::vector<vec_t, allocator<vec_t>> &basis1,
-    std::vector<vec_t, allocator<vec_t>> &wsp0,
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-#if defined(SHAPE_TYPE_TRI)
-    PhysInterp1DScaledTriKernel(nm0, nm1, nq0, nq1, correct, in, basis0, basis1,
-                                wsp0, out);
-
-#elif defined(SHAPE_TYPE_QUAD)
-    PhysInterp1DScaledQuadKernel(nm0, nm1, nq0, nq1, in, basis0, basis1, wsp0,
-                                 out);
-#endif
-}
-
-#elif defined(SHAPE_DIMENSION_3D)
-
-template <LibUtilities::ShapeType SHAPE_TYPE>
-NEK_FORCE_INLINE static void PhysInterp1DScaled3DKernel(
-    const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0,
-    const size_t nq1, const size_t nq2, [[maybe_unused]] const bool correct,
-    const std::vector<vec_t, allocator<vec_t>> &in,
-    const std::vector<vec_t, allocator<vec_t>> &basis0,
-    const std::vector<vec_t, allocator<vec_t>> &basis1,
-    const std::vector<vec_t, allocator<vec_t>> &basis2,
-    std::vector<vec_t, allocator<vec_t>> &wsp0,
-    std::vector<vec_t, allocator<vec_t>> &wsp1,
-    std::vector<vec_t, allocator<vec_t>> &out)
-{
-#if defined(SHAPE_TYPE_HEX)
-    PhysInterp1DScaledHexKernel(nm0, nm1, nm2, nq0, nq1, nq2, in, basis0,
-                                basis1, basis2, wsp0, wsp1, out);
-#elif defined(SHAPE_TYPE_TET)
-    PhysInterp1DScaledTetKernel(nm0, nm1, nm2, nq0, nq1, nq2, correct, in,
-                                basis0, basis1, basis2, wsp0, wsp1, out);
-#elif defined(SHAPE_TYPE_PRISM)
-    PhysInterp1DScaledPrismKernel(nm0, nm1, nm2, nq0, nq1, nq2, correct, in,
-                                  basis0, basis1, basis2, wsp0, wsp1, out);
-#elif defined(SHAPE_TYPE_PYR)
-    PhysInterp1DScaledPyrKernel(nm0, nm1, nm2, nq0, nq1, nq2, correct, in,
-                                basis0, basis1, basis2, wsp0, wsp1, out);
-#endif
-}
-
-#endif // SHAPE_DIMENSION
-
-} // namespace Nektar::MatrixFree
-
-#endif
diff --git a/solvers/IncNavierStokesSolver/Tests/Perf_ChanFlow_3DH1D_pRef.tst b/solvers/IncNavierStokesSolver/Tests/Perf_ChanFlow_3DH1D_pRef.tst
index a7d0fe0a19c4d6120486c60b25fbf805426e9fe8..6e67b585004b41b1dd6d116222989424b86257cf 100644
--- a/solvers/IncNavierStokesSolver/Tests/Perf_ChanFlow_3DH1D_pRef.tst
+++ b/solvers/IncNavierStokesSolver/Tests/Perf_ChanFlow_3DH1D_pRef.tst
@@ -20,7 +20,7 @@
             <value variable="p" tolerance="1e-7">0.0671074</value>
         </metric>
         <metric type="ExecutionTime" id="3">
-            <value tolerance="8e-1" hostname="42.debian-bullseye-performance-build-and-test">13.3</value>
+            <value tolerance="8e-1" hostname="42.debian-bullseye-performance-build-and-test">12.942</value>
         </metric>
     </metrics>
 </test>