From 4aee847346f8dd3f52d8ddeece469913c80e9a67 Mon Sep 17 00:00:00 2001
From: Alexandra Liosi <a.liosi22@imperial.ac.uk>
Date: Fri, 13 Sep 2024 21:03:28 +0000
Subject: [PATCH] Matrix-Free PhysInterp1DScaled

---
 CHANGELOG.md                                  |   1 +
 library/Collections/BwdTrans.cpp              |   2 +
 library/Collections/Helmholtz.cpp             |   5 +
 library/Collections/IProductWRTBase.cpp       |   3 +
 library/Collections/IProductWRTDerivBase.cpp  |   5 +
 .../LinearAdvectionDiffusionReaction.cpp      |   6 +
 library/Collections/PhysDeriv.cpp             |   3 +
 library/Collections/PhysInterp1DScaled.cpp    | 353 ++++++------
 library/Demos/MultiRegions/Tests/test.opt     |  28 +
 library/MatrixFreeOps/BwdTrans.h              |  35 +-
 library/MatrixFreeOps/CMakeLists.txt          |   2 +-
 library/MatrixFreeOps/Helmholtz.h             |  45 +-
 library/MatrixFreeOps/IProduct.h              |  35 +-
 library/MatrixFreeOps/IProductWRTDerivBase.h  |  35 +-
 .../LinearAdvectionDiffusionReaction.h        |  57 +-
 library/MatrixFreeOps/Operator.hpp            | 124 ++++-
 library/MatrixFreeOps/PhysDeriv.h             |  37 +-
 library/MatrixFreeOps/PhysInterp1DScaled.h    | 501 ++++++++++++++++++
 .../PhysInterp1DScaledKernels.hpp             | 225 ++++++++
 library/MatrixFreeOps/SwitchNodesPoints.h     |  16 -
 library/MatrixFreeOps/SwitchPoints.h          |  10 -
 library/MultiRegions/ExpList.cpp              |   1 +
 .../Collections/TestHexCollection.cpp         | 189 ++++++-
 .../Collections/TestPrismCollection.cpp       | 202 +++++++
 .../Collections/TestPyrCollection.cpp         | 185 +++++++
 .../Collections/TestQuadCollection.cpp        | 153 ++++++
 .../Collections/TestSegCollection.cpp         | 140 +++++
 .../Collections/TestTetCollection.cpp         | 198 ++++++-
 .../Collections/TestTriCollection.cpp         | 171 +++++-
 29 files changed, 2414 insertions(+), 353 deletions(-)
 create mode 100644 library/MatrixFreeOps/PhysInterp1DScaled.h
 create mode 100644 library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ba97fbb72..874f99f113 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ v5.7.0
 - Modified MatrixFreeOp library  switch initialisation to use BOOST_PP (!1794)
 - Fix memory-leak with LowEnergyBlock preconditioner for time-updated matrices (!1627)
 - Fix Fourier expansion integration weights are related test (!1803)
+- Introduced the MatrixFree implementation for the PhysInterp1DScaled operator and tidying up MatrixFreeOps (!1812) 
 - Separate MeshGraph input/output functions into a new class (!1778)
 - Added checkpoint file writing start time in the fieldconvert filter (!1789)
 - Fix fieldconvert filter incorrect boundary values (!1789)
diff --git a/library/Collections/BwdTrans.cpp b/library/Collections/BwdTrans.cpp
index 2ee0f0a314..7581727dd7 100644
--- a/library/Collections/BwdTrans.cpp
+++ b/library/Collections/BwdTrans.cpp
@@ -209,6 +209,8 @@ private:
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
             op_string, basis, pCollExp.size());
 
+        oper->SetUpBdata(basis);
+
         m_oper = std::dynamic_pointer_cast<MatrixFree::BwdTrans>(oper);
         ASSERTL0(m_oper, "Failed to cast pointer.");
     }
diff --git a/library/Collections/Helmholtz.cpp b/library/Collections/Helmholtz.cpp
index 001b3bbdf9..3a9a430ca1 100644
--- a/library/Collections/Helmholtz.cpp
+++ b/library/Collections/Helmholtz.cpp
@@ -745,6 +745,11 @@ private:
         // Store derivative factor
         oper->SetDF(pGeomData->GetDerivFactorsInterLeave(pCollExp, m_nElmtPad));
 
+        oper->SetUpBdata(basis);
+        oper->SetUpDBdata(basis);
+        oper->SetUpZW(basis);
+        oper->SetUpD(basis);
+
         m_oper = std::dynamic_pointer_cast<MatrixFree::Helmholtz>(oper);
         ASSERTL0(m_oper, "Failed to cast pointer.");
 
diff --git a/library/Collections/IProductWRTBase.cpp b/library/Collections/IProductWRTBase.cpp
index d50bc39725..847e97f7b4 100644
--- a/library/Collections/IProductWRTBase.cpp
+++ b/library/Collections/IProductWRTBase.cpp
@@ -228,6 +228,9 @@ private:
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
             op_string, basis, pCollExp.size());
 
+        oper->SetUpBdata(basis);
+        oper->SetUpZW(basis);
+
         // Set Jacobian
         oper->SetJac(pGeomData->GetJacInterLeave(pCollExp, m_nElmtPad));
 
diff --git a/library/Collections/IProductWRTDerivBase.cpp b/library/Collections/IProductWRTDerivBase.cpp
index 534b85eb26..f708a7c40c 100644
--- a/library/Collections/IProductWRTDerivBase.cpp
+++ b/library/Collections/IProductWRTDerivBase.cpp
@@ -354,6 +354,11 @@ private:
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
             op_string, basis, pCollExp.size());
 
+        // set up required copies for operations
+        oper->SetUpBdata(basis);
+        oper->SetUpDBdata(basis);
+        oper->SetUpZW(basis);
+
         // Set Jacobian
         oper->SetJac(pGeomData->GetJacInterLeave(pCollExp, m_nElmtPad));
 
diff --git a/library/Collections/LinearAdvectionDiffusionReaction.cpp b/library/Collections/LinearAdvectionDiffusionReaction.cpp
index cc917c236a..2b2e671001 100644
--- a/library/Collections/LinearAdvectionDiffusionReaction.cpp
+++ b/library/Collections/LinearAdvectionDiffusionReaction.cpp
@@ -765,6 +765,12 @@ private:
         // Get N quadpoints with padding
         m_nqtot = m_numElmt * pCollExp[0]->GetStdExp()->GetTotPoints();
 
+        // set up required copies for operations
+        oper->SetUpBdata(basis);
+        oper->SetUpDBdata(basis);
+        oper->SetUpD(basis);
+        oper->SetUpZW(basis);
+
         // Set Jacobian
         oper->SetJac(pGeomData->GetJacInterLeave(pCollExp, m_nElmtPad));
 
diff --git a/library/Collections/PhysDeriv.cpp b/library/Collections/PhysDeriv.cpp
index 7c06af8051..958a16a8fd 100644
--- a/library/Collections/PhysDeriv.cpp
+++ b/library/Collections/PhysDeriv.cpp
@@ -362,6 +362,9 @@ private:
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
             op_string, basis, pCollExp.size());
 
+        oper->SetUpZW(basis);
+        oper->SetUpD(basis);
+
         // Set derivative factors
         oper->SetDF(pGeomData->GetDerivFactorsInterLeave(pCollExp, m_nElmtPad));
 
diff --git a/library/Collections/PhysInterp1DScaled.cpp b/library/Collections/PhysInterp1DScaled.cpp
index 1f7c4bdb18..c550866de3 100644
--- a/library/Collections/PhysInterp1DScaled.cpp
+++ b/library/Collections/PhysInterp1DScaled.cpp
@@ -63,14 +63,12 @@ class PhysInterp1DScaled_Helper : virtual public Operator
 {
 public:
     void UpdateFactors(StdRegions::FactorMap factors) override
-
     {
         if (factors == m_factors)
         {
             return;
         }
         m_factors = factors;
-
         // Set scaling factor for the PhysInterp1DScaled function
         [[maybe_unused]] auto x = factors.find(StdRegions::eFactorConst);
         ASSERTL1(
@@ -78,6 +76,15 @@ public:
             "Constant factor not defined: " +
                 std::string(
                     StdRegions::ConstFactorTypeMap[StdRegions::eFactorConst]));
+
+        NekDouble scale = m_factors[StdRegions::eFactorConst];
+
+        int shape_dimension = m_stdExp->GetShapeDimension();
+        m_outputSize        = m_numElmt; // initializing m_outputSize
+        for (int i = 0; i < shape_dimension; ++i)
+        {
+            m_outputSize *= (int)(m_stdExp->GetNumPoints(i) * scale);
+        }
     }
 
 protected:
@@ -93,7 +100,6 @@ protected:
         {
             scale = 1.5;
         }
-
         // expect input to be number of elements by the number of quad points
         m_inputSize = m_numElmt * m_stdExp->GetTotPoints();
         // expect input to be number of elements by the number of quad points
@@ -104,83 +110,14 @@ protected:
             m_outputSize *= (int)(m_stdExp->GetNumPoints(i) * scale);
         }
     }
+
     StdRegions::FactorMap
         m_factors; // map for storing the scaling factor of the operator
 };
 
 /**
- * @brief PhysInterp1DScaled operator using standard matrix approach. Currently,
- * it is an identical copy of the BwdTrans operator.
+ * @brief PhysInterp1DScaled operator using matrix free implementation.
  */
-
-class PhysInterp1DScaled_StdMat final : virtual public Operator,
-                                        PhysInterp1DScaled_Helper
-{
-public:
-    OPERATOR_CREATE(PhysInterp1DScaled_StdMat)
-    ~PhysInterp1DScaled_StdMat() final
-    {
-    }
-
-    void operator()(const Array<OneD, const NekDouble> &input,
-                    Array<OneD, NekDouble> &output,
-                    [[maybe_unused]] Array<OneD, NekDouble> &output1,
-                    [[maybe_unused]] Array<OneD, NekDouble> &output2,
-                    [[maybe_unused]] Array<OneD, NekDouble> &wsp) override
-    {
-        Blas::Dgemm('N', 'N', m_mat->GetRows(), m_numElmt, m_mat->GetColumns(),
-                    1.0, m_mat->GetRawPtr(), m_mat->GetRows(), input.get(),
-                    m_stdExp->GetNcoeffs(), 0.0, output.get(),
-                    m_stdExp->GetTotPoints());
-    }
-
-    void operator()([[maybe_unused]] int dir,
-                    [[maybe_unused]] const Array<OneD, const NekDouble> &input,
-                    [[maybe_unused]] Array<OneD, NekDouble> &output,
-                    [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
-    {
-        ASSERTL0(false, "Not valid for this operator.");
-    }
-
-    void UpdateFactors(StdRegions::FactorMap factors) override
-    {
-        if (factors == m_factors)
-        {
-            return;
-        }
-
-        m_factors = factors;
-
-        // Set scaling factor for the PhysInterp1DScaled function
-        [[maybe_unused]] auto x = factors.find(StdRegions::eFactorConst);
-        ASSERTL1(
-            x != factors.end(),
-            "Constant factor not defined: " +
-                std::string(
-                    StdRegions::ConstFactorTypeMap[StdRegions::eFactorConst]));
-    }
-
-protected:
-    DNekMatSharedPtr m_mat;
-    StdRegions::FactorMap m_factors; // Initially, PhysInterp1DScaled was taking
-                                     // as input a single scaling factor
-    // This single scaling factor will be expressed into a matrix where each
-    // element has the same value so that matrix - vector multiplication
-    // optimization by BLAS is implemented
-
-private:
-    PhysInterp1DScaled_StdMat(
-        vector<StdRegions::StdExpansionSharedPtr> pCollExp,
-        CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors)
-        : Operator(pCollExp, pGeomData, factors), PhysInterp1DScaled_Helper()
-    {
-    }
-};
-
-/**
- * @brief PhysInterp1DScaled operator using matrix free operators.
- */
-
 class PhysInterp1DScaled_MatrixFree final : virtual public Operator,
                                             MatrixFreeBase,
                                             PhysInterp1DScaled_Helper
@@ -188,9 +125,7 @@ class PhysInterp1DScaled_MatrixFree final : virtual public Operator,
 public:
     OPERATOR_CREATE(PhysInterp1DScaled_MatrixFree)
 
-    ~PhysInterp1DScaled_MatrixFree() final
-    {
-    }
+    ~PhysInterp1DScaled_MatrixFree() final = default;
 
     void operator()(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output0,
@@ -207,30 +142,56 @@ public:
                     [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
     {
         NEKERROR(ErrorUtil::efatal,
-                 "BwdTrans_MatrixFree: Not valid for this operator.");
+                 "PhysInterp1DScaled_MatrixFree: Not valid for this operator.");
     }
 
     void UpdateFactors([[maybe_unused]] StdRegions::FactorMap factors) override
     {
-        ASSERTL0(false, "Not valid for this operator.");
+
+        // Set lambda for this call
+        auto x = factors.find(StdRegions::eFactorConst);
+        ASSERTL1(
+            x != factors.end(),
+            "Constant factor not defined: " +
+                std::string(
+                    StdRegions::ConstFactorTypeMap[StdRegions::eFactorConst]));
+        // Update the factors member inside PhysInterp1DScaled_MatrixFree
+        // class
+        m_factors = factors;
+
+        const auto dim = m_stdExp->GetShapeDimension();
+
+        // Definition of basis vector
+        std::vector<LibUtilities::BasisSharedPtr> basis(dim);
+        for (unsigned int i = 0; i < dim; ++i)
+        {
+            basis[i] = m_stdExp->GetBasis(i);
+        }
+
+        // Update the scaling factor inside MatrixFreeOps using the SetLamda
+        // routine
+        m_oper->SetScalingFactor(x->second);
+        m_oper->SetUpInterp1D(basis, m_factors[StdRegions::eFactorConst]);
+        m_nOut = m_nElmtPad * m_outputSize / m_numElmt;
     }
 
 private:
-    std::shared_ptr<MatrixFree::BwdTrans> m_oper;
+    std::shared_ptr<MatrixFree::PhysInterp1DScaled> m_oper;
 
     PhysInterp1DScaled_MatrixFree(
         vector<StdRegions::StdExpansionSharedPtr> pCollExp,
         CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors)
         : Operator(pCollExp, pGeomData, factors),
-          MatrixFreeBase(pCollExp[0]->GetStdExp()->GetNcoeffs(),
+          MatrixFreeBase(pCollExp[0]->GetStdExp()->GetTotPoints(),
                          pCollExp[0]->GetStdExp()->GetTotPoints(),
                          pCollExp.size()),
           PhysInterp1DScaled_Helper()
     {
-        // Basis vector.
         const auto dim = pCollExp[0]->GetStdExp()->GetShapeDimension();
+
+        // Definition of basis vector
         std::vector<LibUtilities::BasisSharedPtr> basis(dim);
-        for (auto i = 0; i < dim; ++i)
+        for (unsigned int i = 0; i < dim; ++i)
         {
             basis[i] = pCollExp[0]->GetBasis(i);
         }
@@ -239,73 +200,77 @@ private:
         auto shapeType = pCollExp[0]->GetStdExp()->DetShapeType();
 
         // Generate operator string and create operator.
-        std::string op_string = "BwdTrans";
+        std::string op_string = "PhysInterp1DScaled";
         op_string += MatrixFree::GetOpstring(shapeType, false);
         auto oper = MatrixFree::GetOperatorFactory().CreateInstance(
             op_string, basis, pCollExp.size());
 
-        m_oper = std::dynamic_pointer_cast<MatrixFree::BwdTrans>(oper);
+        m_oper =
+            std::dynamic_pointer_cast<MatrixFree::PhysInterp1DScaled>(oper);
         ASSERTL0(m_oper, "Failed to cast pointer.");
-    }
-};
-
-/**
- * @brief PhysInterp1DScaled operator using default StdRegions operator
- */
-
-class PhysInterp1DScaled_IterPerExp final : virtual public Operator,
-                                            PhysInterp1DScaled_Helper
-{
-public:
-    OPERATOR_CREATE(PhysInterp1DScaled_IterPerExp)
-
-    ~PhysInterp1DScaled_IterPerExp() final
-    {
-    }
 
-    void operator()(const Array<OneD, const NekDouble> &input,
-                    Array<OneD, NekDouble> &output,
-                    [[maybe_unused]] Array<OneD, NekDouble> &output1,
-                    [[maybe_unused]] Array<OneD, NekDouble> &output2,
-                    [[maybe_unused]] Array<OneD, NekDouble> &wsp) override
-    {
-        const int nCoeffs = m_stdExp->GetNcoeffs();
-        const int nPhys   = m_stdExp->GetTotPoints();
-        Array<OneD, NekDouble> tmp;
-
-        for (int i = 0; i < m_numElmt; ++i)
+        NekDouble scale; // declaration of the scaling factor to be used for the
+                         // output size
+        if (factors[StdRegions::eFactorConst] != 0)
         {
-            m_stdExp->BwdTrans(input + i * nCoeffs, tmp = output + i * nPhys);
+            m_factors = factors;
+            scale     = m_factors[StdRegions::eFactorConst];
         }
+        else
+        {
+            scale                               = 1.5;
+            m_factors[StdRegions::eFactorConst] = 1.5;
+        }
+        // Update the scaling factor inside MatrixFreeOps using the SetLamda
+        // routine
+        m_oper->SetScalingFactor(scale);
+        m_oper->SetUpInterp1D(basis, scale);
+        m_nOut = m_nElmtPad * m_outputSize / m_numElmt;
     }
+};
 
-    void operator()([[maybe_unused]] int dir,
-                    [[maybe_unused]] const Array<OneD, const NekDouble> &input,
-                    [[maybe_unused]] Array<OneD, NekDouble> &output,
-                    [[maybe_unused]] Array<OneD, NekDouble> &wsp) final
-    {
-        ASSERTL0(false, "Not valid for this operator.");
-    }
-
-    void UpdateFactors([[maybe_unused]] StdRegions::FactorMap factors) override
-    {
-        m_factors = factors;
-    }
-
-protected:
-    StdRegions::FactorMap m_factors; // Initially, PhysInterp1DScaled was taking
-                                     // as input a single scaling factor
-    // This single scaling factor will be expressed into a matrix where each
-    // element has the same value so that matrix - vector multiplication
-    // optimization by BLAS is implemented
-
-private:
-    PhysInterp1DScaled_IterPerExp(
-        vector<StdRegions::StdExpansionSharedPtr> pCollExp,
-        CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors)
-        : Operator(pCollExp, pGeomData, factors), PhysInterp1DScaled_Helper()
-    {
-    }
+/// Factory initialisation for the PhysInterp1DScaled_MatrixFree operators
+OperatorKey PhysInterp1DScaled_MatrixFree::m_typeArr[] = {
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(eSegment, ePhysInterp1DScaled, eMatrixFree, false),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_MatrixFree_Seg"),
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(eTriangle, ePhysInterp1DScaled, eMatrixFree, false),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_MatrixFree_Tri"),
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(eTriangle, ePhysInterp1DScaled, eMatrixFree, true),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_MatrixFree_NodalTri"),
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(eQuadrilateral, ePhysInterp1DScaled, eMatrixFree, false),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_MatrixFree_Quad"),
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(eTetrahedron, ePhysInterp1DScaled, eMatrixFree, false),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_MatrixFree_Tet"),
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(eTetrahedron, ePhysInterp1DScaled, eMatrixFree, true),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_MatrixFree_NodalTet"),
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(ePyramid, ePhysInterp1DScaled, eMatrixFree, false),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_MatrixFree_Pyr"),
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(ePrism, ePhysInterp1DScaled, eMatrixFree, false),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_MatrixFree_Prism"),
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(ePrism, ePhysInterp1DScaled, eMatrixFree, true),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_MatrixFree_NodalPrism"),
+    GetOperatorFactory().RegisterCreatorFunction(
+        OperatorKey(eHexahedron, ePhysInterp1DScaled, eMatrixFree, false),
+        PhysInterp1DScaled_MatrixFree::create,
+        "PhysInterp1DScaled_NoCollection_Hex"),
 };
 
 /**
@@ -340,17 +305,18 @@ public:
                 // same for each element inside a single collection
                 int pt0  = m_expList[0]->GetNumPoints(0);
                 int npt0 = (int)pt0 * scale;
+                // current points key - use first entry
+                LibUtilities::PointsKey PointsKey0(
+                    pt0, m_expList[0]->GetPointsType(0));
+                // get new points key
+                LibUtilities::PointsKey newPointsKey0(
+                    npt0, m_expList[0]->GetPointsType(0));
+                // Interpolate points;
+                I0 = LibUtilities::PointsManager()[PointsKey0]->GetI(
+                    newPointsKey0);
+
                 for (int i = 0; i < m_numElmt; ++i)
                 {
-                    // current points key
-                    LibUtilities::PointsKey PointsKey0(
-                        pt0, m_expList[i]->GetPointsType(0));
-                    // get new points key
-                    LibUtilities::PointsKey newPointsKey0(
-                        npt0, m_expList[i]->GetPointsType(0));
-                    // Interpolate points;
-                    I0 = LibUtilities::PointsManager()[PointsKey0]->GetI(
-                        newPointsKey0);
 
                     Blas::Dgemv('N', npt0, pt0, 1.0, I0->GetPtr().get(), npt0,
                                 &input[cnt], 1, 0.0, &output[cnt1], 1);
@@ -371,25 +337,25 @@ public:
                 // workspace declaration
                 Array<OneD, NekDouble> wsp(npt1 * pt0); // fnp0*tnp1
 
+                // current points key - using first entry
+                LibUtilities::PointsKey PointsKey0(
+                    pt0, m_expList[0]->GetPointsType(0));
+                LibUtilities::PointsKey PointsKey1(
+                    pt1, m_expList[0]->GetPointsType(1));
+                // get new points key
+                LibUtilities::PointsKey newPointsKey0(
+                    npt0, m_expList[0]->GetPointsType(0));
+                LibUtilities::PointsKey newPointsKey1(
+                    npt1, m_expList[0]->GetPointsType(1));
+
+                // Interpolate points;
+                I0 = LibUtilities::PointsManager()[PointsKey0]->GetI(
+                    newPointsKey0);
+                I1 = LibUtilities::PointsManager()[PointsKey1]->GetI(
+                    newPointsKey1);
+
                 for (int i = 0; i < m_numElmt; ++i)
                 {
-                    // current points key
-                    LibUtilities::PointsKey PointsKey0(
-                        pt0, m_expList[i]->GetPointsType(0));
-                    LibUtilities::PointsKey PointsKey1(
-                        pt1, m_expList[i]->GetPointsType(1));
-                    // get new points key
-                    LibUtilities::PointsKey newPointsKey0(
-                        npt0, m_expList[i]->GetPointsType(0));
-                    LibUtilities::PointsKey newPointsKey1(
-                        npt1, m_expList[i]->GetPointsType(1));
-
-                    // Interpolate points;
-                    I0 = LibUtilities::PointsManager()[PointsKey0]->GetI(
-                        newPointsKey0);
-                    I1 = LibUtilities::PointsManager()[PointsKey1]->GetI(
-                        newPointsKey1);
-
                     Blas::Dgemm('N', 'T', pt0, npt1, pt1, 1.0, &input[cnt], pt0,
                                 I1->GetPtr().get(), npt1, 0.0, wsp.get(), pt0);
 
@@ -415,30 +381,30 @@ public:
                 Array<OneD, NekDouble> wsp1(npt0 * npt1 * pt2);
                 Array<OneD, NekDouble> wsp2(npt0 * pt1 * pt2);
 
+                // current points key
+                LibUtilities::PointsKey PointsKey0(
+                    pt0, m_expList[0]->GetPointsType(0));
+                LibUtilities::PointsKey PointsKey1(
+                    pt1, m_expList[0]->GetPointsType(1));
+                LibUtilities::PointsKey PointsKey2(
+                    pt2, m_expList[0]->GetPointsType(2));
+                // get new points key
+                LibUtilities::PointsKey newPointsKey0(
+                    npt0, m_expList[0]->GetPointsType(0));
+                LibUtilities::PointsKey newPointsKey1(
+                    npt1, m_expList[0]->GetPointsType(1));
+                LibUtilities::PointsKey newPointsKey2(
+                    npt2, m_expList[0]->GetPointsType(2));
+
+                I0 = LibUtilities::PointsManager()[PointsKey0]->GetI(
+                    newPointsKey0);
+                I1 = LibUtilities::PointsManager()[PointsKey1]->GetI(
+                    newPointsKey1);
+                I2 = LibUtilities::PointsManager()[PointsKey2]->GetI(
+                    newPointsKey2);
+
                 for (int i = 0; i < m_numElmt; ++i)
                 {
-                    // current points key
-                    LibUtilities::PointsKey PointsKey0(
-                        pt0, m_expList[i]->GetPointsType(0));
-                    LibUtilities::PointsKey PointsKey1(
-                        pt1, m_expList[i]->GetPointsType(1));
-                    LibUtilities::PointsKey PointsKey2(
-                        pt2, m_expList[i]->GetPointsType(2));
-                    // get new points key
-                    LibUtilities::PointsKey newPointsKey0(
-                        npt0, m_expList[i]->GetPointsType(0));
-                    LibUtilities::PointsKey newPointsKey1(
-                        npt1, m_expList[i]->GetPointsType(1));
-                    LibUtilities::PointsKey newPointsKey2(
-                        npt2, m_expList[i]->GetPointsType(2));
-
-                    I0 = LibUtilities::PointsManager()[PointsKey0]->GetI(
-                        newPointsKey0);
-                    I1 = LibUtilities::PointsManager()[PointsKey1]->GetI(
-                        newPointsKey1);
-                    I2 = LibUtilities::PointsManager()[PointsKey2]->GetI(
-                        newPointsKey2);
-
                     // Interpolate points
                     Blas::Dgemm('N', 'N', npt0, pt1 * pt2, pt0, 1.0,
                                 I0->GetPtr().get(), npt0, &input[cnt], pt0, 0.0,
@@ -483,14 +449,6 @@ public:
         m_factors = factors;
     }
 
-protected:
-    vector<StdRegions::StdExpansionSharedPtr> m_expList;
-    StdRegions::FactorMap m_factors; // Initially, PhysInterp1DScaled was taking
-                                     // as input a single scaling factor
-    // This single scaling factor will be expressed into a matrix where each
-    // element has the same value so that matrix - vector multiplication
-    // optimization by BLAS is implemented
-
 private:
     PhysInterp1DScaled_NoCollection(
         vector<StdRegions::StdExpansionSharedPtr> pCollExp,
@@ -500,6 +458,9 @@ private:
         m_expList = pCollExp;
         m_factors = factors;
     }
+
+    StdRegions::FactorMap m_factors;
+    vector<StdRegions::StdExpansionSharedPtr> m_expList;
 };
 
 /// Factory initialisation for the PhysInterp1DScaled_NoCollection operators
diff --git a/library/Demos/MultiRegions/Tests/test.opt b/library/Demos/MultiRegions/Tests/test.opt
index 6d85804037..723fd47d9d 100644
--- a/library/Demos/MultiRegions/Tests/test.opt
+++ b/library/Demos/MultiRegions/Tests/test.opt
@@ -4,22 +4,50 @@
         <OPERATOR TYPE="BwdTrans">
             <ELEMENT TYPE="T" ORDER="*" IMPTYPE="NoCollection" />
             <ELEMENT TYPE="Q" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="H" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="R" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="P" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="A" ORDER="*" IMPTYPE="NoCollection" />
         </OPERATOR>
         <OPERATOR TYPE="Helmholtz">
             <ELEMENT TYPE="T" ORDER="*" IMPTYPE="NoCollection" />
             <ELEMENT TYPE="Q" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="H" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="R" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="P" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="A" ORDER="*" IMPTYPE="NoCollection" />
         </OPERATOR>
         <OPERATOR TYPE="IProductWRTBase">
             <ELEMENT TYPE="T" ORDER="*" IMPTYPE="NoCollection" />
             <ELEMENT TYPE="Q" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="H" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="R" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="P" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="A" ORDER="*" IMPTYPE="NoCollection" />
         </OPERATOR>
         <OPERATOR TYPE="IProductWRTDerivBase">
             <ELEMENT TYPE="T" ORDER="*" IMPTYPE="NoCollection" />
             <ELEMENT TYPE="Q" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="H" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="R" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="P" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="A" ORDER="*" IMPTYPE="NoCollection" />
         </OPERATOR>
         <OPERATOR TYPE="PhysDeriv">
             <ELEMENT TYPE="T" ORDER="*" IMPTYPE="NoCollection" />
             <ELEMENT TYPE="Q" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="H" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="R" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="P" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="A" ORDER="*" IMPTYPE="NoCollection" />
+        </OPERATOR>
+        <OPERATOR TYPE="PhysInterp1DScaled">
+            <ELEMENT TYPE="T" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="Q" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="H" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="R" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="P" ORDER="*" IMPTYPE="NoCollection" />
+            <ELEMENT TYPE="A" ORDER="*" IMPTYPE="NoCollection" />
         </OPERATOR>
     </COLLECTIONS>
 </NEKTAR>
diff --git a/library/MatrixFreeOps/BwdTrans.h b/library/MatrixFreeOps/BwdTrans.h
index e81cb44ae5..92c0d6e104 100644
--- a/library/MatrixFreeOps/BwdTrans.h
+++ b/library/MatrixFreeOps/BwdTrans.h
@@ -92,6 +92,17 @@ struct BwdTransTemplate
     void operator()(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output) final
     {
+        const int nm0 = this->m_nm[0];
+        const int nq0 = this->m_nq[0];
+#if defined(SHAPE_DIMENSION_2D)
+        const int nm1 = this->m_nm[1];
+        const int nq1 = this->m_nq[1];
+#elif defined(SHAPE_DIMENSION_3D)
+        const int nm1 = this->m_nm[1];
+        const int nm2 = this->m_nm[2];
+        const int nq1 = this->m_nq[1];
+        const int nq2 = this->m_nq[2];
+#endif
 #include "SwitchNodesPoints.h"
     }
 
@@ -111,8 +122,8 @@ struct BwdTransTemplate
     void operator1D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nq0 = m_basis[0]->GetNumPoints();
+        const auto nm0 = this->m_nm[0];
+        const auto nq0 = this->m_nq[0];
 
         const auto nqTot    = nq0;
         const auto nqBlocks = nqTot * vec_t::width;
@@ -226,11 +237,11 @@ struct BwdTransTemplate
     void operator2D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
 
         const auto nqTot    = nq0 * nq1;
         const auto nqBlocks = nqTot * vec_t::width;
@@ -356,13 +367,13 @@ struct BwdTransTemplate
     void operator3D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
-        const auto nm2 = m_basis[2]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
+        const auto nm2 = this->m_nm[2];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
-        const auto nq2 = m_basis[2]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
+        const auto nq2 = this->m_nq[2];
 
         const auto nqTot    = nq0 * nq1 * nq2;
         const auto nqBlocks = nqTot * vec_t::width;
diff --git a/library/MatrixFreeOps/CMakeLists.txt b/library/MatrixFreeOps/CMakeLists.txt
index 7dc170f643..267ece36c0 100644
--- a/library/MatrixFreeOps/CMakeLists.txt
+++ b/library/MatrixFreeOps/CMakeLists.txt
@@ -16,7 +16,7 @@ CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/SwitchLimits.h.in
 # so to reduce compile time timeout failures due to the large number
 # of template impementations.
 
-SET(OPERATORS BwdTrans PhysDeriv Helmholtz LinearAdvectionDiffusionReaction IProduct IProductWRTDerivBase)
+SET(OPERATORS BwdTrans PhysDeriv Helmholtz LinearAdvectionDiffusionReaction IProduct IProductWRTDerivBase PhysInterp1DScaled)
 SET(SHAPES Seg Tri Quad Hex Tet Prism Pyr )
 SET(DIMENSIONS 1 2 2 3 3 3 3 )
 
diff --git a/library/MatrixFreeOps/Helmholtz.h b/library/MatrixFreeOps/Helmholtz.h
index 6d1ec4bfa0..2926479e6b 100644
--- a/library/MatrixFreeOps/Helmholtz.h
+++ b/library/MatrixFreeOps/Helmholtz.h
@@ -83,8 +83,8 @@ struct HelmholtzTemplate
                 SHAPE_TYPE, this->m_nm[0], this->m_nm[1]);
 
 #if defined(SHAPE_TYPE_TRI)
-            const auto nq0 = m_basis[0]->GetNumPoints();
-            const auto nq1 = m_basis[1]->GetNumPoints();
+            const auto nq0 = this->m_nq[0];
+            const auto nq1 = this->m_nq[1];
 
             const Array<OneD, const NekDouble> &z0 = m_basis[0]->GetZ();
             const Array<OneD, const NekDouble> &z1 = m_basis[1]->GetZ();
@@ -99,9 +99,9 @@ struct HelmholtzTemplate
 
 #if defined(SHAPE_TYPE_TET) || defined(SHAPE_TYPE_PRISM) ||                    \
     defined(SHAPE_TYPE_PYR)
-            const auto nq0 = m_basis[0]->GetNumPoints();
-            const auto nq1 = m_basis[1]->GetNumPoints();
-            const auto nq2 = m_basis[2]->GetNumPoints();
+            const auto nq0 = this->m_nq[0];
+            const auto nq1 = this->m_nq[1];
+            const auto nq2 = this->m_nq[2];
 
             const Array<OneD, const NekDouble> &z0 = m_basis[0]->GetZ();
             const Array<OneD, const NekDouble> &z1 = m_basis[1]->GetZ();
@@ -123,6 +123,17 @@ struct HelmholtzTemplate
     void operator()(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output) final
     {
+        const int nm0 = this->m_nm[0];
+        const int nq0 = this->m_nq[0];
+#if defined(SHAPE_DIMENSION_2D)
+        const int nm1 = this->m_nm[1];
+        const int nq1 = this->m_nq[1];
+#elif defined(SHAPE_DIMENSION_3D)
+        const int nm1 = this->m_nm[1];
+        const int nm2 = this->m_nm[2];
+        const int nq1 = this->m_nq[1];
+        const int nq2 = this->m_nq[2];
+#endif
 #include "SwitchNodesPoints.h"
     }
 
@@ -159,11 +170,11 @@ struct HelmholtzTemplate
     void operator2D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
 
         constexpr auto ndf = 4;
 
@@ -194,7 +205,7 @@ struct HelmholtzTemplate
         const vec_t *jac_ptr = &((*this->m_jac)[0]);
         const vec_t *df_ptr  = &((*this->m_df)[0]);
 
-        const auto dfSize = ndf * nqTot;
+        [[maybe_unused]] const auto dfSize = ndf * nqTot;
 
         for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
@@ -367,13 +378,13 @@ struct HelmholtzTemplate
     void operator3D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
-        const auto nm2 = m_basis[2]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
+        const auto nm2 = this->m_nm[2];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
-        const auto nq2 = m_basis[2]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
+        const auto nq2 = this->m_nq[2];
 
         constexpr auto ndf  = 9;
         const auto nqTot    = nq0 * nq1 * nq2;
@@ -407,7 +418,7 @@ struct HelmholtzTemplate
         const vec_t *jac_ptr = &((*this->m_jac)[0]);
         const vec_t *df_ptr  = &((*this->m_df)[0]);
 
-        const auto dfSize = ndf * nqTot;
+        [[maybe_unused]] const auto dfSize = ndf * nqTot;
 
         for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
diff --git a/library/MatrixFreeOps/IProduct.h b/library/MatrixFreeOps/IProduct.h
index b22f8dba37..88d14dd56a 100644
--- a/library/MatrixFreeOps/IProduct.h
+++ b/library/MatrixFreeOps/IProduct.h
@@ -95,6 +95,17 @@ struct IProductTemplate
     void operator()(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output) final
     {
+        const int nm0 = this->m_nm[0];
+        const int nq0 = this->m_nq[0];
+#if defined(SHAPE_DIMENSION_2D)
+        const int nm1 = this->m_nm[1];
+        const int nq1 = this->m_nq[1];
+#elif defined(SHAPE_DIMENSION_3D)
+        const int nm1 = this->m_nm[1];
+        const int nm2 = this->m_nm[2];
+        const int nq1 = this->m_nq[1];
+        const int nq2 = this->m_nq[2];
+#endif
 #include "SwitchNodesPoints.h"
     }
 
@@ -114,8 +125,8 @@ struct IProductTemplate
     void operator1D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nq0 = m_basis[0]->GetNumPoints();
+        const auto nm0 = this->m_nm[0];
+        const auto nq0 = this->m_nq[0];
 
         const auto nqTot    = nq0;
         const auto nqBlocks = nqTot * vec_t::width;
@@ -247,11 +258,11 @@ struct IProductTemplate
     void operator2D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
 
         const auto nqTot    = nq0 * nq1;
         const auto nqBlocks = nqTot * vec_t::width;
@@ -403,13 +414,13 @@ struct IProductTemplate
     void operator3D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
-        const auto nm2 = m_basis[2]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
+        const auto nm2 = this->m_nm[2];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
-        const auto nq2 = m_basis[2]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
+        const auto nq2 = this->m_nq[2];
 
         const auto nqTot    = nq0 * nq1 * nq2;
         const auto nqBlocks = nqTot * vec_t::width;
diff --git a/library/MatrixFreeOps/IProductWRTDerivBase.h b/library/MatrixFreeOps/IProductWRTDerivBase.h
index 9d9aebad5c..93331c16c6 100644
--- a/library/MatrixFreeOps/IProductWRTDerivBase.h
+++ b/library/MatrixFreeOps/IProductWRTDerivBase.h
@@ -97,6 +97,17 @@ struct IProductWRTDerivBaseTemplate
     void operator()(const Array<OneD, Array<OneD, NekDouble>> &input,
                     Array<OneD, NekDouble> &output) final
     {
+        const int nm0 = this->m_nm[0];
+        const int nq0 = this->m_nq[0];
+#if defined(SHAPE_DIMENSION_2D)
+        const int nm1 = this->m_nm[1];
+        const int nq1 = this->m_nq[1];
+#elif defined(SHAPE_DIMENSION_3D)
+        const int nm1 = this->m_nm[1];
+        const int nm2 = this->m_nm[2];
+        const int nq1 = this->m_nq[1];
+        const int nq2 = this->m_nq[2];
+#endif
 #include "SwitchNodesPoints.h"
     }
 
@@ -121,8 +132,8 @@ struct IProductWRTDerivBaseTemplate
         ASSERTL1(input.size() <= 3, "IProductWRTDerivBaseTemplate::Operator1D: "
                                     "Operator not set up for other dimensions.")
 
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nq0 = m_basis[0]->GetNumPoints();
+        const auto nm0 = this->m_nm[0];
+        const auto nq0 = this->m_nq[0];
 
         const auto nqTot    = nq0;
         const auto nqBlocks = nqTot * vec_t::width;
@@ -340,11 +351,11 @@ struct IProductWRTDerivBaseTemplate
         ASSERTL1(input.size() <= 3, "IProductWRTDerivBaseTemplate::Operator2D: "
                                     "Operator not set up for 3D coordinates.");
 
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
 
         const auto nqTot    = nq0 * nq1;
         const auto nqBlocks = nqTot * vec_t::width;
@@ -587,13 +598,13 @@ struct IProductWRTDerivBaseTemplate
                  "IProductWRTDerivBaseTemplate::Operator3D: Cannot call 3D "
                  "routine with 1 or 2 outputs.");
 
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
-        const auto nm2 = m_basis[2]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
+        const auto nm2 = this->m_nm[2];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
-        const auto nq2 = m_basis[2]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
+        const auto nq2 = this->m_nq[2];
 
         constexpr auto ndf  = 9u;
         const auto nqTot    = nq0 * nq1 * nq2;
diff --git a/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h b/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h
index f8cbbc019c..6833f5bf5b 100644
--- a/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h
+++ b/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h
@@ -84,8 +84,8 @@ struct LinearAdvectionDiffusionReactionTemplate
                 SHAPE_TYPE, this->m_nm[0], this->m_nm[1]);
 
 #if defined(SHAPE_TYPE_TRI)
-            const auto nq0 = m_basis[0]->GetNumPoints();
-            const auto nq1 = m_basis[1]->GetNumPoints();
+            const auto nq0 = this->m_nq[0];
+            const auto nq1 = this->m_nq[1];
 
             const Array<OneD, const NekDouble> &z0 = m_basis[0]->GetZ();
             const Array<OneD, const NekDouble> &z1 = m_basis[1]->GetZ();
@@ -101,9 +101,9 @@ struct LinearAdvectionDiffusionReactionTemplate
 
 #if defined(SHAPE_TYPE_TET) || defined(SHAPE_TYPE_PRISM) ||                    \
     defined(SHAPE_TYPE_PYR)
-            const auto nq0 = m_basis[0]->GetNumPoints();
-            const auto nq1 = m_basis[1]->GetNumPoints();
-            const auto nq2 = m_basis[2]->GetNumPoints();
+            const auto nq0 = this->m_nq[0];
+            const auto nq1 = this->m_nq[1];
+            const auto nq2 = this->m_nq[2];
 
             const Array<OneD, const NekDouble> &z0 = m_basis[0]->GetZ();
             const Array<OneD, const NekDouble> &z1 = m_basis[1]->GetZ();
@@ -126,6 +126,17 @@ struct LinearAdvectionDiffusionReactionTemplate
     void operator()(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output) final
     {
+        const int nm0 = this->m_nm[0];
+        const int nq0 = this->m_nq[0];
+#if defined(SHAPE_DIMENSION_2D)
+        const int nm1 = this->m_nm[1];
+        const int nq1 = this->m_nq[1];
+#elif defined(SHAPE_DIMENSION_3D)
+        const int nm1 = this->m_nm[1];
+        const int nm2 = this->m_nm[2];
+        const int nq1 = this->m_nq[1];
+        const int nq2 = this->m_nq[2];
+#endif
 #include "SwitchNodesPoints.h"
     }
 
@@ -164,11 +175,11 @@ struct LinearAdvectionDiffusionReactionTemplate
     void operator2D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
 
         constexpr auto ndf  = 4;
         constexpr auto nvel = 2;
@@ -202,8 +213,8 @@ struct LinearAdvectionDiffusionReactionTemplate
         const vec_t *advVel_ptr = &((*this->m_advVel)[0]);
 
         // Get size of derivative factor block
-        const auto dfSize     = ndf * nqTot;
-        const auto advVelSize = nvel * nqTot;
+        [[maybe_unused]] const auto dfSize = ndf * nqTot;
+        const auto advVelSize              = nvel * nqTot;
 
         for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
@@ -368,10 +379,10 @@ struct LinearAdvectionDiffusionReactionTemplate
                                                 deriv1, bwd);
 
         // Step 4: Inner product for mass + advection matrix operation
-        // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const)
-        // Output: wsp0 = temporary
-        //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x
-        //         uPhys + uPhys) phi
+        // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys
+        // (const) Output: wsp0 = temporary
+        //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot
+        //         \nabla_x uPhys + uPhys) phi
         IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>(
             nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0],
             this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, tmpOut,
@@ -413,13 +424,13 @@ struct LinearAdvectionDiffusionReactionTemplate
     void operator3D(const Array<OneD, const NekDouble> &input,
                     Array<OneD, NekDouble> &output)
     {
-        const auto nm0 = m_basis[0]->GetNumModes();
-        const auto nm1 = m_basis[1]->GetNumModes();
-        const auto nm2 = m_basis[2]->GetNumModes();
+        const auto nm0 = this->m_nm[0];
+        const auto nm1 = this->m_nm[1];
+        const auto nm2 = this->m_nm[2];
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
-        const auto nq2 = m_basis[2]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
+        const auto nq2 = this->m_nq[2];
 
         constexpr auto ndf  = 9;
         constexpr auto nvel = 3;
@@ -456,8 +467,8 @@ struct LinearAdvectionDiffusionReactionTemplate
         const vec_t *advVel_ptr = &((*this->m_advVel)[0]);
 
         // Get size of derivative factor block
-        const auto dfSize     = ndf * nqTot;
-        const auto advVelSize = nvel * nqTot;
+        [[maybe_unused]] const auto dfSize = ndf * nqTot;
+        const auto advVelSize              = nvel * nqTot;
 
         for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
diff --git a/library/MatrixFreeOps/Operator.hpp b/library/MatrixFreeOps/Operator.hpp
index 22577425a0..83b552fcfc 100644
--- a/library/MatrixFreeOps/Operator.hpp
+++ b/library/MatrixFreeOps/Operator.hpp
@@ -43,6 +43,7 @@
 #include <LibUtilities/BasicUtils/NekFactory.hpp>
 #include <LibUtilities/BasicUtils/NekInline.hpp>
 #include <LibUtilities/Foundations/Basis.h>
+#include <LibUtilities/Foundations/ManagerAccess.h>
 #include <LibUtilities/SimdLib/tinysimd.hpp>
 
 namespace Nektar::MatrixFree
@@ -82,6 +83,18 @@ public:
         return false;
     }
 
+    MATRIXFREE_EXPORT virtual void SetUpBdata(
+        [[maybe_unused]] std::vector<LibUtilities::BasisSharedPtr> &basis) = 0;
+    MATRIXFREE_EXPORT virtual void SetUpDBdata(
+        [[maybe_unused]] std::vector<LibUtilities::BasisSharedPtr> &basis) = 0;
+    MATRIXFREE_EXPORT virtual void SetUpInterp1D(
+        [[maybe_unused]] std::vector<LibUtilities::BasisSharedPtr> &basis,
+        [[maybe_unused]] NekDouble factor) = 0;
+    MATRIXFREE_EXPORT virtual void SetUpZW(
+        [[maybe_unused]] std::vector<LibUtilities::BasisSharedPtr> &basis) = 0;
+    MATRIXFREE_EXPORT virtual void SetUpD(
+        [[maybe_unused]] std::vector<LibUtilities::BasisSharedPtr> &basis) = 0;
+
     MATRIXFREE_EXPORT virtual void SetDF(
         const std::shared_ptr<std::vector<vec_t, tinysimd::allocator<vec_t>>>
             &df) = 0;
@@ -119,6 +132,12 @@ public:
                        int nElmt)
         : m_basis(basis), m_nElmt(nElmt)
     {
+        // Since the class is constructed before the
+        // PhysInterp1DScaled_MatrixFree class inside collections, we are
+        // initializing the m_scalingFactor member with the default value of 1.5
+        // and upon construction of the aforemmentioned class, it will be
+        // updated to the correct value.
+        m_scalingFactor = 1.5;
     }
 
     ~PhysInterp1DScaled() override = default;
@@ -127,9 +146,19 @@ public:
         const Array<OneD, const NekDouble> &input,
         Array<OneD, NekDouble> &output) = 0; // Abstract Method
 
+    // Function to initialize the scaling factor that is used to increase the
+    // number of quadrature points in each direction of the flow
+    NEK_FORCE_INLINE void SetScalingFactor(NekDouble scalingFactor)
+    {
+        m_scalingFactor = scalingFactor;
+    }
+
 protected:
     std::vector<LibUtilities::BasisSharedPtr> m_basis;
     int m_nElmt;
+
+    // Scaling factor to enhance the polynomial space
+    NekDouble m_scalingFactor;
 };
 
 // Base class for product operator.
@@ -484,31 +513,82 @@ protected:
             m_nBlocks = nElmt / vec_t::width + 1;
             m_nPads   = vec_t::width - (nElmt % vec_t::width);
         }
-
-        // Depending on element dimension, set up basis information, quadrature,
-        // etc, inside vectorised environment.
         for (int i = 0; i < DIM; ++i)
         {
-            const Array<OneD, const NekDouble> bdata  = basis[i]->GetBdata();
-            const Array<OneD, const NekDouble> dbdata = basis[i]->GetDbdata();
-            const Array<OneD, const NekDouble> w      = basis[i]->GetW();
-
             m_nm[i] = basis[i]->GetNumModes();
             m_nq[i] = basis[i]->GetNumPoints();
+        }
+    }
+
+    // Depending on element dimension, set up basis information,
+    // inside vectorised environment.
+    void SetUpBdata(std::vector<LibUtilities::BasisSharedPtr> &basis) final
+    {
+        for (int i = 0; i < DIM; ++i)
+        {
+            const Array<OneD, const NekDouble> bdata = basis[i]->GetBdata();
 
             m_bdata[i].resize(bdata.size());
             for (auto j = 0; j < bdata.size(); ++j)
             {
                 m_bdata[i][j] = bdata[j];
             }
+        }
+    }
+
+    // Depending on element dimension, set up derivative of basis
+    // information, inside vectorised environment.
+    void SetUpDBdata(std::vector<LibUtilities::BasisSharedPtr> &basis) final
+    {
+        for (int i = 0; i < DIM; ++i)
+        {
+            const Array<OneD, const NekDouble> dbdata = basis[i]->GetDbdata();
 
             m_dbdata[i].resize(dbdata.size());
             for (auto j = 0; j < dbdata.size(); ++j)
             {
                 m_dbdata[i][j] = dbdata[j];
             }
+        }
+    }
 
-            NekDouble fac = 1.0;
+    // Depending on element dimension, set up 1D interpolation matrix in
+    // basis data inside vectorised environment.
+    void SetUpInterp1D(std::vector<LibUtilities::BasisSharedPtr> &basis,
+                       NekDouble factor) final
+    {
+        // Is this updated at each time-step?
+        // Depending on element dimension, set up interpolation matrices,
+        // inside vectorised environment.
+        for (int i = 0; i < DIM; ++i)
+        {
+            m_enhancednq[i] = (int)basis[i]->GetNumPoints() * factor;
+
+            LibUtilities::PointsKey PointsKeyIn(m_nq[i],
+                                                basis[i]->GetPointsType());
+            LibUtilities::PointsKey PointsKeyOut((int)m_nq[i] * factor,
+                                                 basis[i]->GetPointsType());
+            auto I = LibUtilities::PointsManager()[PointsKeyIn]
+                         ->GetI(PointsKeyOut)
+                         ->GetPtr();
+
+            m_I[i].resize(I.size());
+            for (int j = 0; j < I.size(); ++j)
+            {
+                m_I[i][j] = I[j];
+            }
+        }
+    }
+
+    void SetUpZW(std::vector<LibUtilities::BasisSharedPtr> &basis) final
+    {
+
+        // Depending on element dimension, set up quadrature,
+        // inside vectorised environment.
+        for (int i = 0; i < DIM; ++i)
+        {
+            const Array<OneD, const NekDouble> w = basis[i]->GetW();
+            NekDouble fac                        = 1.0;
             if (basis[i]->GetPointsType() ==
                 LibUtilities::eGaussRadauMAlpha1Beta0)
             {
@@ -526,13 +606,6 @@ protected:
                 m_w[i][j] = fac * w[j];
             }
 
-            auto D = basis[i]->GetD()->GetPtr();
-            m_D[i].resize(D.size());
-            for (int j = 0; j < D.size(); ++j)
-            {
-                m_D[i][j] = D[j];
-            }
-
             auto Z = basis[i]->GetZ();
             m_Z[i].resize(Z.size());
             for (int j = 0; j < Z.size(); ++j)
@@ -542,6 +615,22 @@ protected:
         }
     }
 
+    void SetUpD(std::vector<LibUtilities::BasisSharedPtr> &basis) final
+    {
+
+        // Depending on element dimension, set up quadrature,
+        // inside vectorised environment.
+        for (int i = 0; i < DIM; ++i)
+        {
+            auto D = basis[i]->GetD()->GetPtr();
+            m_D[i].resize(D.size());
+            for (int j = 0; j < D.size(); ++j)
+            {
+                m_D[i][j] = D[j];
+            }
+        }
+    }
+
     void SetDF(
         const std::shared_ptr<std::vector<vec_t, tinysimd::allocator<vec_t>>>
             &df) final
@@ -558,7 +647,7 @@ protected:
 
     int m_nBlocks;
     int m_nPads;
-    std::array<int, DIM> m_nm, m_nq;
+    std::array<int, DIM> m_nm, m_nq, m_enhancednq;
     std::array<std::vector<vec_t, tinysimd::allocator<vec_t>>, DIM> m_bdata;
     std::array<std::vector<vec_t, tinysimd::allocator<vec_t>>, DIM> m_dbdata;
     std::array<std::vector<vec_t, tinysimd::allocator<vec_t>>, DIM>
@@ -567,12 +656,13 @@ protected:
         m_Z; // Zeroes
     std::array<std::vector<vec_t, tinysimd::allocator<vec_t>>, DIM>
         m_w; // Weights
+    std::array<std::vector<vec_t, tinysimd::allocator<vec_t>>, DIM>
+        m_I; // Interpolation Matrices
     std::shared_ptr<std::vector<vec_t, tinysimd::allocator<vec_t>>>
         m_df; // Chain rule function deriviatives for each element (00, 10,
               // 20, 30...)
     std::shared_ptr<std::vector<vec_t, tinysimd::allocator<vec_t>>> m_jac;
 };
-
 } // namespace Nektar::MatrixFree
 
 #endif
diff --git a/library/MatrixFreeOps/PhysDeriv.h b/library/MatrixFreeOps/PhysDeriv.h
index 49b0d16570..b2bd73f278 100644
--- a/library/MatrixFreeOps/PhysDeriv.h
+++ b/library/MatrixFreeOps/PhysDeriv.h
@@ -66,23 +66,6 @@ struct PhysDerivTemplate
           Helper<LibUtilities::ShapeTypeDimMap[SHAPE_TYPE], DEFORMED>(basis,
                                                                       nElmt)
     {
-        constexpr auto DIM = LibUtilities::ShapeTypeDimMap[SHAPE_TYPE];
-
-        if (DIM == 1)
-        {
-            m_nmTot = LibUtilities::GetNumberOfCoefficients(SHAPE_TYPE,
-                                                            this->m_nm[0]);
-        }
-        else if (DIM == 2)
-        {
-            m_nmTot = LibUtilities::GetNumberOfCoefficients(
-                SHAPE_TYPE, this->m_nm[0], this->m_nm[1]);
-        }
-        else if (DIM == 3)
-        {
-            m_nmTot = LibUtilities::GetNumberOfCoefficients(
-                SHAPE_TYPE, this->m_nm[0], this->m_nm[1], this->m_nm[2]);
-        }
     }
 
     static std::shared_ptr<Operator> Create(
@@ -95,6 +78,13 @@ struct PhysDerivTemplate
     void operator()(const Array<OneD, const NekDouble> &input,
                     Array<OneD, Array<OneD, NekDouble>> &output) final
     {
+        const auto nq0 = this->m_nq[0];
+#if defined(SHAPE_DIMENSION_2D)
+        const auto nq1 = this->m_nq[1];
+#elif defined(SHAPE_DIMENSION_3D)
+        const auto nq1 = this->m_nq[1];
+        const auto nq2 = this->m_nq[2];
+#endif
 #include "SwitchPoints.h"
     }
 
@@ -119,7 +109,7 @@ struct PhysDerivTemplate
         ASSERTL1(output.size() <= 3, "PhysDerivTemplate::Operator1D: Operator "
                                      "not set up for other dimensions.")
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
 
         const auto nqTot    = nq0;
         const auto nqBlocks = nqTot * vec_t::width;
@@ -306,8 +296,8 @@ struct PhysDerivTemplate
         ASSERTL1(output.size() <= 3, "PhysDerivTemplate::Operator2D: Operator "
                                      "not set up for 3D coordinates.");
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
 
         const auto nqTot    = nq0 * nq1;
         const auto nqBlocks = nqTot * vec_t::width;
@@ -495,9 +485,9 @@ struct PhysDerivTemplate
         ASSERTL1(output.size() == 3, "PhysDerivTemplate::Operator3D: Cannot "
                                      "call 3D routine with 1 or 2 outputs.");
 
-        const auto nq0 = m_basis[0]->GetNumPoints();
-        const auto nq1 = m_basis[1]->GetNumPoints();
-        const auto nq2 = m_basis[2]->GetNumPoints();
+        const auto nq0 = this->m_nq[0];
+        const auto nq1 = this->m_nq[1];
+        const auto nq2 = this->m_nq[2];
 
         const auto nqTot    = nq0 * nq1 * nq2;
         const auto nqBlocks = nqTot * vec_t::width;
@@ -677,7 +667,6 @@ struct PhysDerivTemplate
 #endif
 
 private:
-    int m_nmTot;
 };
 
 } // namespace Nektar::MatrixFree
diff --git a/library/MatrixFreeOps/PhysInterp1DScaled.h b/library/MatrixFreeOps/PhysInterp1DScaled.h
new file mode 100644
index 0000000000..952e98a3b8
--- /dev/null
+++ b/library/MatrixFreeOps/PhysInterp1DScaled.h
@@ -0,0 +1,501 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+// File: PhysInterp1DScaled.h
+//
+// For more information, please see: http://www.nektar.info
+//
+// The MIT License
+//
+// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
+// Department of Aeronautics, Imperial College London (UK), and Scientific
+// Computing and Imaging Institute, University of Utah (USA).
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+//
+// Description:
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_H
+#define NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_H
+
+#include <LibUtilities/BasicUtils/ShapeType.hpp>
+#include <LibUtilities/BasicUtils/SharedArray.hpp>
+#include <LibUtilities/Foundations/Basis.h>
+
+#include "Operator.hpp"
+#include "PhysInterp1DScaledKernels.hpp"
+
+// As each opertor has seven shapes over three dimension to get to the
+// "work" each operator uses a series of preprocessor directives based
+// on the dimension and shape type so to limit the code
+// inclusion. This keeps the library size as minimal as possible while
+// removing runtime conditionals.
+
+// The preprocessor directives, SHAPE_DIMENSION_?D and SHAPE_TYPE_*
+// are constructed by CMake in the CMakeLists.txt file which uses an
+// implementation file.  See the CMakeLists.txt files for more
+// details.
+namespace Nektar::MatrixFree
+{
+
+template <LibUtilities::ShapeType SHAPE_TYPE, bool DEFORMED = false>
+struct PhysInterp1DScaledTemplate
+    : public PhysInterp1DScaled,
+      public Helper<LibUtilities::ShapeTypeDimMap[SHAPE_TYPE]>
+{
+    PhysInterp1DScaledTemplate(std::vector<LibUtilities::BasisSharedPtr> basis,
+                               int nElmt)
+        : PhysInterp1DScaled(basis, nElmt),
+          Helper<LibUtilities::ShapeTypeDimMap[SHAPE_TYPE]>(basis, nElmt)
+    {
+    }
+
+    static std::shared_ptr<Operator> Create(
+        std::vector<LibUtilities::BasisSharedPtr> basis, int nElmt)
+    {
+        return std::make_shared<PhysInterp1DScaledTemplate<SHAPE_TYPE>>(basis,
+                                                                        nElmt);
+    }
+
+    void operator()(const Array<OneD, const NekDouble> &input,
+                    Array<OneD, NekDouble> &output) final
+    {
+        // Following the same variable naming as in SwitchNodesPoints.h
+        // Both input and output number of points of the kernels are in the
+        // physical space
+        // nm0 is replaced by nq_in0
+        // nq0 is replaced by nq_out0
+        const auto nm0 = this->m_nq[0];
+        const auto nq0 = this->m_enhancednq[0];
+#if defined(SHAPE_DIMENSION_2D)
+        // nm1 is replaced by nq_in1
+        // nq1 is replaced by nq_out1
+        // Following the same variable naming as in SwitchNodesPoints.h
+        const auto nm1 = this->m_nq[1];
+        const auto nq1 = this->m_enhancednq[1];
+#elif defined(SHAPE_DIMENSION_3D)
+        // nm2 is replaced by nq_in2
+        // nq2 is replaced by nq_out2
+        // Following the same variable naming as in SwitchNodesPoints.h
+        const auto nm1 = this->m_nq[1];
+        const auto nm2 = this->m_nq[2];
+        const auto nq1 = this->m_enhancednq[1];
+        const auto nq2 = this->m_enhancednq[2];
+#endif
+#include "SwitchNodesPoints.h"
+    }
+
+    // There must be separate 1D, 2D, and 3D operators because the
+    // helper base class is based on the shape type dim. Thus indices
+    // for the basis must be respected.
+
+    // Further there are duplicate 1D, 2D, and 3D operators so to
+    // allow for compile time array sizes to be part of the template
+    // and thus gain loop unrolling. The only difference is the
+    // location of the size value, in the template or the function:
+    // foo<int bar>() {} vs foo() { int bar = ... }
+
+#if defined(SHAPE_DIMENSION_1D)
+
+    // Non-size based operator.
+    void operator1D(const Array<OneD, const NekDouble> &input,
+                    Array<OneD, NekDouble> &output)
+    {
+        const auto nq_in0  = this->m_nq[0];
+        const auto nq_out0 = this->m_enhancednq[0];
+
+        const auto nb_out = nq_out0 * vec_t::width;
+        const auto nb_in  = nq_in0 * vec_t::width;
+
+        auto *inptr  = &input[0];
+        auto *outptr = &output[0];
+
+        // Workspace for kernels - also checks preconditions
+        PhysInterp1DScaled1DWorkspace<SHAPE_TYPE>(nq_in0, nq_out0);
+
+        std::vector<vec_t, allocator<vec_t>> tmpIn(nq_in0), tmpOut(nq_out0);
+
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nb_out * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nb_in, locField);
+            // load_interleave(locField, nq_in0, tmpIn);
+            load_unalign_interleave(inptr, nq_in0, tmpIn);
+
+            PhysInterp1DScaled1DKernel<SHAPE_TYPE>(nq_in0, nq_out0, tmpIn,
+                                                   this->m_I[0], tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, nq_out0, locField);
+            // std::copy(locField, locField + nb_out, outptr);
+            deinterleave_unalign_store(tmpOut, nq_out0, outptr);
+
+            inptr += nb_in;
+            outptr += nb_out;
+        }
+        // last block
+        {
+            int acturalSize = nb_in - this->m_nPads * nq_in0;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, nq_in0, tmpIn);
+
+            PhysInterp1DScaled1DKernel<SHAPE_TYPE>(nq_in0, nq_out0, tmpIn,
+                                                   this->m_I[0], tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nq_out0, locField);
+            acturalSize = nb_out - this->m_nPads * nq_out0;
+            std::copy(locField, locField + acturalSize, outptr);
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
+    }
+
+    // Size based template version.
+    template <int nq_in0, int nq_out0>
+    void operator1D(const Array<OneD, const NekDouble> &input,
+                    Array<OneD, NekDouble> &output)
+    {
+        constexpr auto nOutTot = nq_out0;
+        constexpr auto nInTot  = nq_in0;
+        constexpr auto nb_out  = nOutTot * vec_t::width;
+        const auto nb_in       = nInTot * vec_t::width;
+
+        auto *inptr  = &input[0];
+        auto *outptr = &output[0];
+
+        // Workspace for kernels - also checks preconditions
+        PhysInterp1DScaled1DWorkspace<SHAPE_TYPE>(nq_in0, nq_out0);
+
+        std::vector<vec_t, allocator<vec_t>> tmpIn(nInTot), tmpOut(nOutTot);
+
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nb_out];
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nb_in, locField);
+            // load_interleave(locField, m_nInTot, tmpIn);
+            load_unalign_interleave(inptr, nInTot, tmpIn);
+
+            PhysInterp1DScaled1DKernel<SHAPE_TYPE>(nq_in0, nq_out0, tmpIn,
+                                                   this->m_I[0], tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, nOutTot, locField);
+            // std::copy(locField, locField + nb_out, outptr);
+            deinterleave_unalign_store(tmpOut, nOutTot, outptr);
+
+            inptr += nb_in;
+            outptr += nb_out;
+        }
+        // last block
+        {
+            int acturalSize = nb_in - this->m_nPads * nInTot;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, nInTot, tmpIn);
+
+            PhysInterp1DScaled1DKernel<SHAPE_TYPE>(nq_in0, nq_out0, tmpIn,
+                                                   this->m_I[0], tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nOutTot, locField);
+            acturalSize = nb_out - this->m_nPads * nOutTot;
+            std::copy(locField, locField + acturalSize, outptr);
+        }
+    }
+
+#elif defined(SHAPE_DIMENSION_2D)
+
+    // Non-size based operator.
+    void operator2D(const Array<OneD, const NekDouble> &input,
+                    Array<OneD, NekDouble> &output)
+    {
+        const auto nq_in0 = this->m_nq[0];
+        const auto nq_in1 = this->m_nq[1];
+
+        const auto nq_out0 = this->m_enhancednq[0];
+        const auto nq_out1 = this->m_enhancednq[1];
+
+        const auto nInTot  = nq_in0 * nq_in1;
+        const auto nOutTot = nq_out0 * nq_out1;
+        const auto nb_out  = nOutTot * vec_t::width;
+        const auto nb_in   = nInTot * vec_t::width;
+
+        auto *inptr  = &input[0];
+        auto *outptr = &output[0];
+
+        // Workspace for kernels - also checks preconditions
+        size_t wsp0Size = 0;
+        PhysInterp1DScaled2DWorkspace<SHAPE_TYPE>(nq_in0, nq_in1, nq_out0,
+                                                  nq_out1, wsp0Size);
+
+        std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(nInTot),
+            tmpOut(nOutTot);
+
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nb_out * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nb_in, locField);
+            // load_interleave(locField, m_nInTot, tmpIn);
+            load_unalign_interleave(inptr, nInTot, tmpIn);
+
+            PhysInterp1DScaled2DKernel<SHAPE_TYPE>(nq_in0, nq_in1, nq_out0,
+                                                   nq_out1, tmpIn, this->m_I[0],
+                                                   this->m_I[1], wsp0, tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, nOutTot, locField);
+            // std::copy(locField, locField + nb_out, outptr);
+            deinterleave_unalign_store(tmpOut, nOutTot, outptr);
+
+            inptr += nb_in;
+            outptr += nb_out;
+        }
+        // last block
+        {
+            int acturalSize = nb_in - this->m_nPads * nInTot;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, nInTot, tmpIn);
+
+            PhysInterp1DScaled2DKernel<SHAPE_TYPE>(nq_in0, nq_in1, nq_out0,
+                                                   nq_out1, tmpIn, this->m_I[0],
+                                                   this->m_I[1], wsp0, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nOutTot, locField);
+            acturalSize = nb_out - this->m_nPads * nOutTot;
+            std::copy(locField, locField + acturalSize, outptr);
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
+    }
+
+    // Size based template version.
+    template <int nq_in0, int nq_in1, int nq_out0, int nq_out1>
+    void operator2D(const Array<OneD, const NekDouble> &input,
+                    Array<OneD, NekDouble> &output)
+    {
+        constexpr auto nInTot  = nq_in0 * nq_in1;
+        constexpr auto nOutTot = nq_out0 * nq_out1;
+        constexpr auto nb_out  = nOutTot * vec_t::width;
+        const auto nb_in       = nInTot * vec_t::width;
+
+        auto *inptr  = &input[0];
+        auto *outptr = &output[0];
+
+        // Workspace for kernels - also checks preconditions
+        size_t wsp0Size = 0;
+        PhysInterp1DScaled2DWorkspace<SHAPE_TYPE>(nq_in0, nq_in1, nq_out0,
+                                                  nq_out1, wsp0Size);
+
+        std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(nInTot),
+            tmpOut(nOutTot);
+
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nb_out];
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nb_in, locField);
+            // load_interleave(locField, m_nInTot, tmpIn);
+            load_unalign_interleave(inptr, nInTot, tmpIn);
+
+            PhysInterp1DScaled2DKernel<SHAPE_TYPE>(nq_in0, nq_in1, nq_out0,
+                                                   nq_out1, tmpIn, this->m_I[0],
+                                                   this->m_I[1], wsp0, tmpOut);
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, nOutTot, locField);
+            // std::copy(locField, locField + nb_out, outptr);
+            deinterleave_unalign_store(tmpOut, nOutTot, outptr);
+
+            inptr += nb_in;
+            outptr += nb_out;
+        }
+        // last block
+        {
+            int acturalSize = nb_in - this->m_nPads * nInTot;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, nInTot, tmpIn);
+
+            PhysInterp1DScaled2DKernel<SHAPE_TYPE>(nq_in0, nq_in1, nq_out0,
+                                                   nq_out1, tmpIn, this->m_I[0],
+                                                   this->m_I[1], wsp0, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nOutTot, locField);
+            acturalSize = nb_out - this->m_nPads * nOutTot;
+            std::copy(locField, locField + acturalSize, outptr);
+        }
+    }
+
+#elif defined(SHAPE_DIMENSION_3D)
+
+    // Non-size based operator.
+    void operator3D(const Array<OneD, const NekDouble> &input,
+                    Array<OneD, NekDouble> &output)
+    {
+        const auto nq_in0 = this->m_nq[0];
+        const auto nq_in1 = this->m_nq[1];
+        const auto nq_in2 = this->m_nq[2];
+
+        const auto nq_out0 = this->m_enhancednq[0];
+        const auto nq_out1 = this->m_enhancednq[1];
+        const auto nq_out2 = this->m_enhancednq[2];
+
+        const auto nInTot  = nq_in0 * nq_in1 * nq_in2;
+        const auto nOutTot = nq_out0 * nq_out1 * nq_out2;
+        const auto nb_out  = nOutTot * vec_t::width;
+        const auto nb_in   = nInTot * vec_t::width;
+
+        auto *inptr  = &input[0];
+        auto *outptr = &output[0];
+
+        // Workspace for kernels - also checks preconditions
+        size_t wsp0Size = 0, wsp1Size = 0;
+        PhysInterp1DScaled3DWorkspace<SHAPE_TYPE>(nq_in0, nq_in1, nq_in2,
+                                                  nq_out0, nq_out1, nq_out2,
+                                                  wsp0Size, wsp1Size);
+
+        std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
+            tmpIn(nInTot), tmpOut(nOutTot);
+
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nb_out * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nb_in, locField);
+            // load_interleave(locField, m_nInTot, tmpIn);
+            load_unalign_interleave(inptr, nInTot, tmpIn);
+
+            PhysInterp1DScaled3DKernel<SHAPE_TYPE>(
+                nq_in0, nq_in1, nq_in2, nq_out0, nq_out1, nq_out2, tmpIn,
+                this->m_I[0], this->m_I[1], this->m_I[2], wsp0, wsp1, tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, nOutTot, locField);
+            // std::copy(locField, locField + nb_out, outptr);
+            deinterleave_unalign_store(tmpOut, nOutTot, outptr);
+
+            inptr += nb_in;
+            outptr += nb_out;
+        }
+        // last block
+        {
+            int acturalSize = nb_in - this->m_nPads * nInTot;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, nInTot, tmpIn);
+
+            PhysInterp1DScaled3DKernel<SHAPE_TYPE>(
+                nq_in0, nq_in1, nq_in2, nq_out0, nq_out1, nq_out2, tmpIn,
+                this->m_I[0], this->m_I[1], this->m_I[2], wsp0, wsp1, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nOutTot, locField);
+            acturalSize = nb_out - this->m_nPads * nOutTot;
+            std::copy(locField, locField + acturalSize, outptr);
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
+    }
+
+    // Size based template version.
+    template <int nq_in0, int nq_in1, int nq_in2, int nq_out0, int nq_out1,
+              int nq_out2>
+    void operator3D(const Array<OneD, const NekDouble> &input,
+                    Array<OneD, NekDouble> &output)
+    {
+        constexpr auto nInTot  = nq_in0 * nq_in1 * nq_in2;
+        constexpr auto nOutTot = nq_out0 * nq_out1 * nq_out2;
+        constexpr auto nb_out  = nOutTot * vec_t::width;
+        const auto nb_in       = nInTot * vec_t::width;
+
+        auto *inptr  = &input[0];
+        auto *outptr = &output[0];
+
+        // Workspace for kernels - also checks preconditions
+        size_t wsp0Size = 0, wsp1Size = 0;
+        PhysInterp1DScaled3DWorkspace<SHAPE_TYPE>(nq_in0, nq_in1, nq_in2,
+                                                  nq_out0, nq_out1, nq_out2,
+                                                  wsp0Size, wsp1Size);
+
+        std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
+            tmpIn(nInTot), tmpOut(nOutTot);
+
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nb_out];
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nb_in, locField);
+            // load_interleave(locField, m_nInTot, tmpIn);
+            load_unalign_interleave(inptr, nInTot, tmpIn);
+
+            PhysInterp1DScaled3DKernel<SHAPE_TYPE>(
+                nq_in0, nq_in1, nq_in2, nq_out0, nq_out1, nq_out2, tmpIn,
+                this->m_I[0], this->m_I[1], this->m_I[2], wsp0, wsp1, tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, nOutTot, locField);
+            // std::copy(locField, locField + nb_out, outptr);
+            deinterleave_unalign_store(tmpOut, nOutTot, outptr);
+
+            inptr += nb_in;
+            outptr += nb_out;
+        }
+        // last block
+        {
+            int acturalSize = nb_in - this->m_nPads * nInTot;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, nInTot, tmpIn);
+
+            PhysInterp1DScaled3DKernel<SHAPE_TYPE>(
+                nq_in0, nq_in1, nq_in2, nq_out0, nq_out1, nq_out2, tmpIn,
+                this->m_I[0], this->m_I[1], this->m_I[2], wsp0, wsp1, tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, nOutTot, locField);
+            acturalSize = nb_out - this->m_nPads * nOutTot;
+            std::copy(locField, locField + acturalSize, outptr);
+        }
+    }
+
+#endif // SHAPE_DIMENSION
+
+private:
+};
+
+} // namespace Nektar::MatrixFree
+
+#endif
diff --git a/library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp b/library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp
new file mode 100644
index 0000000000..8d728be480
--- /dev/null
+++ b/library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp
@@ -0,0 +1,225 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+// File: PhysInterp1DScaledKernels.hpp
+//
+// For more information, please see: http://www.nektar.info
+//
+// The MIT License
+//
+// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
+// Department of Aeronautics, Imperial College London (UK), and Scientific
+// Computing and Imaging Institute, University of Utah (USA).
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+//
+// Description:
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_KERNELS_HPP
+#define NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_KERNELS_HPP
+
+#include <LibUtilities/BasicUtils/NekInline.hpp>
+
+namespace Nektar::MatrixFree
+{
+
+using namespace tinysimd;
+using vec_t = simd<NekDouble>;
+
+// Workspace - used to dynamically get the workspace size needed for
+// temporary memory.
+#if defined(SHAPE_DIMENSION_1D)
+
+template <LibUtilities::ShapeType SHAPE_TYPE>
+NEK_FORCE_INLINE static void PhysInterp1DScaled1DWorkspace(
+    [[maybe_unused]] const size_t nq_in0, [[maybe_unused]] const size_t nq_out0)
+
+{
+}
+
+#elif defined(SHAPE_DIMENSION_2D)
+
+template <LibUtilities::ShapeType SHAPE_TYPE>
+NEK_FORCE_INLINE static void PhysInterp1DScaled2DWorkspace(
+    [[maybe_unused]] const size_t nq_in0, [[maybe_unused]] const size_t nq_in1,
+    [[maybe_unused]] const size_t nq_out0,
+    [[maybe_unused]] const size_t nq_out1, size_t &wsp0Size)
+{
+    wsp0Size = std::max(wsp0Size, nq_in0 * nq_out0);
+}
+
+#elif defined(SHAPE_DIMENSION_3D)
+
+template <LibUtilities::ShapeType SHAPE_TYPE>
+NEK_FORCE_INLINE static void PhysInterp1DScaled3DWorkspace(
+    [[maybe_unused]] const size_t nq_in0, [[maybe_unused]] const size_t nq_in1,
+    [[maybe_unused]] const size_t nq_in2, [[maybe_unused]] const size_t nq_out0,
+    [[maybe_unused]] const size_t nq_out1,
+    [[maybe_unused]] const size_t nq_out2, size_t &wsp0Size, size_t &wsp1Size)
+{
+
+    wsp0Size =
+        std::max(wsp0Size, nq_out0 * nq_in1 * nq_in2); // nq_in1 == nq_in2
+    wsp1Size =
+        std::max(wsp1Size, nq_out0 * nq_out1 * nq_in2); // nq_out0 == nq_out1
+}
+
+#endif // SHAPE_DIMENSION
+
+// The dimension kernels which select the shape kernel.
+#if defined(SHAPE_DIMENSION_1D)
+
+template <LibUtilities::ShapeType SHAPE_TYPE>
+NEK_FORCE_INLINE static void PhysInterp1DScaled1DKernel(
+    const size_t nq_in0, const size_t nq_out0,
+    const std::vector<vec_t, allocator<vec_t>> &in,
+    const std::vector<vec_t, allocator<vec_t>> &basis0,
+    std::vector<vec_t, allocator<vec_t>> &out)
+{
+    for (int i = 0; i < nq_out0; ++i)
+    {
+        vec_t tmp = in[0] * basis0[i]; // Load 2x
+
+        for (int p = 1; p < nq_in0; ++p)
+        {
+            tmp.fma(in[p], basis0[p * nq_out0 + i]); // Load 2x
+        }
+
+        out[i] = tmp; // Store 1x
+    }
+}
+
+#elif defined(SHAPE_DIMENSION_2D)
+
+template <LibUtilities::ShapeType SHAPE_TYPE>
+NEK_FORCE_INLINE static void PhysInterp1DScaled2DKernel(
+    const size_t nq_in0, const size_t nq_in1, const size_t nq_out0,
+    const size_t nq_out1, const std::vector<vec_t, allocator<vec_t>> &in,
+    const std::vector<vec_t, allocator<vec_t>> &basis0,
+    const std::vector<vec_t, allocator<vec_t>> &basis1,
+    std::vector<vec_t, allocator<vec_t>> &wsp,
+    std::vector<vec_t, allocator<vec_t>> &out)
+{
+    for (int i = 0, cnt_iq = 0; i < nq_out0; ++i)
+    {
+        for (int q = 0, cnt_pq = 0; q < nq_in1; ++q, ++cnt_iq)
+        {
+            vec_t tmp = in[cnt_pq] * basis0[i]; // Load 2x
+            ++cnt_pq;
+            for (int p = 1; p < nq_in0; ++p, ++cnt_pq)
+            {
+                tmp.fma(in[cnt_pq], basis0[p * nq_out0 + i]); // Load 2x
+            }
+            wsp[cnt_iq] = tmp; // Store 1x
+        }
+    }
+
+    for (int j = 0, cnt_ij = 0; j < nq_out1; ++j)
+    {
+        for (int i = 0, cnt_iq = 0; i < nq_out0; ++i, ++cnt_ij)
+        {
+            vec_t tmp = wsp[cnt_iq] * basis1[j]; // Load 2x
+            ++cnt_iq;
+            for (int q = 1; q < nq_in1; ++q, ++cnt_iq)
+            {
+                tmp.fma(wsp[cnt_iq], basis1[q * nq_out1 + j]); // Load 2x
+            }
+            out[cnt_ij] = tmp; // Store 1x
+        }
+    }
+}
+
+#elif defined(SHAPE_DIMENSION_3D)
+
+template <LibUtilities::ShapeType SHAPE_TYPE>
+NEK_FORCE_INLINE static void PhysInterp1DScaled3DKernel(
+    const size_t nq_in0, const size_t nq_in1, const size_t nq_in2,
+    const size_t nq_out0, const size_t nq_out1, const size_t nq_out2,
+    const std::vector<vec_t, allocator<vec_t>> &in,
+    const std::vector<vec_t, allocator<vec_t>> &basis0,
+    const std::vector<vec_t, allocator<vec_t>> &basis1,
+    const std::vector<vec_t, allocator<vec_t>> &basis2,
+    std::vector<vec_t, allocator<vec_t>> &sum_irq, // nq_out0 * nq_in2 * nq_in1
+    std::vector<vec_t, allocator<vec_t>> &sum_jir, // nq_out1 * nq_out0 * nq_in2
+    std::vector<vec_t, allocator<vec_t>> &out)
+{
+    for (int i = 0, cnt_irq = 0; i < nq_out0; ++i)
+    {
+        for (int r = 0, cnt_rqp = 0; r < nq_in2; ++r)
+        {
+            for (int q = 0; q < nq_in1; ++q, ++cnt_irq)
+            {
+                vec_t tmp = in[cnt_rqp] * basis0[i];
+                ++cnt_rqp;
+
+                for (int p = 1; p < nq_in0; ++p, ++cnt_rqp)
+                {
+                    tmp.fma(in[cnt_rqp], basis0[p * nq_out0 + i]);
+                }
+
+                sum_irq[cnt_irq] = tmp;
+            }
+        }
+    }
+
+    for (int j = 0, cnt_jir = 0; j < nq_out1; ++j)
+    {
+        for (int i = 0, cnt_irq = 0; i < nq_out0; ++i)
+        {
+            for (int r = 0; r < nq_in2; ++r, ++cnt_jir)
+            {
+                vec_t tmp = sum_irq[cnt_irq] * basis1[j];
+                ++cnt_irq;
+
+                for (int q = 1; q < nq_in1; ++q)
+                {
+                    tmp.fma(sum_irq[cnt_irq++], basis1[q * nq_out1 + j]);
+                }
+
+                sum_jir[cnt_jir] = tmp;
+            }
+        }
+    }
+
+    for (int k = 0, cnt_kji = 0; k < nq_out2; ++k)
+    {
+        for (int j = 0, cnt_jir = 0; j < nq_out1; ++j)
+        {
+            for (int i = 0; i < nq_out0; ++i, ++cnt_kji)
+            {
+                vec_t tmp = sum_jir[cnt_jir] * basis2[k];
+                ++cnt_jir;
+
+                for (int r = 1; r < nq_in2; ++r)
+                {
+                    tmp.fma(sum_jir[cnt_jir++], basis2[r * nq_out2 + k]);
+                }
+
+                out[cnt_kji] = tmp;
+            }
+        }
+    }
+}
+
+#endif // SHAPE_DIMENSION
+
+} // namespace Nektar::MatrixFree
+
+#endif
diff --git a/library/MatrixFreeOps/SwitchNodesPoints.h b/library/MatrixFreeOps/SwitchNodesPoints.h
index c369eb115a..c8642cd14e 100644
--- a/library/MatrixFreeOps/SwitchNodesPoints.h
+++ b/library/MatrixFreeOps/SwitchNodesPoints.h
@@ -102,9 +102,6 @@
 
 /* start switch definition for operator 1D*/
 {
-    const int nm0 = m_basis[0]->GetNumModes();
-    const int nq0 = m_basis[0]->GetNumPoints();
-
 #if defined(SHAPE_TYPE_SEG)
 
     switch (nm0)
@@ -184,11 +181,6 @@
 
 /* start switch definition for operator 2D*/
 {
-    const int nm0 = m_basis[0]->GetNumModes();
-    const int nm1 = m_basis[1]->GetNumModes();
-
-    const int nq0 = m_basis[0]->GetNumPoints();
-    const int nq1 = m_basis[1]->GetNumPoints();
 
 #if defined(SHAPE_TYPE_TRI)
 
@@ -305,14 +297,6 @@
 
 /* start switch definition for operator 3D*/
 {
-    const int nm0 = m_basis[0]->GetNumModes();
-    const int nm1 = m_basis[1]->GetNumModes();
-    const int nm2 = m_basis[2]->GetNumModes();
-
-    const int nq0 = m_basis[0]->GetNumPoints();
-    const int nq1 = m_basis[1]->GetNumPoints();
-    const int nq2 = m_basis[2]->GetNumPoints();
-
 #if defined(SHAPE_TYPE_HEX)
 
     if (nm0 == nm1 && nm0 == nm2 && nq0 == nq1 && nq0 == nq2)
diff --git a/library/MatrixFreeOps/SwitchPoints.h b/library/MatrixFreeOps/SwitchPoints.h
index 01b5809f86..af07d77628 100644
--- a/library/MatrixFreeOps/SwitchPoints.h
+++ b/library/MatrixFreeOps/SwitchPoints.h
@@ -74,8 +74,6 @@
 
 /* start of switch include for operator1D */
 {
-    const int nq0 = m_basis[0]->GetNumPoints();
-
 #if defined(SHAPE_TYPE_SEG)
     switch (nq0)
     {
@@ -113,9 +111,6 @@
 
 /* start of switch include for operator2D */
 {
-    const int nq0 = m_basis[0]->GetNumPoints();
-    const int nq1 = m_basis[1]->GetNumPoints();
-
 #if defined(SHAPE_TYPE_TRI)
 
     if (nq0 == nq1 + 1)
@@ -160,7 +155,6 @@
     }
 
 #endif // SHAPE_TYPE
-
     else
     {
         operator2D(input, output);
@@ -195,10 +189,6 @@
 
 /* start of include switch details for operator3D */
 {
-    const int nq0 = m_basis[0]->GetNumPoints();
-    const int nq1 = m_basis[1]->GetNumPoints();
-    const int nq2 = m_basis[2]->GetNumPoints();
-
 #if defined(SHAPE_TYPE_HEX)
 
     if (nq0 == nq1 && nq0 == nq2)
diff --git a/library/MultiRegions/ExpList.cpp b/library/MultiRegions/ExpList.cpp
index 49938f9d15..c1e03af18c 100644
--- a/library/MultiRegions/ExpList.cpp
+++ b/library/MultiRegions/ExpList.cpp
@@ -6008,6 +6008,7 @@ void ExpList::v_PhysInterp1DScaled([[maybe_unused]] const NekDouble scale,
         m_collections[i].UpdateFactors(Collections::ePhysInterp1DScaled,
                                        factors);
     }
+
     LIKWID_MARKER_START("v_PhysInterp1DScaled");
     timer.Start();
     Array<OneD, NekDouble> tmp;
diff --git a/library/UnitTests/Collections/TestHexCollection.cpp b/library/UnitTests/Collections/TestHexCollection.cpp
index f5890ff117..d51d3e4b74 100644
--- a/library/UnitTests/Collections/TestHexCollection.cpp
+++ b/library/UnitTests/Collections/TestHexCollection.cpp
@@ -497,7 +497,7 @@ BOOST_AUTO_TEST_CASE(TestHexBwdTrans_NoCollection_VariableP)
     CollExp.push_back(Exp);
 
     LibUtilities::SessionReaderSharedPtr dummySession;
-    Collections::CollectionOptimisation colOpt(dummySession, 2,
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
                                                Collections::eNoCollection);
     Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(stdExp);
     Collections::Collection c(CollExp, impTypes);
@@ -869,7 +869,7 @@ BOOST_AUTO_TEST_CASE(TestHexBwdTrans_MatrixFree_UniformP)
     CollExp.push_back(Exp);
 
     LibUtilities::SessionReaderSharedPtr dummySession;
-    Collections::CollectionOptimisation colOpt(dummySession, 2,
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
                                                Collections::eMatrixFree);
     Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(stdExp);
     Collections::Collection c(CollExp, impTypes);
@@ -1008,7 +1008,7 @@ BOOST_AUTO_TEST_CASE(TestHexIProductWRTBase_MatrixFree_UniformP_Undeformed)
     CollExp.push_back(Exp);
 
     LibUtilities::SessionReaderSharedPtr dummySession;
-    Collections::CollectionOptimisation colOpt(dummySession, 2,
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
                                                Collections::eMatrixFree);
     Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(stdExp);
     Collections::Collection c(CollExp, impTypes);
@@ -1085,7 +1085,7 @@ BOOST_AUTO_TEST_CASE(TestHexIProductWRTBase_MatrixFree_UniformP_Deformed)
     CollExp.push_back(Exp);
 
     LibUtilities::SessionReaderSharedPtr dummySession;
-    Collections::CollectionOptimisation colOpt(dummySession, 2,
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
                                                Collections::eMatrixFree);
     Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(stdExp);
     Collections::Collection c(CollExp, impTypes);
@@ -1163,7 +1163,7 @@ BOOST_AUTO_TEST_CASE(
     CollExp.push_back(Exp);
 
     LibUtilities::SessionReaderSharedPtr dummySession;
-    Collections::CollectionOptimisation colOpt(dummySession, 2,
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
                                                Collections::eMatrixFree);
     Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(stdExp);
     Collections::Collection c(CollExp, impTypes);
@@ -4644,4 +4644,183 @@ BOOST_AUTO_TEST_CASE(TestHexHelmholtz_MatrixFree_UniformP_ConstVarDiff)
         BOOST_CHECK_CLOSE(coeffsRef[i], coeffs[i], epsilon);
     }
 }
+
+BOOST_AUTO_TEST_CASE(TestHexPhysInterp1D_NoCollection_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(3u, 0u, -1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(3u, 1u, 1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(3u, 2u, 1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(3u, 3u, -1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v4(
+        new SpatialDomains::PointGeom(3u, 4u, -1.0, -1.0, 1.0));
+    SpatialDomains::PointGeomSharedPtr v5(
+        new SpatialDomains::PointGeom(3u, 5u, 1.0, -1.0, 1.0));
+    SpatialDomains::PointGeomSharedPtr v6(
+        new SpatialDomains::PointGeom(3u, 6u, 2.0, 3.0, 4.0));
+    SpatialDomains::PointGeomSharedPtr v7(
+        new SpatialDomains::PointGeom(3u, 7u, -1.0, 1.0, 1.0));
+
+    SpatialDomains::HexGeomSharedPtr hexGeom =
+        CreateHex(v0, v1, v2, v3, v4, v5, v6, v7);
+
+    Nektar::LibUtilities::PointsType quadPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    unsigned int numQuadPoints = 5;
+    unsigned int numModes      = 4;
+
+    const Nektar::LibUtilities::PointsKey quadPointsKeyDir1(numQuadPoints,
+                                                            quadPointsTypeDir1);
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, numModes,
+                                                      quadPointsKeyDir1);
+
+    Nektar::LocalRegions::HexExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::HexExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1, basisKeyDir1, hexGeom);
+
+    Nektar::StdRegions::StdHexExpSharedPtr stdExp =
+        MemoryManager<Nektar::StdRegions::StdHexExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1, basisKeyDir1);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
+                                               Collections::eNoCollection);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq), zc(nq);
+    Array<OneD, NekDouble> phys(nq), tmp;
+
+    Exp->GetCoords(xc, yc, zc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3) + pow(zc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> zc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, zc, zc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3) + pow(zc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestHexPhysInterp1D_MatrixFree_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(3u, 0u, -1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(3u, 1u, 1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(3u, 2u, 1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(3u, 3u, -1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v4(
+        new SpatialDomains::PointGeom(3u, 4u, -1.0, -1.0, 1.0));
+    SpatialDomains::PointGeomSharedPtr v5(
+        new SpatialDomains::PointGeom(3u, 5u, 1.0, -1.0, 1.0));
+    SpatialDomains::PointGeomSharedPtr v6(
+        new SpatialDomains::PointGeom(3u, 6u, 2.0, 3.0, 4.0));
+    SpatialDomains::PointGeomSharedPtr v7(
+        new SpatialDomains::PointGeom(3u, 7u, -1.0, 1.0, 1.0));
+
+    SpatialDomains::HexGeomSharedPtr hexGeom =
+        CreateHex(v0, v1, v2, v3, v4, v5, v6, v7);
+
+    Nektar::LibUtilities::PointsType quadPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    unsigned int numQuadPoints = 5;
+    unsigned int numModes      = 4;
+
+    const Nektar::LibUtilities::PointsKey quadPointsKeyDir1(numQuadPoints,
+                                                            quadPointsTypeDir1);
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, numModes,
+                                                      quadPointsKeyDir1);
+
+    Nektar::LocalRegions::HexExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::HexExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1, basisKeyDir1, hexGeom);
+
+    Nektar::StdRegions::StdHexExpSharedPtr stdExp =
+        MemoryManager<Nektar::StdRegions::StdHexExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1, basisKeyDir1);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
+                                               Collections::eMatrixFree);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq), zc(nq);
+    Array<OneD, NekDouble> phys(nq), tmp;
+
+    Exp->GetCoords(xc, yc, zc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3) + pow(zc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> zc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, zc, zc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3) + pow(zc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
 } // namespace Nektar::HexCollectionTests
diff --git a/library/UnitTests/Collections/TestPrismCollection.cpp b/library/UnitTests/Collections/TestPrismCollection.cpp
index d427ef12eb..9b98afe2e9 100644
--- a/library/UnitTests/Collections/TestPrismCollection.cpp
+++ b/library/UnitTests/Collections/TestPrismCollection.cpp
@@ -3693,4 +3693,206 @@ BOOST_AUTO_TEST_CASE(TestPrismHelmholtz_MatrixFree_UniformP_ConstVarDiff)
     }
 }
 
+BOOST_AUTO_TEST_CASE(TestPrismPhsyInterp1DScaled_NoCollection_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(3u, 0u, -1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(3u, 1u, 1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(3u, 2u, 1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(3u, 3u, -1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v4(
+        new SpatialDomains::PointGeom(3u, 4u, -1.0, -1.0, 1.0));
+    SpatialDomains::PointGeomSharedPtr v5(
+        new SpatialDomains::PointGeom(3u, 5u, -1.0, 1.0, 1.0));
+
+    SpatialDomains::PrismGeomSharedPtr prismGeom =
+        CreatePrism(v0, v1, v2, v3, v4, v5);
+
+    unsigned int numQuadPoints = 7;
+    unsigned int numModes      = 6;
+
+    Nektar::LibUtilities::PointsType PointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir1(numQuadPoints,
+                                                        PointsTypeDir1);
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, numModes,
+                                                      PointsKeyDir1);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir2 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir2(numQuadPoints,
+                                                        PointsTypeDir2);
+    Nektar::LibUtilities::BasisType basisTypeDir2 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir2(basisTypeDir2, numModes,
+                                                      PointsKeyDir2);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir3 =
+        Nektar::LibUtilities::eGaussRadauMAlpha1Beta0;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir3(numQuadPoints - 1,
+                                                        PointsTypeDir3);
+    Nektar::LibUtilities::BasisType basisTypeDir3 =
+        Nektar::LibUtilities::eModified_B;
+    const Nektar::LibUtilities::BasisKey basisKeyDir3(basisTypeDir3, numModes,
+                                                      PointsKeyDir3);
+
+    Nektar::LocalRegions::PrismExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::PrismExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir2, basisKeyDir3, prismGeom);
+
+    Nektar::StdRegions::StdPrismExpSharedPtr stdExp =
+        MemoryManager<Nektar::StdRegions::StdPrismExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1, basisKeyDir1);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
+                                               Collections::eNoCollection);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq), zc(nq);
+    Array<OneD, NekDouble> phys(nq);
+
+    Exp->GetCoords(xc, yc, zc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3) + pow(zc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> zc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, zc, zc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 2.0e-8;
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3) + pow(zc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestPrismPhsyInterp1DScaled_MatrixFree_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(3u, 0u, -1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(3u, 1u, 1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(3u, 2u, 1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(3u, 3u, -1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v4(
+        new SpatialDomains::PointGeom(3u, 4u, -1.0, -1.0, 1.0));
+    SpatialDomains::PointGeomSharedPtr v5(
+        new SpatialDomains::PointGeom(3u, 5u, -1.0, 1.0, 1.0));
+
+    SpatialDomains::PrismGeomSharedPtr prismGeom =
+        CreatePrism(v0, v1, v2, v3, v4, v5);
+
+    unsigned int numQuadPoints = 7;
+    unsigned int numModes      = 6;
+
+    Nektar::LibUtilities::PointsType PointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir1(numQuadPoints,
+                                                        PointsTypeDir1);
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, numModes,
+                                                      PointsKeyDir1);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir2 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir2(numQuadPoints,
+                                                        PointsTypeDir2);
+    Nektar::LibUtilities::BasisType basisTypeDir2 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir2(basisTypeDir2, numModes,
+                                                      PointsKeyDir2);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir3 =
+        Nektar::LibUtilities::eGaussRadauMAlpha1Beta0;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir3(numQuadPoints - 1,
+                                                        PointsTypeDir3);
+    Nektar::LibUtilities::BasisType basisTypeDir3 =
+        Nektar::LibUtilities::eModified_B;
+    const Nektar::LibUtilities::BasisKey basisKeyDir3(basisTypeDir3, numModes,
+                                                      PointsKeyDir3);
+
+    Nektar::LocalRegions::PrismExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::PrismExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir2, basisKeyDir3, prismGeom);
+
+    Nektar::StdRegions::StdPrismExpSharedPtr stdExp =
+        MemoryManager<Nektar::StdRegions::StdPrismExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1, basisKeyDir1);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
+                                               Collections::eMatrixFree);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq), zc(nq);
+    Array<OneD, NekDouble> phys(nq);
+
+    Exp->GetCoords(xc, yc, zc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3) + pow(zc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> zc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, zc, zc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3) + pow(zc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
 } // namespace Nektar::PrismCollectionTests
diff --git a/library/UnitTests/Collections/TestPyrCollection.cpp b/library/UnitTests/Collections/TestPyrCollection.cpp
index d5be2f67dc..ec464f5d53 100644
--- a/library/UnitTests/Collections/TestPyrCollection.cpp
+++ b/library/UnitTests/Collections/TestPyrCollection.cpp
@@ -3395,4 +3395,189 @@ BOOST_AUTO_TEST_CASE(TestPyrHelmholtz_MatrixFree_UniformP_ConstVarDiff)
         BOOST_CHECK_CLOSE(coeffsRef[i], coeffs[i], epsilon);
     }
 }
+
+BOOST_AUTO_TEST_CASE(TestPyrPhysInterp1DScaled_NoCollection_UniformP_MultiElmt)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(3u, 0u, -1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(3u, 1u, 1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(3u, 2u, 1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(3u, 3u, -1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v4(
+        new SpatialDomains::PointGeom(3u, 4u, -1.0, -1.0, 1.0));
+
+    SpatialDomains::PyrGeomSharedPtr pyrGeom = CreatePyr(v0, v1, v2, v3, v4);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir1(5, PointsTypeDir1);
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, 4,
+                                                      PointsKeyDir1);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir2 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir2(5, PointsTypeDir2);
+    Nektar::LibUtilities::BasisType basisTypeDir2 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir2(basisTypeDir2, 4,
+                                                      PointsKeyDir2);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir3 =
+        Nektar::LibUtilities::eGaussRadauMAlpha2Beta0;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir3(4, PointsTypeDir3);
+    Nektar::LibUtilities::BasisType basisTypeDir3 =
+        Nektar::LibUtilities::eModifiedPyr_C;
+    const Nektar::LibUtilities::BasisKey basisKeyDir3(basisTypeDir3, 4,
+                                                      PointsKeyDir3);
+
+    Nektar::LocalRegions::PyrExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::PyrExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir2, basisKeyDir3, pyrGeom);
+
+    int nelmts = 1;
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    for (int i = 0; i < nelmts; ++i)
+    {
+        CollExp.push_back(Exp);
+    }
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
+                                               Collections::eNoCollection);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+    Array<OneD, NekDouble> xc(nq), yc(nq), zc(nq);
+    Array<OneD, NekDouble> phys(nq);
+
+    Exp->GetCoords(xc, yc, zc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3) + pow(zc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> zc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, zc, zc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 2.0e-8;
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3) + pow(zc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestPyrPhysInterp1DScaled_MatrixFree_UniformP_MultiElmt)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(3u, 0u, -1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(3u, 1u, 1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(3u, 2u, 1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(3u, 3u, -1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v4(
+        new SpatialDomains::PointGeom(3u, 4u, -1.0, -1.0, 1.0));
+
+    SpatialDomains::PyrGeomSharedPtr pyrGeom = CreatePyr(v0, v1, v2, v3, v4);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir1(5, PointsTypeDir1);
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, 4,
+                                                      PointsKeyDir1);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir2 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir2(5, PointsTypeDir2);
+    Nektar::LibUtilities::BasisType basisTypeDir2 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir2(basisTypeDir2, 4,
+                                                      PointsKeyDir2);
+
+    Nektar::LibUtilities::PointsType PointsTypeDir3 =
+        Nektar::LibUtilities::eGaussRadauMAlpha2Beta0;
+    const Nektar::LibUtilities::PointsKey PointsKeyDir3(4, PointsTypeDir3);
+    Nektar::LibUtilities::BasisType basisTypeDir3 =
+        Nektar::LibUtilities::eModifiedPyr_C;
+    const Nektar::LibUtilities::BasisKey basisKeyDir3(basisTypeDir3, 4,
+                                                      PointsKeyDir3);
+
+    Nektar::LocalRegions::PyrExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::PyrExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir2, basisKeyDir3, pyrGeom);
+
+    int nelmts = 1;
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    for (int i = 0; i < nelmts; ++i)
+    {
+        CollExp.push_back(Exp);
+    }
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
+                                               Collections::eMatrixFree);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+    Array<OneD, NekDouble> xc(nq), yc(nq), zc(nq);
+    Array<OneD, NekDouble> phys(nq);
+
+    Exp->GetCoords(xc, yc, zc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3) + pow(zc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> zc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, zc, zc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 2.0e-8;
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3) + pow(zc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
 } // namespace Nektar::PyrCollectionTests
diff --git a/library/UnitTests/Collections/TestQuadCollection.cpp b/library/UnitTests/Collections/TestQuadCollection.cpp
index 7c61a52f1d..7cd6423568 100644
--- a/library/UnitTests/Collections/TestQuadCollection.cpp
+++ b/library/UnitTests/Collections/TestQuadCollection.cpp
@@ -3286,4 +3286,157 @@ BOOST_AUTO_TEST_CASE(TestQuadHelmholtz_MatrixFree_UniformP_ConstVarDiff)
         BOOST_CHECK_CLOSE(coeffsRef[i], coeffs[i], epsilon);
     }
 }
+
+BOOST_AUTO_TEST_CASE(TestQuadPhysInterp1D_NoCollection_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(2u, 0u, -1.0, -1.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(2u, 1u, 1.0, -1.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(2u, 2u, 1.0, 1.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(2u, 3u, -1.0, 1.0, 0.0));
+
+    SpatialDomains::QuadGeomSharedPtr quadGeom = CreateQuad(v0, v1, v2, v3);
+
+    Nektar::LibUtilities::PointsType quadPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    unsigned int numQuadPoints = 6;
+    const Nektar::LibUtilities::PointsKey quadPointsKeyDir1(numQuadPoints,
+                                                            quadPointsTypeDir1);
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, 4,
+                                                      quadPointsKeyDir1);
+
+    Nektar::LocalRegions::QuadExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::QuadExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1, quadGeom);
+
+    Nektar::StdRegions::StdQuadExpSharedPtr stdExp =
+        MemoryManager<Nektar::StdRegions::StdQuadExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 2,
+                                               Collections::eNoCollection);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq);
+    Array<OneD, NekDouble> phys(nq), tmp;
+
+    Exp->GetCoords(xc, yc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestQuadPhysInterp1D_MatrixFree_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(2u, 0u, -1.0, -1.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(2u, 1u, 1.0, -1.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(2u, 2u, 1.0, 1.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(2u, 3u, -1.0, 1.0, 0.0));
+
+    SpatialDomains::QuadGeomSharedPtr quadGeom = CreateQuad(v0, v1, v2, v3);
+
+    Nektar::LibUtilities::PointsType quadPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    unsigned int numQuadPoints = 6;
+    const Nektar::LibUtilities::PointsKey quadPointsKeyDir1(numQuadPoints,
+                                                            quadPointsTypeDir1);
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, 4,
+                                                      quadPointsKeyDir1);
+
+    Nektar::LocalRegions::QuadExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::QuadExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1, quadGeom);
+
+    Nektar::StdRegions::StdQuadExpSharedPtr stdExp =
+        MemoryManager<Nektar::StdRegions::StdQuadExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir1);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 2,
+                                               Collections::eMatrixFree);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq);
+    Array<OneD, NekDouble> phys(nq), tmp;
+
+    Exp->GetCoords(xc, yc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
 } // namespace Nektar::QuadCollectionTests
diff --git a/library/UnitTests/Collections/TestSegCollection.cpp b/library/UnitTests/Collections/TestSegCollection.cpp
index 8bb95e8dfc..0630801cd5 100644
--- a/library/UnitTests/Collections/TestSegCollection.cpp
+++ b/library/UnitTests/Collections/TestSegCollection.cpp
@@ -1795,4 +1795,144 @@ BOOST_AUTO_TEST_CASE(
         BOOST_CHECK_CLOSE(coeffs1[i], coeffs2[i], epsilon);
     }
 }
+
+BOOST_AUTO_TEST_CASE(TestSegPhysInterp1D_NoCollection_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(2u, 0u, -1.0, 0.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(2u, 1u, 1.0, 0.0, 0.0));
+
+    SpatialDomains::SegGeomSharedPtr segGeom = CreateSegGeom(0, v0, v1, 2);
+
+    Nektar::LibUtilities::PointsType segPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    unsigned int numSegPoints = 6;
+    const Nektar::LibUtilities::PointsKey segPointsKeyDir1(numSegPoints,
+                                                           segPointsTypeDir1);
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, 4,
+                                                      segPointsKeyDir1);
+
+    Nektar::LocalRegions::SegExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::SegExp>::AllocateSharedPtr(
+            basisKeyDir1, segGeom);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 1,
+                                               Collections::eNoCollection);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq);
+    Array<OneD, NekDouble> phys(nq), tmp;
+
+    Exp->GetCoords(xc, yc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestSegPhysInterp1D_MatrixFree_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(2u, 0u, -1.0, 1.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(2u, 1u, 1.0, 1.0, 0.0));
+
+    SpatialDomains::SegGeomSharedPtr segGeom = CreateSegGeom(0, v0, v1, 2);
+
+    Nektar::LibUtilities::PointsType segPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    unsigned int numSegPoints = 6;
+    const Nektar::LibUtilities::PointsKey segPointsKeyDir1(numSegPoints,
+                                                           segPointsTypeDir1);
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, 4,
+                                                      segPointsKeyDir1);
+
+    Nektar::LocalRegions::SegExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::SegExp>::AllocateSharedPtr(
+            basisKeyDir1, segGeom);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 1,
+                                               Collections::eMatrixFree);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq);
+    Array<OneD, NekDouble> phys(nq), tmp;
+
+    Exp->GetCoords(xc, yc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        yc[i]   = (fabs(yc[i]) < 1e-14) ? 0.0 : yc[i];
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        xc1[i]          = (fabs(xc1[i]) < 1e-14) ? 0.0 : xc1[i];
+        yc1[i]          = (fabs(yc1[i]) < 1e-14) ? 0.0 : yc1[i];
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
 } // namespace Nektar::SegCollectionTests
diff --git a/library/UnitTests/Collections/TestTetCollection.cpp b/library/UnitTests/Collections/TestTetCollection.cpp
index 7a70f4d2ef..4de6373b39 100644
--- a/library/UnitTests/Collections/TestTetCollection.cpp
+++ b/library/UnitTests/Collections/TestTetCollection.cpp
@@ -2763,7 +2763,7 @@ BOOST_AUTO_TEST_CASE(TestTetmHelmholtz_IterPerExp_UniformP_ConstVarDiff)
 
     Nektar::StdRegions::StdTetExpSharedPtr stdExp =
         MemoryManager<Nektar::StdRegions::StdTetExp>::AllocateSharedPtr(
-            basisKeyDir1, basisKeyDir1, basisKeyDir1);
+            basisKeyDir1, basisKeyDir2, basisKeyDir3);
 
     int nelmts = 10;
 
@@ -2873,7 +2873,7 @@ BOOST_AUTO_TEST_CASE(TestTetmHelmholtz_MatrixFree_UniformP)
 
     Nektar::StdRegions::StdTetExpSharedPtr stdExp =
         MemoryManager<Nektar::StdRegions::StdTetExp>::AllocateSharedPtr(
-            basisKeyDir1, basisKeyDir1, basisKeyDir1);
+            basisKeyDir1, basisKeyDir2, basisKeyDir3);
 
     int nelmts = 10;
 
@@ -2977,7 +2977,7 @@ BOOST_AUTO_TEST_CASE(TestTetmHelmholtz_MatrixFree_Deformed_OverInt)
 
     Nektar::StdRegions::StdTetExpSharedPtr stdExp =
         MemoryManager<Nektar::StdRegions::StdTetExp>::AllocateSharedPtr(
-            basisKeyDir1, basisKeyDir1, basisKeyDir1);
+            basisKeyDir1, basisKeyDir2, basisKeyDir3);
 
     int nelmts = 10;
 
@@ -3178,7 +3178,7 @@ BOOST_AUTO_TEST_CASE(TestTetmHelmholtz_MatrixFree_UniformP_ConstVarDiff)
 
     Nektar::StdRegions::StdTetExpSharedPtr stdExp =
         MemoryManager<Nektar::StdRegions::StdTetExp>::AllocateSharedPtr(
-            basisKeyDir1, basisKeyDir1, basisKeyDir1);
+            basisKeyDir1, basisKeyDir2, basisKeyDir3);
 
     int nelmts = 10;
 
@@ -3238,4 +3238,194 @@ BOOST_AUTO_TEST_CASE(TestTetmHelmholtz_MatrixFree_UniformP_ConstVarDiff)
         BOOST_CHECK_CLOSE(coeffsRef[i], coeffs[i], epsilon);
     }
 }
+
+BOOST_AUTO_TEST_CASE(TestTetPhysInterp1D_NoCollections_UniformP)
+{
+
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(3u, 0u, -1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(3u, 1u, 1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(3u, 2u, -1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(3u, 3u, -1.0, -1.0, 1.0));
+
+    SpatialDomains::TetGeomSharedPtr tetGeom = CreateTet(v0, v1, v2, v3);
+
+    unsigned int numQuadPoints = 5;
+    unsigned int numModes      = 4;
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir1(numQuadPoints,
+                                                           triPointsTypeDir1);
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, numModes,
+                                                      triPointsKeyDir1);
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir2 =
+        Nektar::LibUtilities::eGaussRadauMAlpha1Beta0;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir2(numQuadPoints - 1,
+                                                           triPointsTypeDir2);
+    Nektar::LibUtilities::BasisType basisTypeDir2 =
+        Nektar::LibUtilities::eModified_B;
+    const Nektar::LibUtilities::BasisKey basisKeyDir2(basisTypeDir2, numModes,
+                                                      triPointsKeyDir2);
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir3 =
+        Nektar::LibUtilities::eGaussRadauMAlpha2Beta0;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir3(numQuadPoints - 1,
+                                                           triPointsTypeDir3);
+    Nektar::LibUtilities::BasisType basisTypeDir3 =
+        Nektar::LibUtilities::eModified_C;
+    const Nektar::LibUtilities::BasisKey basisKeyDir3(basisTypeDir3, numModes,
+                                                      triPointsKeyDir3);
+
+    Nektar::LocalRegions::TetExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::TetExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir2, basisKeyDir3, tetGeom);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
+                                               Collections::eNoCollection);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq), zc(nq);
+    Array<OneD, NekDouble> phys(nq);
+
+    Exp->GetCoords(xc, yc, zc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3) + pow(zc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> zc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, zc, zc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3) + pow(zc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestTetPhysInterp1D_MatrixFree_UniformP)
+{
+
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(3u, 0u, -1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(3u, 1u, 1.0, -1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(3u, 2u, -1.0, 1.0, -1.0));
+    SpatialDomains::PointGeomSharedPtr v3(
+        new SpatialDomains::PointGeom(3u, 3u, -1.0, -1.0, 1.0));
+
+    SpatialDomains::TetGeomSharedPtr tetGeom = CreateTet(v0, v1, v2, v3);
+
+    unsigned int numQuadPoints = 5;
+    unsigned int numModes      = 4;
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir1(numQuadPoints,
+                                                           triPointsTypeDir1);
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, numModes,
+                                                      triPointsKeyDir1);
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir2 =
+        Nektar::LibUtilities::eGaussRadauMAlpha1Beta0;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir2(numQuadPoints - 1,
+                                                           triPointsTypeDir2);
+    Nektar::LibUtilities::BasisType basisTypeDir2 =
+        Nektar::LibUtilities::eModified_B;
+    const Nektar::LibUtilities::BasisKey basisKeyDir2(basisTypeDir2, numModes,
+                                                      triPointsKeyDir2);
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir3 =
+        Nektar::LibUtilities::eGaussRadauMAlpha2Beta0;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir3(numQuadPoints - 1,
+                                                           triPointsTypeDir3);
+    Nektar::LibUtilities::BasisType basisTypeDir3 =
+        Nektar::LibUtilities::eModified_C;
+    const Nektar::LibUtilities::BasisKey basisKeyDir3(basisTypeDir3, numModes,
+                                                      triPointsKeyDir3);
+
+    Nektar::LocalRegions::TetExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::TetExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir2, basisKeyDir3, tetGeom);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 3,
+                                               Collections::eMatrixFree);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq), zc(nq);
+    Array<OneD, NekDouble> phys(nq);
+
+    Exp->GetCoords(xc, yc, zc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3) + pow(zc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> zc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, zc, zc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3) + pow(zc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
 } // namespace Nektar::TetCollectionTests
diff --git a/library/UnitTests/Collections/TestTriCollection.cpp b/library/UnitTests/Collections/TestTriCollection.cpp
index 7d68e2c02b..060c491804 100644
--- a/library/UnitTests/Collections/TestTriCollection.cpp
+++ b/library/UnitTests/Collections/TestTriCollection.cpp
@@ -3190,7 +3190,7 @@ BOOST_AUTO_TEST_CASE(TestTriHelmholtz_IterPerExp_UniformP_ConstVarDiff)
 
     Nektar::StdRegions::StdTriExpSharedPtr stdExp =
         MemoryManager<Nektar::StdRegions::StdTriExp>::AllocateSharedPtr(
-            basisKeyDir1, basisKeyDir1);
+            basisKeyDir1, basisKeyDir2);
 
     int nelmts = 10;
 
@@ -3287,7 +3287,7 @@ BOOST_AUTO_TEST_CASE(TestTriHelmholtz_MatrixFree_UniformP)
 
     Nektar::StdRegions::StdTriExpSharedPtr stdExp =
         MemoryManager<Nektar::StdRegions::StdTriExp>::AllocateSharedPtr(
-            basisKeyDir1, basisKeyDir1);
+            basisKeyDir1, basisKeyDir2);
 
     int nelmts = 10;
 
@@ -3381,7 +3381,7 @@ BOOST_AUTO_TEST_CASE(TestTriHelmholtz_MatrixFree_UniformP_OverInt)
 
     Nektar::StdRegions::StdTriExpSharedPtr stdExp =
         MemoryManager<Nektar::StdRegions::StdTriExp>::AllocateSharedPtr(
-            basisKeyDir1, basisKeyDir1);
+            basisKeyDir1, basisKeyDir2);
 
     int nelmts = 10;
 
@@ -3475,7 +3475,7 @@ BOOST_AUTO_TEST_CASE(TestTriHelmholtz_MatrixFree_UniformP_ConstVarDiff)
 
     Nektar::StdRegions::StdTriExpSharedPtr stdExp =
         MemoryManager<Nektar::StdRegions::StdTriExp>::AllocateSharedPtr(
-            basisKeyDir1, basisKeyDir1);
+            basisKeyDir1, basisKeyDir2);
 
     int nelmts = 10;
 
@@ -3533,4 +3533,167 @@ BOOST_AUTO_TEST_CASE(TestTriHelmholtz_MatrixFree_UniformP_ConstVarDiff)
         BOOST_CHECK_CLOSE(coeffsRef[i], coeffs[i], epsilon);
     }
 }
+
+BOOST_AUTO_TEST_CASE(TestTriPhysInterp1D_NoCollection_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(2u, 0u, -1.5, -1.5, 0.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(2u, 1u, 1.0, -1.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(2u, 2u, -1.0, 1.0, 0.0));
+
+    SpatialDomains::TriGeomSharedPtr triGeom = CreateTri(v0, v1, v2);
+
+    unsigned int numQuadPoints = 5;
+    unsigned int numModes      = 4;
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir1(numQuadPoints,
+                                                           triPointsTypeDir1);
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, numModes,
+                                                      triPointsKeyDir1);
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir2 =
+        Nektar::LibUtilities::eGaussRadauMAlpha1Beta0;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir2(numQuadPoints - 1,
+                                                           triPointsTypeDir2);
+    Nektar::LibUtilities::BasisType basisTypeDir2 =
+        Nektar::LibUtilities::eModified_B;
+    const Nektar::LibUtilities::BasisKey basisKeyDir2(basisTypeDir2, numModes,
+                                                      triPointsKeyDir2);
+
+    Nektar::LocalRegions::TriExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::TriExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir2, triGeom);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 2,
+                                               Collections::eNoCollection);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq);
+    Array<OneD, NekDouble> phys(nq), tmp;
+
+    Exp->GetCoords(xc, yc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
+BOOST_AUTO_TEST_CASE(TestTriPhysInterp1D_MatrixFree_UniformP)
+{
+    SpatialDomains::PointGeomSharedPtr v0(
+        new SpatialDomains::PointGeom(2u, 0u, -1.5, -1.5, 0.0));
+    SpatialDomains::PointGeomSharedPtr v1(
+        new SpatialDomains::PointGeom(2u, 1u, 1.0, -1.0, 0.0));
+    SpatialDomains::PointGeomSharedPtr v2(
+        new SpatialDomains::PointGeom(2u, 2u, -1.0, 1.0, 0.0));
+
+    SpatialDomains::TriGeomSharedPtr triGeom = CreateTri(v0, v1, v2);
+
+    unsigned int numQuadPoints = 5;
+    unsigned int numModes      = 4;
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir1 =
+        Nektar::LibUtilities::eGaussLobattoLegendre;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir1(numQuadPoints,
+                                                           triPointsTypeDir1);
+    Nektar::LibUtilities::BasisType basisTypeDir1 =
+        Nektar::LibUtilities::eModified_A;
+    const Nektar::LibUtilities::BasisKey basisKeyDir1(basisTypeDir1, numModes,
+                                                      triPointsKeyDir1);
+
+    Nektar::LibUtilities::PointsType triPointsTypeDir2 =
+        Nektar::LibUtilities::eGaussRadauMAlpha1Beta0;
+    const Nektar::LibUtilities::PointsKey triPointsKeyDir2(numQuadPoints - 1,
+                                                           triPointsTypeDir2);
+    Nektar::LibUtilities::BasisType basisTypeDir2 =
+        Nektar::LibUtilities::eModified_B;
+    const Nektar::LibUtilities::BasisKey basisKeyDir2(basisTypeDir2, numModes,
+                                                      triPointsKeyDir2);
+
+    Nektar::LocalRegions::TriExpSharedPtr Exp =
+        MemoryManager<Nektar::LocalRegions::TriExp>::AllocateSharedPtr(
+            basisKeyDir1, basisKeyDir2, triGeom);
+
+    std::vector<StdRegions::StdExpansionSharedPtr> CollExp;
+    CollExp.push_back(Exp);
+
+    LibUtilities::SessionReaderSharedPtr dummySession;
+    Collections::CollectionOptimisation colOpt(dummySession, 2,
+                                               Collections::eMatrixFree);
+    Collections::OperatorImpMap impTypes = colOpt.GetOperatorImpMap(Exp);
+    Collections::Collection c(CollExp, impTypes);
+
+    StdRegions::ConstFactorMap factors;
+    factors[StdRegions::eFactorConst] = 1.5;
+    c.Initialise(Collections::ePhysInterp1DScaled, factors);
+
+    const int nq = Exp->GetTotPoints();
+
+    Array<OneD, NekDouble> xc(nq), yc(nq);
+    Array<OneD, NekDouble> phys(nq), tmp;
+
+    Exp->GetCoords(xc, yc);
+
+    for (int i = 0; i < nq; ++i)
+    {
+        phys[i] = pow(xc[i], 3) + pow(yc[i], 3);
+    }
+
+    const int nq1 = c.GetOutputSize(Collections::ePhysInterp1DScaled);
+    Array<OneD, NekDouble> xc1(nq1);
+    Array<OneD, NekDouble> yc1(nq1);
+    Array<OneD, NekDouble> phys1(nq1);
+
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, xc, xc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, yc, yc1);
+    c.ApplyOperator(Collections::ePhysInterp1DScaled, phys, phys1);
+
+    double epsilon = 1.0e-8;
+    // since solution is a polynomial should be able to compare soln directly
+    for (int i = 0; i < nq1; ++i)
+    {
+        NekDouble exact = pow(xc1[i], 3) + pow(yc1[i], 3);
+        phys1[i]        = (fabs(phys1[i]) < 1e-14) ? 0.0 : phys1[i];
+        exact           = (fabs(exact) < 1e-14) ? 0.0 : exact;
+        BOOST_CHECK_CLOSE(phys1[i], exact, epsilon);
+    }
+}
+
 } // namespace Nektar::TriCollectionTests
-- 
GitLab