Compare revisions

Jacques Xing · Chris Cantwell · Chris Cantwell · d05e90c5 · d05e90c5 · d05e90c5
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,12 +26,12 @@ message(STATUS "Found Nektar++: version ${NEKTAR++_VERSION}")

 set(CMAKE_INSTALL_RPATH "${NEKTAR++_LIBRARY_DIRS}")

-set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp)
+set(SRC Field.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp Operators/OperatorIProductWRTBase.cpp Operators/IProductWRTBase/IProductWRTBaseImpl.cpp Operators/OperatorPhysDeriv.cpp Operators/PhysDeriv/PhysDerivImpl.cpp)

 if (NEKTAR_USE_CUDA)
    enable_language(CUDA)
    add_definitions(-DNEKTAR_USE_CUDA)
-    set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu Operators/IProductWRTBase/IProductWRTBaseCUDA.cu)
+    set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu Operators/IProductWRTBase/IProductWRTBaseCUDA.cu Operators/PhysDeriv/PhysDerivCUDA.cu)
 endif()

 if (NEKTAR_USE_SIMD)

--- a/Field.hpp
+++ b/Field.hpp
@@ -99,38 +99,45 @@ public:
        return *this;
    }

-  /**
-   * @brief Compare this field to another field, with absolute
-   * tolerance tol.
-   * @return bool
-   */
-  bool compare(Field<TType, TState> &rhs, double tol)
-  {
-    const std::vector<BlockAttributes>& rhs_blocks = rhs.GetBlocks();
-    TType *store = GetStorage().GetCPUPtr();
-    TType *rhs_store = rhs.GetStorage().GetCPUPtr();
-    if (rhs_blocks.size() != block_attributes.size()) return false;
-
-    size_t i {0};
-    for (size_t bl = 0; bl < block_attributes.size(); ++bl){
-
-      size_t num_elements = block_attributes[bl].num_elements;
-      size_t num_pts = block_attributes[bl].num_pts;
-
-      // Check that each block have the same structure
-      if (num_elements != rhs_blocks[bl].num_elements) return false;
-      if (num_pts != rhs_blocks[bl].num_pts) return false;
-
-      // Compare elements in the blocks
-      for (size_t el = 0 ; el < num_elements; ++el) {
-	for (size_t coeff = 0; coeff < num_pts; ++coeff) {
-	  i = coeff + el * num_pts + bl * num_pts * num_elements;
-	  if (std::abs(store[i] - rhs_store[i]) > tol) return false;
-	}
-      }
+    /**
+     * @brief Compare this field to another field, with absolute
+     * tolerance tol.
+     * @return bool
+     */
+    bool compare(Field<TType, TState> &rhs, double tol)
+    {
+        const std::vector<BlockAttributes> &rhs_blocks = rhs.GetBlocks();
+        TType *store     = GetStorage().GetCPUPtr();
+        TType *rhs_store = rhs.GetStorage().GetCPUPtr();
+        if (rhs_blocks.size() != block_attributes.size())
+            return false;
+
+        size_t i{0};
+        for (size_t bl = 0; bl < block_attributes.size(); ++bl)
+        {
+
+            size_t num_elements = block_attributes[bl].num_elements;
+            size_t num_pts      = block_attributes[bl].num_pts;
+
+            // Check that each block have the same structure
+            if (num_elements != rhs_blocks[bl].num_elements)
+                return false;
+            if (num_pts != rhs_blocks[bl].num_pts)
+                return false;
+
+            // Compare elements in the blocks
+            for (size_t el = 0; el < num_elements; ++el)
+            {
+                for (size_t coeff = 0; coeff < num_pts; ++coeff)
+                {
+                    i = coeff + el * num_pts + bl * num_pts * num_elements;
+                    if (std::abs(store[i] - rhs_store[i]) > tol)
+                        return false;
+                }
+            }
+        }
+        return true;
    }
-    return true;
-  }

    /**
     * @brief Static templated creation method.

--- a/Operators/BwdTrans/BwdTransCUDA.hpp
+++ b/Operators/BwdTrans/BwdTransCUDA.hpp
@@ -61,13 +61,7 @@ public:

    ~OperatorBwdTransImpl()
    {
-        for (auto &basis : m_basis)
-        {
-            for (size_t i = 0; i < basis.second.size(); i++)
-            {
-                cudaFree(basis.second[i]);
-            }
-        }
+        DeallocateDataCUDA<TData>(m_basis);
    }

    void apply(Field<TData, FieldState::Coeff> &in,

--- a/Operators/Operator.hpp
+++ b/Operators/Operator.hpp
@@ -114,6 +114,50 @@ protected:
        return jac;
    }

+    Array<OneD, Array<OneD, TData>> SetDerivativeFactor(size_t dfSize)
+    {
+        // Allocate memory for the derivative factor
+        size_t nDim   = this->m_expansionList->GetShapeDimension();
+        size_t nCoord = this->m_expansionList->GetCoordim(0);
+        Array<OneD, Array<OneD, TData>> derivFac(nDim * nCoord);
+        for (size_t d = 0; d < nDim * nCoord; d++)
+        {
+            derivFac[d] = Array<OneD, TData>(dfSize);
+        }
+
+        // Initialise derivative factor.
+        size_t dfindex   = 0;
+        size_t nTotElmts = this->m_expansionList->GetNumElmts();
+        for (size_t e = 0; e < nTotElmts; ++e)
+        {
+            auto expPtr = this->m_expansionList->GetExp(e);
+            auto &df    = expPtr->GetMetricInfo()->GetDerivFactors(
+                expPtr->GetPointsKeys());
+            size_t nqTot = expPtr->GetTotPoints();
+            if (expPtr->GetMetricInfo()->GetGtype() ==
+                SpatialDomains::eDeformed)
+            {
+                for (size_t d = 0; d < nDim * nCoord; d++)
+                {
+                    for (size_t i = 0; i < nqTot; ++i)
+                    {
+                        derivFac[d][dfindex + i] = df[d][i];
+                    }
+                }
+                dfindex += nqTot;
+            }
+            else
+            {
+                for (size_t d = 0; d < nDim * nCoord; d++)
+                {
+                    derivFac[d][dfindex] = df[d][0];
+                }
+                dfindex += 1;
+            }
+        }
+        return derivFac;
+    }
+
    MultiRegions::ExpListSharedPtr m_expansionList;
 };


--- a/Operators/OperatorHelper.cuh
+++ b/Operators/OperatorHelper.cuh
@@ -7,18 +7,15 @@ namespace Nektar::Operators
 {

 template <typename TData>
-using BasisMap =
-    std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>;
-template <typename TData>
-using WeightMap =
+using DataMap =
    std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>>;

 template <typename TData>
-BasisMap<TData> GetBasisDataCUDA(
+DataMap<TData> GetBasisDataCUDA(
    const MultiRegions::ExpListSharedPtr &expansionList)
 {
    // Initialize data map.
-    BasisMap<TData> basis;
+    DataMap<TData> basis;

    // Initialize basiskey.
    std::vector<LibUtilities::BasisKey> basisKeys(3,
@@ -40,7 +37,7 @@ BasisMap<TData> GetBasisDataCUDA(
        if (basis.find(basisKeys) == basis.end())
        {
            basis[basisKeys] = std::vector<TData *>(nDim, 0);
-            for (size_t d = 0; d < expPtr->GetShapeDimension(); d++)
+            for (size_t d = 0; d < nDim; d++)
            {
                auto ndata      = expPtr->GetBasis(d)->GetBdata().size();
                auto hostPtr    = expPtr->GetBasis(d)->GetBdata().get();
@@ -55,11 +52,93 @@ BasisMap<TData> GetBasisDataCUDA(
 }

 template <typename TData>
-WeightMap<TData> GetWeightDataCUDA(
+DataMap<TData> GetPointDataCUDA(
    const MultiRegions::ExpListSharedPtr &expansionList)
 {
    // Initialize data map.
-    WeightMap<TData> weight;
+    DataMap<TData> points;
+
+    // Initialize basiskey.
+    std::vector<LibUtilities::BasisKey> basisKeys(3,
+                                                  LibUtilities::NullBasisKey);
+
+    // Loop over the elements of expansionList.
+    size_t nDim = expansionList->GetShapeDimension();
+    for (size_t i = 0; i < expansionList->GetNumElmts(); ++i)
+    {
+        auto const expPtr = expansionList->GetExp(i);
+
+        // Fetch basiskeys of current element.
+        for (size_t d = 0; d < nDim; d++)
+        {
+            basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
+        }
+
+        // Copy data to points, if necessary.
+        if (points.find(basisKeys) == points.end())
+        {
+            points[basisKeys] = std::vector<TData *>(nDim, 0);
+            for (size_t d = 0; d < nDim; d++)
+            {
+                auto ndata      = expPtr->GetBasis(d)->GetZ().size();
+                auto hostPtr    = expPtr->GetBasis(d)->GetZ().get();
+                auto &devicePtr = points[basisKeys][d];
+                cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata);
+                cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata,
+                           cudaMemcpyHostToDevice);
+            }
+        }
+    }
+    return points;
+}
+
+template <typename TData>
+DataMap<TData> GetDerivativeDataCUDA(
+    const MultiRegions::ExpListSharedPtr &expansionList)
+{
+    // Initialize data map.
+    DataMap<TData> derivative;
+
+    // Initialize basiskey.
+    std::vector<LibUtilities::BasisKey> basisKeys(3,
+                                                  LibUtilities::NullBasisKey);
+
+    // Loop over the elements of expansionList.
+    size_t nDim = expansionList->GetShapeDimension();
+    for (size_t i = 0; i < expansionList->GetNumElmts(); ++i)
+    {
+        auto const expPtr = expansionList->GetExp(i);
+
+        // Fetch basiskeys of current element.
+        for (size_t d = 0; d < nDim; d++)
+        {
+            basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
+        }
+
+        // Copy data to derivative, if necessary.
+        if (derivative.find(basisKeys) == derivative.end())
+        {
+            derivative[basisKeys] = std::vector<TData *>(nDim, 0);
+            for (size_t d = 0; d < nDim; d++)
+            {
+                auto ndata      = expPtr->GetBasis(d)->GetD()->GetPtr().size();
+                auto hostPtr    = expPtr->GetBasis(d)->GetD()->GetPtr().get();
+                auto &devicePtr = derivative[basisKeys][d];
+                cudaMalloc((void **)&devicePtr, sizeof(TData) * ndata);
+                cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * ndata,
+                           cudaMemcpyHostToDevice);
+            }
+        }
+    }
+    return derivative;
+}
+
+template <typename TData>
+DataMap<TData> GetWeightDataCUDA(
+    const MultiRegions::ExpListSharedPtr &expansionList)
+{
+    // Initialize data map.
+    DataMap<TData> weight;

    // Initialize basiskey.
    std::vector<LibUtilities::BasisKey> basisKeys(3,
@@ -81,7 +160,7 @@ WeightMap<TData> GetWeightDataCUDA(
        if (weight.find(basisKeys) == weight.end())
        {
            weight[basisKeys] = std::vector<TData *>(nDim, 0);
-            for (size_t d = 0; d < expPtr->GetShapeDimension(); d++)
+            for (size_t d = 0; d < nDim; d++)
            {
                auto ndata = expPtr->GetBasis(d)->GetW().size();
                Array<OneD, TData> w(ndata);
@@ -112,4 +191,17 @@ WeightMap<TData> GetWeightDataCUDA(
    }
    return weight;
 }
+
+template <typename TData>
+void DeallocateDataCUDA(DataMap<TData> &dataMap)
+{
+    for (auto &data : dataMap)
+    {
+        for (size_t i = 0; i < data.second.size(); i++)
+        {
+            cudaFree(data.second[i]);
+        }
+    }
+}
+
 } // namespace Nektar::Operators
--- a/Operators/OperatorPhysDeriv.cpp
+++ b/Operators/OperatorPhysDeriv.cpp
+#include "OperatorPhysDeriv.hpp"
+
+namespace Nektar::Operators
+{
+
+template <> const std::string PhysDeriv<>::key          = "PhysDeriv";
+template <> const std::string PhysDeriv<>::default_impl = "StdMat";
+
+} // namespace Nektar::Operators
--- a/Operators/OperatorPhysDeriv.hpp
+++ b/Operators/OperatorPhysDeriv.hpp
+#pragma once
+
+#include "Field.hpp"
+#include "Operator.hpp"
+
+namespace Nektar::Operators
+{
+
+// PhysDeriv base class
+// Defines the apply operator to enforce apply parameter types
+template <typename TData> class OperatorPhysDeriv : public Operator<TData>
+{
+public:
+    virtual ~OperatorPhysDeriv() = default;
+
+    OperatorPhysDeriv(const MultiRegions::ExpListSharedPtr &expansionList)
+        : Operator<TData>(expansionList)
+    {
+    }
+
+    virtual void apply(Field<TData, FieldState::Phys> &in,
+                       Field<TData, FieldState::Phys> &out0,
+                       Field<TData, FieldState::Phys> &out1,
+                       Field<TData, FieldState::Phys> &out2) = 0;
+    virtual void operator()(Field<TData, FieldState::Phys> &in,
+                            Field<TData, FieldState::Phys> &out0,
+                            Field<TData, FieldState::Phys> &out1,
+                            Field<TData, FieldState::Phys> &out2)
+    {
+        apply(in, out0, out1, out2);
+    }
+};
+
+// Descriptor / traits class for PhysDeriv
+template <typename TData = default_fp_type> struct PhysDeriv
+{
+    using class_name = OperatorPhysDeriv<TData>;
+    using FieldIn    = Field<TData, FieldState::Phys>;
+    using FieldOut   = Field<TData, FieldState::Phys>;
+    static const std::string key;
+    static const std::string default_impl;
+
+    static std::shared_ptr<class_name> create(
+        const MultiRegions::ExpListSharedPtr &expansionList,
+        std::string pKey = "")
+    {
+        return Operator<TData>::template create<PhysDeriv>(expansionList, pKey);
+    }
+};
+
+namespace detail
+{
+// Template for PhysDeriv implementations
+template <typename TData, typename Op> class OperatorPhysDerivImpl;
+} // namespace detail
+
+} // namespace Nektar::Operators
--- a/Operators/PhysDeriv/PhysDerivCUDA.cu
+++ b/Operators/PhysDeriv/PhysDerivCUDA.cu
+#include "PhysDerivCUDA.hpp"
+
+namespace Nektar::Operators::detail
+{
+template <>
+std::string OperatorPhysDerivImpl<double, ImplCUDA>::className =
+    GetOperatorFactory<double>().RegisterCreatorFunction(
+        "PhysDerivCUDA", OperatorPhysDerivImpl<double, ImplCUDA>::instantiate,
+        "...");
+}
--- a/Operators/PhysDeriv/PhysDerivCUDA.hpp
+++ b/Operators/PhysDeriv/PhysDerivCUDA.hpp
+#include "MemoryRegionCUDA.hpp"
+#include "Operators/OperatorHelper.cuh"
+#include "Operators/OperatorPhysDeriv.hpp"
+#include "Operators/PhysDeriv/PhysDerivCUDAKernels.cuh"
+
+namespace Nektar::Operators::detail
+{
+
+template <typename TData, bool DEFORMED = false>
+void PhysDeriv1DKernel(const size_t gridSize, const size_t blockSize,
+                       const size_t nq0, const size_t nCoord,
+                       const size_t nElmts, const TData *D0,
+                       const size_t dfSize, TData *df, const TData *in,
+                       TData *out0, TData *out1, TData *out2);
+
+template <typename TData, bool DEFORMED = false>
+void PhysDeriv2DKernel(const size_t gridSize, const size_t blockSize,
+                       LibUtilities::ShapeType shapetype, const size_t nq0,
+                       const size_t nq1, const size_t nCoord,
+                       const size_t nElmts, const TData *D0, const TData *D1,
+                       const TData *Z0, const TData *Z1, const size_t dfSize,
+                       TData *df, const TData *in, TData *out0, TData *out1,
+                       TData *out2);
+
+template <typename TData, bool DEFORMED = false>
+void PhysDeriv3DKernel(const size_t gridSize, const size_t blockSize,
+                       LibUtilities::ShapeType shapetype, const size_t nq0,
+                       const size_t nq1, const size_t nq2, const size_t nCoord,
+                       const size_t nElmts, const TData *D0, const TData *D1,
+                       const TData *D2, const TData *Z0, const TData *Z1,
+                       const TData *Z2, const size_t dfSize, TData *df,
+                       const TData *in, TData *out0, TData *out1, TData *out2);
+
+// Matrix-free implementation
+template <typename TData>
+class OperatorPhysDerivImpl<TData, ImplCUDA> : public OperatorPhysDeriv<TData>
+{
+public:
+    OperatorPhysDerivImpl(const MultiRegions::ExpListSharedPtr &expansionList)
+        : OperatorPhysDeriv<TData>(expansionList)
+    {
+        size_t nTotElmts = this->m_expansionList->GetNumElmts();
+        size_t nDim      = this->m_expansionList->GetShapeDimension();
+        size_t nCoord    = this->m_expansionList->GetCoordim(0);
+
+        // Initialise derivative factor.
+        m_dfSize      = Operator<TData>::GetGeometricFactorSize();
+        auto derivFac = Operator<TData>::SetDerivativeFactor(m_dfSize);
+        cudaMalloc((void **)&m_derivFac,
+                   sizeof(TData) * nDim * nCoord * m_dfSize);
+        for (size_t d = 0; d < nDim * nCoord; d++)
+        {
+            auto hostPtr   = derivFac[d].get();
+            auto devicePtr = m_derivFac + d * m_dfSize;
+            cudaMemcpy(devicePtr, hostPtr, sizeof(TData) * m_dfSize,
+                       cudaMemcpyHostToDevice);
+        }
+
+        // Initialize points.
+        m_Z = GetPointDataCUDA<TData>(expansionList);
+
+        // Initialize derivative matrix.
+        m_D = GetDerivativeDataCUDA<TData>(expansionList);
+    }
+
+    ~OperatorPhysDerivImpl(void)
+    {
+        DeallocateDataCUDA<TData>(m_Z);
+        DeallocateDataCUDA<TData>(m_D);
+        cudaFree(m_derivFac);
+    }
+
+    void apply(Field<TData, FieldState::Phys> &in,
+               Field<TData, FieldState::Phys> &out0,
+               Field<TData, FieldState::Phys> &out1,
+               Field<TData, FieldState::Phys> &out2) override
+    {
+        // Initialize pointers.
+        auto const *inptr =
+            in.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+        auto *outptr0 =
+            out0.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+        auto *outptr1 =
+            out1.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+        auto *outptr2 =
+            out2.template GetStorage<MemoryRegionCUDA>().GetGPUPtr();
+        auto dfptr = m_derivFac;
+
+        // Initialize index.
+        size_t expIdx = 0;
+
+        // Initialize basiskey.
+        std::vector<LibUtilities::BasisKey> basisKeys(
+            3, LibUtilities::NullBasisKey);
+
+        for (size_t block_idx = 0; block_idx < in.GetBlocks().size();
+             ++block_idx)
+        {
+            // Determine shape and type of the element.
+            auto const expPtr = this->m_expansionList->GetExp(expIdx);
+            auto nElmts       = in.GetBlocks()[block_idx].num_elements;
+            auto nqTot        = expPtr->GetTotPoints();
+            auto nCoord       = expPtr->GetCoordim();
+            auto shape        = expPtr->DetShapeType();
+            auto deformed     = expPtr->GetMetricInfo()->GetGtype() ==
+                            SpatialDomains::eDeformed;
+
+            // Determine CUDA grid parameters.
+            m_gridSize = nElmts / m_blockSize;
+            m_gridSize += (nElmts % m_blockSize == 0) ? 0 : 1;
+
+            // Fetch basis key for the current element type.
+            for (size_t d = 0; d < expPtr->GetShapeDimension(); d++)
+            {
+                basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
+            }
+
+            // Function call to kernel functions.
+            if (expPtr->GetShapeDimension() == 1)
+            {
+                auto D0  = m_D[basisKeys][0];
+                auto nq0 = expPtr->GetNumPoints(0);
+                if (deformed)
+                {
+                    PhysDeriv1DKernel<TData, true>(
+                        m_gridSize, m_blockSize, nq0, nCoord, nElmts, D0,
+                        m_dfSize, dfptr, inptr, outptr0, outptr1, outptr2);
+                }
+                else
+                {
+                    PhysDeriv1DKernel<TData, false>(
+                        m_gridSize, m_blockSize, nq0, nCoord, nElmts, D0,
+                        m_dfSize, dfptr, inptr, outptr0, outptr1, outptr2);
+                }
+            }
+            else if (expPtr->GetShapeDimension() == 2)
+            {
+                auto D0  = m_D[basisKeys][0];
+                auto D1  = m_D[basisKeys][1];
+                auto Z0  = m_Z[basisKeys][0];
+                auto Z1  = m_Z[basisKeys][1];
+                auto nq0 = expPtr->GetNumPoints(0);
+                auto nq1 = expPtr->GetNumPoints(1);
+                if (deformed)
+                {
+                    PhysDeriv2DKernel<TData, true>(
+                        m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
+                        nElmts, D0, D1, Z0, Z1, m_dfSize, dfptr, inptr, outptr0,
+                        outptr1, outptr2);
+                }
+                else
+                {
+                    PhysDeriv2DKernel<TData, false>(
+                        m_gridSize, m_blockSize, shape, nq0, nq1, nCoord,
+                        nElmts, D0, D1, Z0, Z1, m_dfSize, dfptr, inptr, outptr0,
+                        outptr1, outptr2);
+                }
+            }
+            else if (expPtr->GetShapeDimension() == 3)
+            {
+                auto D0  = m_D[basisKeys][0];
+                auto D1  = m_D[basisKeys][1];
+                auto D2  = m_D[basisKeys][2];
+                auto Z0  = m_Z[basisKeys][0];
+                auto Z1  = m_Z[basisKeys][1];
+                auto Z2  = m_Z[basisKeys][2];
+                auto nq0 = expPtr->GetNumPoints(0);
+                auto nq1 = expPtr->GetNumPoints(1);
+                auto nq2 = expPtr->GetNumPoints(2);
+                if (deformed)
+                {
+                    PhysDeriv3DKernel<TData, true>(
+                        m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
+                        nElmts, D0, D1, D2, Z0, Z1, Z2, m_dfSize, dfptr, inptr,
+                        outptr0, outptr1, outptr2);
+                }
+                else
+                {
+                    PhysDeriv3DKernel<TData, false>(
+                        m_gridSize, m_blockSize, shape, nq0, nq1, nq2, nCoord,
+                        nElmts, D0, D1, D2, Z0, Z1, Z2, m_dfSize, dfptr, inptr,
+                        outptr0, outptr1, outptr2);
+                }
+            }
+
+            // Increment pointer and index for next element type.
+            dfptr += deformed ? nqTot * nElmts : nElmts;
+            outptr0 += nqTot * nElmts;
+            outptr1 += nqTot * nElmts;
+            outptr2 += nqTot * nElmts;
+            inptr += nqTot * nElmts;
+            expIdx += nElmts;
+        }
+    }
+
+    static std::unique_ptr<Operator<TData>> instantiate(
+        MultiRegions::ExpListSharedPtr expansionList)
+    {
+        return std::make_unique<OperatorPhysDerivImpl<TData, ImplCUDA>>(
+            expansionList);
+    }
+
+    static std::string className;
+
+private:
+    TData *m_derivFac;
+    std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_D;
+    std::map<std::vector<LibUtilities::BasisKey>, std::vector<TData *>> m_Z;
+    size_t m_dfSize;
+    size_t m_blockSize = 32;
+    size_t m_gridSize;
+};
+
+template <typename TData, bool DEFORMED>
+void PhysDeriv1DKernel(const size_t gridSize, const size_t blockSize,
+                       const size_t nq0, const size_t nCoord,
+                       const size_t nElmts, const TData *D0,
+                       const size_t dfSize, TData *df, const TData *in,
+                       TData *out0, TData *out1, TData *out2)
+{
+    // Compute tensorial derivative.
+    PhysDerivTensor1DKernel<TData>
+        <<<gridSize, blockSize>>>(nq0, nElmts, D0, in, out0);
+
+    // Compute physical derivative.
+    PhysDerivSegKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
+        nq0, nCoord, nElmts, dfSize, df, out0, out1, out2);
+}
+
+template <typename TData, bool DEFORMED>
+void PhysDeriv2DKernel(const size_t gridSize, const size_t blockSize,
+                       LibUtilities::ShapeType shapetype, const size_t nq0,
+                       const size_t nq1, const size_t nCoord,
+                       const size_t nElmts, const TData *D0, const TData *D1,
+                       const TData *Z0, const TData *Z1, const size_t dfSize,
+                       TData *df, const TData *in, TData *out0, TData *out1,
+                       TData *out2)
+{
+    // Compute tensorial derivative.
+    PhysDerivTensor2DKernel<TData>
+        <<<gridSize, blockSize>>>(nq0, nq1, nElmts, D0, D1, in, out0, out1);
+
+    // Compute physical derivative.
+    if (shapetype == LibUtilities::Quad)
+    {
+        PhysDerivQuadKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
+            nq0, nq1, nCoord, nElmts, dfSize, df, out0, out1, out2);
+    }
+    else if (shapetype == LibUtilities::Tri)
+    {
+        PhysDerivTriKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
+            nq0, nq1, nCoord, nElmts, Z0, Z1, dfSize, df, out0, out1, out2);
+    }
+}
+
+template <typename TData, bool DEFORMED>
+void PhysDeriv3DKernel(const size_t gridSize, const size_t blockSize,
+                       LibUtilities::ShapeType shapetype, const size_t nq0,
+                       const size_t nq1, const size_t nq2, const size_t nCoord,
+                       const size_t nElmts, const TData *D0, const TData *D1,
+                       const TData *D2, const TData *Z0, const TData *Z1,
+                       const TData *Z2, const size_t dfSize, TData *df,
+                       const TData *in, TData *out0, TData *out1, TData *out2)
+{
+    // Compute tensorial derivative.
+    PhysDerivTensor3DKernel<TData><<<gridSize, blockSize>>>(
+        nq0, nq1, nq2, nElmts, D0, D1, D2, in, out0, out1, out2);
+
+    // Compute physical derivative.
+    if (shapetype == LibUtilities::Hex)
+    {
+        PhysDerivHexKernel<TData, DEFORMED><<<gridSize, blockSize>>>(
+            nq0, nq1, nq2, nCoord, nElmts, dfSize, df, out0, out1, out2);
+    }
+    else if (shapetype == LibUtilities::Tet)
+    {
+        PhysDerivTetKernel<TData, DEFORMED>
+            <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2,
+                                      dfSize, df, out0, out1, out2);
+    }
+    else if (shapetype == LibUtilities::Prism)
+    {
+        PhysDerivPrismKernel<TData, DEFORMED>
+            <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2,
+                                      dfSize, df, out0, out1, out2);
+    }
+    else if (shapetype == LibUtilities::Pyr)
+    {
+        PhysDerivPyrKernel<TData, DEFORMED>
+            <<<gridSize, blockSize>>>(nq0, nq1, nq2, nCoord, nElmts, Z0, Z1, Z2,
+                                      dfSize, df, out0, out1, out2);
+    }
+}
+
+} // namespace Nektar::Operators::detail
--- a/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh
+++ b/Operators/PhysDeriv/PhysDerivCUDAKernels.cuh
+namespace Nektar::Operators::detail
+{
+template <typename TData, bool DEFORMED>
+__global__ void PhysDerivSegKernel(const size_t nq0, const size_t ncoord,
+                                   const size_t nelmt, const size_t dfSize,
+                                   TData *df, TData *inout0, TData *inout1,
+                                   TData *inout2)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    size_t ndf = ncoord;
+
+    // Assign pointers.
+    TData **inoutptr = new TData *[ncoord];
+    inoutptr[0]      = inout0 + (nq0 * e);
+    if (ncoord > 1)
+    {
+        inoutptr[1] = inout1 + (nq0 * e);
+    }
+    if (ncoord > 2)
+    {
+        inoutptr[2] = inout2 + (nq0 * e);
+    }
+
+    TData **dfptr = new TData *[ndf];
+    for (size_t d = 0; d < ndf; d++)
+    {
+        dfptr[d] = df + d * dfSize;
+        dfptr[d] += DEFORMED ? nq0 * e : e;
+    }
+
+    // Compute derivative.
+    for (size_t j = 0; j < nq0; ++j)
+    {
+        size_t dfindex = DEFORMED ? j : 0;
+        for (size_t d = ndf; d > 0; d--)
+        {
+            inoutptr[d - 1][j] = inoutptr[0][j] * dfptr[d - 1][dfindex];
+        }
+    }
+}
+
+template <typename TData, bool DEFORMED>
+__global__ void PhysDerivQuadKernel(const size_t nq0, const size_t nq1,
+                                    const size_t ncoord, const size_t nelmt,
+                                    const size_t dfSize, TData *df,
+                                    TData *inout0, TData *inout1, TData *inout2)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    auto ndf = 2 * ncoord;
+
+    // Assign pointers.
+    TData **inoutptr = new TData *[ncoord];
+    inoutptr[0]      = inout0 + (nq0 * nq1 * e);
+    inoutptr[1]      = inout1 + (nq0 * nq1 * e);
+    if (ncoord > 2)
+    {
+        inoutptr[2] = inout2 + (nq0 * nq1 * e);
+    }
+
+    TData **dfptr = new TData *[ndf];
+    for (size_t d = 0; d < ndf; d++)
+    {
+        dfptr[d] = df + d * dfSize;
+        dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
+    }
+
+    // Compute derivative.
+    for (size_t j = 0, cnt_ji = 0; j < nq1; ++j)
+    {
+        for (size_t i = 0; i < nq0; ++i, ++cnt_ji)
+        {
+            TData d0 = inoutptr[0][cnt_ji];
+            TData d1 = inoutptr[1][cnt_ji];
+
+            size_t dfindex = DEFORMED ? cnt_ji : 0;
+            for (size_t d = 0; d < ncoord; d++)
+            {
+                inoutptr[d][cnt_ji] =
+                    d0 * dfptr[2 * d][dfindex] + d1 * dfptr[2 * d + 1][dfindex];
+            }
+        }
+    }
+}
+
+template <typename TData, bool DEFORMED>
+__global__ void PhysDerivTriKernel(const size_t nq0, const size_t nq1,
+                                   const size_t ncoord, const size_t nelmt,
+                                   const TData *Z0, const TData *Z1,
+                                   const size_t dfSize, TData *df,
+                                   TData *inout0, TData *inout1, TData *inout2)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    auto ndf = 2 * ncoord;
+
+    // Assign pointers.
+    TData **inoutptr = new TData *[ncoord];
+    inoutptr[0]      = inout0 + (nq0 * nq1 * e);
+    inoutptr[1]      = inout1 + (nq0 * nq1 * e);
+    if (ncoord > 2)
+    {
+        inoutptr[2] = inout2 + (nq0 * nq1 * e);
+    }
+
+    TData **dfptr = new TData *[ndf];
+    for (size_t d = 0; d < ndf; d++)
+    {
+        dfptr[d] = df + d * dfSize;
+        dfptr[d] += DEFORMED ? nq0 * nq1 * e : e;
+    }
+
+    // Compute derivative.
+    for (int j = 0, cnt_ji = 0; j < nq1; ++j)
+    {
+        TData xfrm0 = 2.0 / (1.0 - Z1[j]);
+        for (int i = 0; i < nq0; ++i, ++cnt_ji)
+        {
+            TData d0 = inoutptr[0][cnt_ji];
+            TData d1 = inoutptr[1][cnt_ji];
+
+            // Moving from standard to collapsed coordinates.
+            TData xfrm1 = 0.5 * (1.0 + Z0[i]);
+            d0 *= xfrm0;
+            d1 += d0 * xfrm1;
+
+            // Multiply by derivative factors.
+            size_t dfindex = DEFORMED ? cnt_ji : 0;
+            for (size_t d = 0; d < ncoord; d++)
+            {
+                inoutptr[d][cnt_ji] =
+                    d0 * dfptr[2 * d][dfindex] + d1 * dfptr[2 * d + 1][dfindex];
+            }
+        }
+    }
+}
+
+template <typename TData, bool DEFORMED>
+__global__ void PhysDerivHexKernel(const size_t nq0, const size_t nq1,
+                                   const size_t nq2, const size_t ncoord,
+                                   const size_t nelmt, const size_t dfSize,
+                                   TData *df, TData *inout0, TData *inout1,
+                                   TData *inout2)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    size_t ndf = 3 * ncoord;
+
+    // Assign pointers.
+    TData **inoutptr = new TData *[ncoord];
+    inoutptr[0]      = inout0 + (nq0 * nq1 * nq2 * e);
+    inoutptr[1]      = inout1 + (nq0 * nq1 * nq2 * e);
+    inoutptr[2]      = inout2 + (nq0 * nq1 * nq2 * e);
+
+    TData **dfptr = new TData *[ndf];
+    for (size_t d = 0; d < ndf; d++)
+    {
+        dfptr[d] = df + d * dfSize;
+        dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
+    }
+
+    // Compute derivative.
+    for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k)
+    {
+        for (size_t j = 0; j < nq1; ++j)
+        {
+            for (size_t i = 0; i < nq0; ++i, ++cnt_ijk)
+            {
+                TData tmp[] = {inoutptr[0][cnt_ijk], inoutptr[1][cnt_ijk],
+                               inoutptr[2][cnt_ijk]};
+
+                // Multiply by derivative factors.
+                size_t dfindex = DEFORMED ? cnt_ijk : 0;
+                for (size_t d = 0; d < ncoord; ++d)
+                {
+                    TData sum = 0.0;
+                    for (size_t n = 0; n < ncoord; ++n)
+                    {
+                        sum += tmp[n] * dfptr[d * ncoord + n][dfindex];
+                    }
+                    inoutptr[d][cnt_ijk] = sum;
+                }
+            }
+        }
+    }
+}
+
+template <typename TData, bool DEFORMED>
+__global__ void PhysDerivTetKernel(const size_t nq0, const size_t nq1,
+                                   const size_t nq2, const size_t ncoord,
+                                   const size_t nelmt, const TData *Z0,
+                                   const TData *Z1, const TData *Z2,
+                                   const size_t dfSize, TData *df,
+                                   TData *inout0, TData *inout1, TData *inout2)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    size_t ndf = 3 * ncoord;
+
+    // Allocate workspace memory.
+    TData *wsp0 = new TData[nq0 * nq1 * nq2];
+    TData *wsp1 = new TData[nq0 * nq1 * nq2];
+
+    // Assign pointers.
+    TData **inoutptr = new TData *[ncoord];
+    inoutptr[0]      = inout0 + (nq0 * nq1 * nq2 * e);
+    inoutptr[1]      = inout1 + (nq0 * nq1 * nq2 * e);
+    inoutptr[2]      = inout2 + (nq0 * nq1 * nq2 * e);
+
+    TData **dfptr = new TData *[ndf];
+    for (size_t d = 0; d < ndf; d++)
+    {
+        dfptr[d] = df + d * dfSize;
+        dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
+    }
+
+    // Moving from standard to collapsed coordinates.
+    for (int k = 0, eta0 = 0; k < nq2; ++k)
+    {
+        TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
+        for (int j = 0; j < nq1; ++j)
+        {
+            TData xfrm_eta1 = 2.0 / (1.0 - Z1[j]);
+            TData xfrm      = xfrm_eta1 * xfrm_eta2;
+            for (int i = 0; i < nq0; ++i, ++eta0)
+            {
+                inoutptr[0][eta0] *= xfrm;
+                wsp0[eta0] = inoutptr[0][eta0];
+            }
+        }
+    }
+
+    for (int k = 0, eta0 = 0; k < nq2; ++k)
+    {
+        TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
+        for (int j = 0; j < nq1; ++j)
+        {
+            for (int i = 0; i < nq0; ++i, ++eta0)
+            {
+                TData xfrm_eta0   = 0.5 * (1.0 + Z0[i]);
+                wsp0[eta0]        = xfrm_eta0 * wsp0[eta0];
+                wsp1[eta0]        = xfrm_eta2 * inoutptr[1][eta0];
+                inoutptr[1][eta0] = wsp0[eta0] + wsp1[eta0];
+            }
+        }
+    }
+
+    for (int k = 0, eta0 = 0; k < nq2; ++k)
+    {
+        for (int j = 0; j < nq1; ++j)
+        {
+            TData xfrm_eta1 = 0.5 * (1.0 + Z1[j]);
+            for (int i = 0; i < nq0; ++i, ++eta0)
+            {
+                inoutptr[2][eta0] += wsp0[eta0] + wsp1[eta0] * xfrm_eta1;
+            }
+        }
+    }
+
+    // Compute derivative.
+    for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k)
+    {
+        for (size_t j = 0; j < nq1; ++j)
+        {
+            for (size_t i = 0; i < nq0; ++i, ++cnt_ijk)
+            {
+                TData tmp[] = {inoutptr[0][cnt_ijk], inoutptr[1][cnt_ijk],
+                               inoutptr[2][cnt_ijk]};
+
+                // Multiply by derivative factors.
+                size_t dfindex = DEFORMED ? cnt_ijk : 0;
+                for (size_t d = 0; d < ncoord; ++d)
+                {
+                    TData sum = 0.0;
+                    for (size_t n = 0; n < ncoord; ++n)
+                    {
+                        sum += tmp[n] * dfptr[d * ncoord + n][dfindex];
+                    }
+                    inoutptr[d][cnt_ijk] = sum;
+                }
+            }
+        }
+    }
+}
+
+template <typename TData, bool DEFORMED>
+__global__ void PhysDerivPrismKernel(
+    const size_t nq0, const size_t nq1, const size_t nq2, const size_t ncoord,
+    const size_t nelmt, const TData *Z0, const TData *Z1, const TData *Z2,
+    const size_t dfSize, TData *df, TData *inout0, TData *inout1, TData *inout2)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    size_t ndf = 3 * ncoord;
+
+    // Assign pointers.
+    TData **inoutptr = new TData *[ncoord];
+    inoutptr[0]      = inout0 + (nq0 * nq1 * nq2 * e);
+    inoutptr[1]      = inout1 + (nq0 * nq1 * nq2 * e);
+    inoutptr[2]      = inout2 + (nq0 * nq1 * nq2 * e);
+
+    TData **dfptr = new TData *[ndf];
+    for (size_t d = 0; d < ndf; d++)
+    {
+        dfptr[d] = df + d * dfSize;
+        dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
+    }
+
+    // Compute derivative.
+    for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k)
+    {
+        TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
+        for (size_t j = 0; j < nq1; ++j)
+        {
+            for (size_t i = 0; i < nq0; ++i, ++cnt_ijk)
+            {
+                TData d0 = inoutptr[0][cnt_ijk];
+                TData d1 = inoutptr[1][cnt_ijk];
+                TData d2 = inoutptr[2][cnt_ijk];
+
+                // Chain-rule for eta_0 and eta_2.
+                TData xfrm_eta0 = 0.5 * (1.0 + Z0[i]);
+                d0 *= xfrm_eta2;
+                d2 += xfrm_eta0 * d0;
+
+                // Multiply by derivative factors.
+                size_t dfindex       = DEFORMED ? cnt_ijk : 0;
+                inoutptr[0][cnt_ijk] = d0 * dfptr[0][dfindex] +
+                                       d1 * dfptr[1][dfindex] +
+                                       d2 * dfptr[2][dfindex];
+                inoutptr[1][cnt_ijk] = d0 * dfptr[3][dfindex] +
+                                       d1 * dfptr[4][dfindex] +
+                                       d2 * dfptr[5][dfindex];
+                inoutptr[2][cnt_ijk] = d0 * dfptr[6][dfindex] +
+                                       d1 * dfptr[7][dfindex] +
+                                       d2 * dfptr[8][dfindex];
+            }
+        }
+    }
+}
+
+template <typename TData, bool DEFORMED>
+__global__ void PhysDerivPyrKernel(const size_t nq0, const size_t nq1,
+                                   const size_t nq2, const size_t ncoord,
+                                   const size_t nelmt, const TData *Z0,
+                                   const TData *Z1, const TData *Z2,
+                                   const size_t dfSize, TData *df,
+                                   TData *inout0, TData *inout1, TData *inout2)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    size_t ndf = 3 * ncoord;
+
+    // Assign pointers.
+    TData **inoutptr = new TData *[ncoord];
+    inoutptr[0]      = inout0 + (nq0 * nq1 * nq2 * e);
+    inoutptr[1]      = inout1 + (nq0 * nq1 * nq2 * e);
+    inoutptr[2]      = inout2 + (nq0 * nq1 * nq2 * e);
+
+    TData **dfptr = new TData *[ndf];
+    for (size_t d = 0; d < ndf; d++)
+    {
+        dfptr[d] = df + d * dfSize;
+        dfptr[d] += DEFORMED ? nq0 * nq1 * nq2 * e : e;
+    }
+
+    // Compute derivative.
+    for (size_t k = 0, cnt_ijk = 0; k < nq2; ++k)
+    {
+        TData xfrm_eta2 = 2.0 / (1.0 - Z2[k]);
+        for (size_t j = 0; j < nq1; ++j)
+        {
+            TData xfrm_eta1 = 0.5 * (1.0 + Z1[j]);
+            for (size_t i = 0; i < nq0; ++i, ++cnt_ijk)
+            {
+                TData d0 = inoutptr[0][cnt_ijk];
+                TData d1 = inoutptr[1][cnt_ijk];
+                TData d2 = inoutptr[2][cnt_ijk];
+
+                // Chain-rule for eta_0 and eta_2.
+                TData xfrm_eta0 = 0.5 * (1.0 + Z0[i]);
+                d0 *= xfrm_eta2;
+                d1 *= xfrm_eta2;
+                d2 += xfrm_eta0 * d0 + xfrm_eta1 * d1;
+
+                // Multiply by derivative factors.
+                size_t dfindex       = DEFORMED ? cnt_ijk : 0;
+                inoutptr[0][cnt_ijk] = d0 * dfptr[0][dfindex] +
+                                       d1 * dfptr[1][dfindex] +
+                                       d2 * dfptr[2][dfindex];
+                inoutptr[1][cnt_ijk] = d0 * dfptr[3][dfindex] +
+                                       d1 * dfptr[4][dfindex] +
+                                       d2 * dfptr[5][dfindex];
+                inoutptr[2][cnt_ijk] = d0 * dfptr[6][dfindex] +
+                                       d1 * dfptr[7][dfindex] +
+                                       d2 * dfptr[8][dfindex];
+            }
+        }
+    }
+}
+
+template <typename TData>
+__global__ void PhysDerivTensor1DKernel(const size_t nq0, const size_t nelmt,
+                                        const TData *D0, const TData *in,
+                                        TData *out0)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    // Assign pointers.
+    const TData *inptr = in + (nq0 * e);
+    TData *outptr0     = out0 + (nq0 * e);
+
+    // Direction 1
+    for (size_t i = 0; i < nq0; ++i)
+    {
+        TData sum = 0.0;
+        for (size_t k = 0; k < nq0; ++k)
+        {
+            sum += D0[k * nq0 + i] * inptr[k];
+        }
+        outptr0[i] = sum;
+    }
+}
+
+template <typename TData>
+__global__ void PhysDerivTensor2DKernel(const size_t nq0, const size_t nq1,
+                                        const size_t nelmt, const TData *D0,
+                                        const TData *D1, const TData *in,
+                                        TData *out0, TData *out1)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    // Assign pointers.
+    const TData *inptr = in + (nq0 * nq1 * e);
+    TData *outptr0     = out0 + (nq0 * nq1 * e);
+    TData *outptr1     = out1 + (nq0 * nq1 * e);
+
+    // Direction 1
+    for (size_t i = 0; i < nq0; ++i)
+    {
+        for (size_t j = 0; j < nq1; ++j)
+        {
+            TData sum = 0.0;
+            for (size_t k = 0; k < nq0; ++k)
+            {
+                sum += D0[k * nq0 + i] * inptr[j * nq0 + k];
+            }
+            outptr0[j * nq0 + i] = sum;
+        }
+    }
+
+    // Direction 2
+    for (size_t i = 0; i < nq0; ++i)
+    {
+        for (size_t j = 0; j < nq1; ++j)
+        {
+            TData sum = 0.0;
+            for (size_t k = 0; k < nq1; ++k)
+            {
+                sum += D1[k * nq1 + j] * inptr[k * nq0 + i];
+            }
+            outptr1[j * nq0 + i] = sum;
+        }
+    }
+}
+
+template <typename TData>
+__global__ void PhysDerivTensor3DKernel(const size_t nq0, const size_t nq1,
+                                        const size_t nq2, const size_t nelmt,
+                                        const TData *D0, const TData *D1,
+                                        const TData *D2, const TData *in,
+                                        TData *out0, TData *out1, TData *out2)
+{
+    size_t e = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (e >= nelmt)
+    {
+        return;
+    }
+
+    // Assign pointers.
+    const TData *inptr = in + (nq0 * nq1 * nq2 * e);
+    TData *outptr0     = out0 + (nq0 * nq1 * nq2 * e);
+    TData *outptr1     = out1 + (nq0 * nq1 * nq2 * e);
+    TData *outptr2     = out2 + (nq0 * nq1 * nq2 * e);
+
+    // Direction 1
+    for (size_t i = 0; i < nq0; ++i)
+    {
+        for (size_t j = 0; j < nq1 * nq2; ++j)
+        {
+            TData sum = 0.0;
+            for (size_t k = 0; k < nq0; ++k)
+            {
+                sum += D0[k * nq0 + i] * inptr[j * nq0 + k];
+            }
+            outptr0[j * nq0 + i] = sum;
+        }
+    }
+
+    // Direction 2
+    for (size_t block = 0; block < nq2; ++block)
+    {
+        size_t start = block * nq0 * nq1;
+        for (size_t i = 0; i < nq0; ++i)
+        {
+            for (size_t j = 0; j < nq1; ++j)
+            {
+                TData sum = 0.0;
+                for (size_t k = 0; k < nq1; ++k)
+                {
+                    sum += D1[k * nq1 + j] * inptr[start + k * nq0 + i];
+                }
+                outptr1[start + j * nq0 + i] = sum;
+            }
+        }
+    }
+
+    // Direction 3
+    for (size_t i = 0; i < nq0 * nq1; ++i)
+    {
+        for (size_t j = 0; j < nq2; ++j)
+        {
+            TData sum = 0.0;
+            for (size_t k = 0; k < nq2; ++k)
+            {
+                sum += D2[k * nq2 + j] * inptr[k * nq0 * nq1 + i];
+            }
+            outptr2[j * nq0 * nq1 + i] = sum;
+        }
+    }
+}
+} // namespace Nektar::Operators::detail
--- a/Operators/PhysDeriv/PhysDerivImpl.cpp
+++ b/Operators/PhysDeriv/PhysDerivImpl.cpp
+#include "PhysDerivStdMat.hpp"
+
+namespace Nektar::Operators::detail
+{
+
+// Add different PhysDeriv implementations to the factory.
+template <>
+std::string OperatorPhysDerivImpl<double, ImplStdMat>::className =
+    GetOperatorFactory<double>().RegisterCreatorFunction(
+        "PhysDerivStdMat",
+        OperatorPhysDerivImpl<double, ImplStdMat>::instantiate, "...");
+
+} // namespace Nektar::Operators::detail
--- a/Operators/PhysDeriv/PhysDerivStdMat.hpp
+++ b/Operators/PhysDeriv/PhysDerivStdMat.hpp
+#include "Operators/OperatorPhysDeriv.hpp"
+#include <StdRegions/StdExpansion.h>
+
+namespace Nektar::Operators::detail
+{
+
+// standard matrix implementation
+template <typename TData>
+class OperatorPhysDerivImpl<TData, ImplStdMat> : public OperatorPhysDeriv<TData>
+{
+public:
+    OperatorPhysDerivImpl(const MultiRegions::ExpListSharedPtr &expansionList)
+        : OperatorPhysDeriv<TData>(expansionList)
+    {
+        size_t nTotElmts = this->m_expansionList->GetNumElmts();
+        size_t nDim      = this->m_expansionList->GetShapeDimension();
+
+        // Initialise derivative factor.
+        size_t dfSize = Operator<TData>::GetGeometricFactorSize();
+        m_derivFac    = Operator<TData>::SetDerivativeFactor(dfSize);
+
+        // Initialize basiskey.
+        std::vector<LibUtilities::BasisKey> basisKeys(
+            3, LibUtilities::NullBasisKey);
+
+        // Loop over the elements of expansionList.
+        for (size_t e = 0; e < nTotElmts; ++e)
+        {
+            auto const expPtr = this->m_expansionList->GetExp(e);
+
+            // Fetch basiskeys of current element.
+            for (size_t d = 0; d < nDim; d++)
+            {
+                basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
+            }
+
+            // Copy data to m_matPtr, if necessary.
+            if (m_matPtr.find(basisKeys) == m_matPtr.end())
+            {
+                size_t nqTot = expPtr->GetTotPoints();
+                auto &matPtr = m_matPtr[basisKeys];
+                matPtr       = Array<OneD, Array<OneD, TData>>(nDim);
+                Array<OneD, NekDouble> tmp(nqTot), t;
+                for (size_t d = 0; d < nDim; ++d)
+                {
+                    // Get deriv matrix.
+                    matPtr[d] = Array<OneD, TData>(nqTot * nqTot);
+                    for (int i = 0; i < nqTot; ++i)
+                    {
+                        Vmath::Zero(nqTot, tmp, 1);
+                        tmp[i] = 1.0;
+                        expPtr->GetStdExp()->PhysDeriv(
+                            d, tmp, t = matPtr[d] + i * nqTot);
+                    }
+                }
+            }
+        }
+    }
+
+    void apply(Field<TData, FieldState::Phys> &in,
+               Field<TData, FieldState::Phys> &out0,
+               Field<TData, FieldState::Phys> &out1,
+               Field<TData, FieldState::Phys> &out2) override
+    {
+        // Initialize pointers.
+        auto const *inptr = in.GetStorage().GetCPUPtr();
+        std::vector<TData *> outptr{out0.GetStorage().GetCPUPtr(),
+                                    out1.GetStorage().GetCPUPtr(),
+                                    out2.GetStorage().GetCPUPtr()};
+
+        // Initialize index.
+        size_t expIdx  = 0;
+        size_t dfindex = 0;
+
+        // Initialize basiskey.
+        std::vector<LibUtilities::BasisKey> basisKeys(
+            3, LibUtilities::NullBasisKey);
+
+        for (size_t block_idx = 0; block_idx < in.GetBlocks().size();
+             ++block_idx)
+        {
+            // Determine shape and type of the element.
+            auto const expPtr = this->m_expansionList->GetExp(expIdx);
+            auto nElmts       = in.GetBlocks()[block_idx].num_elements;
+            auto nDim         = expPtr->GetShapeDimension();
+            auto nCoord       = expPtr->GetCoordim();
+            auto nqTot        = expPtr->GetTotPoints();
+            auto ptsKeys      = expPtr->GetPointsKeys();
+            auto deformed     = expPtr->GetMetricInfo()->GetGtype() ==
+                            SpatialDomains::eDeformed;
+            Array<OneD, Array<OneD, TData>> deriv(nDim);
+
+            // Fetch basis key for the current element type.
+            for (size_t d = 0; d < expPtr->GetShapeDimension(); d++)
+            {
+                basisKeys[d] = expPtr->GetBasis(d)->GetBasisKey();
+            }
+
+            // Get derivative matrix.
+            auto &matPtr = m_matPtr[basisKeys];
+            for (size_t d = 0; d < nDim; ++d)
+            {
+                // Perform matrix-matrix multiply.
+                deriv[d] = Array<OneD, TData>(nqTot * nElmts);
+                Blas::Dgemm('N', 'N', nqTot, nElmts, nqTot, 1.0,
+                            matPtr[d].get(), nqTot, inptr, nqTot, 0.0,
+                            deriv[d].get(), nqTot);
+            }
+
+            if (deformed)
+            {
+                for (size_t i = 0; i < nCoord; i++)
+                {
+                    Vmath::Vmul(nqTot * nElmts,
+                                m_derivFac[i * nDim].get() + dfindex, 1,
+                                deriv[0].get(), 1, outptr[i], 1);
+                    for (size_t d = 1; d < nDim; d++)
+                    {
+                        Vmath::Vvtvp(nqTot * nElmts,
+                                     m_derivFac[i * nDim + d].get() + dfindex,
+                                     1, deriv[d].get(), 1, outptr[i], 1,
+                                     outptr[i], 1);
+                    }
+                    outptr[i] += nqTot * nElmts;
+                }
+                dfindex += nqTot * nElmts;
+            }
+            else
+            {
+                for (size_t i = 0; i < nCoord; i++)
+                {
+                    for (size_t e = 0; e < nElmts; ++e)
+                    {
+                        Vmath::Smul(nqTot, m_derivFac[i * nDim][dfindex + e],
+                                    deriv[0].get() + e * nqTot, 1, outptr[i],
+                                    1);
+                        for (size_t d = 1; d < nDim; d++)
+                        {
+                            Vmath::Svtvp(nqTot,
+                                         m_derivFac[i * nDim + d][dfindex + e],
+                                         deriv[d].get() + e * nqTot, 1,
+                                         outptr[i], 1, outptr[i], 1);
+                        }
+                        outptr[i] += nqTot;
+                    }
+                }
+                dfindex += nElmts;
+            }
+            inptr += nqTot * nElmts;
+            expIdx += nElmts;
+        }
+    }
+
+    static std::unique_ptr<Operator<TData>> instantiate(
+        const MultiRegions::ExpListSharedPtr &expansionList)
+    {
+        return std::make_unique<OperatorPhysDerivImpl<TData, ImplStdMat>>(
+            expansionList);
+    }
+
+    static std::string className;
+
+private:
+    Array<OneD, Array<OneD, TData>> m_derivFac;
+    std::map<std::vector<LibUtilities::BasisKey>,
+             Array<OneD, Array<OneD, TData>>>
+        m_matPtr;
+};
+
+} // namespace Nektar::Operators::detail
--- a/main.cpp
+++ b/main.cpp
@@ -15,6 +15,7 @@
 #include "Field.hpp"
 #include "Operators/OperatorBwdTrans.hpp"
 #include "Operators/OperatorIProductWRTBase.hpp"
+#include "Operators/OperatorPhysDeriv.hpp"

 #include <LibUtilities/BasicUtils/SessionReader.h>
 //#include <LibUtilities/SimdLib/tinysimd.hpp>
@@ -188,7 +189,7 @@ int main(int argc, char *argv[])
        // Check output values.
        std::cout << "Out:" << std::endl;
        auto outptr = outPhys.GetStorage().GetCPUPtr();
-        for (auto const &block : out.GetBlocks())
+        for (auto const &block : outPhys.GetBlocks())
        {
            for (size_t el = 0; el < block.num_elements; ++el)
            {
@@ -353,6 +354,180 @@ int main(int argc, char *argv[])
        std::cout << std::endl;
    }
 #endif
+    // Test PhysDeriv Implementation
+    {
+        std::cout << "PhysDeriv (StdMat) test starts." << std::endl;
+
+        // Create two Field objects with a MemoryRegionCPU backend by default
+        // for the inner product with respect to base
+        auto inPhys   = Field<double, FieldState::Phys>::create(blocks_phys);
+        auto outPhys0 = Field<double, FieldState::Phys>::create(blocks_phys);
+        auto outPhys1 = Field<double, FieldState::Phys>::create(blocks_phys);
+        auto outPhys2 = Field<double, FieldState::Phys>::create(blocks_phys);
+
+        // Assign input values
+        auto *inptr = inPhys.GetStorage().GetCPUPtr();
+        for (auto const &block : inPhys.GetBlocks())
+        {
+            for (size_t el = 0; el < block.num_elements; ++el)
+            {
+                for (size_t phys = 0; phys < block.num_pts; ++phys)
+                {
+                    // Each element is the index of the quadrature points
+                    // this is useful for testing reshapes
+                    *(inptr++) = phys;
+                }
+            }
+        }
+
+        std::cout << "Initial shape:\n" << inPhys << std::endl;
+
+        // PhysDeriv
+        PhysDeriv<>::create(explist, "StdMat")
+            ->apply(inPhys, outPhys0, outPhys1, outPhys2);
+
+        // Check output values.
+        std::cout << "Out0:" << std::endl;
+        auto *outptr0 = outPhys0.GetStorage().GetCPUPtr();
+        for (auto const &block : outPhys0.GetBlocks())
+        {
+            for (size_t el = 0; el < block.num_elements; ++el)
+            {
+                for (size_t phys = 0; phys < block.num_pts; ++phys)
+                {
+                    std::cout << *(outptr0++) << ' ';
+                }
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+
+        // Check output values.
+        std::cout << "Out1:" << std::endl;
+        auto *outptr1 = outPhys1.GetStorage().GetCPUPtr();
+        for (auto const &block : outPhys1.GetBlocks())
+        {
+            for (size_t el = 0; el < block.num_elements; ++el)
+            {
+                for (size_t phys = 0; phys < block.num_pts; ++phys)
+                {
+                    std::cout << *(outptr1++) << ' ';
+                }
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+
+        // Check output values.
+        std::cout << "Out2:" << std::endl;
+        auto *outptr2 = outPhys2.GetStorage().GetCPUPtr();
+        for (auto const &block : outPhys2.GetBlocks())
+        {
+            for (size_t el = 0; el < block.num_elements; ++el)
+            {
+                for (size_t phys = 0; phys < block.num_pts; ++phys)
+                {
+                    std::cout << *(outptr2++) << ' ';
+                }
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+    }
+    // Test PhysDeriv (CUDA) Implementation
+#ifdef NEKTAR_USE_CUDA
+    {
+        std::cout << "PhysDeriv (CUDA) test starts." << std::endl;
+
+        // Create two Field objects with a MemoryRegionCPU backend by default
+        // for the inner product with respect to base
+        auto inPhys = Field<double, FieldState::Phys>::create<MemoryRegionCUDA>(
+            blocks_phys);
+        auto outPhys0 =
+            Field<double, FieldState::Phys>::create<MemoryRegionCUDA>(
+                blocks_phys);
+        auto outPhys1 =
+            Field<double, FieldState::Phys>::create<MemoryRegionCUDA>(
+                blocks_phys);
+        auto outPhys2 =
+            Field<double, FieldState::Phys>::create<MemoryRegionCUDA>(
+                blocks_phys);
+
+        // Assign input values
+        std::cout << "Initial shape: " << std::endl;
+        auto *inptr =
+            inPhys.template GetStorage<MemoryRegionCUDA>().GetCPUPtr();
+        for (auto const &block : inPhys.GetBlocks())
+        {
+            for (size_t el = 0; el < block.num_elements; ++el)
+            {
+                for (size_t phys = 0; phys < block.num_pts; ++phys)
+                {
+                    // Each element is the index of the quadrature points
+                    // this is useful for testing reshapes
+                    *(inptr++) = phys;
+                    std::cout << phys << " ";
+                }
+            }
+        }
+        std::cout << std::endl << std::endl;
+
+        // PhysDeriv
+        PhysDeriv<>::create(explist, "CUDA")
+            ->apply(inPhys, outPhys0, outPhys1, outPhys2);
+
+        // Check output values.
+        std::cout << "Out0:" << std::endl;
+        auto *outptr0 =
+            outPhys0.template GetStorage<MemoryRegionCUDA>().GetCPUPtr();
+        for (auto const &block : outPhys0.GetBlocks())
+        {
+            for (size_t el = 0; el < block.num_elements; ++el)
+            {
+                for (size_t phys = 0; phys < block.num_pts; ++phys)
+                {
+                    std::cout << *(outptr0++) << ' ';
+                }
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+
+        // Check output values.
+        std::cout << "Out1:" << std::endl;
+        auto *outptr1 =
+            outPhys1.template GetStorage<MemoryRegionCUDA>().GetCPUPtr();
+        for (auto const &block : outPhys1.GetBlocks())
+        {
+            for (size_t el = 0; el < block.num_elements; ++el)
+            {
+                for (size_t phys = 0; phys < block.num_pts; ++phys)
+                {
+                    std::cout << *(outptr1++) << ' ';
+                }
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+
+        // Check output values.
+        std::cout << "Out2:" << std::endl;
+        auto *outptr2 =
+            outPhys2.template GetStorage<MemoryRegionCUDA>().GetCPUPtr();
+        for (auto const &block : outPhys2.GetBlocks())
+        {
+            for (size_t el = 0; el < block.num_elements; ++el)
+            {
+                for (size_t phys = 0; phys < block.num_pts; ++phys)
+                {
+                    std::cout << *(outptr2++) << ' ';
+                }
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+    }
+#endif

    std::cout << "END" << std::endl;
    return 0;

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -44,6 +44,26 @@ IF (NEKTAR_USE_CUDA)
        -DTEST_PATH="${CMAKE_SOURCE_DIR}")
 ENDIF()

+add_executable(test_physderiv test_physderiv.cpp)
+target_link_libraries(test_physderiv PRIVATE Operators)
+target_link_libraries(test_physderiv PRIVATE ${NEKTAR++_LIBRARIES})
+target_link_libraries(test_physderiv PRIVATE Boost::unit_test_framework)
+target_include_directories(test_physderiv PRIVATE "${CMAKE_SOURCE_DIR}")
+target_include_directories(test_physderiv PRIVATE ${NEKTAR++_INCLUDE_DIRS})
+target_compile_definitions(test_physderiv PRIVATE
+    -DTEST_PATH="${CMAKE_SOURCE_DIR}")
+
+IF (NEKTAR_USE_CUDA)
+    add_executable(test_physderivcuda test_physderivcuda.cpp)
+    target_link_libraries(test_physderivcuda PRIVATE Operators)
+    target_link_libraries(test_physderivcuda PRIVATE ${NEKTAR++_LIBRARIES})
+    target_link_libraries(test_physderivcuda PRIVATE Boost::unit_test_framework)
+    target_include_directories(test_physderivcuda PRIVATE "${CMAKE_SOURCE_DIR}")
+    target_include_directories(test_physderivcuda PRIVATE ${NEKTAR++_INCLUDE_DIRS})
+    target_compile_definitions(test_physderivcuda PRIVATE
+        -DTEST_PATH="${CMAKE_SOURCE_DIR}")
+ENDIF()
+
 add_test(
  NAME BwdTrans
  COMMAND test_bwdtrans
@@ -71,3 +91,17 @@ IF (NEKTAR_USE_CUDA)
      WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    )
 ENDIF()
+
+add_test(
+  NAME PhysDeriv
+  COMMAND test_physderiv
+  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+)
+
+IF (NEKTAR_USE_CUDA)
+    add_test(
+      NAME PhysDerivCUDA
+      COMMAND test_physderivcuda
+      WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    )
+ENDIF()
--- a/tests/init_fields.hpp
+++ b/tests/init_fields.hpp
@@ -34,24 +34,39 @@ template <typename TData, FieldState stateIn = FieldState::Coeff,
 class InitFields
 {
 public:
-    Field<TData, stateIn>  *fixt_in        = nullptr;
-    Field<TData, stateOut> *fixt_out       = nullptr;
-    Field<TData, stateOut> *fixt_expected  = nullptr;
+    Field<TData, stateIn> *fixt_in        = nullptr;
+    Field<TData, stateOut> *fixt_out      = nullptr;
+    Field<TData, stateOut> *fixt_expected = nullptr;
 #ifdef NEKTAR_USE_CUDA
-    Field<TData, stateIn>  *fixtcuda_in    = nullptr;
-    Field<TData, stateOut> *fixtcuda_out   = nullptr;
+    Field<TData, stateIn> *fixtcuda_in   = nullptr;
+    Field<TData, stateOut> *fixtcuda_out = nullptr;
 #endif
    MultiRegions::ExpListSharedPtr fixt_explist{nullptr};

    ~InitFields()
    {
        BOOST_TEST_MESSAGE("teardown fixture");
-        if (fixt_in) delete fixt_in;
-        if (fixt_out) delete fixt_out;
-        if (fixt_expected) delete fixt_expected;
+        if (fixt_in)
+        {
+            delete fixt_in;
+        }
+        if (fixt_out)
+        {
+            delete fixt_out;
+        }
+        if (fixt_expected)
+        {
+            delete fixt_expected;
+        }
 #ifdef NEKTAR_USE_CUDA
-        if (fixtcuda_in) delete fixtcuda_in;
-        if (fixtcuda_out) delete fixtcuda_out;
+        if (fixtcuda_in)
+        {
+            delete fixtcuda_in;
+        }
+        if (fixtcuda_out)
+        {
+            delete fixtcuda_out;
+        }
 #endif
    }


--- a/tests/test_bwdtrans.cpp
+++ b/tests/test_bwdtrans.cpp
@@ -21,7 +21,8 @@ using namespace Nektar;
 class Line : public InitFields<double, FieldState::Coeff, FieldState::Phys>
 {
 public:
-    Line() : InitFields<double, FieldState::Coeff, FieldState::Phys>() {
+    Line() : InitFields<double, FieldState::Coeff, FieldState::Phys>()
+    {
        meshName = "line.xml";
    }
 };
@@ -29,7 +30,8 @@ public:
 class Square : public InitFields<double, FieldState::Coeff, FieldState::Phys>
 {
 public:
-    Square() : InitFields<double, FieldState::Coeff, FieldState::Phys>() {
+    Square() : InitFields<double, FieldState::Coeff, FieldState::Phys>()
+    {
        meshName = "square.xml";
    }
 };

--- a/tests/test_bwdtranscuda.cpp
+++ b/tests/test_bwdtranscuda.cpp
@@ -21,7 +21,8 @@ using namespace Nektar;
 class Line : public InitFields<double, FieldState::Coeff, FieldState::Phys>
 {
 public:
-    Line() : InitFields<double, FieldState::Coeff, FieldState::Phys>() {
+    Line() : InitFields<double, FieldState::Coeff, FieldState::Phys>()
+    {
        meshName = "line.xml";
    }
 };
@@ -29,7 +30,8 @@ public:
 class Square : public InitFields<double, FieldState::Coeff, FieldState::Phys>
 {
 public:
-    Square() : InitFields<double, FieldState::Coeff, FieldState::Phys>() {
+    Square() : InitFields<double, FieldState::Coeff, FieldState::Phys>()
+    {
        meshName = "square.xml";
    }
 };

--- a/tests/test_ipwrtbase.cpp
+++ b/tests/test_ipwrtbase.cpp
@@ -21,7 +21,8 @@ using namespace Nektar;
 class Line : public InitFields<double, FieldState::Phys, FieldState::Coeff>
 {
 public:
-    Line() : InitFields<double, FieldState::Phys, FieldState::Coeff>() {
+    Line() : InitFields<double, FieldState::Phys, FieldState::Coeff>()
+    {
        meshName = "line.xml";
    }
 };
@@ -29,7 +30,8 @@ public:
 class Square : public InitFields<double, FieldState::Phys, FieldState::Coeff>
 {
 public:
-    Square() : InitFields<double, FieldState::Phys, FieldState::Coeff>() {
+    Square() : InitFields<double, FieldState::Phys, FieldState::Coeff>()
+    {
        meshName = "square.xml";
    }
 };

--- a/tests/test_ipwrtbasecuda.cpp
+++ b/tests/test_ipwrtbasecuda.cpp
@@ -21,7 +21,8 @@ using namespace Nektar;
 class Line : public InitFields<double, FieldState::Phys, FieldState::Coeff>
 {
 public:
-    Line() : InitFields<double, FieldState::Phys, FieldState::Coeff>() {
+    Line() : InitFields<double, FieldState::Phys, FieldState::Coeff>()
+    {
        meshName = "line.xml";
    }
 };
@@ -29,7 +30,8 @@ public:
 class Square : public InitFields<double, FieldState::Phys, FieldState::Coeff>
 {
 public:
-    Square() : InitFields<double, FieldState::Phys, FieldState::Coeff>() {
+    Square() : InitFields<double, FieldState::Phys, FieldState::Coeff>()
+    {
        meshName = "square.xml";
    }
 };

--- a/tests/test_physderiv.cpp
+++ b/tests/test_physderiv.cpp
+#define BOOST_TEST_MODULE example
+#include <boost/test/unit_test.hpp>
+
+#include <iostream>
+#include <memory>
+
+#include "Field.hpp"
+#include "Operators/OperatorPhysDeriv.hpp"
+
+#include <LibUtilities/BasicUtils/SessionReader.h>
+#include <MultiRegions/ExpList.h>
+#include <SpatialDomains/MeshGraph.h>
+
+#include "init_fields.hpp"
+
+using namespace std;
+using namespace Nektar::Operators;
+using namespace Nektar::LibUtilities;
+using namespace Nektar;
+
+class Line : public InitFields<double, FieldState::Phys, FieldState::Phys>
+{
+public:
+    Line() : InitFields<double, FieldState::Phys, FieldState::Phys>()
+    {
+        meshName = "line.xml";
+    }
+};
+
+class Square : public InitFields<double, FieldState::Phys, FieldState::Phys>
+{
+public:
+    Square() : InitFields<double, FieldState::Phys, FieldState::Phys>()
+    {
+        meshName = "square.xml";
+    }
+};
+
+BOOST_FIXTURE_TEST_CASE(physderiv, Line)
+{
+
+    Configure();
+
+    double *inptr = fixt_in->GetStorage().GetCPUPtr();
+    double *exptr = fixt_expected->GetStorage().GetCPUPtr();
+
+    size_t order = 6, pts = 0;
+    Array<OneD, double> x(fixt_explist->GetTotPoints());
+    fixt_explist->GetCoords(x);
+    for (auto const &block : fixt_in->GetBlocks())
+    {
+        for (size_t el = 0; el < block.num_elements; ++el)
+        {
+            for (size_t phys = 0; phys < block.num_pts; ++phys)
+            {
+                double tmp1 = 0.0, tmp2 = 0.0;
+                for (size_t k = 0; k < order; k++)
+                {
+                    tmp1 += std::pow(x[pts], k);
+                    tmp2 += k * std::pow(x[pts], k - 1);
+                }
+                pts++;
+                *(inptr++) = tmp1;
+                *(exptr++) = tmp2;
+            }
+        }
+    }
+
+    PhysDeriv<>::create(fixt_explist, "StdMat")
+        ->apply(*fixt_in, *fixt_out, *fixt_out, *fixt_out);
+    double TOL{1.0E-12};
+    BOOST_TEST(fixt_out->compare(*fixt_expected, TOL));
+}
No results found