diff --git a/.gitignore b/.gitignore
index 1f5dcc33c16a7ae52938f0416c551d8df56d878e..59247d9be06a3a0d1cef9139eb04b0a3063bd9d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ build
 .ccls-cache/
 a.out
 .ccls-cache/
+.cache
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 68eeedc03150049799d821021525da3c7aa9a25b..d84013d68b779719205485b4d3cb3a91a5be98c5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,11 +2,25 @@ cmake_minimum_required(VERSION 3.0)
 project(Redesign CXX)
 
 option(NEKTAR_USE_CUDA "Enable CUDA support" FALSE)
+option(NEKTAR_USE_SIMD "Enable SIMD support, if available" TRUE)
 
 set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-set(SRC LibUtilities/ErrorUtil.cpp Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp)
+# Default install location: build/dist
+IF (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+        SET(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/dist CACHE PATH "" FORCE)
+ENDIF()
+
+find_package(Nektar++ REQUIRED)
+include_directories(SYSTEM ${NEKTAR++_INCLUDE_DIRS} ${NEKTAR++_TP_INCLUDE_DIRS})
+link_directories(${NEKTAR++_LIBRARY_DIRS} ${NEKTAR++_TP_LIBRARY_DIRS})
+message(STATUS "Found Nektar++: version ${NEKTAR++_VERSION}")
+
+set(CMAKE_INSTALL_RPATH "${NEKTAR++_LIBRARY_DIRS}")
+
+set(SRC Operators/Operator.cpp Operators/OperatorBwdTrans.cpp Operators/BwdTrans/BwdTransImpl.cpp)
 
 if (NEKTAR_USE_CUDA)
     enable_language(CUDA)
@@ -14,13 +28,26 @@ if (NEKTAR_USE_CUDA)
     set(SRC ${SRC} MemoryRegionCUDA.cu Operators/BwdTrans/BwdTransCUDA.cu)
 endif()
 
+if (NEKTAR_USE_SIMD)
+    add_compile_definitions(NEKTAR_ENABLE_SIMD_AVX512 NEKTAR_ENABLE_SIMD_AVX2 NEKTAR_ENABLE_SIMD_SSE2)
+    add_compile_options("-march=native")
+endif()
+
+# add_compile_options("-fsanitize=address,undefined")
+# add_link_options("-fsanitize=address,undefined")
+
+add_compile_definitions(NEKTAR_FULLDEBUG) # Test ASSERT/WARNING macros
+
 add_library(Operators SHARED ${SRC})
+target_link_libraries(Operators PUBLIC ${NEKTAR++_LIBRARIES} ${NEKTAR++_TP_LIBRARIES})
 target_include_directories(Operators PRIVATE "${CMAKE_SOURCE_DIR}")
+target_compile_definitions(Operators PUBLIC ${NEKTAR++_DEFINITIONS})
 
 set(TEST_SRC main.cpp)
 add_executable(main ${TEST_SRC})
 target_link_libraries(main Operators)
 target_include_directories(main PRIVATE "${CMAKE_SOURCE_DIR}")
+target_compile_definitions(main PUBLIC ${NEKTAR++_DEFINITIONS})
 
 find_package(Doxygen)
 add_custom_target(doc COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_SOURCE_DIR}/Doxyfile)
diff --git a/Field.hpp b/Field.hpp
index 52f8d0905812bc7a5fd0cb46517f71f4c3dba524..a5dc600fc765a54c9cfa6bae38ee41c8d99bd58f 100644
--- a/Field.hpp
+++ b/Field.hpp
@@ -1,23 +1,20 @@
 #pragma once
 
+#include <StdRegions/StdExpansion.h>
+#include <StdRegions/StdRegions.hpp>
 #include <array>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <stdexcept>
 #include <string>
 #include <vector>
 
-#include "LibUtilities/ErrorUtil.hpp"
 #include "MemoryRegionCPU.hpp"
-
-/**
- * @brief Element shape enum used when defining Blocks of elements.
- */
-enum class ShapeType
-{
-    eQuadrilateral,
-    eTriangle
-};
+#include <LibUtilities/BasicUtils/ErrorUtil.hpp>
+#include <LibUtilities/BasicUtils/ShapeType.hpp>
+#include <LibUtilities/BasicUtils/SharedArray.hpp>
+#include <StdRegions/StdExpansion.h>
 
 /**
  * @brief Captures the structure of a block of elements of identical shape
@@ -25,14 +22,20 @@ enum class ShapeType
  */
 struct BlockAttributes
 {
-    ShapeType shape;
-    std::array<int, 3> dofs;
+    BlockAttributes(size_t num_elements, size_t num_pts)
+        : num_elements(num_elements), num_pts(num_pts),
+          block_size(num_elements * num_pts)
+    {
+    }
+
     size_t num_elements;
+    size_t num_pts;
+    size_t block_size;
 };
 
 /**
  * @brief Possible states for Field data.
- * 
+ *
  * These identify the mathematical representation of the field data. The two
  * main states are *Phys*, representing the field at the quadrature points,
  * and *Coeff*, representing the field in terms of its spectral/hp element
@@ -51,9 +54,10 @@ static constexpr FieldState DefaultState = FieldState::Phys;
  * @tparam TType  The floating-point representation used by the field.
  * @tparam TState A FieldState value representing the state of the field.
  */
-template <typename TType = double, FieldState TState = DefaultState>
-class Field
+template <typename TType = double, FieldState TState = DefaultState> class Field
 {
+    using ExpansionSharedPtr = Nektar::StdRegions::StdExpansionSharedPtr;
+
 public:
     Field(const Field &) = delete;
     virtual ~Field()     = default;
@@ -61,21 +65,22 @@ public:
     /**
      * @brief Construct a new Field object by moving storage from an existing
      * Field object.
-     * 
-     * @param rhs 
+     *
+     * @param rhs
      */
     Field(Field &&rhs)
         : m_storage(std::move(rhs.m_storage)),
           block_attributes(std::move(rhs.block_attributes)),
-          component_names(std::move(component_names))
+          component_names(std::move(rhs.component_names)),
+          m_curVecWidth(rhs.m_curVecWidth)
     {
     }
 
     /**
      * @brief Move assignment operator.
-     * 
-     * @param rhs 
-     * @return Field& 
+     *
+     * @param rhs
+     * @return Field&
      */
     Field &operator=(Field &&rhs)
     {
@@ -88,27 +93,22 @@ public:
 
     /**
      * @brief Static templated creation method.
-     * 
+     *
      * @tparam TMemoryRegion Type of memory region to use
      * @param blocks         Field data specification.
      * @param num_components Number of components for a vector field.
-     * @return Field<TType, TState> 
+     * @return Field<TType, TState>
      */
     template <template <typename> class TMemoryRegion = MemoryRegionCPU>
-    static Field<TType, TState> create(std::vector<BlockAttributes> &blocks,
+    static Field<TType, TState> create(std::vector<BlockAttributes> blocks,
                                        int num_components = 1)
     {
-        auto field = Field(blocks, num_components);
+        auto field = Field(std::move(blocks), num_components);
 
-        size_t storage_size = 0;
-        for (int i = 0; i < blocks.size(); ++i)
-        {
-            auto &block      = blocks[i];
-            size_t blockSize = block.num_elements * block.dofs[0] *
-                               block.dofs[1] *
-                               block.dofs[2]; // wasteful but an upper bound
-            storage_size += blockSize;
-        }
+        size_t storage_size = std::accumulate(
+            field.block_attributes.begin(), field.block_attributes.end(), 0,
+            [](size_t acc, const BlockAttributes &block)
+            { return acc + block.block_size; });
 
         // Create new TMemoryRegion and polymorphically store as MemoryRegionCPU
         field.m_storage = std::make_unique<TMemoryRegion<TType>>(
@@ -117,14 +117,32 @@ public:
         return field;
     }
 
+    template <template <typename> class TMemoryRegion = MemoryRegionCPU>
+    static Field<TType, TState> fromArray(
+        std::vector<BlockAttributes> blocks,
+        Nektar::Array<Nektar::OneD, TType> const &array)
+    {
+        auto field = create<MemoryRegionCPU>(blocks);
+
+        ASSERTL0(
+            field.GetStorage().size() == array.size(),
+            "Array size does not match size calculated from provided elements.")
+
+        std::copy(array.begin(), array.end(), field.GetStorage().GetCPUPtr());
+
+        field.template GetStorage<TMemoryRegion>(); // convert storage to
+                                                    // TMemoryRegion
+        return field;
+    }
+
     /**
      * @brief Get the underlying storage of the field as the requested type.
      * @return MemoryRegion storage converted to the requested type
-     * 
+     *
      * This routine performs MemoryRegion conversions if necessary to enable
      * casting of, for example a CUDA memory region to a CPU memory region to
      * support the use of a CPU-only operator if necessary.
-     * 
+     *
      * A runtime warning is provided if a transfer of data from device to host
      * is required to achieve the conversion.
      */
@@ -172,30 +190,166 @@ public:
         }
     }
 
+    /**
+     * @brief Reshapes the storage to a prescribed vector width.
+     *
+     * This routine reorders the elemental data to interleave elements to a
+     * prescribed vector width VW. This therefore puts the same DOF for groups
+     * of VW elements contiguously in memory, enabling the efficient use of
+     * vectorised instructions. At the moment this routine reshapes from VW0 to
+     * VW1 by first reshaping from VW0 to a vector width of 1, and then to VW1.
+     *
+     * @tparam  VectorWidth     Target vector width.
+     * @tparam  Align           Memory alignment to use.
+     */
+    template <size_t VectorWidth,
+              size_t Align = __STDCPP_DEFAULT_NEW_ALIGNMENT__>
+    void ReshapeStorage()
+    {
+        // No reshape required, early return
+        if (m_curVecWidth == VectorWidth)
+            return;
+
+        ReshapeToScalar<Align>();
+
+        // Early return if "scalar" shape is required
+        if (VectorWidth == 1)
+            return;
+
+        // New memory region with requested alignment
+        MemoryRegionCPU<TType> reshapedStorage(m_storage->size(), Align);
+
+        TType *inptr  = m_storage->GetCPUPtr();
+        TType *outptr = reshapedStorage.GetCPUPtr();
+
+        for (const auto &block : block_attributes)
+        {
+            ASSERTL1(
+                block.num_elements % VectorWidth == 0,
+                "Number of elements not divisible by VectorWidth, padding not "
+                "implemented yet.");
+
+            const size_t numMetaBlocks = block.num_elements / VectorWidth;
+            for (size_t metaBlock = 0; metaBlock < numMetaBlocks; ++metaBlock)
+            {
+                InterleaveFromScalar<VectorWidth>(inptr, block.num_pts, outptr);
+                inptr += block.num_pts * VectorWidth;
+                outptr += block.num_pts * VectorWidth;
+            }
+        }
+
+        m_curVecWidth = VectorWidth;
+        m_storage     = std::make_unique<MemoryRegionCPU<TType>>(
+            std::move(reshapedStorage));
+    }
+
+    std::vector<BlockAttributes> const &GetBlocks() const
+    {
+        return block_attributes;
+    }
+
     /**
      * @brief Gets the number of components for a vector field.
-     * 
-     * @return size_t 
+     *
+     * @return size_t
      */
     size_t GetNumComponents()
     {
         return component_names.size();
     }
 
+    Nektar::Array<Nektar::OneD, TType> toArray() const
+    {
+        return Nektar::Array<Nektar::OneD, TType>(m_storage->size(),
+                                                  m_storage->GetCPUPtr());
+    }
+
 private:
+    /**
+     * @brief Reshapes the current storage interleaving to a non-interleaved
+     * arrangement.
+     */
+    template <size_t Align = __STDCPP_DEFAULT_NEW_ALIGNMENT__>
+    void ReshapeToScalar()
+    {
+        if (m_curVecWidth == 1)
+            return;
+
+        MemoryRegionCPU<TType> reshapedStorage(m_storage->size(), Align);
+
+        TType *inptr  = m_storage->GetCPUPtr();
+        TType *outptr = reshapedStorage.GetCPUPtr();
+
+        for (const auto &block : block_attributes)
+        {
+            const size_t numMetaBlocks = block.num_elements / m_curVecWidth;
+
+            for (size_t metaBlock = 0; metaBlock < numMetaBlocks; ++metaBlock)
+            {
+                Deinterleave(inptr, block.num_pts, outptr);
+                inptr += m_curVecWidth * block.num_pts;
+                outptr += m_curVecWidth * block.num_pts;
+            }
+        }
+
+        m_curVecWidth = 1;
+        m_storage     = std::make_unique<MemoryRegionCPU<TType>>(
+            std::move(reshapedStorage));
+    }
+
+    /**
+     * @brief Interleave the data block to the given vector width
+     * @tparam  VectorWidth Target vector width.
+     * @param   in          Input array with VW of 1.
+     * @param   dataLen     Length of data blocks to be interleaved.
+     * @param   out         Output array of VW specified by VectorWidth.
+     */
+    template <size_t VectorWidth>
+    void InterleaveFromScalar(const TType *in, size_t dataLen, TType *out)
+    {
+        // TODO: SIMD this
+        for (size_t idx = 0; idx < dataLen; ++idx)
+        {
+            for (size_t vecElem = 0; vecElem < VectorWidth; ++vecElem)
+            {
+                out[idx * VectorWidth + vecElem] = in[vecElem * dataLen + idx];
+            }
+        }
+    }
+
+    /**
+     * @brief Deinterleave data from the current vector width to be
+     * non-interleaved.
+     * @param   in      Input array
+     * @param   dataLen Length of a block of data (e.g. an element)
+     * @param   out     Output array
+     */
+    void Deinterleave(const TType *in, size_t dataLen, TType *out)
+    {
+        for (size_t idx = 0; idx < dataLen; ++idx)
+        {
+            for (size_t vecElem = 0; vecElem < m_curVecWidth; ++vecElem)
+            {
+                out[vecElem * dataLen + idx] =
+                    in[idx * m_curVecWidth + vecElem];
+            }
+        }
+    }
+
     /**
      * @brief Construct a new Field object.
-     * 
+     *
      * @param blocks    Field data layout specification
      * @param num_components Number of components for a vector field.
      */
-    Field(std::vector<BlockAttributes> &blocks, int num_components = 1)
-        : block_attributes(blocks)
+    Field(std::vector<BlockAttributes> blocks, int num_components = 1)
+        : block_attributes(std::move(blocks))
     {
     }
 
     std::unique_ptr<MemoryRegionCPU<TType>> m_storage;
-    // std::vector<MemoryView<TType>> m_views;
     std::vector<BlockAttributes> block_attributes;
     std::vector<std::string> component_names;
+
+    size_t m_curVecWidth = 1;
 };
diff --git a/LibUtilities/ErrorUtil.cpp b/LibUtilities/ErrorUtil.cpp
deleted file mode 100644
index 56c2bee0a72e1a41abe2195491db184431b38862..0000000000000000000000000000000000000000
--- a/LibUtilities/ErrorUtil.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "ErrorUtil.hpp"
-
-namespace Nektar
-{
-    std::ostream *ErrorUtil::m_outStream = &std::cerr;
-    bool ErrorUtil::m_printBacktrace     = true;
-}
\ No newline at end of file
diff --git a/LibUtilities/ErrorUtil.hpp b/LibUtilities/ErrorUtil.hpp
deleted file mode 100644
index 44977d2d82643a14f4106d19158e0b0b46a94c5a..0000000000000000000000000000000000000000
--- a/LibUtilities/ErrorUtil.hpp
+++ /dev/null
@@ -1,279 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//
-// File ErrorUtil.hpp
-//
-// For more information, please see: http://www.nektar.info
-//
-// The MIT License
-//
-// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
-// Department of Aeronautics, Imperial College London (UK), and Scientific
-// Computing and Imaging Institute, University of Utah (USA).
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included
-// in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS IN THE SOFTWARE.
-//
-// Description: error related utilities
-//
-///////////////////////////////////////////////////////////////////////////////
-#ifndef ERRORUTIL_HPP
-#define ERRORUTIL_HPP
-
-#include <iostream>
-#include <stdexcept>
-#include <string>
-
-#include <boost/core/ignore_unused.hpp>
-
-// #include <LibUtilities/LibUtilitiesDeclspec.h>
-#define LIB_UTILITIES_EXPORT
-
-#if defined(NEKTAR_USE_MPI)
-#include <mpi.h>
-#endif
-
-#ifndef _WIN32
-#include <execinfo.h>
-#endif
-
-namespace Nektar
-{
-
-class ErrorUtil
-{
-public:
-    class NekError : public std::runtime_error
-    {
-    public:
-        NekError(const std::string &message) : std::runtime_error(message)
-        {
-        }
-    };
-
-    enum ErrType
-    {
-        efatal,
-        ewarning
-    };
-
-    inline static void SetErrorStream(std::ostream &o)
-    {
-        m_outStream = &o;
-    }
-
-    inline static void SetPrintBacktrace(bool b)
-    {
-        m_printBacktrace = b;
-    }
-
-    inline static bool HasCustomErrorStream()
-    {
-        return m_outStream != &std::cerr;
-    }
-
-    inline static void Error(ErrType type, const char *routine, int lineNumber,
-                             const char *msg, unsigned int level,
-                             bool DoComm = false)
-    {
-        boost::ignore_unused(DoComm);
-
-        // The user of outStream is primarily for the unit tests.  The unit
-        // tests often generate errors on purpose to make sure invalid usage is
-        // flagged appropriately.  Printing the error messages to cerr made the
-        // unit test output hard to parse.
-
-        std::string baseMsg =
-            "Level " + std::to_string(level) + " assertion violation\n";
-#if defined(NEKTAR_DEBUG) || defined(NEKTAR_FULLDEBUG)
-        baseMsg += "Where   : " + std::string(routine) + "[" +
-                   std::to_string(lineNumber) + "]\nMessage : ";
-#else
-        boost::ignore_unused(routine, lineNumber);
-#endif
-        baseMsg += std::string(msg);
-
-        // Default rank is zero. If MPI used and initialised, populate with
-        // the correct rank. Messages are only printed on rank zero.
-        int rank = 0;
-#if defined(NEKTAR_USE_MPI) && !defined(NEKTAR_USE_CWIPI)
-        int flag = 0;
-        if (DoComm)
-        {
-            MPI_Initialized(&flag);
-            if (flag)
-            {
-                MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-            }
-        }
-#else
-        boost::ignore_unused(DoComm);
-#endif
-
-        std::string btMessage("");
-#if defined(NEKTAR_FULLDEBUG)
-#ifndef _WIN32
-        if (m_printBacktrace)
-        {
-            void *btArray[40];
-            int btSize;
-            char **btStrings;
-
-            btSize    = backtrace(btArray, 40);
-            btStrings = backtrace_symbols(btArray, btSize);
-
-            for (int i = 0; i < btSize; ++i)
-            {
-                btMessage += std::string(btStrings[i]) + "\n";
-            }
-            free(btStrings);
-        }
-#endif
-#endif
-
-        switch (type)
-        {
-            case efatal:
-                if (!rank)
-                {
-                    if (m_printBacktrace)
-                    {
-                        (*m_outStream) << btMessage;
-                    }
-                    (*m_outStream) << "Fatal   : " << baseMsg << std::endl;
-                }
-
-#if defined(NEKTAR_USE_MPI) && !defined(NEKTAR_USE_CWIPI)
-                if (DoComm)
-                {
-                    if (flag)
-                    {
-                        MPI_Barrier(MPI_COMM_WORLD);
-                    }
-                }
-#endif
-                throw NekError(baseMsg);
-                break;
-            case ewarning:
-                if (!rank)
-                {
-                    if (m_printBacktrace)
-                    {
-                        (*m_outStream) << btMessage;
-                    }
-                    (*m_outStream) << "Warning : " << baseMsg << std::endl;
-                }
-                break;
-            default:
-                (*m_outStream)
-                    << "Unknown warning type: " << baseMsg << std::endl;
-        }
-    }
-
-    inline static void Error(ErrType type, const char *routine, int lineNumber,
-                             const std::string &msg, unsigned int level)
-    {
-        Error(type, routine, lineNumber, msg.c_str(), level);
-    }
-
-    inline static void Error(ErrType type, const char *routine, int lineNumber,
-                             const char *msg)
-    {
-        Error(type, routine, lineNumber, msg, 0);
-    }
-
-private:
-    LIB_UTILITIES_EXPORT static std::ostream *m_outStream;
-    LIB_UTILITIES_EXPORT static bool m_printBacktrace;
-};
-
-/// Assert Level 0 -- Fundamental assert which
-/// is used whether in FULLDEBUG, DEBUG or OPT
-/// compilation mode.  This level assert is
-/// considered code critical, even under
-/// optimized compilation.
-
-#define NEKERROR(type, msg)                                                    \
-    Nektar::ErrorUtil::Error(type, __FILE__, __LINE__, msg, 0);
-
-#define ROOTONLY_NEKERROR(type, msg)                                           \
-    Nektar::ErrorUtil::Error(type, __FILE__, __LINE__, msg, 0, true);
-
-#define ASSERTL0(condition, msg)                                               \
-    if (!(condition))                                                          \
-    {                                                                          \
-        Nektar::ErrorUtil::Error(Nektar::ErrorUtil::efatal, __FILE__,          \
-                                 __LINE__, msg, 0);                            \
-    }
-
-#define WARNINGL0(condition, msg)                                              \
-    if (!(condition))                                                          \
-    {                                                                          \
-        Nektar::ErrorUtil::Error(Nektar::ErrorUtil::ewarning, __FILE__,        \
-                                 __LINE__, msg, 0);                            \
-    }
-
-/// Assert Level 1 -- Debugging which is used whether in FULLDEBUG or
-/// DEBUG compilation mode.  This level assert is designed for aiding
-/// in standard debug (-g) mode
-#if defined(NEKTAR_DEBUG) || defined(NEKTAR_FULLDEBUG)
-
-#define ASSERTL1(condition, msg)                                               \
-    if (!(condition))                                                          \
-    {                                                                          \
-        Nektar::ErrorUtil::Error(Nektar::ErrorUtil::efatal, __FILE__,          \
-                                 __LINE__, msg, 1);                            \
-    }
-
-#define WARNINGL1(condition, msg)                                              \
-    if (!(condition))                                                          \
-    {                                                                          \
-        Nektar::ErrorUtil::Error(Nektar::ErrorUtil::ewarning, __FILE__,        \
-                                 __LINE__, msg, 1);                            \
-    }
-
-#else // defined(NEKTAR_DEBUG) || defined(NEKTAR_FULLDEBUG)
-#define ASSERTL1(condition, msg)
-#define WARNINGL1(condition, msg)
-#endif // defined(NEKTAR_DEBUG) || defined(NEKTAR_FULLDEBUG)
-
-/// Assert Level 2 -- Debugging which is used FULLDEBUG compilation
-/// mode.  This level assert is designed to provide addition safety
-/// checks within the code (such as bounds checking, etc.).
-#ifdef NEKTAR_FULLDEBUG
-
-#define ASSERTL2(condition, msg)                                               \
-    if (!(condition))                                                          \
-    {                                                                          \
-        Nektar::ErrorUtil::Error(Nektar::ErrorUtil::efatal, __FILE__,          \
-                                 __LINE__, msg, 2);                            \
-    }
-#define WARNINGL2(condition, msg)                                              \
-    if (!(condition))                                                          \
-    {                                                                          \
-        Nektar::ErrorUtil::Error(Nektar::ErrorUtil::ewarning, __FILE__,        \
-                                 __LINE__, msg, 2);                            \
-    }
-
-#else // NEKTAR_FULLDEBUG
-#define ASSERTL2(condition, msg)
-#define WARNINGL2(condition, msg)
-#endif // NEKTAR_FULLDEBUG
-
-} // namespace Nektar
-
-#endif // ERRORUTIL_HPP
diff --git a/LibUtilities/NekFactory.hpp b/LibUtilities/NekFactory.hpp
deleted file mode 100644
index c725a7eefc38c1bda85ee4a93ffef55cae338757..0000000000000000000000000000000000000000
--- a/LibUtilities/NekFactory.hpp
+++ /dev/null
@@ -1,316 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//
-// File: NekFactory.hpp
-//
-// For more information, please see: http://www.nektar.info
-//
-// The MIT License
-//
-// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
-// Department of Aeronautics, Imperial College London (UK), and Scientific
-// Computing and Imaging Institute, University of Utah (USA).
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included
-// in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS IN THE SOFTWARE.
-//
-// Description: Factory pattern class for Nektar
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef NEKTAR_LIBUTILITIES_BASICUTILS_NEKFACTORY
-#define NEKTAR_LIBUTILITIES_BASICUTILS_NEKFACTORY
-
-#include <functional>
-#include <iostream>
-#include <map>
-#include <memory>
-#include <sstream>
-#include <string>
-
-#ifdef NEKTAR_USE_THREAD_SAFETY
-#include <boost/thread/locks.hpp>
-#include <boost/thread/shared_mutex.hpp>
-#endif
-
-#include "ErrorUtil.hpp"
-
-namespace Nektar
-{
-namespace LibUtilities
-{
-
-#ifdef NEKTAR_USE_THREAD_SAFETY
-// Generate parameter typenames with default type of 'none'
-typedef boost::unique_lock<boost::shared_mutex> WriteLock;
-typedef boost::shared_lock<boost::shared_mutex> ReadLock;
-#endif
-
-/**
- * @class NekFactory
- *
- * @brief Provides a generic Factory class.
- *
- * Implements a generic object factory. Class-types which use an arbitrary
- * number of parameters may be used via C++ variadic templating.
- *
- * To allow a class to be instantiated by the factory, the following are
- * required in each class definition (in the case of a single parameter):
- *
- * \code
- *   static [baseclass]* create([paramtype1] &P) {
- *      return new [derivedclass](P);
- *   }
- *   static std::string className;
- * \endcode
- *
- * and outside the class definition in the implementation:
- *
- * \code
- *   std::string [derivedclass]::className
- *      = Factory<std::string,[baseclass],[paramtype1]>::
- *          RegisterCreatorFunction("[derivedclass]",
- *                              [derivedclass]::create,"Description");
- * \endcode
- *
- * The assignment of the static variable className is done through the call to
- * RegisterCreatorFunction, which registers the class with the factory prior to
- * the start of the main() routine.
- *
- * To create an instance of a derived class, for instance:
- * \code
- *   [baseclass]* var_name =
- *      Factory<std::string,[baseclass],[paramtype1]>
- *              ::CreateInstance("[derivedclass]",Param1);
- * \endcode
- */
-template <typename tKey,  // reference tag (e.g. string, int)
-          typename tBase, // base class
-          typename... tParam>
-class NekFactory
-{
-public:
-    /// Comparison predicator of key
-    typedef std::less<tKey> tPredicator;
-    /// Shared pointer to an object of baseclass type.
-    typedef std::unique_ptr<tBase> tBaseUniquePtr;
-    /// CreatorFunction type which takes parameter and returns base class shared
-    /// pointer.
-    typedef std::function<tBaseUniquePtr(tParam...)> CreatorFunction;
-
-    /// Define a struct to hold the information about a module.
-    struct ModuleEntry
-    {
-        ModuleEntry(CreatorFunction pFunc, const std::string pDesc)
-            : m_func(pFunc), m_desc(pDesc)
-        {
-        }
-
-        /// Function used to create instance of class.
-        CreatorFunction m_func;
-        /// Description of class for use in listing available classes.
-        std::string m_desc;
-    };
-
-    /// Factory map between key and module data.
-    typedef std::map<tKey, ModuleEntry, tPredicator> TMapFactory;
-
-public:
-    NekFactory() = default;
-
-    /**
-     * @brief Create an instance of the class referred to by \c idKey.
-     *
-     * Searches the factory's map for the given key and returns a shared
-     * base class pointer to a new instance of the associated class.
-     * @param   idKey           Key of class to create.
-     * @param   x               Parameter to pass to class constructor.
-     * @returns                 Base class pointer to new instance.
-     */
-    tBaseUniquePtr CreateInstance(tKey idKey, tParam... args)
-    {
-#ifdef NEKTAR_USE_THREAD_SAFETY
-        ReadLock vReadLock(m_mutex);
-#endif
-
-        // Now try and find the key in the map.
-        auto it = getMapFactory()->find(idKey);
-
-        // If successful, check the CreatorFunction is defined and
-        // create a new instance of the class.
-        if (it != getMapFactory()->end())
-        {
-            ModuleEntry *tmp = &(it->second);
-#ifdef NEKTAR_USE_THREAD_SAFETY
-            vReadLock.unlock();
-#endif
-
-            if (tmp->m_func)
-            {
-                try
-                {
-                    return tmp->m_func(args...);
-                }
-                catch (const std::string &s)
-                {
-                    std::stringstream errstr;
-                    errstr << "Unable to create module: " << idKey << "\n";
-                    errstr << s;
-                    NEKERROR(ErrorUtil::efatal, errstr.str());
-                }
-            }
-        }
-
-        // If we get this far, the key doesn't exist, so throw an error.
-        std::stringstream errstr;
-        errstr << "No such module: " << idKey << std::endl;
-        PrintAvailableClasses(errstr);
-        NEKERROR(ErrorUtil::efatal, errstr.str());
-        return tBaseUniquePtr();
-    }
-
-    /**
-     * @brief Register a class with the factory.
-     *
-     * This function is called by each class in a static context (prior
-     * to the execution of main()) and creates an entry for the class
-     * in the factory's map.
-     * @param   idKey           Key used to reference the class.
-     * @param   classCreator    Function to call to create an instance
-     *                          of this class.
-     * @param   pDesc           Optional description of class.
-     * @returns                 The given key \c idKey.
-     */
-    tKey RegisterCreatorFunction(tKey idKey, CreatorFunction classCreator,
-                                 std::string pDesc = "")
-    {
-#ifdef NEKTAR_USE_THREAD_SAFETY
-        WriteLock vWriteLock(m_mutex);
-#endif
-
-        ModuleEntry e(classCreator, pDesc);
-        getMapFactory()->insert(std::pair<tKey, ModuleEntry>(idKey, e));
-        return idKey;
-    }
-
-    /**
-     * @brief Checks if a particular module is available.
-     */
-    bool ModuleExists(tKey idKey)
-    {
-#ifdef NEKTAR_USE_THREAD_SAFETY
-        ReadLock vReadLock(m_mutex);
-#endif
-
-        // Now try and find the key in the map.
-        auto it = getMapFactory()->find(idKey);
-
-        if (it != getMapFactory()->end())
-        {
-            return true;
-        }
-        return false;
-    }
-
-    /**
-     * @brief Prints the available classes to stdout.
-     */
-    void PrintAvailableClasses(std::ostream &pOut = std::cout)
-    {
-#ifdef NEKTAR_USE_THREAD_SAFETY
-        ReadLock vReadLock(m_mutex);
-#endif
-
-        pOut << std::endl << "Available classes: " << std::endl;
-        for (auto &it : *getMapFactory())
-        {
-            pOut << "  " << it.first;
-            if (it.second.m_desc != "")
-            {
-                pOut << ":" << std::endl
-                     << "    " << it.second.m_desc << std::endl;
-            }
-            else
-            {
-                pOut << std::endl;
-            }
-        }
-    }
-
-    /**
-     * @brief Retrieves a key, given a description
-     */
-    tKey GetKey(std::string pDesc)
-    {
-#ifdef NEKTAR_USE_THREAD_SAFETY
-        ReadLock vReadLock(m_mutex);
-#endif
-
-        for (auto &it : *getMapFactory())
-        {
-            if (it.second.m_desc == pDesc)
-            {
-                return it.first;
-            }
-        }
-        std::string errstr = "Module '" + pDesc + "' is not known.";
-        ASSERTL0(false, errstr);
-    }
-
-    /**
-     * @brief Returns the description of a class
-     */
-    std::string GetClassDescription(tKey idKey)
-    {
-#ifdef NEKTAR_USE_THREAD_SAFETY
-        ReadLock vReadLock(m_mutex);
-#endif
-
-        // Now try and find the key in the map.
-        auto it = getMapFactory()->find(idKey);
-
-        std::stringstream errstr;
-        errstr << "No such module: " << idKey << std::endl;
-        ASSERTL0(it != getMapFactory()->end(), errstr.str());
-        return it->second.m_desc;
-    }
-
-protected:
-    /**
-     * @brief Ensure the factory's map is created.
-     * @returns                 The factory's map.
-     */
-    TMapFactory *getMapFactory()
-    {
-        return &mMapFactory;
-    }
-
-private:
-    NekFactory(const NekFactory &rhs);
-    NekFactory &operator=(const NekFactory &rhs);
-
-    TMapFactory mMapFactory;
-
-#ifdef NEKTAR_USE_THREAD_SAFETY
-    boost::shared_mutex m_mutex;
-#endif
-};
-
-} // namespace LibUtilities
-} // namespace Nektar
-
-#endif
diff --git a/MemoryRegionCPU.hpp b/MemoryRegionCPU.hpp
index 1a3c5d89ea602115efda41806054287066ab257a..877f27cf47d1a9afa7a70332954e374a089f66af 100644
--- a/MemoryRegionCPU.hpp
+++ b/MemoryRegionCPU.hpp
@@ -1,51 +1,64 @@
 #pragma once
 
+#include <new>
+
 /**
  * @brief Stores underlying data for a Field on the CPU.
- * 
+ *
  * It acts as a holder for a contiguous block of memory, allocated on the
- * host system. 
- * 
+ * host system.
+ *
  * This class also acts as a base class for device-aware builds.
  */
 template <typename TData> class MemoryRegionCPU
 {
 public:
-    MemoryRegionCPU()                           = delete;
-    MemoryRegionCPU(const MemoryRegionCPU &rhs) = delete;
+    MemoryRegionCPU()                                      = delete;
+    MemoryRegionCPU(const MemoryRegionCPU &rhs)            = delete;
+    MemoryRegionCPU &operator=(const MemoryRegionCPU &rhs) = delete;
 
     MemoryRegionCPU(MemoryRegionCPU &&rhs)
     {
-        m_host     = rhs.m_host;
-        m_size     = rhs.m_size;
-        rhs.m_host = nullptr;
+        m_host      = rhs.m_host;
+        m_size      = rhs.m_size;
+        m_alignment = rhs.m_alignment;
+        rhs.m_host  = nullptr;
     }
 
-    MemoryRegionCPU(size_t n)
+    MemoryRegionCPU(size_t n,
+                    size_t alignment = __STDCPP_DEFAULT_NEW_ALIGNMENT__)
     {
-        m_host = new TData[n];
-        m_size = n;
+        // C++17 aligned new
+        m_host      = new (std::align_val_t(alignment)) TData[n];
+        m_alignment = alignment;
+        m_size      = n;
     }
 
     virtual ~MemoryRegionCPU()
     {
         if (m_host != nullptr)
         {
-            delete[] m_host;
+            operator delete[](m_host, std::align_val_t(m_alignment));
             m_host = nullptr;
         }
     }
 
-    void operator=(MemoryRegionCPU &&rhs)
+    MemoryRegionCPU &operator=(MemoryRegionCPU &&rhs)
     {
-        m_host     = rhs.m_host;
-        m_size     = rhs.m_size;
-        rhs.m_host = nullptr;
+        if (m_host)
+            operator delete[](m_host, std::align_val_t(m_alignment));
+
+        m_host      = rhs.m_host;
+        m_size      = rhs.m_size;
+        m_alignment = rhs.m_alignment;
+        rhs.m_host  = nullptr;
+
+        return *this;
     }
 
     /**
      * @brief Get the pointer to the CPU memory.
-     * 
+     *
      * This is a virtual function so that subclasses can move memory to the
      * CPU from a device if needed.
      */
@@ -56,7 +69,7 @@ public:
 
     /**
      * @brief Move memory to the CPU.
-     * 
+     *
      * This is a virtual function so that subclasses can move memory to the
      * CPU from a device if needed.
      */
@@ -70,6 +83,7 @@ public:
     }
 
 protected:
-    TData *m_host = nullptr;
-    size_t m_size = 0;
+    TData *m_host      = nullptr;
+    size_t m_size      = 0;
+    size_t m_alignment = __STDCPP_DEFAULT_NEW_ALIGNMENT__;
 };
diff --git a/MemoryRegionCUDA.hpp b/MemoryRegionCUDA.hpp
index e634cc1dbea0ae23cea6098efba4afc11e80276f..2187341bee8d1e5033656e0746b0c576cd9fd40f 100644
--- a/MemoryRegionCUDA.hpp
+++ b/MemoryRegionCUDA.hpp
@@ -91,7 +91,7 @@ protected:
 private:
     void initFromSize(size_t n);
 
-    TData *m_device = nullptr;      ///< Device memory pointer
-    size_t m_size   = 0;            ///< Device storage size
-    bool m_ondevice = false;        ///< Flag indicating if data is on device
+    TData *m_device = nullptr; ///< Device memory pointer
+    size_t m_size   = 0;       ///< Device storage size
+    bool m_ondevice = false;   ///< Flag indicating if data is on device
 };
diff --git a/Operators/BwdTrans/BwdTransCUDA.hpp b/Operators/BwdTrans/BwdTransCUDA.hpp
index 1d3abfd4031aee536c213882f3115269b1779ab6..236504ed9661509dd20979cf7c9240f0ceac7e93 100644
--- a/Operators/BwdTrans/BwdTransCUDA.hpp
+++ b/Operators/BwdTrans/BwdTransCUDA.hpp
@@ -9,6 +9,11 @@ template <typename TData>
 class OperatorBwdTransImpl<TData, ImplCUDA> : public OperatorBwdTrans<TData>
 {
 public:
+    OperatorBwdTransImpl(const MultiRegions::ExpListSharedPtr& expansionList)
+        : OperatorBwdTrans<TData>(std::move(expansionList))
+    {
+    }
+
     void apply(Field<TData, FieldState::Coeff> &in,
                Field<TData, FieldState::Phys> &out) override
     {
@@ -17,9 +22,11 @@ public:
         std::cout << "Op bwd trans CUDA\n";
     }
 
-    static std::unique_ptr<Operator<TData>> instantiate()
+    static std::unique_ptr<Operator<TData>> instantiate(
+        MultiRegions::ExpListSharedPtr expansionList)
     {
-        return std::make_unique<OperatorBwdTransImpl<TData, ImplCUDA>>();
+        return std::make_unique<OperatorBwdTransImpl<TData, ImplCUDA>>(
+            std::move(expansionList));
     }
 
     static std::string className;
diff --git a/Operators/BwdTrans/BwdTransImpl.cpp b/Operators/BwdTrans/BwdTransImpl.cpp
index 07df19524ed28c4e9e9ac1a287f883d833144b09..d2290d59c497bbf205b527775b964f8bb516c4eb 100644
--- a/Operators/BwdTrans/BwdTransImpl.cpp
+++ b/Operators/BwdTrans/BwdTransImpl.cpp
@@ -1,5 +1,6 @@
-#include "BwdTransSumFac.hpp"
+#include "BwdTransStdMat.hpp"
 #include "BwdTransMatFree.hpp"
+#include "BwdTransSumFac.hpp"
 
 namespace Nektar::Operators::detail
 {
@@ -14,7 +15,13 @@ std::string OperatorBwdTransImpl<double, ImplMatFree>::className =
 template <>
 std::string OperatorBwdTransImpl<double, ImplSumFac>::className =
     GetOperatorFactory<double>().RegisterCreatorFunction(
-        "BwdTransSumFac",
-        OperatorBwdTransImpl<double, ImplSumFac>::instantiate, "...");
+        "BwdTransSumFac", OperatorBwdTransImpl<double, ImplSumFac>::instantiate,
+        "...");
+
+template <>
+std::string OperatorBwdTransImpl<double, ImplStdMat>::className =
+    GetOperatorFactory<double>().RegisterCreatorFunction(
+        "BwdTransStdMat", OperatorBwdTransImpl<double, ImplStdMat>::instantiate,
+        "...");
 
-}
\ No newline at end of file
+} // namespace Nektar::Operators::detail
diff --git a/Operators/BwdTrans/BwdTransMatFree.hpp b/Operators/BwdTrans/BwdTransMatFree.hpp
index 5bb5f2e65c602b8972a1d2836f8e75c470a2df26..182aa6355b6d600dbab0fc9077be3bff5a54ad75 100644
--- a/Operators/BwdTrans/BwdTransMatFree.hpp
+++ b/Operators/BwdTrans/BwdTransMatFree.hpp
@@ -8,6 +8,11 @@ template <typename TData>
 class OperatorBwdTransImpl<TData, ImplMatFree> : public OperatorBwdTrans<TData>
 {
 public:
+    OperatorBwdTransImpl(const MultiRegions::ExpListSharedPtr& expansionList)
+        : OperatorBwdTrans<TData>(std::move(expansionList))
+    {
+    }
+
     void apply(Field<TData, FieldState::Coeff> &in,
                Field<TData, FieldState::Phys> &out) override
     {
@@ -21,9 +26,11 @@ public:
         std::cout << "Op bwd trans mat free\n";
     }
 
-    static std::unique_ptr<Operator<TData>> instantiate()
+    static std::unique_ptr<Operator<TData>> instantiate(
+        const MultiRegions::ExpListSharedPtr& expansionList)
     {
-        return std::make_unique<OperatorBwdTransImpl<TData, ImplMatFree>>();
+        return std::make_unique<OperatorBwdTransImpl<TData, ImplMatFree>>(
+            std::move(expansionList));
     }
 
     static std::string className;
diff --git a/Operators/BwdTrans/BwdTransMatFreeKernels.hpp b/Operators/BwdTrans/BwdTransMatFreeKernels.hpp
index d9138db6f9d6869ebe156ef0e9ce7c0ce911097f..9bbe9e5543448af991841d127f83d197b9d9b18a 100644
--- a/Operators/BwdTrans/BwdTransMatFreeKernels.hpp
+++ b/Operators/BwdTrans/BwdTransMatFreeKernels.hpp
@@ -1,5 +1,4 @@
-namespace Nektar::Operators::detail::matfree {
+namespace Nektar::Operators::detail::matfree
+{
 
-
-
-}
\ No newline at end of file
+}
diff --git a/Operators/BwdTrans/BwdTransStdMat.hpp b/Operators/BwdTrans/BwdTransStdMat.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..053dc419f7f476f2b5189fac0d3eb2ef01c6722d
--- /dev/null
+++ b/Operators/BwdTrans/BwdTransStdMat.hpp
@@ -0,0 +1,54 @@
+#include "Operators/OperatorBwdTrans.hpp"
+#include <StdRegions/StdExpansion.h>
+
+namespace Nektar::Operators::detail
+{
+
+// sum-factorisation implementation
+template <typename TData>
+class OperatorBwdTransImpl<TData, ImplStdMat> : public OperatorBwdTrans<TData>
+{
+public:
+    OperatorBwdTransImpl(const MultiRegions::ExpListSharedPtr& expansionList)
+        : OperatorBwdTrans<TData>(std::move(expansionList))
+    {
+    }
+
+    void apply(Field<TData, FieldState::Coeff> &in,
+               Field<TData, FieldState::Phys> &out) override
+    {
+        auto const *inptr = in.GetStorage().GetCPUPtr();
+        auto *outptr      = out.GetStorage().GetCPUPtr();
+
+        for (size_t block_idx = 0; block_idx < in.GetBlocks().size();
+             ++block_idx)
+        {
+            auto const expPtr = this->m_expansionList->GetExp(block_idx);
+
+            Nektar::StdRegions::StdMatrixKey key(
+                StdRegions::eBwdTrans, expPtr->DetShapeType(), *expPtr);
+            auto const matPtr = expPtr->GetStdMatrix(key);
+
+            auto const &block = in.GetBlocks()[block_idx];
+
+            Blas::Dgemm('N', 'N', matPtr->GetRows(), block.num_elements,
+                        matPtr->GetColumns(), 1.0, matPtr->GetRawPtr(),
+                        matPtr->GetRows(), inptr, block.num_pts, 0.0, outptr,
+                        expPtr->GetTotPoints());
+
+            inptr += block.block_size;
+            outptr += expPtr->GetTotPoints() * block.num_elements;
+        }
+    }
+
+    static std::unique_ptr<Operator<TData>> instantiate(
+        const MultiRegions::ExpListSharedPtr& expansionList)
+    {
+        return std::make_unique<OperatorBwdTransImpl<TData, ImplStdMat>>(
+            std::move(expansionList));
+    }
+
+    static std::string className;
+};
+
+} // namespace Nektar::Operators::detail
diff --git a/Operators/BwdTrans/BwdTransSumFac.hpp b/Operators/BwdTrans/BwdTransSumFac.hpp
index d436bcd89298cc4abb684ceb0fed3ff193adcefc..0eda71e86c1dfd697387ff8b5e27e5328d00aeee 100644
--- a/Operators/BwdTrans/BwdTransSumFac.hpp
+++ b/Operators/BwdTrans/BwdTransSumFac.hpp
@@ -2,24 +2,31 @@
 
 namespace Nektar::Operators::detail
 {
-    
+
 // sum-factorisation implementation
 template <typename TData>
 class OperatorBwdTransImpl<TData, ImplSumFac> : public OperatorBwdTrans<TData>
 {
 public:
+    OperatorBwdTransImpl(const MultiRegions::ExpListSharedPtr& expansionList)
+        : OperatorBwdTrans<TData>(std::move(expansionList))
+    {
+    }
+
     void apply(Field<TData, FieldState::Coeff> &in,
                Field<TData, FieldState::Phys> &out) override
     {
         std::cout << "Op bwd trans sum fac\n";
     }
 
-    static std::unique_ptr<Operator<TData>> instantiate()
+    static std::unique_ptr<Operator<TData>> instantiate(
+        const MultiRegions::ExpListSharedPtr& expansionList)
     {
-        return std::make_unique<OperatorBwdTransImpl<TData, ImplSumFac>>();
+        return std::make_unique<OperatorBwdTransImpl<TData, ImplSumFac>>(
+            std::move(expansionList));
     }
 
     static std::string className;
 };
 
-}
\ No newline at end of file
+} // namespace Nektar::Operators::detail
diff --git a/Operators/Operator.cpp b/Operators/Operator.cpp
index 4ff199615c11d57067b52d209bb6c668a0cfea4a..def9aed3988c5340e9111609775520e553909952 100644
--- a/Operators/Operator.cpp
+++ b/Operators/Operator.cpp
@@ -2,9 +2,8 @@
 
 namespace Nektar::Operators
 {
-    
-template< typename TData>
-OperatorFactory<TData> &GetOperatorFactory()
+
+template <typename TData> OperatorFactory<TData> &GetOperatorFactory()
 {
     static OperatorFactory<TData> instance;
     return instance;
@@ -12,4 +11,4 @@ OperatorFactory<TData> &GetOperatorFactory()
 
 template OperatorFactory<double> &GetOperatorFactory();
 
-}
\ No newline at end of file
+} // namespace Nektar::Operators
diff --git a/Operators/Operator.hpp b/Operators/Operator.hpp
index 1b7b0062dc2551ee55a9e588f4cbed6686afc9f5..54e0ea7000d5df450282f41e7a4ea9e41ede715b 100644
--- a/Operators/Operator.hpp
+++ b/Operators/Operator.hpp
@@ -2,7 +2,9 @@
 
 #include <string>
 
-#include "LibUtilities/NekFactory.hpp"
+#include <LibUtilities/BasicUtils/NekFactory.hpp>
+#include <MultiRegions/ExpList.h>
+//#include <StdRegions/StdExpansion.h>
 
 namespace Nektar::Operators
 {
@@ -11,41 +13,56 @@ namespace Nektar::Operators
 // allow extension by users without modifying library
 using default_fp_type = double;
 
-// Implementation types
-struct ImplLocMat;
+// Core implementation types
+struct ImplStdMat;
 struct ImplSumFac;
 struct ImplMatFree;
 struct ImplCUDA;
 
 // Forward-declare the Operator base class so we can define the factory
-template< typename TData> class Operator;
+template <typename TData> class Operator;
 
 // Typename alias for the factory
-template< typename TData>
+template <typename TData>
 using OperatorFactory =
-    Nektar::LibUtilities::NekFactory<std::string, Operator<TData>>;
+    Nektar::LibUtilities::NekFactory<std::string, Operator<TData>,
+                                     const MultiRegions::ExpListSharedPtr&>;
 
 // Operator factory singleton
-template< typename TData>
-OperatorFactory<TData> &GetOperatorFactory();
+template <typename TData> OperatorFactory<TData> &GetOperatorFactory();
 
-template <typename TData>
-class Operator
+template <typename TData> class Operator
 {
 public:
-    template< typename TDescriptor>
-    static std::unique_ptr<typename TDescriptor::class_name> create(std::string pKey = "")
+    virtual ~Operator() = default;
+
+    Operator(const MultiRegions::ExpListSharedPtr& expansionList)
+        : m_expansionList(std::move(expansionList))
+    {
+    }
+
+    template <typename TDescriptor>
+    static std::shared_ptr<typename TDescriptor::class_name> create(
+        const MultiRegions::ExpListSharedPtr& expansionList,
+        std::string pKey = "")
     {
         std::string key = TDescriptor::key;
-        if (pKey.empty()) {
+        if (pKey.empty())
+        {
             key += TDescriptor::default_impl;
         }
-        else {
+        else
+        {
             key += pKey;
         }
-        auto x = GetOperatorFactory<TData>().CreateInstance(key);
-        return std::unique_ptr<typename TDescriptor::class_name>(static_cast<typename TDescriptor::class_name*>(x.release()));
+
+        return std::static_pointer_cast<typename TDescriptor::class_name>(
+            GetOperatorFactory<TData>().CreateInstance(
+                key, std::move(expansionList)));
     }
+
+protected:
+    MultiRegions::ExpListSharedPtr m_expansionList;
 };
 
-}
\ No newline at end of file
+} // namespace Nektar::Operators
diff --git a/Operators/OperatorBwdTrans.cpp b/Operators/OperatorBwdTrans.cpp
index 37aa2f859b78a8ee17d6c7fe04bfabd4f0103126..58ccec285115fde0e8edad5b742452ec078e5225 100644
--- a/Operators/OperatorBwdTrans.cpp
+++ b/Operators/OperatorBwdTrans.cpp
@@ -2,8 +2,8 @@
 
 namespace Nektar::Operators
 {
-    
-template<> const std::string BwdTrans<>::key = "BwdTrans";
-template<> const std::string BwdTrans<>::default_impl = "MatFree";
 
-}
\ No newline at end of file
+template <> const std::string BwdTrans<>::key          = "BwdTrans";
+template <> const std::string BwdTrans<>::default_impl = "MatFree";
+
+} // namespace Nektar::Operators
diff --git a/Operators/OperatorBwdTrans.hpp b/Operators/OperatorBwdTrans.hpp
index a95b67a4c15a83dd1038af68a385d6c23876ab30..d5108b4232d39b0745a16c73c311e0e700476b2f 100644
--- a/Operators/OperatorBwdTrans.hpp
+++ b/Operators/OperatorBwdTrans.hpp
@@ -8,39 +8,47 @@ namespace Nektar::Operators
 
 // BwdTrans base class
 // Defines the apply operator to enforce apply parameter types
-template <typename TData>
-class OperatorBwdTrans : public Operator<TData>
+template <typename TData> class OperatorBwdTrans : public Operator<TData>
 {
 public:
+    virtual ~OperatorBwdTrans() = default;
+
+    OperatorBwdTrans(const MultiRegions::ExpListSharedPtr& expansionList)
+        : Operator<TData>(std::move(expansionList))
+    {
+    }
+
     virtual void apply(Field<TData, FieldState::Coeff> &in,
                        Field<TData, FieldState::Phys> &out) = 0;
     virtual void operator()(Field<TData, FieldState::Coeff> &in,
-                       Field<TData, FieldState::Phys> &out)
+                            Field<TData, FieldState::Phys> &out)
     {
         apply(in, out);
     }
 };
 
 // Descriptor / traits class for BwdTrans
-template<typename TData = default_fp_type>
-struct BwdTrans {
+template <typename TData = default_fp_type> struct BwdTrans
+{
     using class_name = OperatorBwdTrans<TData>;
-    using FieldIn = Field<TData, FieldState::Coeff>;
-    using FieldOut = Field<TData, FieldState::Phys>;
+    using FieldIn    = Field<TData, FieldState::Coeff>;
+    using FieldOut   = Field<TData, FieldState::Phys>;
     static const std::string key;
     static const std::string default_impl;
 
-    static std::unique_ptr<class_name> create(std::string pKey = "")
+    static std::shared_ptr<class_name> create(
+        const MultiRegions::ExpListSharedPtr& expansionList,
+        std::string pKey = "")
     {
-        return Operator<TData>::template create<BwdTrans>(pKey);
+        return Operator<TData>::template create<BwdTrans>(
+            std::move(expansionList), pKey);
     }
 };
 
-namespace detail {
+namespace detail
+{
 // Template for BwdTrans implementations
-template <typename TData, typename Op>
-class OperatorBwdTransImpl;
-}
-
+template <typename TData, typename Op> class OperatorBwdTransImpl;
+} // namespace detail
 
-}
+} // namespace Nektar::Operators
diff --git a/main.cpp b/main.cpp
index 3f14fba40462ccc790c1e8d6728d1658c27540f0..a47fcb3910e37932fb0569d214cb8f55a4f89131 100644
--- a/main.cpp
+++ b/main.cpp
@@ -4,10 +4,10 @@
  * @brief Demonstrator program for the new Field class.
  * @version 0.1
  * @date 2023-02-13
- * 
+ *
  * @copyright Copyright (c) 2023 Imperial College London, University of Utah,
  * Kings College London
- * 
+ *
  */
 #include <iostream>
 #include <memory>
@@ -15,24 +15,72 @@
 #include "Field.hpp"
 #include "Operators/OperatorBwdTrans.hpp"
 
+#include <LibUtilities/BasicUtils/SessionReader.h>
+//#include <LibUtilities/SimdLib/tinysimd.hpp>
+#include <SpatialDomains/MeshGraph.h>
+#include <MultiRegions/ExpList.h>
+
 #ifdef NEKTAR_USE_CUDA
 #include "MemoryRegionCUDA.hpp"
 #endif
 
 using namespace Nektar::Operators;
+using namespace Nektar::LibUtilities;
+using namespace Nektar;
+
+template <typename TType, FieldState State>
+std::ostream &operator<<(std::ostream &stream, Field<TType, State> &f)
+{
+    auto *x = f.GetStorage().GetCPUPtr();
+    for (size_t i = 0; i < f.GetStorage().size(); ++i)
+    {
+        stream << x[i] << ' ';
+    }
+    stream << std::endl;
+
+    return stream;
+}
+
+std::vector<BlockAttributes> GetBlockAttributes(
+        FieldState state,
+        const MultiRegions::ExpListSharedPtr explist)
+{
+    const int n = explist->GetNumElmts();
+    std::map<std::tuple<LibUtilities::ShapeType,unsigned int,unsigned int>,size_t> blockList;
+    for (int i = 0; i < explist->GetNumElmts(); ++i)
+    {
+        auto e = explist->GetExp(i);
+        blockList[{e->DetShapeType(),e->GetNcoeffs(),e->GetTotPoints()}]++;
+    }
+    std::vector<BlockAttributes> blockAttr;
+    for (auto &x : blockList)
+    {
+        auto val = state == FieldState::Phys ? std::get<2>(x.first) : std::get<1>(x.first);
+        blockAttr.push_back( { x.second, val } );
+    }
+    return blockAttr;
+}
 
-int main()
+int main(int argc, char *argv[])
 {
-    // Define the structure of the computational domain. In this case, we
-    // suppose it consists of two composites: 10 quadrilaterals and 20
-    // triangles, each with 4 modes / 4 quadrature points.
-    std::vector<BlockAttributes> blocks = {
-        {ShapeType::eQuadrilateral, {4, 4, 1}, 10},
-        {ShapeType::eTriangle, {4, 4, 1}, 20}};
+    // Initialise a session, graph and create an expansion list
+    LibUtilities::SessionReaderSharedPtr session;
+    SpatialDomains::MeshGraphSharedPtr   graph;
+    MultiRegions::ExpListSharedPtr       explist;
+
+    session = LibUtilities::SessionReader::CreateInstance(argc, argv);
+    graph   = SpatialDomains::MeshGraph::Read(session);
+    explist = MemoryManager<MultiRegions::ExpList>::AllocateSharedPtr
+                    (session, graph);
+
+    // Generate a blocks definition from the expansion list for each state
+    auto blocks_phys  = GetBlockAttributes(FieldState::Phys,  explist);
+    auto blocks_coeff = GetBlockAttributes(FieldState::Coeff, explist);
 
     // Create two Field objects with a MemoryRegionCPU backend by default
-    auto in  = Field<double, FieldState::Coeff>::create(blocks);
-    auto out = Field<double, FieldState::Phys >::create(blocks);
+    auto in  = Field<double, FieldState::Coeff>::create(blocks_coeff);
+    auto in2 = Field<double, FieldState::Coeff>::create(blocks_coeff);
+    auto out = Field<double, FieldState::Phys >::create(blocks_phys);
 
     // Populate the field with some data. In this case, we just grab a pointer
     // to the memory on the CPU and populate the array with values. Operators,
@@ -44,13 +92,66 @@ int main()
     // implicitly transfer the data from the GPU to the host. The intention is
     // that all operators support multiple implementations and the appropriate
     // choice is made based on the backend selected.
-    double *x      = in.GetStorage().GetCPUPtr();
-    const size_t n = in.GetStorage().size();
-    for (size_t i = 0; i < n; ++i)
+    double *x = in.GetStorage().GetCPUPtr();
+    for (auto const &block : blocks_coeff)
+    {
+        for (size_t el = 0; el < block.num_elements; ++el)
+        {
+            for (size_t coeff = 0; coeff < block.num_pts; ++coeff)
+            {
+                // Each element is the index of the degree of freedom
+                // this is useful for testing reshapes
+                *(x++) = coeff;
+            }
+        }
+    }
+
+    memset(out.GetStorage().GetCPUPtr(), 0, out.GetStorage().size()*sizeof(double));
+
+    std::cout << "Initial shape:\n" << in << std::endl << std::endl;
+
+    // Test out field reshaping
+    in.ReshapeStorage<4>();
+    std::cout << "Reshaped to 4:\n" << in << std::endl << std::endl;
+
+#ifdef NEKTAR_ENABLE_SIMD_AVX2
+    // Test out SIMD instructions
+    using vec_t = tinysimd::simd<double>;
+
+    // Reshape into vec_t::width, with the right memory alignment requirements
+    in.ReshapeStorage<vec_t::width, vec_t::alignment>();
+    std::cout << "SIMD In:\n" << in << std::endl << std::endl;
+
+    in2.ReshapeStorage<vec_t::width, vec_t::alignment>();
+    std::cout << "SIMD Out (before):\n" << out << std::endl << std::endl;
+
+    // Reinterpret casting from double to vector type allows for efficient
+    // conversion to SIMD intrinsics
+    vec_t::vectorType *inptr =
+        reinterpret_cast<vec_t::vectorType *>(in.GetStorage().GetCPUPtr());
+    vec_t::scalarType *in2ptr = in2.GetStorage().GetCPUPtr();
+
+    // Loop over each block in the field and square each element
+    for (auto const &block : blocks_coeff)
     {
-        x[i] = i;
+        const size_t numMetaBlocks = block.num_elements / vec_t::width;
+        for (size_t metaBlock = 0; metaBlock < numMetaBlocks * block.num_pts;
+             ++metaBlock)
+        {
+            (vec_t(inptr[metaBlock]) * vec_t(inptr[metaBlock])).store(in2ptr);
+            in2ptr += vec_t::width;
+        }
+
+        inptr += numMetaBlocks * block.num_pts;
     }
 
+    // Back to non-interleaved for non-SIMD Operators
+    in.ReshapeStorage<1>();
+    in2.ReshapeStorage<1>();
+
+    std::cout << "Out:\n" << in2 << std::endl;
+#endif
+
     // Operators are instantiated using a Factory pattern. First lets check
     // which operators have been registered with the Factory.
     GetOperatorFactory<double>().PrintAvailableClasses();
@@ -58,49 +159,71 @@ int main()
     // We can create a BwdTrans operator (default implementation)
     // Default implementation might be provided through configuration options,
     // benchmarking, etc eventually.
-    auto bt = BwdTrans<>::create();
+    auto bt = BwdTrans<>::create(explist);
     // ...and then apply it
     bt->apply(in, out);
 
     // We can combine these for one-shot operations
-    BwdTrans<>::create()->apply(in, out);
+    BwdTrans<>::create(explist)->apply(in, out);
     // We can also explicitly select the implementation of the operator to use
-    BwdTrans<>::create("SumFac")->apply(in, out);
+    BwdTrans<>::create(explist, "SumFac")->apply(in, out);
 
     // Let's display the result
-    double *y = out.GetStorage().GetCPUPtr();
-    for (size_t i = 0; i < n; ++i)
+    std::cout << out << std::endl;
+
+    in              = Field<double, FieldState::Coeff>::create(blocks_coeff);
+    auto &inStorage = in.GetStorage();
+    std::fill(inStorage.GetCPUPtr(), inStorage.GetCPUPtr() + inStorage.size(),
+              0);
+    out = Field<double, FieldState::Phys>::create(blocks_phys);
+
+    auto &outStorage = out.GetStorage();
+    std::fill(outStorage.GetCPUPtr(),
+              outStorage.GetCPUPtr() + outStorage.size(), 0);
+
+    for (auto const &block : in.GetBlocks())
     {
-        std::cout << y[i] << std::endl;
+        for (size_t el = 0; el < block.num_elements; ++el)
+        {
+            in.GetStorage().GetCPUPtr()[el * block.num_pts] = 1;
+        }
     }
 
+    std::cout << in << std::endl;
+
+    BwdTrans<>::create(explist, "StdMat")->apply(in, out);
+
+    std::cout << out << std::endl;
+
+
 #ifdef NEKTAR_USE_CUDA
 
     // Test CUDA MemoryRegion
 
     // Create two Fields with memory on the GPU
-    in  = Field<double, FieldState::Coeff>::create<MemoryRegionCUDA>(blocks);
-    out = Field<double, FieldState::Phys >::create<MemoryRegionCUDA>(blocks);
+    in  = Field<double, FieldState::Coeff>::create<MemoryRegionCUDA>(blocks_coeff);
+    out = Field<double, FieldState::Phys>::create<MemoryRegionCUDA>(blocks_phys);
 
     // Perform the BwdTrans on the fields using the CUDA implementation
     // Since this is a CUDA operator, acting on CUDA fields, everything happens
     // on the GPU.
-    BwdTrans<>::create("CUDA")->apply(in, out);
+    BwdTrans<>::create(explist, "CUDA")->apply(in, out);
 
     // Test the GPU-backed fields with a CPU operator
     // This should show a debug warning due to the implicit conversion. The
     // purpose of this is to allow us to transition the code to the new
     // infrastructure and add CUDA operators, without the need to add ALL CUDA
     // operators before we can test anything.
-    BwdTrans<>::create("MatFree")->apply(in, out);
+    BwdTrans<>::create(explist, "MatFree")->apply(in, out);
 
     // Create two CPU backed fields and use them with a GPU operator
     // This call implicitly converts the fields to a GPU backend and warns the
     // user about that fact
     in  = Field<double, FieldState::Coeff>::create(blocks);
     out = Field<double, FieldState::Phys>::create(blocks);
-    BwdTrans<>::create("CUDA")->apply(in, out);
+    BwdTrans<>::create(explist, "CUDA")->apply(in, out);
 #endif
 
+    std::cout << "END" << std::endl;
     return 0;
 }
diff --git a/square.geo b/square.geo
new file mode 100644
index 0000000000000000000000000000000000000000..ee9a66908d7bda260df522c03ab39f086ee5affd
--- /dev/null
+++ b/square.geo
@@ -0,0 +1,13 @@
+width=1;
+height=1;
+res=4.0;  // number of elements per unit length
+
+Point(1) = {-width/2,-height/2,0,0.1};
+l[] = Extrude {width,0,0} {
+      Point{1}; Layers{width*res}; Recombine;
+};
+s[] = Extrude {0,height,0} {
+      Line{l[1]}; Layers{height*res}; Recombine;
+};
+Physical Surface(0) = {s[1]};
+Physical Line(1) = {1,2,3,4};
diff --git a/square.xml b/square.xml
new file mode 100644
index 0000000000000000000000000000000000000000..63241c7866d35354fcb64c61d9edbac4379d3b72
--- /dev/null
+++ b/square.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<NEKTAR>
+    <GEOMETRY DIM="2" SPACE="2">
+        <VERTEX COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJx1j8sVgjAABIOIgIrK324o3RYoIaV4yi7guBfey2SySwjbxM/+G0K24+sPPyFfxXN83/yMvlOg71y4f0knJfeLV+hH8Rp98+tx0GHnDbn/847cadhX/4P7xZ/oed8Lufe1f3pTOvZ1r+d+8QF3e9+I3Psm5M7Mvu69uV/8C2/eNsUA</VERTEX>
+        <EDGE COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJx1kEkSglAMBQEVZxxBHEARh/vf0I29oKvyN6l+FUI6STJ8aVCzoI6CyhurH54EnIuZN1U/fTMxfXMx8xbB90sx36+Ceet/5T7MLcTM3Yi5x1Y5/9uJ8d4rZ4+DGL+jcvYrxXhX2p+9T2L2rgOPs3J8LmI8rsrxvInxaJTj34rxu2t/7vIQc5dOjMdTOffqxXi8lHPHtxiPj3Lu+xXj9wOocAa/</EDGE>
+        <ELEMENT>
+            <Q COMPRESSED="B64Z-LittleEndian" BITSIZE="64">eJx10EcOwjAABdFQ0kkPvYTO/W/IZrxgJLx50kSyvxJFv2eGc1z86UuMMdH30FPMMNe9oRdY4krvVFhjo12xeoud7knUexy0N1Ufca29Yf8Gt7jT3lx9jwftLdSPeNLeUv2MF+0N/3HCK960t1K/40N7a/UnvrS3UX/jR3u/iOQFQAAA</Q>
+        </ELEMENT>
+        <COMPOSITE>
+            <C ID="0"> Q[0-15] </C>
+            <C ID="1"> E[0,13,22,31,11,21,30,39,3,6,9,12,32,34,36,38] </C>
+        </COMPOSITE>
+        <DOMAIN> C[0] </DOMAIN>
+    </GEOMETRY>
+    <Metadata>
+        <Provenance>
+            <GitBranch></GitBranch>
+            <GitSHA1>88b1cb3be29cc9090d8a14af19c26decd88345db</GitSHA1>
+            <Hostname>kingscross</Hostname>
+            <NektarVersion>5.1.0</NektarVersion>
+            <Timestamp>27-Apr-2023 15:32:08</Timestamp>
+        </Provenance>
+        <NekMeshCommandLine>square.msh square.xml </NekMeshCommandLine>
+    </Metadata>
+    <EXPANSIONS>
+        <E COMPOSITE="C[0]" NUMMODES="4" TYPE="MODIFIED" FIELDS="u" />
+    </EXPANSIONS>
+    <CONDITIONS>
+        <SOLVERINFO>
+            <I PROPERTY="EQTYPE" VALUE="Helmholtz" />
+        </SOLVERINFO>
+        <VARIABLES>
+            <V ID="0">u</V>
+        </VARIABLES>
+        <FUNCTION NAME="Forcing">
+            <E VAR="u" VALUE="0" />
+        </FUNCTION>
+    </CONDITIONS>
+</NEKTAR>