diff --git a/.gitlab-ci/packaging.yml b/.gitlab-ci/packaging.yml index b734089ad8eb6c22d4041fa32c637882d9ec61a3..d6457be0272cd06088fc696ffe672bb68929b5da 100644 --- a/.gitlab-ci/packaging.yml +++ b/.gitlab-ci/packaging.yml @@ -33,7 +33,7 @@ tags: - pkg variables: - GIT_SUBMODULE_STRATEGY: normal + GIT_SUBMODULE_STRATEGY: recursive script: - OS_DIST=$(echo $CI_JOB_NAME | cut -d- -f 2) - OS_VERSION=$(echo $CI_JOB_NAME | cut -d- -f 3) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f4dfb3c9876a09c4933f50956b18a636c30f23d..f62b8667744116e660efc505b12eeb2d0b9b0788 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ v5.6.0 - Fix MPI communicator for Parallel-in-Time (!1801) - Fix warnings issued by MSVC compiler in LibUtilities and StdRegions (!1740) - Fix Parareal convergence monitoring (!1802) +- Avoid copying input/output to/from padded aligned arrays in MatrixFree operators (!1783) **CompressibleFlowSolver** - Complete second Frechet derivative implementation (!1761) diff --git a/library/Collections/BwdTrans.cpp b/library/Collections/BwdTrans.cpp index 84c5c5f03fa7f5162e1df4478b9425eb83adc644..2ee0f0a314006e7362baf730f0325cc132e98b99 100644 --- a/library/Collections/BwdTrans.cpp +++ b/library/Collections/BwdTrans.cpp @@ -155,7 +155,7 @@ OperatorKey BwdTrans_StdMat::m_typeArr[] = { * @brief Backward transform operator using matrix free operators. */ class BwdTrans_MatrixFree final : virtual public Operator, - MatrixFreeOneInOneOut, + MatrixFreeBase, virtual public BwdTrans_Helper { public: @@ -169,19 +169,7 @@ public: [[maybe_unused]] Array<OneD, NekDouble> &output2, [[maybe_unused]] Array<OneD, NekDouble> &wsp) final { - if (m_isPadded) - { - // copy into padded vector - Vmath::Vcopy(m_nIn, input, 1, m_input, 1); - // call op - (*m_oper)(m_input, m_output); - // copy out of padded vector - Vmath::Vcopy(m_nOut, m_output, 1, output0, 1); - } - else - { - (*m_oper)(input, output0); - } + (*m_oper)(input, output0); } void operator()([[maybe_unused]] int dir, @@ -200,9 +188,9 @@ private: CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors) : Operator(pCollExp, pGeomData, factors), BwdTrans_Helper(), - MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetNcoeffs(), - pCollExp[0]->GetStdExp()->GetTotPoints(), - pCollExp.size()) + MatrixFreeBase(pCollExp[0]->GetStdExp()->GetNcoeffs(), + pCollExp[0]->GetStdExp()->GetTotPoints(), + pCollExp.size()) { // Basis vector. const auto dim = pCollExp[0]->GetStdExp()->GetShapeDimension(); @@ -219,7 +207,7 @@ private: std::string op_string = "BwdTrans"; op_string += MatrixFree::GetOpstring(shapeType, false); auto oper = MatrixFree::GetOperatorFactory().CreateInstance( - op_string, basis, m_nElmtPad); + op_string, basis, pCollExp.size()); m_oper = std::dynamic_pointer_cast<MatrixFree::BwdTrans>(oper); ASSERTL0(m_oper, "Failed to cast pointer."); diff --git a/library/Collections/Helmholtz.cpp b/library/Collections/Helmholtz.cpp index 93706346fa16ff61e1657002e3e1a70b08a61b0c..001b3bbdf944fbc6ae1d57a06de43d7f4918de80 100644 --- a/library/Collections/Helmholtz.cpp +++ b/library/Collections/Helmholtz.cpp @@ -583,7 +583,7 @@ OperatorKey Helmholtz_IterPerExp::m_typeArr[] = { * @brief Helmholtz operator using matrix free operators.
*/ class Helmholtz_MatrixFree final : virtual public Operator, - MatrixFreeOneInOneOut, + MatrixFreeBase, Helmholtz_Helper { public: @@ -597,17 +597,7 @@ public: [[maybe_unused]] Array<OneD, NekDouble> &output2, [[maybe_unused]] Array<OneD, NekDouble> &wsp) final { - if (m_isPadded) - { - // copy into padded vector - Vmath::Vcopy(m_nmtot, input, 1, m_input, 1); - (*m_oper)(m_input, m_output); - Vmath::Vcopy(m_nmtot, m_output, 1, output0, 1); - } - else - { - (*m_oper)(input, output0); - } + (*m_oper)(input, output0); } void operator()([[maybe_unused]] int dir, @@ -723,9 +713,9 @@ private: CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors) : Operator(pCollExp, pGeomData, factors), - MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetNcoeffs(), - pCollExp[0]->GetStdExp()->GetNcoeffs(), - pCollExp.size()), + MatrixFreeBase(pCollExp[0]->GetStdExp()->GetNcoeffs(), + pCollExp[0]->GetStdExp()->GetNcoeffs(), + pCollExp.size()), Helmholtz_Helper() { @@ -747,7 +737,7 @@ private: std::string op_string = "Helmholtz"; op_string += MatrixFree::GetOpstring(shapeType, m_isDeformed); auto oper = MatrixFree::GetOperatorFactory().CreateInstance( - op_string, basis, m_nElmtPad); + op_string, basis, pCollExp.size()); // Set Jacobian oper->SetJac(pGeomData->GetJacInterLeave(pCollExp, m_nElmtPad)); diff --git a/library/Collections/IProductWRTBase.cpp b/library/Collections/IProductWRTBase.cpp index 80ccf759d482a6e9d82779539c42e54d576c7724..d50bc3972548945b83fefdafc926d85cd444f73e 100644 --- a/library/Collections/IProductWRTBase.cpp +++ b/library/Collections/IProductWRTBase.cpp @@ -175,7 +175,7 @@ OperatorKey IProductWRTBase_StdMat::m_typeArr[] = { * @brief Inner product operator using operator using matrix free operators. */ class IProductWRTBase_MatrixFree final : virtual public Operator, - MatrixFreeOneInOneOut, + MatrixFreeBase, virtual public IProductWRTBase_Helper { public: @@ -189,19 +189,7 @@ public: [[maybe_unused]] Array<OneD, NekDouble> &output2, [[maybe_unused]] Array<OneD, NekDouble> &wsp) final { - if (m_isPadded) - { - // copy into padded vector - Vmath::Vcopy(m_nIn, input, 1, m_input, 1); - // call op - (*m_oper)(m_input, m_output); - // copy out of padded vector - Vmath::Vcopy(m_nOut, m_output, 1, output, 1); - } - else - { - (*m_oper)(input, output); - } + (*m_oper)(input, output); } void operator()([[maybe_unused]] int dir, @@ -219,9 +207,9 @@ private: vector<StdRegions::StdExpansionSharedPtr> pCollExp, CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors) : Operator(pCollExp, pGeomData, factors), IProductWRTBase_Helper(), - MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetTotPoints(), - pCollExp[0]->GetStdExp()->GetNcoeffs(), - pCollExp.size()) + MatrixFreeBase(pCollExp[0]->GetStdExp()->GetTotPoints(), + pCollExp[0]->GetStdExp()->GetNcoeffs(), + pCollExp.size()) { // Basis vector const auto dim = pCollExp[0]->GetStdExp()->GetShapeDimension(); @@ -238,7 +226,7 @@ private: std::string op_string = "IProduct"; op_string += MatrixFree::GetOpstring(shapeType, m_isDeformed); auto oper = MatrixFree::GetOperatorFactory().CreateInstance( - op_string, basis, m_nElmtPad); + op_string, basis, pCollExp.size()); // Set Jacobian oper->SetJac(pGeomData->GetJacInterLeave(pCollExp, m_nElmtPad)); diff --git a/library/Collections/IProductWRTDerivBase.cpp b/library/Collections/IProductWRTDerivBase.cpp index 83244f9cd24ec2e2f28b6d8df76261c0ec81f38e..534b85eb262003f78dbf05d6439f9a6deed86705 100644 --- a/library/Collections/IProductWRTDerivBase.cpp +++ 
b/library/Collections/IProductWRTDerivBase.cpp @@ -271,7 +271,7 @@ OperatorKey IProductWRTDerivBase_StdMat::m_typeArr[] = { * @brief Inner product operator using operator using matrix free operators. */ class IProductWRTDerivBase_MatrixFree final : virtual public Operator, - MatrixFreeMultiInOneOut, + MatrixFreeBase, IProductWRTDerivBase_Helper { public: @@ -286,66 +286,32 @@ public: [[maybe_unused]] Array<OneD, NekDouble> &wsp) final { Array<OneD, NekDouble> output; + Array<OneD, Array<OneD, NekDouble>> input(m_coordim); - if (m_isPadded) + // copy into padded vector + switch (m_coordim) { - // copy into padded vector - switch (m_coordim) - { - case 1: - output = entry1; - Vmath::Vcopy(m_nIn, entry0, 1, m_input[0], 1); - break; - case 2: - output = entry2; - Vmath::Vcopy(m_nIn, entry0, 1, m_input[0], 1); - Vmath::Vcopy(m_nIn, entry1, 1, m_input[1], 1); - break; - case 3: - output = entry3; - Vmath::Vcopy(m_nIn, entry0, 1, m_input[0], 1); - Vmath::Vcopy(m_nIn, entry1, 1, m_input[1], 1); - Vmath::Vcopy(m_nIn, entry2, 1, m_input[2], 1); - break; - default: - NEKERROR(ErrorUtil::efatal, "m_coordim value not valid"); - break; - } - - // call op - (*m_oper)(m_input, m_output); - // copy out of padded vector - Vmath::Vcopy(m_nOut, m_output, 1, output, 1); + case 1: + input[0] = entry0; + output = entry1; + break; + case 2: + input[0] = entry0; + input[1] = entry1; + output = entry2; + break; + case 3: + input[0] = entry0; + input[1] = entry1; + input[2] = entry2; + output = entry3; + break; + default: + NEKERROR(ErrorUtil::efatal, "coordim not valid"); + break; } - else - { - Array<OneD, Array<OneD, NekDouble>> input{m_coordim}; - // copy into padded vector - switch (m_coordim) - { - case 1: - output = entry1; - input[0] = entry0; - break; - case 2: - output = entry2; - input[0] = entry0; - input[1] = entry1; - break; - case 3: - output = entry3; - input[0] = entry0; - input[1] = entry1; - input[2] = entry2; - break; - default: - NEKERROR(ErrorUtil::efatal, "coordim not valid"); - break; - } - - (*m_oper)(input, output); - } + (*m_oper)(input, output); } void operator()([[maybe_unused]] int dir, @@ -358,19 +324,19 @@ public: private: std::shared_ptr<MatrixFree::IProductWRTDerivBase> m_oper; + int m_coordim; IProductWRTDerivBase_MatrixFree( vector<StdRegions::StdExpansionSharedPtr> pCollExp, CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors) : Operator(pCollExp, pGeomData, factors), - MatrixFreeMultiInOneOut(pCollExp[0]->GetCoordim(), - pCollExp[0]->GetStdExp()->GetTotPoints(), - pCollExp[0]->GetStdExp()->GetNcoeffs(), - pCollExp.size()), + MatrixFreeBase(pCollExp[0]->GetStdExp()->GetTotPoints(), + pCollExp[0]->GetStdExp()->GetNcoeffs(), + pCollExp.size()), IProductWRTDerivBase_Helper() { - // Check if deformed const auto dim = pCollExp[0]->GetStdExp()->GetShapeDimension(); + m_coordim = pCollExp[0]->GetCoordim(); // Basis vector std::vector<LibUtilities::BasisSharedPtr> basis(dim); @@ -386,7 +352,7 @@ private: std::string op_string = "IProductWRTDerivBase"; op_string += MatrixFree::GetOpstring(shapeType, m_isDeformed); auto oper = MatrixFree::GetOperatorFactory().CreateInstance( - op_string, basis, m_nElmtPad); + op_string, basis, pCollExp.size()); // Set Jacobian oper->SetJac(pGeomData->GetJacInterLeave(pCollExp, m_nElmtPad)); diff --git a/library/Collections/LinearAdvectionDiffusionReaction.cpp b/library/Collections/LinearAdvectionDiffusionReaction.cpp index 63be926dd62d762ef300efd6d12b467a738063ea..cc917c236adf401bc9e2481cc789f71c0215c3fc 100644 --- 
a/library/Collections/LinearAdvectionDiffusionReaction.cpp +++ b/library/Collections/LinearAdvectionDiffusionReaction.cpp @@ -548,7 +548,7 @@ OperatorKey LinearAdvectionDiffusionReaction_IterPerExp::m_typeArr[] = { */ class LinearAdvectionDiffusionReaction_MatrixFree final : virtual public Operator, - MatrixFreeOneInOneOut, + MatrixFreeBase, LinearAdvectionDiffusionReaction_Helper { public: @@ -562,17 +562,7 @@ public: [[maybe_unused]] Array<OneD, NekDouble> &output2, [[maybe_unused]] Array<OneD, NekDouble> &wsp) final { - if (m_isPadded) - { - // copy into padded vector - Vmath::Vcopy(m_nmtot, input, 1, m_input, 1); - (*m_oper)(m_input, m_output); - Vmath::Vcopy(m_nmtot, m_output, 1, output0, 1); - } - else - { - (*m_oper)(input, output0); - } + (*m_oper)(input, output0); } void operator()([[maybe_unused]] int dir, @@ -746,9 +736,9 @@ private: vector<StdRegions::StdExpansionSharedPtr> pCollExp, CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors) : Operator(pCollExp, pGeomData, factors), - MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetNcoeffs(), - pCollExp[0]->GetStdExp()->GetNcoeffs(), - pCollExp.size()), + MatrixFreeBase(pCollExp[0]->GetStdExp()->GetNcoeffs(), + pCollExp[0]->GetStdExp()->GetNcoeffs(), + pCollExp.size()), LinearAdvectionDiffusionReaction_Helper() { m_nmtot = m_numElmt * pCollExp[0]->GetStdExp()->GetNcoeffs(); @@ -770,7 +760,7 @@ private: std::string op_string = "LinearAdvectionDiffusionReaction"; op_string += MatrixFree::GetOpstring(shapeType, m_isDeformed); auto oper = MatrixFree::GetOperatorFactory().CreateInstance( - op_string, basis, m_nElmtPad); + op_string, basis, pCollExp.size()); // Get N quadpoints with padding m_nqtot = m_numElmt * pCollExp[0]->GetStdExp()->GetTotPoints(); diff --git a/library/Collections/MatrixFreeBase.h b/library/Collections/MatrixFreeBase.h index d3031f3fd64db105e2cddb3f4d1d8e2782c1dd5b..b12299e59f7601ac5e3196113c427dc6b0648b15 100644 --- a/library/Collections/MatrixFreeBase.h +++ b/library/Collections/MatrixFreeBase.h @@ -50,56 +50,6 @@ public: const unsigned int nCollSize) : m_nIn(nIn * nCollSize), m_nOut(nOut * nCollSize) { - } - -protected: - /// flag for padding - bool m_isPadded{false}; - /// size after padding - unsigned int m_nElmtPad; - unsigned int m_nIn; - unsigned int m_nOut; -}; - -class MatrixFreeOneInOneOut : protected MatrixFreeBase -{ -public: - /// Constructor - MatrixFreeOneInOneOut(const unsigned int nIn, const unsigned int nOut, - const unsigned int nCollSize) - : MatrixFreeBase(nIn, nOut, nCollSize) - { - // Padding if needed - using vec_t = tinysimd::simd<NekDouble>; - const auto nElmtNoPad = nCollSize; - m_nElmtPad = nElmtNoPad; - - if (nElmtNoPad % vec_t::width != 0) - { - m_isPadded = true; - m_nElmtPad = - nElmtNoPad + vec_t::width - (nElmtNoPad % vec_t::width); - m_input = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0}; - m_output = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; - } - } - -protected: - /// padded input/output vectors - Array<OneD, NekDouble> m_input, m_output; -}; - -class MatrixFreeMultiInOneOut : protected MatrixFreeBase -{ -public: - /// Constructor - MatrixFreeMultiInOneOut(const unsigned int coordim, const unsigned int nIn, - const unsigned int nOut, - const unsigned int nCollSize) - : MatrixFreeBase(nIn, nOut, nCollSize) - { - m_coordim = coordim; - // Padding if needed using vec_t = tinysimd::simd<NekDouble>; const auto nElmtNoPad = nCollSize; @@ -110,74 +60,19 @@ public: m_isPadded = true; m_nElmtPad = nElmtNoPad + vec_t::width - (nElmtNoPad % vec_t::width); - - 
m_input = Array<OneD, Array<OneD, NekDouble>>(m_coordim); - m_input[0] = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0}; - if (m_coordim == 2) - { - m_input[1] = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0}; - } - else if (m_coordim == 3) - { - m_input[1] = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0}; - m_input[2] = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0}; - } - m_output = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; } } protected: - /// coordinates dimension - unsigned short m_coordim; - /// padded input/output vectors - Array<OneD, Array<OneD, NekDouble>> m_input; - Array<OneD, NekDouble> m_output; + /// flag for padding + bool m_isPadded{false}; + /// size after padding + unsigned int m_nElmtPad; + /// actual size of input array + unsigned int m_nIn; + /// actual size of output array + unsigned int m_nOut; }; -class MatrixFreeOneInMultiOut : protected MatrixFreeBase -{ -public: - /// Constructor - MatrixFreeOneInMultiOut(const unsigned int coordim, const unsigned int nIn, - const unsigned int nOut, - const unsigned int nCollSize) - : MatrixFreeBase(nIn, nOut, nCollSize) - { - m_coordim = coordim; - - // Padding if needed - using vec_t = tinysimd::simd<NekDouble>; - const auto nElmtNoPad = nCollSize; - m_nElmtPad = nElmtNoPad; - - if (nElmtNoPad % vec_t::width != 0) - { - m_isPadded = true; - m_nElmtPad = - nElmtNoPad + vec_t::width - (nElmtNoPad % vec_t::width); - - m_input = Array<OneD, NekDouble>{nIn * m_nElmtPad, 0.0}; - - m_output = Array<OneD, Array<OneD, NekDouble>>(m_coordim); - m_output[0] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; - if (m_coordim == 2) - { - m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; - } - else if (m_coordim == 3) - { - m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; - m_output[2] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; - } - } - } - -protected: - /// coordinates dimension - unsigned short m_coordim; - /// padded input/output vectors - Array<OneD, NekDouble> m_input; - Array<OneD, Array<OneD, NekDouble>> m_output; -}; } // namespace Nektar::Collections #endif // NEKTAR_LIBRARY_COLLECTIONS_MATRIXFREEBASE_H diff --git a/library/Collections/PhysDeriv.cpp b/library/Collections/PhysDeriv.cpp index 6d6504e1bcaf72b9dd1713ee76c9d21b93b365d2..7c06af80516ec1f2a7f8f2241601b6a8133d83c1 100644 --- a/library/Collections/PhysDeriv.cpp +++ b/library/Collections/PhysDeriv.cpp @@ -267,7 +267,7 @@ OperatorKey PhysDeriv_StdMat::m_typeArr[] = { * @brief Phys deriv operator using matrix free operators.
*/ class PhysDeriv_MatrixFree final : virtual public Operator, - MatrixFreeOneInMultiOut, + MatrixFreeBase, virtual public PhysDeriv_Helper { public: @@ -281,87 +281,69 @@ Array<OneD, NekDouble> &output2, [[maybe_unused]] Array<OneD, NekDouble> &wsp) final { - if (m_isPadded) - { - // copy into padded vector - Vmath::Vcopy(m_nIn, input, 1, m_input, 1); - (*m_oper)(m_input, m_output); - } - else - { - (*m_oper)(input, m_output); - } - + Array<OneD, Array<OneD, NekDouble>> output(m_coordim); // currently using temporary local temporary space for output // to allow for other operator call below which is // directionally dependent switch (m_coordim) { case 1: - Vmath::Vcopy(m_nOut, m_output[0], 1, output0, 1); + output[0] = output0; break; case 2: - Vmath::Vcopy(m_nOut, m_output[0], 1, output0, 1); - Vmath::Vcopy(m_nOut, m_output[1], 1, output1, 1); + output[0] = output0; + output[1] = output1; break; case 3: - Vmath::Vcopy(m_nOut, m_output[0], 1, output0, 1); - Vmath::Vcopy(m_nOut, m_output[1], 1, output1, 1); - Vmath::Vcopy(m_nOut, m_output[2], 1, output2, 1); + output[0] = output0; + output[1] = output1; + output[2] = output2; break; default: NEKERROR(ErrorUtil::efatal, "Unknown coordinate dimension"); break; } + (*m_oper)(input, output); } void operator()(int dir, const Array<OneD, const NekDouble> &input, Array<OneD, NekDouble> &output, [[maybe_unused]] Array<OneD, NekDouble> &wsp) final { - if (m_isPadded) - { - // copy into padded vector - Vmath::Vcopy(m_nIn, input, 1, m_input, 1); - (*m_oper)(m_input, m_output); - } - else - { - (*m_oper)(input, m_output); - } + (*m_oper)(input, m_output); Vmath::Vcopy(m_nOut, m_output[dir], 1, output, 1); } private: std::shared_ptr<MatrixFree::PhysDeriv> m_oper; + int m_coordim; + Array<OneD, Array<OneD, NekDouble>> m_output; PhysDeriv_MatrixFree(vector<StdRegions::StdExpansionSharedPtr> pCollExp, CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors) : Operator(pCollExp, pGeomData, factors), PhysDeriv_Helper(), - MatrixFreeOneInMultiOut(pCollExp[0]->GetCoordim(), - pCollExp[0]->GetStdExp()->GetTotPoints(), - pCollExp[0]->GetStdExp()->GetTotPoints(), - pCollExp.size()) + MatrixFreeBase(pCollExp[0]->GetStdExp()->GetTotPoints(), + pCollExp[0]->GetStdExp()->GetTotPoints(), + pCollExp.size()) { // Check if deformed bool deformed{pGeomData->IsDeformed(pCollExp)}; const auto dim = pCollExp[0]->GetStdExp()->GetShapeDimension(); - if (m_isPadded == false) // declare local space non-padded case + // only used by operator(dir, in, out) + m_coordim = pCollExp[0]->GetCoordim(); + int nOut = pCollExp[0]->GetStdExp()->GetTotPoints(); + m_output = Array<OneD, Array<OneD, NekDouble>>(m_coordim); + m_output[0] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; + if (m_coordim == 2) { - int nOut = pCollExp[0]->GetStdExp()->GetTotPoints(); - m_output = Array<OneD, Array<OneD, NekDouble>>(m_coordim); - m_output[0] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; - if (m_coordim == 2) - { - m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; - } - else if (m_coordim == 3) - { - m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; - m_output[2] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; - } + m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; + } + else if (m_coordim == 3) + { + m_output[1] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; + m_output[2] = Array<OneD, NekDouble>{nOut * m_nElmtPad, 0.0}; } // Basis vector.
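For reference, the factory now receives the raw element count (pCollExp.size()) while m_nElmtPad keeps the SIMD round-up used to size the local m_output arrays above. A minimal standalone sketch of that round-up, assuming only that vec_t::width is the SIMD vector length (padElements is an illustrative name, not part of the library):

unsigned int padElements(unsigned int nElmt, unsigned int width)
{
    // Round nElmt up to the next multiple of width; e.g. 10 elements at
    // width 4 pad to 12, so the last block carries 2 padding elements.
    unsigned int rem = nElmt % width;
    return rem == 0 ? nElmt : nElmt + width - rem;
}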
@@ -378,7 +360,7 @@ private: std::string op_string = "PhysDeriv"; op_string += MatrixFree::GetOpstring(shapeType, deformed); auto oper = MatrixFree::GetOperatorFactory().CreateInstance( - op_string, basis, m_nElmtPad); + op_string, basis, pCollExp.size()); // Set derivative factors oper->SetDF(pGeomData->GetDerivFactorsInterLeave(pCollExp, m_nElmtPad)); diff --git a/library/Collections/PhysInterp1DScaled.cpp b/library/Collections/PhysInterp1DScaled.cpp index 3399015c4cf5dca8be7190b976fdfaa9268c4728..1f7c4bdb189ef03a7f32dfba6155dcf6bbf19175 100644 --- a/library/Collections/PhysInterp1DScaled.cpp +++ b/library/Collections/PhysInterp1DScaled.cpp @@ -182,7 +182,7 @@ private: */ class PhysInterp1DScaled_MatrixFree final : virtual public Operator, - MatrixFreeOneInOneOut, + MatrixFreeBase, PhysInterp1DScaled_Helper { public: @@ -198,19 +198,7 @@ public: [[maybe_unused]] Array<OneD, NekDouble> &output2, [[maybe_unused]] Array<OneD, NekDouble> &wsp) final { - if (m_isPadded) - { - // copy into padded vector - Vmath::Vcopy(m_nIn, input, 1, m_input, 1); - // call op - (*m_oper)(m_input, m_output); - // copy out of padded vector - Vmath::Vcopy(m_nOut, m_output, 1, output0, 1); - } - else - { - (*m_oper)(input, output0); - } + (*m_oper)(input, output0); } void operator()([[maybe_unused]] int dir, @@ -228,15 +216,15 @@ public: } private: - std::shared_ptr<MatrixFree::PhysInterp1DScaled> m_oper; + std::shared_ptr<MatrixFree::BwdTrans> m_oper; PhysInterp1DScaled_MatrixFree( vector<StdRegions::StdExpansionSharedPtr> pCollExp, CoalescedGeomDataSharedPtr pGeomData, StdRegions::FactorMap factors) : Operator(pCollExp, pGeomData, factors), - MatrixFreeOneInOneOut(pCollExp[0]->GetStdExp()->GetNcoeffs(), - pCollExp[0]->GetStdExp()->GetTotPoints(), - pCollExp.size()), + MatrixFreeBase(pCollExp[0]->GetStdExp()->GetNcoeffs(), + pCollExp[0]->GetStdExp()->GetTotPoints(), + pCollExp.size()), PhysInterp1DScaled_Helper() { // Basis vector. @@ -251,13 +239,12 @@ private: auto shapeType = pCollExp[0]->GetStdExp()->DetShapeType(); // Generate operator string and create operator. 
- std::string op_string = "PhysInterp1DScaled"; + std::string op_string = "BwdTrans"; op_string += MatrixFree::GetOpstring(shapeType, false); auto oper = MatrixFree::GetOperatorFactory().CreateInstance( - op_string, basis, m_nElmtPad); + op_string, basis, pCollExp.size()); - m_oper = - std::dynamic_pointer_cast<MatrixFree::PhysInterp1DScaled>(oper); + m_oper = std::dynamic_pointer_cast<MatrixFree::BwdTrans>(oper); ASSERTL0(m_oper, "Failed to cast pointer."); } }; diff --git a/library/LibUtilities/SimdLib/avx2.hpp b/library/LibUtilities/SimdLib/avx2.hpp index 25821a2073ec69aea1ad06bccfef7d221808e474..95ec832fdfeb0518563e9485093ffe10b4b2a876 100644 --- a/library/LibUtilities/SimdLib/avx2.hpp +++ b/library/LibUtilities/SimdLib/avx2.hpp @@ -605,6 +605,21 @@ inline avx2Double4 log(avx2Double4 in) #endif } +inline void load_unalign_interleave( + const double *in, const std::uint32_t dataLen, + std::vector<avx2Double4, allocator<avx2Double4>> &out) +{ + alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp; + for (size_t i = 0; i < dataLen; ++i) + { + tmp[0] = in[i]; + tmp[1] = in[i + dataLen]; + tmp[2] = in[i + 2 * dataLen]; + tmp[3] = in[i + 3 * dataLen]; + out[i].load(tmp); + } +} + inline void load_interleave( const double *in, std::uint32_t dataLen, std::vector<avx2Double4, allocator<avx2Double4>> &out) @@ -640,6 +655,21 @@ inline void load_interleave( } } +inline void deinterleave_unalign_store( + const std::vector<avx2Double4, allocator<avx2Double4>> &in, + const std::uint32_t dataLen, double *out) +{ + alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp; + for (size_t i = 0; i < dataLen; ++i) + { + in[i].store(tmp); + out[i] = tmp[0]; + out[i + dataLen] = tmp[1]; + out[i + 2 * dataLen] = tmp[2]; + out[i + 3 * dataLen] = tmp[3]; + } +} + inline void deinterleave_store( const std::vector<avx2Double4, allocator<avx2Double4>> &in, std::uint32_t dataLen, double *out) @@ -862,6 +892,25 @@ inline avx2Float8 log(avx2Float8 in) return ret; } +inline void load_unalign_interleave( + const float *in, const std::uint32_t dataLen, + std::vector<avx2Float8, allocator<avx2Float8>> &out) +{ + alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp; + for (size_t i = 0; i < dataLen; ++i) + { + tmp[0] = in[i]; + tmp[1] = in[i + dataLen]; + tmp[2] = in[i + 2 * dataLen]; + tmp[3] = in[i + 3 * dataLen]; + tmp[4] = in[i + 4 * dataLen]; + tmp[5] = in[i + 5 * dataLen]; + tmp[6] = in[i + 6 * dataLen]; + tmp[7] = in[i + 7 * dataLen]; + out[i].load(tmp); + } +} + inline void load_interleave(const float *in, std::uint32_t dataLen, std::vector<avx2Float8, allocator<avx2Float8>> &out) { @@ -898,6 +947,25 @@ inline void load_interleave( } } +inline void deinterleave_unalign_store( + const std::vector<avx2Float8, allocator<avx2Float8>> &in, + const std::uint32_t dataLen, float *out) +{ + alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp; + for (size_t i = 0; i < dataLen; ++i) + { + in[i].store(tmp); + out[i] = tmp[0]; + out[i + dataLen] = tmp[1]; + out[i + 2 * dataLen] = tmp[2]; + out[i + 3 * dataLen] = tmp[3]; + out[i + 4 * dataLen] = tmp[4]; + out[i + 5 * dataLen] = tmp[5]; + out[i + 6 * dataLen] = tmp[6]; + out[i + 7 * dataLen] = tmp[7]; + } +} + inline void deinterleave_store( const std::vector<avx2Float8, allocator<avx2Float8>> &in, std::uint32_t dataLen, float *out) diff --git a/library/LibUtilities/SimdLib/avx512.hpp b/library/LibUtilities/SimdLib/avx512.hpp index 518681ef71b0b92631da8c86d4e6415b23cbb841..d40c8f4959786371e23c4aad38064c9e98a67ca9
100644 --- a/library/LibUtilities/SimdLib/avx512.hpp +++ b/library/LibUtilities/SimdLib/avx512.hpp @@ -601,6 +601,25 @@ inline avx512Double8 log(avx512Double8 in) #endif } +inline void load_unalign_interleave( + const double *in, const std::uint32_t dataLen, + std::vector<avx512Double8, allocator<avx512Double8>> &out) +{ + alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp; + for (size_t i = 0; i < dataLen; ++i) + { + tmp[0] = in[i]; + tmp[1] = in[i + dataLen]; + tmp[2] = in[i + 2 * dataLen]; + tmp[3] = in[i + 3 * dataLen]; + tmp[4] = in[i + 4 * dataLen]; + tmp[5] = in[i + 5 * dataLen]; + tmp[6] = in[i + 6 * dataLen]; + tmp[7] = in[i + 7 * dataLen]; + out[i].load(tmp); + } +} + inline void load_interleave( const double *in, std::uint32_t dataLen, std::vector<avx512Double8, allocator<avx512Double8>> &out) @@ -640,6 +659,25 @@ inline void load_interleave( } } +inline void deinterleave_unalign_store( + const std::vector<avx512Double8, allocator<avx512Double8>> &in, + const std::uint32_t dataLen, double *out) +{ + alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp; + for (size_t i = 0; i < dataLen; ++i) + { + in[i].store(tmp); + out[i] = tmp[0]; + out[i + dataLen] = tmp[1]; + out[i + 2 * dataLen] = tmp[2]; + out[i + 3 * dataLen] = tmp[3]; + out[i + 4 * dataLen] = tmp[4]; + out[i + 5 * dataLen] = tmp[5]; + out[i + 6 * dataLen] = tmp[6]; + out[i + 7 * dataLen] = tmp[7]; + } +} + inline void deinterleave_store( + const std::vector<avx512Double8, allocator<avx512Double8>> &in, std::uint32_t dataLen, double *out) @@ -864,6 +902,33 @@ inline avx512Float16 log(avx512Float16 in) #endif } +inline void load_unalign_interleave( + const float *in, const std::uint32_t dataLen, + std::vector<avx512Float16, allocator<avx512Float16>> &out) +{ + alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp; + for (size_t i = 0; i < dataLen; ++i) + { + tmp[0] = in[i]; + tmp[1] = in[i + dataLen]; + tmp[2] = in[i + 2 * dataLen]; + tmp[3] = in[i + 3 * dataLen]; + tmp[4] = in[i + 4 * dataLen]; + tmp[5] = in[i + 5 * dataLen]; + tmp[6] = in[i + 6 * dataLen]; + tmp[7] = in[i + 7 * dataLen]; + tmp[8] = in[i + 8 * dataLen]; + tmp[9] = in[i + 9 * dataLen]; + tmp[10] = in[i + 10 * dataLen]; + tmp[11] = in[i + 11 * dataLen]; + tmp[12] = in[i + 12 * dataLen]; + tmp[13] = in[i + 13 * dataLen]; + tmp[14] = in[i + 14 * dataLen]; + tmp[15] = in[i + 15 * dataLen]; + out[i].load(tmp); + } +} + inline void load_interleave( const float *in, std::uint32_t dataLen, std::vector<avx512Float16, allocator<avx512Float16>> &out) @@ -871,8 +936,22 @@ inline void load_interleave( alignas(avx512Float16::alignment) avx512Float16::scalarIndexType tmp[avx512Float16::width] = { - 0, dataLen, 2 * dataLen, 3 * dataLen, - 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen}; + 0, + dataLen, + 2 * dataLen, + 3 * dataLen, + 4 * dataLen, + 5 * dataLen, + 6 * dataLen, + 7 * dataLen, + 8 * dataLen, + 9 * dataLen, + 10 * dataLen, + 11 * dataLen, + 12 * dataLen, + 13 * dataLen, + 14 * dataLen, + 15 * dataLen}; using index_t = avx512Int16<avx512Float16::scalarIndexType>; index_t index0(tmp); @@ -903,6 +982,33 @@ inline void load_interleave( } } +inline void deinterleave_unalign_store( + const std::vector<avx512Float16, allocator<avx512Float16>> &in, + const std::uint32_t dataLen, float *out) +{ + alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp; + for (size_t i = 0; i < dataLen; ++i) + { + in[i].store(tmp); + out[i] = tmp[0]; + out[i + dataLen] = tmp[1]; + out[i + 2 * dataLen] = tmp[2]; + out[i + 3 * dataLen] =
tmp[3]; + out[i + 4 * dataLen] = tmp[4]; + out[i + 5 * dataLen] = tmp[5]; + out[i + 6 * dataLen] = tmp[6]; + out[i + 7 * dataLen] = tmp[7]; + out[i + 8 * dataLen] = tmp[8]; + out[i + 9 * dataLen] = tmp[9]; + out[i + 10 * dataLen] = tmp[10]; + out[i + 11 * dataLen] = tmp[11]; + out[i + 12 * dataLen] = tmp[12]; + out[i + 13 * dataLen] = tmp[13]; + out[i + 14 * dataLen] = tmp[14]; + out[i + 15 * dataLen] = tmp[15]; + } +} + inline void deinterleave_store( const std::vector<avx512Float16, allocator<avx512Float16>> &in, std::uint32_t dataLen, float *out) @@ -911,8 +1017,22 @@ inline void deinterleave_store( alignas(avx512Float16::alignment) avx512Float16::scalarIndexType tmp[avx512Float16::width] = { - 0, dataLen, 2 * dataLen, 3 * dataLen, - 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen}; + 0, + dataLen, + 2 * dataLen, + 3 * dataLen, + 4 * dataLen, + 5 * dataLen, + 6 * dataLen, + 7 * dataLen, + 8 * dataLen, + 9 * dataLen, + 10 * dataLen, + 11 * dataLen, + 12 * dataLen, + 13 * dataLen, + 14 * dataLen, + 15 * dataLen}; using index_t = avx512Int16<avx512Float16::scalarIndexType>; index_t index0(tmp); diff --git a/library/LibUtilities/SimdLib/scalar.hpp b/library/LibUtilities/SimdLib/scalar.hpp index dc795a73ff707d718e307e97258765f84cdbfbc4..8eadd62216885e8a68f30193cdf80666a56b6cac 100644 --- a/library/LibUtilities/SimdLib/scalar.hpp +++ b/library/LibUtilities/SimdLib/scalar.hpp @@ -306,7 +306,18 @@ template <typename T> inline scalarT<T> log(scalarT<T> in) } template <typename T> -inline void load_interleave(const T *in, size_t dataLen, +inline void load_unalign_interleave( + const T *in, const size_t dataLen, + std::vector<scalarT<T>, allocator<scalarT<T>>> &out) +{ + for (size_t i = 0; i < dataLen; ++i) + { + out[i] = in[i]; + } +} + +template <typename T> +inline void load_interleave(const T *in, const size_t dataLen, std::vector<scalarT<T>, allocator<scalarT<T>>> &out) { for (size_t i = 0; i < dataLen; ++i) @@ -315,10 +326,21 @@ inline void load_interleave(const T *in, size_t dataLen, } } +template <typename T> +inline void deinterleave_unalign_store( + const std::vector<scalarT<T>, allocator<scalarT<T>>> &in, + const size_t dataLen, T *out) +{ + for (size_t i = 0; i < dataLen; ++i) + { + out[i] = in[i]._data; + } +} + template <typename T> inline void deinterleave_store( - const std::vector<scalarT<T>, allocator<scalarT<T>>> &in, size_t dataLen, - T *out) + const std::vector<scalarT<T>, allocator<scalarT<T>>> &in, + const size_t dataLen, T *out) { for (size_t i = 0; i < dataLen; ++i) { diff --git a/library/MatrixFreeOps/BwdTrans.h b/library/MatrixFreeOps/BwdTrans.h index 51b1d73a3c3618633b0eda60308322a9cf210c11..e81cb44ae58091c1f1649781372ed98bdaa40082 100644 --- a/library/MatrixFreeOps/BwdTrans.h +++ b/library/MatrixFreeOps/BwdTrans.h @@ -126,20 +126,44 @@ struct BwdTransTemplate std::vector<vec_t, allocator<vec_t>> tmpIn(m_nmTot), tmpOut(nqTot); - for (int e = 0; e < this->m_nBlocks; ++e) + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); + + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); BwdTrans1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, this->m_bdata[0], tmpOut); // de-interleave and store 
data - deinterleave_store(tmpOut, nqTot, outptr); + // deinterleave_store(tmpOut, nqTot, locField); + // std::copy(locField, locField + nqBlocks, outptr); + deinterleave_unalign_store(tmpOut, nqTot, outptr); inptr += nmBlocks; outptr += nqBlocks; } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); + + BwdTrans1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, this->m_bdata[0], + tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, nqTot, locField); + acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(locField, locField + acturalSize, outptr); + } + // free aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -159,20 +183,41 @@ struct BwdTransTemplate std::vector<vec_t, allocator<vec_t>> tmpIn(m_nmTot), tmpOut(nqTot); - for (int e = 0; e < this->m_nBlocks; ++e) + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqBlocks]; + + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); BwdTrans1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, this->m_bdata[0], tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); + // deinterleave_store(tmpOut, nqTot, locField); + // std::copy(locField, locField + nqBlocks, outptr); + deinterleave_unalign_store(tmpOut, nqTot, outptr); inptr += nmBlocks; outptr += nqBlocks; } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); + + BwdTrans1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, this->m_bdata[0], + tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, nqTot, locField); + acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(locField, locField + acturalSize, outptr); + } } #elif defined(SHAPE_DIMENSION_2D) @@ -203,21 +248,46 @@ struct BwdTransTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(m_nmTot), tmpOut(nqTot); - for (int e = 0; e < this->m_nBlocks; ++e) + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); + + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0], this->m_bdata[1], wsp0, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); + // deinterleave_store(tmpOut, nqTot, locField); + // std::copy(locField, locField + nqBlocks, outptr); + deinterleave_unalign_store(tmpOut, nqTot, outptr); inptr += nmBlocks; outptr += nqBlocks; } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); + + BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, + 
this->m_bdata[0], this->m_bdata[1], + wsp0, tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, nqTot, locField); + acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(locField, locField + acturalSize, outptr); + } + // free aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -241,21 +311,43 @@ struct BwdTransTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(m_nmTot), tmpOut(nqTot); - for (int e = 0; e < this->m_nBlocks; ++e) + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqBlocks]; + + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0], this->m_bdata[1], wsp0, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); + // deinterleave_store(tmpOut, nqTot, locField); + // std::copy(locField, locField + nqBlocks, outptr); + deinterleave_unalign_store(tmpOut, nqTot, outptr); inptr += nmBlocks; outptr += nqBlocks; } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); + + BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, + this->m_bdata[0], this->m_bdata[1], + wsp0, tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, nqTot, locField); + acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(locField, locField + acturalSize, outptr); + } } #elif defined(SHAPE_DIMENSION_3D) @@ -290,21 +382,46 @@ struct BwdTransTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size), tmpIn(m_nmTot), tmpOut(nqTot); - for (int e = 0; e < this->m_nBlocks; ++e) + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); + + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); BwdTrans3DKernel<SHAPE_TYPE>( nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); + // deinterleave_store(tmpOut, nqTot, locField); + // std::copy(locField, locField + nqBlocks, outptr); + deinterleave_unalign_store(tmpOut, nqTot, outptr); inptr += nmBlocks; outptr += nqBlocks; } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); + + BwdTrans3DKernel<SHAPE_TYPE>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], + this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, nqTot, locField); + acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(locField, locField + acturalSize, outptr); + } + // free 
aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -329,21 +446,43 @@ struct BwdTransTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size), tmpIn(m_nmTot), tmpOut(nqTot); - for (int e = 0; e < this->m_nBlocks; ++e) + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqBlocks]; + + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); BwdTrans3DKernel<SHAPE_TYPE>( nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); + // deinterleave_store(tmpOut, nqTot, locField); + // std::copy(locField, locField + nqBlocks, outptr); + deinterleave_unalign_store(tmpOut, nqTot, outptr); inptr += nmBlocks; outptr += nqBlocks; } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); + + BwdTrans3DKernel<SHAPE_TYPE>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], + this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, nqTot, locField); + acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(locField, locField + acturalSize, outptr); + } } #endif // SHAPE_DIMENSION diff --git a/library/MatrixFreeOps/CMakeLists.txt b/library/MatrixFreeOps/CMakeLists.txt index 267ece36c04cbade90bb671a1881e264591c6011..7dc170f643320e3f4cbb0a9d780d240669da22ee 100644 --- a/library/MatrixFreeOps/CMakeLists.txt +++ b/library/MatrixFreeOps/CMakeLists.txt @@ -16,7 +16,7 @@ CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/SwitchLimits.h.in # so to reduce compile time timeout failures due to the large number # of template impementations. 
-SET(OPERATORS BwdTrans PhysDeriv Helmholtz LinearAdvectionDiffusionReaction IProduct IProductWRTDerivBase PhysInterp1DScaled) +SET(OPERATORS BwdTrans PhysDeriv Helmholtz LinearAdvectionDiffusionReaction IProduct IProductWRTDerivBase) SET(SHAPES Seg Tri Quad Hex Tet Prism Pyr ) SET(DIMENSIONS 1 2 2 3 3 3 3 ) diff --git a/library/MatrixFreeOps/Helmholtz.h b/library/MatrixFreeOps/Helmholtz.h index 0f98083234d878839a5956ef2e5116f9060e398b..6d1ec4bfa01b64b27d0cdce83bb0570263a44e98 100644 --- a/library/MatrixFreeOps/Helmholtz.h +++ b/library/MatrixFreeOps/Helmholtz.h @@ -187,69 +187,58 @@ struct HelmholtzTemplate std::vector<vec_t, allocator<vec_t>> bwd(nqTot), deriv0(nqTot), deriv1(nqTot); - const vec_t *jac_ptr = {}; - const vec_t *df_ptr = {}; + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nmBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); - // Get size of derivative factor block - auto dfSize = ndf; - if (DEFORMED) - { - dfSize *= nqTot; - } + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); - for (int e = 0; e < this->m_nBlocks; e++) + const auto dfSize = ndf * nqTot; + + for (int e = 0; e < this->m_nBlocks - 1; e++) { - df_ptr = &((*this->m_df)[e * dfSize]); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, tmpIn, + tmpOut, wsp0, bwd, deriv0, deriv1); - if (!DEFORMED) + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); + + inptr += nmBlocks; + outptr += nmBlocks; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[e]); + jac_ptr += nqTot; + df_ptr += dfSize; } else { - jac_ptr = &((*this->m_jac)[e * nqTot]); + ++jac_ptr; + df_ptr += ndf; } + } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); - // Step 1: BwdTrans - BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, - this->m_bdata[0], this->m_bdata[1], - wsp0, bwd); - - // Step 2: inner product for mass matrix operation - IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>( - nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut, m_lambda); - - // Step 3: take derivatives in collapsed coordinate space - PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1], - deriv0, deriv1); - - DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff, - this->m_isVarDiff, this->m_varD00, this->m_varD01, - this->m_varD11, df_ptr, this->m_h0, this->m_h1, deriv0, deriv1); - - // Step 4: Apply Laplacian metrics & inner product - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); - - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0], - this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); + fusedKernel2D(nm0, nm1, nq0, nq1, correct, 
jac_ptr, df_ptr, tmpIn, + tmpOut, wsp0, bwd, deriv0, deriv1); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nmBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, m_nmTot, locField); + std::copy(locField, locField + acturalSize, outptr); } + // free aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -279,71 +268,99 @@ struct HelmholtzTemplate std::vector<vec_t, allocator<vec_t>> bwd(nqTot), deriv0(nqTot), deriv1(nqTot); - const vec_t *jac_ptr = {}; - const vec_t *df_ptr = {}; + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width]; - // Get size of derivative factor block - auto dfSize = ndf; - if (DEFORMED) - { - dfSize *= nqTot; - } + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); - for (int e = 0; e < this->m_nBlocks; e++) + constexpr auto dfSize = ndf * nqTot; + + for (int e = 0; e < this->m_nBlocks - 1; e++) { - df_ptr = &((*this->m_df)[e * dfSize]); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); + + fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, tmpIn, + tmpOut, wsp0, bwd, deriv0, deriv1); - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); - if (!DEFORMED) + inptr += nmBlocks; + outptr += nmBlocks; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[e]); + jac_ptr += nqTot; + df_ptr += dfSize; } else { - jac_ptr = &((*this->m_jac)[e * nqTot]); + ++jac_ptr; + df_ptr += ndf; } + } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); - // Step 1: BwdTrans - BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, - this->m_bdata[0], this->m_bdata[1], - wsp0, bwd); - - // Step 2: inner product for mass matrix operation - IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>( - nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut, m_lambda); - - // Step 3: take derivatives in collapsed coordinate space - PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1], - deriv0, deriv1); - - DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff, - this->m_isVarDiff, this->m_varD00, this->m_varD01, - this->m_varD11, df_ptr, this->m_h0, this->m_h1, deriv0, deriv1); - - // Step 4: Apply Laplacian metrics & inner product - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); - - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0], - this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); + fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, tmpIn, + tmpOut, wsp0, bwd, deriv0, deriv1); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nmBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, 
m_nmTot, locField); + std::copy(locField, locField + acturalSize, outptr); } } + NEK_FORCE_INLINE void fusedKernel2D( + const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1, + const bool correct, const vec_t *jac_ptr, const vec_t *df_ptr, + const std::vector<vec_t, allocator<vec_t>> &tmpIn, + std::vector<vec_t, allocator<vec_t>> &tmpOut, + std::vector<vec_t, allocator<vec_t>> &wsp0, + std::vector<vec_t, allocator<vec_t>> &bwd, + std::vector<vec_t, allocator<vec_t>> &deriv0, + std::vector<vec_t, allocator<vec_t>> &deriv1) + { + // Step 1: BwdTrans + BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, + this->m_bdata[0], this->m_bdata[1], wsp0, + bwd); + + // Step 2: inner product for mass matrix operation + IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>( + nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0], + this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, tmpOut, + m_lambda); + + // Step 3: take derivatives in collapsed coordinate space + PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1], + deriv0, deriv1); + + DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>( + nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff, + this->m_isVarDiff, this->m_varD00, this->m_varD01, this->m_varD11, + df_ptr, this->m_h0, this->m_h1, deriv0, deriv1); + + // Step 4: Apply Laplacian metrics & inner product + IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0], + this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, + tmpOut); + + IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0], + this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, + tmpOut); + } + #elif defined(SHAPE_DIMENSION_3D) // Non-size based operator. 
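The 3D hunks below repeat the restructuring applied to the 2D path above: the first m_nBlocks - 1 blocks stream directly between the caller's arrays and the interleaved SIMD layout, and only the final, possibly padded block is staged through aligned scratch storage, with acturalSize entries valid. A minimal sketch of that loop shape (runBlocks, kernel and tailLen are illustrative stand-ins, not library names):

#include <algorithm>
#include <vector>

void runBlocks(const double *in, double *out, int nBlocks, int blockLen,
               int tailLen, void (*kernel)(const double *, double *))
{
    std::vector<double> scratch(blockLen);
    for (int e = 0; e < nBlocks - 1; ++e)
    {
        // Full blocks: operate on the caller's memory, no staging copies.
        kernel(in, out);
        in += blockLen;
        out += blockLen;
    }
    // Last block: only tailLen of blockLen entries exist in the caller's
    // arrays, so stage them through scratch around the kernel call.
    std::copy(in, in + tailLen, scratch.begin());
    kernel(scratch.data(), scratch.data());
    std::copy(scratch.begin(), scratch.begin() + tailLen, out);
}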
@@ -383,78 +400,60 @@ struct HelmholtzTemplate std::vector<vec_t, allocator<vec_t>> deriv0(nqTot), deriv1(nqTot), deriv2(nqTot); - const vec_t *jac_ptr = {}; - const vec_t *df_ptr = {}; + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nmBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); - // Get size of derivative factor block - auto dfSize = ndf; - if (DEFORMED) - { - dfSize *= nqTot; - } + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); - for (int e = 0; e < this->m_nBlocks; e++) + const auto dfSize = ndf * nqTot; + + for (int e = 0; e < this->m_nBlocks - 1; e++) { - df_ptr = &((*this->m_df)[e * dfSize]); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr, + df_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2, bwd, deriv0, + deriv1, deriv2); - if (!DEFORMED) + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); + + inptr += nmBlocks; + outptr += nmBlocks; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[e]); + jac_ptr += nqTot; + df_ptr += dfSize; } else { - jac_ptr = &((*this->m_jac)[e * nqTot]); + ++jac_ptr; + df_ptr += ndf; } + } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); - // Step 1: BwdTrans - BwdTrans3DKernel<SHAPE_TYPE>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], - this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, bwd); - - // // Step 2: inner product for mass matrix operation - IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0], - this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda); - - // Step 3: take derivatives in standard space - PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0], - this->m_D[1], this->m_D[2], deriv0, deriv1, - deriv2); - - // Step 4: Apply Laplacian metrics & inner product - DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff, - this->m_isVarDiff, this->m_varD00, this->m_varD01, - this->m_varD11, this->m_varD02, this->m_varD12, this->m_varD22, - df_ptr, this->m_h0, this->m_h1, this->m_h2, this->m_h3, deriv0, - deriv1, deriv2); - - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0, - this->m_dbdata[0], this->m_bdata[1], this->m_bdata[2], - this->m_w[0], this->m_w[1], this->m_w[2], jac_ptr, wsp0, wsp1, - wsp2, tmpOut); - - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0], - this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); - - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0], - this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + fusedKernel3D(nm0, nm1, nm2, nq0, 
nq1, nq2, correct, jac_ptr, + df_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2, bwd, deriv0, + deriv1, deriv2); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nmBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, m_nmTot, locField); + std::copy(locField, locField + acturalSize, outptr); } + // free aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -487,80 +486,111 @@ struct HelmholtzTemplate std::vector<vec_t, allocator<vec_t>> deriv0(nqTot), deriv1(nqTot), deriv2(nqTot); - const vec_t *jac_ptr = {}; - const vec_t *df_ptr = {}; + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width]; - // Get size of derivative factor block - auto dfSize = ndf; - if (DEFORMED) - { - dfSize *= nqTot; - } + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); - for (int e = 0; e < this->m_nBlocks; e++) + constexpr auto dfSize = ndf * nqTot; + + for (int e = 0; e < this->m_nBlocks - 1; e++) { - df_ptr = &((*this->m_df)[e * dfSize]); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); + + fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr, + df_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2, bwd, deriv0, + deriv1, deriv2); - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); - if (!DEFORMED) + inptr += nmBlocks; + outptr += nmBlocks; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[e]); + jac_ptr += nqTot; + df_ptr += dfSize; } else { - jac_ptr = &((*this->m_jac)[e * nqTot]); + ++jac_ptr; + df_ptr += ndf; } + } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); - // Step 1: BwdTrans - BwdTrans3DKernel<SHAPE_TYPE>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], - this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, bwd); - - // // Step 2: inner product for mass matrix operation - IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0], - this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda); - - // Step 3: take derivatives in standard space - PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0], - this->m_D[1], this->m_D[2], deriv0, deriv1, - deriv2); - - // Step 4: Apply Laplacian metrics & inner product - DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff, - this->m_isVarDiff, this->m_varD00, this->m_varD01, - this->m_varD11, this->m_varD02, this->m_varD12, this->m_varD22, - df_ptr, this->m_h0, this->m_h1, this->m_h2, this->m_h3, deriv0, - deriv1, deriv2); - - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0, - this->m_dbdata[0], this->m_bdata[1], this->m_bdata[2], - this->m_w[0], this->m_w[1], this->m_w[2], jac_ptr, wsp0, wsp1, - wsp2, tmpOut); - - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, 
this->m_bdata[0], - this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); - - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0], - this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr, + df_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2, bwd, deriv0, + deriv1, deriv2); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nmBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, m_nmTot, locField); + std::copy(locField, locField + acturalSize, outptr); } } + NEK_FORCE_INLINE void fusedKernel3D( + const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0, + const size_t nq1, const size_t nq2, const bool correct, + const vec_t *jac_ptr, const vec_t *df_ptr, + const std::vector<vec_t, allocator<vec_t>> &tmpIn, + std::vector<vec_t, allocator<vec_t>> &tmpOut, + std::vector<vec_t, allocator<vec_t>> &wsp0, + std::vector<vec_t, allocator<vec_t>> &wsp1, + std::vector<vec_t, allocator<vec_t>> &wsp2, + std::vector<vec_t, allocator<vec_t>> &bwd, + std::vector<vec_t, allocator<vec_t>> &deriv0, + std::vector<vec_t, allocator<vec_t>> &deriv1, + std::vector<vec_t, allocator<vec_t>> &deriv2) + { + // Step 1: BwdTrans + BwdTrans3DKernel<SHAPE_TYPE>(nm0, nm1, nm2, nq0, nq1, nq2, correct, + tmpIn, this->m_bdata[0], this->m_bdata[1], + this->m_bdata[2], wsp0, wsp1, bwd); + + // Step 2: inner product for mass matrix operation + IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0], + this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], + this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda); + + // Step 3: take derivatives in standard space + PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0], this->m_D[1], + this->m_D[2], deriv0, deriv1, deriv2); + + // Step 4: Apply Laplacian metrics & inner product + DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>( + nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff, + this->m_isVarDiff, this->m_varD00, this->m_varD01, this->m_varD11, + this->m_varD02, this->m_varD12, this->m_varD22, df_ptr, this->m_h0, + this->m_h1, this->m_h2, this->m_h3, deriv0, deriv1, deriv2); + + IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0, this->m_dbdata[0], + this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], + this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + + IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0], + this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], + this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + + IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0], + this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1], + this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + } + #endif // SHAPE_DIMENSION private: diff --git a/library/MatrixFreeOps/IProduct.h b/library/MatrixFreeOps/IProduct.h index f0a7d61515f0cbef65e14c63c27a0bf7fc57bb6e..b22f8dba37c8c4e54e217da89d89b7e094a74571 100644 --- a/library/MatrixFreeOps/IProduct.h +++ b/library/MatrixFreeOps/IProduct.h @@ -126,32 +126,56 @@ struct IProductTemplate std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpOut(m_nmTot); - 
vec_t *jac_ptr;
+        vec_t *jac_ptr = &((*this->m_jac)[0]);
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
+
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
         {
-            if (DEFORMED)
+            // Load data to aligned storage and interleave it
+            // std::copy(inptr, inptr + nqBlocks, locField);
+            // load_interleave(locField, nqTot, tmpIn);
+            load_unalign_interleave(inptr, nqTot, tmpIn);
+
+            IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nq0, tmpIn, this->m_bdata[0], this->m_w[0], jac_ptr,
+                tmpOut);
+
+            // de-interleave and store data
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
+
+            inptr += nqBlocks;
+            outptr += nmBlocks;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                jac_ptr += nqTot;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                ++jac_ptr;
             }
-
-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
 
             IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>(
                 nm0, nq0, tmpIn, this->m_bdata[0], this->m_w[0], jac_ptr,
                 tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nqBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
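Every operator rewritten in this MR follows the same traversal: all full SIMD blocks stream directly between the caller's unaligned buffers via `load_unalign_interleave` / `deinterleave_unalign_store`, and only the final, possibly padded block is staged through the aligned `locField` scratch. A minimal sketch of that control flow, assuming for brevity that input and output have the same per-element size; `runBlocks`, `process` and `nPerElmt` are hypothetical stand-ins for the Nektar++ machinery:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical sketch of the block traversal used throughout this MR.
// `width` stands in for vec_t::width and `process` for the fused kernel
// acting on one interleaved block of nPerElmt * width doubles.
template <typename Proc>
void runBlocks(const double *in, double *out, std::size_t nElmt,
               std::size_t nPerElmt, std::size_t width, Proc process)
{
    const std::size_t nBlocks  = (nElmt + width - 1) / width; // m_nBlocks
    const std::size_t nPads    = nBlocks * width - nElmt;     // m_nPads
    const std::size_t blockLen = nPerElmt * width;            // nqBlocks

    std::vector<double> scratch(blockLen); // stands in for aligned locField

    for (std::size_t e = 0; e + 1 < nBlocks; ++e) // all full blocks
    {
        process(in, out); // reads/writes the unaligned user memory directly
        in += blockLen;
        out += blockLen;
    }

    // Last block: only actualSize doubles exist in user memory, so stage
    // them through scratch to avoid reading or writing past the end.
    const std::size_t actualSize = blockLen - nPads * nPerElmt;
    std::copy(in, in + actualSize, scratch.data());
    process(scratch.data(), scratch.data());
    std::copy(scratch.data(), scratch.data() + actualSize, out);
}
```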
@@ -168,31 +192,52 @@ struct IProductTemplate std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpOut(m_nmTot); - vec_t *jac_ptr; + vec_t *jac_ptr = &((*this->m_jac)[0]); - for (int e = 0; e < this->m_nBlocks; ++e) + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqBlocks]; + + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - if (DEFORMED) + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nqBlocks, locField); + // load_interleave(locField, nqTot, tmpIn); + load_unalign_interleave(inptr, nqTot, tmpIn); + + IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>( + nm0, nq0, tmpIn, this->m_bdata[0], this->m_w[0], jac_ptr, + tmpOut); + + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); + + inptr += nqBlocks; + outptr += nmBlocks; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[e * nqTot]); + jac_ptr += nqTot; } else { - jac_ptr = &((*this->m_jac)[e]); + ++jac_ptr; } - - // Load and transpose data - load_interleave(inptr, nqTot, tmpIn); + } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn); IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>( nm0, nq0, tmpIn, this->m_bdata[0], this->m_w[0], jac_ptr, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nqBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); } } @@ -225,21 +270,45 @@ struct IProductTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(nqTot), tmpOut(m_nmTot); - vec_t *jac_ptr; + vec_t *jac_ptr = &((*this->m_jac)[0]); - for (int e = 0; e < this->m_nBlocks; ++e) + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); + + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - if (DEFORMED) + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nqBlocks, locField); + // load_interleave(locField, nqTot, tmpIn); + load_unalign_interleave(inptr, nqTot, tmpIn); + + IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>( + nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0], + this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, + tmpOut); + + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); + + inptr += nqBlocks; + outptr += nmBlocks; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[nqTot * e]); + jac_ptr += nqTot; } else { - jac_ptr = &((*this->m_jac)[e]); + ++jac_ptr; } - - // Load and transpose data - load_interleave(inptr, nqTot, tmpIn); + } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn); IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>( nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0], @@ -247,11 +316,12 @@ struct IProductTemplate tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nqBlocks; - outptr += nmBlocks; + 
deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); } + // free aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -276,21 +346,44 @@ struct IProductTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(nqTot), tmpOut(m_nmTot); - vec_t *jac_ptr; + vec_t *jac_ptr = &((*this->m_jac)[0]); + + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqBlocks]; - for (int e = 0; e < this->m_nBlocks; ++e) + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - if (DEFORMED) + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nqBlocks, locField); + // load_interleave(locField, nqTot, tmpIn); + load_unalign_interleave(inptr, nqTot, tmpIn); + + IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>( + nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0], + this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, + tmpOut); + + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); + + inptr += nqBlocks; + outptr += nmBlocks; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[nqTot * e]); + jac_ptr += nqTot; } else { - jac_ptr = &((*this->m_jac)[e]); + ++jac_ptr; } - - // Load and transpose data - load_interleave(inptr, nqTot, tmpIn); + } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn); IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>( nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0], @@ -298,10 +391,9 @@ struct IProductTemplate tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nqBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); } } @@ -337,21 +429,45 @@ struct IProductTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size), wsp2(wsp2Size), tmpIn(nqTot), tmpOut(m_nmTot); - vec_t *jac_ptr; + vec_t *jac_ptr = &((*this->m_jac)[0]); - for (int e = 0; e < this->m_nBlocks; ++e) + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); + + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - if (DEFORMED) + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nqBlocks, locField); + // load_interleave(locField, nqTot, tmpIn); + load_unalign_interleave(inptr, nqTot, tmpIn); + + IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], + this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], + this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); + + inptr += nqBlocks; + outptr += nmBlocks; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[nqTot * e]); + jac_ptr += nqTot; } else { - jac_ptr = &((*this->m_jac)[e]); + ++jac_ptr; } - - // Load and transpose data - 
load_interleave(inptr, nqTot, tmpIn); + } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn); IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>( nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], @@ -359,11 +475,12 @@ struct IProductTemplate this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nqBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); } + // free aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -389,21 +506,44 @@ struct IProductTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size), wsp2(wsp2Size), tmpIn(nqTot), tmpOut(m_nmTot); - vec_t *jac_ptr; + vec_t *jac_ptr = &((*this->m_jac)[0]); + + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqBlocks]; - for (int e = 0; e < this->m_nBlocks; ++e) + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - if (DEFORMED) + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nqBlocks, locField); + // load_interleave(locField, nqTot, tmpIn); + load_unalign_interleave(inptr, nqTot, tmpIn); + + IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], + this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], + this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); + + inptr += nqBlocks; + outptr += nmBlocks; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[nqTot * e]); + jac_ptr += nqTot; } else { - jac_ptr = &((*this->m_jac)[e]); + ++jac_ptr; } - - // Load and transpose data - load_interleave(inptr, nqTot, tmpIn); + } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn); IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>( nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], @@ -411,10 +551,9 @@ struct IProductTemplate this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nqBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); } } diff --git a/library/MatrixFreeOps/IProductWRTDerivBase.h b/library/MatrixFreeOps/IProductWRTDerivBase.h index 699c9a7c78779349565d5dd34ef5a6977e7b70d7..9d9aebad5c0722835363c6f5763d7a0a0cef092f 100644 --- a/library/MatrixFreeOps/IProductWRTDerivBase.h +++ b/library/MatrixFreeOps/IProductWRTDerivBase.h @@ -150,8 +150,8 @@ struct IProductWRTDerivBaseTemplate tmpIn[max_ndf]; // max_ndf is a constexpr std::vector<vec_t, allocator<vec_t>> tmp0(nqTot), tmpOut(m_nmTot); - const vec_t *jac_ptr; - const vec_t *df_ptr; + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); vec_t df_tmp[max_ndf]; // max_ndf is a constexpr NekDouble *inptr[max_ndf]; // max_ndf is a constexpr @@ 
-161,18 +161,19 @@ struct IProductWRTDerivBaseTemplate
             inptr[d] = const_cast<NekDouble *>(input[d].data());
         }
 
-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            // Jacobian
-            jac_ptr = &((*this->m_jac)[dJSize * e]);
-
-            // Derivative factor
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));
 
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
             // Load and transpose data
             for (int d = 0; d < ndf; ++d)
             {
-                load_interleave(inptr[d], nqTot, tmpIn[d]);
+                // Load data to aligned storage and interleave it
+                // std::copy(inptr[d], inptr[d] + nqBlocks, locField);
+                // load_interleave(locField, nqTot, tmpIn[d]);
+                load_unalign_interleave(inptr[d], nqTot, tmpIn[d]);
             }
 
             IProductWRTDerivBase1DKernel<SHAPE_TYPE, DEFORMED>(
@@ -184,15 +185,43 @@ struct IProductWRTDerivBaseTemplate
                 tmpOut);
 
             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
+            // deinterleave_store(tmpOut, m_nmTot, locField);
+            // std::copy(locField, locField + nmBlocks, outptr);
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);
 
             for (int d = 0; d < ndf; ++d)
             {
                 inptr[d] += nqBlocks;
             }
-            outptr += nmBlocks;
+            outptr += nmBlocks;
+            df_ptr += dfSize;
+            jac_ptr += dJSize;
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            for (int d = 0; d < ndf; ++d)
+            {
+                std::copy(inptr[d], inptr[d] + actualSize, locField);
+                load_interleave(locField, nqTot, tmpIn[d]);
+            }
+
+            IProductWRTDerivBase1DKernel<SHAPE_TYPE, DEFORMED>(
+                nqTot, ndf, df_ptr, df_tmp, tmpIn, tmp0);
+
+            // IP DB0
+            IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>(
+                nm0, nq0, tmp0, this->m_dbdata[0], this->m_w[0], jac_ptr,
+                tmpOut);
+
+            // de-interleave and store data
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            actualSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(locField, locField + actualSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }
 
     // Size based template version.
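Because `locField` has to honour `vec_t::alignment` before `load_interleave` may touch it, the non-size-based paths allocate it with C++17 aligned `operator new[]` and must release it with the matching aligned `operator delete[]`. A stripped-down sketch of the idiom; `kAlign` is a stand-in for `vec_t::alignment`:

```cpp
#include <cstddef>
#include <new>

constexpr std::size_t kAlign = 64; // stand-in for vec_t::alignment

double *allocScratch(std::size_t n)
{
    // C++17 aligned allocation; the same alignment value must be passed
    // back at deallocation, otherwise the behaviour is undefined.
    return static_cast<double *>(
        ::operator new[](n * sizeof(double), std::align_val_t(kAlign)));
}

void freeScratch(double *p)
{
    ::operator delete[](p, std::align_val_t(kAlign));
}
```

The size-based template versions avoid the heap round trip altogether: their extents are compile-time constants, so a stack buffer declared `alignas(vec_t::alignment)` suffices, as the hunks above show.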
@@ -231,8 +259,8 @@ struct IProductWRTDerivBaseTemplate tmpIn[max_ndf]; // max_ndf is a constexpr std::vector<vec_t, allocator<vec_t>> tmp0(nqTot), tmpOut(m_nmTot); - const vec_t *jac_ptr; - const vec_t *df_ptr; + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); vec_t df_tmp[max_ndf]; // max_ndf is a constexpr NekDouble *inptr[max_ndf]; // max_ndf is a constexpr @@ -242,18 +270,18 @@ struct IProductWRTDerivBaseTemplate inptr[d] = const_cast<NekDouble *>(input[d].data()); } - for (int e = 0; e < this->m_nBlocks; ++e) - { - // Jacobian - jac_ptr = &((*this->m_jac)[dJSize * e]); - - // Derivative factor - df_ptr = &((*this->m_df)[dfSize * e]); + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width]; + for (int e = 0; e < this->m_nBlocks - 1; ++e) + { // Load and transpose data for (int d = 0; d < ndf; ++d) { - load_interleave(inptr[d], nqTot, tmpIn[d]); + // Load data to aligned storage and interleave it + // std::copy(inptr[d], inptr[d] + nqBlocks, locField); + // load_interleave(locField, nqTot, tmpIn[d]); + load_unalign_interleave(inptr[d], nqTot, tmpIn[d]); } IProductWRTDerivBase1DKernel<SHAPE_TYPE, DEFORMED>( @@ -265,14 +293,39 @@ struct IProductWRTDerivBaseTemplate tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); for (int d = 0; d < ndf; ++d) { inptr[d] += nqBlocks; } - outptr += nmBlocks; + df_ptr += dfSize; + jac_ptr += dJSize; + } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + for (int d = 0; d < ndf; ++d) + { + std::copy(inptr[d], inptr[d] + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn[d]); + } + + IProductWRTDerivBase1DKernel<SHAPE_TYPE, DEFORMED>( + nqTot, ndf, df_ptr, df_tmp, tmpIn, tmp0); + + // IP DB0 + IProduct1DKernel<SHAPE_TYPE, false, false, DEFORMED>( + nm0, nq0, tmp0, this->m_dbdata[0], this->m_w[0], jac_ptr, + tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); } } @@ -336,51 +389,60 @@ struct IProductWRTDerivBaseTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size); - const vec_t *jac_ptr; - const vec_t *df_ptr; + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); vec_t df_tmp[max_ndf]; // max_ndf is a constexpr - std::vector<vec_t, allocator<vec_t>> &Z0 = this->m_Z[0]; - std::vector<vec_t, allocator<vec_t>> &Z1 = this->m_Z[1]; + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); - for (int e = 0; e < this->m_nBlocks; ++e) + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Jacobian - jac_ptr = &((*this->m_jac)[dJSize * e]); - - // Derivative factor - df_ptr = &((*this->m_df)[dfSize * e]); - // Load and transpose data for (int d = 0; d < indim; ++d) { - load_interleave(inptr[d], nqTot, tmpIn[d]); + // Load data to aligned storage and interleave it + // std::copy(inptr[d], inptr[d] + nqBlocks, locField); + // load_interleave(locField, nqTot, tmpIn[d]); + load_unalign_interleave(inptr[d], nqTot, tmpIn[d]); } - IProductWRTDerivBase2DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, indim, df_ptr, df_tmp, Z0, Z1, 
tmpIn, tmp0, tmp1); - - // IP DB0 B1 - IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>( - nm0, nm1, nq0, nq1, correct, tmp0, this->m_dbdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); - - // IP DB1 B0 - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, tmp1, this->m_bdata[0], - this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); + fusedKernel2D(nm0, nm1, nq0, nq1, indim, correct, jac_ptr, df_ptr, + tmpIn, df_tmp, tmp0, tmp1, wsp0, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); for (int d = 0; d < indim; ++d) { inptr[d] += nqBlocks; } outptr += nmBlocks; + df_ptr += dfSize; + jac_ptr += dJSize; } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + for (int d = 0; d < indim; ++d) + { + std::copy(inptr[d], inptr[d] + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn[d]); + } + + fusedKernel2D(nm0, nm1, nq0, nq1, indim, correct, jac_ptr, df_ptr, + tmpIn, df_tmp, tmp0, tmp1, wsp0, tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); + } + // free aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -436,51 +498,83 @@ struct IProductWRTDerivBaseTemplate std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size); - const vec_t *jac_ptr; - const vec_t *df_ptr; + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); vec_t df_tmp[max_ndf]; // max_ndf is a constexpr - std::vector<vec_t, allocator<vec_t>> &Z0 = this->m_Z[0]; - std::vector<vec_t, allocator<vec_t>> &Z1 = this->m_Z[1]; + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width]; - for (int e = 0; e < this->m_nBlocks; ++e) + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Jacobian - jac_ptr = &((*this->m_jac)[dJSize * e]); - - // Derivative factor - df_ptr = &((*this->m_df)[dfSize * e]); - // Load and transpose data for (int d = 0; d < indim; ++d) { - load_interleave(inptr[d], nqTot, tmpIn[d]); + // Load data to aligned storage and interleave it + // std::copy(inptr[d], inptr[d] + nqBlocks, locField); + // load_interleave(locField, nqTot, tmpIn[d]); + load_unalign_interleave(inptr[d], nqTot, tmpIn[d]); } - IProductWRTDerivBase2DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, indim, df_ptr, df_tmp, Z0, Z1, tmpIn, tmp0, tmp1); - - // IP DB0 B1 - IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>( - nm0, nm1, nq0, nq1, correct, tmp0, this->m_dbdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); - - // IP DB1 B0 - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, tmp1, this->m_bdata[0], - this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); + fusedKernel2D(nm0, nm1, nq0, nq1, indim, correct, jac_ptr, df_ptr, + tmpIn, df_tmp, tmp0, tmp1, wsp0, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); for (int d = 0; d < indim; ++d) { inptr[d] += 
nqBlocks; } outptr += nmBlocks; + df_ptr += dfSize; + jac_ptr += dJSize; } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + for (int d = 0; d < indim; ++d) + { + std::copy(inptr[d], inptr[d] + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn[d]); + } + + fusedKernel2D(nm0, nm1, nq0, nq1, indim, correct, jac_ptr, df_ptr, + tmpIn, df_tmp, tmp0, tmp1, wsp0, tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); + } + } + + NEK_FORCE_INLINE void fusedKernel2D( + const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1, + const size_t indim, const bool correct, const vec_t *jac_ptr, + const vec_t *df_ptr, const std::vector<vec_t, allocator<vec_t>> *tmpIn, + vec_t *df_tmp, std::vector<vec_t, allocator<vec_t>> &tmp0, + std::vector<vec_t, allocator<vec_t>> &tmp1, + std::vector<vec_t, allocator<vec_t>> &wsp0, + std::vector<vec_t, allocator<vec_t>> &tmpOut) + { + IProductWRTDerivBase2DKernel<SHAPE_TYPE, DEFORMED>( + nq0, nq1, indim, df_ptr, df_tmp, this->m_Z[0], this->m_Z[1], tmpIn, + tmp0, tmp1); + + // IP DB0 B1 + IProduct2DKernel<SHAPE_TYPE, false, false, DEFORMED>( + nm0, nm1, nq0, nq1, correct, tmp0, this->m_dbdata[0], + this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, + tmpOut); + + // IP DB1 B0 + IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nq0, nq1, correct, tmp1, this->m_bdata[0], + this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, + tmpOut); } #elif defined(SHAPE_DIMENSION_3D) @@ -533,56 +627,60 @@ struct IProductWRTDerivBaseTemplate wsp2(wsp2Size), tmpIn0(nqTot), tmpIn1(nqTot), tmpIn2(nqTot), tmp0(nqTot), tmp1(nqTot), tmp2(nqTot), tmpOut(m_nmTot); - const vec_t *jac_ptr; - const vec_t *df_ptr; + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); - std::vector<vec_t, allocator<vec_t>> &Z0 = this->m_Z[0]; - std::vector<vec_t, allocator<vec_t>> &Z1 = this->m_Z[1]; - std::vector<vec_t, allocator<vec_t>> &Z2 = this->m_Z[2]; + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); - for (int e = 0; e < this->m_nBlocks; ++e) + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Jacobian - jac_ptr = &((*this->m_jac)[dJSize * e]); - - // Derivative factor - df_ptr = &((*this->m_df)[dfSize * e]); - - // Load and transpose data - load_interleave(inptr0, nqTot, tmpIn0); - load_interleave(inptr1, nqTot, tmpIn1); - load_interleave(inptr2, nqTot, tmpIn2); - - IProductWRTDerivBase3DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, nq2, df_ptr, Z0, Z1, Z2, tmpIn0, tmpIn1, tmpIn2, tmp0, - tmp1, tmp2); - - // IP DB0 B1 B2 - IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp0, this->m_dbdata[0], - this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); - - // IP B0 DB1 B2 - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp1, this->m_bdata[0], - this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); - - // IP B0 B1 DB2 - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp2, this->m_bdata[0], - this->m_bdata[1], this->m_dbdata[2], this->m_w[0], 
this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + // Load data to aligned storage and interleave it + // load_interleave(inptr0, nqTot, tmpIn0); + // load_interleave(inptr1, nqTot, tmpIn1); + // load_interleave(inptr2, nqTot, tmpIn2); + load_unalign_interleave(inptr0, nqTot, tmpIn0); + load_unalign_interleave(inptr1, nqTot, tmpIn1); + load_unalign_interleave(inptr2, nqTot, tmpIn2); + + fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr, + df_ptr, tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2, + wsp0, wsp1, wsp2, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); inptr0 += nqBlocks; inptr1 += nqBlocks; inptr2 += nqBlocks; outptr += nmBlocks; + df_ptr += dfSize; + jac_ptr += dJSize; + } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(inptr0, inptr0 + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn0); + std::copy(inptr1, inptr1 + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn1); + std::copy(inptr2, inptr2 + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn2); + + fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr, + df_ptr, tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2, + wsp0, wsp1, wsp2, tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); } + // free aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -626,58 +724,97 @@ struct IProductWRTDerivBaseTemplate wsp2(wsp2Size), tmpIn0(nqTot), tmpIn1(nqTot), tmpIn2(nqTot), tmp0(nqTot), tmp1(nqTot), tmp2(nqTot), tmpOut(m_nmTot); - const vec_t *jac_ptr; - const vec_t *df_ptr; + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); - std::vector<vec_t, allocator<vec_t>> &Z0 = this->m_Z[0]; - std::vector<vec_t, allocator<vec_t>> &Z1 = this->m_Z[1]; - std::vector<vec_t, allocator<vec_t>> &Z2 = this->m_Z[2]; + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width]; - for (int e = 0; e < this->m_nBlocks; ++e) + for (int e = 0; e < this->m_nBlocks - 1; ++e) { - // Jacobian - jac_ptr = &((*this->m_jac)[dJSize * e]); - - // Derivative factor - df_ptr = &((*this->m_df)[dfSize * e]); - // Load and transpose data - load_interleave(inptr0, nqTot, tmpIn0); - load_interleave(inptr1, nqTot, tmpIn1); - load_interleave(inptr2, nqTot, tmpIn2); - - IProductWRTDerivBase3DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, nq2, df_ptr, Z0, Z1, Z2, tmpIn0, tmpIn1, tmpIn2, tmp0, - tmp1, tmp2); - - // IP DB0 B1 B2 - IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp0, this->m_dbdata[0], - this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); - - // IP B0 DB1 B2 - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp1, this->m_bdata[0], - this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); - - // IP B0 B1 DB2 - IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp2, this->m_bdata[0], - this->m_bdata[1], 
this->m_dbdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + // load_interleave(inptr0, nqTot, tmpIn0); + // load_interleave(inptr1, nqTot, tmpIn1); + // load_interleave(inptr2, nqTot, tmpIn2); + load_unalign_interleave(inptr0, nqTot, tmpIn0); + load_unalign_interleave(inptr1, nqTot, tmpIn1); + load_unalign_interleave(inptr2, nqTot, tmpIn2); + + fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr, + df_ptr, tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2, + wsp0, wsp1, wsp2, tmpOut); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); inptr0 += nqBlocks; inptr1 += nqBlocks; inptr2 += nqBlocks; outptr += nmBlocks; + df_ptr += dfSize; + jac_ptr += dJSize; + } + // last block + { + int acturalSize = nqBlocks - this->m_nPads * nqTot; + std::copy(inptr0, inptr0 + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn0); + std::copy(inptr1, inptr1 + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn1); + std::copy(inptr2, inptr2 + acturalSize, locField); + load_interleave(locField, nqTot, tmpIn2); + + fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr, + df_ptr, tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2, + wsp0, wsp1, wsp2, tmpOut); + + // de-interleave and store data + deinterleave_store(tmpOut, m_nmTot, locField); + acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(locField, locField + acturalSize, outptr); } } + NEK_FORCE_INLINE void fusedKernel3D( + const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0, + const size_t nq1, const size_t nq2, const bool correct, + const vec_t *jac_ptr, const vec_t *df_ptr, + const std::vector<vec_t, allocator<vec_t>> &tmpIn0, + const std::vector<vec_t, allocator<vec_t>> &tmpIn1, + const std::vector<vec_t, allocator<vec_t>> &tmpIn2, + std::vector<vec_t, allocator<vec_t>> &tmp0, + std::vector<vec_t, allocator<vec_t>> &tmp1, + std::vector<vec_t, allocator<vec_t>> &tmp2, + std::vector<vec_t, allocator<vec_t>> &wsp0, + std::vector<vec_t, allocator<vec_t>> &wsp1, + std::vector<vec_t, allocator<vec_t>> &wsp2, + std::vector<vec_t, allocator<vec_t>> &tmpOut) + { + IProductWRTDerivBase3DKernel<SHAPE_TYPE, DEFORMED>( + nq0, nq1, nq2, df_ptr, this->m_Z[0], this->m_Z[1], this->m_Z[2], + tmpIn0, tmpIn1, tmpIn2, tmp0, tmp1, tmp2); + + // IP DB0 B1 B2 + IProduct3DKernel<SHAPE_TYPE, false, false, DEFORMED>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp0, this->m_dbdata[0], + this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], + this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + + // IP B0 DB1 B2 + IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp1, this->m_bdata[0], + this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], + this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + + // IP B0 B1 DB2 + IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nm2, nq0, nq1, nq2, correct, tmp2, this->m_bdata[0], + this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1], + this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut); + } + #endif // SHAPE_DIMENSION private: diff --git a/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h b/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h index ac84d0bb9175e8c164f97731e33f6bb70552385c..f8cbbc019cbe84b046b8cbae3246640400dbbb98 100644 --- 
a/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h +++ b/library/MatrixFreeOps/LinearAdvectionDiffusionReaction.h @@ -193,102 +193,62 @@ struct LinearAdvectionDiffusionReactionTemplate std::vector<vec_t, allocator<vec_t>> bwd(nqTot), deriv0(nqTot), deriv1(nqTot); - const vec_t *jac_ptr = {}; - const vec_t *df_ptr = {}; - const vec_t *advVel_ptr = {}; + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nmBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); - // Get size of derivative factor and advection velocity block - auto dfSize = ndf; - if (DEFORMED) - { - dfSize *= nqTot; - } - auto advVelSize = nvel * nqTot; + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); + const vec_t *advVel_ptr = &((*this->m_advVel)[0]); + + // Get size of derivative factor block + const auto dfSize = ndf * nqTot; + const auto advVelSize = nvel * nqTot; - for (int e = 0; e < this->m_nBlocks; e++) + for (int e = 0; e < this->m_nBlocks - 1; e++) { - df_ptr = &((*this->m_df)[e * dfSize]); - advVel_ptr = &((*this->m_advVel)[e * advVelSize]); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, + advVel_ptr, tmpIn, tmpOut, wsp0, bwd, deriv0, deriv1); - if (!DEFORMED) + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); + + inptr += nmBlocks; + outptr += nmBlocks; + advVel_ptr += advVelSize; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[e]); + jac_ptr += nqTot; + df_ptr += dfSize; } else { - jac_ptr = &((*this->m_jac)[e * nqTot]); + ++jac_ptr; + df_ptr += ndf; } + } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); - // Step 1: BwdTrans - // Input: tmpIn = uCoeff - // Output: bwd = uPhys xi - BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, - this->m_bdata[0], this->m_bdata[1], - wsp0, bwd); - - // Step 2: take STD derivatives in collapsed coordinate space - // Input: bwd = uPhys xi (const) - // Output: deriv0 = du/dxi_1 - // deriv1 = du/dxi_2 - PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1], - deriv0, deriv1); - - // Step 3: Advect solution by advection velocities (varcoeffs) - // Input: deriv0 = du/dxi_1, - // deriv1 = du/dxi_2 - // Output: bwd += V \cdot \nabla_x u - Advection2DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, advVel_ptr, df_ptr, this->m_h0, this->m_h1, deriv0, - deriv1, bwd); - - // Step 4: Inner product for mass + advection matrix operation - // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const) - // Output: wsp0 = temporary - // tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x - // uPhys + uPhys) phi - IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>( - nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut, m_lambda); - - // Step 5: Evaluate local derivatives with laplacian metrics - // Input: - // deriv0 = du/dxi_1, - // deriv1 = du/dxi_2 - // Output: deriv0 = (J^2 \cdot du/dxi)[0], - // deriv1 = (J^2 \cdot 
du/dxi)[1] - DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff, - this->m_isVarDiff, this->m_varD00, this->m_varD01, - this->m_varD11, df_ptr, this->m_h0, this->m_h1, deriv0, deriv1); - - // Inner product with derivative basis xi_1 - // Input: deriv0 = (J^2 \cdot du/dxi)[0] (const) - // Output: wsp0 = temporary, - // tmpOut += \int_\Omega X dphi/dx (append) - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); - - // Inner product with derivative basis xi_2 - // Input: deriv1 = (J^2 \cdot du/dxi)[1] (const) - // Output: wsp0 = temporary, - // tmpOut += \int_\Omega X dphi/dy (append) - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0], - this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); + fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, + advVel_ptr, tmpIn, tmpOut, wsp0, bwd, deriv0, deriv1); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nmBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, m_nmTot, locField); + std::copy(locField, locField + acturalSize, outptr); } + // free aligned memory + ::operator delete[](locField, std::align_val_t(vec_t::alignment)); } // Size based template version. @@ -299,8 +259,8 @@ struct LinearAdvectionDiffusionReactionTemplate constexpr auto ndf = 4; constexpr auto nvel = 2; - const auto nqTot = nq0 * nq1; - const auto nmBlocks = m_nmTot * vec_t::width; + constexpr auto nqTot = nq0 * nq1; + const auto nmBlocks = m_nmTot * vec_t::width; auto *inptr = &input[0]; auto *outptr = &output[0]; @@ -319,104 +279,134 @@ struct LinearAdvectionDiffusionReactionTemplate std::vector<vec_t, allocator<vec_t>> bwd(nqTot), deriv0(nqTot), deriv1(nqTot); - const vec_t *jac_ptr = {}; - const vec_t *df_ptr = {}; - const vec_t *advVel_ptr = {}; + // temporary aligned storage for local fields + alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width]; + + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); + const vec_t *advVel_ptr = &((*this->m_advVel)[0]); // Get size of derivative factor block - auto dfSize = ndf; - if (DEFORMED) - { - dfSize *= nqTot; - } - auto advVelSize = nvel * nqTot; + constexpr auto dfSize = ndf * nqTot; + constexpr auto advVelSize = nvel * nqTot; - for (int e = 0; e < this->m_nBlocks; e++) + for (int e = 0; e < this->m_nBlocks - 1; e++) { - df_ptr = &((*this->m_df)[e * dfSize]); - advVel_ptr = &((*this->m_advVel)[e * advVelSize]); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, + advVel_ptr, tmpIn, tmpOut, wsp0, bwd, deriv0, deriv1); - if (!DEFORMED) + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); + + inptr += nmBlocks; + outptr += nmBlocks; + advVel_ptr += advVelSize; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[e]); + jac_ptr += nqTot; + df_ptr += dfSize; } else { - jac_ptr = &((*this->m_jac)[e * nqTot]); + ++jac_ptr; + df_ptr += ndf; } + } + 
// last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); - // Step 1: BwdTrans - // Input: tmpIn = uCoeff - // Output: bwd = uPhys xi - BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, - this->m_bdata[0], this->m_bdata[1], - wsp0, bwd); - - // Step 2: take STD derivatives in collapsed coordinate space - // Input: bwd = uPhys xi (const) - // Output: deriv0 = du/dxi_1 - // deriv1 = du/dxi_2 - PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1], - deriv0, deriv1); - - // Step 3: Advect solution by advection velocities (varcoeffs) - // Input: deriv0 = du/dxi_1, - // deriv1 = du/dxi_2 - // Output: bwd += V \cdot \nabla_x u - Advection2DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, advVel_ptr, df_ptr, this->m_h0, this->m_h1, deriv0, - deriv1, bwd); - - // Step 4: Inner product for mass + advection matrix operation - // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const) - // Output: wsp0 = temporary - // tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x - // uPhys + uPhys) phi - IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>( - nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut, m_lambda); - - // Step 5: Evaluate local derivatives with laplacian metrics - // Input: - // deriv0 = du/dxi_1, - // deriv1 = du/dxi_2 - // Output: deriv0 = (J^2 \cdot du/dxi)[0], - // deriv1 = (J^2 \cdot du/dxi)[1] - DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff, - this->m_isVarDiff, this->m_varD00, this->m_varD01, - this->m_varD11, df_ptr, this->m_h0, this->m_h1, deriv0, deriv1); - - // Inner product with derivative basis xi_1 - // Input: deriv0 = (J^2 \cdot du/dxi)[0] (const) - // Output: wsp0 = temporary, - // tmpOut += \int_\Omega X dphi/dx (append) - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0], - this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); - - // Inner product with derivative basis xi_2 - // Input: deriv1 = (J^2 \cdot du/dxi)[1] (const) - // Output: wsp0 = temporary, - // tmpOut += \int_\Omega X dphi/dy (append) - IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( - nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0], - this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, - tmpOut); + fusedKernel2D(nm0, nm1, nq0, nq1, correct, jac_ptr, df_ptr, + advVel_ptr, tmpIn, tmpOut, wsp0, bwd, deriv0, deriv1); // de-interleave and store data - deinterleave_store(tmpOut, m_nmTot, outptr); - - inptr += nmBlocks; - outptr += nmBlocks; + deinterleave_store(tmpOut, m_nmTot, locField); + std::copy(locField, locField + acturalSize, outptr); } } + NEK_FORCE_INLINE void fusedKernel2D( + const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1, + const bool correct, const vec_t *jac_ptr, const vec_t *df_ptr, + const vec_t *advVel_ptr, + const std::vector<vec_t, allocator<vec_t>> &tmpIn, + std::vector<vec_t, allocator<vec_t>> &tmpOut, + std::vector<vec_t, allocator<vec_t>> &wsp0, + std::vector<vec_t, allocator<vec_t>> &bwd, + std::vector<vec_t, allocator<vec_t>> &deriv0, + std::vector<vec_t, allocator<vec_t>> &deriv1) + { + // Step 1: BwdTrans + // Input: tmpIn = uCoeff + // Output: bwd = uPhys xi + BwdTrans2DKernel<SHAPE_TYPE>(nm0, nm1, nq0, nq1, correct, tmpIn, + this->m_bdata[0], this->m_bdata[1], wsp0, + bwd); + + // Step 2: take 
STD derivatives in collapsed coordinate space + // Input: bwd = uPhys xi (const) + // Output: deriv0 = du/dxi_1 + // deriv1 = du/dxi_2 + PhysDerivTensor2DKernel(nq0, nq1, bwd, this->m_D[0], this->m_D[1], + deriv0, deriv1); + + // Step 3: Advect solution by advection velocities (varcoeffs) + // Input: deriv0 = du/dxi_1, + // deriv1 = du/dxi_2 + // Output: bwd += V \cdot \nabla_x u + Advection2DKernel<SHAPE_TYPE, DEFORMED>(nq0, nq1, advVel_ptr, df_ptr, + this->m_h0, this->m_h1, deriv0, + deriv1, bwd); + + // Step 4: Inner product for mass + advection matrix operation + // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const) + // Output: wsp0 = temporary + // tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x + // uPhys + uPhys) phi + IProduct2DKernel<SHAPE_TYPE, true, false, DEFORMED>( + nm0, nm1, nq0, nq1, correct, bwd, this->m_bdata[0], + this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, tmpOut, + m_lambda); + + // Step 5: Evaluate local derivatives with laplacian metrics + // Input: + // deriv0 = du/dxi_1, + // deriv1 = du/dxi_2 + // Output: deriv0 = (J^2 \cdot du/dxi)[0], + // deriv1 = (J^2 \cdot du/dxi)[1] + DiffusionCoeff2DKernel<SHAPE_TYPE, DEFORMED>( + nq0, nq1, this->m_isConstVarDiff, this->m_constVarDiff, + this->m_isVarDiff, this->m_varD00, this->m_varD01, this->m_varD11, + df_ptr, this->m_h0, this->m_h1, deriv0, deriv1); + + // Inner product with derivative basis xi_1 + // Input: deriv0 = (J^2 \cdot du/dxi)[0] (const) + // Output: wsp0 = temporary, + // tmpOut += \int_\Omega X dphi/dx (append) + IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nq0, nq1, correct, deriv0, this->m_dbdata[0], + this->m_bdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, + tmpOut); + + // Inner product with derivative basis xi_2 + // Input: deriv1 = (J^2 \cdot du/dxi)[1] (const) + // Output: wsp0 = temporary, + // tmpOut += \int_\Omega X dphi/dy (append) + IProduct2DKernel<SHAPE_TYPE, false, true, DEFORMED>( + nm0, nm1, nq0, nq1, correct, deriv1, this->m_bdata[0], + this->m_dbdata[1], this->m_w[0], this->m_w[1], jac_ptr, wsp0, + tmpOut); + } + #elif defined(SHAPE_DIMENSION_3D) // Non-size based operator. 
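The rewritten loops also drop the old per-iteration indexing (`jac_ptr = &((*this->m_jac)[e])`, `df_ptr = &((*this->m_df)[e * dfSize])`) in favour of incremental strides selected with `if constexpr`, so the untaken geometry branch disappears at compile time. A reduced sketch of that dispatch; `advance` is a hypothetical helper, not part of the MR:

```cpp
#include <cstddef>

// Per-block pointer advance: regular geometry keeps one Jacobian value
// and ndf derivative factors per block, while deformed geometry stores
// one set per quadrature point (what the dfSize bookkeeping encodes).
template <bool DEFORMED, typename Vec>
void advance(const Vec *&jac_ptr, const Vec *&df_ptr, std::size_t nqTot,
             std::size_t ndf)
{
    if constexpr (DEFORMED)
    {
        jac_ptr += nqTot;      // per-point Jacobian
        df_ptr += ndf * nqTot; // per-point derivative factors
    }
    else
    {
        ++jac_ptr; // constant Jacobian for the whole block
        df_ptr += ndf;
    }
}
```

Since `DEFORMED` is a template parameter, `if constexpr` guarantees the hot loop carries no runtime branch for the geometry distinction.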
@@ -457,94 +447,64 @@ struct LinearAdvectionDiffusionReactionTemplate std::vector<vec_t, allocator<vec_t>> deriv0(nqTot), deriv1(nqTot), deriv2(nqTot); - const vec_t *jac_ptr = {}; - const vec_t *df_ptr = {}; - const vec_t *advVel_ptr = {}; + // temporary aligned storage for local fields + NekDouble *locField = static_cast<NekDouble *>(::operator new[]( + nmBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment))); + + const vec_t *jac_ptr = &((*this->m_jac)[0]); + const vec_t *df_ptr = &((*this->m_df)[0]); + const vec_t *advVel_ptr = &((*this->m_advVel)[0]); // Get size of derivative factor block - auto dfSize = ndf; - if (DEFORMED) - { - dfSize *= nqTot; - } - auto advVelSize = nvel * nqTot; + const auto dfSize = ndf * nqTot; + const auto advVelSize = nvel * nqTot; - for (int e = 0; e < this->m_nBlocks; e++) + for (int e = 0; e < this->m_nBlocks - 1; e++) { - df_ptr = &((*this->m_df)[e * dfSize]); - advVel_ptr = &((*this->m_advVel)[e * advVelSize]); + // Load data to aligned storage and interleave it + // std::copy(inptr, inptr + nmBlocks, locField); + // load_interleave(locField, m_nmTot, tmpIn); + load_unalign_interleave(inptr, m_nmTot, tmpIn); + + fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr, + df_ptr, advVel_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2, + bwd, deriv0, deriv1, deriv2); - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); + // de-interleave and store data + // deinterleave_store(tmpOut, m_nmTot, locField); + // std::copy(locField, locField + nmBlocks, outptr); + deinterleave_unalign_store(tmpOut, m_nmTot, outptr); - if (!DEFORMED) + inptr += nmBlocks; + outptr += nmBlocks; + advVel_ptr += advVelSize; + if constexpr (DEFORMED) { - jac_ptr = &((*this->m_jac)[e]); + jac_ptr += nqTot; + df_ptr += dfSize; } else { - jac_ptr = &((*this->m_jac)[e * nqTot]); + ++jac_ptr; + df_ptr += ndf; } + } + // last block + { + int acturalSize = nmBlocks - this->m_nPads * m_nmTot; + std::copy(inptr, inptr + acturalSize, locField); + load_interleave(locField, m_nmTot, tmpIn); - // Step 1: BwdTrans - BwdTrans3DKernel<SHAPE_TYPE>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], - this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, bwd); - - // Step 2: take STD derivatives in collapsed coordinate space - PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0], - this->m_D[1], this->m_D[2], deriv0, deriv1, - deriv2); - - // Step 3: Advect solution by advection velocities (varcoeffs) - // Input: deriv0 = du/dxi_1, - // deriv1 = du/dxi_2 - // deriv2 = du/dxi_3 - // Output: bwd += V \cdot \nabla_x u - Advection3DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, nq2, advVel_ptr, df_ptr, this->m_h0, this->m_h1, - this->m_h2, this->m_h3, deriv0, deriv1, deriv2, bwd); - - // Step 4: Inner product for mass + advection matrix operation - // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const) - // Output: wsp0/1/2 = temporary - // tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x - // uPhys + uPhys) phi - IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0], - this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1], - this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda); - - // Step 5: Evaluate local derivatives with laplacian metrics - DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>( - nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff, - this->m_isVarDiff, this->m_varD00, this->m_varD01, - this->m_varD11, this->m_varD02, this->m_varD12, this->m_varD22, - df_ptr, 
this->m_h0, this->m_h1, this->m_h2, this->m_h3, deriv0,
-                deriv1, deriv2);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0,
-                this->m_dbdata[0], this->m_bdata[1], this->m_bdata[2],
-                this->m_w[0], this->m_w[1], this->m_w[2], jac_ptr, wsp0, wsp1,
-                wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0],
-                this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, advVel_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2,
+                          bwd, deriv0, deriv1, deriv2);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + acturalSize, outptr);
         }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }

     // Size based template version.
@@ -578,96 +538,128 @@ struct LinearAdvectionDiffusionReactionTemplate
         std::vector<vec_t, allocator<vec_t>> deriv0(nqTot), deriv1(nqTot),
             deriv2(nqTot);

-        const vec_t *jac_ptr = {};
-        const vec_t *df_ptr = {};
-        const vec_t *advVel_ptr = {};
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqTot * vec_t::width];
+
+        const vec_t *jac_ptr = &((*this->m_jac)[0]);
+        const vec_t *df_ptr = &((*this->m_df)[0]);
+        const vec_t *advVel_ptr = &((*this->m_advVel)[0]);

         // Get size of derivative factor block
-        auto dfSize = ndf;
-        if (DEFORMED)
-        {
-            dfSize *= nqTot;
-        }
-        auto advVelSize = nvel * nqTot;
+        constexpr auto dfSize = ndf * nqTot;
+        constexpr auto advVelSize = nvel * nqTot;

-        for (int e = 0; e < this->m_nBlocks; e++)
+        for (int e = 0; e < this->m_nBlocks - 1; e++)
         {
-            df_ptr = &((*this->m_df)[e * dfSize]);
-            advVel_ptr = &((*this->m_advVel)[e * advVelSize]);
+            // Interleave data, loading it directly from unaligned storage
+            load_unalign_interleave(inptr, m_nmTot, tmpIn);

-            // Load and transpose data
-            load_interleave(inptr, m_nmTot, tmpIn);
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, advVel_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2,
+                          bwd, deriv0, deriv1, deriv2);
+
+            // De-interleave data, storing it directly to unaligned storage
+            deinterleave_unalign_store(tmpOut, m_nmTot, outptr);

-            if (!DEFORMED)
+            inptr += nmBlocks;
+            outptr += nmBlocks;
+            advVel_ptr += advVelSize;
+            if constexpr (DEFORMED)
             {
-                jac_ptr = &((*this->m_jac)[e]);
+                jac_ptr += nqTot;
+                df_ptr += dfSize;
             }
             else
             {
-                jac_ptr = &((*this->m_jac)[e * nqTot]);
+                ++jac_ptr;
+                df_ptr += ndf;
             }
+        }
+        // last block
+        {
+            int acturalSize = nmBlocks - this->m_nPads * m_nmTot;
+            std::copy(inptr, inptr + acturalSize, locField);
+            load_interleave(locField, m_nmTot, tmpIn);

-            // Step 1: BwdTrans
-            BwdTrans3DKernel<SHAPE_TYPE>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, bwd);
-
-            // Step 2: take STD derivatives in collapsed coordinate space
-            PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0],
-                                    this->m_D[1], this->m_D[2], deriv0, deriv1,
-                                    deriv2);
-
-            // Step 3: Advect solution by advection velocities (varcoeffs)
-            // Input: deriv0 = du/dxi_1,
-            //        deriv1 = du/dxi_2
-            //        deriv2 = du/dxi_3
-            // Output: bwd += V \cdot \nabla_x u
-            Advection3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, advVel_ptr, df_ptr, this->m_h0, this->m_h1,
-                this->m_h2, this->m_h3, deriv0, deriv1, deriv2, bwd);
-
-            // Step 4: Inner product for mass + advection matrix operation
-            // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const)
-            // Output: wsp0/1/2 = temporary
-            //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x
-            //         uPhys + uPhys) phi
-            IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0],
-                this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda);
-
-            // Step 5: Evaluate local derivatives with laplacian metrics
-            DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>(
-                nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff,
-                this->m_isVarDiff, this->m_varD00, this->m_varD01,
-                this->m_varD11, this->m_varD02, this->m_varD12, this->m_varD22,
-                df_ptr, this->m_h0, this->m_h1, this->m_h2, this->m_h3, deriv0,
-                deriv1, deriv2);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0,
-                this->m_dbdata[0], this->m_bdata[1], this->m_bdata[2],
-                this->m_w[0], this->m_w[1], this->m_w[2], jac_ptr, wsp0, wsp1,
-                wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0],
-                this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
-
-            IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
-                nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0],
-                this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
-                this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+            fusedKernel3D(nm0, nm1, nm2, nq0, nq1, nq2, correct, jac_ptr,
+                          df_ptr, advVel_ptr, tmpIn, tmpOut, wsp0, wsp1, wsp2,
+                          bwd, deriv0, deriv1, deriv2);

             // de-interleave and store data
-            deinterleave_store(tmpOut, m_nmTot, outptr);
-
-            inptr += nmBlocks;
-            outptr += nmBlocks;
+            deinterleave_store(tmpOut, m_nmTot, locField);
+            std::copy(locField, locField + acturalSize, outptr);
         }
     }

+    NEK_FORCE_INLINE void fusedKernel3D(
+        const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0,
+        const size_t nq1, const size_t nq2, const bool correct,
+        const vec_t *jac_ptr, const vec_t *df_ptr, const vec_t *advVel_ptr,
+        const std::vector<vec_t, allocator<vec_t>> &tmpIn,
+        std::vector<vec_t, allocator<vec_t>> &tmpOut,
+        std::vector<vec_t, allocator<vec_t>> &wsp0,
+        std::vector<vec_t, allocator<vec_t>> &wsp1,
+        std::vector<vec_t, allocator<vec_t>> &wsp2,
+        std::vector<vec_t, allocator<vec_t>> &bwd,
+        std::vector<vec_t, allocator<vec_t>> &deriv0,
+        std::vector<vec_t, allocator<vec_t>> &deriv1,
+        std::vector<vec_t, allocator<vec_t>> &deriv2)
+    {
+        // Step 1: BwdTrans
+        BwdTrans3DKernel<SHAPE_TYPE>(nm0, nm1, nm2, nq0, nq1, nq2, correct,
+                                     tmpIn, this->m_bdata[0], this->m_bdata[1],
+                                     this->m_bdata[2], wsp0, wsp1, bwd);
+
+        // Step 2: take STD derivatives in collapsed coordinate space
+        PhysDerivTensor3DKernel(nq0, nq1, nq2, bwd, this->m_D[0], this->m_D[1],
+                                this->m_D[2], deriv0, deriv1, deriv2);
+
+        // Step 3: Advect solution by advection velocities (varcoeffs)
+        // Input: deriv0 = du/dxi_1,
+        //        deriv1 = du/dxi_2
+        //        deriv2 = du/dxi_3
+        // Output: bwd += V \cdot \nabla_x u
+        Advection3DKernel<SHAPE_TYPE, DEFORMED>(
+            nq0, nq1, nq2, advVel_ptr, df_ptr, this->m_h0, this->m_h1,
+            this->m_h2, this->m_h3, deriv0, deriv1, deriv2, bwd);
+
+        // Step 4: Inner product for mass + advection matrix operation
+        // Input: bwd = 1/lambda * V \cdot \nabla_x uPhys + uPhys (const)
+        // Output: wsp0/1/2 = temporary
+        //         tmpOut = lambda \int_\Omega (1/lambda*V \cdot \nabla_x
+        //         uPhys + uPhys) phi
+        IProduct3DKernel<SHAPE_TYPE, true, false, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, bwd, this->m_bdata[0],
+            this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut, m_lambda);
+
+        // Step 5: Evaluate local derivatives with Laplacian metrics
+        DiffusionCoeff3DKernel<SHAPE_TYPE, DEFORMED>(
+            nq0, nq1, nq2, this->m_isConstVarDiff, this->m_constVarDiff,
+            this->m_isVarDiff, this->m_varD00, this->m_varD01, this->m_varD11,
+            this->m_varD02, this->m_varD12, this->m_varD22, df_ptr, this->m_h0,
+            this->m_h1, this->m_h2, this->m_h3, deriv0, deriv1, deriv2);
+
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv0, this->m_dbdata[0],
+            this->m_bdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv1, this->m_bdata[0],
+            this->m_dbdata[1], this->m_bdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+
+        IProduct3DKernel<SHAPE_TYPE, false, true, DEFORMED>(
+            nm0, nm1, nm2, nq0, nq1, nq2, correct, deriv2, this->m_bdata[0],
+            this->m_bdata[1], this->m_dbdata[2], this->m_w[0], this->m_w[1],
+            this->m_w[2], jac_ptr, wsp0, wsp1, wsp2, tmpOut);
+    }
+
 #endif // SHAPE_DIMENSION

 private:
diff --git a/library/MatrixFreeOps/Operator.hpp b/library/MatrixFreeOps/Operator.hpp
index 80f9b737bc0c2117cea0eaac06f86d26f665486f..22577425a0a2eda67294b079900d3526dbbe97a5 100644
--- a/library/MatrixFreeOps/Operator.hpp
+++ b/library/MatrixFreeOps/Operator.hpp
@@ -472,13 +472,18 @@ protected:
     Helper(std::vector<LibUtilities::BasisSharedPtr> basis, int nElmt)
         : Operator()
    {
-        // Sanity check: no padding yet!
-        ASSERTL1(nElmt % vec_t::width == 0,
-                 "Number of elements not divisible by vector "
-                 "width, padding not yet implemented.");
-
-        // Calculate number of 'blocks', i.e. meta-elements
-        m_nBlocks = nElmt / vec_t::width;
+        if (nElmt % vec_t::width == 0) // No padding needed or already padded
+        {
+            // Calculate number of 'blocks', i.e. meta-elements
+            m_nBlocks = nElmt / vec_t::width;
+            m_nPads = 0;
+        }
+        else // Need padding internally
+        {
+            // One extra, partially filled block holds the remaining elements
+            m_nBlocks = nElmt / vec_t::width + 1;
+            m_nPads = vec_t::width - (nElmt % vec_t::width);
+        }

         // Depending on element dimension, set up basis information, quadrature,
         // etc, inside vectorised environment.
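The block/pad arithmetic in the Helper constructor above is the invariant every operator below relies on. A minimal standalone sketch (not Nektar++ code; `width` stands in for `vec_t::width`, and `computeBlocks` is a hypothetical name):

    #include <cassert>

    // Hypothetical helper mirroring the constructor logic above: nElmt
    // elements are grouped into SIMD blocks of 'width' lanes; if nElmt is not
    // divisible by 'width', one extra block is used and nPads of its lanes
    // are padding.
    void computeBlocks(int nElmt, int width, int &nBlocks, int &nPads)
    {
        if (nElmt % width == 0)
        {
            nBlocks = nElmt / width; // every block completely filled
            nPads   = 0;
        }
        else
        {
            nBlocks = nElmt / width + 1;       // one extra, partial block
            nPads   = width - (nElmt % width); // unused lanes in that block
        }
    }

    int main()
    {
        int nBlocks = 0, nPads = 0;
        computeBlocks(10, 4, nBlocks, nPads); // e.g. 10 elements, width 4
        assert(nBlocks == 3 && nPads == 2);   // blocks: 4 + 4 + (2 real, 2 pad)
        return 0;
    }

Note that m_nBlocks * vec_t::width - m_nPads == nElmt always holds, which is what the actual-size computation in each operator's last-block handling depends on.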
@@ -552,6 +557,7 @@ protected:
     }

     int m_nBlocks;
+    int m_nPads;
     std::array<int, DIM> m_nm, m_nq;
     std::array<std::vector<vec_t, tinysimd::allocator<vec_t>>, DIM> m_bdata;
     std::array<std::vector<vec_t, tinysimd::allocator<vec_t>>, DIM> m_dbdata;
diff --git a/library/MatrixFreeOps/PhysDeriv.h b/library/MatrixFreeOps/PhysDeriv.h
index 75afbfafba4063f7cec9a9a48f60797b31ddb93b..49b0d16570ac01879eb6f696bf587c66ec59503c 100644
--- a/library/MatrixFreeOps/PhysDeriv.h
+++ b/library/MatrixFreeOps/PhysDeriv.h
@@ -137,7 +137,7 @@ struct PhysDerivTemplate
     }

     // Call 1D kernel
-    const vec_t *df_ptr = {};
+    const vec_t *df_ptr = &((*this->m_df)[0]);
     vec_t df_tmp[max_ndf] = {}; // max_ndf is a constexpr

     std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpOut[max_ndf];
@@ -149,12 +149,16 @@ struct PhysDerivTemplate
         out_ptr[d] = &output[d][0];
     }

-    for (int e = 0; e < this->m_nBlocks; ++e)
-    {
-        df_ptr = &((*this->m_df)[dfSize * e]);
+    // temporary aligned storage for local fields
+    NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+        nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));

-        // Load and transpose data
-        load_interleave(inptr, nqTot, tmpIn);
+    for (int e = 0; e < this->m_nBlocks - 1; ++e)
+    {
+        // Interleave data, loading it directly from unaligned storage
+        load_unalign_interleave(inptr, nqTot, tmpIn);

         // Get the basic derivative
         PhysDerivTensor1DKernel(nq0, tmpIn, this->m_D[0], tmpOut[0]);
@@ -165,12 +169,38 @@ struct PhysDerivTemplate
         // De-interleave and store data
         for (int d = 0; d < ndf; ++d)
         {
-            deinterleave_store(tmpOut[d], nqTot, out_ptr[d]);
+            // De-interleave data, storing it directly to unaligned storage
+            deinterleave_unalign_store(tmpOut[d], nqTot, out_ptr[d]);
             out_ptr[d] += nqBlocks;
         }

         inptr += nqBlocks;
+        df_ptr += dfSize;
+    }
+    // last block
+    {
+        int actualSize = nqBlocks - this->m_nPads * nqTot;
+        std::copy(inptr, inptr + actualSize, locField);
+        load_interleave(locField, nqTot, tmpIn);
+
+        // Get the basic derivative
+        PhysDerivTensor1DKernel(nq0, tmpIn, this->m_D[0], tmpOut[0]);
+
+        // Calculate physical derivative
+        PhysDeriv1DKernel<DEFORMED>(nq0, ndf, df_ptr, df_tmp, tmpOut);
+
+        // De-interleave and store data
+        for (int d = 0; d < ndf; ++d)
+        {
+            deinterleave_store(tmpOut[d], nqTot, locField);
+            std::copy(locField, locField + actualSize, out_ptr[d]);
+        }
+    }
+    // free aligned memory
+    ::operator delete[](locField, std::align_val_t(vec_t::alignment));
 }

 // Size based template version.
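The runtime-sized operators above cannot put the scratch buffer on the stack (its length is not a compile-time constant, and variable-length arrays are not standard C++), so they pair the C++17 aligned forms of `::operator new[]` and `::operator delete[]`. A minimal sketch of that allocation pattern, with hypothetical sizes standing in for `vec_t::alignment` and `nqBlocks`:

    #include <cstddef>
    #include <new>

    int main()
    {
        constexpr std::size_t alignment = 64;  // stands in for vec_t::alignment
        constexpr std::size_t nqBlocks  = 256; // stands in for nqTot * width

        // C++17 aligned allocation: raw storage with guaranteed alignment.
        double *locField = static_cast<double *>(::operator new[](
            nqBlocks * sizeof(double), std::align_val_t(alignment)));

        locField[0] = 1.0; // scratch buffer would be used between kernels here

        // The alignment must be repeated so that the matching aligned
        // deallocation function is called.
        ::operator delete[](locField, std::align_val_t(alignment));
        return 0;
    }

The size-based template versions can instead declare `alignas(vec_t::alignment) NekDouble locField[nqBlocks];` on the stack, because there `nqBlocks` is a compile-time constant.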
@@ -202,7 +232,7 @@ struct PhysDerivTemplate
         PhysDeriv1DWorkspace<SHAPE_TYPE>(nq0);

         // Call 1D kernel
-        const vec_t *df_ptr = {};
+        const vec_t *df_ptr = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf] = {}; // max_ndf is a constexpr

         std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpOut[max_ndf];
@@ -214,12 +244,15 @@ struct PhysDerivTemplate
             out_ptr[d] = &output[d][0];
         }

-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];

-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Interleave data, loading it directly from unaligned storage
+            load_unalign_interleave(inptr, nqTot, tmpIn);

             // Get the basic derivative
             PhysDerivTensor1DKernel(nq0, tmpIn, this->m_D[0], tmpOut[0]);
@@ -230,11 +263,35 @@ struct PhysDerivTemplate
             // De-interleave and store data
             for (int d = 0; d < ndf; ++d)
             {
-                deinterleave_store(tmpOut[d], nqTot, out_ptr[d]);
+                // De-interleave data, storing it directly to unaligned storage
+                deinterleave_unalign_store(tmpOut[d], nqTot, out_ptr[d]);
                 out_ptr[d] += nqBlocks;
             }

             inptr += nqBlocks;
+            df_ptr += dfSize;
+        }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Get the basic derivative
+            PhysDerivTensor1DKernel(nq0, tmpIn, this->m_D[0], tmpOut[0]);
+
+            // Calculate physical derivative
+            PhysDeriv1DKernel<DEFORMED>(nq0, ndf, df_ptr, df_tmp, tmpOut);
+
+            // De-interleave and store data
+            for (int d = 0; d < ndf; ++d)
+            {
+                deinterleave_store(tmpOut[d], nqTot, locField);
+                std::copy(locField, locField + actualSize, out_ptr[d]);
+            }
+        }
     }

@@ -279,15 +336,19 @@ struct PhysDerivTemplate
             dfSize *= nqTot;
         }

-        const vec_t *df_ptr = {};
+        const vec_t *df_ptr = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf] = {}; // max_ndf is a constexpr

-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));

-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Interleave data, loading it directly from unaligned storage
+            load_unalign_interleave(inptr, nqTot, tmpIn);

             // Results written to out_d0, out_d1
             PhysDerivTensor2DKernel(nq0, nq1, tmpIn, this->m_D[0], this->m_D[1],
@@ -297,14 +358,40 @@ struct PhysDerivTemplate
                 this->m_Z[1], df_ptr, df_tmp, tmpOut);

             inptr += nqBlocks;
-
+            df_ptr += dfSize;
             // de-interleave and store data
             for (int d = 0; d < outdim; ++d)
             {
-                deinterleave_store(tmpOut[d], nqTot, out_ptr[d]);
+                deinterleave_unalign_store(tmpOut[d], nqTot, out_ptr[d]);
                 out_ptr[d] += nqBlocks;
             }
         }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Results written to out_d0, out_d1
+            PhysDerivTensor2DKernel(nq0, nq1, tmpIn, this->m_D[0], this->m_D[1],
+                                    tmpOut[0], tmpOut[1]);
+            // Calculate physical derivative
+            PhysDeriv2DKernel<DEFORMED>(nq0, nq1, outdim, this->m_Z[0],
+                                        this->m_Z[1], df_ptr, df_tmp, tmpOut);
+
+            // De-interleave and store data
+            for (int d = 0; d < outdim; ++d)
+            {
+                deinterleave_store(tmpOut[d], nqTot, locField);
+                std::copy(locField, locField + actualSize, out_ptr[d]);
+            }
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }

     // Size based template version.
@@ -344,15 +431,18 @@ struct PhysDerivTemplate
             dfSize *= nqTot;
         }

-        const vec_t *df_ptr = {};
+        const vec_t *df_ptr = &((*this->m_df)[0]);
         vec_t df_tmp[max_ndf] = {}; // max_ndf is a constexpr

-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];

-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Interleave data, loading it directly from unaligned storage
+            load_unalign_interleave(inptr, nqTot, tmpIn);

             // Results written to out_d0, out_d1
             PhysDerivTensor2DKernel(nq0, nq1, tmpIn, this->m_D[0], this->m_D[1],
@@ -362,14 +452,38 @@ struct PhysDerivTemplate
                 this->m_Z[1], df_ptr, df_tmp, tmpOut);

             inptr += nqBlocks;
-
+            df_ptr += dfSize;
             // de-interleave and store data
             for (int d = 0; d < outdim; ++d)
             {
-                deinterleave_store(tmpOut[d], nqTot, out_ptr[d]);
+                deinterleave_unalign_store(tmpOut[d], nqTot, out_ptr[d]);
                 out_ptr[d] += nqBlocks;
             }
         }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Results written to out_d0, out_d1
+            PhysDerivTensor2DKernel(nq0, nq1, tmpIn, this->m_D[0], this->m_D[1],
+                                    tmpOut[0], tmpOut[1]);
+            // Calculate physical derivative
+            PhysDeriv2DKernel<DEFORMED>(nq0, nq1, outdim, this->m_Z[0],
+                                        this->m_Z[1], df_ptr, df_tmp, tmpOut);
+
+            // De-interleave and store data
+            for (int d = 0; d < outdim; ++d)
+            {
+                deinterleave_store(tmpOut[d], nqTot, locField);
+                std::copy(locField, locField + actualSize, out_ptr[d]);
+            }
+        }
     }

 #elif defined(SHAPE_DIMENSION_3D)

@@ -409,15 +523,19 @@ struct PhysDerivTemplate
         std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size),
             tmpIn(nqTot), tmpd0(nqTot), tmpd1(nqTot), tmpd2(nqTot);

-        const vec_t *df_ptr = {};
+        const vec_t *df_ptr = &((*this->m_df)[0]);
         vec_t df_tmp[ndf] = {}; // ndf is a constexpr

-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        NekDouble *locField = static_cast<NekDouble *>(::operator new[](
+            nqBlocks * sizeof(NekDouble), std::align_val_t(vec_t::alignment)));

-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Interleave data, loading it directly from unaligned storage
+            load_unalign_interleave(inptr, nqTot, tmpIn);

             // Results written to out_d0, out_d1, out_d2
             PhysDerivTensor3DKernel(nq0, nq1, nq2, tmpIn, this->m_D[0],
@@ -429,15 +547,41 @@
                 df_tmp, wsp0, wsp1, tmpd0, tmpd1, tmpd2);

             // de-interleave and store data
-            deinterleave_store(tmpd0, nqTot, outptr_d0);
-            deinterleave_store(tmpd1, nqTot, outptr_d1);
-            deinterleave_store(tmpd2, nqTot, outptr_d2);
+            deinterleave_unalign_store(tmpd0, nqTot, outptr_d0);
+            deinterleave_unalign_store(tmpd1, nqTot, outptr_d1);
+            deinterleave_unalign_store(tmpd2, nqTot, outptr_d2);

             inptr += nqBlocks;
+            df_ptr += dfSize;
             outptr_d0 += nqBlocks;
             outptr_d1 += nqBlocks;
             outptr_d2 += nqBlocks;
         }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Results written to out_d0, out_d1, out_d2
+            PhysDerivTensor3DKernel(nq0, nq1, nq2, tmpIn, this->m_D[0],
+                                    this->m_D[1], this->m_D[2], tmpd0, tmpd1,
+                                    tmpd2);
+            // Calculate physical derivative
+            PhysDeriv3DKernel<DEFORMED>(
+                nq0, nq1, nq2, this->m_Z[0], this->m_Z[1], this->m_Z[2], df_ptr,
+                df_tmp, wsp0, wsp1, tmpd0, tmpd1, tmpd2);
+
+            // De-interleave and store data
+            deinterleave_store(tmpd0, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d0);
+            deinterleave_store(tmpd1, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d1);
+            deinterleave_store(tmpd2, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d2);
+        }
+        // free aligned memory
+        ::operator delete[](locField, std::align_val_t(vec_t::alignment));
     }

     // Size based template version.
@@ -472,15 +616,18 @@ struct PhysDerivTemplate
         std::vector<vec_t, allocator<vec_t>> tmpIn(nqTot), tmpd0(nqTot),
             tmpd1(nqTot), tmpd2(nqTot), wsp0(wsp0Size), wsp1(wsp1Size);

-        const vec_t *df_ptr = {};
+        const vec_t *df_ptr = &((*this->m_df)[0]);
         vec_t df_tmp[ndf] = {}; // ndf is a constexpr

-        for (int e = 0; e < this->m_nBlocks; ++e)
-        {
-            df_ptr = &((*this->m_df)[dfSize * e]);
+        // temporary aligned storage for local fields
+        alignas(vec_t::alignment) NekDouble locField[nqBlocks];

-            // Load and transpose data
-            load_interleave(inptr, nqTot, tmpIn);
+        for (int e = 0; e < this->m_nBlocks - 1; ++e)
+        {
+            // Interleave data, loading it directly from unaligned storage
+            load_unalign_interleave(inptr, nqTot, tmpIn);

             // Results written to out_d0, out_d1, out_d2
             PhysDerivTensor3DKernel(nq0, nq1, nq2, tmpIn, this->m_D[0],
@@ -492,15 +639,39 @@ struct PhysDerivTemplate
                 df_tmp, wsp0, wsp1, tmpd0, tmpd1, tmpd2);

             // de-interleave and store data
-            deinterleave_store(tmpd0, nqTot, outptr_d0);
-            deinterleave_store(tmpd1, nqTot, outptr_d1);
-            deinterleave_store(tmpd2, nqTot, outptr_d2);
+            deinterleave_unalign_store(tmpd0, nqTot, outptr_d0);
+            deinterleave_unalign_store(tmpd1, nqTot, outptr_d1);
+            deinterleave_unalign_store(tmpd2, nqTot, outptr_d2);

             inptr += nqBlocks;
+            df_ptr += dfSize;
             outptr_d0 += nqBlocks;
             outptr_d1 += nqBlocks;
             outptr_d2 += nqBlocks;
         }
+        // last block
+        {
+            int actualSize = nqBlocks - this->m_nPads * nqTot;
+            std::copy(inptr, inptr + actualSize, locField);
+            load_interleave(locField, nqTot, tmpIn);
+
+            // Results written to out_d0, out_d1, out_d2
+            PhysDerivTensor3DKernel(nq0, nq1, nq2, tmpIn, this->m_D[0],
+                                    this->m_D[1], this->m_D[2], tmpd0, tmpd1,
+                                    tmpd2);
+            // Calculate physical derivative
+            PhysDeriv3DKernel<DEFORMED>(
+                nq0, nq1, nq2, this->m_Z[0], this->m_Z[1], this->m_Z[2], df_ptr,
+                df_tmp, wsp0, wsp1, tmpd0, tmpd1, tmpd2);
+
+            // De-interleave and store data
+            deinterleave_store(tmpd0, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d0);
+            deinterleave_store(tmpd1, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d1);
+            deinterleave_store(tmpd2, nqTot, locField);
+            std::copy(locField, locField + actualSize, outptr_d2);
+        }
     }

 #endif
diff --git a/library/MatrixFreeOps/PhysInterp1DScaled.h b/library/MatrixFreeOps/PhysInterp1DScaled.h
deleted file mode 100644
index 9658bfb22729bdd2a80b5af1e1393736744d737a..0000000000000000000000000000000000000000
--- a/library/MatrixFreeOps/PhysInterp1DScaled.h
+++ /dev/null
@@ -1,359 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//
-// File: PhysInterp1DScaled.h
-//
-// For more information, please see: http://www.nektar.info
-//
-// The MIT License
-//
-// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
-// Department of Aeronautics, Imperial College London (UK), and Scientific
-// Computing and Imaging Institute, University of Utah (USA).
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included
-// in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS IN THE SOFTWARE.
-//
-// Description:
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_H
-#define NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_H
-
-#include <LibUtilities/BasicUtils/ShapeType.hpp>
-#include <LibUtilities/BasicUtils/SharedArray.hpp>
-#include <LibUtilities/Foundations/Basis.h>
-
-#include "Operator.hpp"
-#include "PhysInterp1DScaledKernels.hpp"
-
-// As each opertor has seven shapes over three dimension to get to the
-// "work" each operator uses a series of preprocessor directives based
-// on the dimension and shape type so to limit the code
-// inclusion. This keeps the library size as minimal as possible while
-// removing runtime conditionals.
-
-// The preprocessor directives, SHAPE_DIMENSION_?D and SHAPE_TYPE_*
-// are constructed by CMake in the CMakeLists.txt file which uses an
-// implementation file. See the CMakeLists.txt files for more
-// details.
-namespace Nektar::MatrixFree -{ - -template <LibUtilities::ShapeType SHAPE_TYPE, bool DEFORMED = false> -struct PhysInterp1DScaledTemplate - : public PhysInterp1DScaled, - public Helper<LibUtilities::ShapeTypeDimMap[SHAPE_TYPE]> -{ - PhysInterp1DScaledTemplate(std::vector<LibUtilities::BasisSharedPtr> basis, - int nElmt) - : PhysInterp1DScaled(basis, nElmt), - Helper<LibUtilities::ShapeTypeDimMap[SHAPE_TYPE]>(basis, nElmt) - { - constexpr auto DIM = LibUtilities::ShapeTypeDimMap[SHAPE_TYPE]; - - if (DIM == 1) - { - m_nmTot = LibUtilities::GetNumberOfCoefficients(SHAPE_TYPE, - this->m_nm[0]); - } - else if (DIM == 2) - { - m_nmTot = LibUtilities::GetNumberOfCoefficients( - SHAPE_TYPE, this->m_nm[0], this->m_nm[1]); - } - else if (DIM == 3) - { - m_nmTot = LibUtilities::GetNumberOfCoefficients( - SHAPE_TYPE, this->m_nm[0], this->m_nm[1], this->m_nm[2]); - } - } - - static std::shared_ptr<Operator> Create( - std::vector<LibUtilities::BasisSharedPtr> basis, int nElmt) - { - return std::make_shared<PhysInterp1DScaledTemplate<SHAPE_TYPE>>(basis, - nElmt); - } - - void operator()(const Array<OneD, const NekDouble> &input, - Array<OneD, NekDouble> &output) final - { -#include "SwitchNodesPoints.h" - } - - // There must be separate 1D, 2D, and 3D operators because the - // helper base class is based on the shape type dim. Thus indices - // for the basis must be respected. - - // Further there are duplicate 1D, 2D, and 3D operators so to - // allow for compile time array sizes to be part of the template - // and thus gain loop unrolling. The only difference is the - // location of the size value, in the template or the function: - // foo<int bar>() {} vs foo() { int bar = ... } - -#if defined(SHAPE_DIMENSION_1D) - - // Non-size based operator. - void operator1D(const Array<OneD, const NekDouble> &input, - Array<OneD, NekDouble> &output) - { - const auto nm0 = m_basis[0]->GetNumModes(); - const auto nq0 = m_basis[0]->GetNumPoints(); - - const auto nqTot = nq0; - const auto nqBlocks = nqTot * vec_t::width; - const auto nmBlocks = m_nmTot * vec_t::width; - - auto *inptr = &input[0]; - auto *outptr = &output[0]; - - // Workspace for kernels - also checks preconditions - PhysInterp1DScaled1DWorkspace<SHAPE_TYPE>(nm0, nq0); - - std::vector<vec_t, allocator<vec_t>> tmpIn(m_nmTot), tmpOut(nqTot); - - for (int e = 0; e < this->m_nBlocks; ++e) - { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); - - PhysInterp1DScaled1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, - this->m_bdata[0], tmpOut); - - // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); - - inptr += nmBlocks; - outptr += nqBlocks; - } - } - - // Size based template version. 
- template <int nm0, int nq0> - void operator1D(const Array<OneD, const NekDouble> &input, - Array<OneD, NekDouble> &output) - { - constexpr auto nqTot = nq0; - constexpr auto nqBlocks = nqTot * vec_t::width; - const auto nmBlocks = m_nmTot * vec_t::width; - - auto *inptr = &input[0]; - auto *outptr = &output[0]; - - // Workspace for kernels - also checks preconditions - PhysInterp1DScaled1DWorkspace<SHAPE_TYPE>(nm0, nq0); - - std::vector<vec_t, allocator<vec_t>> tmpIn(m_nmTot), tmpOut(nqTot); - - for (int e = 0; e < this->m_nBlocks; ++e) - { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); - - PhysInterp1DScaled1DKernel<SHAPE_TYPE>(nm0, nq0, tmpIn, - this->m_bdata[0], tmpOut); - - // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); - - inptr += nmBlocks; - outptr += nqBlocks; - } - } - -#elif defined(SHAPE_DIMENSION_2D) - - // Non-size based operator. - void operator2D(const Array<OneD, const NekDouble> &input, - Array<OneD, NekDouble> &output) - { - const auto nm0 = m_basis[0]->GetNumModes(); - const auto nm1 = m_basis[1]->GetNumModes(); - - const auto nq0 = m_basis[0]->GetNumPoints(); - const auto nq1 = m_basis[1]->GetNumPoints(); - - const auto nqTot = nq0 * nq1; - const auto nqBlocks = nqTot * vec_t::width; - const auto nmBlocks = m_nmTot * vec_t::width; - - auto *inptr = &input[0]; - auto *outptr = &output[0]; - const bool correct = - (m_basis[0]->GetBasisType() == LibUtilities::eModified_A); - - // Workspace for kernels - also checks preconditions - size_t wsp0Size = 0; - PhysInterp1DScaled2DWorkspace<SHAPE_TYPE>(nm0, nm1, nq0, nq1, wsp0Size); - - std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(m_nmTot), - tmpOut(nqTot); - - for (int e = 0; e < this->m_nBlocks; ++e) - { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); - - PhysInterp1DScaled2DKernel<SHAPE_TYPE>( - nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0], - this->m_bdata[1], wsp0, tmpOut); - - // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); - - inptr += nmBlocks; - outptr += nqBlocks; - } - } - - // Size based template version. - template <int nm0, int nm1, int nq0, int nq1> - void operator2D(const Array<OneD, const NekDouble> &input, - Array<OneD, NekDouble> &output) - { - constexpr auto nqTot = nq0 * nq1; - constexpr auto nqBlocks = nqTot * vec_t::width; - const auto nmBlocks = m_nmTot * vec_t::width; - - auto *inptr = &input[0]; - auto *outptr = &output[0]; - const bool correct = - (m_basis[0]->GetBasisType() == LibUtilities::eModified_A); - - // Workspace for kernels - also checks preconditions - size_t wsp0Size = 0; - PhysInterp1DScaled2DWorkspace<SHAPE_TYPE>(nm0, nm1, nq0, nq1, wsp0Size); - - std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), tmpIn(m_nmTot), - tmpOut(nqTot); - - for (int e = 0; e < this->m_nBlocks; ++e) - { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); - - PhysInterp1DScaled2DKernel<SHAPE_TYPE>( - nm0, nm1, nq0, nq1, correct, tmpIn, this->m_bdata[0], - this->m_bdata[1], wsp0, tmpOut); - - // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); - - inptr += nmBlocks; - outptr += nqBlocks; - } - } - -#elif defined(SHAPE_DIMENSION_3D) - - // Non-size based operator. 
- void operator3D(const Array<OneD, const NekDouble> &input, - Array<OneD, NekDouble> &output) - { - const auto nm0 = m_basis[0]->GetNumModes(); - const auto nm1 = m_basis[1]->GetNumModes(); - const auto nm2 = m_basis[2]->GetNumModes(); - - const auto nq0 = m_basis[0]->GetNumPoints(); - const auto nq1 = m_basis[1]->GetNumPoints(); - const auto nq2 = m_basis[2]->GetNumPoints(); - - const auto nqTot = nq0 * nq1 * nq2; - const auto nqBlocks = nqTot * vec_t::width; - const auto nmBlocks = m_nmTot * vec_t::width; - - auto *inptr = &input[0]; - auto *outptr = &output[0]; - - const bool correct = - (m_basis[0]->GetBasisType() == LibUtilities::eModified_A); - - // Workspace for kernels - also checks preconditions - size_t wsp0Size = 0, wsp1Size = 0; - PhysInterp1DScaled3DWorkspace<SHAPE_TYPE>(nm0, nm1, nm2, nq0, nq1, nq2, - wsp0Size, wsp1Size); - - std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size), - tmpIn(m_nmTot), tmpOut(nqTot); - - for (int e = 0; e < this->m_nBlocks; ++e) - { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); - - PhysInterp1DScaled3DKernel<SHAPE_TYPE>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], - this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut); - - // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); - - inptr += nmBlocks; - outptr += nqBlocks; - } - } - - // Size based template version. - template <int nm0, int nm1, int nm2, int nq0, int nq1, int nq2> - void operator3D(const Array<OneD, const NekDouble> &input, - Array<OneD, NekDouble> &output) - { - constexpr auto nqTot = nq0 * nq1 * nq2; - constexpr auto nqBlocks = nqTot * vec_t::width; - const auto nmBlocks = m_nmTot * vec_t::width; - - auto *inptr = &input[0]; - auto *outptr = &output[0]; - const bool correct = - (m_basis[0]->GetBasisType() == LibUtilities::eModified_A); - - // Workspace for kernels - also checks preconditions - size_t wsp0Size = 0, wsp1Size = 0; - PhysInterp1DScaled3DWorkspace<SHAPE_TYPE>(nm0, nm1, nm2, nq0, nq1, nq2, - wsp0Size, wsp1Size); - - std::vector<vec_t, allocator<vec_t>> wsp0(wsp0Size), wsp1(wsp1Size), - tmpIn(m_nmTot), tmpOut(nqTot); - - for (int e = 0; e < this->m_nBlocks; ++e) - { - // Load and transpose data - load_interleave(inptr, m_nmTot, tmpIn); - - PhysInterp1DScaled3DKernel<SHAPE_TYPE>( - nm0, nm1, nm2, nq0, nq1, nq2, correct, tmpIn, this->m_bdata[0], - this->m_bdata[1], this->m_bdata[2], wsp0, wsp1, tmpOut); - - // de-interleave and store data - deinterleave_store(tmpOut, nqTot, outptr); - - inptr += nmBlocks; - outptr += nqBlocks; - } - } - -#endif // SHAPE_DIMENSION - -private: - int m_nmTot; -}; - -} // namespace Nektar::MatrixFree - -#endif diff --git a/library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp b/library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp deleted file mode 100644 index 17708156f86fba40c31a4c4a99f55ae5b365a9d4..0000000000000000000000000000000000000000 --- a/library/MatrixFreeOps/PhysInterp1DScaledKernels.hpp +++ /dev/null @@ -1,642 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// -// File: PhysInterp1DScaledKernels.hpp -// -// For more information, please see: http://www.nektar.info -// -// The MIT License -// -// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA), -// Department of Aeronautics, Imperial College London (UK), and Scientific -// Computing and Imaging Institute, University of Utah (USA). 
-// -// Permission is hereby granted, free of charge, to any person obtaining a -// copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included -// in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS IN THE SOFTWARE. -// -// Description: -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_KERNELS_HPP -#define NEKTAR_LIBRARY_MF_PHYSINTERP1DSCALED_KERNELS_HPP - -#include <LibUtilities/BasicUtils/NekInline.hpp> - -namespace Nektar::MatrixFree -{ - -using namespace tinysimd; -using vec_t = simd<NekDouble>; - -// The dimension and shape kernels. NOTE: They are NOT duplicate -// templated version based on the array size like the -// operators. HOWEVER, they are forced to be INLINED. The inlining is -// critical so that when used in the templated version of the operator -// that loop unrolling occurs. - -// The seven shape kernels where the work gets done. -#if defined(SHAPE_TYPE_SEG) - -NEK_FORCE_INLINE static void PhysInterp1DScaledSegKernel( - const size_t nm0, const size_t nq0, - const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - std::vector<vec_t, allocator<vec_t>> &out) -{ - for (int i = 0; i < nq0; ++i) - { - vec_t tmp = in[0] * basis0[i]; // Load 2x - - for (int p = 1; p < nm0; ++p) - { - tmp.fma(in[p], basis0[p * nq0 + i]); // Load 2x - } - - out[i] = tmp; // Store 1x - } -} - -#elif defined(SHAPE_TYPE_TRI) - -NEK_FORCE_INLINE static void PhysInterp1DScaledTriKernel( - const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1, - const bool correct, const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - const std::vector<vec_t, allocator<vec_t>> &basis1, - std::vector<vec_t, allocator<vec_t>> &p_sums, // nm0 - std::vector<vec_t, allocator<vec_t>> &out) -{ - for (int eta1 = 0, eta_idx = 0; eta1 < nq1; ++eta1) - { - for (int p = 0, mode = 0; p < nm0; ++p) - { - vec_t p_sum = 0.0; - - for (int q = 0; q < (nm1 - p); ++q, ++mode) - { - p_sum.fma(basis1[mode * nq1 + eta1], in[mode]); - } - - p_sums[p] = p_sum; // Store 1x - } - - // Already have q_sum at each quadrature point in eta 1 for - // each mode, p. 
From this assemble the tensor produce of - // each quadrature point, eta1 - for (int eta0 = 0; eta0 < nq0; ++eta0, ++eta_idx) - { - vec_t p_sum = 0.0; - for (int p = 0; p < nm0; ++p) - { - p_sum.fma(p_sums[p], basis0[p * nq0 + eta0]); // Load 2x - } - - if (correct) - { - // p_sum += coef * basis0 * basis1 - p_sum.fma(in[1] * basis0[nq0 + eta0], basis1[nq1 + eta1]); - } - - out[eta_idx] = p_sum; - } - } -} - -#elif defined(SHAPE_TYPE_QUAD) - -NEK_FORCE_INLINE static void PhysInterp1DScaledQuadKernel( - const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1, - const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - const std::vector<vec_t, allocator<vec_t>> &basis1, - std::vector<vec_t, allocator<vec_t>> &wsp, // nq0 * nm1 - std::vector<vec_t, allocator<vec_t>> &out) -{ - for (int i = 0, cnt_iq = 0; i < nq0; ++i) - { - for (int q = 0, cnt_pq = 0; q < nm1; ++q, ++cnt_iq) - { - vec_t tmp = in[cnt_pq] * basis0[i]; // Load 2x - ++cnt_pq; - for (int p = 1; p < nm0; ++p, ++cnt_pq) - { - tmp.fma(in[cnt_pq], basis0[p * nq0 + i]); // Load 2x - } - wsp[cnt_iq] = tmp; // Store 1x - } - } - - for (int j = 0, cnt_ij = 0; j < nq1; ++j) - { - for (int i = 0, cnt_iq = 0; i < nq0; ++i, ++cnt_ij) - { - vec_t tmp = wsp[cnt_iq] * basis1[j]; // Load 2x - ++cnt_iq; - for (int q = 1; q < nm1; ++q, ++cnt_iq) - { - tmp.fma(wsp[cnt_iq], basis1[q * nq1 + j]); // Load 2x - } - out[cnt_ij] = tmp; // Store 1x - } - } -} - -#elif defined(SHAPE_TYPE_HEX) - -NEK_FORCE_INLINE static void PhysInterp1DScaledHexKernel( - const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0, - const size_t nq1, const size_t nq2, - const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - const std::vector<vec_t, allocator<vec_t>> &basis1, - const std::vector<vec_t, allocator<vec_t>> &basis2, - std::vector<vec_t, allocator<vec_t>> &sum_irq, // nq0 * nm2 * nm1 - std::vector<vec_t, allocator<vec_t>> &sum_jir, // nq1 * nq0 * nm2 - std::vector<vec_t, allocator<vec_t>> &out) -{ - for (int i = 0, cnt_irq = 0; i < nq0; ++i) - { - for (int r = 0, cnt_rqp = 0; r < nm2; ++r) - { - for (int q = 0; q < nm1; ++q, ++cnt_irq) - { - vec_t tmp = in[cnt_rqp] * basis0[i]; - ++cnt_rqp; - - for (int p = 1; p < nm0; ++p, ++cnt_rqp) - { - tmp.fma(in[cnt_rqp], basis0[p * nq0 + i]); - } - - sum_irq[cnt_irq] = tmp; - } - } - } - - for (int j = 0, cnt_jir = 0; j < nq1; ++j) - { - for (int i = 0, cnt_irq = 0; i < nq0; ++i) - { - for (int r = 0; r < nm2; ++r, ++cnt_jir) - { - vec_t tmp = sum_irq[cnt_irq] * basis1[j]; - ++cnt_irq; - - for (int q = 1; q < nm1; ++q) - { - tmp.fma(sum_irq[cnt_irq++], basis1[q * nq1 + j]); - } - - sum_jir[cnt_jir] = tmp; - } - } - } - - for (int k = 0, cnt_kji = 0; k < nq2; ++k) - { - for (int j = 0, cnt_jir = 0; j < nq1; ++j) - { - for (int i = 0; i < nq0; ++i, ++cnt_kji) - { - vec_t tmp = sum_jir[cnt_jir] * basis2[k]; - ++cnt_jir; - - for (int r = 1; r < nm2; ++r) - { - tmp.fma(sum_jir[cnt_jir++], basis2[r * nq2 + k]); - } - - out[cnt_kji] = tmp; - } - } - } -} - -#elif defined(SHAPE_TYPE_TET) - -NEK_FORCE_INLINE static void PhysInterp1DScaledTetKernel( - const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0, - const size_t nq1, const size_t nq2, const bool correct, - const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - const std::vector<vec_t, allocator<vec_t>> &basis1, - const std::vector<vec_t, allocator<vec_t>> &basis2, - std::vector<vec_t, 
allocator<vec_t>> &fpq, // nm0 * nm1 - std::vector<vec_t, allocator<vec_t>> &fp, // nm0 - std::vector<vec_t, allocator<vec_t>> &out) -{ - for (int k = 0, cnt_kji = 0; k < nq2; ++k) - { - int cnt_pq = 0, mode = 0; - for (int p = 0; p < nm0; ++p) - { - for (int q = 0; q < nm1 - p; ++q, ++cnt_pq) - { - vec_t prod = in[mode] * basis2[k + nq2 * mode]; // Load 2x - ++mode; - - for (int r = 1; r < nm2 - p - q; ++r, ++mode) - { - vec_t inxmm = in[mode]; // Load 1x - prod.fma(inxmm, basis2[k + nq2 * mode]); // Load 1x - } - - fpq[cnt_pq] = prod; // Store 1x - } - - // increment mode in case order1!=order2 - for (int q = nm1 - p; q < nm2 - p; ++q) - { - mode += nm2 - p - q; - } - } - - for (int j = 0; j < nq1; ++j) - { - mode = cnt_pq = 0; - for (int p = 0; p < nm0; ++p) - { - vec_t prod = fpq[cnt_pq] * basis1[mode * nq1 + j]; // Load 2x - ++cnt_pq; - - for (int q = 1; q < nm1 - p; ++q, ++cnt_pq) - { - prod.fma(fpq[cnt_pq], - basis1[(mode + q) * nq1 + j]); // Load 2x - } - - fp[p] = prod; // Store 1x - mode += nm1 - p; - } - - for (int i = 0; i < nq0; ++i, ++cnt_kji) - { - vec_t tmp = basis0[i] * fp[0]; // Load 2x - - for (int p = 1; p < nm0; ++p) - { - tmp.fma(basis0[p * nq0 + i], fp[p]); // Load 2x - } - - if (correct) - { - // top vertex - // - // sum += inarray[1] * base2[nquad2 + k] * ( - // base0[i] * base1[nquad1+j] + - // base0[nquad0+i] * base1[j] + - // base0[nquad0+i] * base1[nquad1+j]); - - vec_t tmp1 = basis0[i] * basis1[nq1 + j]; // Load 2x - tmp1.fma(basis0[nq0 + i], basis1[j]); // Load 2x - tmp1.fma(basis0[nq0 + i], basis1[nq1 + j]); // Load 2x - tmp1 = tmp1 * basis2[nq2 + k]; // Load 1x - - vec_t inarray1 = in[1]; // Load 1x - tmp.fma(tmp1, inarray1); - - // bottom vertex - // - // sum += inarray[order2] * base2[k] * ( - // base0[nquad0+i] * base1[nquad1+j]); - tmp1 = basis0[nq0 + i] * basis1[nq1 + j]; // Load 2x - tmp1 = tmp1 * basis2[k]; // Load 1x - inarray1 = in[nm2]; // Load 1x - tmp.fma(inarray1, tmp1); - - // singular edge - for (int r = 1; r < nm2 - 1; ++r) - { - // sum += inarray[order2+r] * base2[(r+1)*nquad2+k] * - // base1[nquad1+j] * base0[nquad0+i]; - tmp1 = basis1[nq1 + j] * basis0[nq0 + i]; // Load 2x - tmp1 = tmp1 * basis2[(r + 1) * nq2 + k]; // Load 1x - inarray1 = in[nm2 + r]; // Load 1x - tmp.fma(inarray1, tmp1); - // multiply by (1-a)/2 - } - } - - out[cnt_kji] = tmp; // Store 1x - } - } - } -} - -#elif defined(SHAPE_TYPE_PRISM) - -NEK_FORCE_INLINE static void PhysInterp1DScaledPrismKernel( - const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0, - const size_t nq1, const size_t nq2, const bool correct, - const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - const std::vector<vec_t, allocator<vec_t>> &basis1, - const std::vector<vec_t, allocator<vec_t>> &basis2, - std::vector<vec_t, allocator<vec_t>> &fpq, // nm0 * nm1 - std::vector<vec_t, allocator<vec_t>> &fp, // nm0 - std::vector<vec_t, allocator<vec_t>> &out) -{ - for (int k = 0, cnt_kji = 0; k < nq2; ++k) - { - int mode_pqr = 0, mode_pq = 0, mode_pr = 0; - for (int p = 0; p < nm0; ++p) - { - for (int q = 0; q < nm1; ++q, ++mode_pq) - { - vec_t prod = 0.0; - for (int r = 0; r < nm2 - p; ++r, ++mode_pqr) - { - vec_t coef = in[mode_pqr]; // Load 1x - prod.fma(coef, basis2[(mode_pr + r) * nq2 + k]); // Load 1x - } - - fpq[mode_pq] = prod; // Store 1x - } - - mode_pr += nm2 - p; - } - - for (int j = 0; j < nq1; ++j) - { - mode_pq = 0; - for (int p = 0; p < nm0; ++p) - { - vec_t prod = 0.0; - for (int q = 0; q < nm1; ++q, ++mode_pq) - { - 
prod.fma(fpq[mode_pq], basis1[q * nq1 + j]); // Load 2x - } - fp[p] = prod; // Store 1x - } - - for (int i = 0; i < nq0; ++i, ++cnt_kji) - { - - vec_t val_kji = 0.0; - for (int p = 0; p < nm0; ++p) - { - val_kji.fma(fp[p], basis0[p * nq0 + i]); // Load 2x - } - - if (correct) - { - vec_t basis_2 = basis2[nq2 + k]; // Load 1x - vec_t basis_0 = basis0[nq0 + i]; // Load 1x - - for (int q = 0; q < nm1; ++q) - { - vec_t coef_0q1 = in[q * nm2 + 1]; // Load 1x - vec_t basis_1 = basis1[q * nq1 + j]; // Load 1x - val_kji.fma(basis_2 * basis_1, basis_0 * coef_0q1); - } - } - out[cnt_kji] = val_kji; // store 1x - } - } - } -} - -#elif defined(SHAPE_TYPE_PYR) - -NEK_FORCE_INLINE static void PhysInterp1DScaledPyrKernel( - const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0, - const size_t nq1, const size_t nq2, const bool correct, - const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - const std::vector<vec_t, allocator<vec_t>> &basis1, - const std::vector<vec_t, allocator<vec_t>> &basis2, - std::vector<vec_t, allocator<vec_t>> &fpq, // nm0 * nm1 - std::vector<vec_t, allocator<vec_t>> &fp, // nm0 - std::vector<vec_t, allocator<vec_t>> &out) -{ - for (int k = 0, cnt_kji = 0; k < nq2; ++k) - { - int mode_pqr = 0, mode_pq = 0; - for (int p = 0; p < nm0; ++p) - { - for (int q = 0; q < p; ++q, ++mode_pq) - { - vec_t prod = 0.0; - for (int r = 0; r < nm2 - p; ++r, ++mode_pqr) - { - vec_t coef = in[mode_pqr]; // Load 1x - prod.fma(coef, basis2[mode_pqr * nq2 + k]); // Load 1x - } - fpq[mode_pq] = prod; // Store 1x - } - - for (int q = p; q < nm1; ++q, ++mode_pq) - { - vec_t prod = 0.0; - for (int r = 0; r < nm2 - q; ++r, ++mode_pqr) - { - vec_t coef = in[mode_pqr]; // Load 1x - prod.fma(coef, basis2[mode_pqr * nq2 + k]); // Load 1x - } - - fpq[mode_pq] = prod; // Store 1x - } - - // increment mode in case nm2>nm1 - for (int q = nm1; q < nm2 - p; ++q) - { - mode_pqr += nm2 - q; - } - } - - for (int j = 0; j < nq1; ++j) - { - mode_pq = 0; - for (int p = 0; p < nm0; ++p) - { - vec_t prod = 0.0; - for (int q = 0; q < nm1; ++q, ++mode_pq) - { - prod.fma(fpq[mode_pq], basis1[q * nq1 + j]); // Load 2x - } - fp[p] = prod; // Store 1x - } - - for (int i = 0; i < nq0; ++i, ++cnt_kji) - { - vec_t val_kji = 0.0; - for (int p = 0; p < nm0; ++p) - { - val_kji.fma(fp[p], basis0[p * nq0 + i]); // Load 2x - } - - if (correct) - { - // top vertex - // - // sum += inarray[1] * base2[nquad2 + k] * ( - // base0[i] * base1[nquad1+j] + - // base0[nquad0+i] * base1[j] + - // base0[nquad0+i] * base1[nquad1+j]); - vec_t tmp1 = basis0[i] * basis1[nq1 + j]; // Load 2x - tmp1.fma(basis0[nq0 + i], basis1[j]); // Load 2x - tmp1.fma(basis0[nq0 + i], basis1[nq1 + j]); // Load 2x - tmp1 = tmp1 * basis2[nq2 + k]; // Load 1x - - vec_t inarray1 = in[1]; // Load 1x - val_kji.fma(tmp1, inarray1); - } - out[cnt_kji] = val_kji; // store 1x - } - } - } -} - -#endif // SHAPE_TYPE - -// Workspace - used to dynamically get the workspace size needed for -// temporary memory. 
-#if defined(SHAPE_DIMENSION_1D) - -template <LibUtilities::ShapeType SHAPE_TYPE> -NEK_FORCE_INLINE static void PhysInterp1DScaled1DWorkspace( - [[maybe_unused]] const size_t nm0, [[maybe_unused]] const size_t nq0) - -{ -} - -#elif defined(SHAPE_DIMENSION_2D) - -template <LibUtilities::ShapeType SHAPE_TYPE> -NEK_FORCE_INLINE static void PhysInterp1DScaled2DWorkspace( - [[maybe_unused]] const size_t nm0, [[maybe_unused]] const size_t nm1, - [[maybe_unused]] const size_t nq0, [[maybe_unused]] const size_t nq1, - size_t &wsp0Size) -{ - // Check preconditions - ASSERTL1((SHAPE_TYPE == LibUtilities::ShapeType::Tri && nm0 == nm1 && - nq0 == nq1 + 1) || - (SHAPE_TYPE == LibUtilities::ShapeType::Quad && nm0 == nm1 && - nq0 == nq1), - "PhysInterp1DScaled2DWorkspace: Requires homogenous points."); - -#if defined(SHAPE_TYPE_TRI) - wsp0Size = std::max(wsp0Size, nm0); -#elif defined(SHAPE_TYPE_QUAD) - wsp0Size = std::max(wsp0Size, nm0 * nq0); -#endif -} - -#elif defined(SHAPE_DIMENSION_3D) - -template <LibUtilities::ShapeType SHAPE_TYPE> -NEK_FORCE_INLINE static void PhysInterp1DScaled3DWorkspace( - [[maybe_unused]] const size_t nm0, [[maybe_unused]] const size_t nm1, - [[maybe_unused]] const size_t nm2, [[maybe_unused]] const size_t nq0, - [[maybe_unused]] const size_t nq1, [[maybe_unused]] const size_t nq2, - size_t &wsp0Size, size_t &wsp1Size) -{ - // Check preconditions - ASSERTL1((SHAPE_TYPE == LibUtilities::ShapeType::Hex && nm0 == nm1 && - nm0 == nm2 && nq0 == nq1 && nq0 == nq2) || - (SHAPE_TYPE == LibUtilities::ShapeType::Tet && nm0 == nm1 && - nm0 == nm2 && nq0 == nq1 + 1 && nq0 == nq2 + 1) || - (SHAPE_TYPE == LibUtilities::ShapeType::Pyr && nm0 == nm1 && - nm0 == nm2 && nq0 == nq1 && nq0 == nq2 + 1) || - (SHAPE_TYPE == LibUtilities::ShapeType::Prism && nm0 == nm1 && - nm0 == nm2 && nq0 == nq1 && nq0 == nq2 + 1), - "PhysInterp1DScaled3DWorkspace: Requires homogenous points."); - -#if defined(SHAPE_TYPE_HEX) - wsp0Size = std::max(wsp0Size, nq0 * nm1 * nm2); // nm1 == nm2 - wsp1Size = std::max(wsp1Size, nq0 * nq1 * nm2); // nq0 == nq1 -#elif defined(SHAPE_TYPE_TET) || defined(SHAPE_TYPE_PRISM) || \ - defined(SHAPE_TYPE_PYR) - wsp0Size = std::max(wsp0Size, nm0 * nm1); // nm0 == nm1 == nm2 - wsp1Size = std::max(wsp1Size, nm0); -#endif -} - -#endif // SHAPE_DIMENSION - -// The dimension kernels which select the shape kernel. 
-#if defined(SHAPE_DIMENSION_1D) - -template <LibUtilities::ShapeType SHAPE_TYPE> -NEK_FORCE_INLINE static void PhysInterp1DScaled1DKernel( - const size_t nm0, const size_t nq0, - const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - std::vector<vec_t, allocator<vec_t>> &out) -{ -#if defined(SHAPE_TYPE_SEG) - PhysInterp1DScaledSegKernel(nm0, nq0, in, basis0, out); -#endif -} - -#elif defined(SHAPE_DIMENSION_2D) - -template <LibUtilities::ShapeType SHAPE_TYPE> -NEK_FORCE_INLINE static void PhysInterp1DScaled2DKernel( - const size_t nm0, const size_t nm1, const size_t nq0, const size_t nq1, - [[maybe_unused]] const bool correct, - const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - const std::vector<vec_t, allocator<vec_t>> &basis1, - std::vector<vec_t, allocator<vec_t>> &wsp0, - std::vector<vec_t, allocator<vec_t>> &out) -{ -#if defined(SHAPE_TYPE_TRI) - PhysInterp1DScaledTriKernel(nm0, nm1, nq0, nq1, correct, in, basis0, basis1, - wsp0, out); - -#elif defined(SHAPE_TYPE_QUAD) - PhysInterp1DScaledQuadKernel(nm0, nm1, nq0, nq1, in, basis0, basis1, wsp0, - out); -#endif -} - -#elif defined(SHAPE_DIMENSION_3D) - -template <LibUtilities::ShapeType SHAPE_TYPE> -NEK_FORCE_INLINE static void PhysInterp1DScaled3DKernel( - const size_t nm0, const size_t nm1, const size_t nm2, const size_t nq0, - const size_t nq1, const size_t nq2, [[maybe_unused]] const bool correct, - const std::vector<vec_t, allocator<vec_t>> &in, - const std::vector<vec_t, allocator<vec_t>> &basis0, - const std::vector<vec_t, allocator<vec_t>> &basis1, - const std::vector<vec_t, allocator<vec_t>> &basis2, - std::vector<vec_t, allocator<vec_t>> &wsp0, - std::vector<vec_t, allocator<vec_t>> &wsp1, - std::vector<vec_t, allocator<vec_t>> &out) -{ -#if defined(SHAPE_TYPE_HEX) - PhysInterp1DScaledHexKernel(nm0, nm1, nm2, nq0, nq1, nq2, in, basis0, - basis1, basis2, wsp0, wsp1, out); -#elif defined(SHAPE_TYPE_TET) - PhysInterp1DScaledTetKernel(nm0, nm1, nm2, nq0, nq1, nq2, correct, in, - basis0, basis1, basis2, wsp0, wsp1, out); -#elif defined(SHAPE_TYPE_PRISM) - PhysInterp1DScaledPrismKernel(nm0, nm1, nm2, nq0, nq1, nq2, correct, in, - basis0, basis1, basis2, wsp0, wsp1, out); -#elif defined(SHAPE_TYPE_PYR) - PhysInterp1DScaledPyrKernel(nm0, nm1, nm2, nq0, nq1, nq2, correct, in, - basis0, basis1, basis2, wsp0, wsp1, out); -#endif -} - -#endif // SHAPE_DIMENSION - -} // namespace Nektar::MatrixFree - -#endif diff --git a/solvers/IncNavierStokesSolver/Tests/Perf_ChanFlow_3DH1D_pRef.tst b/solvers/IncNavierStokesSolver/Tests/Perf_ChanFlow_3DH1D_pRef.tst index a7d0fe0a19c4d6120486c60b25fbf805426e9fe8..6e67b585004b41b1dd6d116222989424b86257cf 100644 --- a/solvers/IncNavierStokesSolver/Tests/Perf_ChanFlow_3DH1D_pRef.tst +++ b/solvers/IncNavierStokesSolver/Tests/Perf_ChanFlow_3DH1D_pRef.tst @@ -20,7 +20,7 @@ <value variable="p" tolerance="1e-7">0.0671074</value> </metric> <metric type="ExecutionTime" id="3"> - <value tolerance="8e-1" hostname="42.debian-bullseye-performance-build-and-test">13.3</value> + <value tolerance="8e-1" hostname="42.debian-bullseye-performance-build-and-test">12.942</value> </metric> </metrics> </test>
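Taken together, the traversal pattern this patch installs in every MatrixFree operator is: the first m_nBlocks - 1 blocks read from and write to the caller's arrays directly through the unaligned load/store kernels, and only the final, possibly padded block is staged through the aligned locField scratch buffer so the kernel can still consume a full SIMD block. A standalone sketch of that pattern (not Nektar++ code; the kernel is modelled as a pass-through copy and all names are illustrative):

    #include <algorithm>
    #include <vector>

    void traverse(const double *in, double *out, int nBlocks, int nPads,
                  int nqTot, int width)
    {
        const int blockSize = nqTot * width;         // values per full block
        std::vector<double> scratch(blockSize, 0.0); // stands in for locField

        for (int e = 0; e < nBlocks - 1; ++e)
        {
            // a real operator would load_unalign_interleave, run its kernel
            // and deinterleave_unalign_store here; modelled as a plain copy
            std::copy(in, in + blockSize, out);
            in += blockSize;
            out += blockSize;
        }

        // last block: only actualSize entries correspond to real elements
        const int actualSize = blockSize - nPads * nqTot;
        std::copy(in, in + actualSize, scratch.begin()); // pad-safe load
        // ... kernel consumes the full scratch block, padding lanes included
        std::copy(scratch.begin(), scratch.begin() + actualSize, out);
    }

This is what removes the per-call copying into padded arrays that MatrixFreeOneInOneOut previously performed, and it is the likely source of the small execution-time improvement recorded in the performance test above.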