Commit 33eaba6e authored by Chris Cantwell

Recovery of sub-communicators (in comms with more than one rank) works.

Temporarily disabled the nested SplitComm calls used for strips.
parent e7987788
...@@ -1910,11 +1910,11 @@ namespace Nektar ...@@ -1910,11 +1910,11 @@ namespace Nektar
int nProcSem = m_comm->GetSize() / nProcSm; int nProcSem = m_comm->GetSize() / nProcSm;
m_comm->SplitComm(nProcSm,nProcSem); m_comm->SplitComm(nProcSm,nProcSem);
m_comm->GetColumnComm()->SplitComm(nProcZ/nStripZ,nStripZ); // m_comm->GetColumnComm()->SplitComm(nProcZ/nStripZ,nStripZ);
m_comm->GetColumnComm()->GetColumnComm()->SplitComm( // m_comm->GetColumnComm()->GetColumnComm()->SplitComm(
(nProcY*nProcX),nProcZ/nStripZ); // (nProcY*nProcX),nProcZ/nStripZ);
m_comm->GetColumnComm()->GetColumnComm()->GetColumnComm() // m_comm->GetColumnComm()->GetColumnComm()->GetColumnComm()
->SplitComm(nProcX,nProcY); // ->SplitComm(nProcX,nProcY);
} }
} }
......
...@@ -42,7 +42,6 @@ namespace LibUtilities ...@@ -42,7 +42,6 @@ namespace LibUtilities
{ {
Comm::Comm(int narg, char *arg[]) Comm::Comm(int narg, char *arg[])
{ {
m_isRecovering = false;
} }
Comm::Comm() Comm::Comm()
......
...@@ -119,7 +119,7 @@ public: ...@@ -119,7 +119,7 @@ public:
template <class T> T Gather(const int rootProc, T &val); template <class T> T Gather(const int rootProc, T &val);
template <class T> T Scatter(const int rootProc, T &pData); template <class T> T Scatter(const int rootProc, T &pData);
LIB_UTILITIES_EXPORT inline CommSharedPtr CommCreateIf(int flag); LIB_UTILITIES_EXPORT inline CommSharedPtr CommCreateIf(int colour);
LIB_UTILITIES_EXPORT inline void SplitComm(int pRows, int pColumns); LIB_UTILITIES_EXPORT inline void SplitComm(int pRows, int pColumns);
LIB_UTILITIES_EXPORT inline CommSharedPtr GetRowComm(); LIB_UTILITIES_EXPORT inline CommSharedPtr GetRowComm();
...@@ -129,39 +129,14 @@ public: ...@@ -129,39 +129,14 @@ public:
LIB_UTILITIES_EXPORT inline bool RemoveExistingFiles(void); LIB_UTILITIES_EXPORT inline bool RemoveExistingFiles(void);
LIB_UTILITIES_EXPORT inline int EnrolSpare(); LIB_UTILITIES_EXPORT inline int EnrolSpare();
LIB_UTILITIES_EXPORT inline bool IsRecovering(); LIB_UTILITIES_EXPORT inline void BeginTransactionLog();
LIB_UTILITIES_EXPORT inline void MarkRecoveryComplete(); LIB_UTILITIES_EXPORT inline void EndTransactionLog();
LIB_UTILITIES_EXPORT inline void BeginTransactionLog()
{
m_isLogging = true;
}
LIB_UTILITIES_EXPORT inline void EndTransactionLog()
{
m_isLogging = false;
if (m_isRecovering)
{
m_isRecovering = false;
for (int i = 0; i < m_derivedComm.size(); ++i)
{
m_derivedComm[i]->m_isRecovering = false;
}
}
v_BackupState();
}
protected: protected:
typedef std::vector<CommSharedPtr> DerivedCommType;
typedef std::vector<int> DerivedCommFlagType;
int m_size; ///< Number of processes int m_size; ///< Number of processes
std::string m_type; ///< Type of communication std::string m_type; ///< Type of communication
CommSharedPtr m_commRow; ///< Row communicator CommSharedPtr m_commRow; ///< Row communicator
CommSharedPtr m_commColumn; ///< Column communicator CommSharedPtr m_commColumn; ///< Column communicator
bool m_isRecovering; ///< True if we are undergoing recovery from failed process
bool m_isLogging; ///< True if logging MPI output
DerivedCommType m_derivedComm;
DerivedCommFlagType m_derivedCommFlag;
int m_derivedRecoverIndex;
Comm(); Comm();
...@@ -200,17 +175,15 @@ protected: ...@@ -200,17 +175,15 @@ protected:
void *recvbuf, int recvcount, CommDataType recvtype, void *recvbuf, int recvcount, CommDataType recvtype,
int root) = 0; int root) = 0;
virtual CommSharedPtr v_CommCreateIf(int flag) = 0; virtual CommSharedPtr v_CommCreateIf(int colour) = 0;
virtual void v_SplitComm(int pRows, int pColumns) = 0; virtual void v_SplitComm(int pRows, int pColumns) = 0;
virtual bool v_TreatAsRankZero(void) = 0; virtual bool v_TreatAsRankZero(void) = 0;
LIB_UTILITIES_EXPORT virtual bool v_RemoveExistingFiles(void); LIB_UTILITIES_EXPORT virtual bool v_RemoveExistingFiles(void);
virtual int v_EnrolSpare() = 0; virtual int v_EnrolSpare() = 0;
virtual bool v_IsRecovering() {return m_isRecovering;} virtual void v_BeginTransactionLog() {}
virtual void v_BackupState() = 0; virtual void v_EndTransactionLog() {}
public:
virtual void v_ReplaceComm(void* commptr) {}
}; };
/** /**
...@@ -423,14 +396,9 @@ template <class T> T Comm::Scatter(const int rootProc, T &pData) ...@@ -423,14 +396,9 @@ template <class T> T Comm::Scatter(const int rootProc, T &pData)
/** /**
* @brief If the flag is non-zero create a new communicator. * @brief If the flag is non-zero create a new communicator.
*/ */
inline CommSharedPtr Comm::CommCreateIf(int flag) inline CommSharedPtr Comm::CommCreateIf(int colour)
{ {
CommSharedPtr c = v_CommCreateIf(flag); return v_CommCreateIf(colour);
if (m_isRecovering)
{
c->m_isRecovering = true;
}
return c;
} }
/** /**
...@@ -489,15 +457,16 @@ inline int Comm::EnrolSpare() ...@@ -489,15 +457,16 @@ inline int Comm::EnrolSpare()
return v_EnrolSpare(); return v_EnrolSpare();
} }
inline bool Comm::IsRecovering() inline void Comm::BeginTransactionLog()
{ {
return m_isRecovering; v_BeginTransactionLog();
} }
inline void Comm::MarkRecoveryComplete() inline void Comm::EndTransactionLog()
{ {
m_isRecovering = false; v_EndTransactionLog();
} }
} }
} }
......
...@@ -40,6 +40,8 @@ ...@@ -40,6 +40,8 @@
#include <string> #include <string>
#include <queue> #include <queue>
#include <vector> #include <vector>
#include <list>
#include <LibUtilities/Communication/Comm.h> #include <LibUtilities/Communication/Comm.h>
#include <LibUtilities/Memory/NekMemoryManager.hpp> #include <LibUtilities/Memory/NekMemoryManager.hpp>
...@@ -135,24 +137,36 @@ protected: ...@@ -135,24 +137,36 @@ protected:
virtual void v_SplitComm(int pRows, int pColumns); virtual void v_SplitComm(int pRows, int pColumns);
virtual CommSharedPtr v_CommCreateIf(int flag); virtual CommSharedPtr v_CommCreateIf(int flag);
virtual int v_EnrolSpare();
virtual void v_BackupState();
private: private:
typedef std::queue<std::vector<char>> StorageType; typedef std::queue<std::vector<char>> StorageType;
typedef std::list<CommMpiSharedPtr> DerivedCommType;
typedef std::queue<int> DerivedCommFlagType;
MPI_Comm m_comm; MPI_Comm m_comm;
MPI_Comm m_agreecomm; MPI_Comm m_agreecomm;
int m_rank; int m_rank;
bool m_isRecovering; ///< True if we are undergoing recovery from failed process
bool m_isLogging; ///< True if logging MPI output
StorageType m_data; StorageType m_data;
StorageType m_dataBackup; StorageType m_dataBackup;
DerivedCommType m_derivedComm; ///< Temporary derived comm list used during restore
DerivedCommFlagType m_derivedCommFlag; ///< Log derived comm flags
DerivedCommFlagType m_derivedCommFlagBackup; ///< Backup of neighbour flags
static void HandleMpiError(MPI_Comm* pcomm, int* perr, ...); static void HandleMpiError(MPI_Comm* pcomm, int* perr, ...);
CommMpi(MPI_Comm pComm); CommMpi(MPI_Comm pComm);
virtual void v_ReplaceComm(void* commptr); virtual int v_EnrolSpare();
virtual void v_BeginTransactionLog();
virtual void v_EndTransactionLog();
void BackupState();
void RestoreState();
void ReplaceComm(MPI_Comm commptr);
}; };
} }
} }
......
...@@ -117,6 +117,7 @@ protected: ...@@ -117,6 +117,7 @@ protected:
LIB_UTILITIES_EXPORT virtual int v_EnrolSpare() {return 0;} LIB_UTILITIES_EXPORT virtual int v_EnrolSpare() {return 0;}
LIB_UTILITIES_EXPORT virtual void v_BackupState() {} LIB_UTILITIES_EXPORT virtual void v_BackupState() {}
LIB_UTILITIES_EXPORT virtual void v_RestoreState() {}
}; };
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment