Commit e7987788 authored by Chris Cantwell

Added initial non-working implementation for sub-communicator recovery.

parent 0a821d56
[submodule "docs/tutorial"]
branch = master
path = docs/tutorial
url = git@gitlab.nektar.info:nektar/tutorial
# The scp-style "host:path" separator is only valid for SSH remotes; in an
# http:// URL the text after ':' is parsed as a port, so the path must be
# separated from the host with '/'.
url = http://gitlab.nektar.info/nektar/tutorial
ignore = all
......@@ -10,7 +10,7 @@
MESSAGE(STATUS "Searching for Boost:")
SET(MIN_VER "1.56.0")
SET(NEEDED_BOOST_LIBS thread iostreams date_time filesystem system
program_options regex timer chrono)
program_options regex timer chrono mpi serialization)
SET(Boost_DEBUG 0)
SET(Boost_NO_BOOST_CMAKE ON)
IF( BOOST_ROOT )
......
......@@ -194,6 +194,8 @@ namespace Nektar
// Create communicator
CreateComm(argc, argv);
m_comm->BeginTransactionLog();
TestSharedFilesystem();
// If running in parallel change the default global sys solution
......
......@@ -414,6 +414,8 @@ TARGET_LINK_LIBRARIES(LibUtilities LINK_PUBLIC
${Boost_SYSTEM_LIBRARY}
${Boost_TIMER_LIBRARY}
${Boost_CHRONO_LIBRARY}
${Boost_MPI_LIBRARY}
${Boost_SERIALIZATION_LIBRARY}
debug ${ZLIB_LIBRARY_DEBUG} optimized ${ZLIB_LIBRARY_RELEASE}
)
......
......@@ -131,13 +131,37 @@ public:
LIB_UTILITIES_EXPORT inline int EnrolSpare();
LIB_UTILITIES_EXPORT inline bool IsRecovering();
LIB_UTILITIES_EXPORT inline void MarkRecoveryComplete();
/// Switch on the logging flag (m_isLogging) for this communicator.
LIB_UTILITIES_EXPORT inline void BeginTransactionLog()
{
    this->m_isLogging = true;
}
/**
 * Switch off the logging flag, clear the recovery flag on this communicator
 * and on every communicator held in m_derivedComm, then call
 * v_BackupState().
 */
LIB_UTILITIES_EXPORT inline void EndTransactionLog()
{
    m_isLogging = false;

    if (m_isRecovering)
    {
        m_isRecovering = false;

        // Index with the container's own size type to avoid a
        // signed/unsigned comparison against size().
        for (DerivedCommType::size_type i = 0; i < m_derivedComm.size(); ++i)
        {
            m_derivedComm[i]->m_isRecovering = false;
        }
    }

    // Runs regardless of whether a recovery was in progress.
    v_BackupState();
}
protected:
typedef std::vector<CommSharedPtr> DerivedCommType;
typedef std::vector<int> DerivedCommFlagType;
int m_size; ///< Number of processes
std::string m_type; ///< Type of communication
CommSharedPtr m_commRow; ///< Row communicator
CommSharedPtr m_commColumn; ///< Column communicator
bool m_isRecovering; ///< True if we are undergoing recovery from failed process
bool m_isLogging; ///< True if logging MPI output
DerivedCommType m_derivedComm;
DerivedCommFlagType m_derivedCommFlag;
int m_derivedRecoverIndex;
Comm();
......@@ -182,6 +206,11 @@ protected:
LIB_UTILITIES_EXPORT virtual bool v_RemoveExistingFiles(void);
virtual int v_EnrolSpare() = 0;
virtual bool v_IsRecovering() {return m_isRecovering;}
virtual void v_BackupState() = 0;
public:
virtual void v_ReplaceComm(void* commptr) {}
};
/**
......@@ -396,7 +425,12 @@ template <class T> T Comm::Scatter(const int rootProc, T &pData)
*/
inline CommSharedPtr Comm::CommCreateIf(int flag)
{
    // Create the communicator first so the recovery flag of this
    // communicator can be propagated to it before it is handed back.
    CommSharedPtr c = v_CommCreateIf(flag);

    // NOTE(review): the implementation of v_CommCreateIf is not visible
    // here; guard against it handing back an empty pointer (e.g. for
    // processes excluded from the new communicator) -- confirm.
    if (m_isRecovering && c)
    {
        c->m_isRecovering = true;
    }
    return c;
}
/**
......
......@@ -38,6 +38,8 @@
#include <mpi.h>
#include <mpi-ext.h>
#include <string>
#include <queue>
#include <vector>
#include <LibUtilities/Communication/Comm.h>
#include <LibUtilities/Memory/NekMemoryManager.hpp>
......@@ -134,15 +136,23 @@ protected:
virtual CommSharedPtr v_CommCreateIf(int flag);
virtual int v_EnrolSpare();
virtual void v_BackupState();
private:
typedef std::queue<std::vector<char>> StorageType;
MPI_Comm m_comm;
MPI_Comm m_agreecomm;
int m_rank;
StorageType m_data;
StorageType m_dataBackup;
static void HandleMpiError(MPI_Comm* pcomm, int* perr, ...);
CommMpi(MPI_Comm pComm);
virtual void v_ReplaceComm(void* commptr);
};
}
}
......
......@@ -116,6 +116,7 @@ protected:
LIB_UTILITIES_EXPORT virtual CommSharedPtr v_CommCreateIf(int flag);
LIB_UTILITIES_EXPORT virtual int v_EnrolSpare() {return 0;}
LIB_UTILITIES_EXPORT virtual void v_BackupState() {}
};
}
}
......
......@@ -89,6 +89,7 @@ int main(int argc, char *argv[])
// Zero field coefficients for initial guess for linear solver.
Vmath::Zero(field->GetNcoeffs(), field->UpdateCoeffs(), 1);
session->GetComm()->EndTransactionLog();
//BackupStaticState();
// Time integrate using backward Euler
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment