Commit d1bca648 authored by Chris Cantwell

Initial implementation of resilience in ADRSolver.

parent c1f07405
......@@ -32,7 +32,7 @@
// Description: Generic timestepping for Unsteady solvers
//
///////////////////////////////////////////////////////////////////////////////
#include <signal.h>
#include <iostream>
#include <iomanip>
......@@ -257,9 +257,21 @@ namespace Nektar
NekDouble cpuTime = 0.0;
NekDouble elapsed = 0.0;
int failureOccured = 0;
int failureMax = m_session->GetCmdLineArgument<int>("spares");
while (step < m_steps ||
m_time < m_fintime - NekConstants::kNekZeroTol)
{
try {
double r = (double)rand()/RAND_MAX;
if (r < 0.001 && failureOccured < failureMax) {
cout << "COMMITTING SUICIDE!" << endl;
int blah;
cin >> blah;
raise(SIGKILL);
}
if (m_cflSafetyFactor)
{
m_timestep = GetTimeStep(fields);
......@@ -398,6 +410,70 @@ namespace Nektar
// Step advance
++step;
if (m_session->GetComm()->IsRecovering())
{
cout << "Restore field data" << endl;
for (unsigned int i = 0; i < m_fields.num_elements(); ++i)
{
m_session->GetComm()->StateGet(
"field" + boost::lexical_cast<string>(i),
&m_fields[i]->UpdatePhys()[0],
m_fields[i]->GetTotPoints());
}
m_session->GetComm()->StateGet("step", step);
m_session->GetComm()->StateGet("time", m_time);
m_session->GetComm()->StateGet("failures", failureOccured);
failureOccured++;
}
else
{
cout << "Store field data" << endl;
for (unsigned int i = 0; i < m_fields.num_elements(); ++i)
{
m_session->GetComm()->StateAdd(
"field" + boost::lexical_cast<string>(i),
&m_fields[i]->GetPhys()[0],
m_fields[i]->GetTotPoints());
}
m_session->GetComm()->StateAdd("step", step);
m_session->GetComm()->StateAdd("time", m_time);
m_session->GetComm()->StateAdd("failures", failureOccured);
m_session->GetComm()->StateCommit();
}
if (step == 1 || m_session->GetComm()->IsRecovering())
{
if (m_session->GetComm()->IsRecovering())
{
failureOccured++;
}
m_session->GetComm()->EndTransactionLog();
}
}
catch (...) {
try
{
cout << "Caught error - trying to invoke a spare." << endl;
int x = m_session->GetComm()->EnrolSpare();
cout << "Result: " << x << endl;
for (unsigned int i = 0; i < m_fields.num_elements(); ++i)
{
m_session->GetComm()->StateGet(
"field" + boost::lexical_cast<string>(i),
&m_fields[i]->UpdatePhys()[0],
m_fields[i]->GetTotPoints());
}
m_session->GetComm()->StateGet("step", step);
m_session->GetComm()->StateGet("time", m_time);
failureOccured++;
}
catch (...)
{
cout << "ERROR WHEN PERFORMING ENROL SPARE!!!" << endl;
exit(-1);
}
}
cout << "Completed iteration" << endl;
}
// Print out summary statistics
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment