Skip to content

Commit

Permalink
Add MPI support to ParaGrid (#560)
Browse files Browse the repository at this point in the history
# Add MPI support to ParaGrid
closes #534
closes #454
closes #448 

---
# Task List
- [x] modify `make_init_para24x30.py` to use new DG data layout and
remove CG vars
- [x] add MPI doctest support to `ParaGrid_test.cpp`
- [x] move `extent` and `start` variables into `ModelArray`'s
`DimensionSpec`
- [x] add special MPI test case to `ParaGrid_test.cpp`
- [x] add MPI support to `getModelState`
- [x] add MPI support to `dumpModelState`
- [x] add MPI support to `writeDiagnosticTime`
- [x] reinstate test `ConfigOutput_test.cpp` for MPI builds
- [x] add MPI support to `ERA5`/`TOPAZ` tests

---
# Change Description

After #331 added MPI parallelisation for thermodynamics on the RectGrid,
this PR does the same for the Parametric grid. This should then check
off the second task in #120 (_"MPI parallelization of thermodynamics
where all operations, except for I/O, are local to an MPI rank"_)

---
# Test Description

- `ParaGrid_test.cpp` tests core functionality of ParaGrid (serial and
MPI)
- `./nextsim --config-file config_para24x30.cfg` should provide an
integration test (serial and MPI) (based on #506)

---
# Further work (for a future PR)

- add MPI to dynamics (to close PR #120)
- implement halo comms
- implement boundary conditions (as part of MPI) 
- tidy naming of variables in Domain Decomp tool
  • Loading branch information
TomMelt authored Aug 21, 2024
2 parents 5fa3d6c + 4088afa commit 09dd2b2
Show file tree
Hide file tree
Showing 23 changed files with 873 additions and 476 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/clang-compile-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
for component in core physics dynamics
do
cd $component/test
for file in $(find test* -maxdepth 0 -type f | grep -vP "_MPI\d*$")
for file in $(find test* -maxdepth 0 -type f)
do
echo $file
./$file
Expand Down Expand Up @@ -91,7 +91,7 @@ jobs:
for component in core physics dynamics
do
cd $component/test
for file in $(find test* -maxdepth 0 -type f | grep -P "_MPI\d+$")
for file in $(find test* -maxdepth 0 -type f)
do
echo $file
nprocs=$(echo $file | sed -r "s/.*MPI([0-9]+)/\1/")
Expand Down
16 changes: 11 additions & 5 deletions core/src/ModelArray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ void ModelArray::setDimensions(Type type, const MultiDim& newDims)
{
std::vector<Dimension>& dimSpecs = typeDimensions.at(type);
for (size_t i = 0; i < dimSpecs.size(); ++i) {
definedDimensions.at(dimSpecs[i]).length = newDims[i];
definedDimensions.at(dimSpecs[i]).localLength = newDims[i];
}
validateMaps();
}
Expand All @@ -200,9 +200,15 @@ void ModelArray::setNComponents(std::map<Type, size_t> cMap)
}
}

void ModelArray::setDimension(Dimension dim, size_t length)
#ifdef USE_MPI
void ModelArray::setDimension(Dimension dim, size_t globalLength, size_t localLength, size_t start)
{
definedDimensions.at(dim).length = length;
definedDimensions.at(dim).setLengths(globalLength, localLength, start);
#else
void ModelArray::setDimension(Dimension dim, size_t globalLength)
{
definedDimensions.at(dim).setLengths(globalLength);
#endif
validateMaps();
}

Expand Down Expand Up @@ -271,7 +277,7 @@ void ModelArray::DimensionMap::validate()
std::vector<Dimension>& typeDims = entry.second;
dims.resize(typeDims.size());
for (size_t i = 0; i < typeDims.size(); ++i) {
dims[i] = definedDimensions.at(typeDims[i]).length;
dims[i] = definedDimensions.at(typeDims[i]).localLength;
}
}
}
Expand All @@ -282,7 +288,7 @@ void ModelArray::SizeMap::validate()
size_t size = 1;
std::vector<Dimension>& typeDims = entry.second;
for (size_t i = 0; i < typeDims.size(); ++i) {
size *= definedDimensions.at(typeDims[i]).length;
size *= definedDimensions.at(typeDims[i]).localLength;
}
m_sizes.at(entry.first) = size;
}
Expand Down
186 changes: 139 additions & 47 deletions core/src/ParaGridIO.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@ const std::map<ModelArray::Dimension, bool> ParaGridIO::isDG = {
};

std::map<ModelArray::Dimension, ModelArray::Type> ParaGridIO::dimCompMap;
#ifdef USE_MPI
std::map<std::string, netCDF::NcFilePar> ParaGridIO::openFiles;
#else
std::map<std::string, netCDF::NcFile> ParaGridIO::openFiles;
#endif
std::map<std::string, size_t> ParaGridIO::timeIndexByFile;

void ParaGridIO::makeDimCompMap()
Expand All @@ -77,19 +81,28 @@ void ParaGridIO::makeDimCompMap()

ParaGridIO::~ParaGridIO() = default;

#ifdef USE_MPI
ModelState ParaGridIO::getModelState(const std::string& filePath, ModelMetadata& metadata)
#else
ModelState ParaGridIO::getModelState(const std::string& filePath)
#endif
{
ModelState state;

try {
#ifdef USE_MPI
netCDF::NcFilePar ncFile(filePath, netCDF::NcFile::read, metadata.mpiComm);
#else
netCDF::NcFile ncFile(filePath, netCDF::NcFile::read);
#endif
netCDF::NcGroup metaGroup(ncFile.getGroup(IStructure::metadataNodeName()));
netCDF::NcGroup dataGroup(ncFile.getGroup(IStructure::dataNodeName()));

// Dimensions and DG components
std::multimap<std::string, netCDF::NcDim> dimMap = dataGroup.getDims();
for (auto entry : ModelArray::definedDimensions) {
if (dimCompMap.count(entry.first) > 0)
auto dimType = entry.first;
if (dimCompMap.count(dimType) > 0)
// TODO Assertions that DG in the file equals the compile time DG in the model. See
// #205
continue;
Expand All @@ -107,12 +120,39 @@ ModelState ParaGridIO::getModelState(const std::string& filePath)
std::string("No netCDF dimension found corresponding to the dimension named ")
+ dimensionSpec.name + std::string(" or ") + dimensionSpec.altName);
}
if (entry.first == ModelArray::Dimension::Z) {
if (dimType == ModelArray::Dimension::Z) {
// A special case, as the number of levels in the file might not be
// the number that the selected ice thermodynamics requires.
ModelArray::setDimension(entry.first, NZLevels::get());
#ifdef USE_MPI
ModelArray::setDimension(dimType, NZLevels::get(), NZLevels::get(), 0);
#else
ModelArray::setDimension(dimType, NZLevels::get());
#endif
} else {
ModelArray::setDimension(entry.first, dim.getSize());
#ifdef USE_MPI
auto dimName = dim.getName();
size_t localLength = 0;
size_t start = 0;
if (dimType == ModelArray::Dimension::X) {
localLength = metadata.localExtentX;
start = metadata.localCornerX;
} else if (dimType == ModelArray::Dimension::Y) {
localLength = metadata.localExtentY;
start = metadata.localCornerY;
} else if (dimType == ModelArray::Dimension::XVERTEX) {
localLength = metadata.localExtentX + 1;
start = metadata.localCornerX;
} else if (dimType == ModelArray::Dimension::YVERTEX) {
localLength = metadata.localExtentY + 1;
start = metadata.localCornerY;
} else {
localLength = dim.getSize();
start = 0;
}
ModelArray::setDimension(dimType, dim.getSize(), localLength, start);
#else
ModelArray::setDimension(dimType, dim.getSize());
#endif
}
}

Expand All @@ -132,21 +172,29 @@ ModelState ParaGridIO::getModelState(const std::string& filePath)
std::string("No ModelArray::Type corresponds to the dimensional key ")
+ dimKey);
}
ModelArray::Type newType = dimensionKeys.at(dimKey);
state.data[varName] = ModelArray(newType);
ModelArray::Type type = dimensionKeys.at(dimKey);
state.data[varName] = ModelArray(type);
ModelArray& data = state.data.at(varName);
data.resize();

if (newType == ModelArray::Type::Z) {
std::vector<size_t> startVector(ModelArray::nDimensions(newType), 0);
std::vector<size_t> extentVector = ModelArray::dimensions(newType);
// Reverse the extent vector to go from logical (x, y, z) ordering
// of indexes to netCDF storage ordering.
std::reverse(extentVector.begin(), extentVector.end());
var.getVar(startVector, extentVector, &data[0]);
} else {
var.getVar(&data[0]);
std::vector<size_t> start;
std::vector<size_t> count;
if (ModelArray::hasDoF(type)) {
auto ncomps = data.nComponents();
start.push_back(0);
count.push_back(ncomps);
}
for (ModelArray::Dimension dt : ModelArray::typeDimensions.at(type)) {
auto dim = ModelArray::definedDimensions.at(dt);
start.push_back(dim.start);
count.push_back(dim.localLength);
}
// dims are looped in [dg], x, y, [z] order so start and count
// order must be reveresed to match order netcdf expects
std::reverse(start.begin(), start.end());
std::reverse(count.begin(), count.end());

var.getVar(start, count, &data[0]);
}
ncFile.close();
} catch (const netCDF::exceptions::NcException& nce) {
Expand Down Expand Up @@ -194,7 +242,7 @@ ModelState ParaGridIO::readForcingTimeStatic(
= ModelArray::typeDimensions.at(ModelArray::Type::H);
for (auto riter = dimensions.rbegin(); riter != dimensions.rend(); ++riter) {
indexArray.push_back(0);
extentArray.push_back(ModelArray::definedDimensions.at(*riter).length);
extentArray.push_back(ModelArray::definedDimensions.at(*riter).localLength);
}

for (const std::string& varName : forcings) {
Expand All @@ -217,7 +265,12 @@ ModelState ParaGridIO::readForcingTimeStatic(
void ParaGridIO::dumpModelState(
const ModelState& state, const ModelMetadata& metadata, const std::string& filePath)
{

#ifdef USE_MPI
netCDF::NcFilePar ncFile(filePath, netCDF::NcFile::replace, metadata.mpiComm);
#else
netCDF::NcFile ncFile(filePath, netCDF::NcFile::replace);
#endif

CommonRestartMetadata::writeStructureType(ncFile, metadata);
netCDF::NcGroup metaGroup = ncFile.addGroup(IStructure::metadataNodeName());
Expand All @@ -229,7 +282,7 @@ void ParaGridIO::dumpModelState(
for (auto entry : ModelArray::definedDimensions) {
ModelArray::Dimension dim = entry.first;
size_t dimSz = (dimCompMap.count(dim)) ? ModelArray::nComponents(dimCompMap.at(dim))
: dimSz = entry.second.length;
: dimSz = entry.second.globalLength;
ncFromMAMap[dim] = dataGroup.addDim(entry.second.name, dimSz);
// TODO Do I need to add data, even if it is just integers 0...n-1?
}
Expand Down Expand Up @@ -260,10 +313,27 @@ void ParaGridIO::dumpModelState(
if (restartFields.count(entry.first)) {
// Get the type, then relevant vector of NetCDF dimensions
ModelArray::Type type = entry.second.getType();
std::vector<size_t> start;
std::vector<size_t> count;
if (ModelArray::hasDoF(type)) {
auto ncomps = entry.second.nComponents();
start.push_back(0);
count.push_back(ncomps);
}
for (ModelArray::Dimension dt : entry.second.typeDimensions.at(type)) {
auto dim = entry.second.definedDimensions.at(dt);
start.push_back(dim.start);
count.push_back(dim.localLength);
}
// dims are looped in [dg], x, y, [z] order so start and count
// order must be reveresed to match order netcdf expects
std::reverse(start.begin(), start.end());
std::reverse(count.begin(), count.end());

std::vector<netCDF::NcDim>& ncDims = dimMap.at(type);
netCDF::NcVar var(dataGroup.addVar(entry.first, netCDF::ncDouble, ncDims));
var.putAtt(mdiName, netCDF::ncDouble, MissingData::value);
var.putVar(entry.second.getData());
var.putVar(start, count, entry.second.getData());
}
}

Expand All @@ -277,12 +347,20 @@ void ParaGridIO::writeDiagnosticTime(
size_t nt = (isNew) ? 0 : ++timeIndexByFile.at(filePath);
if (isNew) {
// Open a new file and emplace it in the map of open files.
#ifdef USE_MPI
openFiles.try_emplace(filePath, filePath, netCDF::NcFile::replace, meta.mpiComm);
#else
openFiles.try_emplace(filePath, filePath, netCDF::NcFile::replace);
#endif
// Set the initial time to be zero (assigned above)
timeIndexByFile[filePath] = nt;
}
// Get the file handle
#ifdef USE_MPI
netCDF::NcFilePar& ncFile = openFiles.at(filePath);
#else
netCDF::NcFile& ncFile = openFiles.at(filePath);
#endif

// Get the netCDF groups, creating them if necessary
netCDF::NcGroup metaGroup = (isNew) ? ncFile.addGroup(IStructure::metadataNodeName())
Expand All @@ -303,51 +381,57 @@ void ParaGridIO::writeDiagnosticTime(
for (auto entry : ModelArray::definedDimensions) {
ModelArray::Dimension dim = entry.first;
size_t dimSz = (dimCompMap.count(dim)) ? ModelArray::nComponents(dimCompMap.at(dim))
: dimSz = entry.second.length;
: dimSz = entry.second.globalLength;
ncFromMAMap[dim] = (isNew) ? dataGroup.addDim(entry.second.name, dimSz)
: dataGroup.getDim(entry.second.name);
}

// Also create the sets of dimensions to be connected to the data fields
std::map<ModelArray::Type, std::vector<netCDF::NcDim>> dimMap;
// Create the index and size arrays
// The index arrays always start from zero, except in the first/time axis
std::map<ModelArray::Type, std::vector<size_t>> indexArrays;
std::map<ModelArray::Type, std::vector<size_t>> extentArrays;
std::map<ModelArray::Type, std::vector<size_t>> startMap;
std::map<ModelArray::Type, std::vector<size_t>> countMap;
for (auto entry : ModelArray::typeDimensions) {
ModelArray::Type type = entry.first;
std::vector<netCDF::NcDim> ncDims;
std::vector<size_t> indexArray;
std::vector<size_t> extentArray;
std::vector<size_t> start;
std::vector<size_t> count;

// Everything that has components needs that dimension, too
if (ModelArray::hasDoF(type)) {
if (type == ModelArray::Type::VERTEX && !isNew)
continue;
auto ncomps = ModelArray::nComponents(type);
auto dim = ModelArray::componentMap.at(type);
ncDims.push_back(ncFromMAMap.at(dim));
start.push_back(0);
count.push_back(ncomps);
}
for (auto dt : entry.second) {
auto dim = ModelArray::definedDimensions.at(dt);
ncDims.push_back(ncFromMAMap.at(dt));
start.push_back(dim.start);
count.push_back(dim.localLength);
}

// Deal with VERTEX in each case
// Add the time dimension for all types that are not VERTEX
if (type != ModelArray::Type::VERTEX) {
ncDims.push_back(timeDim);
indexArray.push_back(nt);
extentArray.push_back(1UL);
start.push_back(nt);
count.push_back(1UL);
} else if (!isNew) {
// For VERTEX in an existing file, there is nothing more to be done
continue;
}
for (auto iter = entry.second.rbegin(); iter != entry.second.rend(); ++iter) {
ModelArray::Dimension& maDim = *iter;
ncDims.push_back(ncFromMAMap.at(maDim));
indexArray.push_back(0);
extentArray.push_back(ModelArray::definedDimensions.at(maDim).length);
}

std::reverse(ncDims.begin(), ncDims.end());
std::reverse(start.begin(), start.end());
std::reverse(count.begin(), count.end());

dimMap[type] = ncDims;
indexArrays[type] = indexArray;
extentArrays[type] = extentArray;
}
// Everything that has components needs that dimension, too
for (auto entry : dimCompMap) {
// Skip VERTEX fields on existing files
if (entry.second == ModelArray::Type::VERTEX && !isNew)
continue;
dimMap.at(entry.second).push_back(ncFromMAMap.at(entry.first));
indexArrays.at(entry.second).push_back(0);
extentArrays.at(entry.second).push_back(ModelArray::nComponents(entry.second));
startMap[type] = start;
countMap[type] = count;
}

// Create a special timeless set of dimensions for the landmask
Expand All @@ -361,16 +445,19 @@ void ParaGridIO::writeDiagnosticTime(
maskIndexes = { 0, 0 };
maskExtents = { ModelArray::definedDimensions
.at(ModelArray::typeDimensions.at(ModelArray::Type::H)[0])
.length,
.localLength,
ModelArray::definedDimensions.at(ModelArray::typeDimensions.at(ModelArray::Type::H)[1])
.length };
.localLength };
}

// Put the time axis variable
std::vector<netCDF::NcDim> timeDimVec = { timeDim };
netCDF::NcVar timeVar((isNew) ? dataGroup.addVar(timeName, netCDF::ncDouble, timeDimVec)
: dataGroup.getVar(timeName));
double secondsSinceEpoch = (meta.time() - TimePoint()).seconds();
#ifdef USE_MPI
netCDF::setVariableCollective(timeVar, dataGroup);
#endif
timeVar.putVar({ nt }, { 1 }, &secondsSinceEpoch);

// Write the data
Expand All @@ -383,6 +470,9 @@ void ParaGridIO::writeDiagnosticTime(
// Land mask in a new file (since it was skipped above in existing files)
netCDF::NcVar var(dataGroup.addVar(maskName, netCDF::ncDouble, maskDims));
// No missing data
#ifdef USE_MPI
netCDF::setVariableCollective(var, dataGroup);
#endif
var.putVar(maskIndexes, maskExtents, entry.second.getData());

} else {
Expand All @@ -392,8 +482,10 @@ void ParaGridIO::writeDiagnosticTime(
: dataGroup.getVar(entry.first));
if (isNew)
var.putAtt(mdiName, netCDF::ncDouble, MissingData::value);

var.putVar(indexArrays.at(type), extentArrays.at(type), entry.second.getData());
#ifdef USE_MPI
netCDF::setVariableCollective(var, dataGroup);
#endif
var.putVar(startMap.at(type), countMap.at(type), entry.second.getData());
}
}
}
Expand Down
Loading

0 comments on commit 09dd2b2

Please # to comment.