|
4 | 4 | #include "openPMD/auxiliary/Environment.hpp" |
5 | 5 | #include "openPMD/auxiliary/Filesystem.hpp" |
6 | 6 | #include "openPMD/openPMD.hpp" |
// TODO: revisit includes -- OneDimensionalBlockSlicer is currently pulled in
// from the MPI benchmark internals rather than a dedicated public header.
| 8 | +#include "openPMD/benchmark/mpi/OneDimensionalBlockSlicer.hpp" |
7 | 9 | #include <catch2/catch.hpp> |
8 | 10 |
|
9 | 11 | #if openPMD_HAVE_MPI |
@@ -1177,4 +1179,219 @@ TEST_CASE( "adios2_ssc", "[parallel][adios2]" ) |
1177 | 1179 | { |
1178 | 1180 | adios2_ssc(); |
1179 | 1181 | } |
1180 | | -#endif |
| 1182 | + |
| 1183 | +void adios2_chunk_distribution() |
| 1184 | +{ |
| 1185 | + /* |
| 1186 | + * This test simulates a multi-node streaming setup in order to test some |
| 1187 | + * of our chunk distribution strategies. |
| 1188 | + * We don't actually stream (but write a .bp file instead) and also we don't |
| 1189 | + * actually run anything on multiple nodes, but we can use this for testing |
| 1190 | + * the distribution strategies anyway. |
| 1191 | + */ |
| 1192 | + int mpi_size{ -1 }; |
| 1193 | + int mpi_rank{ -1 }; |
| 1194 | + MPI_Comm_size( MPI_COMM_WORLD, &mpi_size ); |
| 1195 | + MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); |
| 1196 | + |
| 1197 | + /* |
| 1198 | + * Mappings: MPI rank -> hostname where the rank is executed. |
| 1199 | + * For the writing application as well as for the reading one. |
| 1200 | + */ |
| 1201 | + chunk_assignment::RankMeta writingRanksHostnames, readingRanksHostnames; |
| 1202 | + for( int i = 0; i < mpi_size; ++i ) |
| 1203 | + { |
| 1204 | + /* |
| 1205 | + * The mapping is intentionally weird. Nodes "node1", "node3", ... |
| 1206 | + * do not have instances of the reading application running on them. |
| 1207 | + * Our distribution strategies will need to deal with that situation. |
| 1208 | + */ |
| 1209 | + // 0, 0, 1, 1, 2, 2, 3, 3 ... |
| 1210 | + writingRanksHostnames[ i ] = "node" + std::to_string( i / 2 ); |
| 1211 | + // 0, 0, 0, 0, 2, 2, 2, 2 ... |
| 1212 | + readingRanksHostnames[ i ] = "node" + std::to_string( i / 4 * 2 ); |
| 1213 | + } |
| 1214 | + |
| 1215 | + std::string filename = "../samples/adios2_chunk_distribution.bp"; |
| 1216 | + // Simulate a stream: BP4 assigns chunk IDs by subfile (i.e. aggregator). |
| 1217 | + std::stringstream parameters; |
| 1218 | + parameters << R"END( |
| 1219 | +{ |
| 1220 | + "adios2": |
| 1221 | + { |
| 1222 | + "engine": |
| 1223 | + { |
| 1224 | + "type": "bp4", |
| 1225 | + "parameters": |
| 1226 | + { |
| 1227 | + "NumAggregators":)END" |
| 1228 | + << "\"" << std::to_string( mpi_size ) << "\"" |
| 1229 | + << R"END( |
| 1230 | + } |
| 1231 | + } |
| 1232 | + } |
| 1233 | +} |
| 1234 | +)END"; |
| 1235 | + |
| 1236 | + auto printAssignment = [ mpi_rank ]( |
| 1237 | + std::string const & strategyName, |
| 1238 | + ChunkTable const & table, |
| 1239 | + chunk_assignment::RankMeta const & meta ) |
| 1240 | + { |
| 1241 | + if( mpi_rank != 0 ) |
| 1242 | + { |
| 1243 | + return; |
| 1244 | + } |
| 1245 | + std::cout << "WITH STRATEGY '" << strategyName << "':\n"; |
| 1246 | + for( auto const & chunk : table ) |
| 1247 | + { |
| 1248 | + std::cout << "[HOST: " << meta.at( chunk.sourceID ) |
| 1249 | + << ",\tRank: " << chunk.sourceID << ",\tOffset: "; |
| 1250 | + for( auto offset : chunk.offset ) |
| 1251 | + { |
| 1252 | + std::cout << offset << ", "; |
| 1253 | + } |
| 1254 | + std::cout << "\tExtent: "; |
| 1255 | + for( auto extent : chunk.extent ) |
| 1256 | + { |
| 1257 | + std::cout << extent << ", "; |
| 1258 | + } |
| 1259 | + std::cout << "]" << std::endl; |
| 1260 | + } |
| 1261 | + }; |
| 1262 | + |
| 1263 | + // Create a dataset. |
| 1264 | + { |
| 1265 | + Series series( |
| 1266 | + filename, |
| 1267 | + openPMD::Access::CREATE, |
| 1268 | + MPI_COMM_WORLD, |
| 1269 | + parameters.str() ); |
| 1270 | + /* |
| 1271 | + * The writing application sets an attribute that tells the reading |
| 1272 | + * application about the "MPI rank -> hostname" mapping. |
| 1273 | + * Each rank only needs to set its own value. |
| 1274 | + * (Some other options like setting all at once or reading from a file |
| 1275 | + * exist as well.) |
| 1276 | + */ |
| 1277 | + series.setMpiRanksMetaInfo( writingRanksHostnames.at( mpi_rank ) ); |
| 1278 | + |
| 1279 | + auto E_x = series.iterations[ 0 ].meshes[ "E" ][ "x" ]; |
| 1280 | + openPMD::Dataset ds( |
| 1281 | + openPMD::Datatype::INT, { unsigned( mpi_size ), 10 } ); |
| 1282 | + E_x.resetDataset( ds ); |
| 1283 | + std::vector< int > data( 10, 0 ); |
| 1284 | + std::iota( data.begin(), data.end(), 0 ); |
| 1285 | + E_x.storeChunk( data, { unsigned( mpi_rank ), 0 }, { 1, 10 } ); |
| 1286 | + series.flush(); |
| 1287 | + } |
| 1288 | + |
| 1289 | + { |
| 1290 | + Series series( filename, openPMD::Access::READ_ONLY, MPI_COMM_WORLD ); |
| 1291 | + /* |
| 1292 | + * Inquire the writing application's "MPI rank -> hostname" mapping. |
| 1293 | + * The reading application needs to know about its own mapping. |
| 1294 | + * Having both of these mappings is the basis for an efficient chunk |
| 1295 | + * distribution since we can use it to figure out which instances |
| 1296 | + * are running on the same nodes. |
| 1297 | + */ |
| 1298 | + auto rankMetaIn = series.mpiRanksMetaInfo(); |
| 1299 | + REQUIRE( rankMetaIn == writingRanksHostnames ); |
| 1300 | + |
| 1301 | + auto E_x = series.iterations[ 0 ].meshes[ "E" ][ "x" ]; |
| 1302 | + /* |
| 1303 | + * Ask the backend which chunks are available. |
| 1304 | + */ |
| 1305 | + auto const chunkTable = E_x.availableChunks(); |
| 1306 | + |
| 1307 | + printAssignment( "INPUT", chunkTable, rankMetaIn ); |
| 1308 | + |
| 1309 | + using namespace chunk_assignment; |
| 1310 | + |
| 1311 | + /* |
| 1312 | + * Assign the chunks by distributing them one after the other to reading |
| 1313 | + * ranks. Easy, but not particularly efficient. |
| 1314 | + */ |
| 1315 | + RoundRobin roundRobinStrategy; |
| 1316 | + auto roundRobinAssignment = assignChunks( |
| 1317 | + chunkTable, rankMetaIn, readingRanksHostnames, roundRobinStrategy ); |
| 1318 | + printAssignment( |
| 1319 | + "ROUND ROBIN", roundRobinAssignment, readingRanksHostnames ); |
| 1320 | + |
| 1321 | + /* |
| 1322 | + * Assign chunks by hostname. |
| 1323 | + * Two difficulties: |
| 1324 | + * * A distribution strategy within one node needs to be picked. |
| 1325 | + * We pick the BinPacking strategy that tries to assign chunks in a |
| 1326 | + * balanced manner. Since our chunks have a small extent along |
| 1327 | + * dimension 0, use dimension 1 for slicing. |
| 1328 | + * * The assignment is partial since some nodes only have instances of |
| 1329 | + * the writing application. Those chunks remain unassigned. |
| 1330 | + */ |
| 1331 | + ByHostname byHostname( |
| 1332 | + std::make_unique< BinPacking >( /* splitAlongDimension = */ 1 ) ); |
| 1333 | + auto byHostnamePartialAssignment = assignChunks( |
| 1334 | + chunkTable, rankMetaIn, readingRanksHostnames, byHostname ); |
| 1335 | + printAssignment( |
| 1336 | + "HOSTNAME, ASSIGNED", |
| 1337 | + byHostnamePartialAssignment.assigned, |
| 1338 | + readingRanksHostnames ); |
| 1339 | + printAssignment( |
| 1340 | + "HOSTNAME, LEFTOVER", |
| 1341 | + byHostnamePartialAssignment.notAssigned, |
| 1342 | + rankMetaIn ); |
| 1343 | + |
| 1344 | + /* |
| 1345 | + * Assign chunks by hostnames, once more. |
| 1346 | + * This time, apply a secondary distribution strategy to assign |
| 1347 | + * leftovers. We pick BinPacking, once more. |
| 1348 | + * Notice that the BinPacking strategy does not (yet) take into account |
| 1349 | + * chunks that have been assigned by the first round. |
| 1350 | + * Balancing is calculated solely based on the leftover chunks from the |
| 1351 | + * first round. |
| 1352 | + */ |
| 1353 | + FromPartialStrategy fromPartialStrategy( |
| 1354 | + std::make_unique< ByHostname >( std::move( byHostname ) ), |
| 1355 | + std::make_unique< BinPacking >( /* splitAlongDimension = */ 1 ) ); |
| 1356 | + auto fromPartialAssignment = assignChunks( |
| 1357 | + chunkTable, |
| 1358 | + rankMetaIn, |
| 1359 | + readingRanksHostnames, |
| 1360 | + fromPartialStrategy ); |
| 1361 | + printAssignment( |
| 1362 | + "HOSTNAME WITH SECOND PASS", |
| 1363 | + fromPartialAssignment, |
| 1364 | + readingRanksHostnames ); |
| 1365 | + |
| 1366 | + /* |
| 1367 | + * Assign chunks by slicing the n-dimensional physical domain and |
| 1368 | + * intersecting those slices with the available chunks from the backend. |
| 1369 | + * Notice that this strategy only returns the chunks that the currently |
| 1370 | + * running rank is supposed to load, whereas the other strategies return |
| 1371 | + * a chunk table containing all chunks that all ranks will load. |
| 1372 | + * In principle, a chunk_assignment::Strategy only needs to return the |
| 1373 | + * chunks that the current rank should load, but is free to emplace the |
| 1374 | + * other chunks for other reading ranks as well. |
| 1375 | + * (Reasoning: In some strategies, calculating everything is necessary, |
| 1376 | + * in others such as this one, it's an unneeded overhead.) |
| 1377 | + */ |
| 1378 | + ByCuboidSlice cuboidSliceStrategy( |
| 1379 | + std::make_unique< OneDimensionalBlockSlicer >( 1 ), |
| 1380 | + E_x.getExtent(), |
| 1381 | + mpi_rank, |
| 1382 | + mpi_size ); |
| 1383 | + auto cuboidSliceAssignment = assignChunks( |
| 1384 | + chunkTable, |
| 1385 | + rankMetaIn, |
| 1386 | + readingRanksHostnames, |
| 1387 | + cuboidSliceStrategy ); |
| 1388 | + printAssignment( |
| 1389 | + "CUBOID SLICE", cuboidSliceAssignment, readingRanksHostnames ); |
| 1390 | + } |
| 1391 | +} |
| 1392 | + |
| 1393 | +TEST_CASE( "adios2_chunk_distribution", "[parallel][adios2]" ) |
| 1394 | +{ |
| 1395 | + adios2_chunk_distribution(); |
| 1396 | +} |
| 1397 | +#endif // openPMD_HAVE_ADIOS2 && openPMD_HAVE_MPI |
0 commit comments