|
4 | 4 | #include "openPMD/auxiliary/Environment.hpp" |
5 | 5 | #include "openPMD/auxiliary/Filesystem.hpp" |
6 | 6 | #include "openPMD/openPMD.hpp" |
| 7 | +// @todo change includes |
| 8 | +#include "openPMD/benchmark/mpi/OneDimensionalBlockSlicer.hpp" |
7 | 9 | #include <catch2/catch.hpp> |
8 | 10 |
|
9 | 11 | #if openPMD_HAVE_MPI |
@@ -1108,4 +1110,219 @@ TEST_CASE( "adios2_ssc", "[parallel][adios2]" ) |
1108 | 1110 | { |
1109 | 1111 | adios2_ssc(); |
1110 | 1112 | } |
1111 | | -#endif |
| 1113 | + |
/**
 * Test openPMD's chunk distribution strategies on a simulated
 * multi-node streaming setup.
 *
 * We don't actually stream (a .bp file is written instead) and nothing
 * really runs on multiple nodes, but fabricated "rank -> hostname"
 * mappings let us exercise the distribution strategies anyway.
 *
 * Collective: must be called on all ranks of MPI_COMM_WORLD.
 * Output (the computed assignments) is printed by rank 0 only.
 */
void adios2_chunk_distribution()
{
    int mpi_size{ -1 };
    int mpi_rank{ -1 };
    MPI_Comm_size( MPI_COMM_WORLD, &mpi_size );
    MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );

    /*
     * Mappings: MPI rank -> hostname where the rank is executed.
     * One mapping for the writing application, one for the reading one.
     */
    chunk_assignment::RankMeta writingRanksHostnames, readingRanksHostnames;
    for( int i = 0; i < mpi_size; ++i )
    {
        /*
         * The mapping is intentionally weird. Nodes "node1", "node3", ...
         * do not have instances of the reading application running on them.
         * Our distribution strategies will need to deal with that situation.
         */
        // writers on: node0, node0, node1, node1, node2, node2, ...
        writingRanksHostnames[ i ] = "node" + std::to_string( i / 2 );
        // readers on: node0, node0, node0, node0, node2, node2, node2, ...
        // (odd-numbered nodes get no reading ranks at all)
        readingRanksHostnames[ i ] = "node" + std::to_string( i / 4 * 2 );
    }

    std::string filename = "../samples/adios2_chunk_distribution.bp";
    // Simulate a stream: BP4 assigns chunk IDs by subfile (i.e. aggregator),
    // so request one aggregator per MPI rank to obtain per-rank chunks.
    // The aggregator count is spliced into the JSON backend config as a
    // quoted string, as ADIOS2 engine parameters are string-valued.
    std::stringstream parameters;
    parameters << R"END(
{
    "adios2":
    {
        "engine":
        {
            "type": "bp4",
            "parameters":
            {
                "NumAggregators":)END"
               << "\"" << std::to_string( mpi_size ) << "\""
               << R"END(
            }
        }
    }
}
)END";

    // Debugging helper: rank 0 dumps a chunk table, resolving each chunk's
    // sourceID to a hostname via the given rank->hostname mapping.
    // (Pass the writers' mapping for raw/unassigned tables, the readers'
    // mapping for assigned tables.)
    auto printAssignment = [ mpi_rank ](
                               std::string const & strategyName,
                               ChunkTable const & table,
                               chunk_assignment::RankMeta const & meta )
    {
        if( mpi_rank != 0 )
        {
            return;
        }
        std::cout << "WITH STRATEGY '" << strategyName << "':\n";
        for( auto const & chunk : table )
        {
            std::cout << "[HOST: " << meta.at( chunk.sourceID )
                      << ",\tRank: " << chunk.sourceID << ",\tOffset: ";
            for( auto offset : chunk.offset )
            {
                std::cout << offset << ", ";
            }
            std::cout << "\tExtent: ";
            for( auto extent : chunk.extent )
            {
                std::cout << extent << ", ";
            }
            std::cout << "]" << std::endl;
        }
    };

    // Writing phase: create a (mpi_size x 10) dataset, one row per rank.
    {
        Series series(
            filename,
            openPMD::Access::CREATE,
            MPI_COMM_WORLD,
            parameters.str() );
        /*
         * The writing application sets an attribute that tells the reading
         * application about the "MPI rank -> hostname" mapping.
         * Each rank only needs to set its own value.
         * (Some other options like setting all at once or reading from a file
         * exist as well.)
         */
        series.setMpiRanksMetaInfo( writingRanksHostnames.at( mpi_rank ) );

        auto E_x = series.iterations[ 0 ].meshes[ "E" ][ "x" ];
        openPMD::Dataset ds(
            openPMD::Datatype::INT, { unsigned( mpi_size ), 10 } );
        E_x.resetDataset( ds );
        // Each rank contributes one row [rank, 0..9] filled with 0..9.
        // NOTE(review): std::iota requires <numeric>; verify this TU includes
        // it (possibly transitively) — the commit itself flags its includes
        // as provisional.
        std::vector< int > data( 10, 0 );
        std::iota( data.begin(), data.end(), 0 );
        E_x.storeChunk( data, { unsigned( mpi_rank ), 0 }, { 1, 10 } );
        series.flush();
    }

    // Reading phase: query available chunks and try out the strategies.
    {
        Series series( filename, openPMD::Access::READ_ONLY, MPI_COMM_WORLD );
        /*
         * Inquire the writing application's "MPI rank -> hostname" mapping.
         * The reading application needs to know about its own mapping.
         * Having both of these mappings is the basis for an efficient chunk
         * distribution since we can use it to figure out which instances
         * are running on the same nodes.
         */
        auto rankMetaIn = series.mpiRanksMetaInfo();
        // Round-trip check: the attribute written above must come back intact.
        REQUIRE( rankMetaIn == writingRanksHostnames );

        auto E_x = series.iterations[ 0 ].meshes[ "E" ][ "x" ];
        /*
         * Ask the backend which chunks are available.
         */
        auto const chunkTable = E_x.availableChunks();

        printAssignment( "INPUT", chunkTable, rankMetaIn );

        using namespace chunk_assignment;

        /*
         * Strategy 1: assign the chunks by distributing them one after the
         * other to reading ranks. Easy, but not particularly efficient.
         */
        RoundRobin roundRobinStrategy;
        auto roundRobinAssignment = assignChunks(
            chunkTable, rankMetaIn, readingRanksHostnames, roundRobinStrategy );
        printAssignment(
            "ROUND ROBIN", roundRobinAssignment, readingRanksHostnames );

        /*
         * Strategy 2: assign chunks by hostname.
         * Two difficulties:
         * * A distribution strategy within one node needs to be picked.
         *   We pick the BinPacking strategy that tries to assign chunks in a
         *   balanced manner. Since our chunks have a small extent along
         *   dimension 0, use dimension 1 for slicing.
         * * The assignment is partial since some nodes only have instances of
         *   the writing application. Those chunks remain unassigned.
         */
        ByHostname byHostname(
            std::make_unique< BinPacking >( /* splitAlongDimension = */ 1 ) );
        auto byHostnamePartialAssignment = assignChunks(
            chunkTable, rankMetaIn, readingRanksHostnames, byHostname );
        printAssignment(
            "HOSTNAME, ASSIGNED",
            byHostnamePartialAssignment.assigned,
            readingRanksHostnames );
        // Leftover chunks still carry *writer* source IDs, hence rankMetaIn.
        printAssignment(
            "HOSTNAME, LEFTOVER",
            byHostnamePartialAssignment.notAssigned,
            rankMetaIn );

        /*
         * Strategy 3: assign chunks by hostnames, once more.
         * This time, apply a secondary distribution strategy to assign
         * leftovers. We pick BinPacking, once more.
         * Notice that the BinPacking strategy does not (yet) take into account
         * chunks that have been assigned by the first round.
         * Balancing is calculated solely based on the leftover chunks from the
         * first round.
         */
        FromPartialStrategy fromPartialStrategy(
            std::make_unique< ByHostname >( std::move( byHostname ) ),
            std::make_unique< BinPacking >( /* splitAlongDimension = */ 1 ) );
        auto fromPartialAssignment = assignChunks(
            chunkTable,
            rankMetaIn,
            readingRanksHostnames,
            fromPartialStrategy );
        printAssignment(
            "HOSTNAME WITH SECOND PASS",
            fromPartialAssignment,
            readingRanksHostnames );

        /*
         * Strategy 4: assign chunks by slicing the n-dimensional physical
         * domain and intersecting those slices with the available chunks from
         * the backend.
         * Notice that this strategy only returns the chunks that the currently
         * running rank is supposed to load, whereas the other strategies return
         * a chunk table containing all chunks that all ranks will load.
         * In principle, a chunk_assignment::Strategy only needs to return the
         * chunks that the current rank should load, but is free to emplace the
         * other chunks for other reading ranks as well.
         * (Reasoning: In some strategies, calculating everything is necessary,
         * in others such as this one, it's an unneeded overhead.)
         */
        ByCuboidSlice cuboidSliceStrategy(
            std::make_unique< OneDimensionalBlockSlicer >( 1 ),
            E_x.getExtent(),
            mpi_rank,
            mpi_size );
        auto cuboidSliceAssignment = assignChunks(
            chunkTable,
            rankMetaIn,
            readingRanksHostnames,
            cuboidSliceStrategy );
        printAssignment(
            "CUBOID SLICE", cuboidSliceAssignment, readingRanksHostnames );
    }
}
| 1323 | + |
// Catch2 registration: runs the chunk distribution scenario under MPI,
// tagged consistently with the other parallel ADIOS2 cases in this file.
TEST_CASE( "adios2_chunk_distribution", "[parallel][adios2]" )
{
    adios2_chunk_distribution();
}
| 1328 | +#endif // openPMD_HAVE_ADIOS2 && openPMD_HAVE_MPI |
0 commit comments