|
29 | 29 | from executorch.exir.dialects._ops import ops as exir_ops |
30 | 30 | from executorch.exir.memory_planning import ( |
31 | 31 | _do_user_inputs_exist, |
| 32 | + apply_algo, |
| 33 | + collect_specs_from_nodes, |
32 | 34 | filter_nodes, |
33 | 35 | get_node_tensor_specs, |
34 | 36 | greedy, |
|
45 | 47 | ToOutVarPass, |
46 | 48 | ) |
47 | 49 | from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass |
| 50 | +from executorch.exir.schema import DeviceType |
48 | 51 | from executorch.exir.tensor import TensorSpec |
49 | 52 | from functorch.experimental.control_flow import map as torch_map |
50 | 53 | from parameterized import parameterized |
@@ -1259,3 +1262,169 @@ def reset(self, k_zeros: torch.Tensor, v_zeros: torch.Tensor) -> None: |
1259 | 1262 | self.assertEqual(v_cache[0].val.allocation_info.memory_id, 2) |
1260 | 1263 | self.assertEqual(v_cache[0].val.allocation_info.memory_offset_low, 256) |
1261 | 1264 | self.assertEqual(v_cache[0].val.allocation_info.memory_offset_high, 0) |
| 1265 | + |
| 1266 | + |
class TestDeviceAwareMemoryPlanning(unittest.TestCase):
    """Tests for per-device memory planning (separate buffers per device type).

    Each test exports ToyModelForMemPlanning, tags some/all of its tensor
    specs with a device, runs the greedy planning suite via ``apply_algo``,
    and checks the resulting buffer sizes, mem_id assignments, and the
    ``non_const_buffer_device`` metadata on the graph module.
    """

    def _prepare_model(
        self,
    ) -> Tuple[GraphModule, ExportGraphSignature]:
        """Prepare ToyModelForMemPlanning through SpecPropPass + ToOutVarPass."""
        model = ToyModelForMemPlanning()
        inputs = model.get_random_inputs()
        edge = to_edge(export(model, inputs, strict=True))
        gm = edge.exported_program().graph_module
        gs = edge.exported_program().graph_signature
        gm = PassManager(passes=[SpecPropPass(), ToOutVarPass()])(gm).graph_module
        return gm, gs

    def _get_planned_specs(
        self,
        gm: GraphModule,
        gs: ExportGraphSignature,
    ) -> list[TensorSpec]:
        """Get the unique set of specs that apply_algo would plan.

        Mirrors apply_algo's own spec collection (nothing ignored) so that
        device tags set on these specs are seen by the planner.
        """
        return list(
            collect_specs_from_nodes(
                gm.graph.nodes,
                gs,
                do_assertion=False,
                ignore_graph_input=False,
                ignore_graph_output=False,
                ignore_mutable_buffers=False,
            )
        )

    def _run_planning(
        self,
        gm: GraphModule,
        gs: ExportGraphSignature,
        **kwargs: bool,
    ) -> list[int]:
        """Run the greedy planning suite on ``gm``; return planned bufsizes.

        Extra keyword arguments (e.g. ``enable_non_cpu_memory_planning``)
        are forwarded to ``apply_algo`` unchanged.
        """
        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
        return apply_algo(algo, gm, 16, gs, **kwargs)

    def test_cpu_only_unchanged(self) -> None:
        """CPU-only specs produce bufsizes = [0, X] with no device metadata."""
        gm, gs = self._prepare_model()

        bufsizes = self._run_planning(
            gm, gs, enable_non_cpu_memory_planning=True
        )

        # CPU-only planning: empty constants pool, all activations in the
        # single CPU buffer, and no device map is attached to the graph.
        self.assertEqual(bufsizes[0], 0)  # constants
        self.assertGreater(bufsizes[1], 0)  # CPU activations
        self.assertNotIn("non_const_buffer_device", gm.meta)

    def test_all_cuda_no_wasted_slots(self) -> None:
        """CUDA-only specs produce [0, X] with CUDA at buffer index 1."""
        gm, gs = self._prepare_model()
        specs = self._get_planned_specs(gm, gs)
        for spec in specs:
            spec.device = DeviceType.CUDA

        bufsizes = self._run_planning(gm, gs, enable_non_cpu_memory_planning=True)

        # [0, cuda_size] — no wasted CPU buffer slot
        self.assertEqual(len(bufsizes), 2)
        self.assertEqual(bufsizes[0], 0)
        self.assertGreater(bufsizes[1], 0)
        # Device mapping should be present
        self.assertIn("non_const_buffer_device", gm.meta)
        device_map = gm.meta["non_const_buffer_device"]
        self.assertEqual(len(device_map), 2)
        self.assertEqual(device_map[0].device_type, DeviceType.CPU)  # constants
        self.assertEqual(device_map[1].device_type, DeviceType.CUDA)

    def test_mixed_cpu_cuda_separate_buffers(self) -> None:
        """CPU specs at mem_id=1, CUDA specs at mem_id=2, separate sizes."""
        gm, gs = self._prepare_model()
        specs = self._get_planned_specs(gm, gs)

        # Set second half of specs to CUDA
        mid = len(specs) // 2
        self.assertGreater(mid, 0)
        cpu_specs = specs[:mid]
        cuda_specs = specs[mid:]
        for spec in cuda_specs:
            spec.device = DeviceType.CUDA

        bufsizes = self._run_planning(gm, gs, enable_non_cpu_memory_planning=True)

        # [constants, cpu_activations, cuda_activations]
        self.assertEqual(len(bufsizes), 3)
        self.assertEqual(bufsizes[0], 0)
        self.assertGreater(bufsizes[1], 0)
        self.assertGreater(bufsizes[2], 0)

        # CPU specs should have mem_id=1, CUDA specs should have mem_id=2
        for spec in cpu_specs:
            self.assertEqual(spec.mem_id, 1, f"CPU spec has wrong mem_id: {spec.mem_id}")
        for spec in cuda_specs:
            self.assertEqual(spec.mem_id, 2, f"CUDA spec has wrong mem_id: {spec.mem_id}")

    def test_mem_offset_correct_after_remap(self) -> None:
        """After remapping, mem_offset is relative to its own buffer."""
        gm, gs = self._prepare_model()
        specs = self._get_planned_specs(gm, gs)

        # Set the last spec to CUDA (sole CUDA tensor)
        cuda_spec = specs[-1]
        cuda_spec.device = DeviceType.CUDA

        bufsizes = self._run_planning(
            gm, gs, enable_non_cpu_memory_planning=True
        )

        # The CUDA spec is the only tensor in its buffer, so offset should be 0
        self.assertEqual(cuda_spec.mem_offset, 0)
        # The CUDA buffer should fit exactly this tensor
        cuda_mem_id = cuda_spec.mem_id
        self.assertIsNotNone(cuda_mem_id)
        assert cuda_mem_id is not None  # narrow type for the index below
        self.assertGreaterEqual(bufsizes[cuda_mem_id], cuda_spec.allocated_memory)

    def test_no_cross_device_memory_sharing(self) -> None:
        """Specs on different devices never share buffers, regardless of lifetime."""
        gm, gs = self._prepare_model()
        specs = self._get_planned_specs(gm, gs)
        self.assertGreaterEqual(len(specs), 2)

        # Assign alternating specs to CUDA to ensure some pairs have
        # non-overlapping lifetimes (which greedy would normally share).
        for i, spec in enumerate(specs):
            if i % 2 == 0:
                spec.device = DeviceType.CUDA

        self._run_planning(gm, gs, enable_non_cpu_memory_planning=True)

        # Verify CPU and CUDA specs have disjoint mem_ids
        cpu_mem_ids: set[int] = set()
        cuda_mem_ids: set[int] = set()
        for i, spec in enumerate(specs):
            if spec.mem_id is not None:
                if i % 2 == 0:
                    cuda_mem_ids.add(spec.mem_id)
                else:
                    cpu_mem_ids.add(spec.mem_id)

        self.assertTrue(
            cpu_mem_ids.isdisjoint(cuda_mem_ids),
            f"CPU {cpu_mem_ids} and CUDA {cuda_mem_ids} should not share buffers",
        )

    def test_disabled_falls_back_to_cpu(self) -> None:
        """With enable_non_cpu_memory_planning=False (default), CUDA specs are
        planned into CPU memory — no device-specific buffers are created."""
        gm, gs = self._prepare_model()
        specs = self._get_planned_specs(gm, gs)
        for spec in specs:
            spec.device = DeviceType.CUDA

        # Default: enable_non_cpu_memory_planning=False
        bufsizes = self._run_planning(gm, gs)

        # All specs planned into a single CPU pool — same as CPU-only
        self.assertEqual(len(bufsizes), 2)
        self.assertEqual(bufsizes[0], 0)
        self.assertGreater(bufsizes[1], 0)
        self.assertNotIn("non_const_buffer_device", gm.meta)
0 commit comments