Commit 30b762e
fix: remove the use of `BufferedIterator` (#4737)
This pull request refactors the data loading mechanism in the training
module by replacing the `BufferedIterator` class with a simpler
`cycle_iterator` function.
I was able to reproduce this error on a Ni6Fe10 system provided by
@iProzd. The error log shows that something goes wrong while garbage
collection is running. I've tried the following approaches:
- Not using GPU for training, but CPU: failed
- Set `NUM_WORKERS=0`: worked
- Manually run garbage collection `gc.collect()` when a DataLoader
finishes its epoch: worked, but ~10% slower
<details><summary>Error log</summary>
<p>
```
Fatal Python error: Aborted
Thread 0x00007f68b9289700 (most recent call first):
File "/root/miniconda3/lib/python3.10/threading.py", line 320 in wait
File "/root/miniconda3/lib/python3.10/multiprocessing/queues.py", line 231 in _feed
File "/root/miniconda3/lib/python3.10/threading.py", line 953 in run
File "/root/miniconda3/lib/python3.10/threading.py", line 1016 in _bootstrap_inner
File "/root/miniconda3/lib/python3.10/threading.py", line 973 in _bootstrap
Current thread 0x00007f6c595d7280 (most recent call first):
Garbage-collecting
File "/root/miniconda3/lib/python3.10/ast.py", line 99 in _convert
File "/root/miniconda3/lib/python3.10/ast.py", line 110 in literal_eval
File "/root/miniconda3/lib/python3.10/site-packages/numpy/lib/utils.py", line 1078 in safe_eval
File "/root/miniconda3/lib/python3.10/site-packages/numpy/lib/format.py", line 623 in _read_array_header
File "/root/miniconda3/lib/python3.10/site-packages/numpy/lib/format.py", line 784 in read_array
File "/root/miniconda3/lib/python3.10/site-packages/numpy/lib/npyio.py", line 456 in load
File "/aisi/cc/deepmd-kit/deepmd/utils/path.py", line 187 in load_numpy
File "/aisi/cc/deepmd-kit/deepmd/utils/data.py", line 634 in _load_data
File "/aisi/cc/deepmd-kit/deepmd/utils/data.py", line 526 in _load_set
File "/aisi/cc/deepmd-kit/deepmd/utils/data.py", line 251 in get_item_torch
File "/aisi/cc/deepmd-kit/deepmd/pt/utils/dataset.py", line 39 in __getitem__
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52 in <listcomp>
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52 in fetch
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 789 in _next_data
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 733 in __next__
File "/aisi/cc/deepmd-kit/deepmd/pt/utils/dataloader.py", line 204 in __getitem__
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 54 in fetch
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 349 in _worker_loop
File "/root/miniconda3/lib/python3.10/multiprocessing/process.py", line 108 in run
File "/root/miniconda3/lib/python3.10/multiprocessing/process.py", line 314 in _bootstrap
File "/root/miniconda3/lib/python3.10/multiprocessing/popen_fork.py", line 71 in _launch
File "/root/miniconda3/lib/python3.10/multiprocessing/popen_fork.py", line 19 in __init__
File "/root/miniconda3/lib/python3.10/multiprocessing/context.py", line 281 in _Popen
File "/root/miniconda3/lib/python3.10/multiprocessing/context.py", line 224 in _Popen
File "/root/miniconda3/lib/python3.10/multiprocessing/process.py", line 121 in start
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1171 in __init__
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 424 in _get_iterator
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 493 in __iter__
File "/aisi/cc/deepmd-kit/deepmd/pt/train/training.py", line 1075 in get_data
File "/aisi/cc/deepmd-kit/deepmd/pt/train/training.py", line 689 in step
File "/aisi/cc/deepmd-kit/deepmd/pt/train/training.py", line 960 in run
File "/aisi/cc/deepmd-kit/deepmd/pt/entrypoints/main.py", line 361 in train
File "/aisi/cc/deepmd-kit/deepmd/pt/entrypoints/main.py", line 530 in main
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355 in wrapper
File "/aisi/cc/deepmd-kit/deepmd/main.py", line 930 in main
File "/root/miniconda3/bin/dp", line 8 in <module>
Extension modules: numpy.core._multiarray_umath, numpy.core._multiarray_tests, numpy.linalg._umath_linalg, numpy.fft._pocketfft_internal, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, torch._C, torch._C._dynamo.autograd_compiler, torch._C._dynamo.eval_frame, torch._C._dynamo.guards, torch._C._dynamo.utils, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, h5py._errors, h5py.defs, h5py._objects, h5py.h5, h5py.utils, h5py.h5t, h5py.h5s, h5py.h5ac, h5py.h5p, h5py.h5r, h5py._proxy, h5py._conv, h5py.h5z, h5py.h5a, h5py.h5d, h5py.h5ds, h5py.h5g, h5py.h5i, h5py.h5o, h5py.h5f, h5py.h5fd, h5py.h5pl, h5py.h5l, h5py._selector, yaml._yaml, scipy._lib._ccallback_c, scipy.special._ufuncs_cxx, scipy.special._ufuncs, scipy.special._specfun, scipy.special._comb, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_sqrtm_triu, scipy.linalg._matfuncs_expm, scipy.linalg._linalg_pythran, scipy.linalg.cython_blas, scipy.linalg._decomp_update, scipy.sparse._sparsetools, _csparsetools, scipy.sparse._csparsetools, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.linalg._propack._spropack, scipy.sparse.linalg._propack._dpropack, scipy.sparse.linalg._propack._cpropack, scipy.sparse.linalg._propack._zpropack, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering, scipy.special._ellip_harm_2, scipy.interpolate._fitpack, scipy.interpolate._dfitpack, scipy.optimize._group_columns, scipy._lib.messagestream, 
scipy.optimize._trlib._trlib, scipy.optimize._lbfgsb, _moduleTNC, scipy.optimize._moduleTNC, scipy.optimize._cobyla, scipy.optimize._slsqp, scipy.optimize._minpack, scipy.optimize._lsq.givens_elimination, scipy.optimize._zeros, scipy.optimize._cython_nnls, scipy._lib._uarray._uarray, scipy.linalg._decomp_interpolative, scipy.optimize._bglu_dense, scipy.optimize._lsap, scipy.spatial._ckdtree, scipy.spatial._qhull, scipy.spatial._voronoi, scipy.spatial._distance_wrap, scipy.spatial._hausdorff, scipy.spatial.transform._rotation, scipy.optimize._direct, scipy.interpolate._dierckx, scipy.interpolate._ppoly, scipy.interpolate._interpnd, scipy.interpolate._rbfinterp_pythran, scipy.interpolate._rgi_cython, scipy.interpolate._bspl (total: 113)
Traceback (most recent call last):
File "/root/miniconda3/bin/dp", line 8, in <module>
sys.exit(main())
File "/aisi/cc/deepmd-kit/deepmd/main.py", line 930, in main
deepmd_main(args)
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
File "/aisi/cc/deepmd-kit/deepmd/pt/entrypoints/main.py", line 530, in main
train(
File "/aisi/cc/deepmd-kit/deepmd/pt/entrypoints/main.py", line 361, in train
trainer.run()
File "/aisi/cc/deepmd-kit/deepmd/pt/train/training.py", line 960, in run
step(step_id)
File "/aisi/cc/deepmd-kit/deepmd/pt/train/training.py", line 705, in step
loss.backward()
File "/root/miniconda3/lib/python3.10/site-packages/torch/_tensor.py", line 648, in backward
torch.autograd.backward(
File "/root/miniconda3/lib/python3.10/site-packages/torch/autograd/__init__.py", line 353, in backward
_engine_run_backward(
File "/root/miniconda3/lib/python3.10/site-packages/torch/autograd/graph.py", line 824, in _engine_run_backward
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 434731) is killed by signal: Aborted.
```
</p>
</details>
The problem involves the interaction between PyTorch tensors and Python
threads and pipes, which makes the root cause hard to locate.
I tested this PR on single-task training and multi-task training, and
the training speed (in s/1000 steps) is almost the same:
|data|before|after|
|---|---|---|
|omat|235.52|235.89|
|multi-task pretraining|290.63|290.00|
Fix #4586
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
- **Refactor**
- Enhanced data loading with an infinite cycling iterator for
uninterrupted batch retrieval during training and validation.
- Removed background prefetching and threading to simplify data loading
utilities.
- **Style**
- Added a clarifying comment about shuffling behavior when distributed
sampling is active.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---------
Signed-off-by: Chun Cai <amoycaic@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
BufferedIterator (#4737)
1 parent 43e0288 commit 30b762e
File tree
4 files changed
+66
-130
lines changed
- deepmd/pt
- train
- utils
- source/tests/pt/model
4 files changed
+66
-130
lines changed
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
2 | 2 | | |
3 | 3 | | |
4 | 4 | | |
| 5 | + | |
| 6 | + | |
| 7 | + | |
5 | 8 | | |
6 | 9 | | |
7 | 10 | | |
| |||
47 | 50 | | |
48 | 51 | | |
49 | 52 | | |
50 | | - | |
51 | 53 | | |
52 | 54 | | |
53 | 55 | | |
| |||
159 | 161 | | |
160 | 162 | | |
161 | 163 | | |
| 164 | + | |
| 165 | + | |
| 166 | + | |
| 167 | + | |
| 168 | + | |
| 169 | + | |
| 170 | + | |
| 171 | + | |
| 172 | + | |
| 173 | + | |
| 174 | + | |
| 175 | + | |
| 176 | + | |
| 177 | + | |
| 178 | + | |
| 179 | + | |
162 | 180 | | |
163 | | - | |
| 181 | + | |
164 | 182 | | |
165 | 183 | | |
166 | 184 | | |
| |||
177 | 195 | | |
178 | 196 | | |
179 | 197 | | |
180 | | - | |
181 | | - | |
182 | | - | |
| 198 | + | |
| 199 | + | |
183 | 200 | | |
184 | | - | |
| 201 | + | |
185 | 202 | | |
186 | 203 | | |
187 | 204 | | |
188 | 205 | | |
189 | 206 | | |
190 | 207 | | |
191 | | - | |
192 | | - | |
| 208 | + | |
| 209 | + | |
193 | 210 | | |
194 | 211 | | |
195 | 212 | | |
196 | 213 | | |
197 | 214 | | |
198 | 215 | | |
199 | 216 | | |
200 | | - | |
| 217 | + | |
201 | 218 | | |
202 | 219 | | |
203 | 220 | | |
204 | | - | |
| 221 | + | |
205 | 222 | | |
206 | | - | |
| 223 | + | |
207 | 224 | | |
208 | 225 | | |
209 | 226 | | |
| |||
1064 | 1081 | | |
1065 | 1082 | | |
1066 | 1083 | | |
1067 | | - | |
1068 | | - | |
1069 | | - | |
1070 | | - | |
1071 | | - | |
1072 | | - | |
1073 | | - | |
1074 | | - | |
1075 | | - | |
1076 | | - | |
1077 | | - | |
1078 | | - | |
1079 | | - | |
1080 | | - | |
1081 | | - | |
1082 | | - | |
1083 | | - | |
1084 | | - | |
1085 | | - | |
1086 | | - | |
1087 | | - | |
| 1084 | + | |
| 1085 | + | |
1088 | 1086 | | |
1089 | | - | |
1090 | | - | |
1091 | | - | |
1092 | | - | |
1093 | | - | |
1094 | | - | |
1095 | | - | |
1096 | | - | |
1097 | | - | |
1098 | | - | |
1099 | | - | |
1100 | | - | |
1101 | | - | |
1102 | | - | |
1103 | | - | |
1104 | | - | |
1105 | | - | |
1106 | | - | |
1107 | | - | |
1108 | | - | |
| 1087 | + | |
| 1088 | + | |
| 1089 | + | |
| 1090 | + | |
| 1091 | + | |
| 1092 | + | |
1109 | 1093 | | |
1110 | 1094 | | |
1111 | 1095 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | 1 | | |
2 | 2 | | |
3 | 3 | | |
4 | | - | |
5 | 4 | | |
6 | 5 | | |
7 | 6 | | |
8 | | - | |
9 | | - | |
10 | | - | |
11 | | - | |
12 | | - | |
13 | | - | |
14 | 7 | | |
15 | 8 | | |
16 | 9 | | |
| |||
173 | 166 | | |
174 | 167 | | |
175 | 168 | | |
176 | | - | |
| 169 | + | |
| 170 | + | |
| 171 | + | |
177 | 172 | | |
178 | 173 | | |
179 | 174 | | |
| |||
200 | 195 | | |
201 | 196 | | |
202 | 197 | | |
203 | | - | |
204 | | - | |
205 | | - | |
206 | | - | |
207 | | - | |
| 198 | + | |
| 199 | + | |
| 200 | + | |
| 201 | + | |
| 202 | + | |
| 203 | + | |
208 | 204 | | |
209 | 205 | | |
210 | 206 | | |
| |||
235 | 231 | | |
236 | 232 | | |
237 | 233 | | |
238 | | - | |
239 | | - | |
240 | | - | |
241 | | - | |
242 | | - | |
243 | | - | |
244 | | - | |
245 | | - | |
246 | | - | |
247 | | - | |
248 | | - | |
249 | | - | |
250 | | - | |
251 | | - | |
252 | | - | |
253 | | - | |
254 | | - | |
255 | | - | |
256 | | - | |
257 | | - | |
258 | | - | |
259 | | - | |
260 | | - | |
261 | | - | |
262 | | - | |
263 | | - | |
264 | | - | |
265 | | - | |
266 | | - | |
267 | | - | |
268 | | - | |
269 | | - | |
270 | | - | |
271 | | - | |
272 | | - | |
273 | | - | |
274 | | - | |
275 | | - | |
276 | | - | |
277 | | - | |
278 | | - | |
279 | | - | |
280 | | - | |
281 | | - | |
282 | | - | |
283 | | - | |
284 | | - | |
285 | | - | |
286 | 234 | | |
287 | 235 | | |
288 | 236 | | |
| |||
320 | 268 | | |
321 | 269 | | |
322 | 270 | | |
323 | | - | |
| 271 | + | |
| 272 | + | |
| 273 | + | |
| 274 | + | |
| 275 | + | |
324 | 276 | | |
325 | 277 | | |
326 | 278 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
25 | 25 | | |
26 | 26 | | |
27 | 27 | | |
28 | | - | |
29 | 28 | | |
30 | 29 | | |
31 | 30 | | |
| |||
72 | 71 | | |
73 | 72 | | |
74 | 73 | | |
| 74 | + | |
| 75 | + | |
| 76 | + | |
| 77 | + | |
| 78 | + | |
75 | 79 | | |
76 | | - | |
| 80 | + | |
77 | 81 | | |
78 | 82 | | |
79 | 83 | | |
| |||
111 | 115 | | |
112 | 116 | | |
113 | 117 | | |
114 | | - | |
115 | | - | |
116 | | - | |
117 | | - | |
118 | | - | |
119 | | - | |
| 118 | + | |
| 119 | + | |
120 | 120 | | |
121 | 121 | | |
122 | 122 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
25 | 25 | | |
26 | 26 | | |
27 | 27 | | |
28 | | - | |
29 | 28 | | |
30 | 29 | | |
31 | 30 | | |
| |||
72 | 71 | | |
73 | 72 | | |
74 | 73 | | |
| 74 | + | |
| 75 | + | |
| 76 | + | |
| 77 | + | |
| 78 | + | |
75 | 79 | | |
76 | | - | |
| 80 | + | |
77 | 81 | | |
78 | 82 | | |
79 | 83 | | |
| |||
105 | 109 | | |
106 | 110 | | |
107 | 111 | | |
108 | | - | |
109 | | - | |
110 | | - | |
111 | | - | |
112 | | - | |
113 | | - | |
| 112 | + | |
| 113 | + | |
114 | 114 | | |
115 | 115 | | |
116 | 116 | | |
| |||
0 commit comments