Skip to content

Commit feded2c

Browse files
franzpoeschelax3lpre-commit-ci[bot]
authored
Make lazy parsing (defer_iteration_parsing) more discoverable by giving a hint on the command line (#1802)
* Base impl: Hint when parsing takes too long Only file-based so far * Factor out timeout functionality * Do this also for g/v-encodings * Env variables, better warning message * Add some docs * Adapt examles * Revert dataframe example It accesses all Iterations at once * Dataframe: Use snapshots API for iterating To support Streaming workflows * Formatting * Fix extra quote * Inline Comment: Seconds * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Formatting --------- Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent e248cc7 commit feded2c

File tree

11 files changed

+196
-26
lines changed

11 files changed

+196
-26
lines changed

docs/source/details/backendconfig.rst

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,34 @@ Using the Streaming API (i.e. ``SeriesInterface::readIteration()``) will do this
9494
Parsing eagerly might be very expensive for a Series with many iterations, but will avoid bugs by forgotten calls to ``Iteration::open()``.
9595
In complex environments, calling ``Iteration::open()`` on an already open environment does no harm (and does not incur additional runtime cost for additional ``open()`` calls).
9696

97+
By default, the library will print a warning to suggest using deferred Iteration parsing when opening a Series takes long.
98+
The timeout can be tuned by the JSON/TOML key ``hint_lazy_parsing_timeout`` (integer, seconds):
99+
if set to a positive value, the library will print periodic warnings to stderr when eager parsing of Iterations takes longer than the specified number of seconds (default: ``20``). Setting this option to ``0`` disables the warnings.
100+
101+
Environment variables may alternatively be used for options concerning deferred iteration parsing:
102+
103+
* Environment variable ``OPENPMD_DEFER_ITERATION_PARSING``: if set to a truthy value (e.g. ``1``), the Series will be opened with deferred iteration parsing as if ``{"defer_iteration_parsing": true}`` had been supplied.
104+
* Environment variable ``OPENPMD_HINT_LAZY_PARSING_TIMEOUT``: accepts integral values equivalent to the ``hint_lazy_parsing_timeout`` key.
105+
106+
Examples:
107+
108+
.. code-block:: bash
109+
110+
# enable lazy parsing via env var
111+
export OPENPMD_DEFER_ITERATION_PARSING=1
112+
113+
# disable the parsing hint/warning
114+
export OPENPMD_HINT_LAZY_PARSING_TIMEOUT=0
115+
116+
Or in a Series constructor JSON/TOML configuration:
117+
118+
.. code-block:: json
119+
120+
{
121+
"defer_iteration_parsing": true,
122+
"hint_lazy_parsing_timeout": 20
123+
}
124+
97125
The key ``resizable`` can be passed to ``Dataset`` options.
98126
It if set to ``{"resizable": true}``, this declares that it shall be allowed to increased the ``Extent`` of a ``Dataset`` via ``resetDataset()`` at a later time, i.e., after it has been first declared (and potentially written).
99127
For HDF5, resizable Datasets come with a performance penalty.

examples/2_read_serial.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@ using namespace openPMD;
2929

3030
int main()
3131
{
32-
Series series =
33-
Series("../samples/git-sample/data%T.h5", Access::READ_ONLY);
32+
Series series = Series(
33+
"../samples/git-sample/data%T.h5",
34+
Access::READ_ONLY,
35+
R"({"defer_iteration_parsing": true})");
3436
cout << "Read a Series with openPMD standard version " << series.openPMD()
3537
<< '\n';
3638

@@ -40,7 +42,8 @@ int main()
4042
cout << "\n\t" << i.first;
4143
cout << '\n';
4244

43-
Iteration i = series.snapshots()[100];
45+
// with defer_iteration_parsing, open() must be called explicitly
46+
Iteration i = series.snapshots()[100].open();
4447
cout << "Iteration 100 contains " << i.meshes.size() << " meshes:";
4548
for (auto const &m : i.meshes)
4649
cout << "\n\t" << m.first;

examples/2_read_serial.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
if __name__ == "__main__":
1212
series = io.Series("../samples/git-sample/data%T.h5",
13-
io.Access.read_only)
13+
io.Access.read_only, {
14+
"defer_iteration_parsing": True
15+
})
1416
print("Read a Series with openPMD standard version %s" %
1517
series.openPMD)
1618

@@ -20,7 +22,8 @@
2022
print("\t {0}".format(i))
2123
print("")
2224

23-
i = series.snapshots()[100]
25+
# with defer_iteration_parsing, open() must be called explicitly
26+
i = series.snapshots()[100].open()
2427
print("Iteration 100 contains {0} meshes:".format(len(i.meshes)))
2528
for m in i.meshes:
2629
print("\t {0}".format(m))

examples/2a_read_thetaMode_serial.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,13 @@ int main()
3131
{
3232
/* The pattern %E instructs the openPMD-api to determine the file ending
3333
* automatically. It can also be given explicitly, e.g. `data%T.h5`. */
34-
Series series =
35-
Series("../samples/git-sample/thetaMode/data%T.h5", Access::READ_ONLY);
34+
Series series = Series(
35+
"../samples/git-sample/thetaMode/data%T.h5",
36+
Access::READ_ONLY,
37+
R"({"defer_iteration_parsing": true})");
3638

37-
Iteration i = series.snapshots()[500];
39+
// defer_iteration_parsing implies that open() must be called explicitly
40+
Iteration i = series.snapshots()[500].open();
3841
MeshRecordComponent E_z_modes = i.meshes["E"]["z"];
3942
Extent extent = E_z_modes.getExtent(); // (modal components, r, z)
4043

examples/2a_read_thetaMode_serial.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,12 @@
1212
# The pattern %E instructs the openPMD-api to determine the file ending
1313
# automatically. It can also be given explicitly, e.g. `data%T.h5`.
1414
series = io.Series("../samples/git-sample/thetaMode/data%T.h5",
15-
io.Access.read_only)
15+
io.Access.read_only, {
16+
"defer_iteration_parsing": True
17+
})
1618

17-
i = series.snapshots()[500]
19+
# with defer_iteration_parsing, open() must be called explicitly
20+
i = series.snapshots()[500].open()
1821
E_z_modes = i.meshes["E"]["z"]
1922
shape = E_z_modes.shape # (modal components, r, z)
2023

examples/4_read_parallel.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,17 @@ int main(int argc, char *argv[])
4040
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
4141

4242
Series series = Series(
43-
"../samples/git-sample/data%T.h5", Access::READ_ONLY, MPI_COMM_WORLD);
43+
"../samples/git-sample/data%T.h5",
44+
Access::READ_ONLY,
45+
MPI_COMM_WORLD,
46+
R"({"defer_iteration_parsing": true})");
4447
if (0 == mpi_rank)
4548
cout << "Read a series in parallel with " << mpi_size << " MPI ranks\n";
4649

47-
MeshRecordComponent E_x = series.snapshots()[100].meshes["E"]["x"];
50+
// with defer_iteration_parsing, open() must be called explicitly
51+
// explicit Iteration opening is recommended in general for parallel
52+
// applications
53+
MeshRecordComponent E_x = series.snapshots()[100].open().meshes["E"]["x"];
4854

4955
Offset chunk_offset = {static_cast<long unsigned int>(mpi_rank) + 1, 1, 1};
5056
Extent chunk_extent = {2, 2, 1};

examples/4_read_parallel.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,17 @@
2020
series = io.Series(
2121
"../samples/git-sample/data%T.h5",
2222
io.Access.read_only,
23-
comm
23+
comm, {
24+
"defer_iteration_parsing": True
25+
}
2426
)
2527
if 0 == comm.rank:
2628
print("Read a series in parallel with {} MPI ranks".format(
2729
comm.size))
2830

29-
E_x = series.snapshots()[100].meshes["E"]["x"]
31+
# with defer_iteration_parsing, open() must be called explicitly
32+
# explicit use of open() is recommended for parallel applications
33+
E_x = series.snapshots()[100].open().meshes["E"]["x"]
3034

3135
chunk_offset = [comm.rank + 1, 1, 1]
3236
chunk_extent = [2, 2, 1]

examples/6_dump_filebased_series.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ using namespace openPMD;
77

88
int main()
99
{
10-
Series o =
11-
Series("../samples/git-sample/data%T.h5", Access::READ_RANDOM_ACCESS);
10+
Series o = Series(
11+
"../samples/git-sample/data%T.h5",
12+
Access::READ_RANDOM_ACCESS,
13+
R"({"defer_iteration_parsing": true})");
1214

1315
std::cout << "Read iterations ";
1416
for (auto const &val : o.snapshots())
@@ -39,6 +41,8 @@ int main()
3941

4042
for (auto &[index, i] : o.snapshots())
4143
{
44+
// with defer_iteration_parsing, open() must be called explicitly
45+
i.open();
4246
std::cout << "Read attributes in iteration " << index << ":\n";
4347
for (auto const &val : i.attributes())
4448
std::cout << '\t' << val << '\n';

include/openPMD/Series.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ namespace internal
217217
* True if a user opts into lazy parsing.
218218
*/
219219
bool m_parseLazily = false;
220+
uint64_t m_hintLazyParsingAfterTimeout = 20; // seconds
220221

221222
/**
222223
* In variable-based encoding, all backends except ADIOS2 can only write

src/Series.cpp

Lines changed: 118 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "openPMD/IterationEncoding.hpp"
3333
#include "openPMD/ThrowError.hpp"
3434
#include "openPMD/auxiliary/Date.hpp"
35+
#include "openPMD/auxiliary/Environment.hpp"
3536
#include "openPMD/auxiliary/Filesystem.hpp"
3637
#include "openPMD/auxiliary/JSON_internal.hpp"
3738
#include "openPMD/auxiliary/Mpi.hpp"
@@ -48,6 +49,7 @@
4849

4950
#include <algorithm>
5051
#include <cctype>
52+
#include <chrono>
5153
#include <exception>
5254
#include <iomanip>
5355
#include <iostream>
@@ -126,6 +128,79 @@ namespace
126128
int padding,
127129
std::string const &postfix,
128130
std::optional<std::string> const &extension);
131+
132+
struct TimeoutLazyParsing
133+
{
134+
using Clock = std::chrono::system_clock;
135+
Clock::time_point start_parsing, time_of_last_warning;
136+
uint64_t timeout;
137+
bool printed_warning_already = false;
138+
139+
TimeoutLazyParsing(uint64_t timeout_in) : timeout(timeout_in)
140+
{
141+
if (timeout > 0)
142+
{
143+
start_parsing = Clock::now();
144+
time_of_last_warning = start_parsing;
145+
}
146+
}
147+
148+
void now(size_t current_iteration_count, size_t total_iteration_count)
149+
{
150+
if (timeout == 0)
151+
{
152+
return;
153+
}
154+
auto current = Clock::now();
155+
auto diff = std::chrono::duration_cast<std::chrono::seconds>(
156+
current - time_of_last_warning);
157+
if (uint64_t(diff.count()) >= timeout)
158+
{
159+
auto total_diff =
160+
std::chrono::duration_cast<std::chrono::seconds>(
161+
current - start_parsing);
162+
if (!printed_warning_already)
163+
{
164+
std::cerr << &R"END(
165+
[openPMD] WARNING: Parsing Iterations is taking a long time.
166+
Consider using deferred Iteration parsing in order to open the Series lazily.
167+
This can be achieved by either setting an environment variable:
168+
169+
> export OPENPMD_DEFER_ITERATION_PARSING=1
170+
171+
Or by specifying it as part of a JSON/TOML configuration:
172+
173+
> // C++:
174+
> Series simData("my_data_%T.%E", R"({"defer_iteration_parsing": true})");
175+
> // Python:
176+
> simData = opmd.Series("my_data_%T.%E", {"defer_iteration_parsing": True})
177+
178+
Iterations will then be parsed only upon explicit user request:
179+
180+
> series.snapshots()[100].open() // new API
181+
> series.iterations[100].open() // old API
182+
183+
Alternatively, Iterations will be opened implicitly when iterating in
184+
READ_LINEAR access mode.
185+
Refer also to the documentation at https://openpmd-api.readthedocs.io
186+
187+
This warning can be suppressed also by either specifying
188+
an environment variable:
189+
190+
> export OPENPMD_HINT_LAZY_PARSING_TIMEOUT=0
191+
192+
Or by the JSON/TOML option {"hint_lazy_parsing_timeout": 0}.
193+
)END"[1] << '\n';
194+
printed_warning_already = true;
195+
}
196+
std::cerr << "Elapsed time: " << total_diff.count()
197+
<< "s, parsed " << current_iteration_count << " of "
198+
<< total_iteration_count << " Iterations."
199+
<< std::endl;
200+
time_of_last_warning = current;
201+
}
202+
}
203+
};
129204
} // namespace
130205

131206
struct Series::ParsedInput
@@ -1757,13 +1832,21 @@ void Series::readFileBased(
17571832
{
17581833
bool atLeastOneIterationSuccessful = false;
17591834
std::optional<error::ReadError> forwardFirstError;
1835+
1836+
TimeoutLazyParsing timeout(series.m_hintLazyParsingAfterTimeout);
1837+
1838+
size_t read_iterations = 0;
17601839
for (auto &iteration : series.iterations)
17611840
{
17621841
if (read_only_this_single_iteration.has_value() &&
17631842
*read_only_this_single_iteration != iteration.first)
17641843
{
17651844
continue;
17661845
}
1846+
if (!read_only_this_single_iteration.has_value())
1847+
{
1848+
timeout.now(read_iterations, iterations.size());
1849+
}
17671850
if (auto error = readIterationEagerly(iteration.second); error)
17681851
{
17691852
std::cerr << "Cannot read iteration '" << iteration.first
@@ -1779,6 +1862,7 @@ void Series::readFileBased(
17791862
{
17801863
atLeastOneIterationSuccessful = true;
17811864
}
1865+
++read_iterations;
17821866
}
17831867
if (!atLeastOneIterationSuccessful)
17841868
{
@@ -2157,6 +2241,10 @@ creating new iterations.
21572241

21582242
auto currentSteps = currentSnapshot();
21592243

2244+
TimeoutLazyParsing timeout{
2245+
series.m_parseLazily ? 0 : series.m_hintLazyParsingAfterTimeout};
2246+
size_t parsed_iterations = 0;
2247+
21602248
switch (iterationEncoding())
21612249
{
21622250
case IterationEncoding::groupBased:
@@ -2175,6 +2263,10 @@ creating new iterations.
21752263
{
21762264
continue;
21772265
}
2266+
if (!read_only_this_single_iteration.has_value())
2267+
{
2268+
timeout.now(parsed_iterations, pList.paths->size());
2269+
}
21782270
if (auto err = internal::withRWAccess(
21792271
IOHandler()->m_seriesStatus,
21802272
[&]() {
@@ -2198,6 +2290,7 @@ creating new iterations.
21982290
{
21992291
readableIterations.push_back(index);
22002292
}
2293+
++parsed_iterations;
22012294
}
22022295
if (currentSteps.has_value())
22032296
{
@@ -2254,6 +2347,10 @@ creating new iterations.
22542347

22552348
for (auto it : *currentSteps)
22562349
{
2350+
if (!read_only_this_single_iteration.has_value())
2351+
{
2352+
timeout.now(parsed_iterations, pList.paths->size());
2353+
}
22572354
/*
22582355
* Variable-based iteration encoding relies on steps, so parsing
22592356
* must happen after opening the first step.
@@ -2281,6 +2378,7 @@ creating new iterations.
22812378
*/
22822379
throw *err;
22832380
}
2381+
++parsed_iterations;
22842382
}
22852383
return *currentSteps;
22862384
}
@@ -2982,9 +3080,18 @@ namespace
29823080
* If yes, read it into the specified location.
29833081
*/
29843082
template <typename From, typename Dest = From>
2985-
void
2986-
getJsonOption(json::TracingJSON &config, std::string const &key, Dest &dest)
3083+
void getJsonOption(
3084+
json::TracingJSON &config,
3085+
std::string const &key,
3086+
Dest &dest,
3087+
std::optional<std::string> envVar = std::nullopt)
29873088
{
3089+
if (envVar.has_value())
3090+
{
3091+
dest = auxiliary::getEnvNum(*envVar, dest);
3092+
std::cout << "Read from env var " << *envVar << " as: " << dest
3093+
<< std::endl;
3094+
}
29883095
if (config.json().contains(key))
29893096
{
29903097
dest = config[key].json().get<From>();
@@ -3027,7 +3134,15 @@ void Series::parseJsonOptions(TracingJSON &options, ParsedInput &input)
30273134
{
30283135
auto &series = get();
30293136
getJsonOption<bool>(
3030-
options, "defer_iteration_parsing", series.m_parseLazily);
3137+
options,
3138+
"defer_iteration_parsing",
3139+
series.m_parseLazily,
3140+
"OPENPMD_DEFER_ITERATION_PARSING");
3141+
getJsonOption<uint64_t>(
3142+
options,
3143+
"hint_lazy_parsing_timeout",
3144+
series.m_hintLazyParsingAfterTimeout,
3145+
"OPENPMD_HINT_LAZY_PARSING_TIMEOUT");
30313146
internal::SeriesData::SourceSpecifiedViaJSON rankTableSource;
30323147
if (getJsonOptionLowerCase(options, "rank_table", rankTableSource.value))
30333148
{

0 commit comments

Comments
 (0)