Skip to content

Commit 55c4e63

Browse files
committed
Improvements
1 parent 49bac87 commit 55c4e63

1 file changed

Lines changed: 154 additions & 71 deletions

File tree

mdio/utils/segy_export.h

Lines changed: 154 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ Result<void> MdioToSegy(
311311

312312
// 1 GiB cache
313313
auto cacheJson = nlohmann::json::parse(
314-
R"({"cache_pool": {"total_bytes_limit": 1073741824}})");
314+
R"({"cache_pool": {"total_bytes_limit": 5000000000}})");
315315
auto ctxSpec = Context::Spec::FromJson(cacheJson);
316316
auto ctx = Context(ctxSpec.value());
317317

@@ -367,6 +367,20 @@ Result<void> MdioToSegy(
367367
num_traces *= shape[i];
368368
}
369369

370+
// Use the domain size of the second-fastest growing dimension for chunking
371+
// This ensures we process complete "slices" along that dimension
372+
Index output_chunk_size = 1;
373+
if (rank >= 2) {
374+
// Use the full domain size of the second-fastest growing spatial dimension
375+
output_chunk_size = shape[rank - 2];
376+
} else if (rank >= 1) {
377+
// Fallback to first spatial dimension size
378+
output_chunk_size = shape[0];
379+
}
380+
381+
std::cout << "Using output chunk size: " << output_chunk_size << " (full domain of dimension "
382+
<< (rank >= 2 ? rank - 2 : 0) << ")" << std::endl;
383+
370384
// Build Zarr spec for output SEG-Y
371385
nlohmann::json spec;
372386
spec["driver"] = "zarr";
@@ -399,7 +413,7 @@ Result<void> MdioToSegy(
399413
spec["metadata"] = {
400414
{"dtype", std::string("|V") + std::to_string(trace_bytes)},
401415
{"shape", {num_traces}},
402-
{"chunks", {1}},
416+
{"chunks", {output_chunk_size}}, // Use the computed chunk size instead of 1
403417
{"dimension_separator", "."},
404418
{"compressor", nullptr},
405419
{"fill_value", nullptr},
@@ -411,79 +425,148 @@ Result<void> MdioToSegy(
411425

412426
MDIO_ASSIGN_OR_RETURN(
413427
auto out_var,
414-
Variable<>::Open(spec, constants::kCreateClean).result());
415-
416-
// prepare an (N-1)-length odometer for spatial dims
417-
std::vector<Index> coords(rank - 1, 0);
418-
std::vector<RangeDescriptor<Index>> descs;
419-
descs.reserve(rank);
420-
421-
// flat loop over all traces
422-
for (Index trace_idx = 0; trace_idx < num_traces; ++trace_idx) {
423-
// Progress bar (tqdm style)
424-
double progress = static_cast<double>(trace_idx) / static_cast<double>(num_traces);
425-
int bar_width = 50;
426-
int pos = static_cast<int>(bar_width * progress);
427-
428-
std::cout << "\r[";
429-
for (int i = 0; i < bar_width; ++i) {
430-
if (i < pos) std::cout << "=";
431-
else if (i == pos) std::cout << ">";
432-
else std::cout << " ";
428+
Variable<>::Open(spec, constants::kCreateClean, ctx).result());
429+
430+
// Process traces by chunking along the second-fastest growing dimension
431+
// This ensures we align with the underlying storage chunks
432+
size_t chunk_dim = (rank >= 2) ? rank - 2 : 0; // Index of second-fastest growing spatial dim
433+
Index chunk_dim_size = shape[chunk_dim];
434+
435+
// Calculate total number of "slices" in all other spatial dimensions
436+
Index num_outer_slices = 1;
437+
for (size_t d = 0; d < rank - 1; ++d) {
438+
if (d != chunk_dim) {
439+
num_outer_slices *= shape[d];
433440
}
434-
std::cout << "] " << static_cast<int>(progress * 100.0) << "% "
435-
<< "(" << trace_idx << "/" << num_traces << " traces)";
436-
std::cout.flush();
437-
438-
// build slice for dims 0..N-2
439-
descs.clear();
440-
for (size_t d = 0; d < rank - 1; ++d) {
441-
descs.push_back(RangeDescriptor<Index>{
442-
labels[d],
443-
coords[d],
444-
coords[d] + 1,
445-
1});
441+
}
442+
443+
Index traces_processed = 0;
444+
445+
// Iterate through all combinations of the other spatial dimensions
446+
std::vector<Index> outer_coords(rank - 1, 0);
447+
448+
for (Index outer_slice = 0; outer_slice < num_outer_slices; ++outer_slice) {
449+
// Convert outer_slice to coordinates for all dimensions except chunk_dim
450+
Index temp_slice = outer_slice;
451+
for (int d = static_cast<int>(rank) - 2; d >= 0; --d) {
452+
if (static_cast<size_t>(d) != chunk_dim) {
453+
outer_coords[d] = temp_slice % shape[d];
454+
temp_slice /= shape[d];
455+
}
446456
}
447-
// full slice on final dimension
448-
descs.push_back(RangeDescriptor<Index>{
449-
labels[rank - 1],
450-
0,
451-
num_samples,
452-
1});
453-
454-
// read the full trace (header + samples)
455-
MDIO_ASSIGN_OR_RETURN(auto trace_var, seismic_var.slice(descs));
456-
MDIO_ASSIGN_OR_RETURN(auto trace_data, trace_var.Read().result());
457457

458-
const char* in_ptr = reinterpret_cast<const char*>(
459-
trace_data.get_data_accessor().data());
460-
ptrdiff_t in_off = trace_data.get_flattened_offset();
461-
462-
// slice+read one output slot
463-
RangeDescriptor<Index> write_desc{"trace", trace_idx, trace_idx + 1, 1};
464-
MDIO_ASSIGN_OR_RETURN(auto out_slice, out_var.slice(write_desc));
465-
MDIO_ASSIGN_OR_RETURN(auto out_data, out_slice.Read().result());
466-
467-
char* out_ptr = reinterpret_cast<char*>(
468-
out_data.get_data_accessor().data());
469-
ptrdiff_t out_off = out_data.get_flattened_offset();
470-
471-
// zero header+data, then copy samples after 240-byte header
472-
std::memset(out_ptr + out_off, 0, trace_bytes);
473-
std::memcpy(
474-
out_ptr + out_off + 240,
475-
in_ptr + (in_off * sample_bytes), // Convert element offset to byte offset
476-
static_cast<size_t>(num_samples) * sample_bytes);
477-
478-
// write it back
479-
auto fut = out_slice.Write(out_data);
480-
if (!fut.status().ok()) return fut.status();
458+
// Now chunk along the chunk_dim
459+
for (Index chunk_start = 0; chunk_start < chunk_dim_size; chunk_start += output_chunk_size) {
460+
Index chunk_end = std::min(chunk_start + output_chunk_size, chunk_dim_size);
461+
Index chunk_size = chunk_end - chunk_start;
462+
463+
// Progress bar (tqdm style)
464+
double progress = static_cast<double>(traces_processed) / static_cast<double>(num_traces);
465+
int bar_width = 50;
466+
int pos = static_cast<int>(bar_width * progress);
467+
468+
// std::cout << "\r[";
469+
// for (int i = 0; i < bar_width; ++i) {
470+
// if (i < pos) std::cout << "=";
471+
// else if (i == pos) std::cout << ">";
472+
// else std::cout << " ";
473+
// }
474+
// std::cout << "] " << static_cast<int>(progress * 100.0) << "% "
475+
// << "(" << traces_processed << "/" << num_traces << " traces)";
476+
// std::cout.flush();
477+
478+
// Build slice descriptors for this specific chunk
479+
std::vector<RangeDescriptor<Index>> chunk_descs;
480+
chunk_descs.reserve(rank);
481+
482+
// For each spatial dimension, either use the fixed coordinate or the chunk range
483+
for (size_t d = 0; d < rank - 1; ++d) {
484+
if (d == chunk_dim) {
485+
// This is the dimension we're chunking along
486+
chunk_descs.push_back(RangeDescriptor<Index>{
487+
labels[d],
488+
chunk_start,
489+
chunk_end,
490+
1});
491+
} else {
492+
// This is a fixed coordinate for this outer slice
493+
chunk_descs.push_back(RangeDescriptor<Index>{
494+
labels[d],
495+
outer_coords[d],
496+
outer_coords[d] + 1,
497+
1});
498+
}
499+
}
500+
501+
// Full slice on the last dimension (samples/time)
502+
chunk_descs.push_back(RangeDescriptor<Index>{
503+
labels[rank - 1],
504+
0,
505+
num_samples,
506+
1});
481507

482-
// increment odometer
483-
for (int d = static_cast<int>(rank) - 2; d >= 0; --d) {
484-
coords[d]++;
485-
if (coords[d] < shape[d]) break;
486-
coords[d] = 0;
508+
// Read only the seismic data we need for this chunk
509+
MDIO_ASSIGN_OR_RETURN(auto chunk_var, seismic_var.slice(chunk_descs));
510+
std::cout << chunk_var << std::endl;
511+
MDIO_ASSIGN_OR_RETURN(auto chunk_data, chunk_var.Read().result());
512+
std::cout << "Finished reading chunk_var..." << std::endl;
513+
514+
const char* chunk_ptr = reinterpret_cast<const char*>(
515+
chunk_data.get_data_accessor().data());
516+
ptrdiff_t chunk_off = chunk_data.get_flattened_offset();
517+
518+
// Allocate buffer for the output chunk
519+
std::vector<char> chunk_buffer(chunk_size * trace_bytes, 0);
520+
521+
// Process each trace in the chunk
522+
for (Index i = 0; i < chunk_size; ++i) {
523+
// Calculate the offset in the chunk data for this trace
524+
Index trace_offset_in_chunk = i * num_samples;
525+
526+
// Copy trace data to chunk buffer (240 bytes header + samples)
527+
char* trace_buffer = chunk_buffer.data() + (i * trace_bytes);
528+
std::memset(trace_buffer, 0, 240); // Zero the header
529+
std::memcpy(
530+
trace_buffer + 240,
531+
chunk_ptr + (chunk_off + trace_offset_in_chunk) * sample_bytes,
532+
static_cast<size_t>(num_samples) * sample_bytes);
533+
}
534+
535+
// Calculate the output trace range for this chunk
536+
// Convert coordinates back to linear trace indices
537+
Index output_start = 0;
538+
Index multiplier = 1;
539+
540+
// Build the linear index from coordinates
541+
std::vector<Index> trace_coords = outer_coords;
542+
trace_coords[chunk_dim] = chunk_start;
543+
544+
for (int d = static_cast<int>(rank) - 2; d >= 0; --d) {
545+
output_start += trace_coords[d] * multiplier;
546+
multiplier *= shape[d];
547+
}
548+
549+
Index output_end = output_start + chunk_size;
550+
551+
// Write the chunk to output
552+
RangeDescriptor<Index> write_desc{"trace", output_start, output_end, 1};
553+
MDIO_ASSIGN_OR_RETURN(auto out_slice, out_var.slice(write_desc));
554+
std::cout << out_slice << std::endl;
555+
MDIO_ASSIGN_OR_RETURN(auto out_data, out_slice.Read().result());
556+
std::cout << "Finished reading out_data..." << std::endl;
557+
char* out_ptr = reinterpret_cast<char*>(
558+
out_data.get_data_accessor().data());
559+
ptrdiff_t out_off = out_data.get_flattened_offset();
560+
561+
// Copy the chunk buffer to output
562+
std::memcpy(out_ptr + out_off, chunk_buffer.data(), chunk_size * trace_bytes);
563+
564+
// Write it back
565+
std::cout << "Writing back to out_slice..." << std::endl;
566+
auto fut = out_slice.Write(out_data);
567+
if (!fut.status().ok()) return fut.status();
568+
std::cout << "Finished writing back to out_slice..." << std::endl;
569+
traces_processed += chunk_size;
487570
}
488571
}
489572

0 commit comments

Comments
 (0)