@@ -307,6 +307,165 @@ If you want to achieve the best compression, you should use `GZIP` or `SNAPPY` w
307307
308308For not yet supported algorithms, please check our [Roadmap](https://github.com/orgs/flow-php/projects/1) to understand when they will be supported.
309309
310+ ### Per-Column Compression
311+
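312+ You can specify a different compression algorithm for each column by keying `Option::COLUMNS_COMPRESSIONS` with the column's flat path (a dot-separated path such as `user.address.city`). Columns that are not listed fall back to the compression passed to the `Writer`. This gives you fine-grained control over the trade-off between storage size and read/write performance.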
313+
314+ #### When to Use Per-Column Compression
315+
316+ **Different data characteristics require different compression strategies:**
317+
318+ - **Fast access columns** (IDs, timestamps) - Use `UNCOMPRESSED` or `LZ4` for minimal decompression overhead
319+ - **Categorical data** (status, country codes) - Use `SNAPPY` for balanced compression with good performance
320+ - **Text/JSON data** (descriptions, metadata) - Use `ZSTD` or `BROTLI` for maximum compression
321+ - **Numerical data** - Use `LZ4` or `SNAPPY` for good compression with fast access
322+ - **Archival columns** - Use `ZSTD` for maximum compression when access speed is less critical
323+
324+ #### Basic Per-Column Compression
325+
326+ ``` php
327+ use Flow\Parquet\{Writer, Options, Option};
328+ use Flow\Parquet\ParquetFile\{Schema, Compressions};
329+ use Flow\Parquet\ParquetFile\Schema\FlatColumn;
330+
331+ $schema = Schema::with(
332+ FlatColumn::int64('user_id'),
333+ FlatColumn::string('status'),
334+ FlatColumn::string('description'),
335+ FlatColumn::float('price')
336+ );
337+
338+ $options = Options::default()->set(Option::COLUMNS_COMPRESSIONS, [
339+ 'user_id' => Compressions::UNCOMPRESSED, // Fast access for frequent queries
340+ 'status' => Compressions::SNAPPY, // Balanced compression for enum-like data
341+ 'description' => Compressions::ZSTD, // Maximum compression for text data
342+ 'price' => Compressions::LZ4 // Fast compression for numeric data
343+ ]);
344+
345+ // Global compression serves as fallback for unspecified columns
346+ $writer = new Writer(compressions: Compressions::GZIP, options: $options);
347+ ```
348+
349+ #### Nested Column Compression
350+
351+ For nested structures, use the same flat path notation that is used for column encodings (see the Column Encodings section below):
352+
353+ ``` php
354+ use Flow\Parquet\ParquetFile\Schema\{NestedColumn, ListElement, MapKey, MapValue};
355+
356+ $schema = Schema::with(
357+ NestedColumn::struct('user', [
358+ FlatColumn::int64('id'),
359+ FlatColumn::string('name'),
360+ NestedColumn::struct('address', [
361+ FlatColumn::string('street'),
362+ FlatColumn::string('city'),
363+ FlatColumn::string('country')
364+ ])
365+ ]),
366+ NestedColumn::list('tags', ListElement::string()),
367+ NestedColumn::map('metadata', MapKey::string(), MapValue::string())
368+ );
369+
370+ $options = Options::default()->set(Option::COLUMNS_COMPRESSIONS, [
371+ // Struct fields - direct access
372+ 'user.id' => Compressions::UNCOMPRESSED, // Primary key - fast access
373+ 'user.name' => Compressions::SNAPPY, // Balanced for names
374+ 'user.address.street' => Compressions::ZSTD, // High compression for addresses
375+ 'user.address.city' => Compressions::LZ4, // Fast for frequently queried cities
376+ 'user.address.country' => Compressions::SNAPPY, // Balanced for country codes
377+
378+ // List elements - note the '.list.element' suffix
379+ 'tags.list.element' => Compressions::BROTLI, // High compression for tags
380+
381+ // Map key/value pairs - note the '.key_value.key/value' suffix
382+ 'metadata.key_value.key' => Compressions::LZ4, // Fast access for metadata keys
383+ 'metadata.key_value.value' => Compressions::ZSTD // Max compression for metadata values
384+ ]);
385+ ```
386+
387+ #### Performance-Optimized Strategies
388+
389+ **Query-Optimized Strategy:**
390+ ``` php
391+ $options = Options::default()->set(Option::COLUMNS_COMPRESSIONS, [
392+ // Frequently queried columns - prioritize speed
393+ 'user_id' => Compressions::UNCOMPRESSED,
394+ 'created_at' => Compressions::LZ4,
395+ 'status' => Compressions::SNAPPY,
396+
397+ // Analytical columns - prioritize compression
398+ 'analytics_payload' => Compressions::ZSTD,
399+ 'raw_data' => Compressions::BROTLI,
400+
401+ // Balanced approach for mixed usage
402+ 'category' => Compressions::SNAPPY,
403+ 'description' => Compressions::GZIP
404+ ]);
405+ ```
406+
407+ **Storage-Optimized Strategy:**
408+ ``` php
409+ $options = Options::default()->set(Option::COLUMNS_COMPRESSIONS, [
410+ // Only critical columns use fast compression
411+ 'id' => Compressions::LZ4,
412+
413+ // Everything else maximizes compression
414+ 'content' => Compressions::ZSTD,
415+ 'metadata' => Compressions::BROTLI,
416+ 'tags' => Compressions::GZIP,
417+ 'attributes' => Compressions::ZSTD
418+ ]);
419+ ```
420+
421+ #### Combined Compression and Encoding Strategy
422+
423+ You can combine per-column compression with custom encodings for optimal results:
424+
425+ ``` php
426+ $options = Options::default()
427+ ->set(Option::COLUMNS_COMPRESSIONS, [
428+ 'user_id' => Compressions::UNCOMPRESSED, // Fast primary key access
429+ 'status' => Compressions::SNAPPY, // Balanced for enum data
430+ 'metadata' => Compressions::ZSTD // Max compression for JSON
431+ ])
432+ ->set(Option::COLUMNS_ENCODINGS, [
433+ 'user_id' => Encodings::DELTA_BINARY_PACKED, // Efficient encoding for sequential IDs
434+ 'status' => Encodings::RLE_DICTIONARY, // Dictionary for repeated values
435+ 'metadata' => Encodings::PLAIN // No encoding overhead for compressed data
436+ ]);
437+ ```
438+
439+ #### Compression Selection Guidelines
440+
441+ | Data Type | Characteristics | Recommended Compression | Use Case |
442+ |-----------|-----------------|-------------------------|----------|
443+ | **Primary Keys** | Sequential integers, frequent queries | `UNCOMPRESSED` or `LZ4` | Fast joins and lookups |
444+ | **Status/Categories** | Low cardinality, repeated values | `SNAPPY` | Balanced performance |
445+ | **Timestamps** | Sequential, frequently filtered | `LZ4` | Fast time-based queries |
446+ | **Text Content** | High variance, large size | `ZSTD` or `BROTLI` | Storage optimization |
447+ | **JSON/Metadata** | Complex nested data | `ZSTD` | Maximum compression |
448+ | **Numerical Data** | Calculations, aggregations | `SNAPPY` or `LZ4` | Fast mathematical operations |
449+ | **Archive Data** | Rarely accessed | `ZSTD` or `BROTLI` | Long-term storage |
450+
451+ #### Performance vs. Compression Trade-offs
452+
453+ **Compression Ratio (Best to Worst, typical - actual ratios depend on your data and the configured compression level):**
454+ 1. `ZSTD` - Best compression, slower decompression
455+ 2. `BROTLI` - Excellent compression, moderate speed
456+ 3. `GZIP` - Good compression, widely supported
457+ 4. `SNAPPY` - Balanced compression and speed (default)
458+ 5. `LZ4` - Fast compression/decompression, moderate ratio
459+ 6. `UNCOMPRESSED` - No compression overhead, largest size
460+
461+ **Decompression Speed (Fastest to Slowest, approximate - benchmark with your own data, as results vary with the codec implementation and compression level):**
462+ 1. `UNCOMPRESSED` - No decompression needed
463+ 2. `LZ4` - Very fast decompression
464+ 3. `SNAPPY` - Fast decompression (good balance)
465+ 4. `GZIP` - Moderate decompression speed
466+ 5. `BROTLI` - Slower decompression
467+ 6. `ZSTD` - Configurable, generally slower for high compression
468+
310469## Column Encodings
311470
312471Parquet supports various column encoding algorithms that can significantly impact file size and query performance.
@@ -325,12 +484,12 @@ The default encoding that stores values as-is without any compression scheme.
325484** Supported types:** All column types
326485
327486``` php
328- use Flow\Parquet\Options;
329- use Flow\Parquet\Option ;
487+ use Flow\Parquet\{ Options, Option} ;
488+ use Flow\Parquet\ParquetFile\Encodings ;
330489
331490$options = Options::default()->set(Option::COLUMNS_ENCODINGS, [
332- 'description' => ' PLAIN' ,
333- 'uuid' => ' PLAIN'
491+ 'description' => Encodings:: PLAIN,
492+ 'uuid' => Encodings:: PLAIN
334493]);
335494```
336495
@@ -347,9 +506,9 @@ Run Length Encoding with Dictionary compression. Values are stored in a dictiona
347506
348507``` php
349508$options = Options::default()->set(Option::COLUMNS_ENCODINGS, [
350- 'status' => ' RLE_DICTIONARY' , // 'active', 'inactive', 'pending'
351- 'country_code' => ' RLE_DICTIONARY' , // 'US', 'UK', 'DE', 'FR'
352- 'department' => ' RLE_DICTIONARY' // 'engineering', 'sales', 'marketing'
509+ 'status' => Encodings:: RLE_DICTIONARY, // 'active', 'inactive', 'pending'
510+ 'country_code' => Encodings:: RLE_DICTIONARY, // 'US', 'UK', 'DE', 'FR'
511+ 'department' => Encodings:: RLE_DICTIONARY // 'engineering', 'sales', 'marketing'
353512]);
354513```
355514
@@ -365,9 +524,9 @@ Delta encoding with binary packing for integer columns. Stores differences betwe
365524
366525``` php
367526$options = Options::default()->set(Option::COLUMNS_ENCODINGS, [
368- 'user_id' => ' DELTA_BINARY_PACKED' , // 1, 2, 3, 4, 5...
369- 'timestamp_ms' => ' DELTA_BINARY_PACKED' , // 1634567890123, 1634567890124...
370- 'order_number' => ' DELTA_BINARY_PACKED' // Sequential order IDs
527+ 'user_id' => Encodings:: DELTA_BINARY_PACKED, // 1, 2, 3, 4, 5...
528+ 'timestamp_ms' => Encodings:: DELTA_BINARY_PACKED, // 1634567890123, 1634567890124...
529+ 'order_number' => Encodings:: DELTA_BINARY_PACKED // Sequential order IDs
371530]);
372531```
373532
@@ -377,7 +536,7 @@ $options = Options::default()->set(Option::COLUMNS_ENCODINGS, [
377536
378537``` php
379538use Flow\Parquet\{Writer, Options, Option};
380- use Flow\Parquet\ParquetFile\Schema;
539+ use Flow\Parquet\ParquetFile\{ Schema, Compressions, Encodings} ;
381540use Flow\Parquet\ParquetFile\Schema\FlatColumn;
382541
383542$schema = Schema::with(
@@ -387,9 +546,9 @@ $schema = Schema::with(
387546);
388547
389548$options = Options::default()->set(Option::COLUMNS_ENCODINGS, [
390- 'user_id' => ' DELTA_BINARY_PACKED' , // Sequential IDs
391- 'status' => ' RLE_DICTIONARY' , // Limited set of values
392- 'description' => ' PLAIN' // High variance text
549+ 'user_id' => Encodings:: DELTA_BINARY_PACKED, // Sequential IDs
550+ 'status' => Encodings:: RLE_DICTIONARY, // Limited set of values
551+ 'description' => Encodings:: PLAIN // High variance text
393552]);
394553
395554$writer = new Writer(compressions: Compressions::SNAPPY, options: $options);
@@ -447,18 +606,18 @@ $schema = Schema::with(
447606``` php
448607$options = Options::default()->set(Option::COLUMNS_ENCODINGS, [
449608 // Struct fields - direct access
450- 'user.id' => ' DELTA_BINARY_PACKED' ,
451- 'user.name' => ' RLE_DICTIONARY' ,
452- 'user.address.street' => ' PLAIN' ,
453- 'user.address.city' => ' RLE_DICTIONARY' ,
454- 'user.address.country' => ' RLE_DICTIONARY' ,
609+ 'user.id' => Encodings:: DELTA_BINARY_PACKED,
610+ 'user.name' => Encodings:: RLE_DICTIONARY,
611+ 'user.address.street' => Encodings:: PLAIN,
612+ 'user.address.city' => Encodings:: RLE_DICTIONARY,
613+ 'user.address.country' => Encodings:: RLE_DICTIONARY,
455614
456615 // List elements - note the '.list.element' suffix
457- 'tags.list.element' => ' RLE_DICTIONARY' ,
616+ 'tags.list.element' => Encodings:: RLE_DICTIONARY,
458617
459618 // Map key/value pairs - note the '.key_value.key/value' suffix
460- 'metadata.key_value.key' => ' RLE_DICTIONARY' ,
461- 'metadata.key_value.value' => ' PLAIN'
619+ 'metadata.key_value.key' => Encodings:: RLE_DICTIONARY,
620+ 'metadata.key_value.value' => Encodings:: PLAIN
462621]);
463622```
464623
@@ -476,12 +635,12 @@ $schema = Schema::with(
476635
477636$options = Options::default()->set(Option::COLUMNS_ENCODINGS, [
478637 // List of structs: list_name.list.element.field_name
479- 'orders.list.element.order_id' => ' DELTA_BINARY_PACKED' ,
480- 'orders.list.element.status' => ' RLE_DICTIONARY' ,
638+ 'orders.list.element.order_id' => Encodings:: DELTA_BINARY_PACKED,
639+ 'orders.list.element.status' => Encodings:: RLE_DICTIONARY,
481640
482641 // Map inside list element: list_name.list.element.map_name.key_value.key/value
483- 'orders.list.element.attributes.key_value.key' => ' RLE_DICTIONARY' ,
484- 'orders.list.element.attributes.key_value.value' => ' PLAIN'
642+ 'orders.list.element.attributes.key_value.key' => Encodings:: RLE_DICTIONARY,
643+ 'orders.list.element.attributes.key_value.value' => Encodings:: PLAIN
485644]);
486645```
487646
@@ -490,17 +649,17 @@ $options = Options::default()->set(Option::COLUMNS_ENCODINGS, [
490649``` php
491650$options = Options::default()->set(Option::COLUMNS_ENCODINGS, [
492651 // High cardinality sequential data
493- 'order_id' => ' DELTA_BINARY_PACKED' ,
494- 'created_timestamp' => ' DELTA_BINARY_PACKED' ,
652+ 'order_id' => Encodings:: DELTA_BINARY_PACKED,
653+ 'created_timestamp' => Encodings:: DELTA_BINARY_PACKED,
495654
496655 // Low cardinality categorical data
497- 'order_status' => ' RLE_DICTIONARY' ,
498- 'payment_method' => ' RLE_DICTIONARY' ,
499- 'shipping_country' => ' RLE_DICTIONARY' ,
656+ 'order_status' => Encodings:: RLE_DICTIONARY,
657+ 'payment_method' => Encodings:: RLE_DICTIONARY,
658+ 'shipping_country' => Encodings:: RLE_DICTIONARY,
500659
501660 // High variance descriptive data
502- 'customer_notes' => ' PLAIN' ,
503- 'product_description' => ' PLAIN'
661+ 'customer_notes' => Encodings:: PLAIN,
662+ 'product_description' => Encodings:: PLAIN
504663]);
505664```
506665
0 commit comments