Skip to content

Commit d8a839f

Browse files
authored
Fix parquet fixed array length (#2257)
* refactor: parquet binary encoding to standalone functions - Convert ByteConverter class to standalone functions in Binary/functions.php (encode_i8, decode_i8, encode_i32, decode_i32, encode_f64, decode_f64, etc.) - Remove ByteOrder parameter from i8/u8 functions (single bytes have no endianness) - Add precision validation to encode_decimal and decode_decimal functions - Fix UuidConverter to handle Stringable objects (e.g., Flow\Types\Value\Uuid) - Fix static analysis errors: - Add Bytes type to RLEDictionaryChunkBuilder::$pageValues annotation - Handle Bytes objects as array keys in ScalarDictionaryBuilder - Remove unnecessary @phpstan-ignore annotations in RLEBitPackedHybrid - Regenerate CLI test fixture orders.parquet with correct binary UUID - Update CLI test expectations to match regenerated fixture * refactor: parquet performance optimizations * refactor: optimize dremel shredder algorithm * refactor: reduce computation of parquet flat path * refactor: ColumnDataValidator optimizations * fix: missing dsl definitions
1 parent e5a31d9 commit d8a839f

51 files changed

Lines changed: 1554 additions & 872 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@
201201
"src/lib/azure-sdk/src/Flow/Azure/SDK/DSL/functions.php",
202202
"src/lib/filesystem/src/Flow/Filesystem/DSL/functions.php",
203203
"src/lib/parquet/src/Flow/Parquet/functions.php",
204+
"src/lib/parquet/src/Flow/Parquet/Binary/functions.php",
204205
"src/lib/parquet/src/stubs.php",
205206
"src/lib/postgresql/src/Flow/PostgreSql/DSL/functions.php",
206207
"src/lib/postgresql/src/stubs.php",

src/adapter/etl-adapter-parquet/src/Flow/ETL/Adapter/Parquet/ParquetExtractor.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
use Flow\ETL\{Exception\InvalidArgumentException, Extractor, FlowContext};
99
use Flow\ETL\Extractor\{FileExtractor, Limitable, LimitableExtractor, PathFiltering, Signal};
1010
use Flow\Filesystem\{Path, SourceStream};
11-
use Flow\Parquet\{ByteOrder, Options, ParquetFile, Reader};
11+
use Flow\Parquet\Binary\ByteOrder;
12+
use Flow\Parquet\{Options, ParquetFile, Reader};
1213

1314
final class ParquetExtractor implements Extractor, FileExtractor, LimitableExtractor
1415
{

src/adapter/etl-adapter-parquet/src/Flow/ETL/Adapter/Parquet/functions.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
};
1313
use Flow\ETL\Schema;
1414
use Flow\Filesystem\Path;
15-
use Flow\Parquet\{ByteOrder, Options};
15+
use Flow\Parquet\Binary\ByteOrder;
16+
use Flow\Parquet\Options;
1617
use Flow\Parquet\ParquetFile\Compressions;
1718

1819
/**

src/cli/tests/Flow/CLI/Tests/Integration/FileReadCommandTest.php

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,15 @@ public function test_read_rows_parquet() : void
9898

9999
self::assertCommandOutputIdentical(
100100
<<<'OUTPUT'
101-
+----------------------+----------------------+----------------------+-----------+----------------------+-------------------+----------------------+----------------------+----------------------+
102-
| order_id | created_at | updated_at | discount | email | customer | address | notes | items |
103-
+----------------------+----------------------+----------------------+-----------+----------------------+-------------------+----------------------+----------------------+----------------------+
104-
| 1e4544ab-7c94-3d39-b | 2024-10-02T23:01:19+ | 2024-11-18T12:21:16+ | 26.209999 | shany14@baumbach.org | Rafaela Hartmann | {"street":"64610 Kat | ["Deleniti vitae dol | [{"sku":"SKU_0005"," |
105-
| f1aba27a-3387-3e10-b | 2024-09-04T01:26:42+ | 2024-11-15T05:27:03+ | | okon.haley@yahoo.com | Marjolaine Kohler | {"street":"14054 Ker | ["Nulla exercitation | [{"sku":"SKU_0003"," |
106-
| c8d23f0b-c157-323f-8 | 2024-02-14T14:07:07+ | 2024-11-06T13:38:25+ | | rolfson.noble@hotmai | Loyce McLaughlin | {"street":"9058 Kess | ["Laborum molestiae | [{"sku":"SKU_0005"," |
107-
| d6215090-cea0-3fd9-a | 2024-10-12T09:18:12+ | 2024-11-21T09:38:15+ | | lruecker@hotmail.com | Estelle Schinner | {"street":"68058 Dav | ["In dolore nam et s | [{"sku":"SKU_0003"," |
108-
| ac622a00-7de2-3eb0-b | 2024-06-07T10:27:53+ | 2024-11-23T21:03:36+ | | morar.beth@mayer.net | Ethan Hodkiewicz | {"street":"84594 Vla | ["Vel ipsam id quos | [{"sku":"SKU_0004"," |
109-
+----------------------+----------------------+----------------------+-----------+----------------------+-------------------+----------------------+----------------------+----------------------+
101+
+----------------------+----------------------+----------------------+-----------+----------------------+------------------+----------------------+----------------------+----------------------+
102+
| order_id | created_at | updated_at | discount | email | customer | address | notes | items |
103+
+----------------------+----------------------+----------------------+-----------+----------------------+------------------+----------------------+----------------------+----------------------+
104+
| 4e93f175-0c89-30df-a | 2026-01-29T13:48:47+ | 2026-02-15T13:48:47+ | 27.969999 | fahey.aurelie@denesi | Jerod Abbott | {"street":"5567 Grim | ["Maxime et sed impe | [{"sku":"SKU_0004"," |
105+
| db560d65-3a26-359e-8 | 2026-02-15T18:39:41+ | | 35.990002 | dusty70@howell.com | Omari McGlynn | {"street":"86184 Mck | ["Mollitia eos optio | [{"sku":"SKU_0001"," |
106+
| f24d7f7d-1615-3b5b-a | 2026-01-26T05:23:13+ | 2026-02-01T05:23:13+ | 42.669998 | linnea91@gmail.com | Kendall Weissnat | {"street":"65268 Apr | ["Ut nesciunt volupt | [{"sku":"SKU_0003"," |
107+
| 78fe7069-f06f-3081-b | 2026-03-12T13:08:56+ | | | glubowitz@morissette | Roman Balistreri | {"street":"1722 Hall | ["Velit vero invento | [{"sku":"SKU_0005"," |
108+
| e3fb781c-34ff-380f-8 | 2026-01-10T18:34:34+ | 2026-02-06T18:34:34+ | | walton60@hand.com | Sedrick Ondricka | {"street":"201 Mosci | ["Doloremque culpa i | [{"sku":"SKU_0004"," |
109+
+----------------------+----------------------+----------------------+-----------+----------------------+------------------+----------------------+----------------------+----------------------+
110110
5 rows
111111

112112
OUTPUT,

src/cli/tests/Flow/CLI/Tests/Integration/FileSchemaCommandTest.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,7 @@ public function test_run_schema_with_table_output_on_parquet() : void
388388
+------------+----------+----------+----------+
389389
| order_id | uuid | false | [] |
390390
| created_at | datetime | false | [] |
391-
| updated_at | datetime | false | [] |
391+
| updated_at | datetime | true | [] |
392392
| discount | float | true | [] |
393393
| email | string | false | [] |
394394
| customer | string | false | [] |
-30.6 KB
Binary file not shown.

src/lib/parquet/composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
},
3434
"files": [
3535
"src/Flow/Parquet/functions.php",
36+
"src/Flow/Parquet/Binary/functions.php",
3637
"src/stubs.php"
3738
]
3839
},

src/lib/parquet/src/Flow/Parquet/ByteOrder.php renamed to src/lib/parquet/src/Flow/Parquet/Binary/ByteOrder.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
declare(strict_types=1);
44

5-
namespace Flow\Parquet;
5+
namespace Flow\Parquet\Binary;
66

77
enum ByteOrder : string
88
{

src/lib/parquet/src/Flow/Parquet/BinaryReader/Bytes.php renamed to src/lib/parquet/src/Flow/Parquet/Binary/Bytes.php

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,24 @@
22

33
declare(strict_types=1);
44

5-
namespace Flow\Parquet\BinaryReader;
5+
namespace Flow\Parquet\Binary;
66

7-
use Flow\Parquet\{ByteOrder, DataSize};
7+
use Flow\Parquet\DataSize;
88

9+
/**
10+
* @implements \ArrayAccess<int, int>
11+
* @implements \IteratorAggregate<int, int>
12+
*/
913
final class Bytes implements \ArrayAccess, \Countable, \IteratorAggregate
1014
{
15+
/** @var ?\ArrayIterator<int, int> */
1116
private ?\ArrayIterator $iterator = null;
1217

1318
private readonly DataSize $size;
1419

20+
/**
21+
* @param array<int, int> $bytes
22+
*/
1523
public function __construct(
1624
private array $bytes,
1725
private readonly ByteOrder $byteOrder = ByteOrder::LITTLE_ENDIAN,
@@ -31,7 +39,9 @@ public function count() : int
3139
return \count($this->bytes);
3240
}
3341

34-
// IteratorAggregate methods
42+
/**
43+
* @return \ArrayIterator<int, int>
44+
*/
3545
public function getIterator() : \ArrayIterator
3646
{
3747
if ($this->iterator === null) {
@@ -41,18 +51,17 @@ public function getIterator() : \ArrayIterator
4151
return $this->iterator;
4252
}
4353

44-
// ArrayAccess methods
45-
public function offsetExists($offset) : bool
54+
public function offsetExists(mixed $offset) : bool
4655
{
4756
return isset($this->bytes[$offset]);
4857
}
4958

50-
public function offsetGet($offset) : mixed
59+
public function offsetGet(mixed $offset) : int
5160
{
5261
return $this->bytes[$offset];
5362
}
5463

55-
public function offsetSet($offset, $value) : void
64+
public function offsetSet(mixed $offset, mixed $value) : void
5665
{
5766
if ($offset === null) {
5867
$this->bytes[] = $value;
@@ -61,7 +70,7 @@ public function offsetSet($offset, $value) : void
6170
}
6271
}
6372

64-
public function offsetUnset($offset) : void
73+
public function offsetUnset(mixed $offset) : void
6574
{
6675
unset($this->bytes[$offset]);
6776
}

0 commit comments

Comments
 (0)