Skip to content

Commit af99327

Browse files
authored
1789 proposal add offset method to Data Frame (#1790)
* DataFrame::offset() method: added DataFrame::offset(), implemented by OffsetPipeline * Added --input-file-offset option to CLI commands
1 parent ca1e62e commit af99327

15 files changed

Lines changed: 1274 additions & 5 deletions

File tree

src/cli/src/Flow/CLI/Command/FileAnalyzeCommand.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ public function configure() : void
5151
->addOption('input-file-format', null, InputArgument::OPTIONAL, 'File format. When not set file format is guessed from source file path extension', null)
5252
->addOption('input-file-batch-size', null, InputOption::VALUE_REQUIRED, 'Number of rows that are going to be read and displayed in one batch, when set to -1 whole dataset will be displayed at once', self::DEFAULT_BATCH_SIZE)
5353
->addOption('input-file-limit', null, InputOption::VALUE_REQUIRED, 'Limit number of rows that are going to be used to infer file schema, when not set whole file is analyzed', null)
54+
->addOption('input-file-offset', null, InputOption::VALUE_REQUIRED, 'Number of rows to skip before starting to read data', null)
5455
->addOption('schema-auto-cast', null, InputOption::VALUE_OPTIONAL, 'When set Flow will try to automatically cast values to more precise data types, for example datetime strings will be casted to datetime type', false);
5556

5657
$this->addConfigOptions($this);
@@ -91,6 +92,12 @@ protected function execute(InputInterface $input, OutputInterface $output) : int
9192
$df->limit($limit);
9293
}
9394

95+
$offset = option_int_nullable('input-file-offset', $input);
96+
97+
if ($offset !== null && $offset > 0) {
98+
$df->offset($offset);
99+
}
100+
94101
$progress = $style->createProgressBar();
95102
$progress->setFormat('Analyzed Rows: %current% %bar%');
96103

src/cli/src/Flow/CLI/Command/FileConvertCommand.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ public function configure() : void
4848
->addOption('input-file-format', null, InputArgument::OPTIONAL, 'File format. When not set file format is guessed from input file path extension', null)
4949
->addOption('input-file-batch-size', null, InputOption::VALUE_REQUIRED, 'Number of rows that are going to be read and displayed in one batch, when set to -1 whole dataset will be displayed at once', self::DEFAULT_BATCH_SIZE)
5050
->addOption('input-file-limit', null, InputOption::VALUE_REQUIRED, 'Limit number of rows that are going to be used to infer file schema, when not set whole file is analyzed', null)
51+
->addOption('input-file-offset', null, InputOption::VALUE_REQUIRED, 'Number of rows to skip before starting to read data', null)
5152
->addOption('output-file-format', null, InputArgument::OPTIONAL, 'File format. When not set file format is guessed from output file path extension', null)
5253
->addOption('output-overwrite', null, InputOption::VALUE_OPTIONAL, 'When set output file will be overwritten if exists')
5354
->addOption('schema-auto-cast', null, InputOption::VALUE_OPTIONAL, 'When set Flow will try to automatically cast values to more precise data types, for example datetime strings will be casted to datetime type', false)
@@ -90,6 +91,12 @@ protected function execute(InputInterface $input, OutputInterface $output) : int
9091
$df->limit($limit);
9192
}
9293

94+
$offset = option_int_nullable('input-file-offset', $input);
95+
96+
if ($offset !== null && $offset > 0) {
97+
$df->offset($offset);
98+
}
99+
93100
$overwrite = option_bool('output-overwrite', $input);
94101

95102
if ($overwrite) {

src/cli/src/Flow/CLI/Command/FileReadCommand.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ public function configure() : void
4444
->addOption('input-file-format', null, InputArgument::OPTIONAL, 'File format. When not set file format is guessed from source file path extension', null)
4545
->addOption('input-file-batch-size', null, InputOption::VALUE_REQUIRED, 'Number of rows that are going to be read and displayed in one batch, when set to -1 whole dataset will be displayed at once', self::DEFAULT_BATCH_SIZE)
4646
->addOption('input-file-limit', null, InputOption::VALUE_REQUIRED, 'Limit number of rows that are going to be used to infer file schema, when not set whole file is analyzed', null)
47+
->addOption('input-file-offset', null, InputOption::VALUE_REQUIRED, 'Number of rows to skip before starting to read data', null)
4748
->addOption('output-truncate', null, InputOption::VALUE_REQUIRED, 'Truncate output to given number of characters, when set to -1 output is not truncated at all', 20)
4849
->addOption('output-columns', null, InputOption::VALUE_REQUIRED | InputOption::VALUE_IS_ARRAY, 'Columns to include in output, when not set all columns are displayed', [])
4950
->addOption('schema-auto-cast', null, InputOption::VALUE_OPTIONAL, 'When set Flow will try to automatically cast values to more precise data types, for example datetime strings will be casted to datetime type', false);
@@ -83,6 +84,12 @@ protected function execute(InputInterface $input, OutputInterface $output) : int
8384
$df->limit($limit);
8485
}
8586

87+
$offset = option_int_nullable('input-file-offset', $input);
88+
89+
if ($offset !== null && $offset > 0) {
90+
$df->offset($offset);
91+
}
92+
8693
$outputColumns = option_list_of_strings('output-columns', $input);
8794

8895
if (\count($outputColumns)) {

src/cli/src/Flow/CLI/Command/FileRowsCountCommand.php

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ public function configure() : void
3939
->setDescription('Read data schema from a file.')
4040
->addArgument('input-file', InputArgument::REQUIRED, 'Path to a file from which schema should be extracted.')
4141
->addOption('input-file-format', null, InputArgument::OPTIONAL, 'Source file format. When not set file format is guessed from source file path extension', null)
42-
->addOption('input-file-limit', null, InputOption::VALUE_REQUIRED, 'Limit number of rows that are going to be used to infer file schema, when not set whole file is analyzed', null);
42+
->addOption('input-file-limit', null, InputOption::VALUE_REQUIRED, 'Limit number of rows that are going to be used to infer file schema, when not set whole file is analyzed', null)
43+
->addOption('input-file-offset', null, InputOption::VALUE_REQUIRED, 'Number of rows to skip before starting to read data', null);
4344

4445
$this->addConfigOptions($this);
4546
$this->addJSONInputOptions($this);
@@ -61,6 +62,12 @@ protected function execute(InputInterface $input, OutputInterface $output) : int
6162
$df->limit($limit);
6263
}
6364

65+
$offset = option_int_nullable('input-file-offset', $input);
66+
67+
if ($offset !== null && $offset > 0) {
68+
$df->offset($offset);
69+
}
70+
6471
$style->write((string) $df->count());
6572

6673
return Command::SUCCESS;

src/cli/src/Flow/CLI/Command/FileSchemaCommand.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ public function configure() : void
4242
->addArgument('input-file', InputArgument::REQUIRED, 'Path to a file from which schema should be extracted.')
4343
->addOption('input-file-format', null, InputArgument::OPTIONAL, 'Source file format. When not set file format is guessed from source file path extension', null)
4444
->addOption('input-file-limit', null, InputOption::VALUE_REQUIRED, 'Limit number of rows that are going to be used to infer file schema, when not set whole file is analyzed', null)
45+
->addOption('input-file-offset', null, InputOption::VALUE_REQUIRED, 'Number of rows to skip before starting to read data', null)
4546
->addOption('output-pretty', null, InputOption::VALUE_NONE, 'Print schema as pretty json')
4647
->addOption('output-php', null, InputOption::VALUE_NONE, 'Print schema as PHP code')
4748
->addOption('output-table', null, InputOption::VALUE_NONE, 'Print schema as ascii table')
@@ -72,6 +73,12 @@ protected function execute(InputInterface $input, OutputInterface $output) : int
7273
$df->limit($limit);
7374
}
7475

76+
$offset = option_int_nullable('input-file-offset', $input);
77+
78+
if ($offset !== null && $offset > 0) {
79+
$df->offset($offset);
80+
}
81+
7582
$schema = $df->schema();
7683

7784
if (option_bool('output-ascii', $input)) {

src/cli/tests/Flow/CLI/Tests/Integration/FileAnalyzeCommandTest.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,9 @@ public function test_read_rows_csv() : void
6363
$tester->getDisplay()
6464
);
6565

66-
self::assertStringContainsString("Analyzed Rows", $tester->getDisplay());
66+
self::assertStringContainsString('Analyzed Rows', $tester->getDisplay());
6767

68-
self::assertStringContainsString("Execution Time", $tester->getDisplay());
68+
self::assertStringContainsString('Execution Time', $tester->getDisplay());
6969
}
7070

7171
public function test_read_rows_csv_without_schema() : void
@@ -108,8 +108,8 @@ public function test_read_rows_csv_without_schema() : void
108108
$tester->getDisplay()
109109
);
110110

111-
self::assertStringContainsString("Analyzed Rows", $tester->getDisplay());
111+
self::assertStringContainsString('Analyzed Rows', $tester->getDisplay());
112112

113-
self::assertStringContainsString("Execution Time", $tester->getDisplay());
113+
self::assertStringContainsString('Execution Time', $tester->getDisplay());
114114
}
115115
}

src/cli/tests/Flow/CLI/Tests/Integration/FileConvertCommandTest.php

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,72 @@ public function test_convert(string $inputFormat, string $outputFormat, array $o
6262
self::assertFileExists($output);
6363
unlink($output);
6464
}
65+
66+
public function test_convert_with_offset() : void
67+
{
68+
$output = __DIR__ . '/var/' . bin2hex(random_bytes(16)) . '.json';
69+
70+
if (\file_exists($output)) {
71+
\unlink($output);
72+
}
73+
74+
$tester = new CommandTester(new FileConvertCommand('convert'));
75+
76+
$tester->execute([
77+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
78+
'output-file' => $output,
79+
'--input-file-limit' => 3,
80+
'--input-file-offset' => 2,
81+
'--output-overwrite' => true,
82+
]);
83+
84+
$tester->assertCommandIsSuccessful();
85+
86+
self::assertFileExists($output);
87+
88+
// Read the converted file to verify offset was applied
89+
$content = file_get_contents($output);
90+
self::assertNotFalse($content);
91+
92+
// Should contain the third row (after offset of 2) but not the first two rows
93+
self::assertStringContainsString('6315f9e2-86bf-3321-a', $content); // Third row
94+
self::assertStringNotContainsString('e13d7098-5a78-3389-9', $content); // First row should not be there
95+
self::assertStringNotContainsString('947df050-3abb-3f5a-9', $content); // Second row should not be there
96+
97+
unlink($output);
98+
}
99+
100+
public function test_convert_with_offset_and_limit() : void
101+
{
102+
$output = __DIR__ . '/var/' . bin2hex(random_bytes(16)) . '.json';
103+
104+
if (\file_exists($output)) {
105+
\unlink($output);
106+
}
107+
108+
$tester = new CommandTester(new FileConvertCommand('convert'));
109+
110+
$tester->execute([
111+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
112+
'output-file' => $output,
113+
'--input-file-limit' => 3,
114+
'--input-file-offset' => 1,
115+
'--output-overwrite' => true,
116+
]);
117+
118+
$tester->assertCommandIsSuccessful();
119+
120+
self::assertFileExists($output);
121+
122+
// Read the converted file to verify offset + limit was applied
123+
$content = file_get_contents($output);
124+
self::assertNotFalse($content);
125+
126+
// Should contain the second and third rows: limit 3 is applied first, then offset 1 skips the first row, leaving 2 rows
127+
self::assertStringContainsString('947df050-3abb-3f5a-9', $content); // Second row
128+
self::assertStringContainsString('6315f9e2-86bf-3321-a', $content); // Third row
129+
self::assertStringNotContainsString('e13d7098-5a78-3389-9', $content); // First row should not be there
130+
131+
unlink($output);
132+
}
65133
}

src/cli/tests/Flow/CLI/Tests/Integration/FileReadCommandTest.php

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,93 @@ public function test_read_rows_text() : void
137137
);
138138
}
139139

140+
public function test_read_rows_with_large_offset() : void
141+
{
142+
$tester = new CommandTester(new FileReadCommand('read'));
143+
144+
$tester->execute([
145+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
146+
'--input-file-offset' => 1000,
147+
]);
148+
149+
$tester->assertCommandIsSuccessful();
150+
151+
$output = $tester->getDisplay();
152+
153+
// Output should be empty because the offset is larger than the number of rows in the file
154+
self::assertEmpty(trim($output));
155+
}
156+
157+
public function test_read_rows_with_offset_and_columns() : void
158+
{
159+
$tester = new CommandTester(new FileReadCommand('read'));
160+
161+
$tester->execute([
162+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
163+
'--input-file-limit' => 4,
164+
'--input-file-offset' => 1,
165+
'--output-columns' => ['order_id', 'discount'],
166+
]);
167+
168+
$tester->assertCommandIsSuccessful();
169+
170+
$output = $tester->getDisplay();
171+
172+
// Should skip first row and show next 3 rows with only order_id and discount columns
173+
self::assertStringNotContainsString('e13d7098-5a78-3389-9', $output); // First row should not be displayed
174+
self::assertStringContainsString('947df050-3abb-3f5a-9', $output); // Second row should be first displayed
175+
self::assertStringContainsString('6315f9e2-86bf-3321-a', $output); // Third row should be displayed
176+
self::assertStringContainsString('4cccb632-fade-34e2-8', $output); // Fourth row should be displayed
177+
self::assertStringContainsString('3 rows', $output); // Should show 3 rows
178+
179+
// Should only show selected columns
180+
self::assertStringContainsString('order_id', $output);
181+
self::assertStringContainsString('discount', $output);
182+
self::assertStringNotContainsString('created_at', $output);
183+
self::assertStringNotContainsString('address', $output);
184+
}
185+
186+
public function test_read_rows_with_offset_csv() : void
187+
{
188+
$tester = new CommandTester(new FileReadCommand('read'));
189+
190+
$tester->execute([
191+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
192+
'--input-file-limit' => 3,
193+
'--input-file-offset' => 2,
194+
]);
195+
196+
$tester->assertCommandIsSuccessful();
197+
198+
$output = $tester->getDisplay();
199+
200+
// Should display rows starting from offset 2 (third row)
201+
self::assertStringContainsString('6315f9e2-86bf-3321-a', $output); // Third row should be first displayed
202+
self::assertStringNotContainsString('e13d7098-5a78-3389-9', $output); // First row should not be displayed
203+
self::assertStringNotContainsString('947df050-3abb-3f5a-9', $output); // Second row should not be displayed
204+
self::assertStringContainsString('1 rows', $output); // Only 1 row should be displayed (limit 3 - offset 2)
205+
}
206+
207+
public function test_read_rows_with_offset_zero() : void
208+
{
209+
$tester = new CommandTester(new FileReadCommand('read'));
210+
211+
$tester->execute([
212+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
213+
'--input-file-limit' => 2,
214+
'--input-file-offset' => 0,
215+
]);
216+
217+
$tester->assertCommandIsSuccessful();
218+
219+
$output = $tester->getDisplay();
220+
221+
// Should behave the same as no offset - show first 2 rows
222+
self::assertStringContainsString('e13d7098-5a78-3389-9', $output); // First row should be displayed
223+
self::assertStringContainsString('947df050-3abb-3f5a-9', $output); // Second row should be displayed
224+
self::assertStringContainsString('2 rows', $output);
225+
}
226+
140227
public function test_read_rows_with_output_columns_empty_maintains_all_columns() : void
141228
{
142229
$tester = new CommandTester(new FileReadCommand('read'));

src/cli/tests/Flow/CLI/Tests/Integration/FileRowsCountCommandTest.php

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,67 @@ public function test_count_rows_text() : void
6565
self::assertSame('44', $tester->getDisplay());
6666
}
6767

68+
public function test_count_rows_with_large_offset() : void
69+
{
70+
$tester = new CommandTester(new FileRowsCountCommand('count'));
71+
72+
$tester->execute([
73+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
74+
'--input-file-offset' => 1000,
75+
]);
76+
77+
$tester->assertCommandIsSuccessful();
78+
79+
// Offset larger than file should result in 0 rows
80+
self::assertSame('0', $tester->getDisplay());
81+
}
82+
83+
public function test_count_rows_with_offset_and_limit() : void
84+
{
85+
$tester = new CommandTester(new FileRowsCountCommand('count'));
86+
87+
$tester->execute([
88+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
89+
'--input-file-offset' => 2,
90+
'--input-file-limit' => 5,
91+
]);
92+
93+
$tester->assertCommandIsSuccessful();
94+
95+
// CSV has 43 total rows; limit 5 is applied first, then offset 2 skips two of those rows, so 3 rows should be counted
96+
self::assertSame('3', $tester->getDisplay());
97+
}
98+
99+
public function test_count_rows_with_offset_csv() : void
100+
{
101+
$tester = new CommandTester(new FileRowsCountCommand('count'));
102+
103+
$tester->execute([
104+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
105+
'--input-file-offset' => 5,
106+
]);
107+
108+
$tester->assertCommandIsSuccessful();
109+
110+
// CSV has 43 total rows; with an offset of 5 the command should count the remaining 38 rows
111+
self::assertSame('38', $tester->getDisplay());
112+
}
113+
114+
public function test_count_rows_with_zero_offset() : void
115+
{
116+
$tester = new CommandTester(new FileRowsCountCommand('count'));
117+
118+
$tester->execute([
119+
'input-file' => __DIR__ . '/Fixtures/orders.csv',
120+
'--input-file-offset' => 0,
121+
]);
122+
123+
$tester->assertCommandIsSuccessful();
124+
125+
// Zero offset should behave the same as no offset
126+
self::assertSame('43', $tester->getDisplay());
127+
}
128+
68129
public function test_count_rows_xml() : void
69130
{
70131
$tester = new CommandTester(new FileRowsCountCommand('count'));

0 commit comments

Comments
 (0)