Skip to content

Commit 92e7c14

Browse files
committed
Add HTMLQuerySelectorAll & HTMLQuerySelector scalar functions
1 parent 73ccd04 commit 92e7c14

22 files changed

Lines changed: 522 additions & 381 deletions

File tree

phpstan.neon

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,15 @@ parameters:
6060
excludePaths:
6161
- src/cli/src/Flow/CLI/Command/*
6262
- src/core/etl/src/Flow/ETL/Formatter/ASCII/ASCIITable.php
63+
- src/core/etl/src/Flow/ETL/Function/HTMLQuerySelectorAll.php
64+
- src/core/etl/src/Flow/ETL/Function/HTMLQuerySelector.php
65+
- src/core/etl/src/Flow/ETL/Row/Entry/HTMLEntry.php
6366
- src/core/etl/src/Flow/ETL/Sort/ExternalSort/RowsMinHeap.php
67+
- src/core/etl/tests/Flow/ETL/Tests/Integration/Function/HTMLQuerySelectorAllTest.php
68+
- src/core/etl/tests/Flow/ETL/Tests/Integration/Function/HTMLQuerySelectorTest.php
69+
- src/core/etl/tests/Flow/ETL/Tests/Unit/Function/HTMLQuerySelectorAllTest.php
70+
- src/core/etl/tests/Flow/ETL/Tests/Unit/Function/HTMLQuerySelectorTest.php
71+
- src/core/etl/tests/Flow/ETL/Tests/Unit/Row/Entry/HTMLEntryTest.php
6472
- src/adapter/etl-adapter-avro/*
6573
- src/adapter/etl-adapter-elasticsearch/src/Flow/ETL/Adapter/Elasticsearch/ElasticsearchPHP/SearchResults.php
6674
- src/adapter/etl-adapter-elasticsearch/src/Flow/ETL/Adapter/Elasticsearch/ElasticsearchPHP/SearchParams.php
@@ -70,6 +78,7 @@ parameters:
7078
- src/lib/parquet/src/Flow/Parquet/ThriftModel/*
7179
- src/lib/parquet/src/Flow/Parquet/BinaryReader/*
7280
- src/lib/parquet/src/Flow/Parquet/Dremel/ColumnData/DefinitionConverter.php
81+
- src/lib/types/src/Flow/Types/Type/Logical/HTMLType.php
7382

7483
tmpDir: var/phpstan/cache
7584

src/core/etl/src/Flow/ETL/DSL/functions.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
type_xml_element as type_xml_element_new,
3838
types as types_new
3939
};
40+
use Dom\HTMLDocument;
4041
use Flow\Calculator\Rounding;
4142
use Flow\ETL\{Analyze,
4243
Attribute\DocumentationDSL,
@@ -205,7 +206,6 @@
205206
UnionType
206207
};
207208
use Flow\Types\Type\Types;
208-
use Flow\Types\Value\HTMLDocument;
209209
use UnitEnum;
210210

211211
/**
@@ -632,7 +632,7 @@ function xml_element_entry(string $name, \DOMElement|string|null $value, ?Metada
632632
* @return Entry<?HTMLDocument>
633633
*/
634634
#[DocumentationDSL(module: Module::CORE, type: DSLType::ENTRY)]
635-
function html_entry(string $name, HTMLDocument|string|null $value, ?Metadata $metadata = null) : Entry
635+
function html_entry(string $name, HTMLDocument|string|null $value, ?Metadata $metadata = null) : Entry // @phpstan-ignore class.notFound,class.notFound
636636
{
637637
return new HTMLEntry($name, $value, $metadata);
638638
}
@@ -1973,7 +1973,7 @@ function json_schema(string $name, bool $nullable = false, ?Metadata $metadata =
19731973
* @return Definition<HTMLDocument>
19741974
*/
19751975
#[DocumentationDSL(module: Module::CORE, type: DSLType::SCHEMA)]
1976-
function html_schema(string $name, bool $nullable = false, ?Metadata $metadata = null) : Definition
1976+
function html_schema(string $name, bool $nullable = false, ?Metadata $metadata = null) : Definition // @phpstan-ignore class.notFound
19771977
{
19781978
return Definition::html($name, $nullable, $metadata);
19791979
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Function;
6+
7+
use Dom\{Element, HTMLDocument};
8+
use Flow\ETL\Row;
9+
10+
/**
 * Scalar function that looks up the first element matching a CSS selector
 * inside an HTML document taken from the evaluated row.
 *
 * Built on the \Dom\HTMLDocument API, which is only available in PHP 8.4+.
 */
final class HTMLQuerySelector extends ScalarFunctionChain
{
    /**
     * @param mixed $value value (or reference/function) expected to resolve to a \Dom\HTMLDocument
     * @param ScalarFunction|string $selector CSS selector, or a function resolving to one
     */
    public function __construct(
        private readonly mixed $value,
        private readonly ScalarFunction|string $selector,
    ) {
    }

    /**
     * Evaluates the selector against the document resolved from the given row.
     *
     * @throws \RuntimeException when executed on PHP older than 8.4
     *
     * @return null|Element the first matching element; null when the value is not an HTML document,
     *                      the selector does not resolve to a string, or nothing matches
     */
    public function eval(Row $row) : ?Element
    {
        // The version guard must run before parameter resolution: \Dom\HTMLDocument does not exist
        // before PHP 8.4, so on older runtimes the value presumably never resolves to an instance and
        // the null early-return below would otherwise make this exception unreachable.
        if (\PHP_VERSION_ID < 80400) {
            throw new \RuntimeException('This function requires \Dom\Element extension available in PHP 8.4+.');
        }

        $document = (new Parameter($this->value))->asInstanceOf($row, HTMLDocument::class);
        $selector = (new Parameter($this->selector))->asString($row);

        if (null === $document || null === $selector) {
            return null;
        }

        return $document->querySelector($selector);
    }
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Function;
6+
7+
use DOM\{Element, HTMLDocument};
8+
use Flow\ETL\Row;
9+
10+
/**
 * Scalar function that collects every element matching a CSS selector
 * inside an HTML document taken from the evaluated row.
 *
 * Built on the \Dom\HTMLDocument API, which is only available in PHP 8.4+.
 */
final class HTMLQuerySelectorAll extends ScalarFunctionChain
{
    /**
     * @param mixed $value value (or reference/function) expected to resolve to a \Dom\HTMLDocument
     * @param ScalarFunction|string $selector CSS selector, or a function resolving to one
     */
    public function __construct(
        private readonly mixed $value,
        private readonly ScalarFunction|string $selector,
    ) {
    }

    /**
     * Evaluates the selector against the document resolved from the given row.
     *
     * @throws \RuntimeException when executed on PHP older than 8.4
     *
     * @return null|array<Element> matching elements; null when the value is not an HTML document,
     *                             the selector does not resolve to a string, or nothing matches
     */
    public function eval(Row $row) : ?array
    {
        // The version guard must run before parameter resolution: \Dom\HTMLDocument does not exist
        // before PHP 8.4, so on older runtimes the value presumably never resolves to an instance and
        // the null early-return below would otherwise make this exception unreachable.
        if (\PHP_VERSION_ID < 80400) {
            throw new \RuntimeException('This function requires \Dom\HTMLDocument extension available in PHP 8.4+.');
        }

        $document = (new Parameter($this->value))->asInstanceOf($row, HTMLDocument::class);
        $selector = (new Parameter($this->selector))->asString($row);

        if (null === $document || null === $selector) {
            return null;
        }

        $matches = $document->querySelectorAll($selector);

        // An empty match set is reported as null rather than as an empty array.
        if (0 === $matches->count()) {
            return null;
        }

        $elements = [];

        foreach ($matches as $node) {
            // Keep only element nodes so the result honours the documented array<Element> shape.
            if ($node instanceof Element) {
                $elements[] = $node;
            }
        }

        return $elements;
    }
}

src/core/etl/src/Flow/ETL/Function/ScalarFunctionChain.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,16 @@ public function hash(Algorithm $algorithm = new NativePHPHash()) : Hash
311311
return new Hash($this, $algorithm);
312312
}
313313

314+
/**
 * Chains an HTMLQuerySelector function onto this one: this function is passed as the
 * value to query and $path as the selector.
 *
 * @param ScalarFunction|string $path selector, or a function resolving to one
 */
public function htmlQuerySelector(ScalarFunction|string $path) : HTMLQuerySelector
315+
{
316+
return new HTMLQuerySelector($this, $path);
317+
}
318+
319+
/**
 * Chains an HTMLQuerySelectorAll function onto this one: this function is passed as the
 * value to query and $path as the selector.
 *
 * @param ScalarFunction|string $path selector, or a function resolving to one
 */
public function htmlQuerySelectorAll(ScalarFunction|string $path) : HTMLQuerySelectorAll
320+
{
321+
return new HTMLQuerySelectorAll($this, $path);
322+
}
323+
314324
/**
315325
* Returns the index of given $needle in string.
316326
*/

src/core/etl/src/Flow/ETL/Row/Entry/HTMLEntry.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
namespace Flow\ETL\Row\Entry;
66

77
use function Flow\Types\DSL\{type_equals, type_html, type_optional};
8+
use Dom\HTMLDocument;
89
use Flow\ETL\Row\{Entry, Reference};
910
use Flow\ETL\Schema\{Definition, Metadata};
1011
use Flow\Types\Type;
11-
use Flow\Types\Value\HTMLDocument;
1212

1313
/**
1414
* @implements Entry<?HTMLDocument>
@@ -32,7 +32,7 @@ public function __construct(
3232
?Metadata $metadata = null,
3333
) {
3434
if (\is_string($value)) {
35-
$this->value = HTMLDocument::fromString($value);
35+
$this->value = HTMLDocument::createFromString($value);
3636
} else {
3737
$this->value = $value;
3838
}
@@ -75,7 +75,7 @@ public function isEqual(Entry $entry) : bool
7575
return false;
7676
}
7777

78-
return $entry->value()?->toString() === $this->value?->toString();
78+
return $entry->value()?->saveHtml() === $this->value?->saveHtml();
7979
}
8080

8181
public function map(callable $mapper) : self
@@ -99,7 +99,7 @@ public function toString() : string
9999
return '';
100100
}
101101

102-
return $this->value->toString();
102+
return $this->value->saveHtml();
103103
}
104104

105105
public function type() : Type

src/core/etl/src/Flow/ETL/Schema/Definition.php

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,13 @@
2828
type_xml,
2929
type_xml_element,
3030
types};
31+
use Dom\HTMLDocument;
3132
use Flow\ETL\Exception\{InvalidArgumentException, RuntimeException};
3233
use Flow\ETL\Row\{Entry, EntryReference, Reference};
3334
use Flow\Types\Type;
3435
use Flow\Types\Type\Logical\{ListType, MapType, OptionalType, StructureType};
3536
use Flow\Types\Type\{Native\FloatType, Native\IntegerType, Native\UnionType, TypeFactory};
36-
use Flow\Types\Value\{HTMLDocument, Uuid};
37+
use Flow\Types\Value\Uuid;
3738

3839
/**
3940
* @template-covariant T
@@ -142,7 +143,7 @@ public static function fromArray(array $definition) : self
142143
/**
143144
* @return Definition<HTMLDocument>
144145
*/
145-
public static function html(string|Reference $entry, bool $nullable = false, ?Metadata $metadata = null) : self
146+
public static function html(string|Reference $entry, bool $nullable = false, ?Metadata $metadata = null) : self // @phpstan-ignore class.notFound
146147
{
147148
return new self($entry, type_html(), $nullable, $metadata);
148149
}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Tests\Integration\Function;
6+
7+
use function Flow\ETL\DSL\{df, from_rows, html_entry, ref, row, rows};
8+
use Dom\{Element, HTMLDocument};
9+
use PHPUnit\Framework\Attributes\RequiresPhp;
10+
use PHPUnit\Framework\TestCase;
11+
12+
/**
 * Exercises htmlQuerySelectorAll() through a complete data frame pipeline.
 */
final class HTMLQuerySelectorAllTest extends TestCase
{
    private HTMLDocument $html;

    // NOTE(review): \Dom\HTMLDocument only exists in PHP 8.4+, so on older runtimes this fixture
    // presumably fails before test_fails_with_older_php() reaches its assertion — confirm.
    protected function setUp() : void
    {
        $this->html = HTMLDocument::createFromString('<!DOCTYPE html><html lang="en"><head></head><body><div><span>foobar</span></div></body></html>');
    }

    // Strictly below 8.4: the function only throws when PHP_VERSION_ID < 80400, so this
    // test must never be selected on an 8.4 runtime.
    #[RequiresPhp('< 8.4')]
    public function test_fails_with_older_php() : void
    {
        $this->expectException(\RuntimeException::class);

        df()
            ->read(from_rows(rows(row(html_entry('html_raw', $this->html)))))
            ->withEntry('html', ref('html_raw')->htmlQuerySelectorAll('body div span'))
            ->fetch();
    }

    #[RequiresPhp('>= 8.4')]
    public function test_invalid_query_all_on_html_document() : void
    {
        $rows = df()
            ->read(from_rows(rows(row(html_entry('html_raw', $this->html)))))
            ->withEntry('html', ref('html_raw')->htmlQuerySelectorAll('body div p'))
            ->drop('html_raw')
            ->fetch();

        // No <p> exists in the fixture; a null entry value is normalised to an empty array here.
        $results = $rows->toArray()[0]['html'] ?? [];

        /* @phpstan-ignore-next-line */
        self::assertCount(0, $results);
    }

    #[RequiresPhp('>= 8.4')]
    public function test_valid_query_all_on_html_document() : void
    {
        $rows = df()
            ->read(from_rows(rows(row(html_entry('html_raw', $this->html)))))
            ->withEntry('html', ref('html_raw')->htmlQuerySelectorAll('body div span'))
            ->drop('html_raw')
            ->fetch();

        $results = $rows->toArray()[0]['html'] ?? [];

        /* @phpstan-ignore-next-line */
        self::assertCount(1, $results);
        /* @phpstan-ignore-next-line */
        self::assertContainsOnlyInstancesOf(Element::class, $results);
    }
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Tests\Integration\Function;
6+
7+
use function Flow\ETL\DSL\{df, from_rows, html_entry, ref, row, rows};
8+
use Dom\HTMLDocument;
9+
use PHPUnit\Framework\Attributes\RequiresPhp;
10+
use PHPUnit\Framework\TestCase;
11+
12+
/**
 * Exercises htmlQuerySelector() through a complete data frame pipeline.
 */
final class HTMLQuerySelectorTest extends TestCase
{
    private HTMLDocument $html;

    // NOTE(review): \Dom\HTMLDocument only exists in PHP 8.4+, so on older runtimes this fixture
    // presumably fails before test_fails_with_older_php() reaches its assertion — confirm.
    protected function setUp() : void
    {
        $this->html = HTMLDocument::createFromString('<!DOCTYPE html><html lang="en"><head></head><body><div><span>foobar</span></div></body></html>');
    }

    // Strictly below 8.4: the function only throws when PHP_VERSION_ID < 80400, so this
    // test must never be selected on an 8.4 runtime.
    #[RequiresPhp('< 8.4')]
    public function test_fails_with_older_php() : void
    {
        $this->expectException(\RuntimeException::class);

        df()
            ->read(from_rows(rows(row(html_entry('html_raw', $this->html)))))
            ->withEntry('html', ref('html_raw')->htmlQuerySelector('body div span'))
            ->fetch();
    }

    #[RequiresPhp('>= 8.4')]
    public function test_invalid_query_on_html_document() : void
    {
        $rows = df()
            ->read(from_rows(rows(row(html_entry('html_raw', $this->html)))))
            ->withEntry('html', ref('html_raw')->htmlQuerySelector('body div p'))
            ->drop('html_raw')
            ->fetch();

        // No <p> exists in the fixture; a null entry value is normalised to an empty array here.
        $results = $rows->toArray()[0]['html'] ?? [];

        /* @phpstan-ignore-next-line */
        self::assertCount(0, $results);
    }
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Tests\Unit\Function;
6+
7+
use function Flow\ETL\DSL\{config, flow_context, ref, row};
8+
use Dom\{Element, HTMLDocument};
9+
use PHPUnit\Framework\Attributes\RequiresPhp;
10+
use PHPUnit\Framework\TestCase;
11+
12+
/**
 * Unit tests calling HTMLQuerySelectorAll::eval() directly on a single row.
 */
final class HTMLQuerySelectorAllTest extends TestCase
{
    private HTMLDocument $html;

    // NOTE(review): \Dom\HTMLDocument only exists in PHP 8.4+, so on older runtimes this fixture
    // presumably fails before test_getting_null_for_older_versions() can run — confirm.
    protected function setUp() : void
    {
        $this->html = HTMLDocument::createFromString('<!DOCTYPE html><html><head></head><body><div><span>foobar</span></div></body></html>');
    }

    // Needs the 8.4-only DOM API both for the fixture and for the function under test.
    #[RequiresPhp('>= 8.4')]
    public function test_getting_elements_for_given_path() : void
    {
        /** @var array<mixed> $result */
        $result = ref('value')->htmlQuerySelectorAll('body div span')->eval(row(flow_context(config())->entryFactory()->create('value', $this->html)));

        self::assertCount(1, $result);

        /* @phpstan-ignore-next-line */
        self::assertInstanceOf(Element::class, $result[0]);
    }

    // Despite its name, this test expects an exception (not null) on PHP older than 8.4.
    #[RequiresPhp('< 8.4')]
    public function test_getting_null_for_older_versions() : void
    {
        $this->expectException(\RuntimeException::class);

        ref('value')->htmlQuerySelectorAll('body div p')->eval(row(flow_context(config())->entryFactory()->create('value', $this->html)));
    }

    #[RequiresPhp('>= 8.4')]
    public function test_getting_null_when_nothing_found() : void
    {
        $result = ref('value')->htmlQuerySelectorAll('body div p')->eval(row(flow_context(config())->entryFactory()->create('value', $this->html)));

        self::assertNull($result);
    }

    // The fixture built in setUp() requires PHP 8.4+, so skip cleanly on older runtimes.
    #[RequiresPhp('>= 8.4')]
    public function test_invalid_value() : void
    {
        // An empty string is not an HTML document, so the function is expected to yield null.
        $result = ref('value')->htmlQuerySelectorAll('body div span')->eval(row(flow_context(config())->entryFactory()->create('value', '')));

        self::assertNull($result);
    }
}

0 commit comments

Comments
 (0)