Skip to content

Commit 528121b

Browse files
authored
Add HTMLQuerySelectorAll & HTMLQuerySelector scalar functions (#1960)
* Add `HTMLQuerySelectorAll` & `HTMLQuerySelector` scalar functions * Add a `RequiredPHPVersionException` * Adjust phpstan.neon ignore rules for PHP < 8.4 * Enforce the PHP version requirement in the new HTML selector functions
1 parent 62d2934 commit 528121b

26 files changed

Lines changed: 566 additions & 407 deletions

File tree

phpstan.neon

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,23 @@ parameters:
8585
-
8686
function: 'dj()'
8787

88+
ignoreErrors:
89+
-
90+
identifier: class.notFound
91+
path: src/core/etl/src/Flow/ETL/Row/Entry/HTMLEntry.php
92+
93+
-
94+
identifier: class.notFound
95+
path: src/core/etl/src/Flow/ETL/Function/HTMLQuerySelector.php
96+
97+
-
98+
identifier: class.notFound
99+
path: src/core/etl/src/Flow/ETL/Function/HTMLQuerySelectorAll.php
100+
101+
-
102+
identifier: class.notFound
103+
path: src/lib/types/src/Flow/Types/Type/Logical/HTMLType.php
104+
88105
includes:
89106
- tools/phpstan/vendor/spaze/phpstan-disallowed-calls/extension.neon
90107

src/core/etl/src/Flow/ETL/DSL/functions.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
type_xml_element as type_xml_element_new,
3838
types as types_new
3939
};
40+
use Dom\HTMLDocument;
4041
use Flow\Calculator\Rounding;
4142
use Flow\ETL\{Analyze,
4243
Attribute\DocumentationDSL,
@@ -205,7 +206,6 @@
205206
UnionType
206207
};
207208
use Flow\Types\Type\Types;
208-
use Flow\Types\Value\HTMLDocument;
209209
use UnitEnum;
210210

211211
/**
@@ -632,7 +632,7 @@ function xml_element_entry(string $name, \DOMElement|string|null $value, ?Metada
632632
* @return Entry<?HTMLDocument>
633633
*/
634634
#[DocumentationDSL(module: Module::CORE, type: DSLType::ENTRY)]
635-
function html_entry(string $name, HTMLDocument|string|null $value, ?Metadata $metadata = null) : Entry
635+
function html_entry(string $name, HTMLDocument|string|null $value, ?Metadata $metadata = null) : Entry // @phpstan-ignore class.notFound,class.notFound
636636
{
637637
return new HTMLEntry($name, $value, $metadata);
638638
}
@@ -1973,7 +1973,7 @@ function json_schema(string $name, bool $nullable = false, ?Metadata $metadata =
19731973
* @return Definition<HTMLDocument>
19741974
*/
19751975
#[DocumentationDSL(module: Module::CORE, type: DSLType::SCHEMA)]
1976-
function html_schema(string $name, bool $nullable = false, ?Metadata $metadata = null) : Definition
1976+
function html_schema(string $name, bool $nullable = false, ?Metadata $metadata = null) : Definition // @phpstan-ignore class.notFound
19771977
{
19781978
return Definition::html($name, $nullable, $metadata);
19791979
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Exception;
6+
7+
/**
 * Raised when a feature depends on a class that is only available in a newer PHP version.
 *
 * The message names the missing class and the minimum PHP version the caller has to upgrade to.
 */
final class RequiredPHPVersionException extends RuntimeException
{
    /**
     * @param string $className name of the class that requires the newer PHP version, embedded verbatim in the message
     * @param string $version minimum PHP version, rendered in the message followed by "+"
     * @param null|\Throwable $previous optional underlying cause forwarded to the parent exception;
     *                                  accepts any \Throwable so \Error-based causes can be chained too
     */
    public function __construct(string $className, string $version, ?\Throwable $previous = null)
    {
        parent::__construct(
            "To use {$className} class, you need to upgrade your PHP version to: {$version}+.",
            previous: $previous,
        );
    }
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Function;
6+
7+
use Dom\{Element, HTMLDocument};
8+
use Flow\ETL\Exception\RequiredPHPVersionException;
9+
use Flow\ETL\Row;
10+
11+
/**
 * Scalar function that runs a selector against a \Dom\HTMLDocument value and yields
 * whatever \Dom\HTMLDocument::querySelector() returns for it.
 */
final class HTMLQuerySelector extends ScalarFunctionChain
{
    /**
     * @param mixed $value source that is resolved to a \Dom\HTMLDocument when the function is evaluated
     * @param ScalarFunction|string $selector selector, or a function resolved to a selector string per row
     *
     * @throws RequiredPHPVersionException when the \Dom\HTMLDocument class does not exist
     */
    public function __construct(
        private readonly mixed $value,
        private readonly ScalarFunction|string $selector,
    ) {
        $documentClassAvailable = \class_exists('\Dom\HTMLDocument');

        if (false === $documentClassAvailable) {
            throw new RequiredPHPVersionException('\Dom\HTMLDocument', '8.4');
        }
    }

    /**
     * Resolves the document and the selector for the given row and queries the document.
     *
     * @return null|Element null when the document or the selector resolves to null,
     *                      otherwise the result of querySelector()
     */
    public function eval(Row $row) : ?Element
    {
        // Both parameters are resolved up front, before any null check, as the selector may itself be a function.
        $document = (new Parameter($this->value))->asInstanceOf($row, HTMLDocument::class);
        $query = (new Parameter($this->selector))->asString($row);

        return (null !== $document && null !== $query)
            ? $document->querySelector($query)
            : null;
    }
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Function;
6+
7+
use Dom\{Element, HTMLDocument};
use Flow\ETL\Exception\RequiredPHPVersionException;
use Flow\ETL\Row;

/**
 * Scalar function that runs a selector against a \Dom\HTMLDocument value and collects
 * every element returned by \Dom\HTMLDocument::querySelectorAll().
 */
final class HTMLQuerySelectorAll extends ScalarFunctionChain
{
    /**
     * @param mixed $value source that is resolved to a \Dom\HTMLDocument when the function is evaluated
     * @param ScalarFunction|string $selector selector, or a function resolved to a selector string per row
     *
     * @throws RequiredPHPVersionException when the \Dom\HTMLDocument class does not exist
     */
    public function __construct(
        private readonly mixed $value,
        private readonly ScalarFunction|string $selector,
    ) {
        if (!\class_exists('\Dom\HTMLDocument')) {
            throw new RequiredPHPVersionException('\Dom\HTMLDocument', '8.4');
        }
    }

    /**
     * Resolves the document and the selector for the given row and queries the document.
     *
     * @return null|array<Element> null when the document or the selector resolves to null,
     *                             or when the selector matches nothing; otherwise the matched elements
     */
    public function eval(Row $row) : ?array
    {
        $value = (new Parameter($this->value))->asInstanceOf($row, HTMLDocument::class);
        $selector = (new Parameter($this->selector))->asString($row);

        if (null === $value || null === $selector) {
            return null;
        }

        $matches = $value->querySelectorAll($selector);

        // An empty match set is reported as null rather than as an empty array.
        if (0 === $matches->count()) {
            return null;
        }

        $elements = [];

        foreach ($matches as $match) {
            // Keep only Element instances so the result agrees with the declared array<Element>.
            if ($match instanceof Element) {
                $elements[] = $match;
            }
        }

        return $elements;
    }
}

src/core/etl/src/Flow/ETL/Function/ScalarFunctionChain.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,16 @@ public function hash(Algorithm $algorithm = new NativePHPHash()) : Hash
311311
return new Hash($this, $algorithm);
312312
}
313313

314+
/**
 * Chains an HTMLQuerySelector onto this function: this function supplies the value
 * and $path is passed on as the selector.
 *
 * @param ScalarFunction|string $path selector, or a function resolved to a selector string
 *
 * @throws \Flow\ETL\Exception\RequiredPHPVersionException when the \Dom\HTMLDocument class does not exist
 */
public function htmlQuerySelector(ScalarFunction|string $path) : HTMLQuerySelector
{
    return new HTMLQuerySelector(value: $this, selector: $path);
}
318+
319+
/**
 * Chains an HTMLQuerySelectorAll onto this function: this function supplies the value
 * and $path is passed on as the selector.
 *
 * @param ScalarFunction|string $path selector, or a function resolved to a selector string
 *
 * @throws \Flow\ETL\Exception\RequiredPHPVersionException when the \Dom\HTMLDocument class does not exist
 */
public function htmlQuerySelectorAll(ScalarFunction|string $path) : HTMLQuerySelectorAll
{
    return new HTMLQuerySelectorAll(value: $this, selector: $path);
}
323+
314324
/**
315325
* Returns the index of given $needle in string.
316326
*/

src/core/etl/src/Flow/ETL/Row/Entry/HTMLEntry.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
namespace Flow\ETL\Row\Entry;
66

77
use function Flow\Types\DSL\{type_equals, type_html, type_optional};
8+
use Dom\HTMLDocument;
89
use Flow\ETL\Row\{Entry, Reference};
910
use Flow\ETL\Schema\{Definition, Metadata};
1011
use Flow\Types\Type;
11-
use Flow\Types\Value\HTMLDocument;
1212

1313
/**
1414
* @implements Entry<?HTMLDocument>
@@ -32,7 +32,7 @@ public function __construct(
3232
?Metadata $metadata = null,
3333
) {
3434
if (\is_string($value)) {
35-
$this->value = HTMLDocument::fromString($value);
35+
$this->value = HTMLDocument::createFromString($value);
3636
} else {
3737
$this->value = $value;
3838
}
@@ -75,7 +75,7 @@ public function isEqual(Entry $entry) : bool
7575
return false;
7676
}
7777

78-
return $entry->value()?->toString() === $this->value?->toString();
78+
return $entry->value()?->saveHtml() === $this->value?->saveHtml();
7979
}
8080

8181
public function map(callable $mapper) : self
@@ -99,7 +99,7 @@ public function toString() : string
9999
return '';
100100
}
101101

102-
return $this->value->toString();
102+
return $this->value->saveHtml();
103103
}
104104

105105
public function type() : Type

src/core/etl/src/Flow/ETL/Schema/Definition.php

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,13 @@
2828
type_xml,
2929
type_xml_element,
3030
types};
31+
use Dom\HTMLDocument;
3132
use Flow\ETL\Exception\{InvalidArgumentException, RuntimeException};
3233
use Flow\ETL\Row\{Entry, EntryReference, Reference};
3334
use Flow\Types\Type;
3435
use Flow\Types\Type\Logical\{ListType, MapType, OptionalType, StructureType};
3536
use Flow\Types\Type\{Native\FloatType, Native\IntegerType, Native\UnionType, TypeFactory};
36-
use Flow\Types\Value\{HTMLDocument, Uuid};
37+
use Flow\Types\Value\Uuid;
3738

3839
/**
3940
* @template-covariant T
@@ -142,7 +143,7 @@ public static function fromArray(array $definition) : self
142143
/**
143144
* @return Definition<HTMLDocument>
144145
*/
145-
public static function html(string|Reference $entry, bool $nullable = false, ?Metadata $metadata = null) : self
146+
public static function html(string|Reference $entry, bool $nullable = false, ?Metadata $metadata = null) : self // @phpstan-ignore class.notFound
146147
{
147148
return new self($entry, type_html(), $nullable, $metadata);
148149
}

src/core/etl/tests/Flow/ETL/Tests/Integration/DataFrame/DisplayTest.php

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ enum_entry,
1111
float_entry,
1212
from_array,
1313
from_rows,
14-
html_entry,
1514
int_entry,
1615
json_entry,
1716
list_entry,
@@ -76,7 +75,6 @@ public function extract(FlowContext $context) : \Generator
7675
),
7776
enum_entry('enum', BackedStringEnum::three),
7877
xml_entry('xml', '<xml><node id="123">test<foo>bar</foo></node></xml>'),
79-
html_entry('html', '<!DOCTYPE html><html lang="en"><head></head><body></body></html>'),
8078
),
8179
);
8280
}
@@ -86,15 +84,15 @@ enum_entry('enum', BackedStringEnum::three),
8684

8785
self::assertCommandOutputIdentical(
8886
<<<'ASCIITABLE'
89-
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+----------------------+
90-
| id | price | 100 | deleted | created-at | phase | array | list | map | items | enum | xml | html |
91-
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+----------------------+
92-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <!DOCTYPE html><html |
93-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <!DOCTYPE html><html |
94-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <!DOCTYPE html><html |
95-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <!DOCTYPE html><html |
96-
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> | <!DOCTYPE html><html |
97-
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+----------------------+
87+
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+
88+
| id | price | 100 | deleted | created-at | phase | array | list | map | items | enum | xml |
89+
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+
90+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
91+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
92+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
93+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
94+
| 1234 | 123.450000 | 100 | false | 2020-07-13T15:00:00+ | | [{"id":1,"status":"N | [1,2,3] | ["NEW","PENDING"] | {"item-id":"1","name | three | <xml><node id="123"> |
95+
+------+------------+-----+---------+----------------------+-------+----------------------+---------+-------------------+----------------------+-------+----------------------+
9896
5 rows
9997

10098
ASCIITABLE,
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Tests\Integration\Function;
6+
7+
use function Flow\ETL\DSL\{df, from_rows, html_entry, ref, row, rows};
8+
use Dom\{Element, HTMLDocument};
9+
use PHPUnit\Framework\Attributes\RequiresPhp;
10+
use PHPUnit\Framework\TestCase;
11+
12+
/**
 * Integration tests for the htmlQuerySelectorAll() scalar function executed through a data frame.
 */
#[RequiresPhp('>= 8.4')]
final class HTMLQuerySelectorAllTest extends TestCase
{
    /**
     * Markup shared by every test: a single <span> nested in a <div> inside <body>.
     */
    private const DOCUMENT = '<!DOCTYPE html><html lang="en"><head></head><body><div><span>foobar</span></div></body></html>';

    public function test_invalid_query_all_on_html_document() : void
    {
        // The document holds no <p> under body > div.
        $results = $this->queryAll('body div p');

        /* @phpstan-ignore-next-line */
        self::assertCount(0, $results);
    }

    public function test_valid_query_all_on_html_document() : void
    {
        $results = $this->queryAll('body div span');

        /* @phpstan-ignore-next-line */
        self::assertCount(1, $results);
        /* @phpstan-ignore-next-line */
        self::assertContainsOnlyInstancesOf(Element::class, $results);
    }

    /**
     * Runs htmlQuerySelectorAll() with the given selector over the shared document and
     * returns the value of the resulting "html" entry, or an empty array when that value is null.
     */
    private function queryAll(string $selector) : mixed
    {
        /* @phpstan-ignore-next-line */
        $document = HTMLDocument::createFromString(self::DOCUMENT);

        $rows = df()
            ->read(from_rows(rows(row(html_entry('html_raw', $document)))))
            ->withEntry('html', ref('html_raw')->htmlQuerySelectorAll($selector))
            ->drop('html_raw')
            ->fetch();

        return $rows->toArray()[0]['html'] ?? [];
    }
}

0 commit comments

Comments
 (0)