Skip to content

Commit 48ad784

Browse files
committed
Breaking! Node::attr($attr, **$eval=false**, $to_str=false)
1 parent 3dc2879 commit 48ad784

9 files changed

Lines changed: 125 additions & 26 deletions

File tree

README.md

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,16 +161,18 @@ if ( $banners ) {
161161

162162
// Iterate over the result
163163
foreach($banners as $pos => $a) {
164-
$links[$pos] = $a->attr('href'); // get absolute URL from href property
164+
// $a->href property is the resolved $a->attr('href') relative to the
165+
// document's <base href=...>, if present, or $doc->baseURL.
166+
$links[$pos] = $a->href; // get absolute URL from href property
165167
$titles[$pos] = trim($a->text()); // strip all HTML tags and leave just text
166168

167169
// Filter the result
168170
if ( !$a->hasClass('logo') ) {
169-
// $a->style property is the parsed $a->attr('style')
171+
// $a->style property is the parsed $a->attr('style'), same as $a->attr('style', true)
170172
if ( strtolower($a->style['position']) == 'fixed' ) continue;
171173

172174
$img = $a->find('img')[0]; // ArrayAccess
173-
if ( $img ) $images[$pos] = $img->src; // short for $img->attr('src')
175+
if ( $img ) $images[$pos] = $img->src; // short for $img->attr('src', true)
174176
}
175177
}
176178

@@ -190,6 +192,13 @@ $charset = $doc->charset;
190192

191193
// Get the size of the document ( strlen($html) )
192194
$size = $doc->size;
195+
196+
// The URL at which the document was requested
197+
$requestUri = $doc->href;
198+
199+
// <base href=...>, if present, or the origin + dir path part from $doc->href.
200+
// The .href and .src props are resolved using this value.
201+
$baseURL = $doc->baseURL;
193202
```
194203

195204
Note: If the charset meta attribute has a wrong value, or the internal conversion fails for any other reason, `hQuery` ignores the error and continues processing with the original HTML, but registers an error message in `$doc->html_errors['convert_encoding']`.

composer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"name": "duzun/hquery",
33
"type": "library",
44
"description": "An extremely fast web scraper that parses megabytes of HTML in a blink of an eye. No dependencies. PHP5+",
5-
"version": "3.2.0",
5+
"version": "3.3.0",
66
"license": "MIT",
77
"authors": [
88
{

package-lock.json

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "hquery.php",
33
"description": "An extremely fast web scraper that parses megabytes of HTML in a blink of an eye. No dependencies. PHP5+",
4-
"version": "3.2.0",
4+
"version": "3.3.0",
55
"author": {
66
"name": "Dumitru Uzun",
77
"email": "contact@duzun.me",

src/hQuery.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class_exists('duzun\\hQuery\\HTML_Index', false) or require_once __DIR__ . DIREC
2929
*
3030
* Copyright (C) 2014-2018 Dumitru Uzun
3131
*
32-
* @version 3.2.0
32+
* @version 3.3.0
3333
* @author Dumitru Uzun (DUzun.ME)
3434
* @license MIT
3535
*/

src/hQuery/Element.php

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
<?php
22
namespace duzun\hQuery;
33

4-
use duzun\hQuery\Parser\HTML;
54
// ------------------------------------------------------------------------
65
class_exists('duzun\\hQuery\\Node', false) or require_once __DIR__ . DIRECTORY_SEPARATOR . 'Node.php';
76

@@ -79,10 +78,6 @@ public function __get($name)
7978

8079
switch ($name) {
8180
case 'style':
82-
$style = $this->attr('style');
83-
if ( !$style ) return self::$_ar_;
84-
return $this->_prop[$name] = HTML::parseCssStr($style);
85-
8681
case 'id':
8782
case 'class':
8883
case 'alt':
@@ -97,7 +92,7 @@ public function __get($name)
9792
// case 'search':
9893
// case 'hash':
9994
default:
100-
return $this->attr($name);
95+
return $this->attr($name, true);
10196
}
10297
}
10398

src/hQuery/HTML_Index.php

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -546,7 +546,8 @@ public function _info()
546546
$lev[$b] = count($st);
547547
}
548548
foreach ($nm as $b => &$n) {
549-
$n = str_repeat(' -', $lev[$b]) . ' < ' . $n . ' ' . $this->get_attr_byId($b, null, true) . ' >';
549+
$n = str_repeat(' -', $lev[$b]) .
550+
' < ' . $n . ' ' . $this->get_attr_byId($b, null, true, false) . ' >';
550551
}
551552
$nm = implode("\n", $nm);
552553
$inf['struc'] = $nm;
@@ -745,7 +746,7 @@ protected function _index_all()
745746
// Read <base href="..." /> tag
746747
if (!empty($this->tag_idx['base'])) {
747748
foreach ($this->tag_idx['base'] as $b => $e) {
748-
if ($a = $this->get_attr_byId($b, 'href', false)) {
749+
if ($a = $this->get_attr_byId($b, 'href', false, false)) {
749750
$this->baseURI($a);
750751
break;
751752
}

src/hQuery/Node.php

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
abstract class Node implements \Iterator, \Countable
1616
{
1717
// ------------------------------------------------------------------------
18-
const VERSION = '3.2.0';
18+
const VERSION = '3.3.0';
1919
// ------------------------------------------------------------------------
2020
/**
2121
* Response details of last request
@@ -125,17 +125,27 @@ public function __destruct()
125125
* Get an attribute or all attributes of the first element in the collection.
126126
*
127127
* @param string $attr attribute name, or NULL to get all
128+
* @param boolean $eval if true, evaluate the attribute as a property,
129+
* e.g. resolve .href & .src using document's baseURL
130+
* or parse the .style property as an associative array.
128131
* @param boolean $to_str When $attr is NULL, if true, get the list of attributes as string
129132
* @return array|string If no $attr, return a list of attributes, or attribute's value otherwise.
130133
*/
131-
public function attr($attr = null, $to_str = false)
134+
public function attr($attr = null, $eval = false, $to_str = false)
132135
{
133136
$k = key($this->ids);
134137
if (null === $k) {
135138
reset($this->ids);
136139
$k = key($this->ids);
137140
}
138-
return isset($k) ? $this->doc()->get_attr_byId($k, $attr, $to_str) : null;
141+
if (!isset($k)) return null;
142+
143+
$val = $this->doc()->get_attr_byId($k, $attr, $to_str, !$eval);
144+
if($eval && $attr == 'style') {
145+
if (!$val) return self::$_ar_;
146+
return HTMLParser::parseCssStr($val);
147+
}
148+
return $val;
139149
}
140150

141151
// ------------------------------------------------------------------------
@@ -1114,7 +1124,13 @@ public function __get($name)
11141124
return $this->_prop[$name];
11151125
}
11161126

1117-
return $this->attr($name);
1127+
$val = $this->attr($name, true);
1128+
1129+
if($val && $name == 'style') {
1130+
$this->_prop[$name] = $val;
1131+
}
1132+
1133+
return $val;
11181134
}
11191135

11201136
/**

tests/hQueryCore.Test.php

Lines changed: 85 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ class hQueryCore extends PHPUnit_BaseClass
8282
href="//not-my-site.com/next.html"
8383
style="Color:blue;padding: 1px 2pt 3em 0; background-image:url(/path/to/img.jpg?url=param&and=another&one);"
8484
>Not My Site</a>
85-
<img id="outerImg" src="https://cdn.duzun.me/images/logo.png" />
85+
<img id="outerImg" src="//cdn.duzun.me/images/logo.png" />
8686
8787
<dl id="dict1">
8888
<dt>Coffee</dt>
@@ -127,6 +127,21 @@ class hQueryCore extends PHPUnit_BaseClass
127127

128128
public static $badHTML2 = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=uft-8" /></head><body><a>A</a></body></html>';
129129

130+
public static $baseTag1 = '<!doctype html>
131+
<html>
132+
<head>
133+
<meta content="/logo.png" property="og:image"/>
134+
<base href="/base/path.html?how=rewrite#hash" />
135+
<link rel="shortcut icon" href="/favicon.ico" class=pjax />
136+
</head>
137+
<body class="test-class">
138+
<a href="rel-path/index.html" id="rel_path">relative path</a>
139+
<a href="/abs-path/index.html" id="rel_origin">relative origin</a>
140+
<a href="//not-my-site.com/next.html" id="rel_schema">relative schema</a>
141+
<img id="rel_img" src="/images/logo.png" />
142+
</body>
143+
</html>';
144+
130145
// Before any test
131146
public static function mySetUpBeforeClass()
132147
{
@@ -258,7 +273,8 @@ public function test_find()
258273
$this->assertTrue($a instanceof Element);
259274
$this->assertEquals('a', $a->nodeName);
260275
$this->assertEquals('link: This is a link', trim($a->text));
261-
$this->assertEquals('https://DUzun.Me/path', $a->attr('href'));
276+
$this->assertEquals('https://DUzun.Me/path', $a->href);
277+
$this->assertEquals('/path', $a->attr('href'));
262278
$this->assertEquals('div', $a->parent->nodeName);
263279
$this->assertEquals('test-div', $a->parent->attr('id'));
264280

@@ -438,29 +454,48 @@ public function test_hQuery_Element_ArrayAccess($doc)
438454
/**
439455
* @depends test_hQuery_Element_ArrayAccess
440456
*/
441-
public function test_attr($doc)
457+
public function test_attr_and_prop($doc)
442458
{
459+
// Note: there is no baseURI for $doc at this point.
443460
$e = $doc->find('#img1');
461+
$a = $doc->find('a.aa:last');
444462

445463
// It's magic!
446464
$this->assertEquals($e->src, $e->attr('src'));
447465
$this->assertEquals($e->src1, $e->attr('src1'));
448466
$this->assertEquals($e->src2, $e->attr('src2'));
467+
$this->assertEquals($a->href, $a->attr('href'));
449468

450469
// Standard way of accessing attributes:
451470
$this->assertEquals('/path/to/img.png', $e->attr('src'));
452471
$this->assertEquals('other/img/here.jpg', $e->attr('src2'));
453472
$this->assertEquals('//example.com/full/path.gif', $e->attr('src3'));
473+
$this->assertEquals('#test', $a->attr('href'));
474+
475+
// $doc was loaded from the file "data/attr.html" and has no baseURI associated.
476+
// Set the baseURI from document location, so that .href and .src props
477+
// would be resolved.
478+
$doc->location(self::$baseUrl);
479+
480+
// Properties are evaluated semantically:
481+
$this->assertEquals(self::$baseUrl . 'path/to/img.png', $e->src);
482+
$this->assertEquals('other/img/here.jpg', $e->src2); // .src2 ain't special
483+
484+
$this->assertEquals(self::$baseUrl . '#test', $a->href);
485+
$this->assertEquals('#test', $a->attr('href'));
486+
454487

455488
// Relative vs Absolute URL paths
456489

457490
// a[href] relative URL
458491
$a = self::$inst->find('a:first');
459492
$this->assertEquals(self::$baseUrl . 'path', $a->href);
493+
$this->assertEquals('/path', $a->attr('href'));
460494

461495
// a[href] absolute URL
462496
$a = self::$inst->find('a#outerLink');
463497
$this->assertEquals('https://not-my-site.com/next.html', $a->href);
498+
$this->assertEquals('//not-my-site.com/next.html', $a->attr('href'));
464499

465500
// $a->style is the parsed $a->attr('style'):
466501
$this->assertNotEmpty($a->style);
@@ -474,10 +509,12 @@ public function test_attr($doc)
474509
// img[src] absolute URL
475510
$a = self::$inst->find('img#outerImg');
476511
$this->assertEquals('https://cdn.duzun.me/images/logo.png', $a->src);
512+
$this->assertEquals('//cdn.duzun.me/images/logo.png', $a->attr('src'));
477513

478514
// link[href] relative URL
479515
$a = self::$inst->find('link', array('rel' => 'shortcut icon'));
480516
$this->assertEquals(self::$baseUrl . 'favicon.ico', $a->href);
517+
$this->assertEquals('/favicon.ico', $a->attr('href'));
481518

482519
// meta[content] - not a URL
483520
$m = self::$inst->find('meta', array('property' => 'og:image'));
@@ -493,7 +530,7 @@ public function test_attr($doc)
493530

494531
// -----------------------------------------------------
495532
/**
496-
* @depends test_attr
533+
* @depends test_attr_and_prop
497534
*/
498535
public function test_prop_charset($doc)
499536
{
@@ -515,20 +552,61 @@ public function test_prop_baseURL()
515552
{
516553
$baseURL = self::$inst->baseURL;
517554
$this->assertEquals(self::$baseUrl, $baseURL);
555+
556+
$doc = hQueryTestSurrogate::fromHTML(self::$baseTag1, self::$baseUrl . 'index.html');
557+
$baseURL = $doc->baseURL;
558+
$this->assertEquals(self::$baseUrl . 'base/', $baseURL);
559+
560+
$a = $doc->find('a#rel_path');
561+
$this->assertEquals('rel-path/index.html', $a->attr('href'));
562+
$this->assertEquals(self::$baseUrl . 'base/rel-path/index.html', $a->href);
563+
564+
$a = $doc->find('a#rel_origin');
565+
$this->assertEquals('/abs-path/index.html', $a->attr('href'));
566+
$this->assertEquals(self::$baseUrl . 'abs-path/index.html', $a->href);
567+
568+
$a = $doc->find('a#rel_schema');
569+
$this->assertEquals('//not-my-site.com/next.html', $a->attr('href'));
570+
$this->assertEquals('https://not-my-site.com/next.html', $a->href);
571+
572+
$img = $doc->find('img#rel_img');
573+
$this->assertEquals('/images/logo.png', $img->attr('src'));
574+
$this->assertEquals(self::$baseUrl . 'images/logo.png', $img->src);
575+
576+
return $doc;
518577
}
519578

520579
// -----------------------------------------------------
521-
public function test_prop_baseURI()
580+
/**
581+
* Either the <base href=...> or the location()
582+
*
583+
* @depends test_prop_baseURL
584+
*/
585+
public function test_prop_baseURI($doc)
522586
{
523587
$baseURI = self::$inst->baseURI;
524588
$this->assertEquals(self::$baseUrl . 'index.html', $baseURI);
589+
590+
$baseURI = $doc->baseURI;
591+
$this->assertEquals(self::$baseUrl . 'base/path.html?how=rewrite#hash', $baseURI);
592+
593+
return $doc;
525594
}
526595

527596
// -----------------------------------------------------
528-
// Alias of baseURI
529-
public function test_prop_href()
597+
/**
598+
* URI at which the doc was accessed/loaded.
599+
*
600+
* @depends test_prop_baseURI
601+
*/
602+
public function test_prop_href($doc)
530603
{
531604
$href = self::$inst->href;
605+
$location = self::$inst->location();
606+
$this->assertEquals($location, $href);
607+
$this->assertEquals(self::$baseUrl . 'index.html', $href);
608+
609+
$href = $doc->href;
532610
$this->assertEquals(self::$baseUrl . 'index.html', $href);
533611
}
534612

0 commit comments

Comments
 (0)