Skip to content

Commit 6b36354

Browse files
committed
Various improvements
1 parent fcbd57a commit 6b36354

10 files changed

Lines changed: 309 additions & 170 deletions

src/XRobotsTagParser.php

Lines changed: 48 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
namespace vipnytt;
1717

18-
use DateTime;
1918
use vipnytt\robot\URLParser;
2019
use vipnytt\robot\UserAgentParser;
2120

@@ -35,6 +34,18 @@ class XRobotsTagParser
3534
const DIRECTIVE_NO_TRANSLATE = 'notranslate';
3635
const DIRECTIVE_UNAVAILABLE_AFTER = 'unavailable_after';
3736

37+
// TODO: Should be RFC-850, but disabled due to a rule parsing bug
38+
const DATE_FORMAT_DEFAULT = 'd M Y H:i:s T';
39+
40+
private $supportedDateFormats = [
41+
self::DATE_FORMAT_DEFAULT,
42+
DATE_RFC1123,
43+
DATE_RFC850,
44+
'd M Y H:i:s T'
45+
];
46+
47+
private $strict = false;
48+
3849
private $url = '';
3950
private $userAgent = self::USERAGENT_DEFAULT;
4051

@@ -51,18 +62,21 @@ class XRobotsTagParser
5162
*
5263
* @param string $url
5364
* @param string $userAgent
54-
* @param array $headers
65+
* @param bool $strict
66+
* @param array|null $headers
5567
*/
56-
public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $headers = [])
68+
public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $strict = false, $headers = null)
5769
{
70+
$this->strict = $strict;
71+
5872
// Parse URL
5973
$urlParser = new URLParser(trim($url));
6074
if (!$urlParser->isValid()) {
6175
trigger_error('Invalid URL', E_USER_WARNING);
6276
}
6377
$this->url = $urlParser->encode();
6478
// Get headers
65-
$this->setHeaders($headers);
79+
$this->useHeaders($headers);
6680
// Parse rules
6781
$this->parse();
6882
// Set User-Agent
@@ -73,20 +87,19 @@ public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $headers
7387
/**
7488
* Request HTTP headers
7589
*
76-
* @param array $customHeaders - use these headers
77-
* @return void
90+
* @param array|null|false $customHeaders - use these headers
91+
* @return bool
7892
*/
79-
private function setHeaders($customHeaders = [])
93+
private function useHeaders($customHeaders = null)
8094
{
81-
$this->headers = $customHeaders;
82-
if (is_array($this->headers) && !empty($this->headers)) {
83-
return;
84-
}
85-
$this->headers = get_headers($this->url);
86-
if (is_array($this->headers) && !empty($this->headers)) {
95+
if ($customHeaders === false) {
8796
trigger_error('Unable to fetch HTTP headers', E_USER_ERROR);
88-
return;
97+
return false;
98+
} elseif (!is_array($customHeaders) || empty($customHeaders)) {
99+
return $this->useHeaders(get_headers($this->url));
89100
}
101+
$this->headers = $customHeaders;
102+
return true;
90103
}
91104

92105
/**
@@ -117,20 +130,14 @@ private function detectDirectives()
117130
{
118131
$rules = explode(',', $this->currentRule);
119132
foreach ($rules as $rule) {
120-
$part = explode(':', $rule, 3);
121-
$part[0] = trim($part[0]);
122-
$part[1] = isset($part[1]) ? trim($part[1]) : '';
123-
$part[2] = isset($part[2]) ? trim($part[2]) : '';
124-
if ($rules[0] === $rule && count($part) >= 2 && !in_array($part[0], $this->directiveArray())) {
125-
$this->currentUserAgent = $part[0];
126-
if (in_array($part[1], $this->directiveArray())) {
127-
$this->currentDirective = $part[1];
128-
$this->currentValue = $part[2];
129-
$this->addRule();
130-
}
131-
} elseif (in_array($part[0], $this->directiveArray())) {
132-
$this->currentDirective = $part[0];
133-
$this->currentValue = $part[1];
133+
$pair = array_map('trim', explode(':', $rule, 2));
134+
if ($rules[0] === $rule && count($pair) == 2 && !in_array($pair[0], $this->directiveArray())) {
135+
$this->currentUserAgent = $pair[0];
136+
$pair = array_map('trim', explode(':', $pair[1], 2));
137+
}
138+
if (in_array($pair[0], $this->directiveArray())) {
139+
$this->currentDirective = $pair[0];
140+
$this->currentValue = isset($pair[1]) ? $pair[1] : null;
134141
$this->addRule();
135142
}
136143
}
@@ -176,13 +183,23 @@ private function addRule()
176183
$this->rules[$this->currentUserAgent][$this->currentDirective] = true;
177184
break;
178185
case self::DIRECTIVE_NONE:
186+
$this->rules[$this->currentUserAgent][self::DIRECTIVE_NONE] = true;
187+
if ($this->strict) break;
179188
$this->rules[$this->currentUserAgent][self::DIRECTIVE_NO_INDEX] = true;
180189
$this->rules[$this->currentUserAgent][self::DIRECTIVE_NO_FOLLOW] = true;
181190
break;
182191
case self::DIRECTIVE_UNAVAILABLE_AFTER:
183-
$dateTime = new DateTime();
184-
$dateTime->createFromFormat(DATE_RFC850, $this->currentValue);
185-
$this->rules[$this->currentUserAgent][self::DIRECTIVE_UNAVAILABLE_AFTER] = $dateTime->getTimestamp();
192+
if ($this->strict) $this->supportedDateFormats = [self::DATE_FORMAT_DEFAULT];
193+
foreach (array_unique($this->supportedDateFormats) as $format) {
194+
$dateTime = date_create_from_format($format, $this->currentValue);
195+
if ($dateTime === false) continue;
196+
$this->rules[$this->currentUserAgent][self::DIRECTIVE_UNAVAILABLE_AFTER] = $dateTime->format(self::DATE_FORMAT_DEFAULT);
197+
if ($this->strict) break;
198+
if (time() >= $dateTime->getTimestamp()) {
199+
$this->rules[$this->currentUserAgent][self::DIRECTIVE_NO_INDEX] = true;
200+
}
201+
break;
202+
}
186203
break;
187204
}
188205
}

test/cases/DirectiveNoneTest.php

Lines changed: 0 additions & 52 deletions
This file was deleted.

test/cases/MultiDirectivesTest.php

Lines changed: 0 additions & 71 deletions
This file was deleted.

test/cases/MultiTest.php

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
<?php
2+
3+
namespace vipnytt\XRobotsTagParser\tests;
4+
5+
use vipnytt\XRobotsTagParser;
6+
7+
class MultiTest extends \PHPUnit_Framework_TestCase
8+
{
9+
/**
10+
* Multi directives test
11+
*
12+
* @dataProvider generateDataForTest
13+
* @param string $url
14+
* @param string $bot
15+
* @param bool $strict
16+
* @param array|null $headers
17+
*/
18+
public function testMultipleDirectives($url, $bot, $strict, $headers)
19+
{
20+
$parser = new XRobotsTagParser($url, $bot, $strict, $headers);
21+
$this->assertInstanceOf('vipnytt\XRobotsTagParser', $parser);
22+
23+
$this->assertContains(['noindex' => true], $parser->getRules());
24+
$this->assertContains(['noindex' => true], $parser->export()['']);
25+
$this->assertContains(['noindex' => true], $parser->export()['googlebot']);
26+
27+
$this->assertContains(['nofollow' => true], $parser->getRules());
28+
$this->assertContains(['nofollow' => true], $parser->export()['']);
29+
$this->assertContains(['nofollow' => true], $parser->export()['googlebot']);
30+
31+
$this->assertContains(['noarchive' => true], $parser->getRules());
32+
$this->assertContains(['noarchive' => true], $parser->export()['']);
33+
$this->assertContains(['noarchive' => true], $parser->export()['googlebot']);
34+
35+
$this->assertContains(['nosnippet' => true], $parser->getRules());
36+
$this->assertContains(['nosnippet' => true], $parser->export()['']);
37+
$this->assertContains(['nosnippet' => true], $parser->export()['googlebot']);
38+
39+
$this->assertContains(['noodp' => true], $parser->getRules());
40+
$this->assertContains(['noodp' => true], $parser->export()['']);
41+
$this->assertContains(['noodp' => true], $parser->export()['googlebot']);
42+
43+
$this->assertContains(['notranslate' => true], $parser->getRules());
44+
$this->assertContains(['notranslate' => true], $parser->export()['']);
45+
$this->assertContains(['notranslate' => true], $parser->export()['googlebot']);
46+
47+
$this->assertContains(['noimageindex' => true], $parser->getRules());
48+
$this->assertContains(['noimageindex' => true], $parser->export()['']);
49+
$this->assertContains(['noimageindex' => true], $parser->export()['googlebot']);
50+
}
51+
52+
/**
53+
* Generate test data
54+
* @return array
55+
*/
56+
public function generateDataForTest()
57+
{
58+
return [
59+
[
60+
'http://example.com/',
61+
'googlebot',
62+
false,
63+
[
64+
'HTTP/1.1 200 OK',
65+
'Date: Tue, 25 May 2010 21:42:43 GMT',
66+
'X-Robots-Tag: all',
67+
'X-Robots-Tag: noindex',
68+
'X-Robots-Tag: nofollow',
69+
'X-Robots-Tag: none',
70+
'X-Robots-Tag: noarchive',
71+
'X-Robots-Tag: nosnippet',
72+
'X-Robots-Tag: noodp',
73+
'X-Robots-Tag: notranslate',
74+
'X-Robots-Tag: noimageindex',
75+
'X-Robots-Tag: unavailable_after: 25 Jun 2010 15:00:00 PST',
76+
'X-Robots-Tag: googlebot: all, none, nofollow,nosnippet,notranslate unavailable_after: 25 Jun 2010 15:00:00 PST, noindex, noarchive, noodp,noimageindex'
77+
]
78+
]
79+
];
80+
}
81+
}

test/cases/NoneTest.php

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
<?php
2+
3+
namespace vipnytt\XRobotsTagParser\tests;
4+
5+
use vipnytt\XRobotsTagParser;
6+
7+
class NoneTest extends \PHPUnit_Framework_TestCase
8+
{
9+
/**
10+
* Directive: NONE
11+
*
12+
* @dataProvider generateDataForTest
13+
* @param string $url
14+
* @param string $bot
15+
* @param bool $strict
16+
* @param array|null $headers
17+
*/
18+
public function testNone($url, $bot, $strict, $headers)
19+
{
20+
$parser = new XRobotsTagParser($url, $bot, $strict, $headers);
21+
$this->assertInstanceOf('vipnytt\XRobotsTagParser', $parser);
22+
23+
$this->assertContains(['none' => true], $parser->getRules());
24+
$this->assertContains(['noindex' => true], $parser->getRules());
25+
$this->assertContains(['nofollow' => true], $parser->getRules());
26+
27+
$this->assertContains(['none' => true], $parser->export()['']);
28+
$this->assertContains(['noindex' => true], $parser->export()['']);
29+
$this->assertContains(['nofollow' => true], $parser->export()['']);
30+
31+
$this->assertContains(['none' => true], $parser->export()['googlebot']);
32+
$this->assertContains(['noindex' => true], $parser->export()['googlebot']);
33+
$this->assertContains(['nofollow' => true], $parser->export()['googlebot']);
34+
}
35+
36+
/**
37+
* Generate test data
38+
* @return array
39+
*/
40+
public function generateDataForTest()
41+
{
42+
return [
43+
[
44+
'http://example.com/',
45+
'googlebot',
46+
false,
47+
[
48+
'X-Robots-Tag: none',
49+
'X-Robots-Tag: googlebot: none'
50+
]
51+
]
52+
];
53+
}
54+
}

0 commit comments

Comments
 (0)