1515
1616namespace vipnytt ;
1717
18- use DateTime ;
1918use vipnytt \robot \URLParser ;
2019use vipnytt \robot \UserAgentParser ;
2120
@@ -35,6 +34,18 @@ class XRobotsTagParser
3534 const DIRECTIVE_NO_TRANSLATE = 'notranslate ' ;
3635 const DIRECTIVE_UNAVAILABLE_AFTER = 'unavailable_after ' ;
3736
37+ // TODO: Should be RFC-850, but disabled due to a rule parsing bug
38+ const DATE_FORMAT_DEFAULT = 'd M Y H:i:s T ' ;
39+
40+ private $ supportedDateFormats = [
41+ self ::DATE_FORMAT_DEFAULT ,
42+ DATE_RFC1123 ,
43+ DATE_RFC850 ,
44+ 'd M Y H:i:s T '
45+ ];
46+
47+ private $ strict = false ;
48+
3849 private $ url = '' ;
3950 private $ userAgent = self ::USERAGENT_DEFAULT ;
4051
@@ -51,18 +62,21 @@ class XRobotsTagParser
5162 *
5263 * @param string $url
5364 * @param string $userAgent
54- * @param array $headers
65+ * @param bool $strict
66+ * @param array|null $headers
5567 */
56- public function __construct ($ url , $ userAgent = self ::USERAGENT_DEFAULT , $ headers = [] )
68+ public function __construct ($ url , $ userAgent = self ::USERAGENT_DEFAULT , $ strict = false , $ headers = null )
5769 {
70+ $ this ->strict = $ strict ;
71+
5872 // Parse URL
5973 $ urlParser = new URLParser (trim ($ url ));
6074 if (!$ urlParser ->isValid ()) {
6175 trigger_error ('Invalid URL ' , E_USER_WARNING );
6276 }
6377 $ this ->url = $ urlParser ->encode ();
6478 // Get headers
65- $ this ->setHeaders ($ headers );
79+ $ this ->useHeaders ($ headers );
6680 // Parse rules
6781 $ this ->parse ();
6882 // Set User-Agent
@@ -73,20 +87,19 @@ public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $headers
7387 /**
7488 * Request HTTP headers
7589 *
76- * @param array $customHeaders - use these headers
77- * @return void
90+ * @param array|null|false $customHeaders - use these headers
91+ * @return bool
7892 */
79- private function setHeaders ($ customHeaders = [] )
93+ private function useHeaders ($ customHeaders = null )
8094 {
81- $ this ->headers = $ customHeaders ;
82- if (is_array ($ this ->headers ) && !empty ($ this ->headers )) {
83- return ;
84- }
85- $ this ->headers = get_headers ($ this ->url );
86- if (is_array ($ this ->headers ) && !empty ($ this ->headers )) {
95+ if ($ customHeaders === false ) {
8796 trigger_error ('Unable to fetch HTTP headers ' , E_USER_ERROR );
88- return ;
97+ return false ;
98+ } elseif (!is_array ($ customHeaders ) || empty ($ customHeaders )) {
99+ return $ this ->useHeaders (get_headers ($ this ->url ));
89100 }
101+ $ this ->headers = $ customHeaders ;
102+ return true ;
90103 }
91104
92105 /**
@@ -117,20 +130,14 @@ private function detectDirectives()
117130 {
118131 $ rules = explode (', ' , $ this ->currentRule );
119132 foreach ($ rules as $ rule ) {
120- $ part = explode (': ' , $ rule , 3 );
121- $ part [0 ] = trim ($ part [0 ]);
122- $ part [1 ] = isset ($ part [1 ]) ? trim ($ part [1 ]) : '' ;
123- $ part [2 ] = isset ($ part [2 ]) ? trim ($ part [2 ]) : '' ;
124- if ($ rules [0 ] === $ rule && count ($ part ) >= 2 && !in_array ($ part [0 ], $ this ->directiveArray ())) {
125- $ this ->currentUserAgent = $ part [0 ];
126- if (in_array ($ part [1 ], $ this ->directiveArray ())) {
127- $ this ->currentDirective = $ part [1 ];
128- $ this ->currentValue = $ part [2 ];
129- $ this ->addRule ();
130- }
131- } elseif (in_array ($ part [0 ], $ this ->directiveArray ())) {
132- $ this ->currentDirective = $ part [0 ];
133- $ this ->currentValue = $ part [1 ];
133+ $ pair = array_map ('trim ' , explode (': ' , $ rule , 2 ));
134+ if ($ rules [0 ] === $ rule && count ($ pair ) == 2 && !in_array ($ pair [0 ], $ this ->directiveArray ())) {
135+ $ this ->currentUserAgent = $ pair [0 ];
136+ $ pair = array_map ('trim ' , explode (': ' , $ pair [1 ], 2 ));
137+ }
138+ if (in_array ($ pair [0 ], $ this ->directiveArray ())) {
139+ $ this ->currentDirective = $ pair [0 ];
140+ $ this ->currentValue = isset ($ pair [1 ]) ? $ pair [1 ] : null ;
134141 $ this ->addRule ();
135142 }
136143 }
@@ -176,13 +183,23 @@ private function addRule()
176183 $ this ->rules [$ this ->currentUserAgent ][$ this ->currentDirective ] = true ;
177184 break ;
178185 case self ::DIRECTIVE_NONE :
186+ $ this ->rules [$ this ->currentUserAgent ][self ::DIRECTIVE_NONE ] = true ;
187+ if ($ this ->strict ) break ;
179188 $ this ->rules [$ this ->currentUserAgent ][self ::DIRECTIVE_NO_INDEX ] = true ;
180189 $ this ->rules [$ this ->currentUserAgent ][self ::DIRECTIVE_NO_FOLLOW ] = true ;
181190 break ;
182191 case self ::DIRECTIVE_UNAVAILABLE_AFTER :
183- $ dateTime = new DateTime ();
184- $ dateTime ->createFromFormat (DATE_RFC850 , $ this ->currentValue );
185- $ this ->rules [$ this ->currentUserAgent ][self ::DIRECTIVE_UNAVAILABLE_AFTER ] = $ dateTime ->getTimestamp ();
192+ if ($ this ->strict ) $ this ->supportedDateFormats = [self ::DATE_FORMAT_DEFAULT ];
193+ foreach (array_unique ($ this ->supportedDateFormats ) as $ format ) {
194+ $ dateTime = date_create_from_format ($ format , $ this ->currentValue );
195+ if ($ dateTime === false ) continue ;
196+ $ this ->rules [$ this ->currentUserAgent ][self ::DIRECTIVE_UNAVAILABLE_AFTER ] = $ dateTime ->format (self ::DATE_FORMAT_DEFAULT );
197+ if ($ this ->strict ) break ;
198+ if (time () >= $ dateTime ->getTimestamp ()) {
199+ $ this ->rules [$ this ->currentUserAgent ][self ::DIRECTIVE_NO_INDEX ] = true ;
200+ }
201+ break ;
202+ }
186203 break ;
187204 }
188205 }
0 commit comments