Skip to content

Commit 7306b07

Browse files
author
Blake Bertuccelli
committed
Update site adder validation so it depends on the cURLed URL to close issue #22.
1 parent 5c28ece commit 7306b07

3 files changed

Lines changed: 90 additions & 53 deletions

File tree

actions/add_site.php

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,55 +11,49 @@
1111
DB_NAME
1212
);
1313

14-
// Valid URLs are required so that we can CURL.
15-
$site_url = filter_input(INPUT_GET, 'url', FILTER_VALIDATE_URL);
14+
// We don't validate the URLs here because cURL does
15+
// a better job of validating/redirecting in the adders.
16+
$site_url = $_GET['url'];
1617
if($site_url == false)
17-
throw new Exception('"'.$_GET['url'].'" is an invalid URL');
18+
throw new Exception('URL is missing');
1819

1920
// We need to check the type since a user could manually
2021
// update the URL string to something unsupported.
2122
$type = $_GET['type'];
2223
if( $type == false)
2324
throw new Exception('Type is not specified for the URL "'.$site_url.'"');
2425

25-
// Requiring unique URLs minimizes unnessary scans.
26-
if(!is_unique_page_url($db, $site_url))
27-
throw new Exception('Page "'.$site_url.'" already exists');
28-
2926
// Static pages are treated as sites in themselves.
3027
if($type == 'single_page' ){
3128

32-
// We build an adder so we can tell if the URL can be
33-
// scaned.
34-
single_page_adder($site_url);
29+
// The adder cURLs the site to test if the URL can be scanned.
30+
$curled_site = single_page_adder($site_url);
31+
32+
// Site URL changes to the curled URL.
33+
$site_url = $curled_site['url'];
3534

3635
// Single pages are saved with the following parameters
3736
$type = 'single_page';
3837
$status = 'active';
39-
$site = $site_url;
38+
$site = $curled_site['url'];
4039
$is_parent = 1;
41-
add_page($db, $url, $type, $status, $site, $is_parent );
40+
add_page($db, $site_url, $type, $status, $site, $is_parent );
4241

4342
// WordPress and XML deals with adding pages similarly,
4443
// so their functions are wrapped in one condition.
4544
}elseif($type == 'wordpress' || $type == 'xml' ){
4645

4746
// WordPress API is queried to create sites.
48-
if($type == 'wordpress' ){
49-
50-
// Lots of users don't include backslashes,
51-
// which WordPress requirew to access the API.
52-
if( !str_ends_with($site_url, '/') )
53-
$site_url = $site_url.'/';
54-
55-
// WordPress adder can create lots of pages.
56-
$pages = wordpress_site_adder($site_url);
57-
58-
}
47+
if($type == 'wordpress' )
48+
$curled_site = wordpress_site_adder($site_url);
5949

6050
// .XML adder can create lots of pages.
6151
if($type == 'xml' )
62-
$pages = xml_site_adder($site_url);
52+
$curled_site = xml_site_adder($site_url);
53+
54+
// Both XML and WP deliver similar content.
55+
$pages = $curled_site['contents'];
56+
$site_url = $curled_site['url'];
6357

6458
// We're setting the status and adding pages here so we
6559
// do not have to call the $db inside "models/adders.php",

models/adders.php

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
<?php
2+
23
/**
34
* Get Page Body
45
*/
5-
function get_url_contents($site_url, $type = ''){
6+
function run_curl($site_url, $type = ''){
67
$curl = curl_init($site_url);
78
curl_setopt($curl, CURLOPT_URL, $site_url);
89
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
10+
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
911
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
1012
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
13+
curl_setopt($curl, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
14+
curl_setopt($curl, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
1115
curl_setopt($curl, CURLOPT_USERAGENT, 'Equalify');
1216

1317
// Restrict CURL to the type of what you want to add.
@@ -16,12 +20,40 @@ function get_url_contents($site_url, $type = ''){
1620
if($type == 'xml')
1721
curl_setopt($curl, CURLOPT_HTTPHEADER, array('Accept: application/xml'));
1822

19-
// Execute CURL or fallback.
23+
// Execute CURL
2024
$url_contents = curl_exec($curl);
25+
26+
// Add in DB info so we can see if URL is unique.
27+
require_once '../config.php';
28+
require_once 'db.php';
29+
$db = connect(
30+
DB_HOST,
31+
DB_USERNAME,
32+
DB_PASSWORD,
33+
DB_NAME
34+
);
35+
36+
// The curled URL is the URL we use as an ID.
37+
$curled_url = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL);
38+
39+
// We don't include the added endpoints in the URL.
40+
$json_endpoints = '/wp-json/wp/v2/pages?per_page=100';
41+
$curled_url = str_replace($json_endpoints, '', $curled_url);
42+
43+
// Make sure URL is unique to minimize scans.
44+
if(!is_unique_site($db, $curled_url))
45+
throw new Exception('"'.$curled_url.'" already exists');
46+
47+
// Fallback if no contents exist.
2148
if($url_contents == false)
22-
throw new Exception('Contents of "'.$site_url.'" cannot be loaded');
49+
throw new Exception('Contents of "'.$curled_url.'" cannot be loaded');
2350
curl_close($curl);
24-
return $url_contents;
51+
52+
// We use the curled URL as the unique ID.
53+
return array(
54+
'url' => $curled_url,
55+
'contents' => $url_contents
56+
);
2557

2658
}
2759

@@ -30,29 +62,26 @@ function get_url_contents($site_url, $type = ''){
3062
*/
3163
function single_page_adder($site_url){
3264

33-
// Reformat URL for JSON request.
34-
$json_url = $site_url.'wp-json/wp/v2/pages?per_page=100';
35-
3665
// Get URL contents so we can make sure URL
3766
// can be scanned.
38-
$url_contents = get_url_contents($site_url);
39-
echo $url_contents;
40-
die;
67+
return run_curl($site_url);
68+
4169
}
4270

4371
/**
4472
* WordPress Pages Adder
4573
*/
4674
function wordpress_site_adder($site_url){
4775

48-
// Reformat URL for JSON request.
49-
$json_url = $site_url.'wp-json/wp/v2/pages?per_page=100';
76+
// Add WP JSON URL endpoints for request.
77+
$json_endpoints = '/wp-json/wp/v2/pages?per_page=100';
78+
$json_url = $site_url.$json_endpoints;
5079

5180
// Get URL contents.
52-
$url_contents = get_url_contents($json_url, 'wordpress');
81+
$curled_site = run_curl($json_url, 'wordpress');
5382

5483
// Create JSON.
55-
$wp_api_json = json_decode($url_contents, true);
84+
$wp_api_json = json_decode($curled_site['contents'], true);
5685
if(empty($wp_api_json[0]))
5786
throw new Exception('The URL "'.$site_url.'" is not valid output');
5887

@@ -61,7 +90,17 @@ function wordpress_site_adder($site_url){
6190
foreach ($wp_api_json as $page):
6291
array_push($pages, array('url' => $page['link']));
6392
endforeach;
64-
return $pages;
93+
94+
// Remove WP JSON endpoints.
95+
$clean_curled_url = str_replace($json_endpoints, '', $curled_site['url']);
96+
97+
// Reformat the curled contents to be an array we can
98+
// work with.
99+
return array(
100+
'url' => $clean_curled_url,
101+
'contents' => $pages
102+
);
103+
65104
}
66105

67106
/**
@@ -70,14 +109,15 @@ function wordpress_site_adder($site_url){
70109
function xml_site_adder($site_url){
71110

72111
// Get URL contents.
73-
$url_contents = get_url_contents($site_url, 'xml');
112+
$curled_site = run_curl($site_url, 'xml');
74113

75114
// Valid XML files are only allowed!
76-
if(!str_starts_with($url_contents, '<?xml'))
77-
throw new Exception('"'.$site_url.'" is not valid XML');
115+
$xml_contents = $curled_site['contents'];
116+
if(!str_starts_with($xml_contents, '<?xml'))
117+
throw new Exception('"'.$curled_site['url'].'" is not a readable XML format');
78118

79119
// Convert XML to JSON, so we can use it later
80-
$xml = simplexml_load_string($url_contents);
120+
$xml = simplexml_load_string($xml_contents);
81121
$json = json_encode($xml);
82122
$json_entries = json_decode($json,TRUE);
83123

@@ -86,6 +126,12 @@ function xml_site_adder($site_url){
86126
foreach ($json_entries['url'] as $page):
87127
array_push($pages, array('url' => $page['loc']));
88128
endforeach;
89-
return $pages;
90129

130+
// Reformat the curled contents to be an array we can
131+
// work with.
132+
return array(
133+
'url' => $curled_site['url'],
134+
'contents' => $pages
135+
);
136+
91137
}

models/db.php

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -315,19 +315,16 @@ function get_column_names(mysqli $db, $table){
315315

316316

317317
/**
318-
* Is Unique Page URL
318+
* Is Unique Site
319319
*/
320-
function is_unique_page_url(mysqli $db, $page_url){
320+
function is_unique_site(mysqli $db, $site_url){
321+
322+
// Require unique URL
323+
$sql = 'SELECT * FROM `pages` WHERE `site` = "'.$site_url.'"';
321324

322325
// We don't consider a page with a '/' a unique url
323326
// so we will also search for them.
324-
if( !str_ends_with($page_url, '/') )
325-
$page_url_backslashed = $page_url.'/';
326-
327-
// Require unique URL
328-
$sql = 'SELECT * FROM `pages` WHERE `url` = "'.$page_url.'"';
329-
if(isset($page_url_backslashed))
330-
$sql.= ' OR `url` = "'.$page_url_backslashed.'"';
327+
$sql.= ' OR `site` = "'.$site_url.'/"';
331328

332329
$query = $db->query($sql);
333330
if(mysqli_num_rows($query) > 0){

0 commit comments

Comments
 (0)