<?php
/**
 * Collects the links found in a local image-map HTML file and wraps each one
 * in a Trunk object (Trunk is defined outside this file section).
 *
 * The regex match result for a given HTML file is cached on disk, serialized,
 * under CricketSpider::$cache_dir so the file is not re-parsed on every run.
 */
class CricketSpider {
    /* Name of the HTML file to scan; it is read relative to the working directory. */
    protected $html_file;

    /* preg_match_all() result for $html_file: index 2 holds alt texts, index 3 holds hrefs. */
    protected $urls = array();

    /* Trunk objects, one per href found in the HTML file. */
    public $trunks = array();

    /* Directory in which serialized match results are cached. */
    public static $cache_dir = "cache";

    /**
     * Remembers the HTML file name and immediately builds the trunk list from it.
     *
     * @param string $html_file HTML file name, relative to the working directory
     */
    public function __construct($html_file){
        $this->html_file = $html_file;
        $this->create_trunks();
    }

    /**
     * Fills $this->urls with the match result for the HTML file and appends
     * one Trunk (href, alt text) to $this->trunks per href found.
     */
    private function create_trunks(){
        $this->urls = self::get_urls_from_html_file($this->html_file);

        foreach($this->urls[3] as $index => $href){
            /* the alt group is optional in the pattern, so fall back to an empty string */
            $alt = isset($this->urls[2][$index]) ? $this->urls[2][$index] : '';
            $this->trunks[] = new Trunk($href, $alt);
        }
    }

    /**
     * Tells whether a cache file is still fresh.
     *
     * A file counts as fresh when it was last modified at or after midnight
     * (default timezone) of the previous calendar day, so depending on the
     * time of day the accepted age is between 24 and 48 hours.
     *
     * @param string $file path of the file to check
     * @return bool true when the file exists and is fresh, false otherwise
     */
    public static function probe_cache($file){
        if(!file_exists($file)){
            return false;
        }

        $modified = filemtime($file);
        if($modified === false){
            return false;
        }

        /* mktime(0, 0, 0) is midnight of the current day */
        $oldest_accepted = strtotime("-1 day", mktime(0, 0, 0));

        return $modified >= $oldest_accepted;
    }

    /**
     * Returns a code identifying the url: the (still url-encoded) text between
     * "target=%2F" and the last ";" when the url contains one, otherwise the
     * md5 hash of the whole url.
     *
     * @param string $url
     * @return string
     */
    public static function code_from_url($url){
        $name_string = array();
        if(preg_match('/target=%2F(.*);/', $url, $name_string)){
            return $name_string[1];
        }

        return md5($url);
    }

    /**
     * Returns the urls found in a local HTML file.
     *
     * The result has the preg_match_all() pattern-order layout: index 0 holds
     * the full matches, index 1 the whole alt attribute, index 2 the alt text
     * and index 3 the href value. A fresh cache entry is used when available;
     * otherwise the file is parsed and the result is cached. When the HTML
     * file cannot be read, an empty result is returned and nothing is cached.
     *
     * @param string $file HTML file name, relative to the working directory
     * @return array
     */
    public static function get_urls_from_html_file($file){
        $urls_cache = self::$cache_dir . '/' . md5($file);

        if(self::probe_cache($urls_cache)){
            /* NOTE(review): unserialize() is only safe because the cache is written by this class; never point $cache_dir at untrusted data */
            $raw = file_get_contents($urls_cache);
            $cached = ($raw === false) ? false : unserialize($raw);
            if(is_array($cached) && isset($cached[2], $cached[3])){
                return $cached;
            }
            /* unreadable or corrupt cache: fall through and rebuild it */
        }

        $html = file_get_contents('./' . $file);
        if($html === false){
            /* do not cache a failed read, otherwise the empty result would stick until the cache expires */
            return array(array(), array(), array(), array());
        }

        /* [^"] keeps each capture inside its own attribute even when more attributes follow on the same line */
        $page_urls = array();
        preg_match_all('/(alt="([^"]+)")?\s?href="([^"]*)"/', $html, $page_urls);

        /* create the cache directory on first use, then store the result */
        if(!is_dir(self::$cache_dir)){
            mkdir(self::$cache_dir, 0777, true);
        }
        if(is_dir(self::$cache_dir)){
            file_put_contents($urls_cache, serialize($page_urls));
        }

        return $page_urls;
    }
}
?>