/[pdpsoft]/trunk/nl.nikhef.ndpf.tools/network-stats-mashup/includes/class.cricket_spider.php
ViewVC logotype

Contents of /trunk/nl.nikhef.ndpf.tools/network-stats-mashup/includes/class.cricket_spider.php

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1284 - (show annotations) (download) (as text)
Fri Dec 11 19:18:55 2009 UTC (12 years, 7 months ago) by aramv
File MIME type: text/x-php
File size: 1964 byte(s)
Added wildcards for blank hrefs
1 <?php
/**
 * Extracts links from a local image-map HTML file and wraps each one in a
 * Trunk object.
 *
 * The extracted link list is cached on disk in serialized form under
 * CricketSpider::$cache_dir, in a file named after the md5 of the HTML
 * file name.
 */
class CricketSpider {
    // Name of the HTML file given to the constructor.
    protected $html_file,
        // Not used inside this class; kept so existing subclasses keep working.
        $urls = array();

    // Trunk objects, one per link found in the HTML file.
    public $trunks = array();

    // Directory in which the serialized link lists are cached.
    public static $cache_dir = "cache";

    /**
     * Remembers the HTML file name and immediately builds $this->trunks from it.
     *
     * @param string $html_file HTML file name; it is read as './' . $html_file.
     */
    public function __construct($html_file){
        $this->html_file = $html_file;
        $this->create_trunks();
    }

    /**
     * Appends one Trunk per link found in the HTML file to $this->trunks.
     * Each Trunk receives the href value first and the alt text second
     * (the alt text is an empty string when the link had no alt attribute).
     */
    private function create_trunks(){
        $urls = self::get_urls_from_html_file($this->html_file);
        foreach($urls[3] as $index => $url){
            $this->trunks[] = new Trunk($url, $urls[2][$index]);
        }
    }

    /**
     * Tells whether a cache file is still fresh.
     *
     * A file counts as fresh when it was last modified at or after midnight
     * of the previous day, so depending on the time of day a file of up to
     * just under two days old is still accepted.
     *
     * @param string $file Path of the file to check.
     * @return bool True when the file exists and is fresh, false otherwise.
     */
    public static function probe_cache($file){
        if(!file_exists($file)){
            return false;
        }

        $modified = filemtime($file);
        if($modified === false){
            return false;
        }

        // 'yesterday' resolves to midnight at the start of the previous day.
        return $modified >= strtotime('yesterday');
    }

    /**
     * Returns an identifier for a url: the text between 'target=%2F' and the
     * last ';' in the url when that pattern is present, otherwise the md5
     * hash of the whole url.
     *
     * @param string $url
     * @return string
     */
    public static function code_from_url($url){
        $name_string = array();
        if(preg_match('/target=%2F(.*);/', $url, $name_string)){
            return $name_string[1];
        }
        return md5($url);
    }

    /**
     * Returns the links found in a local HTML file.
     *
     * The result has the preg_match_all() pattern-order layout:
     *   [0] full matches, [1] the whole alt="..." attribute (or ''),
     *   [2] the alt text (or ''), [3] the href value (may be '').
     *
     * A fresh cache file is used when it holds a usable result; otherwise
     * the HTML file is parsed and the cache is rewritten. When the HTML file
     * cannot be read, four empty lists are returned and nothing is cached,
     * so a temporary failure is not remembered.
     *
     * @param string $file HTML file name; it is read as './' . $file.
     * @return array
     */
    public static function get_urls_from_html_file($file){
        $urls_cache = self::$cache_dir . '/' . md5($file);

        if(self::probe_cache($urls_cache)){
            /* load cache */
            $raw = file_get_contents($urls_cache);
            // NOTE(review): unserialize() is only safe while the cache
            // directory is writable by this application alone.
            $cached = ($raw === false) ? false : @unserialize($raw);
            if(is_array($cached) && isset($cached[2], $cached[3])){
                return $cached;
            }
            // Unreadable or corrupt cache: fall through and rebuild it.
        }

        $html = file_get_contents('./' . $file);
        if($html === false){
            return array(array(), array(), array(), array());
        }

        /* makes a list of urls present in the image map html */
        $page_urls = array();
        // [^"] keeps each capture inside its own pair of quotes, so further
        // attributes or links on the same line are not swallowed.
        preg_match_all('/(alt="([^"]+)")?\s?href="([^"]*)"/', $html, $page_urls);

        /* create cache (best effort: a failed write only costs a re-parse) */
        file_put_contents($urls_cache, serialize($page_urls), LOCK_EX);

        return $page_urls;
    }
}
61 ?>

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28