#!/usr/bin/perl
# Script written by Zbigniew Koziol, softquake@gmail.com
#
# This script downloads all archival pages, day by day, up to today.
# "Archival" means pages accessible through URLs of the following form:
#     http://marucha.wordpress.com/yyyy/mm/dd/
# For example:
#     http://marucha.wordpress.com/2006/09/06/  -- apparently the first posting date;
# that date is approximately 1157553223 seconds since the epoch.

use strict;
use warnings;

my $start_time = 1157553223;   # 2006/09/06 in seconds since the epoch
my $today      = time();       # now, in seconds since the epoch
my $oneday     = 24 * 3600;    # seconds in 24 hours

# wget cannot create the output directory itself, so make sure it exists.
mkdir "one_day" unless -d "one_day";

my $my_start_time = $start_time;

while ($my_start_time < $today) {
    my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst)
        = gmtime($my_start_time);

    # gmtime() returns the year as an offset from 1900 and the month as 0..11.
    $year = $year + 1900;
    $mon  = sprintf("%02d", $mon + 1);   # zero-pad the month to two digits
    $mday = sprintf("%02d", $mday);      # zero-pad the day to two digits

    my $my_download_url = "http://marucha.wordpress.com/$year/$mon/$mday/";
    my $my_save_file    = "${year}_${mon}_${mday}.html";

    # print "$my_download_url\t$my_save_file\n";

    system("/usr/bin/wget -c $my_download_url -O one_day/$my_save_file");

    $my_start_time = $my_start_time + $oneday;
}

# After that, remove all zero-byte files from the directory one_day, and run
# from inside that directory:  ls -1 * >> ../postings.dat
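
# --- Optional: the cleanup step above, automated in Perl --------------------
# A minimal sketch of the manual step described in the comment above, not part
# of the original workflow. It assumes the one_day/ directory used by the
# download loop and appends to postings.dat in the current directory (the
# parent of one_day/), which matches "ls -1 * >> ../postings.dat" run from
# inside one_day/.

use File::Basename;

my @downloads = sort glob("one_day/*.html");

open(my $out, ">>", "postings.dat")
    or die "Cannot open postings.dat for appending: $!";

foreach my $file (@downloads) {
    if (-z $file) {
        unlink $file;                      # drop empty (failed) downloads
    } else {
        print $out basename($file), "\n";  # record the surviving file name
    }
}

close($out);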