#!/usr/bin/perl
# Script written by Zbigniew Koziol, softquake@gmail.com
#
# Reads postings.dat, which lists one file name per line. Each listed file is
# read from one_day/ and scanned for post-title links. Every linked article
# is downloaded with wget and saved in all_html_files/ under the name
# <year>_<month>_<day>_<slug>.html, where the four parts are taken from the
# article URL path that follows "wordpress.com/".

use strict;
use warnings;

my $POSTINGS_FILE = 'postings.dat';
my $DAY_DIR       = 'one_day';
my $OUTPUT_DIR    = 'all_html_files';
my $WGET          = '/usr/bin/wget';

# NOTE(review): the opening tag that the original script matched in front of
# the title link is not recoverable from the copy of the file under review.
# A heading tag is assumed here -- confirm against the real one_day/ pages
# and tighten this pattern if necessary.
my $TITLE_TAG_RE = qr{<h[1-6]\b[^>]*>};

open my $postings_fh, '<', $POSTINGS_FILE
    or die "Cannot open $POSTINGS_FILE: $!\n";

DAY_FILE:
while (my $file_name = <$postings_fh>) {
    chomp $file_name;
    next DAY_FILE if $file_name eq '';

    my $day_path = "$DAY_DIR/$file_name";
    my $day_fh;
    if (!open $day_fh, '<', $day_path) {
        # Best effort: one missing daily page must not stop the whole run.
        warn "Skipping $day_path: $!\n";
        next DAY_FILE;
    }

    HTML_LINE:
    while (my $html_line = <$day_fh>) {
        chomp $html_line;

        my $url = article_url($html_line);
        next HTML_LINE if !defined $url;

        my $local_name = local_name_for($url);
        if (!defined $local_name) {
            warn "Skipping $url: no year/month/day/slug path found\n";
            next HTML_LINE;
        }

        # List-form system() bypasses the shell, so characters such as
        # '&', ';' or '$' inside a scraped URL cannot be run as commands.
        my $target = "$OUTPUT_DIR/$local_name.html";
        system($WGET, '-c', $url, '-O', $target) == 0
            or warn "wget failed for $url (status $?)\n";
    }

    close $day_fh;
}

close $postings_fh;

# Return the URL of the first href="..." on a post-title line, or nothing
# when the line is not a title line. A title line is one that contains both
# the title tag and the word "bookmark". The line is only matched, never
# modified, so a URL that itself contains "bookmark" is returned intact.
sub article_url {
    my ($html_line) = @_;

    return if $html_line !~ $TITLE_TAG_RE;
    return if $html_line !~ /bookmark/;

    my ($url) = $html_line =~ /href="([^"]*)/;
    return if !defined $url || $url eq '';

    return $url;
}

# Build the local base name "<year>_<month>_<day>_<slug>" from the URL path
# that follows "wordpress.com/". Returns nothing when any of the four parts
# is missing, so that no download is written to a name such as "___.html".
sub local_name_for {
    my ($url) = @_;

    my (undef, $path) = split m{wordpress\.com/}, $url;
    return if !defined $path;

    my ($year, $month, $day, $slug) = split m{/}, $path;
    for my $part ($year, $month, $day, $slug) {
        return if !defined $part || $part eq '';
    }

    return join '_', $year, $month, $day, $slug;
}