#!/usr/bin/perl
# Script written by Zbigniew Koziol, softquake@gmail.com
#
# After running get_all_marucha_1.pl we have the directory all_html_files
# containing all posts and their discussions.
#
# First, create a list of these files (run inside all_html_files):
#
#     ls -1 >> ../my_files.dat
#
# We want statistics of posting frequency (activity) of individual users,
# derived from their comments.  As the first step of the data analysis we
# therefore extract the timestamp and the user name of every comment.
#
# Results are written to STDOUT, one tab-separated row per comment:
#
#     date <TAB> week day <TAB> time <TAB> author <TAB> source file name
#
# Redirect them to users_activity.dat by running the script this way:
#
#     ./get_all_marucha_2.pl >> users_activity.dat
#
# Problems (unreadable files, truncated comments) are reported on STDERR so
# that they never end up in the data file.

use strict;
use warnings;

# File listing the HTML files to process, one name per line.
my $FILE_LIST = 'my_files.dat';

# Directory holding the downloaded HTML files.
my $HTML_DIR = 'all_html_files';

# NOTE(review): the literal tags that delimit the author name were lost from
# the copy of the script this revision was made from (anything written in
# angle brackets had been stripped).  The values below are an assumption,
# chosen to match the class='url' link markup that the surviving code parses.
# Confirm against a real file in all_html_files before relying on the output.
my $AUTHOR_OPEN_TAG  = '<cite class="fn">';
my $AUTHOR_CLOSE_TAG = '</cite>';

# The posting date is expected this many lines below the author line.
my $DATE_LINE_OFFSET = 4;

# extract_author($html_line)
#
# Returns the comment author's name found on $html_line, or nothing (undef in
# scalar context) when the line carries no author marker.  When the name is
# wrapped in a link, the text following class='url'> is used and the closing
# </a> is removed; otherwise the text between the marker tags is returned
# as is.
sub extract_author {
    my ($html_line) = @_;

    my $start = index $html_line, $AUTHOR_OPEN_TAG;
    return if $start < 0;

    # Keep only what sits between the opening and the closing marker.
    my $cell = substr $html_line, $start + length $AUTHOR_OPEN_TAG;
    my $end  = index $cell, $AUTHOR_CLOSE_TAG;
    $cell = substr $cell, 0, $end if $end >= 0;

    # Plain-text author: nothing more to unwrap.
    return $cell if $cell !~ /href=/;

    # Linked author: the name follows the class='url'> attribute.
    my ( undef, $linked_name ) = split /class='url'>/, $cell;
    $linked_name = '' if !defined $linked_name;
    $linked_name =~ s{</a>}{};
    return $linked_name;
}

# parse_posting_date($date_line)
#
# Splits a whitespace-separated date line of the form
# "DATE (WEEKDAY) WORD TIME</a> ..." and returns the list
# (date, week day, time).  The first pair of parentheses around the week day
# and the closing </a> glued to the time are removed.  Missing fields come
# back as empty strings.
sub parse_posting_date {
    my ($date_line) = @_;
    $date_line = '' if !defined $date_line;

    # split ' ' skips leading whitespace; the third field is a filler word.
    my ( $date, $week_day, undef, $time ) = split ' ', $date_line;
    for my $field ( $date, $week_day, $time ) {
        $field = '' if !defined $field;
    }

    $time     =~ s{</a>}{};
    $week_day =~ s/\(//;
    $week_day =~ s/\)//;

    return ( $date, $week_day, $time );
}

# report_comments($file_name, @lines)
#
# Scans the lines of one HTML file and prints one row for every comment
# author found in it.  $file_name is only used to label the output rows.
sub report_comments {
    my ( $file_name, @lines ) = @_;

    my $i = 0;
    while ( $i <= $#lines ) {
        my $author = extract_author( $lines[$i] );
        if ( !defined $author ) {
            $i++;
            next;
        }

        my $date_index = $i + $DATE_LINE_OFFSET;
        if ( $date_index > $#lines ) {
            # The file ends before the date line: nothing sensible to print.
            warn "$file_name: comment by '$author' has no date line, skipped\n";
            last;
        }

        my ( $date, $week_day, $time ) =
            parse_posting_date( $lines[$date_index] );
        print join( "\t", $date, $week_day, $time, $author, $file_name ), "\n";

        # Resume scanning right after the date line.
        $i = $date_index + 1;
    }
    return;
}

# main()
#
# Reads the list of file names and reports the comments of every listed
# file.  Dies when the list itself cannot be read; an unreadable HTML file
# is reported on STDERR and skipped.
sub main {
    open my $list_fh, '<', $FILE_LIST
        or die "Cannot open $FILE_LIST: $!\n";
    my @file_names = <$list_fh>;
    close $list_fh;
    chomp @file_names;

    for my $file_name (@file_names) {
        next if $file_name eq '';    # a blank entry would name the directory

        my $path = "$HTML_DIR/$file_name";
        my $html_fh;
        if ( !open $html_fh, '<', $path ) {
            warn "Cannot open $path: $!, skipped\n";
            next;
        }
        my @lines = <$html_fh>;
        close $html_fh;
        chomp @lines;

        report_comments( $file_name, @lines );
    }
    return;
}

# Run only when executed as a script, so the helpers can be loaded by tests.
main() unless caller;

1;