#!/usr/bin/env perl
# Politech seems to be stuck using pipermail now.
# So we'll go ahead and implement a feed scraper based on that now.
#
# Uses approximately 5k of extra politech bandwidth every time it's run, over and above
# the transfer of new messages.
#
# NOTE(review): this file was recovered from a copy in which every angle-bracketed
# span (the HTML-matching regexes and the XML tags in the RSS heredocs) had been
# stripped by an HTML extraction pass.  Those spans below are reconstructed from
# the standard Pipermail page format and RSS 0.91 -- confirm each one against the
# live pipermail pages and the previously published politech.rss before trusting it.

use strict;
use warnings;   # replaces the legacy BEGIN{$^W=1} idiom from the original

use Data::Dumper;
use File::Basename;
use HTML::Entities;
use Time::ParseDate;

use lib dirname($0);    # look in the same directory that this script resides in
use FeedHelpers;        # Common code, available here: http://paperlined.org/rss/feeds/FeedHelpers.pm

# How many emails to return in the RSS feed.  Since we post the full text of each
# message, which tends to be *quite* large, we'll only display the last couple emails.
sub NUM_EMAILS {10}

# Load the persistent data (the conditional-GET cache lives in here).
my $persistent = PersistentHash::tie_storable("politech.db");

#############################################
#   Choose the last two months
#############################################
my $main_page = conditional_http_simple_get('http://politechbot.com/pipermail/politech/', $persistent)
        or die "Unable to fetch politech pipermail main page.\n\t";

my $days    = 24*60*60;
my $months  = 30.41 * $days;    # average month length, close enough for a 2.5-month window
my $curtime = time();

my @month_urls;
foreach my $monthname ($main_page =~ m#href="(\d\d\d\d-\S+)/thread.html"#sgi) {
    # Turn "2004-July" into "July 1, 2004", which Time::ParseDate understands.
    (my $mn = $monthname) =~ s/^(\d+)-(\w+)$/$2 1, $1/;
    my $month = parsedate($mn);
    # we don't want to just look at the latest month, because it might be july 1st,
    # and we wouldn't see the end-of-june posts
    next unless ($curtime - $month < 2.5 * $months);
    push(@month_urls, "http://politechbot.com/pipermail/politech/$monthname/thread.html");
}
#die Dumper(\@month_urls);

#############################################
#   Read in the months
#############################################
my @email_urls;
foreach my $month_url (@month_urls) {
    my $month = conditional_http_simple_get($month_url, $persistent)
            or die "Unable to fetch $month_url.\n\t";
    $month_url =~ s#thread.html$##;     # message links are relative to the month directory
    # NOTE(review): reconstructed pattern.  Pipermail thread indexes list messages as
    #     <LI><A HREF="001234.html">subject</A>
    # TODO confirm capture and markup against a live thread.html page.
    while ($month =~ m#<LI>\s*<A\s+HREF="(\d+\.html)"#gsi) {
        push(@email_urls, $month_url . $1);
    }
}

# Extract the numeric message id from a pipermail message URL ("…/001234.html" -> 1234).
sub mail_num {local $_ = shift; s/\.html$//; s#^.*/##; int($_)}

# Keep only the NUM_EMAILS highest-numbered (i.e. most recent) messages.
my $max_mail_num = 0;
foreach (@email_urls) {
    my $mn = mail_num($_);
    if ($mn > $max_mail_num) {
        $max_mail_num = $mn;
    }
}
@email_urls = grep {$max_mail_num - mail_num($_) < NUM_EMAILS} @email_urls;
#die Dumper(\@email_urls);

#############################################
#   Read in each email
#############################################
my $stories = {};
foreach my $email_url (@email_urls) {
    my $story = {};
    print "Fetching $email_url\n";
    my $email = conditional_http_simple_get($email_url, $persistent);
    if (!$email) {
        # Bug fix: the original warned here and then fell through, matching undef
        # against the regex below and dying with a misleading "format changed"
        # error.  Skip the unfetchable message instead.
        warn "Unable to fetch $email_url.\n\t";
        next;
    }
    # NOTE(review): reconstructed pattern.  Pipermail message pages look like
    #     <TITLE> subject </TITLE> … <I>date</I> … <PRE>body</PRE>
    # which matches the three captures (title, date, body) the original pulled out.
    # TODO confirm against a live message page.
    if ($email !~ m#<TITLE>\s*(.*?)\s*</TITLE>.*?<I>(.*?)</I>.*?<PRE>(.*?)</PRE>#si) {
        die "HTML format of emails apparently changed\n\t";
    }
    my ($title, $date, $body) = ($1, $2, $3);
    $story->{link} = $email_url;
    $title =~ s/^\s*\[[^\]]*\]\s*//g;       # remove "[politech]"
    $story->{title}       = HTML::Entities::encode_entities($title);
    $story->{description} = HTML::Entities::encode_entities($body);
    $story->{date}        = parsedate($date);
    $stories->{$email_url} = $story;
}

# remove data for stories we haven't looked at this run
clear_stale_conditionalGET_cache($persistent);

#############################################
#   Output RSS file
#############################################
# NOTE(review): the XML tags in both heredocs were stripped from the recovered
# copy; reconstructed as a minimal RSS 0.91 document.  Verify element names and
# order against the previously published politech.rss.
my $rss_path = "/home/interiot/www/rss/feeds/politech.rss";
open my $rss, '>', $rss_path or die "Unable to write to $rss_path: $!";
print {$rss} <<"EOF";
<?xml version="1.0"?>
<rss version="0.91">
<channel>
<title>Newcum's Politech Feed</title>
<link>http://politechbot.com/pipermail/politech/</link>
<description>David Newcum's RSS Feed of politechbot.com.  Contact rss_feeds\@paperlined.org for change requests.</description>
<language>en-us</language>
EOF

foreach my $story (sort {$b->{date} <=> $a->{date}} values %$stories) {
    #next if ($curtime - $story->{date} > 4 * $days);
    my $date = rss_localtime($story->{date});
    print {$rss} <<"EOF";
<item>
<title>$story->{title}</title>
<link>$story->{link}</link>
<description>$story->{description}</description>
<pubDate>$date</pubDate>
</item>
EOF
}

print {$rss} "</channel>\n</rss>\n";
close $rss or die "Error closing $rss_path: $!";    # buffered write errors surface at close