#! /usr/bin/env perl
# Politech pipermail feed scraper: fetches the politechbot.com pipermail
# archive, extracts recent messages, and writes them out as an RSS feed.
#
# NOTE(review): this copy of the file arrived with its line breaks collapsed
# and with at least two spans of text missing (apparently anything containing
# HTML markup was stripped during extraction).  The gaps are marked inline
# below.  Recover the original file before making behavioral changes.
BEGIN{$^W=1}   # enable warnings globally (pre-"use warnings" idiom, same as -w)
use strict;

# Politech seems to be stuck using pipermail now.
# So we'll go ahead and implement a feed scraper based on that now.
#
# Uses approximately 5k of extra politech bandwidth every time it's run, over and above
# the transfer of new messages.

use Data::Dumper;
use File::Basename;
use HTML::Entities;
use Time::ParseDate;

use lib dirname($0);   # look in the same directory that this script resides in
use FeedHelpers;       # Common code, available here: http://paperlined.org/rss/feeds/FeedHelpers.pm
                       # NOTE(review): conditional_http_simple_get(),
                       # clear_stale_conditionalGET_cache(), and
                       # PersistentHash::tie_storable() presumably come from
                       # FeedHelpers -- not visible here, verify.

sub NUM_EMAILS {10}    # how many emails to return in the RSS feed
                       # since we post the full text of each message, which tends to be *quite* large,
                       # we'll only display the last couple emails
                       # NOTE(review): NUM_EMAILS is not referenced anywhere in
                       # the visible portion of this file; presumably used in
                       # the missing email-selection code or the RSS heredoc.

# Load the persistent data (cache for conditional HTTP GETs, keyed by URL --
# presumably; verify against FeedHelpers).
my $persistent = PersistentHash::tie_storable("politech.db");

#############################################
# Choose the last two months
#############################################
# Fetch the archive index page; conditional_http_simple_get() returns false on
# failure (and, presumably, on a 304 not-modified -- TODO confirm, since a 304
# here would abort the whole run via the die below).
my $main_page = conditional_http_simple_get('http://politechbot.com/pipermail/politech/', $persistent)
        or die "Unable to fetch politech pipermail main page.\n\t";

my $days = 24*60*60;          # seconds per day
my $months = 30.41 * $days;   # average seconds per month (365/12 ~= 30.41 days)
my $curtime = time();
my @month_urls;
# The index page links each month's archive as href="YYYY-Monthname/thread.html".
foreach my $monthname ($main_page =~ m#href="(\d\d\d\d-\S+)/thread.html"#sgi) {
    # Turn "2004-July" into "July 1, 2004" so Time::ParseDate can parse it.
    (my $mn = $monthname) =~ s/^(\d+)-(\w+)$/$2 1, $1/;
    my $month = parsedate($mn);
    # Keep roughly the last two and a half months' worth of archives.
    next unless ($curtime - $month < 2.5 * $months);   # we don't want to just look at the latest month,
                                                       # because it might be july 1st, and we wouldn't
                                                       # see the end-of-june posts
    push(@month_urls, "http://politechbot.com/pipermail/politech/$monthname/thread.html");
}
#die Dumper(\@month_urls);

#############################################
# Read in the months
#############################################
my @email_urls;
foreach my $month_url (@month_urls) {
    my $month = conditional_http_simple_get($month_url, $persistent)
            or die "Unable to fetch $month_url.\n\t";
    # Strip the trailing "thread.html" so per-message filenames can be
    # appended to form absolute message URLs.
    $month_url =~ s#thread.html$##;
    # NOTE(review): the regex below is truncated -- its pattern text (which
    # contained HTML anchor markup) was lost in extraction, along with the
    # code that pushes onto @email_urls, closes this loop, and opens the
    # per-email fetch loop.  $email_url, $story, $stories, and the regex that
    # populates $1/$2/$3 below are all declared/bound in that missing span.
    while ($month =~ m#
.*?)#si) {
        # Fail loudly if the scraped page no longer matches the expected
        # markup, rather than silently emitting an empty/garbled feed.
        die "HTML format of emails apparently changed\n\t";
    }
    # Captures from the (missing) message-page regex: subject line, date
    # header, and full message body -- presumably; verify once recovered.
    my ($title, $date, $body) = ($1, $2, $3);
    $story->{link} = $email_url;
    $title =~ s/^\s*\[[^\]]*\]\s*//g;   # remove "[politech]" list-name prefix from the subject
    # Entity-encode so raw HTML in the message is safe inside the RSS XML.
    $story->{title} = HTML::Entities::encode_entities($title);
    $story->{description} = HTML::Entities::encode_entities($body);
    $story->{date} = parsedate($date);
    $stories->{$email_url} = $story;
}

# remove data for stories we haven't looked at this run
clear_stale_conditionalGET_cache($persistent);

#############################################
# Output RSS file
#############################################
# NOTE(review): 2-arg open with a bareword filehandle and a hard-coded
# absolute path -- prefer 3-arg lexical open, and consider making the output
# path configurable.  Left as-is because the heredoc body (and its EOF
# terminator) that follows was also lost in extraction.
open RSS, ">/home/interiot/www/rss/feeds/politech.rss" or die "Unable to write to /home/interiot/www/rss/feeds/politech.rss: $!";
print RSS <<"EOF";