# Source: http://paperlined.org/rss/feeds/politech.gen.pl
#! /usr/bin/env perl
BEGIN{$^W=1} use strict;
# Politech seems to be stuck using pipermail now.
# So we'll go ahead and implement a feed scraper based on that now.
#
# Uses approximately 5k of extra politech bandwidth every time it's run, over and above
# the transfer of new messages.
use Data::Dumper;
use File::Basename;
use HTML::Entities;
use Time::ParseDate;
use lib dirname($0); # look in the same directory that this script resides in
use FeedHelpers; # Common code, available here: http://paperlined.org/rss/feeds/FeedHelpers.pm
# How many emails to return in the RSS feed.
# Each item carries the full text of its message, which tends to be *quite*
# large, so only the last couple of emails are published.
sub NUM_EMAILS { return 10 }
# Load the persistent data.  This handle is passed to every
# conditional_http_simple_get() call below and to
# clear_stale_conditionalGET_cache() once all fetching is done.
my $persistent = PersistentHash::tie_storable("politech.db");

#############################################
# Choose the last two months
#############################################
my $main_page = conditional_http_simple_get('http://politechbot.com/pipermail/politech/', $persistent)
    or die "Unable to fetch politech pipermail main page.\n\t";

my $days    = 24*60*60;         # seconds in a day
my $months  = 30.41 * $days;    # seconds in an average-length month
my $curtime = time();

# Collect the thread-index URL of every archive month that started less than
# 2.5 months ago.
my @month_urls;
foreach my $monthname ($main_page =~ m#href="(\d\d\d\d-\S+)/thread\.html"#sgi) {
    # Directory names have the form "<year>-<word>" (presumably pipermail's
    # "2005-July" style); rewrite to "<word> 1, <year>" so parsedate() can
    # turn it into the epoch time of the first of that month.
    (my $mn = $monthname) =~ s/^(\d+)-(\w+)$/$2 1, $1/;
    my $month = parsedate($mn);

    # parsedate() yields undef when it cannot understand the string; skip such
    # entries instead of feeding undef into the arithmetic below.
    next unless defined $month;

    # we don't want to just look at the latest month, because it might be
    # july 1st, and we wouldn't see the end-of-june posts
    next unless ($curtime - $month < 2.5 * $months);

    push(@month_urls, "http://politechbot.com/pipermail/politech/$monthname/thread.html");
}
#die Dumper(\@month_urls);
#############################################
# Read in the months
#############################################
# Fetch each selected month's thread index and collect the absolute URL of
# every message it links to.
my @email_urls;
foreach my $month_url (@month_urls) {
    my $month = conditional_http_simple_get($month_url, $persistent)
        or die "Unable to fetch $month_url.\n\t";

    # Message links in the index are relative to the month directory, so drop
    # the index filename to get the base URL.  Work on a copy: the foreach
    # variable is an alias, and substituting on it directly would rewrite the
    # elements of @month_urls.
    (my $month_base = $month_url) =~ s#thread\.html$##;

    # Each message appears as <li><a href="NNNNNN.html"> (six-digit number).
    while ($month =~ m#<li><a href="(\d{6}\.html)">#gsi) {
        push(@email_urls, $month_base . $1);
    }
}
# Extract the numeric message id from a message URL or filename:
# strips a trailing ".html", strips everything up to the last "/", and returns
# what is left as an integer (leading zeros disappear).
sub mail_num {
    my ($url) = @_;
    (my $name = $url) =~ s/\.html$//;
    $name =~ s#^.*/##;
    return int($name);
}
# Find the highest message number among the collected URLs...
my $max_mail_num = 0;
for my $url (@email_urls) {
    my $num = mail_num($url);
    $max_mail_num = $num if $num > $max_mail_num;
}

# ...and keep only the URLs whose number lies within NUM_EMAILS of that maximum.
@email_urls = grep { mail_num($_) > $max_mail_num - NUM_EMAILS() } @email_urls;
#die Dumper(\@email_urls);
#############################################
# Read in each email
#############################################
# Maps message URL => { link, title, description, date } for every message
# that was fetched and parsed successfully.
my $stories = {};
foreach my $email_url (@email_urls) {
    print "Fetching $email_url\n";

    # A failed fetch is not fatal: report it and move on to the next message.
    # (Without the explicit skip, the false value would fall through to the
    # pattern match below and abort the run with a misleading
    # "HTML format ... changed" error.)
    my $email = conditional_http_simple_get($email_url, $persistent);
    unless ($email) {
        warn "Unable to fetch $email_url.\n\t";
        next;
    }

    # Pull out the subject (<h1>), the date (first <i> after it) and the
    # message body (first <pre>...</pre> after that, tags included).
    my ($title, $date, $body) =
        $email =~ m#<h1>(.*?)</h1>.*?<i>(.*?)</i>.*?(<pre>.*?</pre>)#si
        or die "HTML format of emails apparently changed\n\t";

    $title =~ s/^\s*\[[^\]]*\]\s*//g; # remove "[politech]"

    # Title and body are entity-encoded before being placed in the feed XML.
    $stories->{$email_url} = {
        link        => $email_url,
        title       => HTML::Entities::encode_entities($title),
        description => HTML::Entities::encode_entities($body),
        date        => parsedate($date),
    };
}

# remove data for stories we haven't looked at this run
clear_stale_conditionalGET_cache($persistent);
#############################################
# Output RSS file
#############################################
# Writes the channel header followed by one <item> per collected story,
# newest first.  Uses a lexical filehandle with three-argument open, and
# checks close() so that buffered write errors (e.g. a full disk) are reported
# instead of silently producing a truncated feed.
my $rss_path = "/home/interiot/www/rss/feeds/politech.rss";
open(my $rss, '>', $rss_path) or die "Unable to write to $rss_path: $!";

print {$rss} <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
>
<channel rdf:about="http://paperlined.org/rss/feeds/politech.rss">
<title>Newcum's Politech Feed</title>
<link>http://politechbot.com/pipermail/politech/</link>
<description>David Newcum's RSS Feed of politechbot.com. Contact rss_feeds\@paperlined.org for change requests.</description>
<language>en-us</language>
</channel>
EOF

# Newest story first, ordered by the epoch time stored in each story's "date".
foreach my $story (sort {$b->{date} <=> $a->{date}} values %$stories)
{
    #next if ($curtime - $story->{date} > 4 * $days);
    my $date = rss_localtime($story->{date});
    print {$rss} <<"EOF";
<item rdf:about="$story->{link}">
<title>$story->{title}</title>
<link>$story->{link}</link>
<description>$story->{description}</description>
<dc:date>$date</dc:date>
</item>
EOF
}

print {$rss} "</rdf:RDF>\n";
close($rss) or die "Unable to finish writing $rss_path: $!";
# Generated by GNU enscript 1.6.4.