# Source: http://paperlined.org/rss/feeds/politech.gen.pl

#! /usr/bin/env perl
BEGIN{$^W=1}  use strict;

# Politech seems to be stuck using pipermail now.
# So we'll go ahead and implement a feed scraper based on that now.
#
# Uses approximately 5k of extra politech bandwidth every time it's run, over and above 
# the transfer of new messages.

use Data::Dumper;
use File::Basename;
use HTML::Entities;
use Time::ParseDate;

use lib dirname($0);		# look in the same directory that this script resides in
use FeedHelpers;		# Common code, available here:  http://paperlined.org/rss/feeds/FeedHelpers.pm

# How many emails to return in the RSS feed.
# The full text of each message is posted, and messages tend to be *quite* large,
# so only the last few emails are included.
sub NUM_EMAILS { return 10 }


# Load the persistent data.
# $persistent is passed to every conditional_http_simple_get() call below and to
# clear_stale_conditionalGET_cache() once all emails have been read.
# NOTE(review): "politech.db" is a relative path, so it presumably resolves against the
# current working directory rather than the script's own directory -- confirm that this
# is what is wanted when the script is run from elsewhere (e.g. cron).
my $persistent = PersistentHash::tie_storable("politech.db");


#############################################
# Choose the last two months
#############################################
# Fetch the archive index page; it links to one "<year>-<Month>/thread.html" per month.
my $main_page = conditional_http_simple_get('http://politechbot.com/pipermail/politech/', $persistent)
	or die "Unable to fetch politech pipermail main page.\n\t";

# Durations, in seconds.
my $days = 24*60*60;
my $months = 30.41 * $days;

my $curtime = time();

# Keep only the monthly archives that started less than 2.5 months ago.
# We don't want to just look at the latest month, because it might be july 1st,
# and we wouldn't see the end-of-june posts.
my @month_urls;
my @archive_names = ($main_page =~ m#href="(\d\d\d\d-\S+)/thread.html"#sgi);
for my $monthname (@archive_names) {
	# Turn e.g. "2005-June" into "June 1, 2005" so that parsedate() can read it.
	my $first_of_month = $monthname;
	$first_of_month =~ s/^(\d+)-(\w+)$/$2 1, $1/;

	my $month_start = parsedate($first_of_month);
	if ($curtime - $month_start < 2.5 * $months) {
		push @month_urls, "http://politechbot.com/pipermail/politech/$monthname/thread.html";
	}
}
#die Dumper(\@month_urls);


#############################################
# Read in the months
#############################################
# Collect the absolute URL of every message listed on each selected month's thread index.
my @email_urls;
for my $month_url (@month_urls) {
	my $month = conditional_http_simple_get($month_url, $persistent)
		or die "Unable to fetch $month_url.\n\t";

	# Drop the trailing "thread.html" so the remainder can serve as the base for message links.
	# (The loop variable aliases the array element, so @month_urls itself is updated as well.)
	$month_url =~ s#thread.html$##;

	# Message pages are linked as six-digit file names, e.g. "000123.html".
	my @message_files = ($month =~ m#<li><a href="(\d{6}.html)">#gsi);
	push @email_urls, map { $month_url . $_ } @message_files;
}

# Return the numeric message id embedded in a message URL:
# strips a trailing ".html" and everything up to the last "/", then converts the rest with int().
sub mail_num {
	my ($url) = @_;
	$url =~ s/\.html$//;
	$url =~ s#^.*/##;
	return int($url);
}
# Find the highest message number among all collected URLs.
my $max_mail_num = 0;
for my $url (@email_urls) {
	my $num = mail_num($url);
	$max_mail_num = $num if $num > $max_mail_num;
}

# Keep only the messages whose number is within NUM_EMAILS of the highest one.
@email_urls = grep { mail_num($_) > $max_mail_num - NUM_EMAILS() } @email_urls;

#die Dumper(\@email_urls);


#############################################
# Read in each email
#############################################
# Fetch each selected email page and pull out its subject, date and body.
# $stories maps the email URL to a hash with the keys:
#   link        - the email URL
#   title       - subject with the leading "[list]" tag removed, entity-encoded
#   description - the <pre>...</pre> body, entity-encoded
#   date        - result of parsedate() on the date string shown on the page
my $stories = {};
foreach my $email_url (@email_urls) {
	my $story = {};
	print "Fetching $email_url\n";
	my $email = conditional_http_simple_get($email_url, $persistent);
	unless ($email) {
		# A failed fetch only deserves a warning: skip this one message.  Falling through
		# would run the format check below on an empty page and abort the whole feed with
		# a misleading "HTML format of emails apparently changed" error.
		warn "Unable to fetch $email_url.\n\t";
		next;
	}

	# Expected layout: <h1>subject</h1> ... <i>date</i> ... <pre>body</pre>
	if ($email !~ m#<h1>(.*?)</h1>.*?<i>(.*?)</i>.*?(<pre>.*?</pre>)#si) {
		die "HTML format of emails apparently changed\n\t";
	}

	my ($title, $date, $body) = ($1, $2, $3);

	$story->{link} = $email_url;

	$title =~ s/^\s*\[[^\]]*\]\s*//g;		# remove "[politech]"

	# NOTE(review): encode_entities() with no second argument may emit HTML named entities
	# (e.g. &eacute;), which are not predefined in XML -- confirm feed readers accept them.
	$story->{title} = HTML::Entities::encode_entities($title);

	$story->{description} = HTML::Entities::encode_entities($body);

	# Scalar assignment on purpose, so parsedate() is called in scalar context.
	$story->{date} = parsedate($date);

	$stories->{$email_url} = $story;
}

# remove data for stories we haven't looked at this run
clear_stale_conditionalGET_cache($persistent);


#############################################
# Output RSS file
#############################################
# Write the RSS 1.0 (RDF) feed: a fixed channel header followed by one <item> per story,
# newest first.  A lexical filehandle with three-argument open is used so the path can never
# be interpreted as part of the open mode, and close() is checked because errors from
# buffered writes (e.g. a full disk) are only reported when the handle is closed.
my $rss_path = "/home/interiot/www/rss/feeds/politech.rss";
open(my $rss_fh, '>', $rss_path)	or die "Unable to write to $rss_path: $!";
print {$rss_fh} <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns="http://purl.org/rss/1.0/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
>
        <channel rdf:about="http://paperlined.org/rss/feeds/politech.rss">
                <title>Newcum's Politech Feed</title>
                <link>http://politechbot.com/pipermail/politech/</link>
                <description>David Newcum's RSS Feed of politechbot.com.  Contact rss_feeds\@paperlined.org for change requests.</description>
                <language>en-us</language>
        </channel>
EOF

# Newest story first, ordered by the numeric date stored for each story.
foreach my $story (sort {$b->{date} <=> $a->{date}} values %$stories)
{
	#next if ($curtime - $story->{date} > 4 * $days);
	my $date = rss_localtime($story->{date});
	print {$rss_fh} <<"EOF";
        <item rdf:about="$story->{link}">
                <title>$story->{title}</title>
                <link>$story->{link}</link>
                <description>$story->{description}</description>
                <dc:date>$date</dc:date>
        </item>
EOF
}
print {$rss_fh} "</rdf:RDF>\n";
close($rss_fh)	or die "Unable to finish writing $rss_path: $!";

# Generated by GNU enscript 1.6.4.