http://paperlined.org/rss/feeds/instapundit.gen.pl
#! /usr/bin/perl
BEGIN{$^W=1}
use strict;
# Instapundit's RSS stories
# 1) are truncated
# 2) have all HTML markup removed
#
# This just restores the full description to the .rss feed, and should leave the rest of it alone.
#
# The story is only read the first time it shows up on the official .rss, to save bandwidth
# (actually, doing it otherwise would use a huge amount of bandwidth, but would catch any story updates)
use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';
use lib '/home/jerry/perllib/lib/site_perl/5.005/i386-linux/';
use LWP::Simple;
use Storable;
use XML::RSS;
#########################################
# Load the persistent data
#########################################
# The cache maps a story link to the full story body fetched on an earlier
# run. Start from the saved copy when one exists on disk, otherwise from an
# empty hash.
my $persistent_filename = "/home/interiot/www/rss/feeds/instapundit.db";
my $persistent = (-e $persistent_filename)
    ? Storable::retrieve($persistent_filename)
    : {};
#########################################
# Load the official RSS feed
#########################################
# Download the official feed. LWP::Simple::get() reports failure only by
# returning undef; $! is unrelated to HTTP errors, so it is left out of the
# message.
my $page = LWP::Simple::get("http://www.instapundit.com/index.xml");
(defined($page) && length($page))
    or die "Unable to fetch instapundit's index.xml\n\t";
print "Fetched ", length($page), " bytes from Instapundit\n";

my $rss_obj = XML::RSS->new(version => '1.0');

# Warnings are switched off for the duration of the parse (presumably to
# silence noise from inside XML::RSS -- confirm). A parse failure must stop
# the script: carrying on with zero items would expire every cached story
# and overwrite the published feed with an empty one.
eval {
    local $^W = 0;
    no strict;
    $rss_obj->parse($page);
    1;
} or die "Unable to parse instapundit's index.xml: " . ($@ || "unknown error") . "\n\t";
#########################################
# Use a previously-fetched description
# where possible, or a newly-fetched
# description where necessary.
#########################################
# Every link present in the current official feed is recorded here; the
# cache-expiry pass later in the script reads it.
my %links_seen;
foreach my $item (@{ $rss_obj->{items} })
{
    my $link = $item->{link};

    # An item without a link can be neither fetched nor cached, so it is
    # left exactly as the official feed delivered it.
    next unless defined($link) && length($link);
    $links_seen{$link}++;

    # Fetch the story page only the first time its link is seen; afterwards
    # the cached body is reused to save bandwidth.
    if (! exists $persistent->{$link}) {
        # LWP::Simple::get() reports failure only by returning undef; $! is
        # unrelated to HTTP errors, so it is left out of the message.
        my $link_page = LWP::Simple::get($link);
        (defined($link_page) && length($link_page))
            or die "Unable to fetch $link\n\t";
        print "Fetched ", length($link_page), " bytes from $link\n";

        # The story body is whatever sits between the empty <h3></h3> pair
        # and the footer div.
        if ($link_page =~ m#<h3></h3>\s*(.*?)\s+<div id="footer">#mis) {
            $persistent->{$link} = $1;
        } else {
            die "Page format has changed.\n\t";
        }
    }

    $item->{description} = $persistent->{$link};

    # Instapundit does not supply titles, so one is derived from the story.
    $item->{title} = _pseudo_title($item->{description});
    print $item->{title}, "\n";
}

# Build a pseudo-title from a story's HTML body.
#
# Takes the story HTML and returns the leading run of capitalised text,
# converted so that only the first letter of each word is uppercase.
# Returns "No Title" when the story does not start with capitals; the
# original author noted that XML::RSS::as_rss_1_0() needs a title in order
# to write the item out.
sub _pseudo_title {
    my ($title) = @_;

    $title =~ s#<[^>]*?>##sg;           # Take out any tags
    $title =~ s#[a-z].*##s;             # Chop off anything other than the first capital letters
    $title =~ s#\s+$##s;                # Chop off trailing/beginning spaces
    $title =~ s#^\s+##s;
    $title =~ s#\W.$##s;                # Chop the start of the next sentence (its lone first capital)
    $title =~ s#[:,']\s*$##s;           # Chop punctuation left dangling at the end
    $title =~ s#([\w']+)#\u\L$1#g;      # Uppercase just the first letter of each word

    $title = "No Title" if $title eq '';
    return $title;
}
#########################################
# Retire old persistent data
#########################################
# Drop every cached story whose link no longer appears in the official feed,
# so the cache does not grow without bound.
delete $persistent->{$_}
    for grep { ! exists $links_seen{$_} } keys %$persistent;
#########################################
# Write out persistent data
#########################################
# Save the cache in network byte order; abort if it cannot be written.
unless (Storable::nstore($persistent, $persistent_filename)) {
    die "Unable to write to $persistent_filename: $!\n\t";
}
#########################################
# Write out updated .rss file
#########################################
# Emit the rewritten feed as RSS 1.0. Warnings are switched off while
# XML::RSS runs (presumably to silence noise from inside the module --
# confirm). Any failure while saving is reported instead of being silently
# discarded, so a stale or missing feed does not go unnoticed.
eval {
    local $^W = 0;
    no strict;
    $rss_obj->{output} = '1.0';
    $rss_obj->save("/home/interiot/www/rss/feeds/instapundit.rss");
    1;
} or die "Unable to write instapundit.rss: " . ($@ || "unknown error") . "\n\t";
Generated by GNU enscript 1.6.4.