# Source: http://paperlined.org/rss/feeds/instapundit.gen.pl

#! /usr/bin/perl
BEGIN{$^W=1}
use strict;

# Instapundit's RSS stories
# 	1) are truncated
# 	2) have all HTML markup removed
#
# This just restores the full description to the .rss feed, and should leave the rest of it alone.
# 
# The story is only read the first time it shows up on the official .rss, to save bandwidth
# 	(actually, doing it otherwise would use a huge amount of bandwidth, but would catch any story updates)

use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';
use lib '/home/jerry/perllib/lib/site_perl/5.005/i386-linux/';

use LWP::Simple;
use Storable;
use XML::RSS;


#########################################
# Load the persistent data
#########################################
# Storable file holding the full story descriptions from earlier runs,
# keyed by story link.
my $persistent_filename = "/home/interiot/www/rss/feeds/instapundit.db";

# Reuse the stored descriptions when the file exists; otherwise begin with
# an empty cache.
my $persistent = (-e $persistent_filename)
	? Storable::retrieve($persistent_filename)
	: {};


#########################################
# Load the official RSS feed
#########################################
# Download the official feed.  LWP::Simple::get() signals failure only by
# returning undef; $! is not part of its interface, so it is deliberately
# left out of the error message.
my $page = LWP::Simple::get("http://www.instapundit.com/index.xml")
	or die "Unable to fetch instapundit's index.xml\n\t";

print "Fetched ", length($page), " bytes from Instapundit\n";

my $rss_obj = XML::RSS->new(version => '1.0');

# Parse the feed with -w warnings switched off for the duration of the call.
# The eval's result is checked: if parsing failed and the run carried on with
# a feed that has no items, the later sections would delete every cached
# description and overwrite the published feed.
my $parsed_ok = eval {
	local $^W = 0;
	no strict;
	$rss_obj->parse($page);
	1;
};
if (! $parsed_ok) {
	my $reason = $@ || "unknown error";
	die "Unable to parse instapundit's index.xml: $reason\n\t";
}


#########################################
# Use a previously-fetched description
# where possible, or a newly-fetched
# description where necessary.
#########################################
# Every link present in today's feed; used afterwards to decide which cached
# descriptions are still needed.
my %links_seen;
foreach my $item (@{ $rss_obj->{items} })
{
	my $link = $item->{link};
	$links_seen{$link}++;

	# Only fetch the story page the first time its link is seen.
	if (! exists $persistent->{$link}) {
		# LWP::Simple::get() reports failure only by returning undef, so $!
		# is not included in the message.
		my $link_page = LWP::Simple::get($link)
			or die "Unable to fetch $link\n\t";

		print "Fetched ", length($link_page), " bytes from $link\n";

		# The story body sits between the empty <h3></h3> heading and the
		# footer <div>.
		if ($link_page =~ m#<h3></h3>\s*(.*?)\s+<div id="footer">#mis) {
			$persistent->{$link} = $1;
		} else {
			die "Page format has changed.\n\t";
		}
	}

	$item->{description} = $persistent->{$link};

	# Come up with a pseudo-title since Instapundit doesn't have any (???)
	$item->{title} = pseudo_title($item->{description});

	print $item->{title}, "\n";
}

# Build a stand-in title from a story's HTML description.
#
# Takes the description text and returns the run of capitalised words that
# opens the story (everything before the first lowercase letter), with the
# start of the following sentence trimmed off and each word title-cased.
# Returns "No Title" when nothing is left (for example a story that starts
# with a lowercase letter); the original author noted that XML::RSS needs a
# title to be able to write an item out.
sub pseudo_title {
	my ($description) = @_;
	my $title = defined $description ? $description : '';

	$title =~ s#<[^>]*?>##sg;		# Take out any tags
	$title =~ s#[a-z].*##s;			# Chop off anything other than the first capital letters
	$title =~ s#\s+$##s;			# Chop off trailing/beginning spaces
	$title =~ s#^\s+##s;
	$title =~ s#\W.$##s;			# Chop the start of the next sentence
	$title =~ s#[:,']\s*$##s;		# Chop trailing punctuation left behind by the previous step
	$title =~ s#([\w']+)#\u\L$1#g;		# Uppercase just the first letter of each word

	$title = "No Title" unless length $title;
	return $title;
}


#########################################
# Retire old persistent data
#########################################
# Drop, in one hash-slice delete, every cached description whose link did
# not appear in the feed during this run.
delete @{$persistent}{ grep { ! exists $links_seen{$_} } keys %$persistent };


#########################################
# Write out persistent data
#########################################
# Save the description cache in Storable's network byte order so the next
# run can pick it up; abort if the store call reports failure.
unless (Storable::nstore($persistent, $persistent_filename)) {
	die "Unable to write to $persistent_filename: $!\n\t";
}


#########################################
# Write out updated .rss file
#########################################
# Write the rebuilt feed as RSS 1.0, with -w warnings switched off for the
# duration of the call.  The eval's result is checked so that a failed save
# stops the script with an error instead of exiting as if the feed had been
# written.
my $rss_filename = "/home/interiot/www/rss/feeds/instapundit.rss";
my $saved_ok = eval {
	local $^W = 0;
	no strict;
	$rss_obj->{output} = '1.0';
	$rss_obj->save($rss_filename);
	1;
};
if (! $saved_ok) {
	my $reason = $@ || "unknown error";
	die "Unable to write $rss_filename: $reason\n\t";
}

# Generated by GNU enscript 1.6.4.