http://paperlined.org/rss/feeds/andrewsullivan.gen.pl

#! /usr/bin/env perl
BEGIN{$^W=1}  use strict;

# AndrewSullivan.com has no RSS feed, and the only existing feed I can find doesn't have dates, even though dates (down to the second) are available on the website.
# 
# Obviously this needs to be rectified.


use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';

use Compress::Zlib;
use Data::Dumper;
use Date::Parse;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;


#############################################
# Fetch and Scrape
#############################################
# andrewsullivan.com doesn't support conditional-GETs, and the headers are as cache-unfriendly as possible.
# Doesn't bother me, it's not my bandwidth bill...
# Download the homepage.  LWP::Simple::get() returns undef on failure and does
# not set $!, so the error message deliberately carries no system error text.
my $page = LWP::Simple::get("http://www.andrewsullivan.com/")
	or die "Unable to fetch andrewsullivan.com's homepage\n";

print "Fetched ", length($page), " bytes from andrewsullivan.com\n";

# Zero or more HTML tags, with zero or more whitespace characters (space, tab, newline, etc) between them, or at the beginning or the end.
# Note that this specifically excludes any plaintext from being matched by this.
# (Currently not interpolated into the scraping pattern below.)
my $arbitrary_tags = '\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*>\s*)*';

# One hash per story, in page order, with keys TITLE, BODY, LINK and DATE.
my @stories;


# Text of the most recent day heading seen; story timestamps only carry a
# time of day, so each one is combined with this to form a full date.
my $current_day;

# Walk the page once.  Each match is either a day heading (capture 1) or a
# complete story (captures 2-5); the two alternatives never both capture.
while ($page =~ m!
			<span\s+CLASS="art_title">([^<]*\d\d,\s+\d\d\d\d)</span>		# start of a new day's posts
		|
			<SPAN\s+CLASS="inc_subtitle">([^<]*)</span>				# title
			(.*?)									# body
			<A\s+HREF="(index\.php\?dish_inc=archives/[^"]*)">			# link
			<SPAN\s+CLASS="inc_source">-\s*([^<]*)</SPAN>				# time
		!xsig)
{
	# Copy the captures straight away so later regex operations cannot clobber them.
	my ($day, $title, $body, $link, $time) = ($1, $2, $3, $4, $5);

	if (defined $day) {
		$current_day = $day;
		next;
	}

	# Session IDs make the permalink differ on every fetch; strip them.
	$link =~ s#\&PHPSESSID=[a-z0-9]+##i;

	push(@stories, {
		TITLE		=> $title,
		BODY		=> $body,
		LINK		=> $link,
		# A story that appears before any day heading has only its time of day.
		DATE		=> (defined $current_day ? $current_day . ' ' . $time : $time),
	});

	print "Parsed story #", scalar(@stories), " (", $title, ")\n";
}

#print Dumper(\@stories); exit(0);


#############################################
# Output RSS file
#############################################
# Write the scraped stories out as an RSS 1.0 (RDF) document.
#
# Reads @stories (hashes with TITLE, BODY, LINK and DATE keys) and overwrites
# the feed file named in $rss_path.  Dies if the file cannot be opened or
# fully written.

# Local UTC offset as a W3CDTF zone designator ("+hh:mm" or "-hh:mm"), the form
# <dc:date> expects.  The sign is taken from the whole offset and the hours are
# zero-padded, so an offset of -5h is rendered "-05:00" rather than "-5:00".
my $tz_offset = Time::Zone::tz_local_offset();
my $tz_minutes = int(abs($tz_offset) / 60);
my $timezone = sprintf("%s%02d:%02d", ($tz_offset < 0 ? '-' : '+'), int($tz_minutes / 60), $tz_minutes % 60);

my $rss_path = "/home/interiot/www/rss/feeds/andrewsullivan.rss";
open(my $rss, '>', $rss_path)	or die "Unable to write to $rss_path: $!";
print {$rss} <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>

<rdf:RDF
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns="http://purl.org/rss/1.0/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
>
        <channel rdf:about="http://paperlined.org/rss/feeds/andrewsullivan.rss">
                <title>Newcum's AndrewSullivan Feed</title>
                <link>http://www.andrewsullivan.com/</link>
                <description>David Newcum's RSS Feed of andrewsullivan.com.  Contact rss_feeds\@paperlined.org for change requests.</description>
                <language>en-us</language>
        </channel>
EOF

foreach my $story (@stories)
{
	# str2time() returns undef when it cannot parse the string; in that case
	# the <dc:date> element is left out instead of emitting a bogus 1970 date.
	my $perl_date = Date::Parse::str2time($story->{DATE});
	my $rss_date;
	if (defined $perl_date) {
		$rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($perl_date)) . $timezone;
	} else {
		warn "Unable to parse date '$story->{DATE}'; omitting <dc:date>\n";
	}

	my $title_text = $story->{TITLE};
	$title_text =~ s#:\s*$##;
	$title_text =~ s#([\w']+)#\u\L$1#g;			# Uppercase just the first letter of each word
	my $title = encode_entities($title_text, '<>&');

	my $link = encode_entities("http://www.andrewsullivan.com/" . $story->{LINK}, '<>&');

	my $body = $story->{BODY};
	$body =~ s/\starget\s*=\s*["']?_blank["']?//g;		# no target=_blank !!!!
	$body = trim($body);
	$body =~ s/\xA0/ /g;					# non-breaking spaces become plain spaces
	# Only <, > and & need escaping in XML element content.  Previously '<>&'
	# was handed to trim() by mistake, so encode_entities() ran with its default
	# character set and produced HTML-only named entities (&nbsp;, &eacute;, ...)
	# that are undefined in XML.
	$body = encode_entities($body, '<>&');
	# Remaining non-ASCII bytes are written as numeric character references,
	# which any XML parser accepts.
	# NOTE(review): like the old default encoding, this treats each byte as a
	# Latin-1 code point -- confirm against the site's actual charset.
	$body =~ s/([^\x00-\x7F])/'&#' . ord($1) . ';'/ge;

	print {$rss} <<"EOF";
         <item rdf:about="$link">
                 <title>$title</title>
                 <link>$link</link>
                 <description>$body</description>
                 <dc:creator>Andrew Sullivan</dc:creator>
EOF
	print {$rss} "                 <dc:date>$rss_date</dc:date>\n"	if defined $rss_date;
	print {$rss} "         </item>\n";
}
print {$rss} "</rdf:RDF>\n";

# Buffered write errors (disk full, quota) only surface at close time.
close($rss)	or die "Unable to finish writing $rss_path: $!";


# Return a copy of the first argument with leading and trailing whitespace
# removed.  Any further arguments are ignored.
sub trim {
	my ($text) = @_;
	$text =~ s/^\s+//;
	$text =~ s/\s+$//;
	return $text;
}

Generated by GNU enscript 1.6.4.