# http://paperlined.org/rss/feeds/andrewsullivan.gen.pl
#! /usr/bin/env perl
BEGIN{$^W=1} use strict;
# AndrewSullivan.com has no RSS feed, and the only existing feed I can find doesn't have dates, even though dates (down to the second) are available on the website.
#
# Obviously this needs to be rectified.
use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';
use Compress::Zlib;
use Data::Dumper;
use Date::Parse;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;
#############################################
# Fetch and Scrape
#############################################
# andrewsullivan.com doesn't support conditional-GETs, and the headers are as cache-unfriendly as possible.
# Doesn't bother me, it's not my bandwidth bill...
# Download the homepage as one string.
# LWP::Simple::get() returns undef on any failure and does not report the
# reason through $!, so the error message deliberately does not include $!
# (it would show whatever unrelated system error happened to be set last).
my $page = LWP::Simple::get("http://www.andrewsullivan.com/");
die "Unable to fetch the andrewsullivan.com homepage\n"
    unless defined $page && length $page;
print "Fetched ", length($page), " bytes from andrewsullivan.com\n";
# Zero or more HTML tags, with zero or more whitespace characters (space, tab,
# newline, etc) between them, or at the beginning or the end.
# Note that this specifically excludes any plaintext from being matched by this.
# NOTE(review): nothing in the scraping loop below references this pattern.
my $arbitrary_tags = '\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*>\s*)*';

# One hash ref per scraped post, with the keys TITLE, BODY, LINK and DATE.
my @stories;

# Text of the most recent day header matched so far; undef until the first one.
my $current_day;

# Walk the page with a two-way alternation: the first alternative captures a
# day header into $1; the second captures one post's title, body, link and
# time into $2..$5.  Matching is case-insensitive and "." spans newlines.
while ($page =~ m!
    <span\s+CLASS="art_title">([^<]*\d\d,\s+\d\d\d\d)</span>    # start of a new day's posts
    |
    <SPAN\s+CLASS="inc_subtitle">([^<]*)</span>                 # title
    (.*?)                                                       # body
    <A\s+HREF="(index.php\?dish_inc=archives/[^"]*)">           # link
    <SPAN\s+CLASS="inc_source">-\s*([^<]*)</SPAN>               # time
    !xsig)
{
    # Test definedness rather than truth to decide which alternative matched.
    if (defined $1) {
        $current_day = $1;
        next;
    }

    # Copy the captures immediately; the substitution below resets them.
    my ($title, $body, $link, $time) = ($2, $3, $4, $5);

    # A post found before any day header has no day to attach.  Keep the time
    # on its own instead of concatenating an undefined value.
    my $date = $time;
    if (defined $current_day) {
        $date = $current_day . ' ' . $time;
    }
    else {
        warn "Story \"$title\" appears before any day header; its date has no day part\n";
    }

    # Strip the per-visit PHP session id from the link.
    $link =~ s#\&PHPSESSID=[a-z0-9]+##i;

    push(@stories, {
        TITLE => $title,
        BODY  => $body,
        LINK  => $link,
        DATE  => $date,
    });
    print "Parsed story #", scalar(@stories), " (", $stories[-1]{TITLE}, ")\n";
}
#print Dumper(\@stories); exit(0);
#############################################
# Output RSS file
#############################################
# Destination of the generated feed.
my $rss_path = "/home/interiot/www/rss/feeds/andrewsullivan.rss";

# Format an offset from UTC, given in seconds, as a W3CDTF time zone
# designator: a sign followed by zero-padded hours and minutes, e.g. -18000
# becomes "-05:00" and 19800 becomes "+05:30".  The sign is handled separately
# so that offsets between -1h and 0 keep their minus sign.
my $format_tz_offset = sub {
    my ($offset_seconds) = @_;
    my $sign    = $offset_seconds < 0 ? '-' : '+';
    my $minutes = int(abs($offset_seconds) / 60);
    return sprintf("%s%02d:%02d", $sign, int($minutes / 60), $minutes % 60);
};

open my $rss, '>', $rss_path or die "Unable to write to $rss_path: $!";

# Feed header: the RDF root element and the channel description.
print {$rss} <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
>
<channel rdf:about="http://paperlined.org/rss/feeds/andrewsullivan.rss">
<title>Newcum's AndrewSullivan Feed</title>
<link>http://www.andrewsullivan.com/</link>
<description>David Newcum's RSS Feed of andrewsullivan.com. Contact rss_feeds\@paperlined.org for change requests.</description>
<language>en-us</language>
</channel>
EOF

# One <item> per scraped story.
foreach my $story (@stories) {
    $story->{TITLE} =~ s#:\s*$##;
    $story->{TITLE} =~ s#([\w']+)#\u\L$1#g; # Uppercase just the first letter of each word
    my $title = encode_entities($story->{TITLE}, '<>&');
    my $link  = encode_entities("http://www.andrewsullivan.com/" . $story->{LINK}, '<>&');

    # Build the date element only when the scraped date can be parsed.
    # Otherwise the item would be stamped with the 1970 epoch.
    my $perl_date = defined $story->{DATE}
        ? Date::Parse::str2time($story->{DATE})
        : undef;
    my $date_element = '';
    if (defined $perl_date) {
        # Use the local UTC offset in force at the story's own time, so the
        # designator agrees with localtime() across daylight-saving changes.
        my $rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($perl_date))
                     . $format_tz_offset->(Time::Zone::tz_local_offset($perl_date));
        $date_element = "<dc:date>$rss_date</dc:date>\n";
    }
    else {
        warn "Unable to parse a date for story \"$story->{TITLE}\"; omitting dc:date\n";
    }

    $story->{BODY} =~ s/\starget\s*=\s*["']?_blank["']?//g; # no target=_blank !!!!

    # The unsafe-character set belongs to encode_entities(), not trim().
    # Without it every default-unsafe character is encoded, producing named
    # HTML entities that XML does not define.  Characters outside '<>&' are
    # now passed through unchanged, as is already done for the title and link.
    my $body = encode_entities(trim($story->{BODY}), '<>&');

    # NOTE(review): as written here this replaces a space with a space, i.e. it
    # does nothing.  The pattern may have been altered when this listing was
    # produced; confirm against the original script.
    $body =~ s/\ / /g; # No freaking clue why this shows up

    # $date_element carries its own trailing newline, or is empty.
    print {$rss} <<"EOF";
<item rdf:about="$link">
<title>$title</title>
<link>$link</link>
<description>$body</description>
<dc:creator>Andrew Sullivan</dc:creator>
$date_element</item>
EOF
}

print {$rss} "</rdf:RDF>\n";

# Buffered write errors only surface when the handle is closed.
close $rss or die "Unable to finish writing $rss_path: $!";
# Return a copy of the first argument with leading and trailing whitespace
# removed.  The caller's variable is left untouched; extra arguments are ignored.
sub trim {
    my $text = shift;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}
# Generated by GNU enscript 1.6.4.