#! /usr/bin/env perl
BEGIN{$^W=1} use strict;
# AndrewSullivan.com has no RSS feed, and the only existing feed I can find doesn't have dates, even though dates (down to the second) are available on the website.
#
# Obviously this needs to be rectified.
use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';
use Compress::Zlib;
use Data::Dumper;
use Date::Parse;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;
#############################################
# Fetch and Scrape
#############################################
# andrewsullivan.com doesn't support conditional-GETs, and the headers are as cache-unfriendly as possible.
# Doesn't bother me, it's not my bandwidth bill...
# Pull down the whole front page in a single request and refuse to continue
# when nothing usable came back (undef, an empty string or "0" all count as failure).
# NOTE(review): $! is interpolated into the message, but nothing here shows that
# LWP::Simple::get sets it — the errno text may be unrelated to the real failure.
my $page = LWP::Simple::get("http://www.andrewsullivan.com/");
die "Unable to fetch the andrewsullivan.com's homepage: $!\n" unless $page;
# Progress report on STDOUT: size of the downloaded document.
print "Fetched " . length($page) . " bytes from andrewsullivan.com\n";
# Zero or more HTML tags, with zero or more whitespace characters (space, tab, newline, etc) between them, or at the beginning or the end.
# Note that this specifically excludes any plaintext from being matched by this.
# NOTE(review): nothing in the code visible here interpolates $arbitrary_tags —
# presumably it was meant to be used inside the scraping pattern below; confirm.
my $arbitrary_tags = '\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*>\s*)*';
# One hashref per parsed post, with the keys TITLE, BODY, LINK and DATE.
my @stories;
# Text of the most recently matched day heading; it is prepended to each story's time to form DATE.
my $current_day;
# Walk through $page one match at a time (the /g modifier in the while condition resumes
# after the previous match). Each match is either a day heading (first alternative, $1)
# or a single story (second alternative).
# Modifiers: x = whitespace and comments allowed in the pattern, s = "." also matches
# newlines, i = case-insensitive, g = iterate over all matches, o = compile the pattern once.
# NOTE(review): only four capturing groups are visible in this pattern, and the "# link"
# line carries no capture at all, yet the loop body reads $2 through $5. The literal HTML
# that anchored the pattern (and the link capture) looks to be missing from this copy —
# confirm against the deployed script before relying on the capture numbering.
while ($page =~ m!
([^<]*\d\d,\s+\d\d\d\d) # start of a new day's posts
|
([^<]*) # title
(.*?) # body
# link
-\s*([^<]*) # time
!xsigo)
{
# A true $1 means the day-heading alternative matched: remember it for the stories that follow.
if ($1) {
$current_day = $1;
} else {
# Otherwise record a story. DATE is the remembered day plus this story's time, separated by a space.
# NOTE(review): if a story matches before any day heading, $current_day is still undef here.
push(@stories, {
TITLE => $2,
BODY => $3,
LINK => $4,
DATE => $current_day . ' ' . $5,
});
# Drop the first "&PHPSESSID=<letters/digits>" parameter from the stored link (case-insensitive).
$stories[-1]{LINK} =~ s#\&PHPSESSID=[a-z0-9]+##i;
# Progress report on STDOUT: running story count and the title just parsed.
print "Parsed story #", scalar(@stories), " (", $stories[-1]{TITLE}, ")\n";
}
}
#print Dumper(\@stories); exit(0);
#############################################
# Output RSS file
#############################################
# Render the machine's local UTC offset as a W3C-DTF zone designator, i.e. a sign
# followed by two-digit hours and two-digit minutes ("+05:30", "-05:00", "+00:00").
# $timezone is appended verbatim to every story timestamp further down.
#
# The offset is treated as a number of seconds (it is divided down to hours and minutes).
# Sign and magnitude are handled separately so that:
#   * the hour is always zero-padded (a plain "%d" gave "-5:00", which is not valid),
#   * negative offsets that are not whole hours keep the right minutes (Perl's "%"
#     returns a non-negative result for a negative left operand), and
#   * offsets between -1h and 0 do not lose their sign.
my $tz_offset = Time::Zone::tz_local_offset();
my $timezone = do {
    my $total_minutes = int(abs($tz_offset) / 60);
    sprintf("%s%02d:%02d",
        ($tz_offset < 0 ? '-' : '+'),
        int($total_minutes / 60),
        $total_minutes % 60);
};
# Write the feed file: a fixed header, one entry per scraped story, then a trailing newline.
# Reads @stories (hashrefs with TITLE, BODY, LINK, DATE) and $timezone, and calls trim().
# Dies if the file cannot be opened or if the final flush/close fails.
# NOTE(review): as seen here the heredoc bodies contain no XML element markup, only the
# text values (and a lone "-" line per entry) — presumably the tags were lost from this
# copy. They are emitted exactly as they stand; confirm against the deployed script.
my $rss_path = "/home/interiot/www/rss/feeds/andrewsullivan.rss";
# Three-argument open with a lexical handle: the mode can never be taken from the path,
# and the handle cannot collide with another bareword handle.
open(my $rss_fh, '>', $rss_path) or die "Unable to write to $rss_path: $!";
print {$rss_fh} <<"EOF";
Newcum's AndrewSullivan Feed
http://www.andrewsullivan.com/
David Newcum's RSS Feed of andrewsullivan.com. Contact rss_feeds\@paperlined.org for change requests.
en-us
EOF
foreach my $story (@stories)
{
    # Turn the scraped "day time" text into an epoch value. str2time gives undef when it
    # cannot parse the text; formatting undef would silently stamp the entry with 1970,
    # so such a story is reported and left out instead.
    my $perl_date = Date::Parse::str2time($story->{DATE});
    unless (defined $perl_date) {
        warn "Skipping story with unparseable date '$story->{DATE}'\n";
        next;
    }
    my $rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($perl_date)) . $timezone;

    $story->{TITLE} =~ s#:\s*$##;            # drop a trailing colon (and any whitespace after it)
    $story->{TITLE} =~ s#([\w']+)#\u\L$1#g;  # Uppercase just the first letter of each word

    # Escape only the three characters that are markup-significant in XML text.
    my $title = encode_entities($story->{TITLE}, '<>&');
    my $link  = encode_entities("http://www.andrewsullivan.com/" . $story->{LINK}, '<>&');

    $story->{BODY} =~ s/\starget\s*=\s*["']?_blank["']?//g; # no target=_blank !!!!

    # The character set belongs to encode_entities, not to trim (which ignores extra
    # arguments). Without it encode_entities falls back to its default set and also emits
    # HTML-only named entities such as &nbsp;, which a plain XML parser does not know.
    my $body = encode_entities(trim($story->{BODY}), '<>&');
    # NOTE(review): as written this replaces a space with a space; presumably the pattern
    # originally named an entity or a non-breaking space — confirm.
    $body =~ s/\ / /g; # No freaking clue why this shows up

    print {$rss_fh} <<"EOF";
-
$title
$link
$body
Andrew Sullivan
$rss_date
EOF
}
print {$rss_fh} "\n";
# Buffered write errors (full disk, quota) only surface here, so check the close.
close($rss_fh) or die "Unable to finish writing $rss_path: $!";
# Return a copy of the first argument with leading and trailing whitespace removed.
# Whitespace inside the string is left alone; any further arguments are ignored.
sub trim {
    my ($text) = @_;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}