#! /usr/bin/env perl
BEGIN{$^W=1}
use strict;

# AndrewSullivan.com has no RSS feed, and the only existing feed I can find
# doesn't have dates, even though dates (down to the second) are available on
# the website.
#
# Obviously this needs to be rectified.
#
# What this script does, in order:
#   1. Fetches the andrewsullivan.com homepage.
#   2. Scrapes it into a list of stories (title, body, link, date).
#   3. Writes the stories out as a feed file.

use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';
use Compress::Zlib;
use Data::Dumper;
use Date::Parse;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;


#############################################
# Fetch and Scrape
#############################################

# andrewsullivan.com doesn't support conditional-GETs, and the headers are as
# cache-unfriendly as possible.
# Doesn't bother me, it's not my bandwidth bill...
#
# LWP::Simple::get() signals failure by returning undef; it does not set $!,
# so $! is deliberately left out of the error message.
my $page = LWP::Simple::get("http://www.andrewsullivan.com/")
    or die "Unable to fetch the andrewsullivan.com's homepage\n";
print "Fetched ", length($page), " bytes from andrewsullivan.com\n";

# Zero or more HTML tags, with zero or more whitespace characters (space, tab,
# newline, etc) between them, or at the beginning or the end.
# Note that this specifically excludes any plaintext from being matched by this.
my $arbitrary_tags = '\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*>\s*)*';

# NOTE(review): in this copy of the file the pattern below contains no literal
# HTML tags and only four capture groups, while the loop body reads $4 as the
# link and $5 as the time.  The markup (including the link capture, and
# presumably the uses of $arbitrary_tags) appears to have been stripped --
# restore it from the original before relying on this scraper.
my @stories;
my $current_day;        # most recent day heading seen; applies to the stories that follow it
while ($page =~ m!
            ([^<]*\d\d,\s+\d\d\d\d)     # start of a new day's posts
        |
            ([^<]*)                     # title
            (.*?)                       # body
                                        # link
            -\s*([^<]*)                 # time
        !xsigo)
{
    if ($1) {
        # First alternative matched: remember the day heading.
        $current_day = $1;
    } else {
        # Second alternative matched: one story, dated with the current day plus its own time.
        push(@stories, {
            TITLE => $2,
            BODY  => $3,
            LINK  => $4,
            DATE  => $current_day . ' ' . $5,
        });
        # Session IDs make every fetch look like a brand-new link; drop them.
        $stories[-1]{LINK} =~ s#\&PHPSESSID=[a-z0-9]+##i;
        print "Parsed story #", scalar(@stories), " (", $stories[-1]{TITLE}, ")\n";
    }
}

#print Dumper(\@stories); exit(0);


#############################################
# Output RSS file
#############################################

# Build the timezone designator as a sign plus zero-padded hh:mm.  Working from
# the absolute offset keeps the sign correct for offsets between -1h and 0 and
# keeps hours two digits wide.
my $tz_offset = Time::Zone::tz_local_offset();
my $tz_abs    = abs($tz_offset);
my $timezone  = sprintf("%s%02d:%02d",
    ($tz_offset < 0 ? '-' : '+'),
    int($tz_abs / 3600),
    int($tz_abs / 60) % 60);

# Three-argument open on a lexical handle: the mode can never be taken from the
# path, and the handle is scoped rather than a package-global bareword.
my $rss_path = "/home/interiot/www/rss/feeds/andrewsullivan.rss";
open(my $rss, '>', $rss_path)
    or die "Unable to write to $rss_path: $!";

# NOTE(review): the feed templates below contain no XML element markup in this
# copy of the file; it was presumably stripped along with the HTML in the
# scraping pattern -- confirm against the original.
print {$rss} <<"EOF";
Newcum's AndrewSullivan Feed
http://www.andrewsullivan.com/
David Newcum's RSS Feed of andrewsullivan.com. Contact rss_feeds\@paperlined.org for change requests.
en-us
EOF

foreach my $story (@stories) {
    my $perl_date = Date::Parse::str2time($story->{DATE});
    if (!defined $perl_date) {
        # str2time() returns undef for a string it cannot parse; passing that to
        # localtime() would silently date the story 1970-01-01.
        warn "Unable to parse date '",
            (defined $story->{DATE} ? $story->{DATE} : ''),
            "'; using the current time instead\n";
        $perl_date = time();
    }
    my $rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($perl_date)) . $timezone;

    $story->{TITLE} =~ s#:\s*$##;
    $story->{TITLE} =~ s#([\w']+)#\u\L$1#g;     # Uppercase just the first letter of each word
    my $title = encode_entities($story->{TITLE}, '<>&');

    my $link = encode_entities("http://www.andrewsullivan.com/" . $story->{LINK}, '<>&');

    $story->{BODY} =~ s/\starget\s*=\s*["']?_blank["']?//g;     # no target=_blank !!!!
    # The character set belongs to encode_entities(), not trim(): escape only
    # <, > and & in the body, exactly as is done for the title and link.
    my $body = encode_entities(trim($story->{BODY}), '<>&');
    # NOTE(review): as written this replaces a space with a space; the character
    # originally matched here may have been lost in this copy -- confirm.
    $body =~ s/\ / /g;      # No freaking clue why this shows up

    print {$rss} <<"EOF";
$title
$link
$body
Andrew Sullivan
$rss_date
EOF
}

print {$rss} "\n";

# Buffered write errors (disk full, quota) only surface at close time.
close($rss)
    or die "Unable to finish writing $rss_path: $!";


# trim(STRING) -- return a copy of STRING with leading and trailing whitespace
# removed.  Any further arguments are ignored.  The caller's value is not modified.
sub trim {
    my ($text) = @_;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}