Newcum's Slashdot Feed

#! /usr/bin/env perl
BEGIN{$^W=1}
use strict;

# Slashdot's RSS feed has two problems:
#   - it's very restrictive on the frequency that a host can read from it
#   - it lags behind the main page by quite a bit
# Creating my own feed that's screen-scraped directly from the home page will
# solve both of these problems.
#
# Overview: fetch the Slashdot home page, pull each story out of the HTML with
# one large regex, then write the stories to a feed file on disk.

use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';

use Compress::Zlib;
use Date::Parse;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;

# Where the generated feed is written.
my $rss_path = '/home/interiot/www/rss/feeds/slashdot.rss';


#############################################
# Fetch and Scrape
#############################################

# slashdot doesn't support conditional-GETs  :(
# LWP::Simple::get() signals failure by returning undef; it does not set $!,
# so $! is deliberately left out of the error message.
my $page = LWP::Simple::get("http://slashdot.org/")
    or die "Unable to fetch the slashdot homepage\n";

# NOTE(review): uncompress() inflates zlib (RFC 1950) streams; a gzip (RFC 1952)
# body would need Compress::Zlib::memGunzip() instead.  Confirm which encoding
# slashdot actually sends before relying on this step.
my $ungzipped = Compress::Zlib::uncompress($page);
if ($ungzipped) {
    # In case they stop gzipping it at some point in the future...
    $page = $ungzipped;
}

#print $page; exit(1);
print "Fetched ", length($page), " bytes from slashdot.org\n";

# Zero or more HTML tags, with zero or more whitespace characters (space, tab,
# newline, etc) between them, or at the beginning or the end.
# Note that this specifically excludes any plaintext from being matched by this.
my $arbitrary_tags = '\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*>\s*)*';

# NOTE(review): the story pattern below appears to have lost literal markup at
# some point (for example the dangling "]*" fragments look like the tails of
# "[^>]*" classes whose opening tag text is gone, and nothing anchors the end
# of the story body before the "\(").  The pattern is kept exactly as found;
# restore the missing tag text from a known-good copy before trusting matches.
my @stories;
while ($page =~ m!
        ]*size="4"[^>]*>
        (.*?)                                   # title of the article
        ${arbitrary_tags}]*>${arbitrary_tags}   # we make sure that this is a story because we don't allow
                                                # any plaintext between the subject and the topics image
        Posted\s+by${arbitrary_tags}([^<]*)${arbitrary_tags}on\s+([^<]*)    # slashdot admin poster, and posted date
        $arbitrary_tags
        from\s+the\s+[^<]*\s+dept\.
        $arbitrary_tags
        (.*?)                                   # story body

        \(
        .*?
        HREF="([^"]*)"                          # story link
    !xsigo) {
    # Copy the captures immediately so later matches cannot clobber them.
    my ($title, $poster, $date, $body, $link) = ($1, $2, $3, $4, $5);
    push(@stories, {
        TITLE           => $title,
        SLASHDOT_POSTER => $poster,
        DATE            => $date,
        BODY            => $body,
        LINK            => $link,
    });
    print "Parsed story #", scalar(@stories), " (", $title, ")\n";
}


#############################################
# Output RSS file
#############################################

# Build the timezone designator as +hh:mm / -hh:mm.  The sign is handled
# separately and the arithmetic is done on the absolute offset, so hours are
# always two digits and the minutes are right for negative offsets as well.
my $tz_offset  = Time::Zone::tz_local_offset();
my $tz_minutes = int(abs($tz_offset) / 60);
my $timezone   = sprintf("%s%02d:%02d",
    ($tz_offset < 0 ? '-' : '+'), int($tz_minutes / 60), $tz_minutes % 60);

open my $rss_fh, '>', $rss_path
    or die "Unable to write to $rss_path: $!";

# NOTE(review): the feed text here and in the per-story heredoc contains no
# XML elements, which suggests the surrounding tags were stripped from this
# copy of the script.  The text is left as found; confirm against the original.
print {$rss_fh} <<"EOF";
Newcum's Slashdot Feed
http://www.slashdot.com/
David Newcum's RSS Feed of slashdot.com. Contact rss_feeds\@paperlined.org for change requests.
en-us
EOF

foreach my $story (@stories) {
    $story->{DATE} =~ s#\@##;
    $story->{BODY} =~ s#\s+$##s;

    # str2time() returns undef when it cannot parse its input; without this
    # guard localtime(undef) would silently stamp the story with a 1970 date.
    my $perl_date = Date::Parse::str2time($story->{DATE});
    unless (defined $perl_date) {
        warn "Unable to parse story date '$story->{DATE}'; using the current time\n";
        $perl_date = time;
    }
    my $rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($perl_date)) . $timezone;

    # turn the link into something that will work if you're already logged in
    # (stories whose link does not look like a dated .shtml article are skipped)
    next unless ($story->{LINK} =~ m#//((?:\w+\.)?slashdot\.org)/\w+/(\d\d/\d\d/\d\d/\d+)\.shtml#);
    my $link = "http://$1/article.pl?sid=$2";

    my $body = encode_entities($story->{BODY});

    # Drop any markup from the title before entity-encoding it.
    $story->{TITLE} =~ s#<.*?>##gs;
    my $title = encode_entities($story->{TITLE});

    print {$rss_fh} <<"EOF";
$title
$link
$body
$rss_date
EOF
}

print {$rss_fh} "\n";

# Buffered write errors (disk full, quota) only surface at close time.
close $rss_fh
    or die "Unable to finish writing $rss_path: $!";