# Source: http://paperlined.org/rss/feeds/slashdot.gen.pl
#! /usr/bin/env perl
BEGIN{$^W=1} use strict;
# Slashdot's RSS feed has two problems:
# - it's very restrictive on the frequency that a host can read from it
# - it lags behind the main page by quite a bit
# Creating my own feed that's screen-scraped directly from the home page will solve both of these problems.
use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';
use Compress::Zlib;
use Date::Parse;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;
#############################################
# Fetch and Scrape
#############################################
# slashdot doesn't support conditional-GETs :(
# Download the Slashdot front page into $page, decompress it if the body
# arrived compressed, and report how many bytes we ended up with.
# NOTE(review): LWP::Simple::get() does not document setting $!, so the
# errno text in the die message below may be unrelated to the real failure.
my $page = LWP::Simple::get("http://slashdot.org/")
    or die "Unable to fetch the slashdot homepage: $!\n";

# Compress::Zlib::uncompress() only understands zlib (RFC 1950) streams.
# A gzip body (RFC 1952, recognisable by its 1F 8B magic bytes) has to go
# through memGunzip() instead, otherwise it would be left compressed.
my $ungzipped;
if (substr($page, 0, 2) eq "\x1f\x8b") {
    # The Compress::Zlib docs say memGunzip destroys the buffer it is
    # given, so hand it a copy and keep $page intact as a fallback.
    my $gzip_copy = $page;
    $ungzipped = Compress::Zlib::memGunzip($gzip_copy);
}
else {
    $ungzipped = Compress::Zlib::uncompress($page);
}

# Decompression fails (undef) when the page was served as plain HTML; in
# that case keep the body exactly as it was fetched.
if (defined $ungzipped && length $ungzipped) {
    $page = $ungzipped;
}
#print $page; exit(1);
print "Fetched ", length($page), " bytes from slashdot.org\n";
# Pattern fragment: zero or more HTML tags, with zero or more whitespace
# characters (space, tab, newline, etc.) between them, or at the beginning or
# the end. It deliberately cannot match any plain text, which is what lets the
# story pattern below insist that nothing but markup sits between two landmarks.
my $arbitrary_tags = '\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*>\s*)*';

# Every story found on the front page, in page order. Each entry is a hash ref
# keyed by the field names listed in @story_fields.
my @stories;

# Hash keys for the five capture groups of the story pattern, in capture order.
my @story_fields = qw(TITLE SLASHDOT_POSTER DATE BODY LINK);

# Walk the page one story at a time; /g in scalar context resumes each search
# where the previous match ended.
while ($page =~ m!
<font[^>]*size="4"[^>]*>
<B>(.*?)</B> # title of the article
${arbitrary_tags}<img\s+src="//images.slashdot.org/topics/[^>]*>${arbitrary_tags} # we make sure that this is a story because we don't allow any plaintext between the subject and the topics image
Posted\s+by${arbitrary_tags}([^<]*)${arbitrary_tags}on\s+([^<]*) # slashdot admin poster, and posted date
$arbitrary_tags
from\s+the\s+[^<]*\s+dept\.
$arbitrary_tags
(.*?) # story body
<P><P><B>\(</B> .*?
HREF="([^"]*)" # story link
!xsigo)
{
    # Copy the captures out right away so later code does not depend on $1..$5.
    my @captured = ($1, $2, $3, $4, $5);

    my %story;
    @story{@story_fields} = @captured;
    push @stories, \%story;

    print "Parsed story #", scalar(@stories), " (", $captured[0], ")\n";
}
#############################################
# Output RSS file
#############################################
# Render the scraped @stories as an RSS 1.0 (RDF) document and write it to the
# feed file. Stories whose link does not look like a Slashdot article URL are
# skipped. The items are built first so that the channel can carry the
# <items> table of contents that RSS 1.0 requires.

# Format a UTC offset in seconds as a W3CDTF zone designator ("+hh:mm" or
# "-hh:mm"). Hours are always two digits; the sign is handled separately so
# that negative offsets are not mangled by integer truncation.
sub _w3c_tz_designator {
    my ($offset_seconds) = @_;
    my $sign    = $offset_seconds < 0 ? '-' : '+';
    my $minutes = int(abs($offset_seconds) / 60);
    return sprintf('%s%02d:%02d', $sign, int($minutes / 60), $minutes % 60);
}

# Escape text for use as XML character data or inside an attribute value.
# Only the entities XML predefines are used; every other non-ASCII character
# becomes a numeric character reference, because HTML names such as &eacute;
# are undefined in a DTD-less XML document and would make the feed ill-formed.
sub _xml_escape {
    my ($text) = @_;
    return '' unless defined $text;

    # Control characters that XML 1.0 forbids outright are dropped.
    $text =~ tr/\x00-\x08\x0B\x0C\x0E-\x1F//d;

    my %predefined = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
        "'" => '&#39;',
    );
    $text =~ s/([&<>"'])/$predefined{$1}/g;
    $text =~ s/([^\n\r\t\x20-\x7E])/sprintf('&#%d;', ord($1))/ge;
    return $text;
}

my $tz_offset = Time::Zone::tz_local_offset();
my $timezone  = _w3c_tz_designator($tz_offset);

my $rss_path = '/home/interiot/www/rss/feeds/slashdot.rss';

my @toc_entries;    # one <rdf:li> per emitted item, for the channel's <items>
my @item_blocks;    # the fully rendered <item> elements, in page order

foreach my $story (@stories) {
    # Turn the link into something that will work if you're already logged
    # in; anything that is not a recognisable article URL is skipped.
    next unless ($story->{LINK} =~ m#//((?:\w+\.)?slashdot\.org)/\w+/(\d\d/\d\d/\d\d/\d+)\.shtml#);
    my $link = _xml_escape("http://$1/article.pl?sid=$2");

    # Slashdot writes dates with an "@" before the time; remove it before
    # handing the text to Date::Parse.
    (my $date_text = $story->{DATE}) =~ s#\@##;
    my $perl_date = Date::Parse::str2time($date_text);

    # dc:date is left out when the date cannot be parsed, rather than
    # publishing a bogus 1970 timestamp.
    my $date_element = '';
    if (defined $perl_date) {
        my $rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($perl_date)) . $timezone;
        $date_element = "<dc:date>$rss_date</dc:date>\n";
    }

    # The description carries the story's HTML, escaped once for XML.
    (my $body_html = $story->{BODY}) =~ s#\s+\z##;
    my $body = _xml_escape($body_html);

    # The title is plain text: strip tags, decode the HTML entities, and only
    # then escape for XML, so that "&amp;" is not escaped a second time.
    (my $title_text = $story->{TITLE}) =~ s#<.*?>##gs;
    my $title_plain = decode_entities($title_text);
    my $title       = _xml_escape($title_plain);

    push @toc_entries, qq{<rdf:li rdf:resource="$link" />\n};
    push @item_blocks, <<"EOF";
<item rdf:about="$link">
<title>$title</title>
<link>$link</link>
<description>$body</description>
${date_element}</item>
EOF
}

my $toc = join('', @toc_entries);

open(my $rss_fh, '>', $rss_path)
    or die "Unable to write to $rss_path: $!";
print {$rss_fh} <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
>
<channel rdf:about="http://paperlined.org/rss/feeds/slashdot.rss">
<title>Newcum's Slashdot Feed</title>
<link>http://www.slashdot.com/</link>
<description>David Newcum's RSS Feed of slashdot.com. Contact rss_feeds\@paperlined.org for change requests.</description>
<language>en-us</language>
<items>
<rdf:Seq>
$toc</rdf:Seq>
</items>
</channel>
EOF
print {$rss_fh} @item_blocks;
print {$rss_fh} "</rdf:RDF>\n";

# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close($rss_fh)
    or die "Unable to finish writing $rss_path: $!";
# Generated by GNU enscript 1.6.4.