# http://paperlined.org/rss/feeds/slashdot.gen.pl

#! /usr/bin/env perl
BEGIN{$^W=1}  use strict;

# Slashdot's RSS feed has two problems:
# 	- it's very restrictive on the frequency that a host can read from it
# 	- it lags behind the main page by quite a bit
# Creating my own feed that's screen-scraped directly from the home page will solve both of these problems.


use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';

use Compress::Zlib;
use Date::Parse;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;




#############################################
# Fetch and Scrape
#############################################
# Downloads the Slashdot front page and screen-scrapes every story on it into
# @stories: one hashref per story with the keys TITLE, SLASHDOT_POSTER, DATE,
# BODY and LINK.  All values are still raw fragments of the page's HTML.

# slashdot doesn't support conditional-GETs :(
# LWP::Simple::get() reports failure only by returning undef; it does not report
# anything through $!, so the error message carries no errno text.
my $page = LWP::Simple::get("http://slashdot.org/");
die "Unable to fetch the slashdot homepage\n"
	unless (defined($page) && length($page));

# The body may arrive compressed.  memGunzip() understands the gzip container
# (RFC 1952) while uncompress() only understands a bare zlib stream (RFC 1950),
# so try both.  If neither succeeds the page is assumed to be plain HTML
# (in case they stop compressing it at some point in the future...).
{
	my $scratch = $page;		# memGunzip() may destroy the buffer it is handed, so give it a copy
	my $inflated = Compress::Zlib::memGunzip($scratch);
	$inflated = Compress::Zlib::uncompress($page)
		unless (defined($inflated) && length($inflated));
	$page = $inflated
		if (defined($inflated) && length($inflated));
}

#print $page; exit(1);

print "Fetched ", length($page), " bytes from slashdot.org\n";

# Zero or more HTML tags, with zero or more whitespace characters (space, tab, newline, etc) between them, or at the beginning or the end.
# Note that this specifically excludes any plaintext from being matched by this.
my $arbitrary_tags = '\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*>\s*)*';

my @stories;

# Each successful match consumes one story; /g makes the next iteration resume
# where the previous match ended.  /o is safe because $arbitrary_tags never changes.
while ($page =~ m!
		<font[^>]*size="4"[^>]*>
		<B>(.*?)</B>		# title of the article
			${arbitrary_tags}<img\s+src="//images.slashdot.org/topics/[^>]*>${arbitrary_tags}	# we make sure that this is a story because we don't allow any plaintext between the subject and the topics image
		Posted\s+by${arbitrary_tags}([^<]*)${arbitrary_tags}on\s+([^<]*)			# slashdot admin poster, and posted date
			$arbitrary_tags
		from\s+the\s+[^<]*\s+dept\.
			$arbitrary_tags
		(.*?)			# story body
			<P><P><B>\(</B> .*?
		HREF="([^"]*)"		# story link
	!xsigo)
{
	# Copy the captures right away so no later regex can clobber them.
	my ($title, $poster, $posted, $body, $href) = ($1, $2, $3, $4, $5);

	push(@stories, {
		TITLE		=> $title,
		SLASHDOT_POSTER	=> $poster,
		DATE		=> $posted,
		BODY		=> $body,
		LINK		=> $href,
	});

	print "Parsed story #", scalar(@stories), " (", $title, ")\n";
}

# A scraper breaks silently when the page layout changes; make that visible.
warn "No stories could be parsed; slashdot's page layout may have changed\n"
	unless (@stories);



#############################################
# Output RSS file
#############################################
# Converts every scraped story in @stories into an RSS 1.0 (RDF) <item> and
# writes the complete feed to $rss_path.  Stories whose link does not look like
# a slashdot article URL are skipped.  Everything written is plain ASCII (all
# other characters become numeric character references), so the UTF-8
# declaration in the XML prolog is always truthful.

my $rss_path = '/home/interiot/www/rss/feeds/slashdot.rss';

# Escape text for use in XML element content or a double-quoted attribute value.
# Only the entities that XML itself predefines are used; HTML-only named
# entities such as &eacute; or &nbsp; would make the feed ill-formed, so every
# other character outside printable ASCII is written as a numeric reference.
#   $text   - string to escape (undef is treated as the empty string)
#   returns - the escaped copy; the argument is not modified
sub xml_escape {
	my ($text) = @_;
	return '' unless defined $text;
	my $escaped = encode_entities($text, '<>&"');
	$escaped =~ s/([^\x09\x0A\x0D\x20-\x7E])/sprintf('&#%d;', ord($1))/ge;
	return $escaped;
}

# Format an offset from UTC as a W3CDTF time zone designator ("+hh:mm" / "-hh:mm").
# The sign is handled separately from the magnitude so that negative and
# sub-hour offsets come out right.
#   $offset - offset from UTC, in seconds
#   returns - e.g. "-05:00" for -18000, "+05:30" for 19800
sub w3c_tz_designator {
	my ($offset) = @_;
	my $sign = ($offset < 0) ? '-' : '+';
	my $minutes = int(abs($offset) / 60);
	return sprintf('%s%02d:%02d', $sign, int($minutes / 60), $minutes % 60);
}

# First pass: build every item up front, because the <channel> element has to
# list all item URIs before the first <item> is written.
my @items;
foreach my $story (@stories)
{
	# turn the link into something that will work if you're already logged in
	next unless ($story->{LINK} =~ m#//((?:\w+\.)?slashdot\.org)/\w+/(\d\d/\d\d/\d\d/\d+)\.shtml#);
	my $link = xml_escape("http://$1/article.pl?sid=$2");

	# The title element is plain text: drop any markup, turn the page's HTML
	# entities back into characters, and only then escape for XML.
	(my $title_html = $story->{TITLE}) =~ s#<.*?>##gs;
	my $title_text = decode_entities($title_html);
	my $title = xml_escape($title_text);

	# The description carries the story as escaped HTML, so it is escaped as-is.
	(my $body_html = $story->{BODY}) =~ s#\s+$##s;
	my $body = xml_escape($body_html);

	# The page writes dates like "... @12:34PM"; the "@" confuses the date parser.
	(my $posted = $story->{DATE}) =~ s#\@##;
	my $epoch = Date::Parse::str2time($posted);

	my $date_line = '';
	if (defined $epoch) {
		# Ask for the UTC offset in effect at the story's own time, not
		# the current one, so that dates across a DST change stay correct.
		my $stamp = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($epoch))
			. w3c_tz_designator(Time::Zone::tz_local_offset($epoch));
		$date_line = "                <dc:date>$stamp</dc:date>\n";
	}
	else {
		# dc:date is optional; leaving it out beats publishing a bogus 1970 date.
		warn "Unable to parse the date \"$posted\" of story \"$title_text\"; omitting its dc:date\n";
	}

	push(@items, {
		LINK		=> $link,
		TITLE		=> $title,
		BODY		=> $body,
		DATE_LINE	=> $date_line,
	});
}

# Table of contents required inside an RSS 1.0 <channel>.
my $toc = join('', map { "                                <rdf:li rdf:resource=\"$_->{LINK}\" />\n" } @items);

open(my $rss, '>', $rss_path)	or die "Unable to write to $rss_path: $!";
print $rss <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>

<rdf:RDF
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns="http://purl.org/rss/1.0/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
>
        <channel rdf:about="http://paperlined.org/rss/feeds/slashdot.rss">
                <title>Newcum's Slashdot Feed</title>
                <link>http://www.slashdot.com/</link>
                <description>David Newcum's RSS Feed of slashdot.com.  Contact rss_feeds\@paperlined.org for change requests.</description>
                <language>en-us</language>
                <items>
                        <rdf:Seq>
$toc                        </rdf:Seq>
                </items>
        </channel>
EOF

# Second pass: the items themselves, in page order.
foreach my $item (@items)
{
	print $rss <<"EOF";
        <item rdf:about="$item->{LINK}">
                <title>$item->{TITLE}</title>
                <link>$item->{LINK}</link>
                <description>$item->{BODY}</description>
$item->{DATE_LINE}        </item>
EOF
}
print $rss "</rdf:RDF>\n";

# Buffered write errors (disk full, quota) only surface at close time.
close($rss)	or die "Unable to finish writing $rss_path: $!";

# Generated by GNU enscript 1.6.4.