http://paperlined.org/rss/feeds/pennyarcade.gen.pl

#! /usr/bin/env perl
BEGIN{$^W=1}  use strict;

# Each story is an <img src=""> tag for that day's pennyarcade comic.  If you click either
# on the title or the image, it will take you to the news article for that comic.
#
# The official pennyarcade .rss doesn't include the comic at all.  And other
# third-party pennyarcade.rss's don't include a link to the news article that I can see.


use lib '/home/interiot/src/pl/modules/';

use Date::Parse;
use Data::Dumper;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;



# Load the persistent data
my $persistent;
my $persistent_filename = "/home/interiot/www/rss/feeds/pennyarcade.db";
if (-e $persistent_filename) {
	$persistent = Storable::retrieve($persistent_filename);
} else {
	$persistent = {};
}


##################################
# Grab a new comic, if one exists
##################################
my $latest_page = LWP::Simple::get('http://www.penny-arcade.com/view.php3')
	or die "Unable to retrieve http://www.penny-arcade.com/view.php3 webpage: $!\n";


my $latest_news_url;
if ($latest_page =~ m#href="(news.php3\?date=\d\d\d\d-\d\d-\d\d)"#i) {
	$latest_news_url = "http://www.penny-arcade.com/$1";
} else {
	#die "PennyArcade parser failed to find necessary info.  Page format has changed.";
	# Actually, apparently the text can be posted long after the comic.  So...  I guess...  deal with it.
	$latest_news_url = undef;
}

if ($latest_page !~ m#<img src="(images/20\d\d/20\d\d\d\d\d\d\S*\.(?:gif|jpg))"#i) {
	print STDERR "$latest_page\n";
	die "PennyArcade parser failed to find necessary info.  Page format has changed.";
}
my $latest_img_url = "http://www.penny-arcade.com/$1";

if ($latest_img_url !~ m#/(\d\d\d\d)(\d\d)(\d\d)\S*\.(?:gif|jpg)$#i) {
	die "PennyArcade parser failed to find necessary info.  Page format has changed.";
}
my ($year, $month, $day) = ($1, $2, $3);
my $perl_date = Time::Local::timelocal(0,0,0, $day, $month-1, $year);

# Zero or more HTML tags, with zero or more whitespace characters (space, tab, newline, &nbsp;, etc) between them, or at the begining or the end.
# Note that this specifically excludes any plaintext from being matched by this.
my $arbitrary_tags = '\s*(?:&nbsp;)*\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*\s*>\s*(?:&nbsp;)*\s*)*';

if ($latest_page !~ m#<td[^>]*background="bluegrey/images/strip/striptop.gif"[^>]*>$arbitrary_tags([^<]+)#si) {
	die "PennyArcade parser failed to find necessary info.  Page format has changed.";
}
my $headline = $1;

## Add a new entry if necessary, but generally update the persistent data with the (possibly) new information
if (! exists $persistent->{$perl_date}) {
	$persistent->{$perl_date} = {};
}
$persistent->{$perl_date}{COMIC_IMG}	= $latest_img_url;
$persistent->{$perl_date}{NEWS_URL}	= $latest_news_url		if ($latest_news_url);
$persistent->{$perl_date}{FIRST_SEEN}	= time()
	unless ($persistent->{$perl_date}{FIRST_SEEN});
$persistent->{$perl_date}{HEADLINE}	= $headline;


#print Dumper($persistent); exit(1);


###################################
# Retire old comics
#    only keep the latest 25
##################################
my @chrono_sorted = sort {$b <=> $a} keys(%$persistent);		# the latest ones show up first

# remove the first 25 from the list of ones to kill
splice(@chrono_sorted, 0, 25, );

# kill the remaining ones
foreach my $date (@chrono_sorted) {
	delete($persistent->{$date});
}
	# (the above code is untested as of yet)


###################################
# Write out the .RSS file
###################################
@chrono_sorted = sort {$b <=> $a} keys(%$persistent);		# the latest ones show up first

my $tz_offset = Time::Zone::tz_local_offset();
my $timezone = sprintf("%d:%02d", $tz_offset/3600, ($tz_offset/60)% 60);
$timezone = "+$timezone" if ($tz_offset >= 0);

open RSS, ">/home/interiot/www/rss/feeds/pennyarcade.rss"	or die "Unable to write to pennyarcade.rss: $!";
print RSS <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>

<rdf:RDF
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns="http://purl.org/rss/1.0/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
>
	<channel rdf:about="http://paperlined.org/rss/feeds/pennyarcade.rss">
		<title>Newcum's PennyArcade Feed</title>
		<link>http://www.penny-arcade.com/</link>
		<description>David Newcum's RSS Feed of PennyArcade.com.  Contact rss_feeds\@paperlined.org for change requests.</description>
		<language>en-us</language>
	</channel>
EOF

foreach my $perl_date (@chrono_sorted) {
	my $data = $persistent->{$perl_date};
	my $displayable_date = POSIX::strftime("%D", localtime($perl_date));
	my $rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($data->{FIRST_SEEN})) . $timezone;
	my $headline = encode_entities($data->{HEADLINE} || "Penny Arcade");

	my $img_news_url = '';
	if ($data->{NEWS_URL}) {
		$img_news_url = $data->{NEWS_URL};
	} else {
		$img_news_url = "http://www.penny-arcade.com/";
	}

	print RSS <<"EOF";
	<item rdf:about="$data->{COMIC_IMG}">
		<title>$headline ($displayable_date)</title>
		<link>$img_news_url</link>
		<description>&lt;a href="$img_news_url"&gt;&lt;img src="$data->{COMIC_IMG}" border="0" /&gt;&lt;/a&gt;</description>
		<dc:creator>Gabe and Tycho</dc:creator>
		<dc:date>$rss_date</dc:date>
	</item>
EOF
}
print RSS "</rdf:RDF>\n";


# Write the updated persistent perl data out
Storable::nstore($persistent, $persistent_filename)	or die "Unable to write to $persistent_filename: $!";

Generated by GNU enscript 1.6.4.