# http://paperlined.org/rss/feeds/edmunds_headlines.gen.pl
#! /usr/bin/env perl
BEGIN{$^W=1} use strict;
# Generates an RSS feed of Edmunds automotive news headlines, including the story text and a large photo when available.
use lib '/home/interiot/src/pl/modules/';
use Date::Parse;
use Data::Dumper;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;
# Load the persistent data: a hash ref keyed by article URL that carries
# article records over from one run to the next. Start with an empty hash
# when no database file exists yet.
my $persistent_filename = "/home/interiot/www/rss/feeds/edmunds_headlines.db";
my $persistent = -e $persistent_filename
    ? Storable::retrieve($persistent_filename)
    : {};
##################################
# Get all stories currently listed
##################################
# Fetch the headline index. LWP::Simple::get() reports failure only by
# returning undef; it does not set $!, so the error messages below do not
# interpolate it (doing so would print an unrelated, stale errno string).
my $index_page = LWP::Simple::get('http://www.edmunds.com/news/regularnews/articles/index.html')
    or die "Unable to retrieve http://www.edmunds.com/news/regularnews/articles/index.html webpage\n";
print length($index_page), " bytes fetched from http://www.edmunds.com/news/regularnews/articles/\n";

# Every article URL linked from the index on this run.
my %articles_seen_this_time;

# delete $persistent->{"http://www.edmunds.com/news/regularnews/articles/100892/article.html"};

# Walk every article link on the index page: $1 is the site-relative path,
# $2 is the link text (the headline, still HTML-escaped as it was on the page).
while ($index_page =~ m#<a href="(/news/regularnews/articles/\d+/article\.html)">([^<]+)</a>#gsi) {
    my $article_url   = "http://www.edmunds.com$1";
    my $article_title = $2;
    $articles_seen_this_time{$article_url}++;

    # Stories stored by an earlier run keep their record (and FIRST_SEEN).
    next if exists $persistent->{$article_url};

    ##################################
    # Fetch a new story
    ##################################
    my $article = {
        URL        => $article_url,
        TITLE      => $article_title,
        FIRST_SEEN => time(),
    };

    sleep(2);    # be polite: pause between requests to edmunds.com
    my $article_page = LWP::Simple::get($article_url)
        or die "Unable to retrieve $article_url\n";
    print length($article_page), " bytes fetched from $article_url\n";

    # The story text is the contents of the first "bodytext" font element.
    if ($article_page =~ m#<font class="bodytext">(.*?)</font>#si) {
        $article->{DESCRIPTION} = $1;
    }
    else {
        die "Unable to find necessary information. Page format likely changed.\n\t";
    }

    # When a lead photo is present, put an <img> pointing at the URL the photo
    # links to in front of the story text.
    if ($article_page =~ m#<!-- start lead photo -->.*<a href="([^"]+)"><img src="[^>]+.*<!-- end lead photo -->#si) {
        $article->{DESCRIPTION} = qq(<p><img src="$1" /></p>) . $article->{DESCRIPTION};
    }

    $persistent->{$article_url} = $article;
}

# Sanity check: too few links means the index regex probably stopped matching.
if (scalar(keys %articles_seen_this_time) < 10) {
    die "Didn't parse enough articles. Page format likely changed.\n\t";
}
###################################
# Retire article data
# once it's no longer listed by edmunds
##################################
# Collect the stored URLs that were not seen on this run, then drop them all
# with a single hash-slice delete.
my @retired_urls = grep { !exists $articles_seen_this_time{$_} } keys %$persistent;
delete @{$persistent}{@retired_urls};
###################################
# Write out the .RSS file
###################################
# Article URLs ordered by the time this script first saw them.
my @chrono_sorted = sort {$persistent->{$b}{FIRST_SEEN} <=> $persistent->{$a}{FIRST_SEEN}}
    keys(%$persistent); # the latest ones show up first

# Build the W3CDTF zone designator for dc:date. The format requires an explicit
# sign and two-digit hours ("-05:00", "+05:30"), so the sign is taken from the
# offset and the hour/minute fields are computed from its absolute value.
# NOTE(review): the offset is the current one and is applied to every item's
# FIRST_SEEN; items first seen on the other side of a DST change will be an
# hour off.
my $tz_offset = Time::Zone::tz_local_offset();
my $tz_minutes = int(abs($tz_offset) / 60);
my $timezone = sprintf("%s%02d:%02d",
    ($tz_offset < 0 ? '-' : '+'), int($tz_minutes / 60), $tz_minutes % 60);

open my $rss_fh, '>', '/home/interiot/www/rss/feeds/edmunds_headlines.rss'
    or die "Unable to write to edmunds_headlines.rss: $!";

print {$rss_fh} <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
>
<channel rdf:about="http://paperlined.org/rss/feeds/edmunds_headlines.rss">
<title>Newcum's Edmunds Headlines Feed</title>
<link>http://www.edmunds.com/news/regularnews/articles/</link>
<description>David Newcum's RSS Feed of Edmunds.com's automotive news headlines. Contact rss_feeds\@paperlined.org for change requests.</description>
<language>en-us</language>
<items>
<rdf:Seq>
EOF

# RSS 1.0 requires the channel to carry a table of contents that references
# every item by the same URI used in the item's rdf:about attribute.
print {$rss_fh} qq(<rdf:li rdf:resource="$persistent->{$_}{URL}" />\n) for @chrono_sorted;

print {$rss_fh} <<"EOF";
</rdf:Seq>
</items>
</channel>
EOF

foreach my $key (@chrono_sorted) {
    my $data = $persistent->{$key};
    my $rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($data->{FIRST_SEEN})) . $timezone;

    # XML defines only five named entities, so escape with numeric character
    # references; HTML names such as &eacute; would make the feed ill-formed.
    # The title is link text lifted from an HTML page, so it is decoded first
    # to avoid escaping it twice. The description is HTML markup and is
    # escaped once, as a whole.
    my $headline = HTML::Entities::encode_entities_numeric(
        HTML::Entities::decode_entities($data->{TITLE}));
    my $description = HTML::Entities::encode_entities_numeric($data->{DESCRIPTION});

    print {$rss_fh} <<"EOF";
<item rdf:about="$data->{URL}">
<title>$headline</title>
<link>$data->{URL}</link>
<description>$description</description>
<dc:creator>edmunds.com</dc:creator>
<dc:date>$rss_date</dc:date>
</item>
EOF
}

print {$rss_fh} "</rdf:RDF>\n";

# Buffered write errors (disk full, quota) only surface at close time.
close $rss_fh or die "Unable to write to edmunds_headlines.rss: $!";
# Write the updated persistent perl data out.
# Store into a temporary file in the same directory and rename it over the
# real database, so an interrupted run cannot leave a truncated file behind
# for the next run's Storable::retrieve to fail on.
my $persistent_tmp_filename = "$persistent_filename.tmp.$$";
Storable::nstore($persistent, $persistent_tmp_filename)
    or die "Unable to write to $persistent_filename: $!";
rename($persistent_tmp_filename, $persistent_filename)
    or die "Unable to write to $persistent_filename: $!";
# Generated by GNU enscript 1.6.4.