http://paperlined.org/rss/feeds/edmunds_headlines.gen.pl

#! /usr/bin/env perl
BEGIN{$^W=1}  use strict;

# Edmunds.com automotive news headlines, including the story text and a large photo when available.


use lib '/home/interiot/src/pl/modules/';

use Date::Parse;
use Data::Dumper;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;



# Load the persistent data: a hash ref keyed by article URL, carried over
# between runs.  When no database file exists yet, start with an empty table.
my $persistent_filename = "/home/interiot/www/rss/feeds/edmunds_headlines.db";
my $persistent = (-e $persistent_filename)
	? Storable::retrieve($persistent_filename)
	: {};


##################################
# Get all stories currently listed
##################################
# Index page that links to every article currently published.
my $index_url = 'http://www.edmunds.com/news/regularnews/articles/index.html';

# LWP::Simple::get() signals failure by returning undef and does not set $!,
# so test definedness and keep errno out of the error message.
my $index_page = LWP::Simple::get($index_url);
defined $index_page
	or die "Unable to retrieve $index_url webpage\n";
print length($index_page), " bytes fetched from http://www.edmunds.com/news/regularnews/articles/\n";

# Every article URL linked from the index on this run; used afterwards to
# sanity-check the parse and to decide which stored articles to retire.
my %articles_seen_this_time;

# delete $persistent->{"http://www.edmunds.com/news/regularnews/articles/100892/article.html"};

while ($index_page =~ m#<a href="(/news/regularnews/articles/\d+/article.html)">([^<]+)</a>#gsi)
{
	# Copy the captures immediately; later matches overwrite $1/$2.
	my $article_url = "http://www.edmunds.com$1";
	my $article_title = $2;

	$articles_seen_this_time{$article_url}++;

	# Articles already in the persistent table are not fetched again.
	if (! exists $persistent->{$article_url}) {

		##################################
		# Fetch a new story
		##################################
		my $article = {
			URL		=> $article_url,
			TITLE		=> $article_title,
			FIRST_SEEN	=> time(),
		};

		sleep(2);		# be polite: pause between requests to edmunds.com
		my $article_page = LWP::Simple::get($article_url);
		defined $article_page
				or die "Unable to retrieve $article_url\n";
		print length($article_page), " bytes fetched from $article_url\n";

		# The story text is whatever sits inside the first "bodytext" font tag.
		if ($article_page =~ m#<font class="bodytext">(.*?)</font>#si) {
			$article->{DESCRIPTION} = $1;
		} else {
			# No trailing newline after the tab, so perl appends "at FILE line N".
			die "Unable to find necessary information.  Page format likely changed.\n\t";
		}

		# Optional lead photo: prepend an <img> pointing at the link target.
		# NOTE(review): the leading .* is greedy, so if several linked images
		# precede the end marker the last one is picked -- confirm this is intended.
		if ($article_page =~ m#<!-- start lead photo -->.*<a href="([^"]+)"><img src="[^>]+.*<!-- end lead photo -->#si) {
			$article->{DESCRIPTION} = qq(<p><img src="$1" /></p>) . $article->{DESCRIPTION};
		}

		$persistent->{$article_url} = $article;
	}
}

# Fewer than ten distinct links almost certainly means the index markup changed.
if (scalar keys %articles_seen_this_time < 10) {
	die "Didn't parse enough articles.  Page format likely changed.\n\t";
}


###################################
# Retire article data
#    once it's no longer listed by edmunds
##################################
# Collect the stored URLs that the index no longer links to, then drop them
# all at once with a hash-slice delete.
my @retired_urls = grep { ! exists $articles_seen_this_time{$_} } keys %$persistent;
delete @{$persistent}{@retired_urls};


###################################
# Write out the .RSS file
###################################
# Article URLs ordered by the time this script first saw them, newest first.
my @chrono_sorted = sort {$persistent->{$b}{FIRST_SEEN} <=> $persistent->{$a}{FIRST_SEEN}}
		keys(%$persistent);

# Build a W3CDTF time-zone designator: an explicit sign followed by
# zero-padded hours and minutes (e.g. "-05:00", "+05:30", "-00:30").
# Working from the absolute offset keeps the sign correct for offsets
# between -1h and 0 and avoids truncation surprises with negative values.
my $tz_offset = Time::Zone::tz_local_offset();
my $tz_minutes = int(abs($tz_offset) / 60);
my $timezone = sprintf("%s%02d:%02d",
		($tz_offset < 0 ? '-' : '+'),
		int($tz_minutes / 60),
		$tz_minutes % 60);

# Lexical handle and three-argument open; the handle is closed explicitly
# below so that buffered write errors are reported.
my $rss_filename = "/home/interiot/www/rss/feeds/edmunds_headlines.rss";
open(my $rss, '>', $rss_filename)	or die "Unable to write to edmunds_headlines.rss: $!";
print {$rss} <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>

<rdf:RDF
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns="http://purl.org/rss/1.0/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
>
	<channel rdf:about="http://paperlined.org/rss/feeds/edmunds_headlines.rss">
		<title>Newcum's Edmunds Headlines Feed</title>
		<link>http://www.edmunds.com/news/regularnews/articles/</link>
		<description>David Newcum's RSS Feed of Edmunds.com's automotive news headlines.  Contact rss_feeds\@paperlined.org for change requests.</description>
		<language>en-us</language>
	</channel>
EOF

foreach my $key (@chrono_sorted) {
	my $data = $persistent->{$key};

	my $rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($data->{FIRST_SEEN})) . $timezone;

	# The title was captured from raw HTML, so it may already contain
	# entities.  Decode it to plain text first (in place, on a copy) so
	# that escaping it for XML does not double-encode e.g. "&amp;".
	my $headline = $data->{TITLE};
	decode_entities($headline);

	# Numeric character references are always valid XML, whereas named HTML
	# entities such as &eacute; or &nbsp; are undefined in an XML document.
	$headline = HTML::Entities::encode_entities_numeric($headline);

	# The description is an HTML fragment; escape it once so it travels as
	# text inside the <description> element.
	my $description = HTML::Entities::encode_entities_numeric($data->{DESCRIPTION});

	print {$rss} <<"EOF";
	<item rdf:about="$data->{URL}">
		<title>$headline</title>
		<link>$data->{URL}</link>
		<description>$description</description>
		<dc:creator>edmunds.com</dc:creator>
		<dc:date>$rss_date</dc:date>
	</item>
EOF
}
print {$rss} "</rdf:RDF>\n";

# Buffered output is only flushed here, so this is where a full disk or
# similar write failure becomes visible.
close($rss)	or die "Unable to finish writing edmunds_headlines.rss: $!";


# Save the article table back to disk so the next run can tell which
# stories it has already fetched.
unless (Storable::nstore($persistent, $persistent_filename)) {
	die "Unable to write to $persistent_filename: $!";
}

Generated by GNU enscript 1.6.4.