#! /usr/bin/env perl
BEGIN{$^W=1} use strict;
# Edmund's automotive news headlines, including story text, and large photo when available.
use lib '/home/interiot/src/pl/modules/';
use Date::Parse;
use Data::Dumper;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;
# Load the persistent data: a hash reference, keyed by article URL, read from
# $persistent_filename when that file exists.  A missing file is not an error;
# it simply yields an empty store.
my $persistent;
my $persistent_filename = "/home/interiot/www/rss/feeds/edmunds_headlines.db";
if (-e $persistent_filename) {
    # Storable::retrieve() returns undef on an I/O error (it only dies on more
    # serious failures), so check the result instead of silently carrying on
    # with an undefined store.
    $persistent = Storable::retrieve($persistent_filename);
    defined $persistent
        or die "Unable to load $persistent_filename: $!\n";
} else {
    $persistent = {};
}
##################################
# Get all stories currently listed
##################################
# Fetch the news index page.  The URL is scoped to the do-block so that no
# extra file-level variable is introduced.
my $index_page = do {
    my $index_url = 'http://www.edmunds.com/news/regularnews/articles/index.html';
    # LWP::Simple::get() returns undef on failure and does not report the
    # reason through $!, so $! is deliberately left out of the message.  The
    # truthiness test also treats an empty page as a failure.
    LWP::Simple::get($index_url)
        or die "Unable to retrieve $index_url webpage\n";
};
print length($index_page), " bytes fetched from http://www.edmunds.com/news/regularnews/articles/\n";
# Counts, per article URL, the stories found on the index page during this run.
my %articles_seen_this_time;
# delete $persistent->{"http://www.edmunds.com/news/regularnews/articles/100892/article.html"};
while ($index_page =~ m#([^<]+)#gsi)
{
my $article_url = "http://www.edmunds.com$1";
my $article_title = $2;
$articles_seen_this_time{$article_url}++;
if (! exists $persistent->{$article_url}) {
##################################
# Fetch a new story
##################################
my $article = {
URL => $article_url,
TITLE => $article_title,
FIRST_SEEN => time(),
};
sleep(2); # don't piss edmunds.com off
my $article_page = LWP::Simple::get($article_url)
or die "Unable to retrieve $article_url $!\n";
print length($article_page), " bytes fetched from $article_url\n";
if ($article_page =~ m#(.*?)#si) {
$article->{DESCRIPTION} = $1;
} else {
die "Unable to find necessary information. Page format likely changed.\n\t";
}
if ($article_page =~ m#.*