#! /usr/bin/env perl
BEGIN { $^W = 1 }
use strict;

# Edmund's automotive news headlines, including story text, and large photo
# when available.
#
# Each run:
#   1. loads the previously seen articles from a Storable file,
#   2. scrapes the Edmunds news index page for the currently listed articles,
#   3. fetches the page of every listed article that has not been seen before,
#   4. forgets stored articles that are no longer listed, and
#   5. writes the RSS feed and the updated Storable file.
#
# NOTE(review): the HTML/XML markup inside this script's regexes and heredocs
# had been stripped from the copy this cleanup was based on.  Every place
# where markup was reconstructed (or could not be) is marked NOTE(review) or
# FIXME(review) below and must be checked against a known-good copy.

use lib '/home/interiot/src/pl/modules/';

use Date::Parse;
use Data::Dumper;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Storable;
use Time::Local;
use Time::Zone;

my $index_url           = 'http://www.edmunds.com/news/regularnews/articles/index.html';
my $persistent_filename = "/home/interiot/www/rss/feeds/edmunds_headlines.db";
my $rss_filename        = "/home/interiot/www/rss/feeds/edmunds_headlines.rss";

# Load the persistent data: a hashref keyed by article URL, each value a
# hashref with URL, TITLE, FIRST_SEEN (epoch seconds) and DESCRIPTION.
my $persistent = -e $persistent_filename
    ? Storable::retrieve($persistent_filename)
    : {};

##################################
# Get all stories currently listed
##################################
my $index_page = LWP::Simple::get($index_url)
    or die "Unable to retrieve $index_url webpage: $!\n";
print length($index_page), " bytes fetched from http://www.edmunds.com/news/regularnews/articles/\n";

my %articles_seen_this_time;

# delete $persistent->{"http://www.edmunds.com/news/regularnews/articles/100892/article.html"};

# NOTE(review): only the title capture "([^<]+)" of this pattern survived.
# The anchor markup and the path capture are reconstructed from how $1 is used
# (appended to the site root) and from the sample article URL in the
# commented-out line above -- confirm against the real index page.
while ($index_page =~ m#<a\s[^>]*href="(/news/regularnews/articles/\d+/article\.html)"[^>]*>([^<]+)</a>#gsi) {
    my $article_url   = "http://www.edmunds.com$1";
    my $article_title = $2;

    $articles_seen_this_time{$article_url}++;

    # Already stored on an earlier run: keep it (and its FIRST_SEEN) untouched.
    next if exists $persistent->{$article_url};

    $persistent->{$article_url} = fetch_article($article_url, $article_title);
}

if (scalar keys %articles_seen_this_time < 10) {
    die "Didn't parse enough articles.  Page format likely changed.\n\t";
}

###################################
# Retire article data
# once it's no longer listed by edmunds
##################################
my @retired_urls = grep { !exists $articles_seen_this_time{$_} } keys %$persistent;
delete @{$persistent}{@retired_urls};

###################################
# Write out the .RSS file
###################################

# The latest ones show up first.
my @chrono_sorted =
    sort { $persistent->{$b}{FIRST_SEEN} <=> $persistent->{$a}{FIRST_SEEN} }
    keys %$persistent;

open my $rss_fh, '>', $rss_filename
    or die "Unable to write to edmunds_headlines.rss: $!";

# NOTE(review): the element names in the three RSS fragments below are
# reconstructed; only their text content survived.  RSS 2.0 with a Dublin Core
# date is assumed because the item date is written in W3CDTF form -- confirm.
print {$rss_fh} <<"EOF";
<?xml version="1.0"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<title>Newcum's Edmunds Headlines Feed</title>
<link>http://www.edmunds.com/news/regularnews/articles/</link>
<description>David Newcum's RSS Feed of Edmunds.com's automotive news headlines.  Contact rss_feeds\@paperlined.org for change requests.</description>
<language>en-us</language>
EOF

foreach my $key (@chrono_sorted) {
    my $data = $persistent->{$key};

    # W3CDTF timestamp; the zone suffix is computed for the article's own
    # timestamp so it stays right across a daylight-saving change.
    my $rss_date = POSIX::strftime("%Y-%m-%dT%H:%M", localtime($data->{FIRST_SEEN}))
                 . w3c_timezone($data->{FIRST_SEEN});

    my $headline    = xml_escape($data->{TITLE});
    my $link        = xml_escape($data->{URL});
    my $description = xml_escape($data->{DESCRIPTION});

    print {$rss_fh} <<"EOF";
<item>
<title>$headline</title>
<link>$link</link>
<description>$description</description>
<dc:creator>edmunds.com</dc:creator>
<dc:date>$rss_date</dc:date>
</item>
EOF
}

print {$rss_fh} "</channel>\n</rss>\n";

# Buffered write errors (e.g. a full disk) only surface at close.
close $rss_fh
    or die "Unable to finish writing edmunds_headlines.rss: $!";

# Write the updated persistent perl data out
Storable::nstore($persistent, $persistent_filename)
    or die "Unable to write to $persistent_filename: $!";


###################################
# Helpers
###################################

# Fetch one article page and build its record.
#   $article_url   - absolute URL of the article
#   $article_title - headline text taken from the index page
# Returns a hashref with URL, TITLE, FIRST_SEEN and DESCRIPTION.
# Dies if the page cannot be fetched or the story text cannot be found.
sub fetch_article {
    my ($article_url, $article_title) = @_;

    my %article = (
        URL        => $article_url,
        TITLE      => $article_title,
        FIRST_SEEN => time(),
    );

    sleep(2);    # throttle requests so we stay polite to edmunds.com
    my $article_page = LWP::Simple::get($article_url)
        or die "Unable to retrieve $article_url $!\n";
    print length($article_page), " bytes fetched from $article_url\n";

    # FIXME(review): the HTML markers that bracketed the story text were lost
    # from this pattern and cannot be inferred.  As written it always matches
    # the empty string, so DESCRIPTION starts out empty and the die below is
    # unreachable.  Restore the original delimiters around "(.*?)".
    if ($article_page =~ m#(.*?)#si) {
        $article{DESCRIPTION} = $1;
    }
    else {
        die "Unable to find necessary information.  Page format likely changed.\n\t";
    }

    # FIXME(review): everything between the leading ".*" of this pattern and
    # the trailing ") . DESCRIPTION" was lost.  Going by the header comment
    # ("large photo when available"), this step prepends photo markup to the
    # description; the <img> pattern and the prepended markup are guesses.
    if ($article_page =~ m#.*<img\s[^>]*src="([^"]+)"#si) {
        $article{DESCRIPTION} = qq(<img src="$1"><br>\n) . $article{DESCRIPTION};
    }

    return \%article;
}

# Return the local UTC offset in effect at $epoch as a W3CDTF zone designator,
# i.e. sign plus two-digit hours and minutes such as "+05:30" or "-05:00".
sub w3c_timezone {
    my ($epoch) = @_;

    my $offset  = Time::Zone::tz_local_offset($epoch);
    my $sign    = $offset < 0 ? '-' : '+';
    my $minutes = int(abs($offset) / 60);

    return sprintf('%s%02d:%02d', $sign, int($minutes / 60), $minutes % 60);
}

# Escape text for inclusion in XML.  Numeric character references are used
# because named HTML entities such as &eacute; are not defined in XML.
sub xml_escape {
    my ($text) = @_;
    return HTML::Entities::encode_entities_numeric($text);
}