Newcum's FuckedCompany Feed

# Purpose: screen-scrape the fuckedcompany.com homepage and write an RSS file to /home/interiot/www/rss/feeds/fuckedcompany.rss.
# Steps visible in the text below:
#   1. Fetch http://www.fuckedcompany.com/ with LWP::Simple::get; die if nothing comes back.
#   2. Try Compress::Zlib::uncompress on the page and keep the result only if it is true; strip "\r"; print the byte count.
#   3. Repeatedly match the page against a pattern built from $arbitrary_tags, collecting one hash per story (TITLE, BODY, LINK) in @stories.
#   4. Build a local-timezone offset string from Time::Zone::tz_local_offset().
#   5. Print a channel header, then for each story clean up BODY, entity-encode title/link/body, and print them to the RSS handle.
# NOTE(review): this copy appears to have lost its original line breaks and its embedded HTML/XML markup -- TODO restore from the original before running:
#   - the m!...! story pattern has no visible closing delimiter or modifiers, and LINK => $3 is used although only two capture groups are visible;
#   - the heredocs and the "When:" substitution contain no tags, so the emitted text would not be well-formed RSS as shown.
# NOTE(review): $timezone and $about are assigned but never referenced in the visible text; presumably they were interpolated into the stripped markup -- confirm.
# TODO(review): once restored, consider three-arg open with a lexical handle and a checked close; also note $! is probably not meaningful after LWP::Simple::get fails.
#! /usr/bin/env perl BEGIN{$^W=1} use strict; # This feed: # http://trainedmonkey.com/news/rss.php?s=52 # doesn't include the info below each story, eg: # When: 11/24/2003 # Company: Tri-Chem Corp # Severity: 35 # Points: 135 # So we'll screen-scrape the root html and get this info use lib '/home/interiot/src/pl/modules/'; use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/'; use Compress::Zlib; use Date::Parse; use HTML::Entities; use LWP::Simple 'get'; use POSIX; use Time::Local; use Time::Zone; ############################################# # Fetch and Scrape ############################################# my $page = LWP::Simple::get("http://www.fuckedcompany.com/") or die "Unable to load the fuckedcompany homepage: $!"; my $ungzipped = Compress::Zlib::uncompress($page); if ($ungzipped) { # In case they stop gzipping it at some point in the future... $page = $ungzipped; } $page =~ s/\r//gs; print "Fetched ", length($page), " bytes from fuckedcompany.com\n"; # Zero or more HTML tags, with zero or more whitespace characters (space, tab, newline, , etc) between them, or at the beginning or the end. # Note that this specifically excludes any plaintext from being matched by this. my $arbitrary_tags = '\s*(?: )?\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*\s*>\s*(?: )?\s*)*'; my @stories; while ($page =~ m! ${arbitrary_tags} ([^<]*) # headline ${arbitrary_tags} (.*?) 
# article $1, BODY => $2, LINK => $3, }); print "Parsed story #", scalar(@stories), " (", $1, ")\n"; } ############################################# # Output RSS file ############################################# my $tz_offset = Time::Zone::tz_local_offset(); my $timezone = sprintf("%d:%02d", $tz_offset/3600, ($tz_offset/60)% 60); $timezone = "+$timezone" if ($tz_offset >= 0); open RSS, ">/home/interiot/www/rss/feeds/fuckedcompany.rss" or die "Unable to write to /home/interiot/www/rss/feeds/fuckedcompany.rss: $!"; print RSS <<"EOF"; Newcum's FuckedCompany Feed http://www.fuckedcompany.com/ David Newcum's RSS Feed of FuckedCompany.com. Contact rss_feeds\@paperlined.org for change requests. en-us EOF foreach my $story (@stories) { $story->{BODY} =~ s/
\s*When: /

\nWhen: /; $story->{BODY} =~ s/\starget=_blank//sg; $story->{BODY} =~ s/^\s+//mg; my $body = encode_entities($story->{BODY}); my $about = $story->{LINK}; $about =~ s/&/|/g; my $link = encode_entities($story->{LINK}); my $title = encode_entities($story->{TITLE}); print RSS <<"EOF"; $title $link $body EOF } print RSS "\n";