http://paperlined.org/rss/feeds/fuckedcompany.gen.pl
#! /usr/bin/env perl
BEGIN{$^W=1} use strict;
# This feed:
# http://trainedmonkey.com/news/rss.php?s=52
# doesn't include the info below each story, eg:
# When: 11/24/2003
# Company: Tri-Chem Corp
# Severity: 35
# Points: 135
# So we'll screen-scrape the root html and get this info
use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';
use Compress::Zlib;
use Date::Parse;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Time::Local;
use Time::Zone;
#############################################
# Fetch and Scrape
#############################################
# Download the homepage.  LWP::Simple::get() signals failure by returning
# undef and does not set $!, so $! is deliberately left out of the message.
my $page = LWP::Simple::get("http://www.fuckedcompany.com/");
die "Unable to load the fuckedcompany homepage"
    unless defined $page and length $page;

# The body may arrive compressed.  Try gzip framing when its magic bytes are
# present, then a bare zlib stream.  If neither decodes, the body is assumed
# to be plain HTML and is used exactly as fetched.
my $inflated;
if (substr($page, 0, 2) eq "\x1f\x8b") {
    my $gzipped = $page;    # work on a copy: memGunzip may clobber its argument
    $inflated = Compress::Zlib::memGunzip($gzipped);
}
$inflated = Compress::Zlib::uncompress($page) unless defined $inflated;
if (defined $inflated and length $inflated) {
    $page = $inflated;
}

# Drop carriage returns so the later patterns only have to deal with "\n".
$page =~ tr/\r//d;
print "Fetched ", length($page), " bytes from fuckedcompany.com\n";
# Zero or more HTML tags, with optional whitespace (space, tab, newline) and
# an optional non-breaking space (the &nbsp; entity or a raw 0xA0 byte)
# before, between and after them.
# Note that this specifically excludes any plaintext from being matched.
# The string is interpolated into an /x pattern below, so any literal
# whitespace inside it would be ignored by the regex engine.
my $arbitrary_tags = '\s*(?:&nbsp;|\xA0)?\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*\s*>\s*(?:&nbsp;|\xA0)?\s*)*';

# One story = headline span, article span, then the link to its comment page.
# Compiled once; matching is case-insensitive and "." spans newlines.
my $story_re = qr!
    <span\s+class="headline">${arbitrary_tags}
    ([^<]*)                                     # headline
    ${arbitrary_tags}<span\s+class="article">
    (.*?)                                       # article
    <a\s+href="(http://comments\.fuckedcompany\.com/phpcomments/index\.php\?[^"]+)"   # link
!xsi;

# Each entry is a hash ref with TITLE, BODY (raw HTML) and LINK keys.
my @stories;
while ($page =~ /$story_re/g) {
    # Copy the captures right away so nothing later can clobber them.
    my ($title, $body, $link) = ($1, $2, $3);
    push @stories, {
        TITLE => $title,
        BODY  => $body,
        LINK  => $link,
    };
    print "Parsed story #", scalar(@stories), " (", $title, ")\n";
}
#############################################
# Output RSS file
#############################################
# Write the scraped stories out as an RSS 1.0 (RDF) document.
my $rss_path = "/home/interiot/www/rss/feeds/fuckedcompany.rss";
open(my $rss, '>', $rss_path)
    or die "Unable to write to $rss_path: $!";

# NOTE(review): the RSS 1.0 specification expects <channel> to carry an
# <items> list (rdf:Seq) naming every item; none is emitted here -- confirm
# whether the consuming readers need it.
print $rss <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
>
<channel rdf:about="http://paperlined.org/rss/feeds/fuckedcompany.rss">
<title>Newcum's FuckedCompany Feed</title>
<link>http://www.fuckedcompany.com/</link>
<description>David Newcum's RSS Feed of FuckedCompany.com. Contact rss_feeds\@paperlined.org for change requests.</description>
<language>en-us</language>
</channel>
EOF

foreach my $story (@stories) {
    # Tidy the scraped HTML: put a blank line before the "When:" details,
    # drop target=_blank attributes, and strip leading whitespace per line.
    my $html = $story->{BODY};
    $html =~ s/<br>\s*When: /<br><br>\nWhen: /;
    $html =~ s/\starget=_blank//sg;
    $html =~ s/^\s+//mg;

    # Escape with numeric character references only.  Plain encode_entities()
    # produces HTML named entities (e.g. &eacute;) which are not defined in
    # XML and would make the feed ill-formed.
    my $body  = HTML::Entities::encode_entities_numeric($html);
    my $link  = HTML::Entities::encode_entities_numeric($story->{LINK});
    my $title = HTML::Entities::encode_entities_numeric($story->{TITLE});

    print $rss <<"EOF";
<item rdf:about="$link">
<title>$title</title>
<link>$link</link>
<description>$body</description>
</item>
EOF
}

print $rss "</rdf:RDF>\n";

# Buffered write errors (disk full, quota) only surface at close time.
close($rss) or die "Unable to finish writing $rss_path: $!";
Generated by GNU enscript 1.6.4.