# Source: http://paperlined.org/rss/feeds/fuckedcompany.gen.pl

#! /usr/bin/env perl
BEGIN{$^W=1}  use strict;

# This feed:
# 	http://trainedmonkey.com/news/rss.php?s=52
# doesn't include the info below each story, eg:
# 	 When: 11/24/2003
# 	 Company: Tri-Chem Corp
# 	 Severity: 35
# 	 Points: 135
# So we'll screen-scrape the root html and get this info


use lib '/home/interiot/src/pl/modules/';
use lib '/home/interiot/perllib/lib/perl5/site_perl/5.6.1/i686-linux/';

use Compress::Zlib;
use Date::Parse;
use HTML::Entities;
use LWP::Simple 'get';
use POSIX;
use Time::Local;
use Time::Zone;


#############################################
# Fetch and Scrape
#############################################
# Download the site's front page into $page.
# LWP::Simple::get() returns undef on any failure and does not set $!, so the
# error message deliberately carries no errno text.
my $page = LWP::Simple::get("http://www.fuckedcompany.com/")
	or die "Unable to load the fuckedcompany homepage (LWP::Simple::get returned nothing)";

# The body may arrive compressed.  Try to inflate it, and keep the raw bytes
# if that fails (i.e. the server sent plain HTML).
my $inflated;
if (substr($page, 0, 2) eq "\x1f\x8b") {
	# gzip magic number: uncompress() only understands zlib streams, so a
	# gzip container has to go through memGunzip() instead.
	my $gzip_copy = $page;		# memGunzip is documented as destroying its input buffer
	$inflated = Compress::Zlib::memGunzip($gzip_copy);
}
else {
	$inflated = Compress::Zlib::uncompress($page);
}
if ($inflated) {			# In case they stop compressing it at some point in the future...
	$page = $inflated;
}

# Normalise line endings so the scraping patterns only ever see "\n".
$page =~ s/\r//gs;

print "Fetched ", length($page), " bytes from fuckedcompany.com\n";

# Pattern fragment matching zero or more HTML tags, with optional whitespace
# (space, tab, newline) and at most one &nbsp; before the first tag and after
# each tag.  It deliberately cannot match any other plain text.
# Kept as a string (not qr//) so it inherits the flags of the enclosing match.
my $arbitrary_tags = '\s*(?:&nbsp;)?\s*(?:<\S+(?:\s+\S+=(?:"[^"]*"|\'[^\']*\'|[^\s>]*))*\s*>\s*(?:&nbsp;)?\s*)*';

# One hashref per story found on the page: TITLE (headline text), BODY
# (article HTML up to the comments link) and LINK (the comments URL).
my @stories;

# Scalar-context /g match: each iteration resumes where the previous story ended.
while ($page =~ m!
		<span\s+class="headline">${arbitrary_tags}
			([^<]*)		# headline
		${arbitrary_tags}<span\s+class="article">
		(.*?)		# article
		<a\s+href="(http://comments\.fuckedcompany\.com/phpcomments/index\.php\?[^"]+)"	# link
	!xsig)
{
	# Copy the captures immediately; the substitution below resets $1..$3.
	my ($headline, $article, $comments_url) = ($1, $2, $3);

	# The greedy [^<]* capture swallows any whitespace/newlines that precede
	# the next tag, so trim it off the end of the headline.
	$headline =~ s/\s+\z//;

	push(@stories, {
		TITLE		=> $headline,
		BODY		=> $article,
		LINK		=> $comments_url,
	});

	print "Parsed story #", scalar(@stories), " (", $headline, ")\n";
}



#############################################
# Output RSS file
#############################################
# Write the scraped stories to the feed file as an RSS 1.0 (RDF) document.
#
# Every story field is prepared before the file is opened, so that the
# channel's <items> table of contents (required by RSS 1.0; it lists each
# item's rdf:about URI) can be written ahead of the items themselves.
#
# Escaping uses numeric character references only: named HTML entities such
# as &eacute; are not defined in XML and would make the feed ill-formed.
# Dies if the file cannot be opened or if the final close reports an error.
my $rss_path = '/home/interiot/www/rss/feeds/fuckedcompany.rss';

my @items;
foreach my $story (@stories)
{
	# The article stays HTML: tidy it, then escape it exactly once so it
	# travels as escaped markup inside <description>.
	my $body = $story->{BODY};
	$body =~ s/<br>\s*When: /<br><br>\nWhen: /;	# blank line before the "When:" details
	$body =~ s/\starget=_blank//sg;
	$body =~ s/^\s+//mg;				# strip leading whitespace on every line

	# Headline and URL were lifted out of HTML source, so decode any entities
	# first; otherwise an "&amp;" in the page would be escaped a second time.
	my $title = decode_entities($story->{TITLE});
	my $link  = decode_entities($story->{LINK});

	my $xml_title = HTML::Entities::encode_entities_numeric($title);
	my $xml_link  = HTML::Entities::encode_entities_numeric($link);
	my $xml_body  = HTML::Entities::encode_entities_numeric($body);

	push(@items, {
		TITLE	=> $xml_title,
		LINK	=> $xml_link,
		BODY	=> $xml_body,
	});
}

open(my $rss_fh, '>', $rss_path)
	or die "Unable to write to $rss_path: $!";

print $rss_fh <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>

<rdf:RDF
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns="http://purl.org/rss/1.0/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
>
        <channel rdf:about="http://paperlined.org/rss/feeds/fuckedcompany.rss">
                <title>Newcum's FuckedCompany Feed</title>
                <link>http://www.fuckedcompany.com/</link>
                <description>David Newcum's RSS Feed of FuckedCompany.com.  Contact rss_feeds\@paperlined.org for change requests.</description>
                <language>en-us</language>
                <items>
                        <rdf:Seq>
EOF

# Table of contents: one entry per item, matching that item's rdf:about.
foreach my $item (@items)
{
	print $rss_fh qq{                                <rdf:li rdf:resource="$item->{LINK}"/>\n};
}

print $rss_fh <<"EOF";
                        </rdf:Seq>
                </items>
        </channel>
EOF

foreach my $item (@items)
{
	print $rss_fh <<"EOF";
	<item rdf:about="$item->{LINK}">
		<title>$item->{TITLE}</title>
		<link>$item->{LINK}</link>
		<description>$item->{BODY}</description>
	</item>
EOF
}
print $rss_fh "</rdf:RDF>\n";

# Buffered write errors (disk full, quota) only surface at close time.
close($rss_fh)
	or die "Unable to finish writing $rss_path: $!";

# Generated by GNU enscript 1.6.4.