package Porn;
use strict;
use warnings;
use Carp;
use HTML::TokeParser;
use URI;
# Given an HTML page, returns a list of
, with each entry in the form of:
# [$html, $a_href, $img_src]
# where $a_href and $img_src are a hashref of the attributes in the and
respectively # (using the rules of HTML::TokeParser... eg. all attribute names are in lowercase),
# and $html is just a string of the entire unparsed "
"
# (however, beware that the unparsed string doesn't have any of its relative URLs expanded out to absolute URLs like the parsed version does)
#
# You REALLY want to use this routine instead of re-implementing it yourself. Reasons:
# 1. This automatically converts relative URLs to absolute
# 2. It spots s and properly updates relative URLs using the new base URL
sub parse_linked_images {
my $html = shift;
my $base_url = shift or carp "Base URL not specified in call to parse_linked_images().\n\t";
my @linkedimages;
# tags that implicitely close/break a
my %tags_break = map {$_,1} qw[
a td tr table div
];
my $p = HTML::TokeParser->new(\$html) or die "Can't open: $!";
my $accumulated_text = "";
my $a_href;
my $img;
while (my $token = $p->get_token) {
# Might this be the end of a ... link?
if (($token->[0] eq 'S' || $token->[0] eq 'E') && $tags_break{$token->[1]}) {
if ($a_href && $img) { # if we've gotten a full set of data
$accumulated_text .= $token->[-1] if ($token->[1] eq 'a');
# make any relative URLs into absolute ones
$a_href->{href} = URI->new_abs( $a_href->{href}, $base_url )->as_string();
$img->{src} = URI->new_abs( $img->{src}, $base_url )->as_string();
push(@linkedimages, [$accumulated_text, $a_href, $img]);
}
# in either case, drop the data acummulated so far
$a_href = undef;
$img = undef;
$accumulated_text = "";
}
# have we run across the or
?
if (!$a_href && $token->[0] eq 'S' && $token->[1] eq 'a') {
$a_href = $token->[2];
} elsif ($a_href && !$img && $token->[0] eq 'S' && $token->[1] eq 'img') {
$img = $token->[2];
}
# accumulate the unparsed text if needed
if ($a_href) {
$accumulated_text .= ($token->[0] eq 'T' ? $token->[1] : $token->[-1]);
}
# if we run across a (yes, it happens, see ashleysgalleries.com), then reset the $base_url
if ($token->[0] eq 'S' && $token->[1] eq 'base' && $token->[2]{href}) {
$base_url = $token->[2]{href};
}
}
return @linkedimages;
}
1;