package Porn; use strict; use warnings; use Carp; use HTML::TokeParser; use URI; # Given an HTML page, returns a list of , with each entry in the form of: # [$html, $a_href, $img_src] # where $a_href and $img_src are a hashref of the attributes in the and respectively # (using the rules of HTML::TokeParser... eg. all attribute names are in lowercase), # and $html is just a string of the entire unparsed "" # (however, beware that the unparsed string doesn't have any of its relative URLs expanded out to absolute URLs like the parsed version does) # # You REALLY want to use this routine instead of re-implementing it yourself. Reasons: # 1. This automatically converts relative URLs to absolute # 2. It spots s and properly updates relative URLs using the new base URL sub parse_linked_images { my $html = shift; my $base_url = shift or carp "Base URL not specified in call to parse_linked_images().\n\t"; my @linkedimages; # tags that implicitely close/break a my %tags_break = map {$_,1} qw[ a td tr table div ]; my $p = HTML::TokeParser->new(\$html) or die "Can't open: $!"; my $accumulated_text = ""; my $a_href; my $img; while (my $token = $p->get_token) { # Might this be the end of a ... link? if (($token->[0] eq 'S' || $token->[0] eq 'E') && $tags_break{$token->[1]}) { if ($a_href && $img) { # if we've gotten a full set of data $accumulated_text .= $token->[-1] if ($token->[1] eq 'a'); # make any relative URLs into absolute ones $a_href->{href} = URI->new_abs( $a_href->{href}, $base_url )->as_string(); $img->{src} = URI->new_abs( $img->{src}, $base_url )->as_string(); push(@linkedimages, [$accumulated_text, $a_href, $img]); } # in either case, drop the data acummulated so far $a_href = undef; $img = undef; $accumulated_text = ""; } # have we run across the or ? if (!$a_href && $token->[0] eq 'S' && $token->[1] eq 'a') { $a_href = $token->[2]; } elsif ($a_href && !$img && $token->[0] eq 'S' && $token->[1] eq 'img') { $img = $token->[2]; } # accumulate the unparsed text if needed if ($a_href) { $accumulated_text .= ($token->[0] eq 'T' ? $token->[1] : $token->[-1]); } # if we run across a (yes, it happens, see ashleysgalleries.com), then reset the $base_url if ($token->[0] eq 'S' && $token->[1] eq 'base' && $token->[2]{href}) { $base_url = $token->[2]{href}; } } return @linkedimages; } 1;