http://paperlined.org/dev/src/pl/porn/evermine/evermine.pl
#!/usr/bin/perl
# A small framework for updating data about individual TGP porn pages on specific sites.
#
# The output of this script is a database (hash) of:
# $url => $metadata,
# where $metadata describes each image that appears on that TGP page. The database also serves as
# an easy way to remove duplicate TGP entries (at least, based only on canonical URL).
#
# Modules for site-specific code resides in EM_*.pm
#
# See the end of this file for more documentation on the interface with each EM_*.pm file.
# TODO:
# -
use strict;
use warnings;
use lib '/home/interiot/cpan/lib/';
use Data::Dumper;
#use SOAP::Lite +trace; BEGIN { open(STDERR, '>>soap.trace'); }
#use SOAP::Lite +trace => [qw(transport)];
use SOAP::Lite
on_fault => sub {die Dumper \@_};
#use SOAP::Data;
use DB_File;
use Getopt::Long;
use Storable;
#use LWP::Simple;
use LWP::UserAgent;
use HTML::TokeParser;
use List::Util qw[min max];
use URI;
use Carp;
### --- Set up the DB_File database ---
sub URL_invert { # reverse the hostname
my $url = shift;
$url =~ s#^http://([^/]*)# "http://" . join("", reverse(split /(\.)/, $1)) #ie or die;
#print "$url\n";
return $url;
}
sub DB_compare {
return URL_invert($_[0]) cmp URL_invert($_[1]);
}
my %TGPpages;
$DB_BTREE->{'compare'} = \&DB_compare ;
our $tgppages_db = tie %TGPpages, 'DB_File', 'tgppages.db', O_RDWR|O_CREAT, 0666, $DB_BTREE
or die "Unable to open database.\n\t";
### --- Load in the EM_*.pm modules ---
my @module_filenames = glob "EM_*.pm";
my $mod_loading_problems;
foreach my $modfile (@module_filenames) {
#print "Loading module $modfile\n";
do $modfile;
die $@ if $@;
(my $pkg = $modfile) =~ s/\.pm$//i;
$::{"${pkg}::"} or die "Failure in loading $modfile: Package $pkg doesn't exist after loading.\n";
if ($::{"${pkg}::"}{metadata}) { # do some double-checking on the metadata if it exists (some modules may be helper modules, and thus aren't required to implement metadata at all
(my $dm = $pkg) =~ s/^EM_//;
my $metadata = domain_metadata($dm);
foreach my $requiredvar (qw[domain query]) {
next if (exists $metadata->{$requiredvar});
warn "Warning: Metadata in $modfile needs to include the '$requiredvar' variable.\n";
$mod_loading_problems++;
}
}
}
exit if ($mod_loading_problems);
### --- Process the command-line ---
my %cmdln = process_command_line();
## Do --clear
if ($cmdln{clear}) {
foreach my $domain (@{$cmdln{clear}}) {
print "-"x20, " --clear $domain\n";
my $domain_metadata = domain_metadata($domain) or next;
my @urls = list_domain($domain_metadata);
foreach my $url (@urls) {
delete $TGPpages{$url};
}
}
}
## Do --search
our $google_service;
if ($cmdln{search}) {
if (@{$cmdln{search}}==1 && $cmdln{search}[0] eq '') {
# no sitename specified... find all those that have no URL yet
my @sites = map {s/^EM_(.*)::/$1/; $1} grep {$::{$_}{metadata}} grep /^EM_.*::/, keys %::;
$cmdln{search} = [];
foreach my $site (@sites) {
push(@{$cmdln{search}}, $site)
unless (list_domain(domain_metadata($site)));
}
}
my $google_key = do{local*FIN; open FIN, "<$ENV{HOME}/.google.key" or die $!;<FIN>};
chomp $google_key;
$google_service = SOAP::Lite->service('http://api.google.com/GoogleSearch.wsdl');
foreach my $domain (@{$cmdln{search}}) {
print "-"x20, " --search $domain\n";
my $metadata = domain_metadata($domain) or next;
my @results = google_lots_of_results(
$google_key,
$metadata->{query}, # query
0, # first
10, # count
0, # filter (hide similar results)
"", # restrict to a specific country or the like http://code.google.com/apis/soapsearch/reference.html#2_4
0, # enable SafeSearch filtering
"", # language restrict
"", # input encoding
"", # output encoding
$metadata->{maxSearchResults} || 1000); # max # results
local *canonical = $::{"EM_${domain}::"}{canonicalURL};
foreach my $result (@results) {
my $cURL = &canonical($result->{URL})
or next;
if (!exists $TGPpages{$cURL}) {
print "Found new $cURL\n";
$TGPpages{$cURL} = Storable::freeze({});
}
}
}
}
## Do --thumbs or --reparse
# fetch the HTML of each TGP page, and get as much information out of them as we
# can (at a minimum, the list of thumbnail URLs and associated full-size URLs)
if (defined($cmdln{thumbs}) || defined($cmdln{reparse})) {
$cmdln{thumbs} = $cmdln{reparse} if (defined($cmdln{reparse}));
if (@{$cmdln{thumbs}} == 1 && $cmdln{thumbs}[0] eq '') {
# the user specified --thumbs without any argument... so add all sites to the list
$cmdln{thumbs} = [map {s/^EM_(.*)::/$1/; $1} grep /^EM_.*::/, keys %::];
}
#LWP::Simple::_init_ua();
#$LWP::Simple::ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3");
my $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3");
foreach my $domain (@{$cmdln{thumbs}}) {
#local *thumb_detail = $::{"EM_${domain}::"}{thumb_detail};
#next unless (&thumb_detail);
next unless $::{"EM_${domain}::"}{thumb_detail};
print "-"x20, defined($cmdln{reparse}) ? " --reparse $domain\n" : " --thumbs $domain\n";
my $metadata = domain_metadata($domain) or next;
my @urls = list_domain($metadata);
foreach my $url (@urls) {
my $db_data = Storable::thaw($TGPpages{$url}) || {};
if (! keys %$db_data || defined($cmdln{reparse})) {
print "Fetching details for $url\n";
my $html;
my $last_fetched = time();
if (!defined($cmdln{reparse})) {
my $res = $ua->request(HTTP::Request->new(GET => $url));
if (!$res->is_success) {
print STDERR "FAILED getting $url via HTTP:\n\t", $res->status_line, "\n";
next;
}
$html = $res->content;
} else {
my $data = Storable::thaw($TGPpages{$url});
$html = $data->{html};
$last_fetched = $data->{last_fetched};
}
local *thumb_detail = $::{"EM_${domain}::"}{thumb_detail};
$db_data = { &thumb_detail($html, $url) }; # the routine returns a (listified) hash, and we turn it into a hashref
$db_data->{html} = $html;
$db_data->{last_fetched} = $last_fetched; # store this info in case the --reparse option needs it
if (exists $db_data->{delete}) { # the parser may realize at this point that this particular page isn't a TGP page
delete $TGPpages{$url};
print " deleted\n";
next;
}
$TGPpages{$url} = Storable::freeze($db_data);
}
}
}
}
## Do --tidy
if (defined($cmdln{tidy})) {
# we can do some stuff ourselves without any help... if there are any TGP pages that link to
# the EXACT same set of fullURLs, then it's clear the pages are duplicates.
die "Not implemented yet";
}
## Do --dump
if ($cmdln{dump}) {
#my $strength = min(1, scalar(grep /^$/, @{$cmdln{dump}}));
my $strength = scalar(@{$cmdln{dump}});
my @domains = grep /./, @{$cmdln{dump}};
#print Dumper($strength, $cmdln{dump}); exit;
if ($strength==1 && !@domains) { # --dump
print join("\n", keys %TGPpages), "\n";
} elsif (!@domains) { # --dump --dump
while (my ($url, $data) = each %TGPpages) {
print "--------------------------- $url ------------------------\n";
print Dumper(Storable::thaw($data));
}
} else { # --dump <sitename>
foreach my $domain (@domains) {
my $domain_metadata = domain_metadata($domain) or next;
print "-"x20, " --dump $domain\n";
my @urls = list_domain($domain_metadata);
if ($strength==2) { # --dump <sitename> --dump
# sort by the number of thumbnails
my %num_images = map {$_ => scalar(@{Storable::thaw($TGPpages{$_})->{images}})} @urls;
@urls = sort {$num_images{$a} <=> $num_images{$b}} @urls;
}
foreach my $url (@urls) {
my $data = Storable::thaw($TGPpages{$url});
if ($strength==1) {
print "$url\n";
} elsif ($strength==2) { # --dump <sitename> --dump
(my $first = $data->{images}[0]{fullURL} || "") =~ s#^.*/##;
(my $last = $data->{images}[-1]{fullURL} || "") =~ s#^.*/##;
printf "%4d %60s",
scalar(@{$data->{images}}),
$url;
if ($data->{images}[0] && $data->{images}[0]{fullURL} && $data->{images}[0]{fullURL} =~ /\S/) {
printf "%30s to %s\n",
$first,
$last
} else {
print "\n";
}
} elsif ($strength==3) { # --dump <sitename> --dump --dump
print "--------------------------- $url ------------------------\n";
delete $data->{html};
print Dumper($data) ;
}
}
}
}
}
# Based on something specified on the command-line, checks whether an EM_*.pm file has been defined yet.
sub domain_metadata {
my $domain = shift;
if ($::{"EM_${domain}::"}) {
if ($::{"EM_${domain}::"}{metadata}) {
local *sym = $::{"EM_${domain}::"}{metadata};
return &sym() if (&sym);
} else {
print STDERR "${domain}::metadata() not defined by 'EM_$domain.pm'\n";
}
} else {
print STDERR "Unable to find domain '$domain' (which should be defined by file 'EM_$domain.pm').\n";
}
return undef;
}
# Just return true/false based on whether the metadata for a given module says it owns the given
#
# in case we need to make this more complicated in the future, we'll wrap this step up
sub is_url_in_domain {
my $url = shift;
my $domain_or_metadata = shift;
my $desired_domain = (ref($domain_or_metadata) eq 'HASH') ? $domain_or_metadata->{domain} : $domain_or_metadata;
if ($url =~ m#^http://([^/]*)/#) {
my $this_domain = $1;
return 1 if ($desired_domain eq $this_domain || $this_domain =~ /\.$desired_domain$/);
}
}
# Given something like "domain.com", return a list of all URLs (keys) in the database for that
# domain. If given "domain.com", will return things from "www.domain.com" and "mail.domain.com",
# etc. If given "subdomain.domain.com", will return "www.subdomain.domain.com", etc... but not "www.domain.com".
#
# If metadata is given instead, and it specifies further criteria to narrow down the URLs, the list will be filtered further.
# (or may even include other demains, if we decide to go that way in the future)
sub list_domain {
my $domain_or_metadata = shift;
my $domain = (ref($domain_or_metadata) eq 'HASH') ? $domain_or_metadata->{domain} : $domain_or_metadata;
my @list;
my $key = "http://$domain/";
my $value;
$tgppages_db->seq($key, $value, R_CURSOR);
#print "returned $key\n";
#push(@list, $key) if (is_url_in_domain($key, $domain_or_metadata));
#while ($tgppages_db->seq($key, $value, R_NEXT) == 0) {
while ($tgppages_db->seq($key, $value, R_PREV) == 0) {
#print "returned $key\n";
last unless (is_url_in_domain($key, $domain_or_metadata));
unshift(@list, $key);
}
# TODO: figure out what further URL filters are required, and apply them here if (defined($metadata));
return @list;
}
sub process_command_line {
# allow single-letters to be bundled
Getopt::Long::Configure(qw(bundling no_getopt_compat));
my $show_syntax;
my %cmdln;
GetOptions(
\%cmdln,
"search:s@",
"thumbs|thumb:s@",
"reparse:s@",
"tidy:s@",
"clear=s@",
"dump:s@",
) or $show_syntax++;
$show_syntax++ unless (defined($cmdln{search}) || defined($cmdln{thumbs}) || defined($cmdln{reparse}) || defined($cmdln{tidy}) || $cmdln{clear} || $cmdln{dump});
if ($show_syntax) {
(my $zero = $0) =~ s#^.*/##;
print <<"EOF";
Options:
--search <sitename> (where sitename is the * in EM_*.pm)
Fetches all the URLs from Google for the given site, uniques them, and adds
them (without any metadata) into the database.
Given no sitename, this updates all sites that have no URLs yet.
--thumbs [<sitename>]
Given a sitename, fetches each of the pages that have no detailed data yet, and stores the
detail about the individual images found on that page.
Given no sitename, this updates *all* pages that have no detail yet, for all sites.
--reparse [<sitename>]
Just like --thumbs, except that it uses the cached HTML, and therefore runs faster and doesn't
risk causing us to be IP-blocked.
--tidy [<sitename>]
While duplicate TGP pages are weeded out in the --search step based purely on canonical URL,
once we've fetched the thumbs data, we may be able to remove further
duplicates by comparing data across pages. This goes through and does that.
Given no sitename, this tides the data for *all* sites.
--clear <sitename>
Removes all data for a given site (--search doesn't clear the data for a given site before
updating, so if the old URLs are invalid, this is needed first).
(if you need to remove the data for all sites, simply delete the database file)
--dump [<sitename>] (can be used multiple times to dump more detailed data)
Otuput the contents of the database, for debugging purposes.
EOF
exit;
}
#print Dumper \%cmdln; exit;
return %cmdln;
}
sub google_lots_of_results {
my $max_count = pop(@_);
my @args = @_;
my $max_count_per_query = 10;
my @all_results;
for (my $start=0; $start<$max_count; $start+=$max_count_per_query) {
$args[2] = $start;
my $results = $google_service->doGoogleSearch(@args) or die $!;
push(@all_results, @{$results->{resultElements}});
# stop if we don't receive the max results (probably indicating we've hit the end)
last if (scalar(@{$results->{resultElements}}) < $max_count_per_query);
}
return @all_results;
}
################################################################################################################################
################################################################################################################################
########## Helper functions for modules to use #################################################################################
################################################################################################################################
################################################################################################################################
# Given an HTML page, returns a list of <a href><img src></a>, with each entry in the form of:
# [$html, $a_href, $img_src]
# where $a_href and $img_src are a hashref of the attributes in the <a> and <img> respectively # (using the rules of HTML::TokeParser... eg. all attribute names are in lowercase),
# and $html is just a string of the entire unparsed "<a href...><img ...></a>"
# (however, beware that the unparsed string doesn't have any of its relative URLs expanded out to absolute URLs like the parsed version does)
#
# You REALLY want to use this routine instead of re-implementing it yourself. Reasons:
# 1. This automatically converts relative URLs to absolute
# 2. It spots <base href...>s and properly updates relative URLs using the new base URL
sub parse_linked_images {
my $html = shift;
my $base_url = shift or carp "Base URL not specified in call to parse_linked_images().\n\t";
my @linkedimages;
# tags that implicitely close/break a <a>
my %tags_break = map {$_,1} qw[
a td tr table div
];
my $p = HTML::TokeParser->new(\$html) or die "Can't open: $!";
my $accumulated_text = "";
my $a_href;
my $img;
while (my $token = $p->get_token) {
# Might this be the end of a <a>...</a> link?
if (($token->[0] eq 'S' || $token->[0] eq 'E') && $tags_break{$token->[1]}) {
if ($a_href && $img) { # if we've gotten a full set of data
$accumulated_text .= $token->[-1] if ($token->[1] eq 'a');
# make any relative URLs into absolute ones
$a_href->{href} = URI->new_abs( $a_href->{href}, $base_url )->as_string();
$img->{src} = URI->new_abs( $img->{src}, $base_url )->as_string();
push(@linkedimages, [$accumulated_text, $a_href, $img]);
}
# in either case, drop the data acummulated so far
$a_href = undef;
$img = undef;
$accumulated_text = "";
}
# have we run across the <a> or <img>?
if (!$a_href && $token->[0] eq 'S' && $token->[1] eq 'a') {
$a_href = $token->[2];
} elsif ($a_href && !$img && $token->[0] eq 'S' && $token->[1] eq 'img') {
$img = $token->[2];
}
# accumulate the unparsed text if needed
if ($a_href) {
$accumulated_text .= ($token->[0] eq 'T' ? $token->[1] : $token->[-1]);
}
# if we run across a <base href> (yes, it happens, see ashleysgalleries.com), then reset the $base_url
if ($token->[0] eq 'S' && $token->[1] eq 'base') {
$base_url = $token->[2]{href};
}
}
return @linkedimages;
}
# given the HTML for a page, returns the same thing as parse_linked_images() does, but
# it's filtered to include only the links that appear to match a numerical pattern of <a href> links
# to *.jpg files.
sub list_mostprevalent_JPG_pattern {
my $html = shift;
my $base_url = shift or carp "Base URL not specified in call to list_mostprevalent_JPG_pattern().\n\t";
my @linkedimages = parse_linked_images($html, $base_url);
my %uniq;
foreach my $li (@linkedimages) {
next unless ($li->[1]{href} =~ /\.(?:jpg|jpeg|mpg|mpeg|wmv|asf)$/i);
my $url = quotemeta($li->[1]{href});
$url =~ s/\d+/\\d+/g;
#print "-- $url\n";
$uniq{$url}++;
}
my $pattern;
my $max = 0;
while (my ($k, $v) = each %uniq) {
next unless ($v > $max);
$max = $v;
$pattern = $k;
}
my $p = $pattern;
$p =~ s/\\d\+/000/g;
$p =~ s/\\//g;
print " found $max links that match pattern $p\n";
return grep {$_->[1]{href} =~ /^$pattern$/} @linkedimages;
}
# change data in the form that parse_linked_images() outputs, to the form that thumb_detail() is expected to output
# in its $ret{image} section
sub linkedimagelist_to_thumbdetail {
my $linkedimages = shift;
my @ret;
foreach my $li (@$linkedimages) {
my %i = (
fullURL => $li->[1]{href},
thumbURL => $li->[2]{src},
);
$i{thumbWidth} = $li->[2]{width} if ($li->[2]{width});
$i{thumbHeight} = $li->[2]{height} if ($li->[2]{height});
push(@ret, \%i);
}
return \@ret;
}
__END__
============================================== Interface with EM_*.pm files ==============================================
metadata()
Must return a static data-structure with a specific set of fields. This is just data that
various bits of main-loop code needs to get its work done.
canonicalURL() see http://en.wikipedia.org/wiki/Canonical_form
Given a URL on the site, return the canonical version of the URL. The main purpose of this is
to identify duplicate URLs, since they should map to the same canonical URL, so that in the
--thumbs phase, we don't have to fetch the same page twice.
Its secondary purpose is to identify which pages on the site are TGP thumb pages and which
aren't... pages that aren't should have undef returned for them.
thumb_detail()
Given the HTML contents of a page, return the list of
Generated by GNU enscript 1.6.4.