package URLMetadata; use CGI::Util; use URI; use URI::QueryParam; use Data::Dumper; use strict; use warnings; # in some cases, doing site-specific redirector-stripping isn't enough, because there are actually # two or more redirectors chained together!! Strip these off. # # TODO: move this to a more common module for making URLs canonical? sub strip_extra_redirectors { my $url = shift; if ($url =~ m#refer\.ccbill\.com/cgi-bin/clicks\.cgi# && $url =~ s#^.*?HTML=##) { $url =~ s#=.*##; return strip_extra_redirectors(CGI::Util::unescape($url)); } elsif ($url =~ m#bertonicash\.com/tgp\.php#) { $url =~ s#^.*?HTML=##; $url =~ s#=.*##; return strip_extra_redirectors(CGI::Util::unescape($url)); } return $url; } BEGIN { my %strip_all_queries = map {$_,1} qw[ g.wasteland.com galleries.extremebound.com ]; # bdsmtraffic.com # fhg.masteracash.com # galleries.extremebound.com # promo.limitedaudience.com # bondageforte.org # bondagenation.com # ]; my %strip_one_param = ( "promo.gordcash.com" => "a", "fhg.masteracash.com" => "id", "kinky-katinka.com" => "id", "promo.limitedaudience.com" => "a", "bondageforte.org" => "aid", "bondagenation.com" => "PA", "bongum-video.com" => "ID", "fetishshots.com" => "id", "ratherextreme.com" => "id", "roperookie.com" => "nats", "trashygallery.com" => "nats", "platinumfetish.com" => "ccbill", "pics.maniacpass.com" => "id", "bdsmtraffic.com" => "webmaster", "pornaccess.com" => "adv_id", "freemeatwhores.com" => "aff", "maxcash.com" => "webm_id", "assspankingvideos.com" => "id", "redassteens.com" => "r", "fistinglessons.net" => "r", "fistinglessons.com" => "r", "misspain.com" => "r", "miss-krista.com" => "id", "femdomchronicles.com" => "id", "bbraxaporn.com" => "r", "sadoslaves.com" => "r", "maniacpass.com" => "pt", "rubberdollies.com" => "id", "shinyangels.com" => "id", "slutgalz.com" => "nats", "spookycash.com" => "id", "straponangels.com" => "id", ); my @always_strip = qw[ nats ]; # TODO: move this to a more common module for making URLs canonical? sub strip_affiliate_id { my $url = shift; my $uri = URI->new($url); my $host = $uri->host; foreach my $always (@always_strip) { $uri->query_param_delete( $always ); } my $h = $host; while ($h =~ m#\.#s) { if ($strip_all_queries{$h}) { $url =~ s#\?.*##s; return $url; } if ($strip_one_param{$h}) { # delete this one parameter $uri->query_param_delete( $strip_one_param{$h} ); return $uri->as_string; } $h =~ s#^[^\.]*\.##s; } return $url; } } BEGIN { my %strip_final_file = ( 'bdsmtraffic.com' => '^/galleries/[^/]*/index\.php$', 'g.wasteland.com' => '^/[^/]*/index\.html$', 'galleries.extremebound.com' => '^/[^/]*/index\.html$', 'galleries.knotnice.com' => '^(?:/[^/]*){3}/index\.html$', 'kinky-katinka.com' => '^(?:/[^/]*){2}/index\.php$', 'bongum-video.com' => '^(?:/[^/]*){2}/index\.php$', 'christinacaptured.com' => '^(?:/[^/]*){3}/index\.html$', 'fetishshots.com' => '^(?:/[^/]*){2}/index\.php$', 'ratherextreme.com' => '^(?:/[^/]*){2}/index\.php$', 'platinumfetish.com' => '^(?:/[^/]*){4}/index\.php$', 'freemeatwhores.com' => '^(?:/[^/]*){3}/index\.php$', 'bdsmtraffic.com' => '^(?:/[^/]*){2,3}/index\.php$', 'galleries.bucks2go.com' => '^(?:/[^/]*){3}/index\.html$', 'assspankingvideos.com' => '^(?:/[^/]*){2}/index\.php$', 'latexangelic.com' => '^(?:/[^/]*){2}/fcindex\.html$', 'orientalrope.com' => '/index\d+\.html$', ); # Don't put sites here if they always force-redirect back to the longer version of the name. my %strip_domain_prefix = map {$_,1} qw[ www.bondageforte.org www.bondagenation.com www.bondageorgies.com www.bongum-video.com www.damseldiary.com www.fetishshots.com www.fragilehostage.com www.natalisbondagesluts.com www.platinumfetish.com www.rarebdsm.com www.ratherextreme.com www.roperookie.com www.wildpornreviews.com www.freemeatwhores.com www.latexbyanna.com www.shinyangels.com www.straponangels.com www.xxx69.net ]; # the site force-redirects to the longer one anyway, might as well go with that... my %add_domain_prefix = ( 'christinacaptured.net' => 'www.christinacaptured.net', ); my %other_modification = ( # not sure what the extra commas are for, but they don't seem to be required... 'bondagebank.com' => sub {s#(gallhit\.php)\?\d+,(\d+),\d+,\d+,\d+$#$1?1,$2#si}, ); # TODO: move this to a more common module for making URLs canonical? sub make_canonical { my $url = shift; my $uri = URI->new($url)->canonical; my $h = $uri->host; my $p = $uri->path; #print "path: $p\n"; exit; while ($h =~ m#\.#s) { if ($strip_final_file{$h} && $p =~ m#$strip_final_file{$h}#si) { $p =~ s#/[^/]+$#/#si; $uri->path($p); } if ($strip_domain_prefix{$h}) { $h =~ s#^[^\.]*\.##si; $uri->host($h); next; } if ($add_domain_prefix{$h}) { $uri->host($add_domain_prefix{$h}); } if ($other_modification{$h}) { local $_ = $uri->as_string; $other_modification{$h}->(); $uri = URI->new($_); } $h =~ s#^[^\.]*\.##s; } return $uri->as_string; } } BEGIN { my %domain_hosted_for_another = ( # NOTE: these are LIST-refs, not HASH-refs, though they end up being functionaly # equivalent to hashrefs 'fhg.masteracash.com' => [ qr#^/slavesinlove/#i => 'slavesinlove.com', ], 'bdsmi.com' => [ qr#^/hosted/painfiles/#i => 'thepainfiles.com', ], 'fragilehostage.com' => [ qr#galleries/ssm/#i => 'societysm.com', qr#/galleries/ps/#i => 'perfectslave.com', ], 'bondagebank.com' => [ qr#\?100080,#i => 'chantasbitches.com', ], 'bdsmtraffic.com' => [ # actually, this is an obfuscated redirector qr#^/kk/#i => 'kinky-katinka.com', qr#/as\d#i => 'amateurslave.com', qr#/ss\d#i => 'slavesluts.com', ], 'platinumfetish.com' => [ qr#^/amateurbondagevideos/#i => 'amateurbondagevideos.com', qr#/purespanking/#i => 'purespanking.com', qr#/painfreaks/#i => 'painfreaks.com', qr#/amateursmothering/#i => 'amateursmothering.com', ], 'ddfcash.com' => [ qr#^(?:/PROMO/|/affiliate/freepages/)hot/#i => 'houseoftaboo.com', ], 'galleryhost.com' => [ qr#/her1stanal/#i => 'her1stanal.com', ], 'natalisbondagesluts.com' => [ qr#^/galleries/ss/#i => 'smotheredslave.com', qr#^/galleries/bo/#i => 'bondageorgasms.com', ], 'freemeatwhores.com' => [ qr#^/g/Gagging-Whores\b#i => 'gaggingwhores.com', ], 'damseldiary.com' => [ qr#^/free/ssm/#i => 'societysm.com', qr#^/free/bo/#i => 'bondageorgasms.com', qr#^/free/ps/#i => 'perfectslave.com', ], 'spankbuxx.com' => [ qr#^/hosted/SSG/#i => 'spankedschoolgirl.com', qr#^/hosted/spvids/#i => 'spankingvids.com', qr#^/hosted/sol/#i => 'spankingonline.com', ], 'fhg.fuckingfree.net' => [ qr#^/fhg/violentanime/#i => 'violentanime.com', qr#^/fhg/violentcomix/#i => 'violentcomix.com', ], 'massivegalleries.com' => [ qr#^/vids/rr/#i => 'rarebondage.com', ], 'bondageorgies.com' => [ qr#^/pjwarchive/#i => 'pjwarchive.com', ], 'galleries.do-bill.com' => [ qr#^/bdsmartwork/#i => 'bdsmartwork.com', ], 'fetishshots.com' => [ qr#^/fdl/#i => 'femdomloft.com', qr#^/tt/#i => 'tightlytied.com', qr#^/tw/#i => 'tramplingworld.com', qr#^/nd/#i => 'nylondivas.com', qr#^/ld/#i => 'lesbiandommes.com', qr#^/hhd/#i => 'highheeldivas.com', ], 'rarebdsm.com' => [ qr#^/galleries/ball_gagged/#i => 'ball-gagged.com', qr#^/galleries/shackled/#i => 'shackledmaidens.com', ], 'galleries.fetishsphere.com' => [ qr#^/hustler-taboo/#i => 'hustlerstaboo.com', qr#^/infernal-restraints/#i => 'infernalrestraints.com', qr#^/hot/#i => 'houseoftaboo.com', qr#^/hard-tied/#i => 'hardtied.com', qr#^/rope-affairs/#i => 'ropeaffairs.com', qr#^/charlotte/#i => 'charlottefetish.com', qr#^/sweetties/#i => 'sweetties.com', qr#^/satin-steel/#i => 'satinsteel.com', qr#^/bloodangels/#i => 'bloodangels.com', qr#^/bondage-maidens/#i => 'bondagemaidens.com', qr#^/pupett/#i => 'pupett.com', qr#^/maniac-pass/#i => 'maniacpass.com', qr#^/alterpic/#i => 'alterpic.com', ], 'spookycash.com' => [ qr#^/hgma/#i => 'michelle-aston.com', qr#^/hgsc13/#i => 'scar13.com', qr#^/hgeb/#i => 'eroticbpm.net', ], 'fetishgera.com' => [ qr#^/mega\d+#i => 'megapenetrations.com', qr#^/gap\d+#i => 'clubgape.deluxepass.com', ], 'bdsmbook.com' => [ qr#/gsg/#i => 'girlspanksgirl.com', qr#/ffg/#i => 'femdomfetishgirls.com', ], 'eropain.com' => [ qr#^/\w/\d+shp-#i => 'shockingpain.com', qr#^/\w/\d+ep-#i => 'extrapain.com', qr#^/\w/\d+-#i => 'painriser.com', ], 'bondagepoint.com' => [ qr#^/(?:maledom|bdsm)/#i => 'societysm.com', qr#^/(?:domination)/#i => 'uckingdungeon.com', ], # just markers for the fact that the thumbnails from these places need to be sorted out 'bucks2go.com' => [ ], 'adult-empire.com' => [ ], 'pornaccess.com' => [ ], 'risegalleries.com' => [ ], ); # same as above... ostensibly an array, but acts like a hash my @universal_pattern = ( qr#ricksavage#i => 'ricksavage.com', qr#Hell-Fire-Sex#i => 'hellfiresex.com', qr#specialexercises#i => 'specialexercises.com', qr#\bmeatholes\b#i => 'meatholes.com', qr#amateurbondagevideos#i => 'amateurbondagevideos.com', qr#cumbots#i => 'cumbots.com', ); my %strip_domain_prefix = map {$_,1} qw[ sadoslaves.com gagfuck.com frenchfisting.com hogtied.com maxxandmore.com meninpain.com devicebondage.com miss-krista.com sexandsubmission.com bondagehere.com fuckingmachines.com knotnice.com limitedaudience.com thetrainingofo.com wasteland.com waterbondage.com whippedass.com wiredpussy.com extremebound.com sadisticbondage.com naturebondage.com hellfiresex.com femdomchronicles.com brutaldildos.com maniacpass.com bossbitches2.com ]; my %entire_domain_is_for_another = ( 'fistinglessons.com' => 'fistinglessons.net', 'maxhardcoreporn.com' => 'maxhardcoreporn.net', 'bondagenation.com' => 'fetishnation.com', 'fhgp.maxcash.com' => 'fetishhotel.com', 'bound-and-gagged.org' => 'extremebound.com', 'medical-fetish-videos.com' => 'doctortushy.com', 'promo.gordcash.com' => 'nakedgord.com', 'lady-sonia-galleries.com' => 'lady-sonia.com', 'orientalrope.com' => 'thepainfiles.com', 'aerobdsm.com' => 'societysm.com', 'chantacash.com' => 'chantasbitches.com', ); my %per_page_key = ( 'promo.bucks2go.com' => { getkey => sub {(split m#/#, $_)[1]}, # the first in the sequence of nubmers domains => { 'slavecontest.com' => {map{$_,1}qw[ 113 247 255 ]}, 'shockers.com' => {map{$_,1}qw[ 267 268 ]}, }, }, 'bondagebank.com' => { getkey => sub {(split m#[,]#)[1]}, domains => { 'chantasbitches.com' => {map{$_,1}qw[ 71 88 560 566 568 574 580 ]}, 'fuckedandbound.com' => {map{$_,1}qw[ 119 133 286 542 554 558 562 564 572 576 578 582 ]}, }, }, ); sub domain_hosted_for_another { my $url = shift; #print Dumper \%per_page_key; exit; my $uri = URI->new($url); my $h = $uri->host; my $p = $uri->path_query; my $matched_hosting_house = 0; #print "path: $p\n"; exit; while ($h =~ m#\.#s) { if ($domain_hosted_for_another{$h}) { for (my $ctr=0; $ctr<@{$domain_hosted_for_another{$h}}; $ctr+=2) { my $pattern = $domain_hosted_for_another{$h}[$ctr]; my $hosted_for = $domain_hosted_for_another{$h}[$ctr+1]; if (ref($pattern)) { return $hosted_for if ($p =~ $pattern); } } $matched_hosting_house++; } if ($strip_domain_prefix{$h}) { return $h; } if ($entire_domain_is_for_another{$h}) { return $entire_domain_is_for_another{$h}; } if ($per_page_key{$h}) { local $_ = $p; my $key = $per_page_key{$h}{getkey}->($p); #print "Got key: $key from $url\n"; foreach my $checking_domain (keys %{$per_page_key{$h}{domains}}) { my $matching_keys = $per_page_key{$h}{domains}{$checking_domain}; return $checking_domain if ($matching_keys->{$key}); } #while (my ($var, $val) = each %{$per_page_key{$h}{domains}}) { # return $var if ($val->{$key}); # print "key($key) didn't match against: ", Dumper $val; #} print "key($key) didn't match against anything in '$h'.
\n"; } if ($h eq 'tgp.pornaccess.com' && $p =~ m#^/galleries/([^/]*)/#i) { return "$1.pornaccess.com"; # hard coded, just because this doesn't seem to be an infrequently used site } $h =~ s#^[^\.]*\.##s; } for (my $ctr=0; $ctr<@universal_pattern; $ctr+=2) { my $pattern = $universal_pattern[$ctr]; my $hosted_for = $universal_pattern[$ctr+1]; #print "Matching '$p' against '$pattern'\n"; if (ref($pattern)) { return $hosted_for if ($p =~ $pattern); } } return ($uri->host . "(hosting house, unknown) ") if ($matched_hosting_house); return $uri->host; } } BEGIN { my %most_domains_URLs_bad = map{$_,1} ( "rathbondage.com", # original owner seems to have sold it to someone else, who just uses it for SEO junk now "pimpcash.com", # most links I've seen have been dead... ); sub is_mostly_bad_domain { my $h = URI->new(shift)->host; while ($h =~ m#\.#s) { return 1 if $most_domains_URLs_bad{$h}; $h =~ s#^[^\.]*\.##s; } } } BEGIN { my $kink_com = qr#^/g/\d+/\d+/v/#i; my %domain_movie_urls = ( 'vipspanking.com' => [ qr#^/wm/mg/#i, ], 'promo.waterbondage.com' => [$kink_com], 'promo.sexandsubmission.com' => [$kink_com], 'promo.waterbondage.com' => [$kink_com], 'promo.whippedass.com' => [$kink_com], 'promo.wiredpussy.com' => [$kink_com], 'promo.ultimatesurrender.com' => [$kink_com], 'promo.hogtied.com' => [$kink_com], 'promo.fuckingmachines.com' => [$kink_com], 'bdsmtraffic.com' => [ qr#^/galleries/movies/#i, ], 'electricityplay.com' => [ qr#^/gallery/[^/]*/mgp#i, ], 'mgp.bossbitches2.com' => [ qr#.# ], ); sub is_TGP_movie_page { my $uri = URI->new(shift); my $h = $uri->host; my $p = $uri->path_query; while ($h =~ m#\.#s) { if ($domain_movie_urls{$h}) { foreach my $regexp (@{$domain_movie_urls{$h}}) { return 1 if ($p =~ $regexp); } } $h =~ s#^[^\.]*\.##s; } } } 1;