package URLMetadata; # Functions that operate using site-specific information. use Carp; use CGI::Util; use URI; use URI::QueryParam; use Data::Dumper; use strict; use warnings; # in some cases, doing site-specific redirector-stripping isn't enough, because there are actually # two or more redirectors chained together!! Strip these off. # # TODO: move this to a more common module for making URLs canonical? sub strip_extra_redirectors { my $url = shift; if ($url =~ m#refer\.ccbill\.com/cgi-bin/clicks\.cgi# && $url =~ s#^.*?HTML=##) { $url =~ s#=.*##; return strip_extra_redirectors(CGI::Util::unescape($url)); } elsif ($url =~ m#bertonicash\.com/tgp\.php#) { $url =~ s#^.*?HTML=##; $url =~ s#=.*##; return strip_extra_redirectors(CGI::Util::unescape($url)); } return $url; } BEGIN { my %strip_all_queries = map {$_,1} qw[ g.wasteland.com galleries.extremebound.com ]; # bdsmtraffic.com # fhg.masteracash.com # galleries.extremebound.com # promo.limitedaudience.com # bondageforte.org # bondagenation.com # ]; my %strip_one_param = ( "promo.gordcash.com" => "a", "fhg.masteracash.com" => "id", "kinky-katinka.com" => "id", "promo.limitedaudience.com" => "a", "bondageforte.org" => "aid", "bondagenation.com" => "PA", "bongum-video.com" => "ID", "fetishshots.com" => "id", "ratherextreme.com" => "id", "roperookie.com" => "nats", "trashygallery.com" => "nats", "platinumfetish.com" => "ccbill", "pics.maniacpass.com" => "id", "bdsmtraffic.com" => "webmaster", "pornaccess.com" => "adv_id", "freemeatwhores.com" => "aff", "maxcash.com" => "webm_id", "assspankingvideos.com" => "id", "redassteens.com" => "r", "fistinglessons.net" => "r", "fistinglessons.com" => "r", "misspain.com" => "r", "miss-krista.com" => "id", "femdomchronicles.com" => "id", "bbraxaporn.com" => "r", "sadoslaves.com" => "r", "maniacpass.com" => "pt", "rubberdollies.com" => "id", "shinyangels.com" => "id", "slutgalz.com" => "nats", "spookycash.com" => "id", "straponangels.com" => "id", "shockingbdsm.com" => "id", "ebony-bondage.org" => "aff", "gothicsluts.com" => "id", "ddfcash.com" => "nats", "susanwayland.com" => "PA", "silversdollars.com" => "affcode", "cherrysweetpics.com" => "ID", "exclusivebdsm.com" => "id", "freewhorelinks.com" => "aff", "meatgalleries.com" => "aff", "freewhoregalleries.com" => "aff", ); my @always_strip = qw[ nats ]; # TODO: move this to a more common module for making URLs canonical? sub strip_affiliate_id { my $url = shift; my $uri = URI->new($url); ref($uri) eq "URI::http" or confess "Not http: $url\n\t"; my $host = $uri->host; foreach my $always (@always_strip) { $uri->query_param_delete( $always ); } my $h = $host; while ($h =~ m#\.#s) { if ($strip_all_queries{$h}) { $url =~ s#\?.*##s; return $url; } if ($strip_one_param{$h}) { # delete this one parameter $uri->query_param_delete( $strip_one_param{$h} ); return $uri->as_string; } $h =~ s#^[^\.]*\.##s; } return $url; } } BEGIN { my %strip_final_file = ( 'bdsmtraffic.com' => '^/galleries/[^/]*/index\.php$', 'g.wasteland.com' => '^/[^/]*/index\.html$', 'galleries.extremebound.com' => '^/[^/]*/index\.html$', 'galleries.knotnice.com' => '^(?:/[^/]*){3}/index\.html$', 'kinky-katinka.com' => '^(?:/[^/]*){2}/index\.php$', 'bongum-video.com' => '^(?:/[^/]*){2}/index\.php$', 'christinacaptured.com' => '^(?:/[^/]*){3}/index\.html$', 'fetishshots.com' => '^(?:/[^/]*){2}/index\.php$', 'ratherextreme.com' => '^(?:/[^/]*){2}/index\.php$', 'platinumfetish.com' => '^(?:/[^/]*){4}/index\.php$', 'freemeatwhores.com' => '^(?:/[^/]*){3}/index\.php$', 'bdsmtraffic.com' => '^(?:/[^/]*){2,3}/index\.php$', 'galleries.bucks2go.com' => '^(?:/[^/]*){3}/index\.html$', 'assspankingvideos.com' => '^(?:/[^/]*){2}/index\.php$', 'latexangelic.com' => '^(?:/[^/]*){2}/fcindex\.html$', 'orientalrope.com' => '/index\d+\.html$', 'shockingbdsm.com' => '^(?:/[^/]*){2}/index\.php$', 'susanwayland.com' => '^(?:/[^/]*){2}/index\.php$', 'bdsmdirect.com' => '^/[^/]*/index\d+\.html$', 'galleries.xrblogger.com' => '.', 'checksexmovie.com' => '.', 'bondagehotporn.com' => '/index.html$', 'tusya.com' => '/index.html$', ); # Don't put sites here if they always force-redirect back to the longer version of the name. my %strip_domain_prefix = map {$_,1} qw[ www.bondageforte.org www.bondagenation.com www.bondageorgies.com www.bongum-video.com www.damseldiary.com www.fetishshots.com www.fragilehostage.com www.natalisbondagesluts.com www.platinumfetish.com www.rarebdsm.com www.ratherextreme.com www.roperookie.com www.wildpornreviews.com www.freemeatwhores.com www.latexbyanna.com www.shinyangels.com www.straponangels.com www.xxx69.net www.shockingbdsm.com www.evildesire.com www.susanwayland.com www.bondagehotporn.com www.kinky-katinka.com ]; # the site force-redirects to the longer one anyway, might as well go with that... my %add_domain_prefix = ( 'christinacaptured.net' => 'www.christinacaptured.net', ); my $kink_com = sub {s#http://([^/]*)/([gm])/[^/]+/(\d+)/\w/\d+/h.*#http://$1/$2/1/$3/m/1/h#}; my %other_modification = ( # not sure what the extra commas are for, but they don't seem to be required... 'bondagebank.com' => sub {s#(gallhit\.php)\?\d+,(\d+),(\d+,)*\d+$#$1?1,$2#si}, 'evildesire.com' => sub {s#^(http://evildesire\.com/hosted/\d+/).*#$1#}, 'promo.devicebondage.com' => $kink_com, 'promo.fuckingmachines.com' => $kink_com, 'promo.hogtied.com' => $kink_com, 'promo.meninpain.com' => $kink_com, 'promo.sexandsubmission.com' => $kink_com, 'promo.thetrainingofo.com' => $kink_com, 'promo.ultimatesurrender.com' => $kink_com, 'promo.waterbondage.com' => $kink_com, 'promo.whippedass.com' => $kink_com, 'promo.wiredpussy.com' => $kink_com, 'promo.tsseduction.com' => $kink_com, 'bondageforte.org' => sub {s#/bondageforte\.org/#/bondageforte.com/#s;}, 'galleries.adult-empire.com' => sub {s#(adult-empire.com/\d+/\d+)/\d+/#$1/1/#}, # XXXCumCash sites 'natalisbondagesluts.com' => sub {s#(natalisbondagesluts\.com/galleries/[^/]*/[^/]*/[^/]*)/\d+$#$1/1#}, 'damseldiary.com' => sub {s#(damseldiary\.com/free/[^/]*/[^/]*/[^/]*)/\d+$#$1/1#}, 'promo.bucks2go.com' => sub {s#(promo\.bucks2go\.com/[^/]*)/\d+/\d+#$1/1/1#}, ); # TODO: move this to a more common module for making URLs canonical? sub make_canonical { my $url = shift; my $uri = URI->new($url)->canonical; my $h = $uri->host; my $p = $uri->path; #print "path: $p\n"; exit; while ($h =~ m#\.#s) { if ($strip_final_file{$h} && $p =~ m#$strip_final_file{$h}#si) { $p =~ s#/[^/]+$#/#si; $uri->path($p); } if ($strip_domain_prefix{$h}) { $h =~ s#^[^\.]*\.##si; $uri->host($h); next; } if ($add_domain_prefix{$h}) { $uri->host($add_domain_prefix{$h}); } if ($other_modification{$h}) { local $_ = $uri->as_string; $other_modification{$h}->(); $uri = URI->new($_); } $h =~ s#^[^\.]*\.##s; } return $uri->as_string; } } BEGIN { my %domain_hosted_for_another = ( # NOTE: these are LIST-refs, not HASH-refs, though they end up being functionaly # equivalent to hashrefs 'fhg.masteracash.com' => [ qr#^/slavesinlove/#i => 'slavesinlove.com', ], 'bdsmi.com' => [ qr#^/hosted/painfiles/#i => 'thepainfiles.com', ], 'fragilehostage.com' => [ qr#galleries/ssm/#i => 'societysm.com', qr#/galleries/ps/#i => 'perfectslave.com', ], 'bdsmtraffic.com' => [ # actually, this is an obfuscated redirector qr#^/kk/#i => 'kinky-katinka.com', qr#/as\d#i => 'amateurslave.com', qr#/ss\d#i => 'slavesluts.com', ], 'ddfcash.com' => [ qr#^(?:/PROMO/|/affiliate/freepages/)hot/#i => 'houseoftaboo.com', ], 'galleryhost.com' => [ qr#/her1stanal/#i => 'her1stanal.com', ], 'natalisbondagesluts.com' => [ qr#^/galleries/ss/#i => 'smotheredslave.com', qr#^/galleries/bo/#i => 'bondageorgasms.com', ], 'freemeatwhores.com' => [ qr#^//?(?:g|galleries)/Gagging-Whores\b#i => 'gaggingwhores.com', ], 'damseldiary.com' => [ qr#^/free/ssm/#i => 'societysm.com', qr#^/free/bo/#i => 'bondageorgasms.com', qr#^/free/ps/#i => 'perfectslave.com', ], 'spankbuxx.com' => [ qr#^/hosted/SSG/#i => 'spankedschoolgirl.com', qr#^/hosted/spvids/#i => 'spankingvids.com', qr#^/hosted/sol/#i => 'spankingonline.com', ], 'fhg.fuckingfree.net' => [ qr#^/fhg/violentanime/#i => 'violentanime.com', qr#^/fhg/violentcomix/#i => 'violentcomix.com', ], 'massivegalleries.com' => [ qr#^/vids/rr/#i => 'rarebondage.com', ], 'bondageorgies.com' => [ qr#^/pjwarchive/#i => 'pjwarchive.com', ], 'galleries.do-bill.com' => [ qr#^/bdsmartwork/#i => 'bdsmartwork.com', ], 'fetishshots.com' => [ qr#^/fdl/#i => 'femdomloft.com', qr#^/tt/#i => 'tightlytied.com', qr#^/tw/#i => 'tramplingworld.com', qr#^/nd/#i => 'nylondivas.com', qr#^/ld/#i => 'lesbiandommes.com', qr#^/hhd/#i => 'highheeldivas.com', ], 'spookycash.com' => [ qr#^/hgma/#i => 'michelle-aston.com', qr#^/hgsc13/#i => 'scar13.com', qr#^/hgeb/#i => 'eroticbpm.net', ], 'bdsmbook.com' => [ qr#/gsg/#i => 'girlspanksgirl.com', qr#/ffg/#i => 'femdomfetishgirls.com', qr#/sas/#i => 'sexandsubmission.com', qr#/ir/#i => 'infernalrestraints.com', qr#/pg/#i => 'paingate.com', ], 'eropain.com' => [ qr#^/\w/(?:\w-)?\d+shp-#i => 'shockingpain.com', qr#^/\w/(?:\w-)?\d+ep-#i => 'extrapain.com', qr#^/\w/(?:\w-)?\d+-#i => 'painriser.com', ], 'bondagepoint.com' => [ qr#^/(?:maledom|bdsm)/#i => 'societysm.com', qr#^/(?:domination)/#i => 'fuckingdungeon.com', ], 'femdomfreeimages.com' => [ qr#^/aetgp/SpSvr\d+#i => 'spankingserver.com', ], 'carnaldungeon.com' => [ qr#^/galleries/fd/#i => 'fuckingdungeon.com', qr#^/galleries/str/#i => 'strictrestraint.com', ], 'cuckoldrix.com' => [ qr#^/host50/#i => 'lady-sonia.com', ], 'gothdream.net' => [ qr#^/fh[gs]/f\d+#i => 'guysgetfucked.com', qr#^/fh[gs]/h\d+#i => 'girlfriendhandjobs.com', qr#^/fh[gs]/z\d+#i => 'zoliboy.com', ], 'spankingmoney.com' => [ qr#/rs/#i => 'realspankings.com', ], 'bondagehotporn.com' => [ qr#^/bdsm_sex[/_]#i => 'sexandsubmission.com', qr#^/sex_and_submission/#i => 'sexandsubmission.com', qr#^/fucked_bound/#i => 'fuckedandbound.com', qr#^/machines?_#i => 'fuckingmachines.com', qr#^/fuck_machine_#i => 'fuckingmachines.com', qr#^/chantas_bitches/#i => 'chantasbitches.com', ], # just markers for the fact that the thumbnails from these places need to be sorted out 'bucks2go.com' => [ ], 'adult-empire.com' => [ ], 'pornaccess.com' => [ ], 'risegalleries.com' => [ ], 'sexcataclysm.com' => [ ], 'majesticfuck.com' => [ ], 'facesittingheaven.com' => [ ], 'spankingdollars.com' => [ ], # dedicated to "Girls Boarding School" or not? 'silversdollars.com' => [ ], 'porncabinet.com' => [ ], 'femdomfetishsites.com' => [ ], 'painwoman.com' => [ ], # paingate.com + whippedwomen.com? ); # same as above... ostensibly an array, but acts like a hash my @universal_pattern = ( qr#ricksavage#i => 'ricksavage.com', qr#Hell-Fire-Sex#i => 'hellfiresex.com', qr#specialexercises#i => 'specialexercises.com', qr#\bmeatholes\b#i => 'meatholes.com', qr#amateurbondagevideos#i => 'amateurbondagevideos.com', qr#cumbots#i => 'cumbots.com', ); my %strip_domain_prefix = map {$_,1} qw[ sadoslaves.com gagfuck.com frenchfisting.com hogtied.com maxxandmore.com meninpain.com devicebondage.com miss-krista.com sexandsubmission.com bondagehere.com fuckingmachines.com knotnice.com limitedaudience.com thetrainingofo.com wasteland.com waterbondage.com whippedass.com wiredpussy.com tsseduction.com extremebound.com sadisticbondage.com naturebondage.com hellfiresex.com femdomchronicles.com brutaldildos.com bossbitches2.com ebony-bondage.org ultimatesurrender.com bizarreadultclub.com exclusivebdsm.com kshara.com meatinsertions.com ]; my %entire_domain_is_for_another = ( 'fistinglessons.com' => 'fistinglessons.net', 'maxhardcoreporn.com' => 'maxhardcoreporn.net', 'bondagenation.com' => 'fetishnation.com', 'fhgp.maxcash.com' => 'fetishhotel.com', 'bound-and-gagged.org' => 'extremebound.com', 'medical-fetish-videos.com' => 'doctortushy.com', 'promo.gordcash.com' => 'nakedgord.com', 'lady-sonia-galleries.com' => 'lady-sonia.com', 'orientalrope.com' => 'thepainfiles.com', 'aerobdsm.com' => 'societysm.com', 'chantacash.com' => 'chantasbitches.com', 'bdsmresorts.com' => 'perfectslave.com', 'eroticshadows.com' => 'thepainfiles.com', 'smfiles.com' => 'thepainfiles.com', 'bdsmdirect.com' => 'thepainfiles.com', 'devianterotica.com' => 'xxxsuicide.com', 'abbysrealm.com' => 'abbys-realm.com', ); my %per_domain_key = ( 'pussypizza.com' => { getkey => sub {if (m#^/([a-z]+)\d+#i) {return $1}}, domains => { dd => 'deviantdavid.com', mb => 'meanbitches.com', md => 'meandungeon.com', }, }, 'femdomworld.com' => { getkey => sub {if ($_[1] =~ /^([^.]+)\.femdomworld\.com$/i) {return $1} }, domains => { russianmistress => 'russian-mistress.com', underfeet => 'under-feet.com', }, }, 'richporn.com' => { getkey => sub {(split m#/#)[1]}, domains => { 'backdoorlesbians' => 'backdoorlesbians.com', }, }, 'bdsmengine.com' => { getkey => sub {(split m#/#)[1]}, domains => { 'male_domination' => 'societysm.com', }, }, 'wanklinks.com' => { getkey => sub {if (m#^/([a-z]+)[0-9_]#i) {return $1}}, domains => { 'tyrannized' => 'tyrannized.com', }, }, 'galleries.xrblogger.com' => { getkey => sub {if (m#^/([a-z]+)[0-9]#i) {return $1}}, domains => { tgpc => 'spankingfear.com', tgp => 'spankingfear.com', }, }, 'tampapartygirl.com' => { getkey => sub {if (m#^/galleries/([^/]+)/#i) {return $1}}, domains => { 'electricityplay' => 'electricityplay.com', }, }, 'deviantteens.com' => { getkey => sub {if (m#^/galleries/(?:overbucks|xxxcumcash)/([^/]+)/#i) {return $1}}, domains => { 'goldenrain' => 'thegoldenrain.com', 'societysm' => 'societysm.com', }, }, 'eroticaxxx.com' => { getkey => sub {if (m#^/gal/([a-z]+)[0-9]#i) {return $1}}, domains => { nwl => 'wasteland.com', }, }, 'agebucks.com' => { getkey => sub {if (m#^/content/fhg/([^/]+)/#i) {return $1}}, domains => { 'painart.com' => 'painart.com', }, }, 'agebucks.com' => { getkey => sub {if (m#^/content/fhg/([^/]+)/#i) {return $1}}, domains => { 'painart.com' => 'painart.com', }, }, 'tusya.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { mature => 'myfriendshotmom.com', bondage => 'thetrainingofo.com', }, }, 'erotiscore.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { kumimonster => 'kumimonster.com', }, }, 'galleries.fetishsphere.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { 'alterpic' => 'alterpic.com', 'bloodangels' => 'bloodangels.com', 'bondage-maidens' => 'bondagemaidens.com', 'bound-by-bhowani' => 'boundbybhowani.com', 'charlotte' => 'charlottefetish.com', 'chilly-girls' => 'chillygirls.com', 'gwen-media' => 'gwenmedia.com', 'hard-tied' => 'hardtied.com', 'hell-fire' => 'hellfiresex.com', 'hot' => 'houseoftaboo.com', 'hustler-taboo' => 'hustlerstaboo.com', 'infernal-restraints' => 'infernalrestraints.com', 'kinky-katinka' => 'kinky-katinka.com', 'maniac-pass' => 'maniacpass.com', 'pupett' => 'pupett.com', 'rope-affairs' => 'ropeaffairs.com', 'rope-lover' => 'ropelover.com', 'saffys-secrets' => 'saffyssecrets.com', 'satin-steel' => 'satinsteel.com', 'sweetties' => 'sweetties.com', 'shadow-slaves' => 'shadowslaves.com', }, }, 'rarebdsm.com' => { getkey => sub {(split m#/#, $_)[2]}, domains => { 'ashley_renee' => 'ashleyrenee.com', 'ball_gagged' => 'ball-gagged.com', 'shackled' => 'shackledmaidens.com', 'painfiles' => 'thepainfiles.com', }, }, 'fetishgera.com' => { getkey => sub {if (m#^/([a-z]+)#i) {return $1}}, domains => { 'mega' => 'megapenetrations.com', 'gubba' => 'skinvideo.com', 'tlove' => 'teens-love-oldmen.com', 'gap' => 'clubgape.deluxepass.com', 'vag' => 'vaginalsucker.com', 'pissact' => 'pissinginaction.net', }, }, 'spankingarts.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { 'manp' => 'meninpain.com', 'whip' => 'whippedass.com', 'shame' => 'spankingshame.com', 'redasst' => 'redassteens.net', 'redassm' => 'redassmodels.com', 'cutie' => 'spanked-cutie.com', 'smb' => 'spankmybottom.com', 'gb' => 'girls-boarding-school.com', } }, 'kinky-photos.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { 'straponsissies' => 'straponsissies.com', 'ladiesfuckgents' => 'ladiesfuckgents.com', 'tyrannized' => 'tyrannized.com', 'lady-sonia' => 'lady-sonia.com', } }, 'mainbdsm.com' => { getkey => sub {if (m#^/\w/(?:\w-)?\d+([a-z]+)#i) {return $1}}, domains => { 'pmaid' => 'painmaiden.net', 'maid' => 'painmaiden.net', 'pain' => 'thepainfiles.com', 'painf' => 'thepainfiles.com', 'sad' => 'sadoslaves.com', 'rotvid' => 'nichepass.com', 'elec' => 'nichepass.com', 'rick' => 'ricksavage.com', } }, '1stbbw.com' => { getkey => sub {if (m#^/[^/]*/(?:\w-)?\d+([a-z]+)#i) {return $1}}, domains => { 'strap' => 'straponfuckers.com', }, }, 'bdsmpain.info' => { getkey => sub {if (m#^/[^/]*/(?:\w-)?\d+([a-z]+)#i) {return $1}}, domains => { 'ep' => 'extrapain.com', }, }, 'whipsandwelts.com' => { getkey => sub {if (m#^/([a-z]+)#i) {return $1}}, domains => { su => 'slutspanking.com', }, }, 'spankpain.com' => { getkey => sub {(split m#/#, $_)[3]}, domains => { 'spse' => 'spankingserver.com', } }, 'femdom-universe.com' => { #getkey => sub {m#^/galleries/# and return (split m#/#, $_)[2]}, getkey => sub {if (m#^/galleries/([^/]*/[^/]*)/#i) {return $1}}, domains => { 'lezdom/lesbiandommes' => 'lesbiandommes.com', 'lezdom/bloodangels' => 'bloodangels.com', }, }, 'facialrama.com' => { getkey => sub {m#^/amp/# and return (split m#/#, $_)[2]}, domains => { 'AsianRopes' => 'asianropes.com', }, }, 'cream-me.com' => { getkey => sub {if (m#^/rageca/([^_/]+)_#i) {return $1}}, domains => { 'spanker' => 'spankedandabused.com', }, }, 'spankingbookmarks.com' => { getkey => sub {m#^/gals/# and return (split m#/#, $_)[2]}, domains => { 'hard' => 'hardcaning.com', }, }, 'hollywoodsweeties.com' => { getkey => sub {if (m#^/Gal/CSub/([^/0-9]+?)Gal#i) {return $1}}, domains => { 'BBROS' => 'bangbros1.com', 'BKSM' => 'babesfight.com', }, }, 'painforall.com' => { getkey => sub {m#^/gals/as/# and return (split m#/#, $_)[3]}, domains => { 'mixt' => 'missxtreme.com', }, }, 'freewhoregalleries.com' => { #getkey => sub {m#^//?(?:g|galleries)/([^/]*)/# and return $1}, getkey => sub {m#^/g/# and return (split m#/#, $_)[2]}, domains => { 'Gagging-Whores' => 'gaggingwhores.com', }, }, 'msrcash.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { 'barukakane' => 'barukakane.com', }, }, 'marialovesporn.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { 'squirtbukkake' => 'squirtbukkake.com', 'gggsexbox' => 'gggsexbox.com', }, }, # google site: doesn't work here, unfortunately, so some of this might be a blind guess # (eg. what's the first parameter?) # http://www.bastinda.com/bondage/fuckedandbound291107/bondage_3493812518.html 'bastinda.com' => { getkey => sub {m#^/[^/]*/([a-z]+)#s && return $1}, domains => { 'violentcomix' => 'violentcomix.com', 'fuckedandbound' => 'fuckedandbound.com', }, }, 'bdsmvault.com' => { getkey => sub {m#^/images/([a-z])\d+([a-z])\d# && return "$1$2"}, domains => { bo => 'bondageorgasms.com', sm => 'societysm.com', fd => 'fuckingdungeon.com', ps => 'perfectslave.com', }, }, 'gigolomovies.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { 'young_dirty_sluts' => 'youngdirtysluts.com', 'dirty_anal_girls' => 'dirtyanalgirls.com', }, }, 'platinumfetish.com' => { getkey => sub {m#^/hosted_galleries\d*/([^/]+)/# && return "$1"}, domains => { 'amateurbondagevideos' => 'amateurbondagevideos.com', 'amateursmothering' => 'amateursmothering.com', 'femdominas' => 'femdominas.com', 'painfreaks' => 'painfreaks.com', 'painvixens' => 'painvixens.com', 'purespanking' => 'purespanking.com', 'ricksavage' => 'ricksavage.com', 'tokyoslaves' => 'tokyoslaves.com', }, }, 'painwoman.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { 'paingate' => 'paingate.com', }, }, ); my %per_page_key = ( 'promo.bucks2go.com' => { getkey => sub {(split m#/#, $_)[1]}, # the first in the sequence of nubmers domains => { 'slavecontest.com' => {map{$_,1}qw[ 74 113 246 247 255 ]}, 'shockers.com' => {map{$_,1}qw[ 151 240 267 268 ]}, }, }, 'bondagebank.com' => { getkey => sub {(split m#[,]#)[1]}, domains => { 'chantasbitches.com' => {map{$_,1}qw[ 25 39 41 60 71 77 80 82 87 88 219 268 273 461 485 489 552 556 560 566 568 574 580 ]}, 'fuckedandbound.com' => {map{$_,1}qw[ 119 129 133 134 258 282 286 298 328 521 542 554 558 562 564 572 573 576 578 582 586 ]}, 'captivemale.com' => {map{$_,1}qw[ 602 610 ]}, }, }, 'darksidebondage.com' => { getkey => sub {(split m#/#, $_)[2]}, domains => { 'wasteland.com' => {map{$_,1}qw[ painslut ]}, }, }, 'fetishhotel.ws' => { getkey => sub {(split m#/#, $_)[1]}, domains => { 'sadisticbondage.com' => {map{$_,1}qw[ fetish0400 ]}, 'sadisticsluts.com' => {map{$_,1}qw[ fetish0682 fetish0688 ]}, }, }, 'checksexmovie.com' => { getkey => sub {if (m#^/galls/([^/]+)/#i) {return $1}}, domains => { 'bangingmachines.com' => {map{$_,1}qw[ d05nov17 ]}, }, }, 'fetishmpg.com' => { getkey => sub {(split m#/#, $_)[1]}, domains => { 'latexxxgirls.com' => {map{$_,1}qw[ 06fetish12ow 08fetish56ye 08fetish57rd 08fetish58xq 08fetish60ko 08fetish62lg 08fetish63rq ]}, }, }, ); sub domain_hosted_for_another { my $url = shift; #print Dumper \%per_page_key; exit; my $uri = URI->new($url); my $h = $uri->host; my $p = $uri->path_query; my $matched_hosting_house = 0; #print "path: $p\n"; exit; while ($h =~ m#\.#s) { if ($domain_hosted_for_another{$h}) { for (my $ctr=0; $ctr<@{$domain_hosted_for_another{$h}}; $ctr+=2) { my $pattern = $domain_hosted_for_another{$h}[$ctr]; my $hosted_for = $domain_hosted_for_another{$h}[$ctr+1]; if (ref($pattern)) { return $hosted_for if ($p =~ $pattern); } } $matched_hosting_house++; } if ($strip_domain_prefix{$h}) { return $h; } if ($entire_domain_is_for_another{$h}) { return $entire_domain_is_for_another{$h}; } if ($per_domain_key{$h}) { local $_ = $p; my $key = $per_domain_key{$h}{getkey}->($p, $uri->host); if (!defined($key) || length($key)==0) { print "Unable to parse key from '", $uri->as_string, "'.
\n"; } elsif (exists $per_domain_key{$h}{domains}{$key}) { return $per_domain_key{$h}{domains}{$key}; } else { print "key($key) didn't match against anything in '$h'.
\n"; } $matched_hosting_house++; } if ($per_page_key{$h}) { local $_ = $p; my $key = $per_page_key{$h}{getkey}->($p); #print "Got key: $key from $url\n"; foreach my $checking_domain (keys %{$per_page_key{$h}{domains}}) { my $matching_keys = $per_page_key{$h}{domains}{$checking_domain}; return $checking_domain if ($matching_keys->{$key}); } #while (my ($var, $val) = each %{$per_page_key{$h}{domains}}) { # return $var if ($val->{$key}); # print "key($key) didn't match against: ", Dumper $val; #} print "key($key) didn't match against anything in '$h'.
\n"; $matched_hosting_house++; } # hard-coded patterns (special cases that are more complex) if ($h eq 'tgp.pornaccess.com' && $p =~ m#^/galleries/([^/]*)/#i) { return "$1.pornaccess.com"; # hard coded, just because this doesn't seem to be an infrequently used site } if ($h =~ m#^(pics|free|galls)\.maniacpass\.com$#i && $p =~ m#^/([^/]*)/#i) { # similar to the above return "$1.maniacpass.com"; } if ($h eq 'galleries.adult-empire.com' && $p =~ m#^/(\d+)/#i) { return "sites.adult-empire.com/$1/"; } $h =~ s#^[^\.]*\.##s; } for (my $ctr=0; $ctr<@universal_pattern; $ctr+=2) { my $pattern = $universal_pattern[$ctr]; my $hosted_for = $universal_pattern[$ctr+1]; #print "Matching '$p' against '$pattern'\n"; if (ref($pattern)) { return $hosted_for if ($p =~ $pattern); } } return ($uri->host . "(hosting house, unknown) ") if ($matched_hosting_house); return $uri->host; } } BEGIN { my %most_domains_URLs_bad = map{$_,1} ( "rathbondage.com", # original owner seems to have sold it to someone else, who just uses it for SEO junk now "pimpcash.com", # most links I've seen have been dead... ); sub is_mostly_bad_domain { my $h = URI->new(shift)->host; while ($h =~ m#\.#s) { return 1 if $most_domains_URLs_bad{$h}; $h =~ s#^[^\.]*\.##s; } } } BEGIN { my $kink_com = qr#^/[gm]/\d+/\d+/[vm]/#i; my %domain_movie_urls = ( 'vipspanking.com' => [ qr#^/wm/mg/#i, ], 'promo.devicebondage.com' => [$kink_com], 'promo.fuckingmachines.com' => [$kink_com], 'promo.hogtied.com' => [$kink_com], 'promo.meninpain.com' => [$kink_com], 'promo.sexandsubmission.com' => [$kink_com], 'promo.thetrainingofo.com' => [$kink_com], 'promo.ultimatesurrender.com' => [$kink_com], 'promo.waterbondage.com' => [$kink_com], 'promo.whippedass.com' => [$kink_com], 'promo.wiredpussy.com' => [$kink_com], 'promo.tsseduction.com' => [$kink_com], # XXXCumCash sites: 1=image, 2=hybrid, 3=movie (though hybrid has 3 fewer movies, so I don't know that we want to always force it to that) 'natalisbondagesluts.com' => [ qr#^/galleries/[^/]*/[^/]*/[23]/# ], 'damseldiary.com' => [ qr#^/free/[^/]*]/[^/]*/[23]/# ], # this doesn't seem to be working...? 'bdsmtraffic.com' => [ qr#^/galleries/movies/#i, ], 'electricityplay.com' => [ qr#^/gallery/[^/]*/mgp#i, ], 'mgp.bossbitches2.com' => [ qr#.# ], 'movies.meatinsertions.com' => [ qr#.# ], 'spankingdollars.com' => [ qr#^/mg/# ], 'vipspanking.com' => [ qr#^/wm/mg/# ], 'badfemales.com' => [ qr#^/wm/mg/# ], 'herfirstpunishment.com' => [ qr#^/wm/mg/# ], 'fhg.masteracash.com' => [ qr#^/[^/]*/video/# ], 'platinumfetish.com' => [ qr#^/hosted[^/]*/[^/]*/video/# ], 'galleries.knotnice.com' => [ qr#^/h/\d+/mgp# ], 'bdsmtraffic.com' => [ qr#^/[^/]*/movies/# ], 'latexangel.com' => [ qr#^/galleries/free/[^/]*/fhm/#si ], ); sub is_TGP_movie_page { my $uri = URI->new(shift); my $h = $uri->host; my $p = $uri->path_query; while ($h =~ m#\.#s) { if ($domain_movie_urls{$h}) { foreach my $regexp (@{$domain_movie_urls{$h}}) { return 1 if ($p =~ $regexp); } } $h =~ s#^[^\.]*\.##s; } } } 1;