http://paperlined.org/dev/src/pl/porn/cgi/tgp_condom/bak/URLMetadata.pm

package URLMetadata;

    use CGI::Util;
    use URI;
    use URI::QueryParam;
    use Data::Dumper;


    use strict;
    use warnings;

# in some cases, doing site-specific redirector-stripping isn't enough, because there are actually
# two or more redirectors chained together!!  Strip these off.
#
# TODO: move this to a more common module for making URLs canonical?
sub strip_extra_redirectors {
    my $url = shift;

    if ($url =~ m#refer\.ccbill\.com/cgi-bin/clicks\.cgi# && $url =~ s#^.*?HTML=##) {
        $url =~ s#=.*##;
        return strip_extra_redirectors(CGI::Util::unescape($url));
    } elsif ($url =~ m#bertonicash\.com/tgp\.php#) {
        $url =~ s#^.*?HTML=##;
        $url =~ s#=.*##;
        return strip_extra_redirectors(CGI::Util::unescape($url));
    }

    return $url;
}

BEGIN {
    my %strip_all_queries = map {$_,1} qw[
        g.wasteland.com
        galleries.extremebound.com
    ];
#        bdsmtraffic.com
#        fhg.masteracash.com
#        galleries.extremebound.com
#        promo.limitedaudience.com
#        bondageforte.org
#        bondagenation.com
#    ];
    my %strip_one_param = (
        "promo.gordcash.com" => "a",
        "fhg.masteracash.com" => "id",
        "kinky-katinka.com" => "id",
        "promo.limitedaudience.com" => "a",
        "bondageforte.org" => "aid",
        "bondagenation.com" => "PA",
        "bongum-video.com" => "ID",
        "fetishshots.com" => "id",
        "ratherextreme.com" => "id",
        "roperookie.com" => "nats",
        "trashygallery.com" => "nats",
        "platinumfetish.com" => "ccbill",
        "pics.maniacpass.com" => "id",
        "bdsmtraffic.com" => "webmaster",
        "pornaccess.com" => "adv_id",
        "freemeatwhores.com" => "aff",
        "maxcash.com" => "webm_id",
        "assspankingvideos.com" => "id",
        "redassteens.com" => "r",
        "fistinglessons.net" => "r",
        "fistinglessons.com" => "r",
        "misspain.com" => "r",
        "miss-krista.com" => "id",
        "femdomchronicles.com" => "id",
        "bbraxaporn.com" => "r",
        "sadoslaves.com" => "r",
        "maniacpass.com" => "pt",
        "rubberdollies.com" => "id",
        "shinyangels.com" => "id",
        "slutgalz.com" => "nats",
        "spookycash.com" => "id",
        "straponangels.com" => "id",
    );
    my @always_strip = qw[
        nats
    ];
# TODO: move this to a more common module for making URLs canonical?
sub strip_affiliate_id {
    my $url = shift;

    my $uri = URI->new($url);
    my $host = $uri->host;

    foreach my $always (@always_strip) {
        $uri->query_param_delete( $always );
    }

    my $h = $host;
    while ($h =~ m#\.#s) {
        if ($strip_all_queries{$h}) {
            $url =~ s#\?.*##s;
            return $url;
        }
        if ($strip_one_param{$h}) {
            # delete this one parameter
            $uri->query_param_delete( $strip_one_param{$h} );
            return $uri->as_string;
        }
        $h =~ s#^[^\.]*\.##s;
    }

    return $url;
}
}

BEGIN {
    my %strip_final_file = (
        'bdsmtraffic.com' => '^/galleries/[^/]*/index\.php$',
        'g.wasteland.com' => '^/[^/]*/index\.html$',
        'galleries.extremebound.com' => '^/[^/]*/index\.html$',
        'galleries.knotnice.com' => '^(?:/[^/]*){3}/index\.html$',
        'kinky-katinka.com' => '^(?:/[^/]*){2}/index\.php$',
        'bongum-video.com' => '^(?:/[^/]*){2}/index\.php$',
        'christinacaptured.com' => '^(?:/[^/]*){3}/index\.html$',
        'fetishshots.com' => '^(?:/[^/]*){2}/index\.php$',
        'ratherextreme.com' => '^(?:/[^/]*){2}/index\.php$',
        'platinumfetish.com' => '^(?:/[^/]*){4}/index\.php$',
        'freemeatwhores.com' => '^(?:/[^/]*){3}/index\.php$',
        'bdsmtraffic.com' => '^(?:/[^/]*){2,3}/index\.php$',
        'galleries.bucks2go.com' => '^(?:/[^/]*){3}/index\.html$',
        'assspankingvideos.com' => '^(?:/[^/]*){2}/index\.php$',
        'latexangelic.com' => '^(?:/[^/]*){2}/fcindex\.html$',
        'orientalrope.com' => '/index\d+\.html$',
    );

    # Don't put sites here if they always force-redirect back to the longer version of the name.
    my %strip_domain_prefix = map {$_,1} qw[
        www.bondageforte.org
        www.bondagenation.com
        www.bondageorgies.com
        www.bongum-video.com
        www.damseldiary.com
        www.fetishshots.com
        www.fragilehostage.com
        www.natalisbondagesluts.com
        www.platinumfetish.com
        www.rarebdsm.com
        www.ratherextreme.com
        www.roperookie.com
        www.wildpornreviews.com
        www.freemeatwhores.com
        www.latexbyanna.com
        www.shinyangels.com
        www.straponangels.com
        www.xxx69.net
    ];

    # the site force-redirects to the longer one anyway, might as well go with that...
    my %add_domain_prefix = (
        'christinacaptured.net' => 'www.christinacaptured.net',
    );

    my %other_modification = (
                    # not sure what the extra commas are for, but they don't seem to be required...
        'bondagebank.com' => sub {s#(gallhit\.php)\?\d+,(\d+),\d+,\d+,\d+$#$1?1,$2#si},
    );
# TODO: move this to a more common module for making URLs canonical?
sub make_canonical {
    my $url = shift;

    my $uri = URI->new($url)->canonical;
    my $h = $uri->host;
    my $p = $uri->path;
    #print "path: $p\n"; exit;
    while ($h =~ m#\.#s) {
        if ($strip_final_file{$h} && $p =~ m#$strip_final_file{$h}#si) {
            $p =~ s#/[^/]+$#/#si;
            $uri->path($p);
        }
        if ($strip_domain_prefix{$h}) {
            $h =~ s#^[^\.]*\.##si;
            $uri->host($h);
            next;
        }
        if ($add_domain_prefix{$h}) {
            $uri->host($add_domain_prefix{$h});
        }
        if ($other_modification{$h}) {
            local $_ = $uri->as_string;
            $other_modification{$h}->();
            $uri = URI->new($_);
        }
        $h =~ s#^[^\.]*\.##s;
    }

    return $uri->as_string;
}
}


BEGIN {
    my %domain_hosted_for_another = (
                # NOTE: these are LIST-refs, not HASH-refs, though they end up being functionaly
                # equivalent to hashrefs
        'fhg.masteracash.com' => [
            qr#^/slavesinlove/#i => 'slavesinlove.com',
        ],

        'bdsmi.com' => [
            qr#^/hosted/painfiles/#i => 'thepainfiles.com',
        ],

        'fragilehostage.com' => [
            qr#galleries/ssm/#i => 'societysm.com',
            qr#/galleries/ps/#i => 'perfectslave.com',
        ],

        'bondagebank.com' => [
            qr#\?100080,#i => 'chantasbitches.com',
        ],


        'bdsmtraffic.com' => [          # actually, this is an obfuscated redirector
            qr#^/kk/#i => 'kinky-katinka.com',
            qr#/as\d#i => 'amateurslave.com',
            qr#/ss\d#i => 'slavesluts.com',
        ],

        'platinumfetish.com' => [
            qr#^/amateurbondagevideos/#i => 'amateurbondagevideos.com',
            qr#/purespanking/#i => 'purespanking.com',
            qr#/painfreaks/#i => 'painfreaks.com',
            qr#/amateursmothering/#i => 'amateursmothering.com',
        ],

        'ddfcash.com' => [
            qr#^(?:/PROMO/|/affiliate/freepages/)hot/#i => 'houseoftaboo.com',
        ],

        'galleryhost.com' => [
            qr#/her1stanal/#i => 'her1stanal.com',
        ],

        'natalisbondagesluts.com' => [
            qr#^/galleries/ss/#i => 'smotheredslave.com',
            qr#^/galleries/bo/#i => 'bondageorgasms.com',
        ],

        'freemeatwhores.com' => [
            qr#^/g/Gagging-Whores\b#i => 'gaggingwhores.com',
        ],

        'damseldiary.com' => [
            qr#^/free/ssm/#i => 'societysm.com',
            qr#^/free/bo/#i => 'bondageorgasms.com',
            qr#^/free/ps/#i => 'perfectslave.com',
        ],

        'spankbuxx.com' => [
            qr#^/hosted/SSG/#i => 'spankedschoolgirl.com',
            qr#^/hosted/spvids/#i => 'spankingvids.com',
            qr#^/hosted/sol/#i => 'spankingonline.com',
        ],

        'fhg.fuckingfree.net' => [
            qr#^/fhg/violentanime/#i => 'violentanime.com',
            qr#^/fhg/violentcomix/#i => 'violentcomix.com',
        ],

        'massivegalleries.com' => [
            qr#^/vids/rr/#i => 'rarebondage.com',
        ],

        'bondageorgies.com' => [
            qr#^/pjwarchive/#i => 'pjwarchive.com',
        ],

        'galleries.do-bill.com' => [
            qr#^/bdsmartwork/#i => 'bdsmartwork.com',
        ],

        'fetishshots.com' => [
            qr#^/fdl/#i => 'femdomloft.com',
            qr#^/tt/#i => 'tightlytied.com',
            qr#^/tw/#i => 'tramplingworld.com',
            qr#^/nd/#i => 'nylondivas.com',
            qr#^/ld/#i => 'lesbiandommes.com',
            qr#^/hhd/#i => 'highheeldivas.com',
        ],

        'rarebdsm.com' => [
            qr#^/galleries/ball_gagged/#i => 'ball-gagged.com',
            qr#^/galleries/shackled/#i => 'shackledmaidens.com',
        ],

        'galleries.fetishsphere.com' => [
            qr#^/hustler-taboo/#i => 'hustlerstaboo.com',
            qr#^/infernal-restraints/#i => 'infernalrestraints.com',
            qr#^/hot/#i => 'houseoftaboo.com',
            qr#^/hard-tied/#i => 'hardtied.com',
            qr#^/rope-affairs/#i => 'ropeaffairs.com',
            qr#^/charlotte/#i => 'charlottefetish.com',
            qr#^/sweetties/#i => 'sweetties.com',
            qr#^/satin-steel/#i => 'satinsteel.com',
            qr#^/bloodangels/#i => 'bloodangels.com',
            qr#^/bondage-maidens/#i => 'bondagemaidens.com',
            qr#^/pupett/#i => 'pupett.com',
            qr#^/maniac-pass/#i => 'maniacpass.com',
            qr#^/alterpic/#i => 'alterpic.com',
        ],

        'spookycash.com' => [
            qr#^/hgma/#i => 'michelle-aston.com',
            qr#^/hgsc13/#i => 'scar13.com',
            qr#^/hgeb/#i => 'eroticbpm.net',
        ],

        'fetishgera.com' => [
            qr#^/mega\d+#i => 'megapenetrations.com',
            qr#^/gap\d+#i => 'clubgape.deluxepass.com',
        ],

        'bdsmbook.com' => [
            qr#/gsg/#i => 'girlspanksgirl.com',
            qr#/ffg/#i => 'femdomfetishgirls.com',
        ],

        'eropain.com' => [
            qr#^/\w/\d+shp-#i => 'shockingpain.com',
            qr#^/\w/\d+ep-#i => 'extrapain.com',
            qr#^/\w/\d+-#i => 'painriser.com',
        ],

        'bondagepoint.com' => [
            qr#^/(?:maledom|bdsm)/#i => 'societysm.com',
            qr#^/(?:domination)/#i => 'uckingdungeon.com',
        ],

        # just markers for the fact that the thumbnails from these places need to be sorted out
        'bucks2go.com' => [ ],
        'adult-empire.com' => [ ],
        'pornaccess.com' => [ ],
        'risegalleries.com' => [ ],
    );

        # same as above...  ostensibly an array, but acts like a hash
    my @universal_pattern = (
        qr#ricksavage#i => 'ricksavage.com',
        qr#Hell-Fire-Sex#i => 'hellfiresex.com',
        qr#specialexercises#i => 'specialexercises.com',
        qr#\bmeatholes\b#i => 'meatholes.com',
        qr#amateurbondagevideos#i => 'amateurbondagevideos.com',
        qr#cumbots#i => 'cumbots.com',
    );
    
    my %strip_domain_prefix = map {$_,1} qw[
        sadoslaves.com
        gagfuck.com
        frenchfisting.com
        hogtied.com
        maxxandmore.com
        meninpain.com
        devicebondage.com
        miss-krista.com
        sexandsubmission.com
        bondagehere.com
        fuckingmachines.com
        knotnice.com
        limitedaudience.com
        thetrainingofo.com
        wasteland.com
        waterbondage.com
        whippedass.com
        wiredpussy.com
        extremebound.com
        sadisticbondage.com
        naturebondage.com
        hellfiresex.com
        femdomchronicles.com
        brutaldildos.com
        maniacpass.com
        bossbitches2.com
    ];

    my %entire_domain_is_for_another = (
        'fistinglessons.com' => 'fistinglessons.net',
        'maxhardcoreporn.com' => 'maxhardcoreporn.net',
        'bondagenation.com' => 'fetishnation.com',
        'fhgp.maxcash.com' => 'fetishhotel.com',
        'bound-and-gagged.org' => 'extremebound.com',
        'medical-fetish-videos.com' => 'doctortushy.com',
        'promo.gordcash.com' => 'nakedgord.com',
        'lady-sonia-galleries.com' => 'lady-sonia.com',
        'orientalrope.com' => 'thepainfiles.com',
        'aerobdsm.com' => 'societysm.com',
        'chantacash.com' => 'chantasbitches.com',
    );

    my %per_page_key = (
        'promo.bucks2go.com' => {
            getkey => sub {(split m#/#, $_)[1]},        # the first in the sequence of nubmers
            domains => {
                'slavecontest.com' => {map{$_,1}qw[  113 247 255 ]},
                'shockers.com' => {map{$_,1}qw[  267 268  ]},
            },
        },

        'bondagebank.com' => {
            getkey => sub {(split m#[,]#)[1]},
            domains => {
                'chantasbitches.com' => {map{$_,1}qw[  71 88 560 566 568 574 580  ]},
                'fuckedandbound.com' => {map{$_,1}qw[  119 133 286 542 554 558 562 564 572 576 578 582  ]},
            },
        },
    );

sub domain_hosted_for_another {
    my $url = shift;

    #print Dumper \%per_page_key; exit;

    my $uri = URI->new($url);
    my $h = $uri->host;
    my $p = $uri->path_query;

    my $matched_hosting_house = 0;

    #print "path: $p\n"; exit;
    while ($h =~ m#\.#s) {
        if ($domain_hosted_for_another{$h}) {
            for (my $ctr=0; $ctr<@{$domain_hosted_for_another{$h}}; $ctr+=2) {
                my $pattern    = $domain_hosted_for_another{$h}[$ctr];
                my $hosted_for = $domain_hosted_for_another{$h}[$ctr+1];

                if (ref($pattern)) {
                    return $hosted_for if ($p =~ $pattern);
                }
            }

            $matched_hosting_house++;
        }
        if ($strip_domain_prefix{$h}) {
            return $h;
        }
        if ($entire_domain_is_for_another{$h}) {
            return $entire_domain_is_for_another{$h};
        }
        if ($per_page_key{$h}) {
            local $_ = $p;
            my $key = $per_page_key{$h}{getkey}->($p);
            #print "Got key: $key    from $url\n";
            foreach my $checking_domain (keys %{$per_page_key{$h}{domains}}) {
                my $matching_keys = $per_page_key{$h}{domains}{$checking_domain};
                return $checking_domain if ($matching_keys->{$key});
            }
            #while (my ($var, $val) = each %{$per_page_key{$h}{domains}}) {
            #    return $var if ($val->{$key});
            #    print "key($key) didn't match against: ", Dumper $val;
            #}
            print "key($key) didn't match against anything in '$h'.<br>\n";
        }
        if ($h eq 'tgp.pornaccess.com' && $p =~ m#^/galleries/([^/]*)/#i) {
            return "$1.pornaccess.com";     # hard coded, just because this doesn't seem to be an infrequently used site
        }
        $h =~ s#^[^\.]*\.##s;
    }

    for (my $ctr=0; $ctr<@universal_pattern; $ctr+=2) {
        my $pattern    = $universal_pattern[$ctr];
        my $hosted_for = $universal_pattern[$ctr+1];

        #print "Matching '$p' against '$pattern'\n";

        if (ref($pattern)) {
            return $hosted_for if ($p =~ $pattern);
        }
    }

    return ($uri->host . "(hosting house, unknown) ") if ($matched_hosting_house);

    return $uri->host;
}
}


BEGIN {
    my %most_domains_URLs_bad = map{$_,1} (
        "rathbondage.com",      # original owner seems to have sold it to someone else, who just uses it for SEO junk now
        "pimpcash.com",         # most links I've seen have been dead...
    );
    sub is_mostly_bad_domain {
        my $h = URI->new(shift)->host;
        while ($h =~ m#\.#s) {
            return 1 if $most_domains_URLs_bad{$h};
            $h =~ s#^[^\.]*\.##s;
        }
    }
}


BEGIN {
    my $kink_com = qr#^/g/\d+/\d+/v/#i;
    my %domain_movie_urls = (
        'vipspanking.com' => [
            qr#^/wm/mg/#i,
        ],
        'promo.waterbondage.com' => [$kink_com],
        'promo.sexandsubmission.com' => [$kink_com],
        'promo.waterbondage.com' => [$kink_com],
        'promo.whippedass.com' => [$kink_com],
        'promo.wiredpussy.com' => [$kink_com],
        'promo.ultimatesurrender.com' => [$kink_com],
        'promo.hogtied.com' => [$kink_com],
        'promo.fuckingmachines.com' => [$kink_com],
        'bdsmtraffic.com' => [
            qr#^/galleries/movies/#i,
        ],
        'electricityplay.com' => [
            qr#^/gallery/[^/]*/mgp#i,
        ],
        'mgp.bossbitches2.com' => [ qr#.# ],
    );
sub is_TGP_movie_page {
    my $uri = URI->new(shift);
    my $h = $uri->host;
    my $p = $uri->path_query;

    while ($h =~ m#\.#s) {
        if ($domain_movie_urls{$h}) {
            foreach my $regexp (@{$domain_movie_urls{$h}}) {
                return 1 if ($p =~ $regexp);
            }
        }

        $h =~ s#^[^\.]*\.##s;
    }
}
}



1;

Generated by GNU enscript 1.6.4.