http://paperlined.org/dev/src/pl/wowtcg/fetch_regionals_statistics.pl

#!/usr/bin/perl

# wow.tcgplayer.com keeps track of many regionals decks.  This script fetches all those decks, and
# calculates how frequently each specific card gets used in those regionals.

# Additionally, cards are weighted, because some cards can only be used by one
# hero, others can be used by ~50% of the heros (eg. allies), and others can be
# used by everyone (eg. quests), so even if a card is ALWAYS used when
# possible, it can still get a much lower rating than a lower-quality card that
# has the possibility of being used by a larger number of heros.
#       The weighting simply assumes that every hero can use 4 cards of every card that's legal for its deck.
#       This is an oversimplification (since decks limit themselves to ~60 cards),
#       but it's sufficient.

# Some odd output of this (>100%) is a result of:
#   - http://wow.tcgplayer.com/db/deck.asp?ID=1579 is marked as using "Dismantle", eg. a Rogue card in a Warlock deck
#   - the same deck is marked as using "Coup de Grace", another Rogue card in a Warlock deck

    use strict;
    use warnings;

    use lib '/home/interiot/cpan/lib/';

    use LWP::Simple;
    #use Storable;
    use Tie::Storable;
    use List::Util qw[min max];

    use Data::Dumper;

    my $debug = 1;

#my $tcgcardid_map_filename = ".tcgplayer_map";

my @deck_ids;

### Load the file-based data
open FIN, "<hero_validity.txt" or die;
my %hero_validity;
my $cur_hero_listref;
while (<FIN>) {
    s/[\n\r]*$//;   # fancy chomp
    if (s/^----HERO---- //) {       # we came across a new section...  start accumulating under a new hero
        $cur_hero_listref = $hero_validity{$_} = [];
    } else {
        push(@$cur_hero_listref, $_);
    }
}
#die Dumper \%hero_validity;

close FIN;
open FIN, "<card_names.txt" or die;
my %card_names = map {s/[\n\r]*$//; split /\t/, $_, 2} <FIN>;
close FIN;
#die Dumper \%card_names;


### Fetch all the regional deck IDs
foreach my $class (qw[ Warrior Druid Hunter Mage Paladin Priest Rogue Shaman Warlock ]) {
#foreach my $class (qw[ Hunter ]) {
    print "Fetching deck_ids for class=$class\n" if $debug;
    my $html = get_wowtcgplayercom("http://wow.tcgplayer.com/db/Decks.asp?Location=Regionals&HeroClass=$class") or die;

    $html =~ s/privacy\.asp.*//si;  # remove the bottom part of the page, in particular the "Latest Decks" box, which gives us false positives
    push(@deck_ids, ($html =~ m#/db/deck\.asp\?id=(\d+)#gi));
}
#print Dumper(\@deck_ids); exit;
#@deck_ids = qw[1318 1128 1099];     # DEBUG...  Graccus decks

### Fetch all the cards in each deck
my %cards_used;
my %max_cards_possible;     # For each given card, if the card was legal for 5 decks (eg. of all the decks we scanned, 5 of the heros could legally use this card), then
                            # this is 5 * 4 = 20 total possible for that card.
tie my %tcgplayer_cardid_map => 'Tie::Storable', '.tcgplayer_map';
tie my %regional_decks => 'Tie::Storable', '.regional_decks';
foreach my $deckid (@deck_ids) {
#foreach my $deckid (@deck_ids[0..1]) {
    print "Fetching deck_id=$deckid\n" if $debug;

    my $html = ($regional_decks{$deckid} ||= get_wowtcgplayercom("http://wow.tcgplayer.com/db/deck.asp?ID=$deckid"));

    $html =~ s#alt="Return to Search Page".*##si;   # remove bottom half of page, just focus on the part with cards

    if ($html =~ m#<b>Hero:</b>\s*<a\s+href="card\.asp\?ID=(\d+)#i) {
        my $hero_id = fetch_tcg_cardid_cached($1);
        next if (!$hero_id || !$card_names{$hero_id});
        print "  hero = ", $card_names{$hero_id}, "\n";
        #next unless ($hero_id eq 'AZEROTH 4/361');      # DEBUG: Add temporarily, only to check certain heroes individually
    
        while ($html =~ m#(\d)\s*<A\s*HREF="card\.asp\?id=(\d+)">#gi) {
            my $qty = $1;
            my $wow_cardid = fetch_tcg_cardid_cached($2);
            
            next if ($deckid == 1579 && ($2 == 223 || $2 == 229));  # ignore rogue cards in a warlock deck (Dismantle and Coup de Grace in http://wow.tcgplayer.com/db/deck.asp?ID=1579)

            $cards_used{$wow_cardid} += $qty;
        }

        # increment %max_cards_possible for all of this hero's legal cards
        foreach my $card (@{$hero_validity{$hero_id}}) {
            $max_cards_possible{$card} += 4;
        }
    }
}
#print Dumper \%cards_used;


## Output the final stats
my %tcgplayer_cardid_reverse_map = map {$tcgplayer_cardid_map{$_} => $_} keys %tcgplayer_cardid_map;


sub sortfilter {$_ = shift; my ($outof)=s#(/\d+)$##; s/(\d+)$//; $_ .= sprintf('%04d%s', $1, $outof); $_}
#my @real_cardids = sort {sortfilter($a) cmp sortfilter($b)} map {$tcgplayer_cardid_map{$_}} keys %cards_used;
my @real_cardids = keys %card_names;
open FOUT, ">regionals_statistics.txt" or die;
my $maxmax = max(values %max_cards_possible);
foreach my $real_cardid (@real_cardids) {
    my $num = $cards_used{$real_cardid} || 0;
    my $max_num = $max_cards_possible{$real_cardid} || 0;
    #my $factor = $max_num ? ($max_num / $maxmax) : 1;
    #$num /= $factor;
        
    #my $line = sprintf "%4d of %4d   %-40s  %s\n", $num, $max_num, $real_cardid, $card_names{$real_cardid};
    my $line = sprintf "%7.2f%%   %4d of %4d    %-40s  %s\n", 100 * $num / ($max_num || 1), $num, $max_num, $real_cardid, $card_names{$real_cardid};
    print FOUT $line;
    print $line;
}





sub fetch_tcg_cardid_cached {
    my $tcgplayer_cardid = shift;

    $tcgplayer_cardid_map{$tcgplayer_cardid} && return $tcgplayer_cardid_map{$tcgplayer_cardid};

    if ($tcgplayer_cardid_map{$tcgplayer_cardid} = fetch_tcg_cardid($tcgplayer_cardid)) {
        print "    Fetched tcgplayer_cardid=$tcgplayer_cardid, ", $card_names{$tcgplayer_cardid_map{$tcgplayer_cardid}}, "\n";
    } else {
        print "    Fetched tcgplayer_cardid=$tcgplayer_cardid, resulted in NULL\n";
    }

    return $tcgplayer_cardid_map{$tcgplayer_cardid};
}


# given a TCG cardid, returns an official card ID (eg. "AZEROTH 32")
sub fetch_tcg_cardid {
    my $tcgplayer_cardid = shift;

    my $html = get_wowtcgplayercom("http://wow.tcgplayer.com/db/card.asp?id=$tcgplayer_cardid");
    if (!$html) {
        print STDERR "Unable to fetch TCGPlayer cardID $tcgplayer_cardid\n"; return;
    }

    if ($html =~ /Card #(?:<[^>]*>|\s*)*(\d+)/) {
        my $cardnum = $1;

        if ($html =~ /<b>Set(?:<[^>]*>|\s*)*([^<>]+)/) {
            my $set = $1;
            #print "TCG-ID $tcgplayer_cardid = $set $cardnum\n";

            if ($set =~ /Heroes of Azeroth/i) {
                return "AZEROTH $cardnum/361";
            }
            if ($set =~ /Onyxia's Lair Treasure/i) {
                return "ONYXIA $cardnum/33";
            }
        }
    }

}


# for some reason, wow.tcgplayer.com stopped resolving during development...  hack around that
# (though this hack should be removed as soon as its DNS starts working again)
sub get_wowtcgplayercom {
    my $url = shift;        # present the same interface as LWP::Simple::get()

    my $hack = 0;
    $hack++ if ($url =~ s/wow\.tcgplayer\.com/208.109.49.16/);

    ### based on LWP::Simple::get_old
    LWP::Simple::_init_ua() unless $LWP::Simple::ua;

    my $request = HTTP::Request->new(GET => $url);
    $request->header(host => "wow.tcgplayer.com") if $hack;     # the server won't work unless it has its Host: value set...  something that would normally be a bit hard to do without a functioning DNS
    my $response = $LWP::Simple::ua->request($request);

    return $response->content if $response->is_success;
    return undef;
}

Generated by GNU enscript 1.6.4.