http://paperlined.org/src/pl/reddit/effektz/effektz_domains.pl

#!/usr/bin/perl

    use strict;
    use warnings;

    use LWP::UserAgent;
    use URI;
    use Data::Dumper;
    use CGI::Util;
    use CGI;

    my $ua = new LWP::UserAgent;
    $ua->agent("Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)");

    my $hilight_creation_date_if = qr#^2007-(Sep|Oct|Nov|Dec)#i;

    # accounts that appear to be interested in spamming Weidmann-owned domains at least part of the time
    my @reddit_accounts = qw[
        alexw
        nitsujw
        TimmyCracKorn
        iTouch
        MyTouch
        yahooxjeffektz
        perfektxj
    ];
        # effektz       # apparently banned

    # sites that the Weidmann brothers post stories to semi-often, but aren't solely controlled by them
    #   (the goal isn't to remove ALL offsite links they post...  rather, to make it easier to
    #   manually separate the wheat from the chaff by making sure that only Weidmann-owned sites
    #   appear at the top of the frequency-sorted list)
    my %remove = map {$_,1} qw[
        infowars.com
        nytimes.com
        ivnnetwork.com
        news.com.au
        youtube.com
        abcnews.go.com
        news.yahoo.com
        lifehacker.com
        neatorama.com
        tuaw.com
        burbia.com
        torrentfreak.com
        sciencedaily.com
        crooksandliars.com
        reuters.com
        engadget.com
        mashable.com
        dumblittleman.com
        news.bbc.co.uk
        advancedmn.com
        netmag.co.uk
        cracked.com
        chow.com
        sciencenow.sciencemag.org
        hosted.ap.org
        ronpaulnation.com
        uk.reuters.com
        concierge.com
        money.cnn.com
        offbeatink.com
        kontraband.com
        inhabitat.com
        absolutelytrue.com
        images.dpchallenge.com
        community.livejournal.com
        reubenmiller.typepad.com
        cnn.com
        gigaom.com
        andrewsullivan.theatlantic.com
        in-public.com
        talkbulge.com
        mytho-fleurs.com
        macdesktops.com
        weblogs.media.mit.edu
        realsimple.com
        jimjamzoo.com
        redbubble.com
        topiaryartdesigns.com
        streetart.splitbrain.org
        freshome.com
        skatemoss.blogspot.com
        photopumpkin.com
        1nf0rmat10n.com
        coldwaterimages.com
        streetart.splitbrain.org
        wild-landscape.com
        picsyard.com
        flickr.com
        photos.sutradirectory.com
        eyje.net
        911archive.org
        marcofolio.net
        tumbblogr.com
        photo.net
        davidshrigley.com
        incredimazing.com
        sithvixen.com
        mentalfloss.com
        foxnews.com
        thinkprogress.org
        askmen.com
        dl.ziza.ru
        telegraph.co.uk
    ];
        #jimjamzoo.com      (am I really sure about this one?)
        #photopumpkin.com
        # picsyard.com
        #   as minimalist as the site is, it doesn't appear to be tied to the Weidmann brothers:
        #           http://siteexplorer.search.yahoo.com/advsearch?p=http%3A%2F%2Fpicsyard.com%2F&bwm=i&bwmo=d&bwmf=s
        #           http://reddit.com/search?q=site%3Apicsyard.com
        #eyje.net

    my %someone_else = map {$_,1} qw[
        208.73.212.12
    ];

    our %positive_hits__;
    do "zen2design_metadata.pm";

    # modify the above structure just a bit...  (basically, add the ->{caps} member)
    my %positive_hits;
    while (my ($caps, $h) = each %positive_hits__) {
        if (lc($caps) ne $caps) {
            $h->{caps} = $caps;
        }
        $positive_hits{lc $caps} = $h;
    }

    #print Dumper \%positive_hits; exit;


my $hosts = {};
my %subhosts;

foreach my $username (@reddit_accounts) {
    fetch_reddit_submitter_domains($username, -1, $hosts);
}


my @hosts = sort {$hosts->{$a} <=> $hosts->{$b}} keys %$hosts;

foreach my $host (@hosts) {
    next if ($remove{$host});
    printf "%5d %s\n", $hosts->{$host}, $host;
}


open FOUT_HTML, ">effektz_domains.html" or die;
open FOUT_MD, ">effektz_domains.markdown" or die;

print FOUT_MD <<"EOF";
Stop spamming linkjacked pictures, effektz ([aka TimmyCracKorn aka MyTouch aka iTouch aka nitsujw aka alexw](http://paperlined.org/apps/reddit/zen2design.html)).

## Spammed domains:

EOF

print FOUT_HTML "<table>\n";
print FOUT_HTML "<tr><th>Site <th> <th> <th> <th> <th style='padding-left:1em'>Site's primary purpose <th style='padding-left:1em'>Created <th>IP address\n";
foreach my $host (reverse @hosts) {
    next if ($remove{$host});
    
    my @subdomains = keys %{$subhosts{$host}};
    my $srch = "http://reddit.com/search?q=" . join("+", map {"site:$_"} @subdomains);

    my $num = $hosts->{$host};

    my $pos = $positive_hits{$host} || {};
    my $caps_host = $pos->{caps} || $host;

    my (undef, undef, undef, undef, @addrs) = gethostbyname($host);
    @addrs = map {join ".", unpack ('C4', $_)} @addrs;

    # remove sites that we seem to clearly not care about:
    #   - sites with only one story
    #   - sites we haven't put in our whitelist
    #   - sites that aren't hosted on the 66.98.156.36 server
    next if ($num <= 2 && !keys %$pos && !($addrs[0] && $addrs[0] eq '66.98.156.36'));

    my $stumble_discoveries = google_howmany("site:stumbleupon.com/url/ inurl:$host");


    my $story_plural = ($num>1) ? "stories" : "story";
    print FOUT_HTML "<tr><td><a href='$srch'>$caps_host</a>",
                    "<td>(<a href='http://www.alexa.com/data/details/traffic_details/$host'>alexa</a>)",
                    "<td>(<a href='http://siteexplorer.search.yahoo.com/advsearch?p=http%3A%2F%2F$host%2F&bwm=i&bwmo=d&bwmf=s'>backlinks</a>)",
                    "<td>$num&nbsp;reddit&nbsp;$story_plural";

    if ($stumble_discoveries) {
        my $plural = ($stumble_discoveries > 1) ? "s" : "";
        print FOUT_HTML "<td><a href='http://www.google.com/search?q=site:stumbleupon.com/url/+inurl%3A$caps_host&hl=en&safe=off&start=0&sa=N&filter=0'>$stumble_discoveries&nbsp;stumble$plural</a>",
    } else {
        print FOUT_HTML "<td>";
    }

    my $devoted_to_others = "";     # visually differentiate between sites devoted to image tumblelogging, and those ostensibly devoted to something else
    my $general_d = $pos->{descr} || $pos->{unescaped_descr};
    if (!$pos->{no_chevron} && $general_d && $general_d !~ /(image|video).*tumblelog/ && $general_d !~ /lapsed/i) {
        $devoted_to_others = "&raquo;&raquo;&nbsp;";
    }

    if ($pos->{unescaped_descr}) {
        print FOUT_HTML "<td style='padding-left:1em'>$devoted_to_others$pos->{unescaped_descr}";
    } else {
        print FOUT_HTML "<td style='padding-left:1em'>$devoted_to_others<a href='", ($pos->{link} || "http://$host/"), "'>",
                CGI::escapeHTML($pos->{descr} || ''), "</a>";
    }
#    if ($pos->{first_appeared}) {
#        (my $a = CGI::escapeHTML($pos->{first_appeared})) =~ s/ /&nbsp;/g;
#        print FOUT_HTML "<td style='padding-left:1em'>First&nbsp;appeared:&nbsp;$a";
#    } else {
#        print FOUT_HTML "<td style='padding-left:1em'>";
#    }
    if ($pos->{created}) {
        my $b = "";
        $b = "<b>" if ($pos->{created} =~ $hilight_creation_date_if);
        print FOUT_HTML "<td style='padding-left:1em'><a href='http://networking.ringofsaturn.com/Tools/whois.php?domain=$caps_host'>$b$pos->{created}</b></a>";
    } else {
        print FOUT_HTML "<td style='padding-left:1em'>";
    }
    #print FOUT_HTML "<td><a href='http://www.zoneedit.com/lookup.html?host=$caps_host&type=A&server=&forward=Look+it+up'>$addrs[0]</a>";
    my $strike_ip = ($addrs[0] eq '68.178.232.99' || $someone_else{$addrs[0]}) ? "<s>" : "";
    my $ip_annotation = "";
    $ip_annotation = " (<a href='http://whois.domaintools.com/$addrs[0]'>godaddy</a>)" if ($addrs[0] =~ /^68\.178\./);
    $ip_annotation = " (<a href='http://whois.domaintools.com/$addrs[0]'>someone else</a>)" if ($someone_else{$addrs[0]});

    #print FOUT_HTML "<td><a href='http://ping.eu/nslookup/?host=$caps_host&port='>$strike_ip$addrs[0]</s></a>";
    print FOUT_HTML "<td><a href='http://network-tools.com/default.asp?prog=dnsrec&host=$caps_host'>$strike_ip$addrs[0]</s></a>$ip_annotation";
    print FOUT_HTML "\n";


    if ($general_d !~ /lapsed/i) {      # skipped lapsed domains
        my $markdown_ip = $addrs[0];
        $markdown_ip = "lapsed domain" if ($general_d =~ /lapsed/i);
        print FOUT_MD   "* [$caps_host]($srch) ([alexa](http://www.alexa.com/data/details/traffic_details/$host), IP:$markdown_ip) [$num $story_plural]\n";
    }
}

0 && print FOUT_MD <<"EOF";

## Spamming accounts:

* [effektz](http://all.reddit.com/user/effektz/submitted)
* [TimmyCracKorn](http://all.reddit.com/user/TimmyCracKorn/submitted)
* [MyTouch](http://all.reddit.com/user/MyTouch/submitted)
* [iTouch](http://all.reddit.com/user/iTouch/submitted)
* [nitsujw](http://all.reddit.com/user/nitsujw/submitted)
* [alexw](http://all.reddit.com/user/alexw/submitted)

(this list, while available via a [web link](http://paperlined.org/apps/reddit/zen2design.html), is also available for [copy-n-pasting](http://paperlined.org/src/pl/reddit/effektz/effektz_domains.markdown) into other effektz spam stories)
EOF

print "Reports output to:\n\teffektz_domains.html\n\teffektz_domains.markdown\n";
#print Dumper $a;


sub fetch_reddit_submitter_domains {
    my $username = shift or die;
    my $npages = shift || 1;
    my $hosts = shift || {};

    my $url = "http://reddit.com/user/$username/submitted";

        # -1 can be given for the number of pages, and it'll process ALL submissions
    for (my $ctr=0; $npages<0 || $ctr<$npages; $ctr++) {
        print "Fetching $url\n";
        my $res = $ua->get($url);
        print ". "; $|++;
        die $res->status_line unless ($res->is_success);

        my $html = $res->content;
        while ($html =~ m#class=['"]?titlerow.*?href=['"]?([^<>'"\s]*)#sig) {
            my $uri = URI->new($1);
            my $h = $uri->host;

            # this is a terrible hack that doesn't really work...
            #  maybe it'd be better to identify which domains are for sure theirs, and truncate down to that?
            $h =~ s/^(?:www|m3|yn2s|nr|ucp|lc|aw|pic)\.//;

            #print $uri->host, "\n";
            $hosts->{ $h }++;
            $subhosts{$h}{ $uri->host }++;
        }


        if ($html =~ m#<a href=['"]?([^<>'"\s]+after=[^<>'"\s]+)['"]?>next</a>#si) {
            #print "next?  $1\n";
            (my $match = $1) =~ s/&amp;/&/g;
            my $next = URI->new_abs($match, $url);
            $url = $next->as_string;

            #sleep(1);
        } else {
            print "Couldn't find 'next' link.\n";
            #print uncompress_redit_html($html); exit;
            last;
        }
    }

    return $hosts;
}



sub google_howmany {
#       print STDERR "Trying $_[0]\n";
    my $res = $ua->get("http://www.google.com/search?q=" . CGI::Util::escape(join("+", @_)));
    die unless $res->is_success;
    my $pg = $res->content;

        $pg =~ s/,//gs;

        if ($pg =~ / - did not match any documents\./) {
                return 0;
        }
        my ($howmany) =
                ($pg =~ m|Results .*? of .*?<b>(\d+)</b>|s);

        print $pg if (!defined($howmany));

        return $howmany;
}

# reddit's HTML code is compressed to be all one line...  this sucks to view in many editors/viewers
sub uncompress_redit_html {
    my $html = shift;
    $html =~ s#(</[^>]*>)#$1\n#g;
    return $html;
}

Generated by GNU enscript 1.6.4.