http://paperlined.org/src/pl/reddit/effektz/effektz_domains.pl
#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
use URI;
use Data::Dumper;
use CGI::Util;
use CGI;
my $ua = new LWP::UserAgent;
$ua->agent("Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)");
my $hilight_creation_date_if = qr#^2007-(Sep|Oct|Nov|Dec)#i;
# accounts that appear to be interested in spamming Weidmann-owned domains at least part of the time
my @reddit_accounts = qw[
alexw
nitsujw
TimmyCracKorn
iTouch
MyTouch
yahooxjeffektz
perfektxj
];
# effektz # apparently banned
# sites that the Weidmann brothers post stories to semi-often, but aren't solely controlled by them
# (the goal isn't to remove ALL offsite links they post... rather, to make it easier to
# manually separate the wheat from the chaff by making sure that only Weidmann-owned sites
# appear at the top of the frequency-sorted list)
my %remove = map {$_,1} qw[
infowars.com
nytimes.com
ivnnetwork.com
news.com.au
youtube.com
abcnews.go.com
news.yahoo.com
lifehacker.com
neatorama.com
tuaw.com
burbia.com
torrentfreak.com
sciencedaily.com
crooksandliars.com
reuters.com
engadget.com
mashable.com
dumblittleman.com
news.bbc.co.uk
advancedmn.com
netmag.co.uk
cracked.com
chow.com
sciencenow.sciencemag.org
hosted.ap.org
ronpaulnation.com
uk.reuters.com
concierge.com
money.cnn.com
offbeatink.com
kontraband.com
inhabitat.com
absolutelytrue.com
images.dpchallenge.com
community.livejournal.com
reubenmiller.typepad.com
cnn.com
gigaom.com
andrewsullivan.theatlantic.com
in-public.com
talkbulge.com
mytho-fleurs.com
macdesktops.com
weblogs.media.mit.edu
realsimple.com
jimjamzoo.com
redbubble.com
topiaryartdesigns.com
streetart.splitbrain.org
freshome.com
skatemoss.blogspot.com
photopumpkin.com
1nf0rmat10n.com
coldwaterimages.com
streetart.splitbrain.org
wild-landscape.com
picsyard.com
flickr.com
photos.sutradirectory.com
eyje.net
911archive.org
marcofolio.net
tumbblogr.com
photo.net
davidshrigley.com
incredimazing.com
sithvixen.com
mentalfloss.com
foxnews.com
thinkprogress.org
askmen.com
dl.ziza.ru
telegraph.co.uk
];
#jimjamzoo.com (am I really sure about this one?)
#photopumpkin.com
# picsyard.com
# as minimalist as the site is, it doesn't appear to be tied to the Weidmann brothers:
# http://siteexplorer.search.yahoo.com/advsearch?p=http%3A%2F%2Fpicsyard.com%2F&bwm=i&bwmo=d&bwmf=s
# http://reddit.com/search?q=site%3Apicsyard.com
#eyje.net
my %someone_else = map {$_,1} qw[
208.73.212.12
];
our %positive_hits__;
do "zen2design_metadata.pm";
# modify the above structure just a bit... (basically, add the ->{caps} member)
my %positive_hits;
while (my ($caps, $h) = each %positive_hits__) {
if (lc($caps) ne $caps) {
$h->{caps} = $caps;
}
$positive_hits{lc $caps} = $h;
}
#print Dumper \%positive_hits; exit;
my $hosts = {};
my %subhosts;
foreach my $username (@reddit_accounts) {
fetch_reddit_submitter_domains($username, -1, $hosts);
}
my @hosts = sort {$hosts->{$a} <=> $hosts->{$b}} keys %$hosts;
foreach my $host (@hosts) {
next if ($remove{$host});
printf "%5d %s\n", $hosts->{$host}, $host;
}
open FOUT_HTML, ">effektz_domains.html" or die;
open FOUT_MD, ">effektz_domains.markdown" or die;
print FOUT_MD <<"EOF";
Stop spamming linkjacked pictures, effektz ([aka TimmyCracKorn aka MyTouch aka iTouch aka nitsujw aka alexw](http://paperlined.org/apps/reddit/zen2design.html)).
## Spammed domains:
EOF
print FOUT_HTML "<table>\n";
print FOUT_HTML "<tr><th>Site <th> <th> <th> <th> <th style='padding-left:1em'>Site's primary purpose <th style='padding-left:1em'>Created <th>IP address\n";
foreach my $host (reverse @hosts) {
next if ($remove{$host});
my @subdomains = keys %{$subhosts{$host}};
my $srch = "http://reddit.com/search?q=" . join("+", map {"site:$_"} @subdomains);
my $num = $hosts->{$host};
my $pos = $positive_hits{$host} || {};
my $caps_host = $pos->{caps} || $host;
my (undef, undef, undef, undef, @addrs) = gethostbyname($host);
@addrs = map {join ".", unpack ('C4', $_)} @addrs;
# remove sites that we seem to clearly not care about:
# - sites with only one story
# - sites we haven't put in our whitelist
# - sites that aren't hosted on the 66.98.156.36 server
next if ($num <= 2 && !keys %$pos && !($addrs[0] && $addrs[0] eq '66.98.156.36'));
my $stumble_discoveries = google_howmany("site:stumbleupon.com/url/ inurl:$host");
my $story_plural = ($num>1) ? "stories" : "story";
print FOUT_HTML "<tr><td><a href='$srch'>$caps_host</a>",
"<td>(<a href='http://www.alexa.com/data/details/traffic_details/$host'>alexa</a>)",
"<td>(<a href='http://siteexplorer.search.yahoo.com/advsearch?p=http%3A%2F%2F$host%2F&bwm=i&bwmo=d&bwmf=s'>backlinks</a>)",
"<td>$num reddit $story_plural";
if ($stumble_discoveries) {
my $plural = ($stumble_discoveries > 1) ? "s" : "";
print FOUT_HTML "<td><a href='http://www.google.com/search?q=site:stumbleupon.com/url/+inurl%3A$caps_host&hl=en&safe=off&start=0&sa=N&filter=0'>$stumble_discoveries stumble$plural</a>",
} else {
print FOUT_HTML "<td>";
}
my $devoted_to_others = ""; # visually differentiate between sites devoted to image tumblelogging, and those ostensibly devoted to something else
my $general_d = $pos->{descr} || $pos->{unescaped_descr};
if (!$pos->{no_chevron} && $general_d && $general_d !~ /(image|video).*tumblelog/ && $general_d !~ /lapsed/i) {
$devoted_to_others = "»» ";
}
if ($pos->{unescaped_descr}) {
print FOUT_HTML "<td style='padding-left:1em'>$devoted_to_others$pos->{unescaped_descr}";
} else {
print FOUT_HTML "<td style='padding-left:1em'>$devoted_to_others<a href='", ($pos->{link} || "http://$host/"), "'>",
CGI::escapeHTML($pos->{descr} || ''), "</a>";
}
# if ($pos->{first_appeared}) {
# (my $a = CGI::escapeHTML($pos->{first_appeared})) =~ s/ / /g;
# print FOUT_HTML "<td style='padding-left:1em'>First appeared: $a";
# } else {
# print FOUT_HTML "<td style='padding-left:1em'>";
# }
if ($pos->{created}) {
my $b = "";
$b = "<b>" if ($pos->{created} =~ $hilight_creation_date_if);
print FOUT_HTML "<td style='padding-left:1em'><a href='http://networking.ringofsaturn.com/Tools/whois.php?domain=$caps_host'>$b$pos->{created}</b></a>";
} else {
print FOUT_HTML "<td style='padding-left:1em'>";
}
#print FOUT_HTML "<td><a href='http://www.zoneedit.com/lookup.html?host=$caps_host&type=A&server=&forward=Look+it+up'>$addrs[0]</a>";
my $strike_ip = ($addrs[0] eq '68.178.232.99' || $someone_else{$addrs[0]}) ? "<s>" : "";
my $ip_annotation = "";
$ip_annotation = " (<a href='http://whois.domaintools.com/$addrs[0]'>godaddy</a>)" if ($addrs[0] =~ /^68\.178\./);
$ip_annotation = " (<a href='http://whois.domaintools.com/$addrs[0]'>someone else</a>)" if ($someone_else{$addrs[0]});
#print FOUT_HTML "<td><a href='http://ping.eu/nslookup/?host=$caps_host&port='>$strike_ip$addrs[0]</s></a>";
print FOUT_HTML "<td><a href='http://network-tools.com/default.asp?prog=dnsrec&host=$caps_host'>$strike_ip$addrs[0]</s></a>$ip_annotation";
print FOUT_HTML "\n";
if ($general_d !~ /lapsed/i) { # skipped lapsed domains
my $markdown_ip = $addrs[0];
$markdown_ip = "lapsed domain" if ($general_d =~ /lapsed/i);
print FOUT_MD "* [$caps_host]($srch) ([alexa](http://www.alexa.com/data/details/traffic_details/$host), IP:$markdown_ip) [$num $story_plural]\n";
}
}
0 && print FOUT_MD <<"EOF";
## Spamming accounts:
* [effektz](http://all.reddit.com/user/effektz/submitted)
* [TimmyCracKorn](http://all.reddit.com/user/TimmyCracKorn/submitted)
* [MyTouch](http://all.reddit.com/user/MyTouch/submitted)
* [iTouch](http://all.reddit.com/user/iTouch/submitted)
* [nitsujw](http://all.reddit.com/user/nitsujw/submitted)
* [alexw](http://all.reddit.com/user/alexw/submitted)
(this list, while available via a [web link](http://paperlined.org/apps/reddit/zen2design.html), is also available for [copy-n-pasting](http://paperlined.org/src/pl/reddit/effektz/effektz_domains.markdown) into other effektz spam stories)
EOF
print "Reports output to:\n\teffektz_domains.html\n\teffektz_domains.markdown\n";
#print Dumper $a;
sub fetch_reddit_submitter_domains {
my $username = shift or die;
my $npages = shift || 1;
my $hosts = shift || {};
my $url = "http://reddit.com/user/$username/submitted";
# -1 can be given for the number of pages, and it'll process ALL submissions
for (my $ctr=0; $npages<0 || $ctr<$npages; $ctr++) {
print "Fetching $url\n";
my $res = $ua->get($url);
print ". "; $|++;
die $res->status_line unless ($res->is_success);
my $html = $res->content;
while ($html =~ m#class=['"]?titlerow.*?href=['"]?([^<>'"\s]*)#sig) {
my $uri = URI->new($1);
my $h = $uri->host;
# this is a terrible hack that doesn't really work...
# maybe it'd be better to identify which domains are for sure theirs, and truncate down to that?
$h =~ s/^(?:www|m3|yn2s|nr|ucp|lc|aw|pic)\.//;
#print $uri->host, "\n";
$hosts->{ $h }++;
$subhosts{$h}{ $uri->host }++;
}
if ($html =~ m#<a href=['"]?([^<>'"\s]+after=[^<>'"\s]+)['"]?>next</a>#si) {
#print "next? $1\n";
(my $match = $1) =~ s/&/&/g;
my $next = URI->new_abs($match, $url);
$url = $next->as_string;
#sleep(1);
} else {
print "Couldn't find 'next' link.\n";
#print uncompress_redit_html($html); exit;
last;
}
}
return $hosts;
}
sub google_howmany {
# print STDERR "Trying $_[0]\n";
my $res = $ua->get("http://www.google.com/search?q=" . CGI::Util::escape(join("+", @_)));
die unless $res->is_success;
my $pg = $res->content;
$pg =~ s/,//gs;
if ($pg =~ / - did not match any documents\./) {
return 0;
}
my ($howmany) =
($pg =~ m|Results .*? of .*?<b>(\d+)</b>|s);
print $pg if (!defined($howmany));
return $howmany;
}
# reddit's HTML code is compressed to be all one line... this sucks to view in many editors/viewers
sub uncompress_redit_html {
my $html = shift;
$html =~ s#(</[^>]*>)#$1\n#g;
return $html;
}
Generated by GNU enscript 1.6.4.