http://paperlined.org/dev/src/pl/digg/identify_group_diggers.pl
#!/usr/bin/perl
# Given the URLs of several Digg stories, this will try to identify commanlities among users who
# initially dugg it up.
#
# To use, just pipe the URLs on STDIN... (eg. you can record them to a file first, if you want...)
# TODO:
# - this probably won't work once a story gets many diggers... I need to make it always go back
# to the first page or two, and record only those usernames.
use strict;
use warnings;
use LWP::Simple;
use List::Util qw[sum];
use Data::Dumper;
my @urls = split ' ', do {local $/=undef; <>};
#print Dumper \@urls;
my %url_key;
my %users;
foreach my $url (@urls) {
$url .= "/who" unless ($url =~ /\/who$/i);
my $url_key = chr(ord('A') + scalar(keys(%url_key)));
$url_key{$url} = $url_key;
print "Fetching $url\n";
#my $html = LWP::Simple::get($url);
my $html = `w3m -dump_source '$url' | gzip -dcf -`;
#my $html = "";
if (!$html) {
print "Unable to fetch $url\n";
next;
}
# trim off everything but the list we're interested in
$html =~ s#^.*?<ul class=["']?user-list["']?>##si;
$html =~ s#</ul>.*##si;
#print $html; exit;
my @users = ($html =~ m#>([^<]+)</a>#g);
@users = reverse @users;
for (my $ctr=0; $ctr<@users; $ctr++) {
my $user = $users[$ctr];
$users{$user} ||= [];
push(@{$users{$user}}, [$url, $ctr+1]);
}
}
# figure out the sort order
my %user_score;
while (my ($user, $data) = each %users) {
my $cnt = scalar(@$data);
my $avg_place = sum(map {$_->[1]} @$data) / $cnt;
# make it so that users with a higher count are sorted first, but when two users have the same
# number of diggs, that the one that dugg earlier gets sorted first
$user_score{$user} = $cnt + 1 / ($avg_place+1);
}
my @user_order = sort {$user_score{$b} <=> $user_score{$a}} keys %users;
##### Output the report #####
print "Stories apparently involved in this spam group-digging:\n";
foreach my $url (sort {$url_key{$a} cmp $url_key{$b}} keys %url_key) {
print "$url_key{$url}) $url\n";
}
print "\nUsers apparently involved in doing the initial group-digging of these stories:\n";
foreach my $user (@user_order) {
print "---- user $user ----\n";
foreach my $listref (sort {$url_key{$a->[0]} cmp $url_key{$b->[0]}} @{$users{$user}}) {
my $position = suffix_number($listref->[1]) . " digger";
$position = "submitter" if ($listref->[1] == 1);
print "Story " . $url_key{$listref->[0]} . ": $position.\n";
}
print "\n";
}
our @letter_suffixes;
BEGIN {
@letter_suffixes = qw[
th
st
nd
rd
th
th
th
th
th
th
];
}
# given a number, like '22', add the suffix to turn it into '22nd'
sub suffix_number {
my $num = scalar(shift);
my $rightnum = substr($num, -1);
if ($num >10 && $num<20) { # "teens" always end in "th" (it's "2nd", "12th", "22nd", "32nd", .... teens are the odd-man-out
return $num . "th";
} else {
return $num . $letter_suffixes[$rightnum];
}
}
Generated by GNU enscript 1.6.4.