document updated 17 years ago, on Apr 21, 2007
#!/usr/bin/perl
# Scans the text of all disambiguation pages, and assigns each one a score based on how badly formatted it is.
# The worst-formatted ones are highlighted, so that I can tag them with {{disambig-cleanup}} (or, fairly often, recognize that the worst aren't even disambig pages, and change the tag to {{split}} or whatnot)
use strict;
use warnings;
use FindBin;
use lib "$FindBin::Bin/";
use MediawikiDump; # my library of Mediawiki-dump-handling tools... difficult for others
# to use right now, especially with my indexes that require 1.5GB of
# RAM, but if anyone really wants to run this, ask at [[User talk:Interiot]]
use Data::Dumper;
# Load the "pageid_offset" index for the pages-meta-current dump into memory.
# It is consulted later as $indexes->{pageid_offset}{$page_id} when a page is
# loaded from the XML dump (presumably a page_id => file offset map -- confirm
# against MediawikiDump).
my $indexes = MediawikiDump::load_inmem_index(".enwiki-*-pages-meta-current.xml.idx", [qw[ pageid_offset ]]);
# Lookup table of the categories that mark a page as a disambiguation page
# (category name => 1).
# Generated from /home/interiot/src/disambig/Disambig_cats on the toolserver.
#
# The names are listed with underscores so they can live inside qw[]; EVERY
# underscore is converted to a space to build the key, e.g.
# "Lists_of_ambiguous_numbers" becomes "Lists of ambiguous numbers".
# NOTE(review): this assumes the cl_to values handed to the read_sql callback
# use spaces rather than underscores -- confirm against MediawikiDump.
my %disambig_categories = map {
    (my $category = $_) =~ s/_/ /g;    # /g: most names contain several underscores
    ($category => 1);
} qw[
    Disambiguation
    Disambiguation_and_redirection_templates
    Disambiguation_lists_of_ships
    Lists_of_political_parties_by_generic_name
    Lists_of_ambiguous_numbers
    Lists_of_Interstate_Highways_sharing_the_same_title
    Lists_of_ambiguous_human_names
    Lists_of_five-character_combinations
    WikiProject_disambiguation_pages
    Disambig-Class_television_articles
    Mesoamerica_disambiguation_pages
    Disambig-Class_film_articles
    WikiProject_Middle-earth_disambiguation_pages
    Lists_of_four-character_combinations
    Lists_of_three-character_combinations
    3-letter_acronyms
    Lists_of_two-character_combinations
    Lists_of_U.S._Routes_sharing_the_same_title
    Tropical_cyclone_disambiguation
    Educational_institution_disambiguation
    Lists_of_roads_sharing_the_same_title
    Mathematical_disambiguation
    Disambiguation_pages_in_need_of_cleanup
    Ambiguous_place_names
    Lists_of_ambiguous_place_names_in_Wisconsin
];
# Open the newest pages-meta-current XML dump.  Pages are loaded from this
# handle (via the pageid_offset index) while the category links are scanned.
my $dump_path = MediawikiDump::glob_latest("enwiki-*-pages-meta-current.xml");
defined $dump_path
    or die "No enwiki-*-pages-meta-current.xml dump found\n";
open(my $dump_fh, '<', $dump_path)
    or die "Cannot open $dump_path for reading: $!\n";

## Find all pages that are in one of the disambiguation categories
my %dab_page_ids;    # page_id => how many disambig-category rows were seen for it
my @scores;          # [$score, $title] for every page whose score is below -100
MediawikiDump::read_sql("enwiki-*-categorylinks.sql", [qw[cl_from cl_to cl_sortkey cl_timestamp]], sub {
    # Keep the plain shift: the "&blankEOL;" call further down forwards
    # whatever is left in @_, so @_ must be handled exactly as before.
    my $fields = shift;

    # Only rows linking a page to one of the disambig categories are relevant.
    return unless $disambig_categories{ $fields->{cl_to} };

    # A page can sit in several disambig categories; score it only once.
    my $page_id = $fields->{cl_from};
    return if $dab_page_ids{$page_id}++;
    #print "--------Found dab page_id: $page_id\n";

    my $page = MediawikiDump::load_page($dump_fh, $indexes->{pageid_offset}{$page_id})
        or return;
    #print Dumper $page; exit;

    my $title = $page->{title}{CONTENT};

    # Only pay attention to mainspace articles (eg. [[Wikipedia:Disambiguation]]
    # is in [[Category:Disambiguation]], along with a bunch of other
    # non-disambig pages).
    # NOTE(review): split_namespace_article is not defined in this file;
    # presumably it is exported by MediawikiDump and returns a numeric
    # namespace first -- confirm.
    my ($ns) = split_namespace_article($title);
    return if ($ns != 0);

    my $score = score_disambig($page->{revision}{text}{CONTENT});
    if ($score < -100 && $title !~ /List of highways numbered/) {
        # NOTE(review): blankEOL is not defined in this file either; the call
        # is left in its original form on purpose (see the note on @_ above).
        print &blankEOL;
        printf("%7.2f %s\n", $score, $title);
        push(@scores, [$score, $title]);
    }
});
close($dump_fh);
# Write every recorded page to worst_disambigs.txt, one "score title" line
# each, ordered from the most negative (worst) score upwards.
@scores = sort { $a->[0] <=> $b->[0] } @scores;
my $report_path = "worst_disambigs.txt";
open(my $report_fh, '>', $report_path)
    or die "Cannot open $report_path for writing: $!\n";
foreach my $entry (@scores) {
    my ($score, $title) = @$entry;
    printf {$report_fh} "%7.2f %s\n", $score, $title;
}
# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close($report_fh)
    or die "Cannot close $report_path: $!\n";
# score_disambig($text)
#
# Scores the wikitext of one page by how badly it is formatted for a
# disambiguation page.  The score starts at 0 and only ever goes down: each
# formatting problem found subtracts a penalty, so the most negative score
# belongs to the worst page.
#
# The text is examined line by line.  A line whose first non-blank characters
# are optional colons followed by "*" is treated as a disambiguation entry and
# gets the per-entry checks; the words on all other lines are counted, and a
# large count is penalised at the end.
#
# Parameters:
#   $text - the page's wikitext; undef is treated like empty text.
# Returns:
#   the (zero or negative) numeric score.
sub score_disambig {
    my $text = shift;

    # A page without any text has nothing to penalise (and splitting undef
    # would only produce an "uninitialized value" warning).
    return 0 unless defined $text;

    my $score               = 0;
    my $num_non_entry_words = 0;

    foreach my $line (split /[\n\r]+/, $text) {
        $score -= 7 if $line =~ /\[\[Image:/;    # if an image is used anywhere, it's a bad thing

        if ($line =~ /^\s*:*\*/) {               # appears to be an entry
            $score--      if $line =~ /'''/;         # unnecessary bolding
            $score -= 0.5 if $line =~ /[.;,]\s*$/;   # unnecessary punctuation

            if ($line =~ /\b(?:is|was) (?:a|the) /) {    # should be replaced with ", "
                $score--;
            } elsif ($line =~ /—/ || $line =~ / -- / || $line !~ /,/) {
                # should have a "," (note that – is acceptable, as it is
                # frequently used between birth years of people)
                $score--;
            }

            # If the entry starts with anything but an internal link, then it's
            # probably not a MOSDAB entry (eg. "In chemistry, [[X]] is blah blah").
            # (okay: there can be things like "Bob, a fictional character in the
            # movie [[List of characters in Bladefart#Bob|''Bladefart'']]", but
            # that's increasingly rare, especially with [[Bob (Bladefart character)]]
            # now being able to be a #REDIRECT that includes an anchor)
            $score -= .5 if $line =~ /^[^a-z\[\]]*[a-z]/i;

            if ($line =~ /for\b.*[;,]\s*see/i) {
                # an entry of the form "for the blah blah, see [[X]]"...
                # sometimes this style pops up, and it's BAAAD
                $score -= 2;
            } elsif ($line =~ /[;,]\s*see\b/i) {    # sometimes minor variants of the above pop up
                $score--;
            } elsif ($line =~ /\bsee\b/i) {
                $score -= 0.2;
            }

            # "name of" often pops up in the lead sentence of articles, but IMHO
            # a whole string of them in a disambiguation page is unnecessary.
            $score -= 0.2 if $line =~ /\bname of\b/;

            # linking a DATE!!!
            if (   $line =~ /\[\[\d+\]\]/
                || $line =~ /\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December)(?: \d+)?\]\]/)
            {
                $score -= 3;
            }

            # Penalise by the number of internal links; only the highest
            # matching tier applies.
            if ($line =~ /\[\[.*\[\[.*\[\[.*\[\[.*\[\[/) {    # an entry with 5 links is amazingly ridiculous
                $score -= 10;
            } elsif ($line =~ /\[\[.*\[\[.*\[\[.*\[\[/) {     # totally unacceptable
                $score -= 3;
            } elsif ($line =~ /\[\[.*\[\[.*\[\[/) {
                # egregious linking (even for dedicated MOSDABers this is
                # absolutely the limit of what's acceptable)
                $score--;
            } elsif ($line =~ /\[\[.*\[\[/) {                 # pretty bad linking
                $score -= 0.3;
            }

            if ($line !~ /\[\[/ && $line =~ /\[/) {
                $score -= 4;    # an entry that contains an external link, but no internal link...
            } elsif ($line !~ /\[\[/) {
                $score -= 2;    # no links whatsoever
            }

            # a piped link that has no ' or " on the right side
            $score -= 3 if $line =~ /\[\[[^\|\]]*\|[^'"\]]*\]\]/;

            # templates don't belong in disambig entries
            # (braces are escaped so they are unambiguously literal)
            $score -= 2 if $line =~ /\{\{.*\}\}/;
        } else {
            # keep track of the number of words used outside entries
            $num_non_entry_words += scalar(grep { /\w/ } split(" ", $line));
        }

        # lose points for including a table, especially a long one
        # (checked on every line, entry or not)
        $score -= 3 if $line =~ m#<table>|<tr>|<td>|\{\||\|-|\|\}#;
    }

    if ($num_non_entry_words > 800) {
        $score -= 150;    # almost certainly not a disambiguation page
    } elsif ($num_non_entry_words > 300) {
        $score -= 40;     # a long intro, at the very least
    }

    return $score;
}