paperlined.org
apps > wikipedia > technical > database_dump
document updated 17 years ago, on Apr 21, 2007
#!/usr/bin/perl

# Scans the text of all disambiguation pages, and assigns each one a score based on how badly formatted it is.
# The worst-formatted ones are highlighted, so that I can tag them with {{disambig-cleanup}} (or, somewhat often, recognize that the worst aren't even disambig pages, and change them to {{split}} or whatnot)

	use strict;
	use warnings;

	use FindBin;
	use lib "$FindBin::Bin/";

	use MediawikiDump;      # my library of Mediawiki-dump-handling tools...  difficult for others
                            # to use right now, especially with my indexes that require 1.5GB of
                            # RAM, but if anyone really wants to run this, ask at [[User talk:Interiot]]
	use Data::Dumper;

# Load the index that accompanies the pages-meta-current dump.  Only the
# pageid_offset table is requested; the scan below uses it to turn a page id
# into the offset handed to MediawikiDump::load_page().
my $indexes = MediawikiDump::load_inmem_index(
	".enwiki-*-pages-meta-current.xml.idx",
	[ 'pageid_offset' ],
);


# Set of category names whose members are treated as disambiguation pages
# (category name => 1).
# generated from /home/interiot/src/disambig/Disambig_cats on the toolserver
#
# The list below is written with underscores; EVERY underscore is turned into a
# space so that multi-word names come out as e.g. "Lists of ambiguous numbers".
# NOTE(review): this assumes read_sql() hands back cl_to with spaces rather than
# underscores -- confirm against MediawikiDump.
my %disambig_categories = map {
	(my $category = $_) =~ tr/_/ /;
	($category => 1);
} qw[
	Disambiguation
	Disambiguation_and_redirection_templates
	Disambiguation_lists_of_ships
	Lists_of_political_parties_by_generic_name
	Lists_of_ambiguous_numbers
	Lists_of_Interstate_Highways_sharing_the_same_title
	Lists_of_ambiguous_human_names
	Lists_of_five-character_combinations
	WikiProject_disambiguation_pages
	Disambig-Class_television_articles
	Mesoamerica_disambiguation_pages
	Disambig-Class_film_articles
	WikiProject_Middle-earth_disambiguation_pages
	Lists_of_four-character_combinations
	Lists_of_three-character_combinations
	3-letter_acronyms
	Lists_of_two-character_combinations
	Lists_of_U.S._Routes_sharing_the_same_title
	Tropical_cyclone_disambiguation
	Educational_institution_disambiguation
	Lists_of_roads_sharing_the_same_title
	Mathematical_disambiguation
	Disambiguation_pages_in_need_of_cleanup
	Ambiguous_place_names
	Lists_of_ambiguous_place_names_in_Wisconsin
];

# Open the newest pages-meta-current dump for reading.  The bareword handle FIN
# is kept on purpose: the categorylinks callback passes \*FIN to load_page().
# Three-argument open keeps the mode separate from the path, and the error
# messages say which file failed and why.
my $dump_path = MediawikiDump::glob_latest("enwiki-*-pages-meta-current.xml");
defined $dump_path
	or die "No dump file matching enwiki-*-pages-meta-current.xml was found";
open(FIN, '<', $dump_path)
	or die "Cannot open $dump_path for reading: $!";

## Find all pages that are in one of the disambiguation categories
my %dab_page_ids;	# page_id => number of matching categorylinks rows seen so far
my @scores;		# [score, title] pairs for every page that gets reported

# Walk the categorylinks dump.  The callback receives one row as a hashref keyed
# by the column names listed here; each page that sits in a disambiguation
# category is loaded from the page dump and scored exactly once.
MediawikiDump::read_sql("enwiki-*-categorylinks.sql", [qw[cl_from cl_to cl_sortkey cl_timestamp]], sub {
	my $fields = shift;

	# Only rows that link into one of the disambig categories are interesting.
	return unless $disambig_categories{ $fields->{cl_to} };

	# A page can be in several of those categories; handle it the first time only.
	my $page_id = $fields->{cl_from};
	return if $dab_page_ids{$page_id}++;
	#print "--------Found dab page_id: $page_id\n";

	# Skip pages the index has no offset for, rather than handing undef to load_page().
	my $offset = $indexes->{pageid_offset}{$page_id};
	return unless defined $offset;

	my $page = MediawikiDump::load_page(\*FIN, $offset) or return;
	#print Dumper $page; exit;
	my $title = $page->{title}{CONTENT};

	my ($ns) = split_namespace_article($title);
	return if ($ns != 0);	# only pay attention to mainspace articles (eg. [[Wikipedia:Disambiguation]] is in [[Category:Disambiguation]], along with a bunch of other non-disambig pages)

	my $score = score_disambig($page->{revision}{text}{CONTENT});

	# Report only the really bad pages; the "List of highways numbered" pages are deliberately left out.
	if ($score < -100 && $title !~ /List of highways numbered/) {
		print blankEOL();	# explicit empty argument list: "&blankEOL" would silently forward this callback's @_
		printf("%7.2f  %s\n", $score, $title);
		push(@scores, [$score, $title]);
	}
});

# Write the reported pages to worst_disambigs.txt, lowest (worst) score first.
# @scores is sorted in place, as before.
@scores = sort { $a->[0] <=> $b->[0] } @scores;

my $report_path = "worst_disambigs.txt";
open(my $report_fh, '>', $report_path)
	or die "Cannot open $report_path for writing: $!";
foreach my $entry (@scores) {
	printf {$report_fh} "%7.2f  %s\n", $entry->[0], $entry->[1];
}
# Buffered write errors only show up at close time, so check it.
close($report_fh)
	or die "Cannot close $report_path: $!";


# Scores the wikitext of a (supposed) disambiguation page by how badly it is
# formatted.
#
#   $text   - the page's wikitext; undef is treated like an empty page
#   returns - a number that starts at 0 and is only ever decreased, so the more
#             negative the result, the worse the page
#
# The text is examined line by line.  A line whose first non-blank characters
# are optional colons followed by "*" is treated as a disambiguation entry and
# checked against a list of style problems.  Every other line only contributes
# its word count, which is used at the end to penalise pages that carry a lot of
# prose outside their entries.  Images and table markup cost points on any line.
sub score_disambig {
	my $text = shift;
	$text = '' unless defined $text;	# a page without text scores 0 instead of warning inside split

	my $score = 0;

	my $num_non_entry_words = 0;

	foreach my $line (split /[\n\r]+/, $text) {
		$score -= 7 if $line =~ /\[\[Image:/;	# if an image is used anywhere, it's a bad thing

		if ($line =~ /^\s*:*\*/) {		# appears to be an entry
			$score-- if $line =~ /'''/;			# unnecessary bolding
			$score -= 0.5 if $line =~ /[.;,]\s*$/;	# unnecessary punctuation

			if ($line =~ /\b(?:is|was) (?:a|the) /) {		# should be replaced with ", "
				$score--;
			} elsif ($line =~ /&mdash;/ || $line =~ / -- / || $line !~ /,/) {	# should have a ","		(note that &ndash; is acceptable, as it is frequently used between birth years of people)
				$score--;
			}

			# if the entry starts with anything but an internal link, then it's probably not a MOSDAB entry (eg. "In chemistry, [[X]] is blah blah")
			#	(okay: there can be things like "Bob, a fictional character in the movie [[List of characters in Bladefart#Bob|''Bladefart'']]",
			#	 but that's increasingly rare, especially with [[Bob (Bladefart character)]] now being able to be a #REDIRECT that includes an anchor)
			$score -= .5 if $line =~ /^[^a-z\[\]]*[a-z]/i;

			if ($line =~ /for\b.*[;,]\s*see/i) {		# an entry of the form "for the blah blah, see [[X]]"...  sometimes this style pops up, and it's BAAAD
				$score -= 2;
			} elsif ($line =~ /[;,]\s*see\b/i) {		# sometimes minor variants of the above pop up
				$score--;
			} elsif ($line =~ /\bsee\b/i) {
				$score -= 0.2;
			}

			$score -= 0.2 if $line =~ /\bname of\b/;	# this often pops up in the lead sentence of articles, but IMHO a whole string of them in a disambiguation page is unnecessary

			if ($line =~ /\[\[\d+\]\]/
				|| $line =~ /\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December)(?: \d+)?\]\]/) {	# linking a DATE!!!
				$score -= 3;
			}

			# Number of (non-overlapping) "[[" openers in the entry, i.e. how many internal links it has.
			my $num_links = () = $line =~ /\[\[/g;

			if ($num_links >= 5) {		# an entry with 5 links is amazingly ridiculous
				$score -= 10;
			} elsif ($num_links == 4) {	# totally unacceptable
				$score -= 3;
			} elsif ($num_links == 3) {	# egregious linking (even for dedicated MOSDABers this is absolutely the limit of what's acceptable)
				$score--;
			} elsif ($num_links == 2) {	# pretty bad linking
				$score -= 0.3;
			}

			if ($num_links == 0) {
				if ($line =~ /\[/) {	# an entry that contains an external link, but no internal link...
					$score -= 4;
				} else {		# no links whatsoever
					$score -= 2;
				}
			}

			$score -= 3 if $line =~ /\[\[[^\|\]]*\|[^'"\]]*\]\]/;	# a piped link that has no ' or " on the right side

			# Literal braces are escaped: newer perls warn about (or reject) a bare "{" in a pattern.
			$score -= 2 if $line =~ /\{\{.*\}\}/;	# templates don't belong in disambig entries
		} else {
			# keep track of the number of words used outside entries
			$num_non_entry_words += scalar(grep { /\w/ } split(" ", $line));
		}

		$score -= 3 if $line =~ m#<table>|<tr>|<td>|\{\||\|-|\|\}#;	# lose points for including a table, especially a long one
	}

	if ($num_non_entry_words > 800) {
		$score -= 150;			# almost certainly not a disambiguation page
	} elsif ($num_non_entry_words > 300) {
		$score -= 40;			# a long intro, at the very least
	}

	return $score;
}