# Source: http://paperlined.org/dev/src/pl/speech_word_frequency_analysis/word_count.pl

#!/usr/bin/perl

# Find the "keywords" that are most commonly used (excluding obviously trivial ones that are always most common, like "the", "I", etc.).

    use strict;
    use warnings;

    BEGIN {$ENV{WNHOME} ||= "/home/interiot/large/WordNet"}
    use WordNet::stem;
    use WordNet::QueryData;

    use Data::Dumper;


# Occurrence counters: stem (or the raw word, when no stem was found) => count.
my %words;

# Lookup tables of the words / stems that are left out of the report.
my %exclude_stems   = exclude_stems();
my %exclude_trivial = exclude_trivial();

# WordNet query handle, and the stemmer that is built on top of it.
my $wn      = WordNet::QueryData->new();
my $stemmer = WordNet::stem->new($wn);


# Tally every word read from the input files (or STDIN) into %words.
#
# Each line has its "&mdash;" entities and the punctuation characters ".",
# ";" and "," turned into spaces, and is then split on whitespace.  Every
# token is handed to the stemmer: each stem it returns is counted once; a
# token for which it returns nothing is counted verbatim.
while (my $line = <>) {
    # Replace the entity with a space instead of deleting it, so that
    # "freedom&mdash;and" is counted as two words rather than "freedomand".
    $line =~ s/&mdash;/ /g;
    $line =~ s/[.;,]/ /g;

    foreach my $token (split ' ', $line) {
        my @stems = $stemmer->stemWord($token);

        # Fall back to the raw token when the stemmer returned no stems.
        foreach my $key (@stems ? @stems : $token) {
            $words{$key}++;
        }
    }
}


# Output the statistics: one "<count> <word>" line per counted word, most
# frequent first.  Words whose lower-cased form is in %exclude_trivial, or
# that are in %exclude_stems, are dropped before sorting.  Words with equal
# counts are ordered alphabetically so that the report is identical from run
# to run (hash key order alone is not stable).
foreach my $w (sort { $words{$b} <=> $words{$a} or $a cmp $b }
               grep { !$exclude_trivial{lc($_)} && !$exclude_stems{$_} }
               keys %words) {
    printf "%4d %s\n", $words{$w}, $w;
}


# Common words that would otherwise dominate the report.
# Returns a flat (word => 1, ...) list, ready to be assigned to a hash.
# The words are grouped by length, shortest first.
sub exclude_trivial {
    my @trivial_words = qw(
        a I i
        as be by do go he in is it me my of on so to us we
        all and are but But can for has his new not see the our own who
        from good have keep know make many more must need over than them
        that they this what will with won't year
        every right
        before
        believe
    );

    return map { ($_ => 1) } @trivial_words;
}


# Odd stems that the stemmer emits and that don't seem relevant to the report.
# Returns a flat (stem => 1, ...) list, ready to be assigned to a hash.
sub exclude_stems {
    my @noise_stems = qw( u ha );

    return map { ($_ => 1) } @noise_stems;
}

# Generated by GNU enscript 1.6.4.