# Source: http://paperlined.org/dev/src/pl/speech_word_frequency_analysis/word_count.pl
#!/usr/bin/perl
# Find the "keywords" that are most commonly used (excluding obviously trivial
# ones that are always the most common, like "the", "I", etc.).
use strict;
use warnings;
BEGIN {$ENV{WNHOME} ||= "/home/interiot/large/WordNet"}
use WordNet::stem;
use WordNet::QueryData;
use Data::Dumper;
# Set up the WordNet-backed stemmer, the two exclusion tables, and the tally.
my $wn      = WordNet::QueryData->new();
my $stemmer = WordNet::stem->new($wn);
my %exclude_trivial = exclude_trivial();
my %exclude_stems   = exclude_stems();
my %words;    # stem (or raw token) => number of occurrences

# process each word
# Input comes from the files named on the command line, or STDIN if none.
while (my $line = <>) {
    # An em-dash usually sits between two words with no surrounding spaces,
    # so turn it into a separator; deleting it would fuse the neighbouring
    # words into a single bogus token.
    $line =~ s/—/ /g;
    $line =~ s/[\.;,]/ /g;

    foreach my $token (split ' ', $line) {
        my @stems = $stemmer->stemWord($token);

        # Count every stem returned; when the stemmer returns nothing,
        # count the token itself.
        if (@stems) {
            $words{$_}++ for @stems;
        }
        else {
            $words{$token}++;
        }
    }
}
# output the statistics
# Words are listed most-frequent first.  Ties are broken alphabetically so
# the report is identical from run to run (hash key order alone is not
# stable).  Excluded words are dropped before sorting: trivial words are
# matched case-insensitively, noise stems are matched exactly.
foreach my $w (
    sort { $words{$b} <=> $words{$a} or $a cmp $b }
    grep { !$exclude_trivial{ lc $_ } && !$exclude_stems{$_} }
    keys %words
) {
    printf "%4d %s\n", $words{$w}, $w;
}
# Everyday words that would otherwise dominate the frequency report.
# Returns a flat (word, 1, word, 1, ...) list, ready to be assigned to a hash
# used as a lookup table.
sub exclude_trivial {
    my @trivial_words = qw(
        a I i
        as be by do go he in is it me my of on so to us we
        all and are but But can for has his new not see the our own who
        from good have keep know make many more must need over than them
        that they this what will with
        won't year
        every right
        before believe
    );

    return map { ($_ => 1) } @trivial_words;
}
# Odd stems that the stemmer produces and that don't seem relevant to the
# report.  Returns a flat (stem, 1, ...) list for building a lookup hash.
sub exclude_stems {
    my @noise_stems = qw( u ha );

    return map { ($_ => 1) } @noise_stems;
}
# Generated by GNU enscript 1.6.4.