http://paperlined.org/dev/perl/cpan/local_mirror/utils/cpan_grep.pl

#!/usr/bin/perl

# Looks for regexp matches within a local mirror of CPAN.
#
# Designed to work with CPAN::Mini.
#
# Searches through .tar.gz'd files, so this takes a lot of CPU and some clock time.  However, it minimizes long-term disk space.



# TODO:
#
#       # integrate  http://search.cpan.org/perldoc?ppigrep   into this?
#                or  http://search.cpan.org/perldoc?grepl   ?
#
#       - important -- for domain=unlimited searches, it should DETERMINE IF THE MATCH was in documentation-only or not
#               - this documentation-only flag should be shown on reports, particularly the HTML report
#               - the documentation-only flag should be set for all non-Perl and non-XS code (eg. 'README', 'Todo', ...)
#
#       - release this to CPAN as CPAN::Mini::Grep   and CPAN::Mini::Grep::HTML
#
#       - currently it only focuses on .pm (Perl) files....   add the ability to look at .xs files too
#
#               - (personal project) and then look for B::Generate-like signatures...
#                           newSVrv()
#                           newRV_noinc()
#
#       - add flags to search different domains:
#               Perl code only, no comments
#               Perl code only, comments included
#               XS code only
#               EVERYTHING / unrestricted
#
#       - (possibly)  in a fork of this script:  add search-indexing functionality, at least to be able to narrow down the search?
#               - KinoSearch looks good


    use strict;
    use warnings;

    use Archive::Tar;
    use Pod::Parser;
    use CGI;
    use Cwd 'abs_path';
    use File::Basename;


    use Data::Dumper;



    # the location of your CPAN::Mini mirror
    my $mirror_location = '/cpan_mirror/mirror/authors/';
    my $log_location = 'logs/';


my $eval_pattern = shift or die "Please specify a Perl snippet of regexp(es), for matching files.\n\nFor example:\n\t$0 '/matches_somethign/ && /matches_something_else/'\n";

-d $mirror_location or die "\$mirror_location is set incorrectly ($mirror_location)\n";

$Archive::Tar::WARN = 0;        # quiet warning messages

my $log_filename = "${log_location}grep." . time();
tee($log_filename);

print "   Searching for:  $eval_pattern\n";
print "HTML result file:  $log_filename.html\n\n";

open my $html, '>', "$log_filename.html"        or die $!;
html_header();



my @archives;
open PIN, "-|", "find", $mirror_location, "-type", "f", "-o", "-type", "l"
        or die $!;
while (<PIN>) {
    chomp;
    push(@archives, $_)     if (/\.tar\.gz$/i);
}

# sort by distribution name -- without this, the results end up being sorted by author name
        sub distribution_name {  local $_ = shift;  s#^.*/##;  $_  }
@archives = sort {lc(distribution_name($a)) cmp lc(distribution_name($b))} @archives;

foreach my $archive (@archives) {
    search_archive($archive);
}



sub search_archive {
    my $tarball = shift;

    #print "============== $tarball ==============\n";

    # tarballs that we get hung up on for one reason or another
    return if ($tarball =~ m#/Lingua-StanfordCoreNLP-#);

    my $tar = Archive::Tar->new($tarball);

    foreach my $filename ($tar->list_files()) {
        #next unless (filename_filter($filename));
        (my $filename_sans_package = $filename) =~ s#^[^/]*/##s;
        next unless (filename_filter($filename_sans_package));

        my $contents = $tar->get_content($filename);

        ### CONFIGURATION:  enable or disable this (currently manually, later via flags)
        #$contents = remove_pod($contents);                     

        next unless ($contents);
        my $is_match = 0;
        {
            local $_ = $contents;
            $is_match = eval $eval_pattern;
        }
        #if ($contents =~ /$pattern/o) {
        if ($is_match) {
            my $module;
            #$module = parse_package_name($contents)     if ($filename =~ /\.pm$/);
            $module = get_Pod_NAME_module__from_string($contents)    if ($filename =~ /\.(?:pm|pod)$/);
            print_hit($tarball, $module, $filename);
        }
    }
}


sub filename_filter {
    local $_ = shift;

    ### CONFIGURATION:  enable or disable these (currently manually, later via flags)

    #return unless (/\.pm$/);
    return if (/^META\.yml$|^Meta\.json$|^Build\.PL$|^Makefile\.PL$|^dist.ini$|^Changes$/);

    return if (m#^inc/#si);
    return if (m#^t/#si || /\.t$/);

    return 1;
}


# get the name of the first package defined in the provided Perl source code
sub parse_package_name {
    my ($file_contents) = @_;

    if ($file_contents =~ /\bpackage\s+([a-z][a-z0-9:_]*)\s*;/si) {
        return $1;
    }
    return undef;
}



BEGIN {
    our %seen;

    sub print_hit {
        my ($tarball, $module, $inside_filename) = @_;

        my $author = basename(dirname($tarball));

        #print Dumper \@_, $author; exit;

        (my $distribution = $tarball) =~ s#.*/##;
        $distribution =~ s/\.tar\.gz$//s;
        my $distro_with_version = $distribution;
        $distribution =~ s/-[0-9\.]+$//s;

        $inside_filename =~ s#^[^/]*/##s;

        $module = '-' unless (defined($module) && $module =~ /\S/);

        printf "%-40s  %-40s  %s\n", $distribution, $module, $inside_filename;

        print $html "<tr><td><a href='https://metacpan.org/release/$distribution/'>$distribution</a>\n";
        if ($module eq '-') {
            print $html "    <td><a href='https://metacpan.org/source/$author/$distro_with_version/'>-</a>\n";
        } else {
            print $html "    <td><a href='https://metacpan.org/module/$module'>$module</a>\n";
        }
        print $html "    <td><a href='https://metacpan.org/source/$author/$distro_with_version/$inside_filename'>$inside_filename</a>\n";
    }
}




# replicate everything that goes to STDOUT to a file too  (uses a forked subprocess)
use autodie;
sub tee {
    my ($filename) = @_;
    open my $origstdout, '>&STDOUT';
    open my $fh, ">$filename";
    if (!open STDOUT, '|-') {
        ## child process
        while (sysread(STDIN, my $buffer, 1024)) {
            syswrite($origstdout, $buffer);
            syswrite($fh, $buffer);
        }
        exit;
    }
    ## parent process continues on and returns
    close $origstdout;
    close $fh;
}


# quickly read a whole file
sub slurp {my$p=open(my$f,"$_[0]")or die$!;my@o=<$f>;close$f;waitpid($p,0);wantarray?@o:join("",@o)}

# display a string to the user, via less
sub less {my$pid=open my$less,"|less";print$less @_;close$less;waitpid$pid,0}





# Avoid searching documentation-only text, by removing it.
#
# PerlTidy's --delete-pod and --delete-all-comments could also be used for this
sub remove_pod {
    my $perl_code = shift;

    my $fh = IO::String->new($perl_code);
    open my $devnull, ">/dev/null"  or die $!;

    my $parser = Pod::Parser::RemovePod->new();
    $parser->parseopts(-want_nonPODs => 1);
    $parser->{NonPod} = '';
    $parser->parse_from_filehandle($fh, $devnull);
    return $parser->{NonPod};
}




sub html_header {
    print $html <<'EOF';
<style>
    /* --==##  links aren't underlined unless you :hover  ##==-- */
    a:hover {text-decoration:underline}
    a {text-decoration:none}
    @media print { a {text-decoration:underline} }

    /* --==##  make h1/h2/h3 stand out with bars  ##==-- */
    h1, h2, h3 {padding:0.3em; border-top:2px solid #000; border-bottom:2px solid #000;
    background-color:#ccc; margin-top:2em}
    body>h1:first-child, body>h2:first-child, body>h3:first-child {margin-top:0}

    /* --==##  table cells have a nice border  ##==-- */
    table.wikitable {border-collapse:collapse}
    table.wikitable td, table.wikitable th {border:1px solid #aaa; padding:0.3em}
    table.wikitable th {background-color:#000; color:#fff}
    table.wikitable th a {color:#aaf}
    table.wikitable th a:visited {color:#faf}

    /* --==##  kbd has gray background  ##==-- */
    kbd {background-color:#bbb}

    /* --==##  selectively make ul/ol spaced (non-cascading)  ##==-- */
    ul.spaced > li, ol.spaced > li {margin-bottom:1em}
    .spaced > li > .spaced {margin-top:1em}

    /* --==##  CSS reset  ##==-- */
    a img {border:0}
</style>


EOF

    print $html "<p><b>Results for:</b> <kbd>", CGI::escapeHTML($eval_pattern), "</kbd>";
    print $html "<p><br><table class=wikitable>\n";
}


BEGIN {
    package Pod::Parser::RemovePod;

        use vars qw[@ISA];
        @ISA = ("Pod::Parser");
        use Pod::Parser;

    sub preprocess_paragraph {
        my ($self, $text, $line_num) = @_;

        if ($self->cutting()) {
            #print "$text\n";
            $self->{NonPod} .= $text;
        }

        return $text;
    }

    # quelch POD parse errors
    sub errorsub {
        sub { 1 };
    }

}



# use Pod::Select;
# use IO::String;
# 
# # Given a file with POD documentation in it,
# # looks for the NAME section, and if there, looks for a module name
# # there, using the semi-standard formatting of (module name)(dash).
# # Return it if it's found, otherwise return undef.
# sub get_Pod_NAME_module {
#     my ($fh_or_filename) = @_;
# 
#     my $out = new IO::String;
# 
#     podselect({-output => $out, -sections => ["NAME"]}, $fh_or_filename);
# 
#     my $NAME_section = ${ $out->string_ref };
# 
#     if ($NAME_section) {
#         $NAME_section =~ s/^.*?[\n\r]+//s;
#         if ($NAME_section =~ /^(\S+)\s+-\s+/s) {
#             return $1;
#         }
#     }
#     return undef;
# }



use Pod::Simple::SimpleTree;

sub get_Pod_NAME_module__from_string {
    my ($file_contents) = @_;

    my $tree = Pod::Simple::SimpleTree->new->parse_string_document($file_contents);

    my $return_next_one = 0;
    foreach my $token ( @{$tree->{root}} ) {
        next unless (ref($token) eq 'ARRAY');

        if ($token->[0] eq 'head1' && $token->[2] eq 'NAME') {
            $return_next_one++;
            next;
        }
        if ($return_next_one && $token->[0] eq 'Para') {
            my $NAME_section = $token->[2];
            if ($NAME_section =~ /^(\S+)\s+-\s+/s) {
                return $1;
            }
            return undef;
        }
    }
    return undef;
}

Generated by GNU enscript 1.6.4.