#!/usr/bin/perl

use strict;
use warnings;

use PDL;

# Collection-wide metric store, shared by reference with every subroutine.
# Each array receives one entry per processed file, except 'word length',
# which receives one entry per distinct word once scanning has finished.
# 'lexicon' maps each (lower-cased, punctuation-stripped) word to the number
# of times it occurred across the whole collection.
my $metrics = {
    'file_count'  => 0,
    't_lines'     => 0,
    'lines'       => [],
    'paragraphs'  => [],
    'characters'  => [],
    'sentences'   => [],
    'words'       => [],
    'word length' => [],   # key must match the name used in the report below
    'unique'      => [],
    'lexicon'     => {},
};

print "====== 'Text Metric'inator ======\n";
print "Reports on the text metrics of a collection of TXT files while also attempting\n";
print "to take over the tri-state area.\n\n";

if (!defined $ARGV[0] || !-d $ARGV[0]) {
    print "Usage: text_metricinator.pl <directory>\n";
}
else {
    print " * Calculating text metrics from collection...\n";
    scanDirectory($ARGV[0], $metrics);
    print " Done\n\n";

    # Word lengths are derived from the finished lexicon so that every
    # distinct word contributes exactly one sample.
    foreach my $word (keys %{$metrics->{'lexicon'}}) {
        push(@{$metrics->{'word length'}}, length($word));
    }

    print "===== Report =====\n";
    print ' File count: ' . $metrics->{'file_count'} . "\n";
    my @reported_metrics = ('characters', 'lines', 'words', 'sentences', 'unique', 'word length');
    foreach my $metric (sort @reported_metrics) {
        my $report_line = generateReportLine($metric, $metrics);
        print $report_line . "\n";
    }
    print "Complete!\n\n";
}
exit;

## @function
# Given the path to a text file, calculates various text metrics and appends
# one entry per metric to the ever-growing metric data structure:
#   lines      - number of lines containing text
#   paragraphs - number of blank (whitespace-only) lines
#   sentences  - number of '.' characters found on text lines
#   characters - total length of all text lines (line terminators included)
#   words      - number of whitespace-separated tokens after lower-casing
#                and stripping punctuation
#   unique     - number of words in this file that were not yet present in
#                the collection-wide lexicon
# The lexicon occurrence counts are updated as a side effect.
#
# @param $txt_path  path of the text file to read (decoded as UTF-8)
# @param $metrics   reference to the shared metrics hash
# Dies if the file cannot be opened.
sub extractTextMetrics {
    my ($txt_path, $metrics) = @_;
    print ' - extracting metrics from: ' . $txt_path . "\n";

    my $number_of_lines_of_text = 0;
    my $number_of_characters    = 0;
    my $number_of_sentences     = 0;
    my $number_of_words         = 0;
    my $number_of_paragraphs    = 0;
    my $unique_words            = 0;

    open(my $txt_fh, '<:utf8', $txt_path)
        or die('Failed to open file for reading: ' . $txt_path . ': ' . $!);

    while (my $line = <$txt_fh>) {
        # A blank line is treated as a paragraph break and nothing else.
        if ($line =~ /^\s*$/) {
            $number_of_paragraphs++;
            next;
        }

        $number_of_lines_of_text++;
        $number_of_characters += length($line);

        # Count full stops directly; this stays correct for a final line
        # that has no trailing newline.
        $number_of_sentences += ($line =~ tr/.//);

        # try to sanitize text so it only contains 'words'
        my $clean_line = lc($line);
        $clean_line =~ s/[[:punct:]]//g;

        # split(' ') ignores leading whitespace, so indented lines do not
        # produce an empty pseudo-word.
        foreach my $word (split(' ', $clean_line)) {
            $number_of_words++;
            if (!exists $metrics->{'lexicon'}->{$word}) {
                $unique_words++;
                $metrics->{'lexicon'}->{$word} = 0;
            }
            $metrics->{'lexicon'}->{$word}++;
        }
    }
    close($txt_fh);

    # update metrics with entries for the information captured above
    push(@{$metrics->{'characters'}}, $number_of_characters);
    push(@{$metrics->{'lines'}},      $number_of_lines_of_text);
    push(@{$metrics->{'paragraphs'}}, $number_of_paragraphs);
    push(@{$metrics->{'sentences'}},  $number_of_sentences);
    push(@{$metrics->{'unique'}},     $unique_words);
    push(@{$metrics->{'words'}},      $number_of_words);
    return;
}
## extractTextMetrics() ##

## @function
# Builds one line of the report for a single metric by running PDL's
# statsover() over the samples collected for it.
#
# @param $metric   key into the metrics hash (e.g. 'word length'); also used,
#                  with each space-separated part capitalised, as the label
# @param $metrics  reference to the shared metrics hash
# @return formatted report line (no trailing newline); a short "no data"
#         line when no samples were collected for the metric
sub generateReportLine {
    my ($metric, $metrics) = @_;
    my $label = join " ", map {ucfirst} split / /, $metric;

    my @samples = @{$metrics->{$metric} || []};
    if (!@samples) {
        # Nothing was scanned for this metric; avoid feeding PDL an empty list.
        return sprintf(' %s: no data', $label);
    }

    # Passing an array reference always yields a one-dimensional piddle,
    # even when there is only a single sample.
    my $piddle = pdl(\@samples);

    # Per the PDL documentation statsover() returns, in order:
    # mean, prms, median, min, max, adev, rms.
    my @stats = statsover($piddle);
    return sprintf(' %s: avg: %0.2f, min: %d, med: %d, max: %d, sdev: %0.2f, adev: %0.2f, prms: %0.2f',
        $label, $stats[0], $stats[3], $stats[2], $stats[4], $stats[6], $stats[5], $stats[1]);
}
## generateReportLine() ##

## @function
# Recursively walks a directory, feeding every file whose name ends in
# ".txt" (case-insensitive) to extractTextMetrics() and incrementing the
# file counter. Entries whose names start with a dot are skipped, which
# also excludes "." and "..".
#
# @param $dir      directory to search
# @param $metrics  reference to the shared metrics hash
# A directory that cannot be opened is reported with a warning and skipped.
sub scanDirectory {
    my ($dir, $metrics) = @_;
    print ' - searching directory: ' . $dir . "\n";

    my $dir_handle;
    if (!opendir($dir_handle, $dir)) {
        warn('Failed to open directory for reading: ' . $dir . ': ' . $! . "\n");
        return;
    }
    # Read the whole listing up front so the handle is closed before recursing.
    my @files = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $file (@files) {
        # skip dotted files of any type
        next if $file =~ /^\./;

        my $path = $dir . '/' . $file;
        # recurse directories
        if (-d $path) {
            scanDirectory($path, $metrics);
        }
        # process TXT files
        elsif ($file =~ /\.txt$/i) {
            extractTextMetrics($path, $metrics);
            $metrics->{'file_count'}++;
        }
    }
    return;
}
## scanDirectory() ##