source: gs2-extensions/parallel-building/trunk/src/bin/script/text_metricinator.pl@ 29104

Last change on this file since 29104 was 29104, checked in by jmt12, 10 years ago

A script for extracting textual metrics from a collection of text files ala lorem

  • Property svn:executable set to *
File size: 4.1 KB
Line 
1#!/usr/bin/perl
2
3use PDL;
4use strict;
5use warnings;
6
7my $metrics = {'file_count' => 0,
8 't_lines' => 0,
9 'lines' => [],
10 'paragraphs' => [],
11 'characters' => [],
12 'sentences' => [],
13 'words' => [],
14 'wordlength' => [],
15 'unique' => [],
16 'lexicon' => {},
17 };
18
19print "====== 'Text Metric'inator ======\n";
20print "Reports on the text metrics of a collection of TXT files while also attempting\n";
21print "to take over the tri-state area.\n\n";
22if (!defined $ARGV[0] || !-d $ARGV[0])
23{
24 print "Usage: text_metricinator.pl <collect directory>\n";
25}
26else
27{
28 print " * Calculating text metrics from collection...\n";
29 &scanDirectory($ARGV[0], $metrics);
30 print " Done\n\n";
31
32 foreach my $word (keys %{$metrics->{'lexicon'}})
33 {
34 push(@{$metrics->{'word length'}}, length($word));
35 }
36
37 print "===== Report =====\n";
38 print ' File count: ' . $metrics->{'file_count'} . "\n";
39 foreach my $metric ( sort ('characters','lines','words','sentences','unique','word length') )
40 {
41 print &generateReportLine($metric, $metrics) . "\n";
42 }
43 print "Complete!\n\n";
44}
45exit;
46
47
48## @function
49# Given the path to a text file, calculates various text metrics and adds them
50# into the ever-growing metric data structure. Metrics include:
51# number of lines of text (easy)
52# number of paragraphs (lines with no text...)
53# number of sentences
54# number of characters
55# number of words
56# max word length
57# number of unique words encountered this document
58sub extractTextMetrics
59{
60 my ($txt_path, $metrics) = @_;
61 print ' - extracting metrics from: ' . $txt_path . "\n";
62 my $number_of_lines_of_text = 0;
63 my $number_of_characters = 0;
64 my $number_of_sentences = 0;
65 my $number_of_words = 0;
66 my $number_of_paragraphs = 0;
67 my $word_lengths = 0;
68 my $unique_words = 0;
69 open(TXTFIN, '<:utf8', $txt_path) or die('Failed to open file for reading: ' . $txt_path);
70 my $line = '';
71 while ($line = <TXTFIN>)
72 {
73 if ($line =~ /^\s*$/)
74 {
75 $number_of_paragraphs++;
76 }
77 else
78 {
79 $number_of_lines_of_text++;
80 $number_of_characters += length($line);
81 $number_of_sentences += scalar(split(/\./, $line)) - 1; # Ending ".\n"
82 # try to sanitize text to only contains 'words'
83 my $clean_line = lc($line);
84 $clean_line =~ s/[[:punct:]]//g;
85 my @words = split(/\s+/, $clean_line);
86 foreach my $word (@words)
87 {
88 $number_of_words++;
89 if (!defined($metrics->{'lexicon'}->{$word}))
90 {
91 $unique_words++;
92 $metrics->{'lexicon'}->{$word} = 0;
93 }
94 $metrics->{'lexicon'}->{$word}++;
95 }
96 }
97 }
98 close(TXTFIN);
99 # update metrics with entries for the information captured above
100 push(@{$metrics->{'characters'}}, $number_of_characters);
101 push(@{$metrics->{'lines'}}, $number_of_lines_of_text);
102 push(@{$metrics->{'paragraphs'}}, $number_of_paragraphs);
103 push(@{$metrics->{'sentences'}}, $number_of_sentences);
104 push(@{$metrics->{'unique'}}, $unique_words);
105 push(@{$metrics->{'words'}}, $number_of_words);
106}
107## extractTextMetrics() ##
108
109
110## @function
111#
112sub generateReportLine
113{
114 my ($metric, $metrics) = @_;
115 my $label = join " ", map {ucfirst} split / /, $metric;
116 my $piddle = pdl @{$metrics->{$metric}};
117 my @stats = statsover($piddle);
118 return sprintf(' %s: avg: %0.2f, min: %d, med: %d, max: %d, sdev: %0.2f, adev: %0.2f, prms: %0.2f', $label, $stats[0], $stats[3], $stats[2], $stats[4], $stats[6], $stats[5], $stats[1]);
119}
120##
121
122
123
124## @function
125#
126sub scanDirectory
127{
128 my ($dir, $metrics) = @_;
129 print ' - searching directory: ' . $dir . "\n";
130 opendir(DH, $dir);
131 my @files = readdir(DH);
132 closedir(DH);
133 foreach my $file (@files)
134 {
135 my $path = $dir . '/' . $file;
136 # skip dotted files of any type
137 if ($file =~ /^\./)
138 {
139 }
140 # recurse directories
141 elsif (-d $path)
142 {
143 &scanDirectory($path, $metrics);
144 }
145 # process TXT files
146 elsif ($file =~ /\.txt$/i)
147 {
148 &extractTextMetrics($path, $metrics);
149 $metrics->{'file_count'}++;
150 }
151 }
152}
153## scanDirectory() ##
Note: See TracBrowser for help on using the repository browser.