1 | #!/usr/bin/perl
|
---|
2 |
|
---|
3 | use PDL;
|
---|
4 | use strict;
|
---|
5 | use warnings;
|
---|
6 |
|
---|
7 | my $metrics = {'file_count' => 0,
|
---|
8 | 't_lines' => 0,
|
---|
9 | 'lines' => [],
|
---|
10 | 'paragraphs' => [],
|
---|
11 | 'characters' => [],
|
---|
12 | 'sentences' => [],
|
---|
13 | 'words' => [],
|
---|
14 | 'wordlength' => [],
|
---|
15 | 'unique' => [],
|
---|
16 | 'lexicon' => {},
|
---|
17 | };
|
---|
18 |
|
---|
19 | print "====== 'Text Metric'inator ======\n";
|
---|
20 | print "Reports on the text metrics of a collection of TXT files while also attempting\n";
|
---|
21 | print "to take over the tri-state area.\n\n";
|
---|
22 | if (!defined $ARGV[0] || !-d $ARGV[0])
|
---|
23 | {
|
---|
24 | print "Usage: text_metricinator.pl <collect directory>\n";
|
---|
25 | }
|
---|
26 | else
|
---|
27 | {
|
---|
28 | print " * Calculating text metrics from collection...\n";
|
---|
29 | &scanDirectory($ARGV[0], $metrics);
|
---|
30 | print " Done\n\n";
|
---|
31 |
|
---|
32 | foreach my $word (keys %{$metrics->{'lexicon'}})
|
---|
33 | {
|
---|
34 | push(@{$metrics->{'word length'}}, length($word));
|
---|
35 | }
|
---|
36 |
|
---|
37 | print "===== Report =====\n";
|
---|
38 | print ' File count: ' . $metrics->{'file_count'} . "\n";
|
---|
39 | foreach my $metric ( sort ('characters','lines','words','sentences','unique','word length') )
|
---|
40 | {
|
---|
41 | print &generateReportLine($metric, $metrics) . "\n";
|
---|
42 | }
|
---|
43 | print "Complete!\n\n";
|
---|
44 | }
|
---|
45 | exit;
|
---|
46 |
|
---|
47 |
|
---|
48 | ## @function
|
---|
49 | # Given the path to a text file, calculates various text metrics and adds them
|
---|
50 | # into the ever-growing metric data structure. Metrics include:
|
---|
51 | # number of lines of text (easy)
|
---|
52 | # number of paragraphs (lines with no text...)
|
---|
53 | # number of sentences
|
---|
54 | # number of characters
|
---|
55 | # number of words
|
---|
56 | # max word length
|
---|
57 | # number of unique words encountered this document
|
---|
58 | sub extractTextMetrics
|
---|
59 | {
|
---|
60 | my ($txt_path, $metrics) = @_;
|
---|
61 | print ' - extracting metrics from: ' . $txt_path . "\n";
|
---|
62 | my $number_of_lines_of_text = 0;
|
---|
63 | my $number_of_characters = 0;
|
---|
64 | my $number_of_sentences = 0;
|
---|
65 | my $number_of_words = 0;
|
---|
66 | my $number_of_paragraphs = 0;
|
---|
67 | my $word_lengths = 0;
|
---|
68 | my $unique_words = 0;
|
---|
69 | open(TXTFIN, '<:utf8', $txt_path) or die('Failed to open file for reading: ' . $txt_path);
|
---|
70 | my $line = '';
|
---|
71 | while ($line = <TXTFIN>)
|
---|
72 | {
|
---|
73 | if ($line =~ /^\s*$/)
|
---|
74 | {
|
---|
75 | $number_of_paragraphs++;
|
---|
76 | }
|
---|
77 | else
|
---|
78 | {
|
---|
79 | $number_of_lines_of_text++;
|
---|
80 | $number_of_characters += length($line);
|
---|
81 | $number_of_sentences += scalar(split(/\./, $line)) - 1; # Ending ".\n"
|
---|
82 | # try to sanitize text to only contains 'words'
|
---|
83 | my $clean_line = lc($line);
|
---|
84 | $clean_line =~ s/[[:punct:]]//g;
|
---|
85 | my @words = split(/\s+/, $clean_line);
|
---|
86 | foreach my $word (@words)
|
---|
87 | {
|
---|
88 | $number_of_words++;
|
---|
89 | if (!defined($metrics->{'lexicon'}->{$word}))
|
---|
90 | {
|
---|
91 | $unique_words++;
|
---|
92 | $metrics->{'lexicon'}->{$word} = 0;
|
---|
93 | }
|
---|
94 | $metrics->{'lexicon'}->{$word}++;
|
---|
95 | }
|
---|
96 | }
|
---|
97 | }
|
---|
98 | close(TXTFIN);
|
---|
99 | # update metrics with entries for the information captured above
|
---|
100 | push(@{$metrics->{'characters'}}, $number_of_characters);
|
---|
101 | push(@{$metrics->{'lines'}}, $number_of_lines_of_text);
|
---|
102 | push(@{$metrics->{'paragraphs'}}, $number_of_paragraphs);
|
---|
103 | push(@{$metrics->{'sentences'}}, $number_of_sentences);
|
---|
104 | push(@{$metrics->{'unique'}}, $unique_words);
|
---|
105 | push(@{$metrics->{'words'}}, $number_of_words);
|
---|
106 | }
|
---|
107 | ## extractTextMetrics() ##
|
---|
108 |
|
---|
109 |
|
---|
110 | ## @function
|
---|
111 | #
|
---|
112 | sub generateReportLine
|
---|
113 | {
|
---|
114 | my ($metric, $metrics) = @_;
|
---|
115 | my $label = join " ", map {ucfirst} split / /, $metric;
|
---|
116 | my $piddle = pdl @{$metrics->{$metric}};
|
---|
117 | my @stats = statsover($piddle);
|
---|
118 | return sprintf(' %s: avg: %0.2f, min: %d, med: %d, max: %d, sdev: %0.2f, adev: %0.2f, prms: %0.2f', $label, $stats[0], $stats[3], $stats[2], $stats[4], $stats[6], $stats[5], $stats[1]);
|
---|
119 | }
|
---|
120 | ##
|
---|
121 |
|
---|
122 |
|
---|
123 |
|
---|
124 | ## @function
|
---|
125 | #
|
---|
126 | sub scanDirectory
|
---|
127 | {
|
---|
128 | my ($dir, $metrics) = @_;
|
---|
129 | print ' - searching directory: ' . $dir . "\n";
|
---|
130 | opendir(DH, $dir);
|
---|
131 | my @files = readdir(DH);
|
---|
132 | closedir(DH);
|
---|
133 | foreach my $file (@files)
|
---|
134 | {
|
---|
135 | my $path = $dir . '/' . $file;
|
---|
136 | # skip dotted files of any type
|
---|
137 | if ($file =~ /^\./)
|
---|
138 | {
|
---|
139 | }
|
---|
140 | # recurse directories
|
---|
141 | elsif (-d $path)
|
---|
142 | {
|
---|
143 | &scanDirectory($path, $metrics);
|
---|
144 | }
|
---|
145 | # process TXT files
|
---|
146 | elsif ($file =~ /\.txt$/i)
|
---|
147 | {
|
---|
148 | &extractTextMetrics($path, $metrics);
|
---|
149 | $metrics->{'file_count'}++;
|
---|
150 | }
|
---|
151 | }
|
---|
152 | }
|
---|
153 | ## scanDirectory() ##
|
---|