Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

text_metricinator.pl@ 29104

Last change on this file since 29104 was 29104, checked in by jmt12, 10 years ago
A script for extracting textual metrics from a collection of text files ala lorem
Property svn:executable set to ``*
File size: 4.1 KB

Line
1	#!/usr/bin/perl
2
3	use PDL;
4	use strict;
5	use warnings;
6
7	my $metrics = {'file_count' => 0,
8	't_lines' => 0,
9	'lines' => [],
10	'paragraphs' => [],
11	'characters' => [],
12	'sentences' => [],
13	'words' => [],
14	'wordlength' => [],
15	'unique' => [],
16	'lexicon' => {},
17	};
18
19	print "====== 'Text Metric'inator ======\n";
20	print "Reports on the text metrics of a collection of TXT files while also attempting\n";
21	print "to take over the tri-state area.\n\n";
22	if (!defined $ARGV[0] \|\| !-d $ARGV[0])
23	{
24	print "Usage: text_metricinator.pl <collect directory>\n";
25	}
26	else
27	{
28	print " * Calculating text metrics from collection...\n";
29	&scanDirectory($ARGV[0], $metrics);
30	print " Done\n\n";
31
32	foreach my $word (keys %{$metrics->{'lexicon'}})
33	{
34	push(@{$metrics->{'word length'}}, length($word));
35	}
36
37	print "===== Report =====\n";
38	print ' File count: ' . $metrics->{'file_count'} . "\n";
39	foreach my $metric ( sort ('characters','lines','words','sentences','unique','word length') )
40	{
41	print &generateReportLine($metric, $metrics) . "\n";
42	}
43	print "Complete!\n\n";
44	}
45	exit;
46
47
48	## @function
49	# Given the path to a text file, calculates various text metrics and adds them
50	# into the ever-growing metric data structure. Metrics include:
51	# number of lines of text (easy)
52	# number of paragraphs (lines with no text...)
53	# number of sentences
54	# number of characters
55	# number of words
56	# max word length
57	# number of unique words encountered this document
58	sub extractTextMetrics
59	{
60	my ($txt_path, $metrics) = @_;
61	print ' - extracting metrics from: ' . $txt_path . "\n";
62	my $number_of_lines_of_text = 0;
63	my $number_of_characters = 0;
64	my $number_of_sentences = 0;
65	my $number_of_words = 0;
66	my $number_of_paragraphs = 0;
67	my $word_lengths = 0;
68	my $unique_words = 0;
69	open(TXTFIN, '<:utf8', $txt_path) or die('Failed to open file for reading: ' . $txt_path);
70	my $line = '';
71	while ($line = <TXTFIN>)
72	{
73	if ($line =~ /^\s*$/)
74	{
75	$number_of_paragraphs++;
76	}
77	else
78	{
79	$number_of_lines_of_text++;
80	$number_of_characters += length($line);
81	$number_of_sentences += scalar(split(/\./, $line)) - 1; # Ending ".\n"
82	# try to sanitize text to only contains 'words'
83	my $clean_line = lc($line);
84	$clean_line =~ s/[[:punct:]]//g;
85	my @words = split(/\s+/, $clean_line);
86	foreach my $word (@words)
87	{
88	$number_of_words++;
89	if (!defined($metrics->{'lexicon'}->{$word}))
90	{
91	$unique_words++;
92	$metrics->{'lexicon'}->{$word} = 0;
93	}
94	$metrics->{'lexicon'}->{$word}++;
95	}
96	}
97	}
98	close(TXTFIN);
99	# update metrics with entries for the information captured above
100	push(@{$metrics->{'characters'}}, $number_of_characters);
101	push(@{$metrics->{'lines'}}, $number_of_lines_of_text);
102	push(@{$metrics->{'paragraphs'}}, $number_of_paragraphs);
103	push(@{$metrics->{'sentences'}}, $number_of_sentences);
104	push(@{$metrics->{'unique'}}, $unique_words);
105	push(@{$metrics->{'words'}}, $number_of_words);
106	}
107	## extractTextMetrics() ##
108
109
110	## @function
111	#
112	sub generateReportLine
113	{
114	my ($metric, $metrics) = @_;
115	my $label = join " ", map {ucfirst} split / /, $metric;
116	my $piddle = pdl @{$metrics->{$metric}};
117	my @stats = statsover($piddle);
118	return sprintf(' %s: avg: %0.2f, min: %d, med: %d, max: %d, sdev: %0.2f, adev: %0.2f, prms: %0.2f', $label, $stats[0], $stats[3], $stats[2], $stats[4], $stats[6], $stats[5], $stats[1]);
119	}
120	##
121
122
123
124	## @function
125	#
126	sub scanDirectory
127	{
128	my ($dir, $metrics) = @_;
129	print ' - searching directory: ' . $dir . "\n";
130	opendir(DH, $dir);
131	my @files = readdir(DH);
132	closedir(DH);
133	foreach my $file (@files)
134	{
135	my $path = $dir . '/' . $file;
136	# skip dotted files of any type
137	if ($file =~ /^\./)
138	{
139	}
140	# recurse directories
141	elsif (-d $path)
142	{
143	&scanDirectory($path, $metrics);
144	}
145	# process TXT files
146	elsif ($file =~ /\.txt$/i)
147	{
148	&extractTextMetrics($path, $metrics);
149	$metrics->{'file_count'}++;
150	}
151	}
152	}
153	## scanDirectory() ##

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/parallel-building/trunk/src/bin/script/text_metricinator.pl@ 29104

Download in other formats: