Context Navigation

source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/CPULoadTextPlugin.pm@ 29161

Last change on this file since 29161 was 29161, checked in by jmt12, 10 years ago
Some modules aren't availalbe on cluster... add test and include path to extensions CPAN library as necessary
File size: 9.9 KB

Line
1	package CPULoadTextPlugin;
2
3	use TextPlugin;
4
5	use strict;
6	no strict 'refs'; # allow filehandles to be variables and viceversa
7	no strict 'subs';
8
9	sub BEGIN
10	{
11	@CPULoadTextPlugin::ISA = ('TextPlugin');
12
13	eval('use Crypt::Blowfish_PP');
14	if ($@)
15	{
16	# We need the Perl version before continuing
17	if (!defined $ENV{'PERL_VERSION'})
18	{
19	$ENV{'PERL_VERSION'} = `perl -S $ENV{'GEXTPARALLELBUILDING'}/bin/script/perl-version.pl`;
20	}
21	die "PERL_VERSION not set\n" unless defined $ENV{'PERL_VERSION'};
22	# Crypt::Blowfish_PP module
23	unshift (@INC, $ENV{'GEXTPARALLELBUILDING'} . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $ENV{'PERL_VERSION'});
24	}
25	}
26
27	use Crypt::Blowfish_PP;
28	use Kea;
29	use Lingua::EN::Syllable;
30
31	our $cpu_load_list = [ { 'name' => "none",
32	'desc' => "Almost no processing - all dependent on IO" },
33	{ 'name' => "low",
34	'desc' => "A little processing - building lexicon" },
35	{ 'name' => "medium",
36	'desc' => "Some processing..." },
37	{ 'name' => "high",
38	'desc' => "Data mining and part of speech tagging" }
39	];
40
41	my $arguments = [ { 'name' => "process_exp",
42	'desc' => "{BasePlugin.process_exp}",
43	'type' => "regexp",
44	'deft' => &get_default_process_exp(),
45	'reqd' => "no" },
46	{ 'name' => "cpu_load",
47	'desc' => "",
48	'type' => "enum",
49	'deft' => "auto",
50	'list' => $cpu_load_list,
51	'reqd' => "no" },
52	{ 'name' => 'debug',
53	'desc' => '',
54	'type' => 'flag',
55	'reqd' => 'no',
56	'deft' => '0',
57	'hiddengli' => 'no'}
58	];
59
60	my $options = { 'name' => "CPULoadTextPlugin",
61	'desc' => "TextPlugin allowing for configurable amounts of CPU load",
62	'abstract' => "no",
63	'inherits' => "yes",
64	'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
65	'args' => $arguments };
66
67
68	sub get_default_process_exp
69	{
70	my $self = shift (@_);
71	return q^(?i)\.te?xt$^;
72	}
73
74	sub new
75	{
76	my ($class) = shift (@_);
77	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
78	push(@$pluginlist, $class);
79
80	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
81	push(@{$hashArgOptLists->{"OptList"}},$options);
82
83	my $self = new TextPlugin($pluginlist, $inputargs, $hashArgOptLists);
84
85	return bless $self, $class;
86	}
87
88	# do plugin specific processing of doc_obj
89	sub process
90	{
91	my $self = shift (@_);
92	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
93	my $outhandle = $self->{'outhandle'};
94
95	my $cursection = $doc_obj->get_top_section();
96
97	# get title metadata
98	# (don't need to get title if it has been passed
99	# in from another plugin)
100	if (!defined $metadata->{'Title'})
101	{
102	my $title = $self->get_title_metadata($textref);
103	$doc_obj->add_utf8_metadata ($cursection, "Title", $title);
104	}
105	# Add FileFormat metadata
106	$doc_obj->add_metadata($cursection, "FileFormat", "Text");
107
108	if ($self->{'cpu_load'} =~ /^(medium\|high)$/ )
109	{
110	$self->generateLexicon($doc_obj, $$textref);
111	$self->generateWordLengths($doc_obj, $$textref);
112	$self->{'first'} = '80,256,1024';
113	$self->generateSummaries($doc_obj, $textref);
114	$self->{'first'} = undef;
115	$self->generateComplexity($doc_obj, $textref);
116	$self->generateEncryptedText($doc_obj, $textref, 'thePassword');
117	}
118	if ($self->{'cpu_load'} eq 'high')
119	{
120	$self->generateKeywords($doc_obj, $$textref);
121	}
122
123	# insert preformat tags and add text to document object
124	$self->text_to_html($textref); # modifies the text
125	$doc_obj->add_utf8_text($cursection, $$textref);
126
127	return 1;
128	}
129
130
131	## @function
132	#
133	sub _debugPrint
134	{
135	my $self = shift(@_);
136	if ($self->{'debug'})
137	{
138	my ($msg) = @_;
139	print '[DEBUG] ' . $msg . "\n";
140	}
141	}
142	## _debugPrint() ##
143
144
145	## Functions to hopefully create some CPU load ##
146
147
148	## @function
149	#
150	sub generateComplexity
151	{
152	my $self = shift(@_);
153	my ($doc_obj, $textref) = @_;
154	my $text = $$textref;
155
156	# No of words (we start with 0.1 to prevent things being divided by zero)
157	my @words = split(/[^a-zA-Z0-9]+/, $text);
158	my $number_of_words = scalar(@words);
159
160	# No of long words, where long is 6 or more characters
161	my $number_of_long_words = 0;
162	while ($text =~ /\w{6,}/g)
163	{
164	$number_of_long_words++;
165	}
166
167	# No of syllables
168	my $number_of_syllables = 0;
169	foreach my $the_word (@words)
170	{
171	$number_of_syllables += syllable($the_word);
172	}
173
174	# no of sentences (looking for full stops...)
175	my $number_of_sentences = ($text =~ tr/\.//);
176
177	$self->_debugPrint('Number of words: ' . $number_of_words);
178	$self->_debugPrint('Number of sentences: ' . $number_of_sentences);
179	$self->_debugPrint('Number of syllables: ' . $number_of_syllables);
180
181	# Commetrics Approach
182	# A. Big Word Ratio
183	# = Total # of words / Total # of words with > 6 characters
184	# B. Word Count Score
185	# = Total # of words / Total # of sentences
186	# Score = A / B
187	my $commetrics_complexity_score = ($number_of_words / $number_of_long_words) / ($number_of_words / $number_of_sentences);
188	$self->_debugPrint('ComMetrics Complexity Score: ' . $commetrics_complexity_score);
189
190	$doc_obj->add_metadata($doc_obj->get_top_section(), 'CommetricsScore', $commetrics_complexity_score);
191
192	# Flesch Reading Ease:
193	# Calculate the average number of words you use per sentence.
194	# Calculate the average number of syllables per word.
195	# Multiply the average number of syllables per word multiplied by 84.6 and subtract it from the average number of words multiplied by 1.015.
196	# Subtract the result from 206.835.
197	# Algorithm: 206.835 - (1.015 * average_words_sentence) - (84.6 * average_syllables_word)
198	my $words_per_sentence = $number_of_words / $number_of_sentences;
199	my $syllables_per_word = $number_of_syllables / $number_of_words;
200	my $flesch_complexity_score = 206.835 - ($words_per_sentence * 1.015) - ($syllables_per_word * 84.6);
201	$self->_debugPrint('Flesch-Kincaid Complexity Score: ' . $flesch_complexity_score);
202	$doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidScore', $flesch_complexity_score);
203	my $flesch_grade = 0.38 * $words_per_sentence + 11.8 * $syllables_per_word - 15.59;
204	$self->_debugPrint('Flesch-Kincaid Grade: ' . $flesch_grade);
205	$doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidGrade', $flesch_grade);
206	}
207	## generateComplexity() ##
208
209
210	## @function
211	#
212	sub generateEncryptedText
213	{
214	my $self = shift (@_);
215	my ($doc_obj, $textref) = @_;
216
217	my $key_length = 25;
218	my $multiplier = 40;
219
220	# Split the string into chunks
221	my $text = $$textref;
222	# - ensure the length of the text is some multiple of chunk length
223	while ((length($text) % ($multiplier * $key_length)) > 0)
224	{
225	$text .= '#';
226	}
227	# - now split the text into $multiplier x $key_length bytes chunks
228	# the first key_length bytes is used as the key to encrypt the whole chunk
229	my $counter = 0;
230	while (length($text) > 0)
231	{
232	my $key = substr($text, 0, $key_length);
233	my $value = $key . substr($text, $key_length, $key_length * ($multiplier - 1));
234	my $blowfish = new Crypt::Blowfish_PP($key);
235	my $encrypted_text = $blowfish->encrypt($value);
236	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), sprintf("Encrypted%05d", $counter), $encrypted_text);
237	# - shorten text by chunk length bytes, and repeat until text is exhausted
238	$text = substr($text, $multiplier * $key_length);
239	$counter++;
240	}
241	$self->_debugPrint('Encrypted ' . $counter . ' x ' . ($multiplier * $key_length) . ' byte chunks');
242	}
243	## generateEncryptedText() ##
244
245
246	## @function
247	#
248	sub generateKeywords
249	{
250	my $self = shift (@_);
251	my ($doc_obj, $text) = @_;
252	my $key_phrases = Kea::extract_KeyPhrases('3.0', $text, 'n10');
253	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), 'Keywords', $key_phrases);
254	$self->_debugPrint('Generated keywords: "' . $key_phrases . '"');
255	}
256	## generateKeywords() ##
257
258
259	## @function
260	#
261	sub generateLexicon
262	{
263	my $self = shift (@_);
264	my ($doc_obj, $text) = @_;
265	my $raw_lexicon = {};
266	my @words = split(/[\,\.\s]+/, $text);
267	foreach my $word (@words)
268	{
269	$word = lc($word);
270	if (defined $raw_lexicon->{$word})
271	{
272	$raw_lexicon->{$word}++;
273	}
274	else
275	{
276	$raw_lexicon->{$word} = 1;
277	}
278	}
279	my @lexicon;
280	foreach my $word (sort keys %{$raw_lexicon})
281	{
282	push(@lexicon, $word . ':' . $raw_lexicon->{$word});
283	}
284	$doc_obj->add_metadata($doc_obj->get_top_section(), "Lexicon", join(', ', @lexicon));
285	$self->_debugPrint('Generated lexicon');
286	}
287	## generateLexicon() ##
288
289
290	## @function
291	# extract the first NNN characters as metadata
292	sub generateSummaries
293	{
294	my $self = shift (@_);
295	my ($doc_obj, $textref) = @_;
296
297	foreach my $size (split /,/, $self->{'first'})
298	{
299	my $tmptext = $$textref;
300	$tmptext =~ s/^\s+//;
301	$tmptext =~ s/\s+$//;
302	$tmptext =~ s/\s+/ /gs;
303	$tmptext = substr ($tmptext, 0, $size);
304	$tmptext =~ s/\s\S*$/…/;
305	$doc_obj->add_utf8_metadata ($doc_obj->get_top_section(), 'First' . $size, $tmptext);
306	$self->_debugPrint('Generated summary of ' . $size . ' characters');
307	}
308	}
309
310
311	## @function
312	#
313	sub generateWordLengths
314	{
315	my $self = shift (@_);
316	my ($doc_obj, $text) = @_;
317	my $raw_word_lengths = {};
318	my @words = split('/[\,\.\s]+/', $text);
319	foreach my $word (@words)
320	{
321	$word = lc($word);
322	my $length = length($word);
323	if (defined $raw_word_lengths->{$length})
324	{
325	$raw_word_lengths->{$length} = 1;
326	}
327	else
328	{
329	$raw_word_lengths->{$length}++;
330	}
331	}
332	my @word_lengths;
333	foreach my $word_length (sort keys %{$raw_word_lengths})
334	{
335	push(@word_lengths, $word_length . ':' . $raw_word_lengths->{$word_length});
336	}
337	$doc_obj->add_metadata($doc_obj->get_top_section(), "WordLengths", join(', ', @word_lengths));
338	$self->_debugPrint('Generated word length information');
339	}
340	## generateWordLengths() ##
341
342
343	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: