source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/CPULoadTextPlugin.pm@ 29161

Last change on this file since 29161 was 29161, checked in by jmt12, 10 years ago

Some modules aren't availalbe on cluster... add test and include path to extensions CPAN library as necessary

File size: 9.9 KB
Line 
1package CPULoadTextPlugin;
2
3use TextPlugin;
4
5use strict;
6no strict 'refs'; # allow filehandles to be variables and viceversa
7no strict 'subs';
8
9sub BEGIN
10{
11 @CPULoadTextPlugin::ISA = ('TextPlugin');
12
13 eval('use Crypt::Blowfish_PP');
14 if ($@)
15 {
16 # We need the Perl version before continuing
17 if (!defined $ENV{'PERL_VERSION'})
18 {
19 $ENV{'PERL_VERSION'} = `perl -S $ENV{'GEXTPARALLELBUILDING'}/bin/script/perl-version.pl`;
20 }
21 die "PERL_VERSION not set\n" unless defined $ENV{'PERL_VERSION'};
22 # Crypt::Blowfish_PP module
23 unshift (@INC, $ENV{'GEXTPARALLELBUILDING'} . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $ENV{'PERL_VERSION'});
24 }
25}
26
27use Crypt::Blowfish_PP;
28use Kea;
29use Lingua::EN::Syllable;
30
31our $cpu_load_list = [ { 'name' => "none",
32 'desc' => "Almost no processing - all dependent on IO" },
33 { 'name' => "low",
34 'desc' => "A little processing - building lexicon" },
35 { 'name' => "medium",
36 'desc' => "Some processing..." },
37 { 'name' => "high",
38 'desc' => "Data mining and part of speech tagging" }
39 ];
40
41my $arguments = [ { 'name' => "process_exp",
42 'desc' => "{BasePlugin.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "cpu_load",
47 'desc' => "",
48 'type' => "enum",
49 'deft' => "auto",
50 'list' => $cpu_load_list,
51 'reqd' => "no" },
52 { 'name' => 'debug',
53 'desc' => '',
54 'type' => 'flag',
55 'reqd' => 'no',
56 'deft' => '0',
57 'hiddengli' => 'no'}
58 ];
59
60my $options = { 'name' => "CPULoadTextPlugin",
61 'desc' => "TextPlugin allowing for configurable amounts of CPU load",
62 'abstract' => "no",
63 'inherits' => "yes",
64 'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
65 'args' => $arguments };
66
67
68sub get_default_process_exp
69{
70 my $self = shift (@_);
71 return q^(?i)\.te?xt$^;
72}
73
74sub new
75{
76 my ($class) = shift (@_);
77 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
78 push(@$pluginlist, $class);
79
80 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
81 push(@{$hashArgOptLists->{"OptList"}},$options);
82
83 my $self = new TextPlugin($pluginlist, $inputargs, $hashArgOptLists);
84
85 return bless $self, $class;
86}
87
88# do plugin specific processing of doc_obj
89sub process
90{
91 my $self = shift (@_);
92 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
93 my $outhandle = $self->{'outhandle'};
94
95 my $cursection = $doc_obj->get_top_section();
96
97 # get title metadata
98 # (don't need to get title if it has been passed
99 # in from another plugin)
100 if (!defined $metadata->{'Title'})
101 {
102 my $title = $self->get_title_metadata($textref);
103 $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
104 }
105 # Add FileFormat metadata
106 $doc_obj->add_metadata($cursection, "FileFormat", "Text");
107
108 if ($self->{'cpu_load'} =~ /^(medium|high)$/ )
109 {
110 $self->generateLexicon($doc_obj, $$textref);
111 $self->generateWordLengths($doc_obj, $$textref);
112 $self->{'first'} = '80,256,1024';
113 $self->generateSummaries($doc_obj, $textref);
114 $self->{'first'} = undef;
115 $self->generateComplexity($doc_obj, $textref);
116 $self->generateEncryptedText($doc_obj, $textref, 'thePassword');
117 }
118 if ($self->{'cpu_load'} eq 'high')
119 {
120 $self->generateKeywords($doc_obj, $$textref);
121 }
122
123 # insert preformat tags and add text to document object
124 $self->text_to_html($textref); # modifies the text
125 $doc_obj->add_utf8_text($cursection, $$textref);
126
127 return 1;
128}
129
130
131## @function
132#
133sub _debugPrint
134{
135 my $self = shift(@_);
136 if ($self->{'debug'})
137 {
138 my ($msg) = @_;
139 print '[DEBUG] ' . $msg . "\n";
140 }
141}
142## _debugPrint() ##
143
144
145## Functions to hopefully create some CPU load ##
146
147
148## @function
149#
150sub generateComplexity
151{
152 my $self = shift(@_);
153 my ($doc_obj, $textref) = @_;
154 my $text = $$textref;
155
156 # No of words (we start with 0.1 to prevent things being divided by zero)
157 my @words = split(/[^a-zA-Z0-9]+/, $text);
158 my $number_of_words = scalar(@words);
159
160 # No of long words, where long is 6 or more characters
161 my $number_of_long_words = 0;
162 while ($text =~ /\w{6,}/g)
163 {
164 $number_of_long_words++;
165 }
166
167 # No of syllables
168 my $number_of_syllables = 0;
169 foreach my $the_word (@words)
170 {
171 $number_of_syllables += syllable($the_word);
172 }
173
174 # no of sentences (looking for full stops...)
175 my $number_of_sentences = ($text =~ tr/\.//);
176
177 $self->_debugPrint('Number of words: ' . $number_of_words);
178 $self->_debugPrint('Number of sentences: ' . $number_of_sentences);
179 $self->_debugPrint('Number of syllables: ' . $number_of_syllables);
180
181 # Commetrics Approach
182 # A. Big Word Ratio
183 # = Total # of words / Total # of words with > 6 characters
184 # B. Word Count Score
185 # = Total # of words / Total # of sentences
186 # Score = A / B
187 my $commetrics_complexity_score = ($number_of_words / $number_of_long_words) / ($number_of_words / $number_of_sentences);
188 $self->_debugPrint('ComMetrics Complexity Score: ' . $commetrics_complexity_score);
189
190 $doc_obj->add_metadata($doc_obj->get_top_section(), 'CommetricsScore', $commetrics_complexity_score);
191
192 # Flesch Reading Ease:
193 # Calculate the average number of words you use per sentence.
194 # Calculate the average number of syllables per word.
195 # Multiply the average number of syllables per word multiplied by 84.6 and subtract it from the average number of words multiplied by 1.015.
196 # Subtract the result from 206.835.
197 # Algorithm: 206.835 - (1.015 * average_words_sentence) - (84.6 * average_syllables_word)
198 my $words_per_sentence = $number_of_words / $number_of_sentences;
199 my $syllables_per_word = $number_of_syllables / $number_of_words;
200 my $flesch_complexity_score = 206.835 - ($words_per_sentence * 1.015) - ($syllables_per_word * 84.6);
201 $self->_debugPrint('Flesch-Kincaid Complexity Score: ' . $flesch_complexity_score);
202 $doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidScore', $flesch_complexity_score);
203 my $flesch_grade = 0.38 * $words_per_sentence + 11.8 * $syllables_per_word - 15.59;
204 $self->_debugPrint('Flesch-Kincaid Grade: ' . $flesch_grade);
205 $doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidGrade', $flesch_grade);
206}
207## generateComplexity() ##
208
209
210## @function
211#
212sub generateEncryptedText
213{
214 my $self = shift (@_);
215 my ($doc_obj, $textref) = @_;
216
217 my $key_length = 25;
218 my $multiplier = 40;
219
220 # Split the string into chunks
221 my $text = $$textref;
222 # - ensure the length of the text is some multiple of chunk length
223 while ((length($text) % ($multiplier * $key_length)) > 0)
224 {
225 $text .= '#';
226 }
227 # - now split the text into $multiplier x $key_length bytes chunks
228 # the first key_length bytes is used as the key to encrypt the whole chunk
229 my $counter = 0;
230 while (length($text) > 0)
231 {
232 my $key = substr($text, 0, $key_length);
233 my $value = $key . substr($text, $key_length, $key_length * ($multiplier - 1));
234 my $blowfish = new Crypt::Blowfish_PP($key);
235 my $encrypted_text = $blowfish->encrypt($value);
236 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), sprintf("Encrypted%05d", $counter), $encrypted_text);
237 # - shorten text by chunk length bytes, and repeat until text is exhausted
238 $text = substr($text, $multiplier * $key_length);
239 $counter++;
240 }
241 $self->_debugPrint('Encrypted ' . $counter . ' x ' . ($multiplier * $key_length) . ' byte chunks');
242}
243## generateEncryptedText() ##
244
245
246## @function
247#
248sub generateKeywords
249{
250 my $self = shift (@_);
251 my ($doc_obj, $text) = @_;
252 my $key_phrases = Kea::extract_KeyPhrases('3.0', $text, 'n10');
253 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), 'Keywords', $key_phrases);
254 $self->_debugPrint('Generated keywords: "' . $key_phrases . '"');
255}
256## generateKeywords() ##
257
258
259## @function
260#
261sub generateLexicon
262{
263 my $self = shift (@_);
264 my ($doc_obj, $text) = @_;
265 my $raw_lexicon = {};
266 my @words = split(/[\,\.\s]+/, $text);
267 foreach my $word (@words)
268 {
269 $word = lc($word);
270 if (defined $raw_lexicon->{$word})
271 {
272 $raw_lexicon->{$word}++;
273 }
274 else
275 {
276 $raw_lexicon->{$word} = 1;
277 }
278 }
279 my @lexicon;
280 foreach my $word (sort keys %{$raw_lexicon})
281 {
282 push(@lexicon, $word . ':' . $raw_lexicon->{$word});
283 }
284 $doc_obj->add_metadata($doc_obj->get_top_section(), "Lexicon", join(', ', @lexicon));
285 $self->_debugPrint('Generated lexicon');
286}
287## generateLexicon() ##
288
289
290## @function
291# extract the first NNN characters as metadata
292sub generateSummaries
293{
294 my $self = shift (@_);
295 my ($doc_obj, $textref) = @_;
296
297 foreach my $size (split /,/, $self->{'first'})
298 {
299 my $tmptext = $$textref;
300 $tmptext =~ s/^\s+//;
301 $tmptext =~ s/\s+$//;
302 $tmptext =~ s/\s+/ /gs;
303 $tmptext = substr ($tmptext, 0, $size);
304 $tmptext =~ s/\s\S*$/…/;
305 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(), 'First' . $size, $tmptext);
306 $self->_debugPrint('Generated summary of ' . $size . ' characters');
307 }
308}
309
310
311## @function
312#
313sub generateWordLengths
314{
315 my $self = shift (@_);
316 my ($doc_obj, $text) = @_;
317 my $raw_word_lengths = {};
318 my @words = split('/[\,\.\s]+/', $text);
319 foreach my $word (@words)
320 {
321 $word = lc($word);
322 my $length = length($word);
323 if (defined $raw_word_lengths->{$length})
324 {
325 $raw_word_lengths->{$length} = 1;
326 }
327 else
328 {
329 $raw_word_lengths->{$length}++;
330 }
331 }
332 my @word_lengths;
333 foreach my $word_length (sort keys %{$raw_word_lengths})
334 {
335 push(@word_lengths, $word_length . ':' . $raw_word_lengths->{$word_length});
336 }
337 $doc_obj->add_metadata($doc_obj->get_top_section(), "WordLengths", join(', ', @word_lengths));
338 $self->_debugPrint('Generated word length information');
339}
340## generateWordLengths() ##
341
342
3431;
Note: See TracBrowser for help on using the repository browser.