source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/CPULoadTextPlugin.pm@ 29261

Last change on this file since 29261 was 29261, checked in by jmt12, 10 years ago

Removing some of the extraneous IO from high cpu importing... altering the ratio between CPU and IO load appropriately

File size: 10.1 KB
Line 
1package CPULoadTextPlugin;
2
3use TextPlugin;
4
5use strict;
6no strict 'refs'; # allow filehandles to be variables and viceversa
7no strict 'subs';
8
9sub BEGIN
10{
11 @CPULoadTextPlugin::ISA = ('TextPlugin');
12
13 eval('use Crypt::Blowfish_PP');
14 if ($@)
15 {
16 # We need the Perl version before continuing
17 if (!defined $ENV{'PERL_VERSION'})
18 {
19 $ENV{'PERL_VERSION'} = `perl -S $ENV{'GEXTPARALLELBUILDING'}/bin/script/perl-version.pl`;
20 }
21 die "PERL_VERSION not set\n" unless defined $ENV{'PERL_VERSION'};
22 # Crypt::Blowfish_PP module
23 unshift (@INC, $ENV{'GEXTPARALLELBUILDING'} . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $ENV{'PERL_VERSION'});
24 }
25}
26
27use Crypt::Blowfish_PP;
28use Kea;
29use Lingua::EN::Syllable;
30
31our $cpu_load_list = [ { 'name' => "none",
32 'desc' => "Almost no processing - all dependent on IO" },
33 { 'name' => "low",
34 'desc' => "A little processing - building lexicon" },
35 { 'name' => "medium",
36 'desc' => "Some processing..." },
37 { 'name' => "high",
38 'desc' => "Data mining and part of speech tagging" }
39 ];
40
41my $arguments = [ { 'name' => "process_exp",
42 'desc' => "{BasePlugin.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "cpu_load",
47 'desc' => "",
48 'type' => "enum",
49 'deft' => "auto",
50 'list' => $cpu_load_list,
51 'reqd' => "no" },
52 { 'name' => 'debug',
53 'desc' => '',
54 'type' => 'flag',
55 'reqd' => 'no',
56 'deft' => '0',
57 'hiddengli' => 'no'}
58 ];
59
60my $options = { 'name' => "CPULoadTextPlugin",
61 'desc' => "TextPlugin allowing for configurable amounts of CPU load",
62 'abstract' => "no",
63 'inherits' => "yes",
64 'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
65 'args' => $arguments };
66
67
68sub get_default_process_exp
69{
70 my $self = shift (@_);
71 return q^(?i)\.te?xt$^;
72}
73
74sub new
75{
76 my ($class) = shift (@_);
77 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
78 push(@$pluginlist, $class);
79
80 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
81 push(@{$hashArgOptLists->{"OptList"}},$options);
82
83 my $self = new TextPlugin($pluginlist, $inputargs, $hashArgOptLists);
84
85 return bless $self, $class;
86}
87
88# do plugin specific processing of doc_obj
89sub process
90{
91 my $self = shift (@_);
92 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
93 my $outhandle = $self->{'outhandle'};
94
95 my $cursection = $doc_obj->get_top_section();
96
97 # get title metadata
98 # (don't need to get title if it has been passed
99 # in from another plugin)
100 if (!defined $metadata->{'Title'})
101 {
102 my $title = $self->get_title_metadata($textref);
103 $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
104 }
105 # Add FileFormat metadata
106 $doc_obj->add_metadata($cursection, "FileFormat", "Text");
107
108 if ($self->{'cpu_load'} =~ /^(medium|high)$/ )
109 {
110 $self->generateLexicon($doc_obj, $$textref);
111 $self->generateWordLengths($doc_obj, $$textref);
112 $self->{'first'} = '80,256,1024';
113 $self->generateSummaries($doc_obj, $textref);
114 $self->{'first'} = undef;
115 $self->generateComplexity($doc_obj, $textref);
116 $self->generateEncryptedText($doc_obj, $textref, 'thePassword');
117 }
118 if ($self->{'cpu_load'} eq 'high')
119 {
120 $self->generateKeywords($doc_obj, $$textref);
121 }
122
123 # insert preformat tags and add text to document object
124 $self->text_to_html($textref); # modifies the text
125 $doc_obj->add_utf8_text($cursection, $$textref);
126
127 return 1;
128}
129
130
131## @function
132#
133sub _debugPrint
134{
135 my $self = shift(@_);
136 if ($self->{'debug'})
137 {
138 my ($msg) = @_;
139 print '[DEBUG] ' . $msg . "\n";
140 }
141}
142## _debugPrint() ##
143
144
145## Functions to hopefully create some CPU load ##
146
147
148## @function
149#
150sub generateComplexity
151{
152 my $self = shift(@_);
153 my ($doc_obj, $textref) = @_;
154 my $text = $$textref;
155
156 # No of words (we start with 0.1 to prevent things being divided by zero)
157 my @words = split(/[^a-zA-Z0-9]+/, $text);
158 my $number_of_words = scalar(@words);
159
160 # No of long words, where long is 6 or more characters
161 my $number_of_long_words = 0;
162 while ($text =~ /\w{6,}/g)
163 {
164 $number_of_long_words++;
165 }
166
167 # No of syllables
168 my $number_of_syllables = 0;
169 foreach my $the_word (@words)
170 {
171 $number_of_syllables += syllable($the_word);
172 }
173
174 # no of sentences (looking for full stops...)
175 my $number_of_sentences = ($text =~ tr/\.//);
176
177 $self->_debugPrint('Number of words: ' . $number_of_words);
178 $self->_debugPrint('Number of sentences: ' . $number_of_sentences);
179 $self->_debugPrint('Number of syllables: ' . $number_of_syllables);
180
181 # Commetrics Approach
182 # A. Big Word Ratio
183 # = Total # of words / Total # of words with > 6 characters
184 # B. Word Count Score
185 # = Total # of words / Total # of sentences
186 # Score = A / B
187 my $commetrics_complexity_score = ($number_of_words / $number_of_long_words) / ($number_of_words / $number_of_sentences);
188 $self->_debugPrint('ComMetrics Complexity Score: ' . $commetrics_complexity_score);
189
190 $doc_obj->add_metadata($doc_obj->get_top_section(), 'CommetricsScore', $commetrics_complexity_score);
191
192 # Flesch Reading Ease:
193 # Calculate the average number of words you use per sentence.
194 # Calculate the average number of syllables per word.
195 # Multiply the average number of syllables per word multiplied by 84.6 and subtract it from the average number of words multiplied by 1.015.
196 # Subtract the result from 206.835.
197 # Algorithm: 206.835 - (1.015 * average_words_sentence) - (84.6 * average_syllables_word)
198 my $words_per_sentence = $number_of_words / $number_of_sentences;
199 my $syllables_per_word = $number_of_syllables / $number_of_words;
200 my $flesch_complexity_score = 206.835 - ($words_per_sentence * 1.015) - ($syllables_per_word * 84.6);
201 $self->_debugPrint('Flesch-Kincaid Complexity Score: ' . $flesch_complexity_score);
202 $doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidScore', $flesch_complexity_score);
203 my $flesch_grade = 0.38 * $words_per_sentence + 11.8 * $syllables_per_word - 15.59;
204 $self->_debugPrint('Flesch-Kincaid Grade: ' . $flesch_grade);
205 $doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidGrade', $flesch_grade);
206}
207## generateComplexity() ##
208
209
210## @function
211#
212sub generateEncryptedText
213{
214 my $self = shift (@_);
215 my ($doc_obj, $textref) = @_;
216
217 my $key_length = 25;
218 my $multiplier = 40;
219
220 # Split the string into chunks
221 my $text = $$textref;
222 # - ensure the length of the text is some multiple of chunk length
223 while ((length($text) % ($multiplier * $key_length)) > 0)
224 {
225 $text .= '#';
226 }
227 # - now split the text into $multiplier x $key_length bytes chunks
228 # the first key_length bytes is used as the key to encrypt the whole chunk
229 my $counter = 0;
230 while (length($text) > 0)
231 {
232 my $key = substr($text, 0, $key_length);
233 my $value = $key . substr($text, $key_length, $key_length * ($multiplier - 1));
234 my $blowfish = new Crypt::Blowfish_PP($key);
235 my $encrypted_text = $blowfish->encrypt($value);
236 if ($self->{'cpu_load'} ne 'high')
237 {
238 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), sprintf("Encrypted%05d", $counter), $encrypted_text);
239 }
240 # - shorten text by chunk length bytes, and repeat until text is exhausted
241 $text = substr($text, $multiplier * $key_length);
242 $counter++;
243 }
244 $self->_debugPrint('Encrypted ' . $counter . ' x ' . ($multiplier * $key_length) . ' byte chunks');
245}
246## generateEncryptedText() ##
247
248
249## @function
250#
251sub generateKeywords
252{
253 my $self = shift (@_);
254 my ($doc_obj, $text) = @_;
255 my $key_phrases = Kea::extract_KeyPhrases('3.0', $text, 'n10');
256 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), 'Keywords', $key_phrases);
257 $self->_debugPrint('Generated keywords: "' . $key_phrases . '"');
258}
259## generateKeywords() ##
260
261
262## @function
263#
264sub generateLexicon
265{
266 my $self = shift (@_);
267 my ($doc_obj, $text) = @_;
268 my $raw_lexicon = {};
269 my @words = split(/[\,\.\s]+/, $text);
270 foreach my $word (@words)
271 {
272 $word = lc($word);
273 if (defined $raw_lexicon->{$word})
274 {
275 $raw_lexicon->{$word}++;
276 }
277 else
278 {
279 $raw_lexicon->{$word} = 1;
280 }
281 }
282 my @lexicon;
283 foreach my $word (sort keys %{$raw_lexicon})
284 {
285 push(@lexicon, $word . ':' . $raw_lexicon->{$word});
286 }
287 if ($self->{'cpu_load'} ne 'high')
288 {
289 $doc_obj->add_metadata($doc_obj->get_top_section(), "Lexicon", join(', ', @lexicon));
290 }
291 $self->_debugPrint('Generated lexicon');
292}
293## generateLexicon() ##
294
295
296## @function
297# extract the first NNN characters as metadata
298sub generateSummaries
299{
300 my $self = shift (@_);
301 my ($doc_obj, $textref) = @_;
302
303 foreach my $size (split /,/, $self->{'first'})
304 {
305 my $tmptext = $$textref;
306 $tmptext =~ s/^\s+//;
307 $tmptext =~ s/\s+$//;
308 $tmptext =~ s/\s+/ /gs;
309 $tmptext = substr ($tmptext, 0, $size);
310 $tmptext =~ s/\s\S*$/…/;
311 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(), 'First' . $size, $tmptext);
312 $self->_debugPrint('Generated summary of ' . $size . ' characters');
313 }
314}
315
316
317## @function
318#
319sub generateWordLengths
320{
321 my $self = shift (@_);
322 my ($doc_obj, $text) = @_;
323 my $raw_word_lengths = {};
324 my @words = split('/[\,\.\s]+/', $text);
325 foreach my $word (@words)
326 {
327 $word = lc($word);
328 my $length = length($word);
329 if (defined $raw_word_lengths->{$length})
330 {
331 $raw_word_lengths->{$length} = 1;
332 }
333 else
334 {
335 $raw_word_lengths->{$length}++;
336 }
337 }
338 my @word_lengths;
339 foreach my $word_length (sort keys %{$raw_word_lengths})
340 {
341 push(@word_lengths, $word_length . ':' . $raw_word_lengths->{$word_length});
342 }
343 if ($self->{'cpu_load'} ne 'high')
344 {
345 $doc_obj->add_metadata($doc_obj->get_top_section(), "WordLengths", join(', ', @word_lengths));
346 }
347 $self->_debugPrint('Generated word length information');
348}
349## generateWordLengths() ##
350
351
3521;
Note: See TracBrowser for help on using the repository browser.