source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/CPULoadTextPlugin.pm@ 28649

Last change on this file since 28649 was 28649, checked in by jmt12, 10 years ago

A version of the text-file reading plugin with a configurable CPU load, ranging from simple metadata addition to complex ciphers and data mining techniques (keyphrase extraction).

File size: 9.5 KB
Line 
1package CPULoadTextPlugin;
2
3use TextPlugin;
4
5use strict;
6no strict 'refs'; # allow filehandles to be variables and viceversa
7no strict 'subs';
8
9use Crypt::Blowfish_PP;
10use Kea;
11use Lingua::EN::Syllable;
12
# Declare TextPlugin as the parent class. Doing this inside BEGIN means the
# assignment happens while the file is being compiled.
BEGIN
{
  @CPULoadTextPlugin::ISA = qw(TextPlugin);
}
16
# The values accepted by the 'cpu_load' option. Each entry is a hash holding
# the value's 'name' and a human-readable 'desc'.
our $cpu_load_list = [
  map(+{ 'name' => $_->[0], 'desc' => $_->[1] },
      [ 'none',   'Almost no processing - all dependent on IO' ],
      [ 'low',    'A little processing - building lexicon' ],
      [ 'medium', 'Some processing...' ],
      [ 'high',   'Data mining and part of speech tagging' ])
];
26
# Argument definitions for this plugin; new() pushes them onto the caller's
# 'ArgList'.
#  - process_exp: regular expression selecting the files this plugin handles
#  - cpu_load:    one of the levels in $cpu_load_list; read by process()
#  - debug:       flag, presumably surfaced as $self->{'debug'}, which
#                 _debugPrint() consults before printing
my $arguments = [ { 'name' => "process_exp",
                    'desc' => "{BasePlugin.process_exp}",
                    'type' => "regexp",
                    'deft' => get_default_process_exp(),
                    'reqd' => "no" },
                  { 'name' => "cpu_load",
                    'desc' => "",
                    'type' => "enum",
                    # The default has to be one of the values in
                    # $cpu_load_list. "none" is handled by process() exactly
                    # as the previous, unlisted, default "auto" was: neither
                    # the medium nor the high branch is taken.
                    'deft' => "none",
                    'list' => $cpu_load_list,
                    'reqd' => "no" },
                  { 'name' => 'debug',
                    'desc' => '',
                    'type' => 'flag',
                    'reqd' => 'no',
                    'deft' => '0',
                    'hiddengli' => 'no'}
                ];
45
# Descriptor for this plugin; new() pushes it onto the caller's 'OptList'.
# It carries the plugin's name and description together with the argument
# definitions held in $arguments.
my $options = {
  'args'           => $arguments,
  'srcreplaceable' => 'yes', # Source docs in regular txt format can be replaced with GS-generated html
  'inherits'       => 'yes',
  'abstract'       => 'no',
  'desc'           => 'TextPlugin allowing for configurable amounts of CPU load',
  'name'           => 'CPULoadTextPlugin',
};
52
53
## @function get_default_process_exp
# Return the default filename pattern, as a string, for the 'process_exp'
# argument: a case-insensitive match on names ending in ".txt" or ".text".
# Any arguments (such as an invocant) are ignored, so this works both as a
# plain function and as a method.
sub get_default_process_exp
{
  return '(?i)\.te?xt$';
}
59
## @function new
# Constructor. Records this class in the plugin list, registers this
# plugin's argument and option definitions with the caller's lists, builds
# the object through the TextPlugin constructor and re-blesses it into the
# requested class.
#
# @param $pluginlist      array reference; $class is appended to it
# @param $inputargs       passed through unchanged to TextPlugin's constructor
# @param $hashArgOptLists hash reference whose 'ArgList' and 'OptList' arrays
#                         receive $arguments and $options respectively
# @return the new object, blessed into $class
sub new
{
  my ($class) = shift (@_);
  my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
  push(@$pluginlist, $class);

  push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
  push(@{$hashArgOptLists->{"OptList"}}, $options);

  # Call the parent constructor with an explicit method call. The indirect
  # form "new TextPlugin(...)" is parsed heuristically and is ambiguous in a
  # package that, like this one, defines its own sub new.
  my $self = TextPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

  return bless $self, $class;
}
73
## @function process
# Do the plugin specific processing of doc_obj.
#
# Title (unless one was supplied in $metadata) and FileFormat metadata are
# always added. When the 'cpu_load' setting is "medium" or "high" the
# lexicon, word length, summary, complexity and encrypted text generators
# are run as well, and "high" additionally runs keyword extraction. Any
# other setting (including "low") performs none of the extra work. Finally
# the text is converted by text_to_html() and stored in the document.
#
# @param $textref  reference to the document text (modified by text_to_html)
# @param $metadata hash reference; only its 'Title' entry is inspected
# @param $doc_obj  the document object receiving metadata and text
# @return always 1
# The remaining parameters ($pluginfo, $base_dir, $file, $gli) are not used.
sub process
{
  my $self = shift (@_);
  my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

  my $top_section = $doc_obj->get_top_section();

  # a title handed in by another plugin takes precedence over extracting one
  unless (defined $metadata->{'Title'})
  {
    my $title = $self->get_title_metadata($textref);
    $doc_obj->add_utf8_metadata($top_section, "Title", $title);
  }
  $doc_obj->add_metadata($top_section, "FileFormat", "Text");

  # The generators must see the raw text, so they all run before
  # text_to_html() rewrites it below.
  if ($self->{'cpu_load'} =~ /^(?:medium|high)$/)
  {
    $self->generateLexicon($doc_obj, $$textref);
    $self->generateWordLengths($doc_obj, $$textref);
    # sizes of the summaries produced by generateSummaries()
    $self->{'first'} = '80,256,1024';
    $self->generateSummaries($doc_obj, $textref);
    $self->generateComplexity($doc_obj, $textref);
    $self->generateEncryptedText($doc_obj, $textref, 'thePassword');

    if ($self->{'cpu_load'} eq 'high')
    {
      $self->generateKeywords($doc_obj, $$textref);
    }
  }

  # insert preformat tags and add text to document object
  $self->text_to_html($textref); # modifies the text
  $doc_obj->add_utf8_text($top_section, $$textref);

  return 1;
}
114
115
## @function _debugPrint
# Print a message to standard output, prefixed with "[DEBUG] " and followed
# by a newline, but only when $self->{'debug'} is true.
#
# @param $msg the message to print
sub _debugPrint
{
  my ($self, $msg) = @_;
  print "[DEBUG] $msg\n" if $self->{'debug'};
}
## _debugPrint() ##
128
129
130## Functions to hopefully create some CPU load ##
131
132
## @function generateComplexity
# Compute three readability measures for the text and store them as
# metadata on the top section of the document:
#  - CommetricsScore:    (words / long words) / (words / sentences)
#  - FleschKincaidScore: Flesch Reading Ease,
#                        206.835 - 1.015 * words_per_sentence
#                                - 84.6 * syllables_per_word
#  - FleschKincaidGrade: 0.39 * words_per_sentence
#                        + 11.8 * syllables_per_word - 15.59
# Any ratio whose denominator is zero (for example a text containing no
# full stops, or no words at all) is treated as 0 so that such documents do
# not abort the import with a division by zero.
#
# @param $doc_obj the document object receiving the metadata
# @param $textref reference to the text to analyse (not modified)
sub generateComplexity
{
  my $self = shift(@_);
  my ($doc_obj, $textref) = @_;
  my $text = $$textref;

  # Division that yields 0 instead of dying when the denominator is zero
  my $ratio = sub
  {
    my ($numerator, $denominator) = @_;
    return $denominator ? $numerator / $denominator : 0;
  };

  # No of words. split() returns an empty leading field when the text starts
  # with a separator; that field is not a word, so it is dropped.
  my @words = grep { length($_) } split(/[^a-zA-Z0-9]+/, $text);
  my $number_of_words = scalar(@words);

  # No of long words, where long is 6 or more characters
  my $number_of_long_words = () = $text =~ /\w{6,}/g;

  # No of syllables
  my $number_of_syllables = 0;
  foreach my $the_word (@words)
  {
    $number_of_syllables += syllable($the_word);
  }

  # no of sentences (looking for full stops...)
  my $number_of_sentences = ($text =~ tr/.//);

  $self->_debugPrint('Number of words: ' . $number_of_words);
  $self->_debugPrint('Number of sentences: ' . $number_of_sentences);
  $self->_debugPrint('Number of syllables: ' . $number_of_syllables);

  my $words_per_sentence = $ratio->($number_of_words, $number_of_sentences);
  my $syllables_per_word = $ratio->($number_of_syllables, $number_of_words);

  # Commetrics Approach
  #   A. Big Word Ratio   = Total # of words / Total # of long words
  #   B. Word Count Score = Total # of words / Total # of sentences
  #   Score = A / B
  my $big_word_ratio = $ratio->($number_of_words, $number_of_long_words);
  my $commetrics_complexity_score = $ratio->($big_word_ratio, $words_per_sentence);
  $self->_debugPrint('ComMetrics Complexity Score: ' . $commetrics_complexity_score);
  $doc_obj->add_metadata($doc_obj->get_top_section(), 'CommetricsScore', $commetrics_complexity_score);

  # Flesch Reading Ease
  my $flesch_complexity_score = 206.835 - ($words_per_sentence * 1.015) - ($syllables_per_word * 84.6);
  $self->_debugPrint('Flesch-Kincaid Complexity Score: ' . $flesch_complexity_score);
  $doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidScore', $flesch_complexity_score);

  # Flesch-Kincaid Grade Level; the published coefficient for words per
  # sentence is 0.39
  my $flesch_grade = 0.39 * $words_per_sentence + 11.8 * $syllables_per_word - 15.59;
  $self->_debugPrint('Flesch-Kincaid Grade: ' . $flesch_grade);
  $doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidGrade', $flesch_grade);
}
## generateComplexity() ##
193
194
## @function generateEncryptedText
# Pad the text with '#' characters up to a whole number of chunks, then
# encrypt it chunk by chunk with Blowfish. A chunk is $multiplier x
# $key_length (40 x 25 = 1000) characters long and its first $key_length
# characters double as the encryption key. Each result is stored on the top
# section of the document as metadata named Encrypted00000, Encrypted00001
# and so on.
#
# NOTE(review): callers pass a third (password) argument which is not read
# here.
# NOTE(review): Blowfish is a 64-bit block cipher - confirm whether encrypt()
# processes the whole chunk or only its first block.
#
# @param $doc_obj the document object receiving the metadata
# @param $textref reference to the text to encrypt (not modified)
sub generateEncryptedText
{
  my ($self, $doc_obj, $textref) = @_;

  my $key_length = 25;
  my $multiplier = 40;
  my $chunk_length = $multiplier * $key_length;

  # - ensure the length of the text is some multiple of chunk length
  my $padded_text = $$textref;
  my $overhang = length($padded_text) % $chunk_length;
  if ($overhang > 0)
  {
    $padded_text .= '#' x ($chunk_length - $overhang);
  }

  # - walk over the padded text one chunk at a time
  my $chunk_count = length($padded_text) / $chunk_length;
  foreach my $chunk_index (0 .. $chunk_count - 1)
  {
    my $chunk = substr($padded_text, $chunk_index * $chunk_length, $chunk_length);
    # the leading key_length characters of the chunk are its key
    my $cipher = Crypt::Blowfish_PP->new(substr($chunk, 0, $key_length));
    my $encrypted_text = $cipher->encrypt($chunk);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), sprintf("Encrypted%05d", $chunk_index), $encrypted_text);
  }
  $self->_debugPrint('Encrypted ' . $chunk_count . ' x ' . $chunk_length . ' byte chunks');
}
## generateEncryptedText() ##
229
230
## @function generateKeywords
# Run Kea key phrase extraction over the text and store the result as
# 'Keywords' metadata on the top section of the document.
#
# @param $doc_obj the document object receiving the metadata
# @param $text    the text to extract key phrases from
sub generateKeywords
{
  my ($self, $doc_obj, $text) = @_;

  # NOTE(review): the meaning of the '3.0' and 'n10' arguments is defined by
  # Kea and cannot be confirmed from this file
  my $key_phrases = Kea::extract_KeyPhrases('3.0', $text, 'n10');

  my $top_section = $doc_obj->get_top_section();
  $doc_obj->add_utf8_metadata($top_section, 'Keywords', $key_phrases);
  $self->_debugPrint("Generated keywords: \"$key_phrases\"");
}
## generateKeywords() ##
242
243
## @function generateLexicon
# Count how often each word occurs in the text (ignoring case) and store
# the result as 'Lexicon' metadata on the top section of the document. The
# value is a list of "word:count" pairs, sorted by word and joined with
# ", ". Words are separated by any run of commas, full stops and
# whitespace.
#
# @param $doc_obj the document object receiving the metadata
# @param $text    the text to analyse
sub generateLexicon
{
  my $self = shift (@_);
  my ($doc_obj, $text) = @_;

  # The separator must be a real pattern: written as a quoted string the
  # enclosing slashes become literal characters of the pattern and the text
  # is practically never split. The grep drops the empty leading field that
  # split() returns when the text starts with a separator.
  my %count_of;
  foreach my $word (grep { length($_) } split(/[,.\s]+/, $text))
  {
    $count_of{lc($word)}++;
  }

  my @lexicon = map { $_ . ':' . $count_of{$_} } sort keys %count_of;
  $doc_obj->add_metadata($doc_obj->get_top_section(), "Lexicon", join(', ', @lexicon));
  $self->_debugPrint('Generated lexicon');
}
## generateLexicon() ##
273
274
## @function generateSummaries
# Extract the first NNN characters of the text as metadata. For every size
# in the comma separated list $self->{'first'} a 'First<size>' entry is
# added to the top section of the document. The text has its leading and
# trailing whitespace removed and inner whitespace runs collapsed to single
# spaces first. Text longer than the size is cut to that size, and the part
# from the last whitespace onwards is replaced by an ellipsis; shorter text
# is stored complete.
#
# @param $doc_obj the document object receiving the metadata
# @param $textref reference to the text to summarise (not modified)
sub generateSummaries
{
  my $self = shift (@_);
  my ($doc_obj, $textref) = @_;

  # the clean-up does not depend on the size, so it is done only once
  my $clean_text = $$textref;
  $clean_text =~ s/^\s+//;
  $clean_text =~ s/\s+$//;
  $clean_text =~ s/\s+/ /gs;

  foreach my $size (split /,/, $self->{'first'})
  {
    my $summary = $clean_text;
    # Only text that really gets cut is marked with an ellipsis; applying
    # the substitution unconditionally would drop the last word of texts
    # that already fit.
    if (length($summary) > $size)
    {
      $summary = substr($summary, 0, $size);
      $summary =~ s/\s\S*$/…/;
    }
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), 'First' . $size, $summary);
    $self->_debugPrint('Generated summary of ' . $size . ' characters');
  }
}
294
295
## @function generateWordLengths
# Count how many words of each length occur in the text and store the
# result as 'WordLengths' metadata on the top section of the document. The
# value is a list of "length:count" pairs in ascending order of length,
# joined with ", ". Words are separated by any run of commas, full stops
# and whitespace.
#
# @param $doc_obj the document object receiving the metadata
# @param $text    the text to analyse
sub generateWordLengths
{
  my $self = shift (@_);
  my ($doc_obj, $text) = @_;

  # The separator must be a real pattern: written as a quoted string the
  # enclosing slashes become literal characters of the pattern and the text
  # is practically never split. The grep drops the empty leading field that
  # split() returns when the text starts with a separator.
  my %count_of_length;
  foreach my $word (grep { length($_) } split(/[,.\s]+/, $text))
  {
    # every occurrence adds one to the tally for its length
    $count_of_length{length($word)}++;
  }

  # lengths are numbers, so sort them numerically (10 after 9, not after 1)
  my @word_lengths = map { $_ . ':' . $count_of_length{$_} }
                     sort { $a <=> $b } keys %count_of_length;
  $doc_obj->add_metadata($doc_obj->get_top_section(), "WordLengths", join(', ', @word_lengths));
  $self->_debugPrint('Generated word length information');
}
## generateWordLengths() ##
326
327
3281;
Note: See TracBrowser for help on using the repository browser.