Context Navigation

CPULoadTextPlugin.pm@ 29261

Last change on this file since 29261 was 29261, checked in by jmt12, 10 years ago
Removing some of the extraneous IO from high cpu importing... altering the ratio between CPU and IO load appropriately
File size: 10.1 KB

Line
1	package CPULoadTextPlugin;
2
3	use TextPlugin;
4
5	use strict;
6	no strict 'refs'; # allow filehandles to be variables and viceversa
7	no strict 'subs';
8
9	sub BEGIN
10	{
11	@CPULoadTextPlugin::ISA = ('TextPlugin');
12
13	eval('use Crypt::Blowfish_PP');
14	if ($@)
15	{
16	# We need the Perl version before continuing
17	if (!defined $ENV{'PERL_VERSION'})
18	{
19	$ENV{'PERL_VERSION'} = `perl -S $ENV{'GEXTPARALLELBUILDING'}/bin/script/perl-version.pl`;
20	}
21	die "PERL_VERSION not set\n" unless defined $ENV{'PERL_VERSION'};
22	# Crypt::Blowfish_PP module
23	unshift (@INC, $ENV{'GEXTPARALLELBUILDING'} . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $ENV{'PERL_VERSION'});
24	}
25	}
26
27	use Crypt::Blowfish_PP;
28	use Kea;
29	use Lingua::EN::Syllable;
30
31	our $cpu_load_list = [ { 'name' => "none",
32	'desc' => "Almost no processing - all dependent on IO" },
33	{ 'name' => "low",
34	'desc' => "A little processing - building lexicon" },
35	{ 'name' => "medium",
36	'desc' => "Some processing..." },
37	{ 'name' => "high",
38	'desc' => "Data mining and part of speech tagging" }
39	];
40
41	my $arguments = [ { 'name' => "process_exp",
42	'desc' => "{BasePlugin.process_exp}",
43	'type' => "regexp",
44	'deft' => &get_default_process_exp(),
45	'reqd' => "no" },
46	{ 'name' => "cpu_load",
47	'desc' => "",
48	'type' => "enum",
49	'deft' => "auto",
50	'list' => $cpu_load_list,
51	'reqd' => "no" },
52	{ 'name' => 'debug',
53	'desc' => '',
54	'type' => 'flag',
55	'reqd' => 'no',
56	'deft' => '0',
57	'hiddengli' => 'no'}
58	];
59
60	my $options = { 'name' => "CPULoadTextPlugin",
61	'desc' => "TextPlugin allowing for configurable amounts of CPU load",
62	'abstract' => "no",
63	'inherits' => "yes",
64	'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
65	'args' => $arguments };
66
67
68	sub get_default_process_exp
69	{
70	my $self = shift (@_);
71	return q^(?i)\.te?xt$^;
72	}
73
74	sub new
75	{
76	my ($class) = shift (@_);
77	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
78	push(@$pluginlist, $class);
79
80	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
81	push(@{$hashArgOptLists->{"OptList"}},$options);
82
83	my $self = new TextPlugin($pluginlist, $inputargs, $hashArgOptLists);
84
85	return bless $self, $class;
86	}
87
88	# do plugin specific processing of doc_obj
89	sub process
90	{
91	my $self = shift (@_);
92	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
93	my $outhandle = $self->{'outhandle'};
94
95	my $cursection = $doc_obj->get_top_section();
96
97	# get title metadata
98	# (don't need to get title if it has been passed
99	# in from another plugin)
100	if (!defined $metadata->{'Title'})
101	{
102	my $title = $self->get_title_metadata($textref);
103	$doc_obj->add_utf8_metadata ($cursection, "Title", $title);
104	}
105	# Add FileFormat metadata
106	$doc_obj->add_metadata($cursection, "FileFormat", "Text");
107
108	if ($self->{'cpu_load'} =~ /^(medium\|high)$/ )
109	{
110	$self->generateLexicon($doc_obj, $$textref);
111	$self->generateWordLengths($doc_obj, $$textref);
112	$self->{'first'} = '80,256,1024';
113	$self->generateSummaries($doc_obj, $textref);
114	$self->{'first'} = undef;
115	$self->generateComplexity($doc_obj, $textref);
116	$self->generateEncryptedText($doc_obj, $textref, 'thePassword');
117	}
118	if ($self->{'cpu_load'} eq 'high')
119	{
120	$self->generateKeywords($doc_obj, $$textref);
121	}
122
123	# insert preformat tags and add text to document object
124	$self->text_to_html($textref); # modifies the text
125	$doc_obj->add_utf8_text($cursection, $$textref);
126
127	return 1;
128	}
129
130
131	## @function
132	#
133	sub _debugPrint
134	{
135	my $self = shift(@_);
136	if ($self->{'debug'})
137	{
138	my ($msg) = @_;
139	print '[DEBUG] ' . $msg . "\n";
140	}
141	}
142	## _debugPrint() ##
143
144
145	## Functions to hopefully create some CPU load ##
146
147
148	## @function
149	#
150	sub generateComplexity
151	{
152	my $self = shift(@_);
153	my ($doc_obj, $textref) = @_;
154	my $text = $$textref;
155
156	# No of words (we start with 0.1 to prevent things being divided by zero)
157	my @words = split(/[^a-zA-Z0-9]+/, $text);
158	my $number_of_words = scalar(@words);
159
160	# No of long words, where long is 6 or more characters
161	my $number_of_long_words = 0;
162	while ($text =~ /\w{6,}/g)
163	{
164	$number_of_long_words++;
165	}
166
167	# No of syllables
168	my $number_of_syllables = 0;
169	foreach my $the_word (@words)
170	{
171	$number_of_syllables += syllable($the_word);
172	}
173
174	# no of sentences (looking for full stops...)
175	my $number_of_sentences = ($text =~ tr/\.//);
176
177	$self->_debugPrint('Number of words: ' . $number_of_words);
178	$self->_debugPrint('Number of sentences: ' . $number_of_sentences);
179	$self->_debugPrint('Number of syllables: ' . $number_of_syllables);
180
181	# Commetrics Approach
182	# A. Big Word Ratio
183	# = Total # of words / Total # of words with > 6 characters
184	# B. Word Count Score
185	# = Total # of words / Total # of sentences
186	# Score = A / B
187	my $commetrics_complexity_score = ($number_of_words / $number_of_long_words) / ($number_of_words / $number_of_sentences);
188	$self->_debugPrint('ComMetrics Complexity Score: ' . $commetrics_complexity_score);
189
190	$doc_obj->add_metadata($doc_obj->get_top_section(), 'CommetricsScore', $commetrics_complexity_score);
191
192	# Flesch Reading Ease:
193	# Calculate the average number of words you use per sentence.
194	# Calculate the average number of syllables per word.
195	# Multiply the average number of syllables per word multiplied by 84.6 and subtract it from the average number of words multiplied by 1.015.
196	# Subtract the result from 206.835.
197	# Algorithm: 206.835 - (1.015 * average_words_sentence) - (84.6 * average_syllables_word)
198	my $words_per_sentence = $number_of_words / $number_of_sentences;
199	my $syllables_per_word = $number_of_syllables / $number_of_words;
200	my $flesch_complexity_score = 206.835 - ($words_per_sentence * 1.015) - ($syllables_per_word * 84.6);
201	$self->_debugPrint('Flesch-Kincaid Complexity Score: ' . $flesch_complexity_score);
202	$doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidScore', $flesch_complexity_score);
203	my $flesch_grade = 0.38 * $words_per_sentence + 11.8 * $syllables_per_word - 15.59;
204	$self->_debugPrint('Flesch-Kincaid Grade: ' . $flesch_grade);
205	$doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidGrade', $flesch_grade);
206	}
207	## generateComplexity() ##
208
209
210	## @function
211	#
212	sub generateEncryptedText
213	{
214	my $self = shift (@_);
215	my ($doc_obj, $textref) = @_;
216
217	my $key_length = 25;
218	my $multiplier = 40;
219
220	# Split the string into chunks
221	my $text = $$textref;
222	# - ensure the length of the text is some multiple of chunk length
223	while ((length($text) % ($multiplier * $key_length)) > 0)
224	{
225	$text .= '#';
226	}
227	# - now split the text into $multiplier x $key_length bytes chunks
228	# the first key_length bytes is used as the key to encrypt the whole chunk
229	my $counter = 0;
230	while (length($text) > 0)
231	{
232	my $key = substr($text, 0, $key_length);
233	my $value = $key . substr($text, $key_length, $key_length * ($multiplier - 1));
234	my $blowfish = new Crypt::Blowfish_PP($key);
235	my $encrypted_text = $blowfish->encrypt($value);
236	if ($self->{'cpu_load'} ne 'high')
237	{
238	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), sprintf("Encrypted%05d", $counter), $encrypted_text);
239	}
240	# - shorten text by chunk length bytes, and repeat until text is exhausted
241	$text = substr($text, $multiplier * $key_length);
242	$counter++;
243	}
244	$self->_debugPrint('Encrypted ' . $counter . ' x ' . ($multiplier * $key_length) . ' byte chunks');
245	}
246	## generateEncryptedText() ##
247
248
249	## @function
250	#
251	sub generateKeywords
252	{
253	my $self = shift (@_);
254	my ($doc_obj, $text) = @_;
255	my $key_phrases = Kea::extract_KeyPhrases('3.0', $text, 'n10');
256	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), 'Keywords', $key_phrases);
257	$self->_debugPrint('Generated keywords: "' . $key_phrases . '"');
258	}
259	## generateKeywords() ##
260
261
262	## @function
263	#
264	sub generateLexicon
265	{
266	my $self = shift (@_);
267	my ($doc_obj, $text) = @_;
268	my $raw_lexicon = {};
269	my @words = split(/[\,\.\s]+/, $text);
270	foreach my $word (@words)
271	{
272	$word = lc($word);
273	if (defined $raw_lexicon->{$word})
274	{
275	$raw_lexicon->{$word}++;
276	}
277	else
278	{
279	$raw_lexicon->{$word} = 1;
280	}
281	}
282	my @lexicon;
283	foreach my $word (sort keys %{$raw_lexicon})
284	{
285	push(@lexicon, $word . ':' . $raw_lexicon->{$word});
286	}
287	if ($self->{'cpu_load'} ne 'high')
288	{
289	$doc_obj->add_metadata($doc_obj->get_top_section(), "Lexicon", join(', ', @lexicon));
290	}
291	$self->_debugPrint('Generated lexicon');
292	}
293	## generateLexicon() ##
294
295
296	## @function
297	# extract the first NNN characters as metadata
298	sub generateSummaries
299	{
300	my $self = shift (@_);
301	my ($doc_obj, $textref) = @_;
302
303	foreach my $size (split /,/, $self->{'first'})
304	{
305	my $tmptext = $$textref;
306	$tmptext =~ s/^\s+//;
307	$tmptext =~ s/\s+$//;
308	$tmptext =~ s/\s+/ /gs;
309	$tmptext = substr ($tmptext, 0, $size);
310	$tmptext =~ s/\s\S*$/…/;
311	$doc_obj->add_utf8_metadata ($doc_obj->get_top_section(), 'First' . $size, $tmptext);
312	$self->_debugPrint('Generated summary of ' . $size . ' characters');
313	}
314	}
315
316
317	## @function
318	#
319	sub generateWordLengths
320	{
321	my $self = shift (@_);
322	my ($doc_obj, $text) = @_;
323	my $raw_word_lengths = {};
324	my @words = split('/[\,\.\s]+/', $text);
325	foreach my $word (@words)
326	{
327	$word = lc($word);
328	my $length = length($word);
329	if (defined $raw_word_lengths->{$length})
330	{
331	$raw_word_lengths->{$length} = 1;
332	}
333	else
334	{
335	$raw_word_lengths->{$length}++;
336	}
337	}
338	my @word_lengths;
339	foreach my $word_length (sort keys %{$raw_word_lengths})
340	{
341	push(@word_lengths, $word_length . ':' . $raw_word_lengths->{$word_length});
342	}
343	if ($self->{'cpu_load'} ne 'high')
344	{
345	$doc_obj->add_metadata($doc_obj->get_top_section(), "WordLengths", join(', ', @word_lengths));
346	}
347	$self->_debugPrint('Generated word length information');
348	}
349	## generateWordLengths() ##
350
351
352	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/CPULoadTextPlugin.pm@ 29261

Download in other formats: