Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/CPULoadTextPlugin.pm@ 28649

Last change on this file since 28649 was 28649, checked in by jmt12, 10 years ago
A version of a Textfile reading plugin that has a configurable load ranging from simply metadata addition to complex ciphers and data mining techniques (keyphrase extraction)
File size: 9.5 KB

Line
1	package CPULoadTextPlugin;
2
3	use TextPlugin;
4
5	use strict;
6	no strict 'refs'; # allow filehandles to be variables and viceversa
7	no strict 'subs';
8
9	use Crypt::Blowfish_PP;
10	use Kea;
11	use Lingua::EN::Syllable;
12
# Register TextPlugin as the parent class at compile time, so that any
# method not defined in this package is looked up there.
BEGIN
{
  @CPULoadTextPlugin::ISA = ('TextPlugin');
}
16
# The selectable amounts of artificial CPU load. These are the only values
# the 'cpu_load' enum argument below offers.
our $cpu_load_list = [
  { 'name' => "none",
    'desc' => "Almost no processing - all dependent on IO" },
  { 'name' => "low",
    'desc' => "A little processing - building lexicon" },
  { 'name' => "medium",
    'desc' => "Some processing..." },
  { 'name' => "high",
    'desc' => "Data mining and part of speech tagging" }
];

# Arguments specific to this plugin; new() appends them to the argument
# list handed to the parent constructor.
my $arguments = [
  { 'name' => "process_exp",
    'desc' => "{BasePlugin.process_exp}",
    'type' => "regexp",
    # Plain function call: the '&' sigil form is unnecessary here.
    'deft' => get_default_process_exp(),
    'reqd' => "no" },
  { 'name' => "cpu_load",
    'desc' => "",
    'type' => "enum",
    # The default must be one of the names in $cpu_load_list. "none"
    # selects the same minimal processing an unrecognised value would.
    'deft' => "none",
    'list' => $cpu_load_list,
    'reqd' => "no" },
  { 'name' => 'debug',
    'desc' => '',
    'type' => 'flag',
    'reqd' => 'no',
    'deft' => '0',
    'hiddengli' => 'no' }
];

# Plugin description record; new() appends it to the option list handed to
# the parent constructor.
my $options = {
  'name' => "CPULoadTextPlugin",
  'desc' => "TextPlugin allowing for configurable amounts of CPU load",
  'abstract' => "no",
  'inherits' => "yes",
  'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
  'args' => $arguments
};
52
53
## @function
# Returns the default value of the 'process_exp' argument: a pattern that
# matches, ignoring case, names ending in ".txt" or ".text".
# Any arguments passed in (such as an invocant) are ignored.
sub get_default_process_exp
{
  return '(?i)\.te?xt$';
}
## get_default_process_exp() ##
59
## @function
# Constructor.
# @param $class            the class being instantiated
# @param $pluginlist       array reference; $class is appended to it
# @param $inputargs        passed through unchanged to TextPlugin's constructor
# @param $hashArgOptLists  hash reference; this plugin's arguments are
#                          appended to its "ArgList" entry and this plugin's
#                          options record to its "OptList" entry
# @return the object built by TextPlugin's constructor, reblessed into $class
sub new
{
  my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;
  push(@$pluginlist, $class);

  push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
  push(@{$hashArgOptLists->{"OptList"}}, $options);

  # Explicit class-method call. The indirect-object form
  # "new TextPlugin(...)" depends on how Perl guesses the parse, which is
  # fragile inside a package that defines its own new().
  my $self = TextPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

  return bless $self, $class;
}
## new() ##
73
## @function
# Do plugin specific processing of doc_obj: add the basic metadata, run the
# extra work selected by the 'cpu_load' setting, then store the text.
#
# 'medium' and 'high' both run the lexicon, word length, summary,
# complexity and encryption steps; 'high' additionally runs keyword
# extraction. Any other value runs none of these steps.
#
# @param $textref   reference to the document text (altered by text_to_html)
# @param $metadata  hash reference; a Title is only generated when it has no
#                   defined 'Title' entry
# @param $doc_obj   the document object receiving metadata and text
# The remaining parameters ($pluginfo, $base_dir, $file, $gli) are accepted
# but not used here.
# @return 1
sub process
{
  my $self = shift(@_);
  my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

  my $cursection = $doc_obj->get_top_section();

  # get title metadata
  # (don't need to get title if it has been passed
  # in from another plugin)
  if (!defined $metadata->{'Title'})
  {
    my $title = $self->get_title_metadata($textref);
    $doc_obj->add_utf8_metadata($cursection, "Title", $title);
  }
  # Add FileFormat metadata
  $doc_obj->add_metadata($cursection, "FileFormat", "Text");

  # Compare with eq rather than a pattern: the former /^(medium\|high)$/
  # escaped the pipe and so only matched the literal text "medium|high".
  my $cpu_load = defined $self->{'cpu_load'} ? $self->{'cpu_load'} : '';
  if ($cpu_load eq 'medium' || $cpu_load eq 'high')
  {
    $self->generateLexicon($doc_obj, $$textref);
    $self->generateWordLengths($doc_obj, $$textref);
    # Summary sizes read by generateSummaries()
    $self->{'first'} = '80,256,1024';
    $self->generateSummaries($doc_obj, $textref);
    $self->generateComplexity($doc_obj, $textref);
    $self->generateEncryptedText($doc_obj, $textref, 'thePassword');
  }
  if ($cpu_load eq 'high')
  {
    $self->generateKeywords($doc_obj, $$textref);
  }

  # insert preformat tags and add text to document object
  $self->text_to_html($textref); # modifies the text
  $doc_obj->add_utf8_text($cursection, $$textref);

  return 1;
}
## process() ##
114
115
## @function
# Prints the given message, prefixed with "[DEBUG] " and followed by a
# newline, to the currently selected output handle. Nothing is printed
# unless the plugin's 'debug' setting is true.
# @param $msg  the text to report
sub _debugPrint
{
  my ($self, $msg) = @_;
  print '[DEBUG] ' . $msg . "\n" if $self->{'debug'};
}
## _debugPrint() ##
128
129
130	## Functions to hopefully create some CPU load ##
131
132
## @function
# Computes three readability figures for the text and stores them as
# 'CommetricsScore', 'FleschKincaidScore' and 'FleschKincaidGrade' metadata
# on the top section of the document.
#
# A count of zero is replaced by 0.1 wherever it is used in a ratio, so
# that text without words, long words or full stops yields (extreme)
# scores rather than an "Illegal division by zero" error.
#
# @param $doc_obj  the document object receiving the metadata
# @param $textref  reference to the document text (not modified)
sub generateComplexity
{
  my $self = shift(@_);
  my ($doc_obj, $textref) = @_;
  my $text = $$textref;
  my $section = $doc_obj->get_top_section();

  # No of words: runs of ASCII letters and digits. split() returns an empty
  # leading field when the text starts with a separator; that is not a word.
  my @words = grep { length($_) > 0 } split(/[^a-zA-Z0-9]+/, $text);
  my $number_of_words = scalar(@words);

  # No of long words, where long is 6 or more characters
  my $number_of_long_words = () = $text =~ /\w{6,}/g;

  # No of syllables
  my $number_of_syllables = 0;
  foreach my $the_word (@words)
  {
    $number_of_syllables += syllable($the_word);
  }

  # no of sentences (looking for full stops...)
  my $number_of_sentences = ($text =~ tr/.//);

  $self->_debugPrint('Number of words: ' . $number_of_words);
  $self->_debugPrint('Number of sentences: ' . $number_of_sentences);
  $self->_debugPrint('Number of syllables: ' . $number_of_syllables);

  # Values used in the ratios below; never zero (see header comment). The
  # true counts above are kept untouched for the debug output.
  my $words      = $number_of_words      || 0.1;
  my $long_words = $number_of_long_words || 0.1;
  my $sentences  = $number_of_sentences  || 0.1;

  # Commetrics Approach
  # A. Big Word Ratio
  #    = Total # of words / Total # of long words
  # B. Word Count Score
  #    = Total # of words / Total # of sentences
  # Score = A / B
  my $commetrics_complexity_score = ($words / $long_words) / ($words / $sentences);
  $self->_debugPrint('ComMetrics Complexity Score: ' . $commetrics_complexity_score);
  $doc_obj->add_metadata($section, 'CommetricsScore', $commetrics_complexity_score);

  # Flesch Reading Ease:
  # 206.835 - (1.015 * average words per sentence)
  #         - (84.6 * average syllables per word)
  my $words_per_sentence = $words / $sentences;
  my $syllables_per_word = $number_of_syllables / $words;
  my $flesch_complexity_score = 206.835 - ($words_per_sentence * 1.015) - ($syllables_per_word * 84.6);
  $self->_debugPrint('Flesch-Kincaid Complexity Score: ' . $flesch_complexity_score);
  $doc_obj->add_metadata($section, 'FleschKincaidScore', $flesch_complexity_score);

  # Flesch-Kincaid Grade:
  # (0.38 * average words per sentence)
  #   + (11.8 * average syllables per word) - 15.59
  my $flesch_grade = 0.38 * $words_per_sentence + 11.8 * $syllables_per_word - 15.59;
  $self->_debugPrint('Flesch-Kincaid Grade: ' . $flesch_grade);
  $doc_obj->add_metadata($section, 'FleschKincaidGrade', $flesch_grade);
}
## generateComplexity() ##
193
194
## @function
# Cuts the text into fixed-size chunks, passes each chunk through a
# Crypt::Blowfish_PP cipher keyed with the chunk's own leading characters,
# and stores each result as 'Encrypted00000', 'Encrypted00001', ...
# metadata on the top section of the document.
#
# The text is first padded with '#' up to a whole number of chunks. Empty
# text produces no metadata.
#
# @param $doc_obj  the document object receiving the metadata
# @param $textref  reference to the document text (not modified)
# Any further arguments are ignored.
sub generateEncryptedText
{
  my $self = shift(@_);
  my ($doc_obj, $textref) = @_;

  my $key_length = 25;
  my $multiplier = 40;
  my $chunk_length = $multiplier * $key_length;

  # - ensure the length of the text is some multiple of chunk length
  my $text = $$textref;
  my $overhang = length($text) % $chunk_length;
  if ($overhang > 0)
  {
    $text .= '#' x ($chunk_length - $overhang);
  }

  # - now walk the text in $multiplier x $key_length character chunks;
  #   the first $key_length characters of a chunk are the key for that chunk
  my $section = $doc_obj->get_top_section();
  my $total_length = length($text);
  my $counter = 0;
  for (my $offset = 0; $offset < $total_length; $offset += $chunk_length)
  {
    my $value = substr($text, $offset, $chunk_length);
    my $key = substr($value, 0, $key_length);
    my $blowfish = Crypt::Blowfish_PP->new($key);
    # NOTE(review): Crypt::Blowfish_PP describes encrypt() as operating on a
    # single 8-byte block - confirm how much of $value is actually
    # enciphered by this call.
    my $encrypted_text = $blowfish->encrypt($value);
    $doc_obj->add_utf8_metadata($section, sprintf("Encrypted%05d", $counter), $encrypted_text);
    $counter++;
  }
  $self->_debugPrint('Encrypted ' . $counter . ' x ' . $chunk_length . ' byte chunks');
}
## generateEncryptedText() ##
229
230
## @function
# Hands the text to Kea::extract_KeyPhrases (always with the arguments
# '3.0' and 'n10') and stores whatever comes back as 'Keywords' metadata on
# the top section of the document.
# @param $doc_obj  the document object receiving the metadata
# @param $text     the document text
sub generateKeywords
{
  my ($self, $doc_obj, $text) = @_;

  my $phrases = Kea::extract_KeyPhrases('3.0', $text, 'n10');

  my $section = $doc_obj->get_top_section();
  $doc_obj->add_utf8_metadata($section, 'Keywords', $phrases);

  $self->_debugPrint('Generated keywords: "' . $phrases . '"');
}
## generateKeywords() ##
242
243
## @function
# Counts how often each word occurs in the text, ignoring case, and stores
# the result as 'Lexicon' metadata on the top section of the document. The
# value is a list of "word:count" entries, sorted by word and joined with
# ", ".
#
# Words are whatever lies between runs of commas, full stops and
# whitespace.
#
# @param $doc_obj  the document object receiving the metadata
# @param $text     the document text
sub generateLexicon
{
  my $self = shift(@_);
  my ($doc_obj, $text) = @_;

  # The pattern must be a real regular expression. The former quoted form
  # '/[\,\.\s]+/' made the slashes part of the pattern, so nothing was
  # ever split. Text that starts with a separator produces an empty first
  # field, which is not a word.
  my @words = grep { length($_) > 0 } split(/[,.\s]+/, $text);

  my %count_of;
  foreach my $word (@words)
  {
    $count_of{lc($word)}++;
  }

  my @lexicon = map { $_ . ':' . $count_of{$_} } sort keys %count_of;
  $doc_obj->add_metadata($doc_obj->get_top_section(), "Lexicon", join(', ', @lexicon));
  $self->_debugPrint('Generated lexicon');
}
## generateLexicon() ##
273
274
## @function
# Extract the first NNN characters as metadata: for every size in the
# comma separated list held in $self->{'first'}, stores a summary of at
# most that many characters as 'FirstNNN' metadata on the top section of
# the document.
#
# @param $doc_obj  the document object receiving the metadata
# @param $textref  reference to the document text (not modified)
sub generateSummaries
{
  my $self = shift(@_);
  my ($doc_obj, $textref) = @_;

  # Trim the text and collapse runs of whitespace to single spaces. This
  # does not depend on the requested size, so it is done once for all sizes.
  my $normalised_text = $$textref;
  $normalised_text =~ s/^\s+//;
  $normalised_text =~ s/\s+$//;
  $normalised_text =~ s/\s+/ /gs;

  foreach my $size (split(/,/, $self->{'first'}))
  {
    my $summary = substr($normalised_text, 0, $size);
    # Replace the last space and everything after it with an ellipsis, so
    # the summary does not end in a cut-off word. This happens whenever the
    # summary contains a space, even if the text was shorter than $size.
    $summary =~ s/\s\S*$/…/;
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), 'First' . $size, $summary);
    $self->_debugPrint('Generated summary of ' . $size . ' characters');
  }
}
## generateSummaries() ##
294
295
## @function
# Counts how many words of each length occur in the text and stores the
# result as 'WordLengths' metadata on the top section of the document. The
# value is a list of "length:count" entries, in ascending order of length
# and joined with ", ".
#
# Words are whatever lies between runs of commas, full stops and
# whitespace.
#
# @param $doc_obj  the document object receiving the metadata
# @param $text     the document text
sub generateWordLengths
{
  my $self = shift(@_);
  my ($doc_obj, $text) = @_;

  # The pattern must be a real regular expression. The former quoted form
  # '/[\,\.\s]+/' made the slashes part of the pattern, so nothing was
  # ever split. Text that starts with a separator produces an empty first
  # field, which is not a word.
  my @words = grep { length($_) > 0 } split(/[,.\s]+/, $text);

  # Every occurrence adds one to the tally for its length. The former
  # if/else had its branches swapped, which reset an existing tally to 1.
  my %count_of;
  foreach my $word (@words)
  {
    $count_of{length($word)}++;
  }

  # Lengths are numbers, so sort them numerically (10 after 9, not after 1).
  my @word_lengths = map { $_ . ':' . $count_of{$_} }
                     sort { $a <=> $b } keys %count_of;
  $doc_obj->add_metadata($doc_obj->get_top_section(), "WordLengths", join(', ', @word_lengths));
  $self->_debugPrint('Generated word length information');
}
## generateWordLengths() ##
326
327
328	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: