Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/CPULoadTextPlugin.pm@ 28649

Last change on this file since 28649 was 28649, checked in by jmt12, 10 years ago
A version of a text file reading plugin that has a configurable load, ranging from simple metadata addition to complex ciphers and data mining techniques (keyphrase extraction)
File size: 9.5 KB

Rev	Line
[28649]	1	package CPULoadTextPlugin;
	2
	3	use TextPlugin;
	4
	5	use strict;
	6	no strict 'refs'; # allow filehandles to be variables and viceversa
	7	no strict 'subs';
	8
	9	use Crypt::Blowfish_PP;
	10	use Kea;
	11	use Lingua::EN::Syllable;
	12
	13	sub BEGIN {
	14	@CPULoadTextPlugin::ISA = ('TextPlugin');
	15	}
	16
	17	our $cpu_load_list = [ { 'name' => "none",
	18	'desc' => "Almost no processing - all dependent on IO" },
	19	{ 'name' => "low",
	20	'desc' => "A little processing - building lexicon" },
	21	{ 'name' => "medium",
	22	'desc' => "Some processing..." },
	23	{ 'name' => "high",
	24	'desc' => "Data mining and part of speech tagging" }
	25	];
	26
	27	my $arguments = [ { 'name' => "process_exp",
	28	'desc' => "{BasePlugin.process_exp}",
	29	'type' => "regexp",
	30	'deft' => &get_default_process_exp(),
	31	'reqd' => "no" },
	32	{ 'name' => "cpu_load",
	33	'desc' => "",
	34	'type' => "enum",
	35	'deft' => "auto",
	36	'list' => $cpu_load_list,
	37	'reqd' => "no" },
	38	{ 'name' => 'debug',
	39	'desc' => '',
	40	'type' => 'flag',
	41	'reqd' => 'no',
	42	'deft' => '0',
	43	'hiddengli' => 'no'}
	44	];
	45
	46	my $options = { 'name' => "CPULoadTextPlugin",
	47	'desc' => "TextPlugin allowing for configurable amounts of CPU load",
	48	'abstract' => "no",
	49	'inherits' => "yes",
	50	'srcreplaceable' => "yes", # Source docs in regular txt format can be replaced with GS-generated html
	51	'args' => $arguments };
	52
	53
	54	sub get_default_process_exp
	55	{
	56	my $self = shift (@_);
	57	return q^(?i)\.te?xt$^;
	58	}
	59
	60	sub new
	61	{
	62	my ($class) = shift (@_);
	63	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	64	push(@$pluginlist, $class);
	65
	66	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	67	push(@{$hashArgOptLists->{"OptList"}},$options);
	68
	69	my $self = new TextPlugin($pluginlist, $inputargs, $hashArgOptLists);
	70
	71	return bless $self, $class;
	72	}
	73
	74	# do plugin specific processing of doc_obj
	75	sub process
	76	{
	77	my $self = shift (@_);
	78	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
	79	my $outhandle = $self->{'outhandle'};
	80
	81	my $cursection = $doc_obj->get_top_section();
	82
	83	# get title metadata
	84	# (don't need to get title if it has been passed
	85	# in from another plugin)
	86	if (!defined $metadata->{'Title'})
	87	{
	88	my $title = $self->get_title_metadata($textref);
	89	$doc_obj->add_utf8_metadata ($cursection, "Title", $title);
	90	}
	91	# Add FileFormat metadata
	92	$doc_obj->add_metadata($cursection, "FileFormat", "Text");
	93
	94	if ($self->{'cpu_load'} =~ /^(medium\|high)$/ )
	95	{
	96	$self->generateLexicon($doc_obj, $$textref);
	97	$self->generateWordLengths($doc_obj, $$textref);
	98	$self->{'first'} = '80,256,1024';
	99	$self->generateSummaries($doc_obj, $textref);
	100	$self->generateComplexity($doc_obj, $textref);
	101	$self->generateEncryptedText($doc_obj, $textref, 'thePassword');
	102	}
	103	if ($self->{'cpu_load'} eq 'high')
	104	{
	105	$self->generateKeywords($doc_obj, $$textref);
	106	}
	107
	108	# insert preformat tags and add text to document object
	109	$self->text_to_html($textref); # modifies the text
	110	$doc_obj->add_utf8_text($cursection, $$textref);
	111
	112	return 1;
	113	}
	114
	115
	116	## @function
	117	#
	118	sub _debugPrint
	119	{
	120	my $self = shift(@_);
	121	if ($self->{'debug'})
	122	{
	123	my ($msg) = @_;
	124	print '[DEBUG] ' . $msg . "\n";
	125	}
	126	}
	127	## _debugPrint() ##
	128
	129
	130	## Functions to hopefully create some CPU load ##
	131
	132
	133	## @function
	134	#
	135	sub generateComplexity
	136	{
	137	my $self = shift(@_);
	138	my ($doc_obj, $textref) = @_;
	139	my $text = $$textref;
	140
	141	# No of words (we start with 0.1 to prevent things being divided by zero)
	142	my @words = split(/[^a-zA-Z0-9]+/, $text);
	143	my $number_of_words = scalar(@words);
	144
	145	# No of long words, where long is 6 or more characters
	146	my $number_of_long_words = 0;
	147	while ($text =~ /\w{6,}/g)
	148	{
	149	$number_of_long_words++;
	150	}
	151
	152	# No of syllables
	153	my $number_of_syllables = 0;
	154	foreach my $the_word (@words)
	155	{
	156	$number_of_syllables += syllable($the_word);
	157	}
	158
	159	# no of sentences (looking for full stops...)
	160	my $number_of_sentences = ($text =~ tr/\.//);
	161
	162	$self->_debugPrint('Number of words: ' . $number_of_words);
	163	$self->_debugPrint('Number of sentences: ' . $number_of_sentences);
	164	$self->_debugPrint('Number of syllables: ' . $number_of_syllables);
	165
	166	# Commetrics Approach
	167	# A. Big Word Ratio
	168	# = Total # of words / Total # of words with > 6 characters
	169	# B. Word Count Score
	170	# = Total # of words / Total # of sentences
	171	# Score = A / B
	172	my $commetrics_complexity_score = ($number_of_words / $number_of_long_words) / ($number_of_words / $number_of_sentences);
	173	$self->_debugPrint('ComMetrics Complexity Score: ' . $commetrics_complexity_score);
	174
	175	$doc_obj->add_metadata($doc_obj->get_top_section(), 'CommetricsScore', $commetrics_complexity_score);
	176
	177	# Flesch Reading Ease:
	178	# Calculate the average number of words you use per sentence.
	179	# Calculate the average number of syllables per word.
	180	# Multiply the average number of syllables per word multiplied by 84.6 and subtract it from the average number of words multiplied by 1.015.
	181	# Subtract the result from 206.835.
	182	# Algorithm: 206.835 - (1.015 * average_words_sentence) - (84.6 * average_syllables_word)
	183	my $words_per_sentence = $number_of_words / $number_of_sentences;
	184	my $syllables_per_word = $number_of_syllables / $number_of_words;
	185	my $flesch_complexity_score = 206.835 - ($words_per_sentence * 1.015) - ($syllables_per_word * 84.6);
	186	$self->_debugPrint('Flesch-Kincaid Complexity Score: ' . $flesch_complexity_score);
	187	$doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidScore', $flesch_complexity_score);
	188	my $flesch_grade = 0.38 * $words_per_sentence + 11.8 * $syllables_per_word - 15.59;
	189	$self->_debugPrint('Flesch-Kincaid Grade: ' . $flesch_grade);
	190	$doc_obj->add_metadata($doc_obj->get_top_section(), 'FleschKincaidGrade', $flesch_grade);
	191	}
	192	## generateComplexity() ##
	193
	194
	195	## @function
	196	#
	197	sub generateEncryptedText
	198	{
	199	my $self = shift (@_);
	200	my ($doc_obj, $textref) = @_;
	201
	202	my $key_length = 25;
	203	my $multiplier = 40;
	204
	205	# Split the string into chunks
	206	my $text = $$textref;
	207	# - ensure the length of the text is some multiple of chunk length
	208	while ((length($text) % ($multiplier * $key_length)) > 0)
	209	{
	210	$text .= '#';
	211	}
	212	# - now split the text into $multiplier x $key_length bytes chunks
	213	# the first key_length bytes is used as the key to encrypt the whole chunk
	214	my $counter = 0;
	215	while (length($text) > 0)
	216	{
	217	my $key = substr($text, 0, $key_length);
	218	my $value = $key . substr($text, $key_length, $key_length * ($multiplier - 1));
	219	my $blowfish = new Crypt::Blowfish_PP($key);
	220	my $encrypted_text = $blowfish->encrypt($value);
	221	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), sprintf("Encrypted%05d", $counter), $encrypted_text);
	222	# - shorten text by chunk length bytes, and repeat until text is exhausted
	223	$text = substr($text, $multiplier * $key_length);
	224	$counter++;
	225	}
	226	$self->_debugPrint('Encrypted ' . $counter . ' x ' . ($multiplier * $key_length) . ' byte chunks');
	227	}
	228	## generateEncryptedText() ##
	229
	230
	231	## @function
	232	#
	233	sub generateKeywords
	234	{
	235	my $self = shift (@_);
	236	my ($doc_obj, $text) = @_;
	237	my $key_phrases = Kea::extract_KeyPhrases('3.0', $text, 'n10');
	238	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), 'Keywords', $key_phrases);
	239	$self->_debugPrint('Generated keywords: "' . $key_phrases . '"');
	240	}
	241	## generateKeywords() ##
	242
	243
	244	## @function
	245	#
	246	sub generateLexicon
	247	{
	248	my $self = shift (@_);
	249	my ($doc_obj, $text) = @_;
	250	my $raw_lexicon = {};
	251	my @words = split('/[\,\.\s]+/', $text);
	252	foreach my $word (@words)
	253	{
	254	$word = lc($word);
	255	if (defined $raw_lexicon->{$word})
	256	{
	257	$raw_lexicon->{$word}++;
	258	}
	259	else
	260	{
	261	$raw_lexicon->{$word} = 1;
	262	}
	263	}
	264	my @lexicon;
	265	foreach my $word (sort keys %{$raw_lexicon})
	266	{
	267	push(@lexicon, $word . ':' . $raw_lexicon->{$word});
	268	}
	269	$doc_obj->add_metadata($doc_obj->get_top_section(), "Lexicon", join(', ', @lexicon));
	270	$self->_debugPrint('Generated lexicon');
	271	}
	272	## generateLexicon() ##
	273
	274
	275	## @function
	276	# extract the first NNN characters as metadata
	277	sub generateSummaries
	278	{
	279	my $self = shift (@_);
	280	my ($doc_obj, $textref) = @_;
	281
	282	foreach my $size (split /,/, $self->{'first'})
	283	{
	284	my $tmptext = $$textref;
	285	$tmptext =~ s/^\s+//;
	286	$tmptext =~ s/\s+$//;
	287	$tmptext =~ s/\s+/ /gs;
	288	$tmptext = substr ($tmptext, 0, $size);
	289	$tmptext =~ s/\s\S*$/…/;
	290	$doc_obj->add_utf8_metadata ($doc_obj->get_top_section(), 'First' . $size, $tmptext);
	291	$self->_debugPrint('Generated summary of ' . $size . ' characters');
	292	}
	293	}
	294
	295
	296	## @function
	297	#
	298	sub generateWordLengths
	299	{
	300	my $self = shift (@_);
	301	my ($doc_obj, $text) = @_;
	302	my $raw_word_lengths = {};
	303	my @words = split('/[\,\.\s]+/', $text);
	304	foreach my $word (@words)
	305	{
	306	$word = lc($word);
	307	my $length = length($word);
	308	if (defined $raw_word_lengths->{$length})
	309	{
	310	$raw_word_lengths->{$length} = 1;
	311	}
	312	else
	313	{
	314	$raw_word_lengths->{$length}++;
	315	}
	316	}
	317	my @word_lengths;
	318	foreach my $word_length (sort keys %{$raw_word_lengths})
	319	{
	320	push(@word_lengths, $word_length . ':' . $raw_word_lengths->{$word_length});
	321	}
	322	$doc_obj->add_metadata($doc_obj->get_top_section(), "WordLengths", join(', ', @word_lengths));
	323	$self->_debugPrint('Generated word length information');
	324	}
	325	## generateWordLengths() ##
	326
	327
	328	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: