root/gs2-extensions/parallel-building/trunk/src/perllib/plugins/LoremTextPlugin.pm @ 25113

Revision 25113, 2.9 KB (checked in by jmt12, 9 years ago)

A TextPlugin extended to do a little more processing in order to (hopefully) strain the CPU

Line 
1package LoremTextPlugin;
2
3use TextFile;
4
5use strict;
6no strict 'refs'; # allow filehandles to be variables and viceversa
7no strict 'subs';
8
# Establish the inheritance chain at compile time: this plugin derives
# from TextFile.
BEGIN
{
  @LoremTextPlugin::ISA = qw(TextFile);
}
12
# Argument specifications contributed by this plugin. new() appends them
# to the "ArgList" entry of the hash it is handed.
#
# NOTE(review): get_default_process_exp is not defined in this file; it is
# called here as a plain function (not a method), so it must already be
# visible in this package when the module loads - confirm where it comes from.
my $arguments = [
  {
    'name' => "process_exp",
    'desc' => "{BasePlugin.process_exp}",
    'type' => "regexp",
    'deft' => &get_default_process_exp(),
    'reqd' => "no",
  },
];
19
# Plugin description record. new() appends it to the "OptList" entry of
# the hash it is handed; 'args' points at the argument specs declared above.
my $options = {
  'name'           => "LoremTextPlugin",
  'desc'           => "TextPlugin extended to do a little more processing (to stress CPU rather than IO)",
  'abstract'       => "no",
  'inherits'       => "yes",
  # Source docs in regular txt format can be replaced with GS-generated html
  'srcreplaceable' => "yes",
  'args'           => $arguments,
};
26
27
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the TextFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument specs are appended
#                      to its "ArgList" entry and its option record to its
#                      "OptList" entry
#
# Returns the object built by TextFile->new, re-blessed into $class, with
# empty 'lexicon' and 'word_lengths' hashes attached.
sub new
{
  my ($class) = shift (@_);
  my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
  push(@$pluginlist, $class);

  push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
  push(@{$hashArgOptLists->{"OptList"}},$options);

  # Explicit class-method call instead of the indirect object form
  # ("new TextFile(...)"), which perl can mis-parse depending on what
  # subs are in scope.
  my $self = TextFile->new($pluginlist, $inputargs, $hashArgOptLists);

  # Per-document statistics; process() rebuilds both for every document.
  $self->{'lexicon'} = {};
  $self->{'word_lengths'} = {};

  return bless $self, $class;
}
44
# Do plugin specific processing of doc_obj.
#
# Besides adding the usual Title / FileFormat metadata and the document
# text, this tokenises the text and records two extra metadata values on
# the top section:
#   Lexicon     - "word:count" pairs, sorted by word, joined with ", "
#   WordLengths - "length:count" pairs, sorted numerically by length,
#                 joined with ", "
# The underlying hashes are also left in $self->{'lexicon'} and
# $self->{'word_lengths'}.
#
# Parameters:
#   $textref  - reference to the document text (passed on to text_to_html,
#               which modifies it)
#   $metadata - hash ref; if it already has a defined 'Title' no title is
#               extracted from the text
#   $doc_obj  - document object that receives the metadata and text
#   $pluginfo, $base_dir, $file, $gli - accepted but not used here
#
# Returns 1.
sub process
{
  my $self = shift (@_);
  my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

  my $cursection = $doc_obj->get_top_section();

  # get title metadata
  # (don't need to get title if it has been passed
  # in from another plugin)
  if (!defined $metadata->{'Title'})
  {
    my $title = $self->get_title_metadata($textref);
    $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
  }
  # Add FileFormat metadata
  $doc_obj->add_metadata($cursection, "FileFormat", "Text");

  # Process the text to determine
  # - a) the lexicon used, and
  # - b) the frequency of words of various lengths
  my %lexicon;
  my %word_lengths;

  # The separator must be a real regex: given as a quoted string the
  # slashes become part of the pattern and the text is never split.
  foreach my $raw_word (split(/[,.\s]+/, $$textref))
  {
    # text that starts with a separator yields an empty leading field
    next if $raw_word eq '';

    my $word = lc($raw_word);
    $lexicon{$word}++;
    $word_lengths{length($word)}++;
  }
  $self->{'lexicon'} = \%lexicon;
  $self->{'word_lengths'} = \%word_lengths;

  my @lexicon;
  foreach my $word (sort keys %lexicon)
  {
    push(@lexicon, $word . ':' . $lexicon{$word});
  }
  $doc_obj->add_metadata($cursection, "Lexicon", join(', ', @lexicon));

  # lengths are numbers, so sort numerically (otherwise 10 sorts before 2)
  my @word_lengths;
  foreach my $word_length (sort { $a <=> $b } keys %word_lengths)
  {
    push(@word_lengths, $word_length . ':' . $word_lengths{$word_length});
  }
  $doc_obj->add_metadata($cursection, "WordLengths", join(', ', @word_lengths));

  # insert preformat tags and add text to document object
  $self->text_to_html($textref); # modifies the text
  $doc_obj->add_utf8_text($cursection, $$textref);

  return 1;
}
114
1151;
Note: See TracBrowser for help on using the browser.