source: trunk/gsdl/perllib/plugins/MACROPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1###########################################################################
2#
3# MACROPlug plugin - to process .dm files for language translation
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MACROPlug;
27
28use BasPlug;
29use parsargv;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34sub BEGIN {
35 @MACROPlug::ISA = ('BasPlug');
36}
37
38my $arguments =
39 [ { 'name' => "process_exp",
40 'desc' => "{BasPlug.process_exp}",
41 'type' => "regexp",
42 'deft' => &get_default_process_exp(),
43 'reqd' => "no" } ];
44
45my $options = { 'name' => "MACROPlug",
46 'desc' => "{MACROPlug.desc}",
47 'abstract' => "no",
48 'inherits' => "yes",
49 'args' => $arguments };
50
51
52sub load_language_table
53{
54 my $lang_table = {};
55
56 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang",
57 "package_forms","languages.log");
58 open (LANGFILE, "<$lang_fname")
59 || die ("Unable to open $lang_fname: $!\n");
60
61 my $full_name;
62 my $abbr_name;
63
64 while (defined ($full_name=<LANGFILE>)) {
65 chomp($full_name);
66
67 $abbr_name = <LANGFILE>;
68 chomp($abbr_name);
69
70 $lang_table->{$full_name} = $abbr_name;
71
72 my $fourchar_name = substr($full_name,0,4);
73 if (!defined $lang_table->{$fourchar_name}) {
74 $lang_table->{$fourchar_name} = $abbr_name;
75 }
76 else {
77 print STDERR "Warning: Clash on four character abbreviation for language $fourchar_name\n";
78 }
79 }
80
81 close LANGFILE;
82
83 return $lang_table;
84}
85
86
87
88sub new {
89 my ($class) = shift (@_);
90 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
91 push(@$pluginlist, $class);
92
93 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
94 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
95
96 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
97
98 # $self->{'lang_abbr'} = load_language_table();
99
100 return bless $self, $class;
101}
102
103sub get_default_process_exp {
104 my $self = shift (@_);
105
106 return q^(?i)\.dm$^;
107}
108
109
110sub read {
111 my $self = shift (@_);
112 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
113
114 my $outhandle = $self->{'outhandle'};
115
116 my $lang_table = $self->{'lang_abbr'};
117 my $fn = $file;
118 $fn =~ s/.*\/(.*)\..*/$1/;
119 $fn =~ s/\d+$//; # remove any digits from end of filename
120
121 my $filename = $file;
122 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
123
124 if ((!-d $filename) && ($file !~ m/doc.xml$/) && (!defined $lang_table->{$fn})) {
125 print $outhandle "MACROPlug: blocking $file\n"
126 if $self->{'verbosity'} > 2;
127 $self->{'num_blocked'} ++;
128 return 0;
129 }
130
131 return $self->SUPER::read(@_);
132}
133
134
135# do plugin specific processing of doc_obj
136sub process {
137 my $self = shift (@_);
138 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
139 my $outhandle = $self->{'outhandle'};
140
141 print STDERR "<Processing n='$file' p='MACROPlug'>\n" if ($gli);
142 print $outhandle "MACROPlug: processing $file\n"
143 if $self->{'verbosity'} > 1;
144
145 my $thissection = $doc_obj->get_top_section();
146
147 $self->extract_macronames (\$textref, $doc_obj, $thissection, $file);
148 # we need to escape the escape character, or else mg will convert into
149 # eg literal newlines, instead of leaving the text as '\n'
150
151
152 my ($filemeta) = $file =~ /([^\\\/]+)$/;
153 $doc_obj->add_utf8_metadata($thissection, "Title", &ghtml::dmsafe($filemeta));
154
155 # FileFormat metadata
156 $doc_obj->add_metadata($thissection, "FileFormat", "MACRO");
157 $doc_obj->add_metadata($thissection, "FileSize", (-s $file));
158
159 return 1;
160}
161
162
163sub extract_macronames {
164
165 my $self = shift(@_);
166
167 my ($textref, $doc_obj, $thissection, $file) = @_;
168 my $outhandle = $self->{'outhandle'};
169
170 print $outhandle " extracting macronames ...\n"
171 if ($self->{'verbosity'}>3);
172
173 my @textarray = split ("\n", $$$textref)
174 if ($self->{'verbosity'}>3);
175
176 my $macro_text = "";
177 my $image_macro = "false";
178 my $norm_macro = "false";
179 my $currpackage = "";
180
181 #print STDERR "FILE@@@@@ $file\n";
182
183 #foreach my $ta (@textarray) {
184
185 for (my $k = 0; $k < scalar(@textarray); $k++) {
186
187 my $ta = $textarray[$k];
188
189 #print STDERR "$ta\n" if ($file =~ m/port/);
190
191 if ($ta =~ m/^package /) {
192 $currpackage = $ta;
193 chomp($currpackage);
194 $currpackage =~ s/^package //;
195 }
196
197
198 elsif ($image_macro eq "true") {
199 unless ($ta =~ m/\S+/) {
200 $image_macro = "false";
201
202 $macro_text =~ s/\_/\\_/osg; # for dm (we have a convention of starting macros with _
203
204 #print STDERR "$macro_text\n\n" if ($file =~ m/spanish/);
205
206 $doc_obj->add_utf8_text($thissection, $macro_text);
207
208 my @names = split(/\s*\#\#\s*/, $macro_text);
209
210 my ($title) = $names[1];
211
212 if (length($title) > 100) {
213 $title = substr ($title, 0, 100) . "...";
214 }
215
216 $title =~ s/</\&lt;/g;
217 $title =~ s/>/\&gt;/g;
218
219 #print STDERR "$title\n" if ($file =~ m/port/);
220
221 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
222 $macro_text = "";
223 }
224 $ta .= "<br>";
225 $macro_text .= $ta;
226
227 }
228
229 elsif ($norm_macro eq "true") {
230
231 $macro_text .= $ta;
232
233 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
234
235 $norm_macro = "false";
236
237 $macro_text =~ s/\A(_\w+_)\s//;
238
239 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
240
241 $doc_obj->add_utf8_text($thissection, $macro_text);
242
243 my ($title) = $macro_text;
244
245 $title =~ s/\_/\\_/osg;
246 $title =~ s/\{//;
247 $title =~ s/\}\s*(\#.*)*\Z//;
248 $title =~ s/\<br\>|\<p\>/\n/osg;
249 $title =~ s/\n/ /osg;
250 $title =~ s/</\&lt;/g;
251 $title =~ s/>/\&gt;/g;
252
253 $title =~ s/\(/\\\(/g;
254 $title =~ s/\)/\\\)/g;
255
256 if (length($title) > 100) {
257 $title = substr ($title, 0, 100) . "...";
258 }
259 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
260
261 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
262 $macro_text = "";
263 }
264
265 }
266 elsif(($ta =~ m/\A(_\w+_)\s*/) && ($image_macro eq "false")) {
267
268 my $new_macro = $1;
269
270 my $new_cursection = $doc_obj->get_next_child($thissection);
271 $thissection = $doc_obj->insert_section("");
272
273 $new_macro =~ s/\_/\\_/osg;
274
275 $new_macro = $currpackage . "::" . $new_macro;
276
277 #print STDERR "$new_macro\n" if ($file =~ m/spanish/);
278
279 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $new_macro);
280
281 $macro_text = $ta;
282
283 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
284
285 $macro_text =~ s/\A(_\w+_)\s//;
286 $macro_text =~ s/\n/ /osg;
287
288 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
289
290 $doc_obj->add_utf8_text($thissection, $macro_text);
291
292 my ($title) = $macro_text;
293 $title =~ s/\{//;
294 $title =~ s/\}\s*(\#.*)*\Z//;
295 $title =~ s/\<br\>|\<p\>/\n/osg;
296 $title =~ s/\n/ /osg;
297 $title =~ s/</\&lt;/g;
298 $title =~ s/>/\&gt;/g;
299
300 if (length($title) > 100) {
301 $title = substr ($title, 0, 100) . "...";
302 }
303 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
304
305 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
306 }
307 else {
308 $norm_macro = "true";
309 }
310
311 $macro_text = "";
312 $macro_text .= $ta;
313 }
314
315 #line in format ## "sometext" ## macro ## macroname ##
316 elsif ($ta =~ m/^\#\# .*/) {
317 my $macroname = $ta;
318 $image_macro = "true";
319 $macro_text = "";
320
321 #NEED S.T LIKE THIS TO PICK UP HOARY ONES OVER 2 LINES
322 unless ($macroname =~ m/^\#\# .*\#\#/) {
323 $macroname = $ta;
324 chomp($macroname);
325 $macro_text .= $macroname;
326 $ta .= $textarray[++$k];
327 $macroname .= $ta;
328 }
329
330 my @names = split(/\s*\#\#\s*/, $macroname);
331 $macroname = $names[(scalar @names) - 1];
332 my $key = $currpackage . "::" . $macroname; # key to the hash and the database
333
334 my $new_cursection = $doc_obj->get_next_child($thissection);
335 $thissection = $doc_obj->insert_section("");
336
337 #print STDERR "$key \n" if ($file =~ m/(port)|(chin)|(engl)/);
338
339 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $key);
340
341 $macro_text = $ta . "<br>";
342
343 }
344 }
345
346 print $outhandle "done extracting macros\n"
347 if ($self->{'verbosity'}>3);
348
349}
350
351
352sub get_language_encoding_old {
353 my $self = shift (@_);
354 my ($filename) = @_;
355 my $outhandle = $self->{'outhandle'};
356
357 # read in file
358 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
359 undef $/;
360 my $text = <FILE>;
361 $/ = "\n";
362 close FILE;
363
364 # remove <title>stuff</title> -- as titles tend often to be in English
365 # for foreign language documents
366 $text =~ s/<title>.*?<\/title>//i;
367
368 # remove all HTML tags
369 $text =~ s/<[^>]*>//sg;
370 my $results = [];
371
372 # get the language/encoding
373 $results = $self->{'textcat'}->classify(\$text);
374
375 foreach my $r (@$results) {
376 print $outhandle "Results: $r\n";
377
378 }
379
380 # if textcat returns 3 or less possibilities we'll use the
381 # first one in the list - otherwise use the defaults
382 if (scalar @$results > 3) {
383
384 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang","package_forms",
385 "languages.log");
386 open (LANGFILE, "<$lang_fname") or die ("Unable to open $lang_fname: $!\n");
387
388 while (<LANGFILE>) {
389
390 my $line = $_;
391 chomp($line);
392
393 my $fn = $filename;
394 $fn =~ s/.*\/(.*)\..*/$1/;
395
396
397 if ($line eq $fn) {
398 print $outhandle "BINGO $line $fn\n";
399 $line = <LANGFILE>;
400 chomp($line);
401 print $outhandle "language code is $line\n";
402
403 foreach my $r (@$results) {
404 print $outhandle "MY1 $r\n";
405
406 my $roar = $& if($r =~ m/../);
407
408 if ($roar eq $line) {
409 print $outhandle "WE HAVE A WINNER $r\n";
410 }
411 }
412
413
414 }
415 else {
416 $line = <LANGFILE>;
417 }
418 }
419
420 close LANGFILE;
421
422 if ($self->{'input_encoding'} ne 'auto') {
423 if ($self->{'extract_language'} && $self->{'verbosity'}) {
424 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
425 print $outhandle "defaulting to $self->{'default_language'}\n";
426 }
427 return ($self->{'default_language'}, $self->{'input_encoding'});
428
429 } else {
430 if ($self->{'verbosity'}) {
431 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
432 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
433 }
434 return ($self->{'default_language'}, $self->{'default_encoding'});
435 }
436 }
437
438
439 # format language/encoding
440 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
441 if (!defined $language) {
442 if ($self->{'verbosity'}) {
443 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
444 print $outhandle "defaulting to $self->{'default_language'}\n";
445 }
446 $language = $self->{'default_language'};
447 }
448 if (!defined $encoding) {
449 if ($self->{'verbosity'}) {
450 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
451 print $outhandle "defaulting to $self->{'default_encoding'}\n";
452 }
453 $encoding = $self->{'default_encoding'};
454 }
455
456 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
457 !defined $encodings::encodings->{$encoding}) {
458 if ($self->{'verbosity'}) {
459 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
460 print $outhandle "using $self->{'default_encoding'}\n";
461 }
462 $encoding = $self->{'default_encoding'};
463 }
464
465 print STDERR "**** forcing encoding to be utf8\n";
466 $encoding = "utf8";
467
468 print STDERR "**** forcing language to be first two letters\n";
469 my $lfname = $filename;
470 $lfname =~ s/^.*\///;
471 $language = substr($lfname,0,2);
472
473 print $outhandle "RETURNING VALUES $language $encoding\n";
474
475 return ($language, $encoding);
476}
477
478
479sub find_language {
480 my ($self,$fn) = @_;
481
482 my $lang_table = $self->{'lang_abbr'};
483
484 if (!defined $lang_table->{$fn}) {
485
486 # try and find it with shorter string name
487
488 my $try_len = length($fn);
489
490 while ($try_len>=4) {
491 my $try_fn = substr($fn,0,$try_len);
492
493 if (defined $lang_table->{$try_fn}) {
494 $fn = $try_fn;
495 last;
496 }
497 $try_len--;
498 }
499 }
500
501 return $fn;
502}
503
504
505sub get_language_encoding {
506 my $self = shift (@_);
507 my ($filename) = @_;
508 my $outhandle = $self->{'outhandle'};
509
510 my $fn = $filename;
511 $fn =~ s/.*\/(.*)\..*/$1/;
512 $fn =~ s/\d+$//; # remove any digits from end of filename
513
514 my $language;
515 my $encoding = "utf8";
516
517 ## my $lang_lookup = $self->find_language($fn);
518
519 my $lang_table = $self->{'lang_abbr'};
520
521 if (!defined $lang_table->{$fn}) {
522
523 print $outhandle "Warning: Macro file name $filename not in list of languages.\n";
524 print $outhandle " Using default language.\n";
525 $language = $self->{'default_language'};
526 }
527 else {
528 $language = $lang_table->{$fn};
529 }
530
531 ## print $outhandle "Storing $filename as $language $encoding\n";
532
533 return ($language, $encoding);
534}
535
536
5371;
Note: See TracBrowser for help on using the repository browser.