source: trunk/gsdl/perllib/plugins/MACROPlug.pm@ 6812

Last change on this file since 6812 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.5 KB
Line 
1###########################################################################
2#
3# MACROPlug plugin - to process .dm files for language translation
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MACROPlug;
27
28use BasPlug;
29use parsargv;
30
31
32sub BEGIN {
33 @ISA = ('BasPlug');
34}
35
36my $arguments =
37 [ { 'name' => "process_exp",
38 'desc' => "{BasPlug.process_exp}",
39 'type' => "regexp",
40 'deft' => &get_default_process_exp(),
41 'reqd' => "no" } ];
42
43my $options = { 'name' => "MACROPlug",
44 'desc' => "{MACROPlug.desc}",
45 'abstract' => "no",
46 'inherits' => "yes",
47 'args' => $arguments };
48
49# sub print_usage {
50# print STDERR "\n usage: plugin MACROPlug [options]\n\n";
51# print STDERR " options:\n";
52# print STDERR " -title_sub\t Substitution expression to modify string stored as Title.\n";
53# print STDERR "\t\t Used by, for example, PSPlug to remove \"Page 1\" etc from\n";
54# print STDERR "\t\t text used as the title.\n";
55
56# print STDERR "\n";
57# }
58
59
60sub load_language_table
61{
62 my $lang_table = {};
63
64 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang",
65 "package_forms","languages.log");
66 open (LANGFILE, "<$lang_fname")
67 || die ("Unable to open $lang_fname: $!\n");
68
69 my $full_name;
70 my $abbr_name;
71
72 while (defined ($full_name=<LANGFILE>)) {
73 chomp($full_name);
74
75 $abbr_name = <LANGFILE>;
76 chomp($abbr_name);
77
78 $lang_table->{$full_name} = $abbr_name;
79
80 my $fourchar_name = substr($full_name,0,4);
81 if (!defined $lang_table->{$fourchar_name}) {
82 $lang_table->{$fourchar_name} = $abbr_name;
83 }
84 else {
85 print STDERR "Warning: Clash on four character abbreviation for language $fourchar_name\n";
86 }
87 }
88
89 close LANGFILE;
90
91 return $lang_table;
92}
93
94
95
96sub new {
97 my ($class) = @_;
98 my $self = new BasPlug ($class, @_);
99
100 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
101 my $option_list = $self->{'option_list'};
102 push( @{$option_list}, $options );
103
104 # $self->{'lang_abbr'} = load_language_table();
105
106 return bless $self, $class;
107}
108
109sub get_default_process_exp {
110 my $self = shift (@_);
111
112 return q^(?i)\.dm$^;
113}
114
115
116sub read {
117 my $self = shift (@_);
118 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
119
120 my $outhandle = $self->{'outhandle'};
121
122 my $lang_table = $self->{'lang_abbr'};
123 my $fn = $file;
124 $fn =~ s/.*\/(.*)\..*/$1/;
125 $fn =~ s/\d+$//; # remove any digits from end of filename
126
127 my $filename = $file;
128 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
129
130 if ((!-d $filename) && ($file !~ m/doc.xml$/) && (!defined $lang_table->{$fn})) {
131 print $outhandle "MACROPlug: blocking $file\n"
132 if $self->{'verbosity'} > 2;
133 $self->{'num_blocked'} ++;
134 return 0;
135 }
136
137 return $self->SUPER::read(@_);
138}
139
140
141# do plugin specific processing of doc_obj
142sub process {
143 my $self = shift (@_);
144 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
145 my $outhandle = $self->{'outhandle'};
146
147 print STDERR "<Processing n='$file' p='MACROPlug'>\n" if ($gli);
148 print $outhandle "MACROPlug: processing $file\n"
149 if $self->{'verbosity'} > 1;
150
151 my $thissection = $doc_obj->get_top_section();
152
153 $self->extract_macronames (\$textref, $doc_obj, $thissection, $file);
154 # we need to escape the escape character, or else mg will convert into
155 # eg literal newlines, instead of leaving the text as '\n'
156
157
158 my ($filemeta) = $file =~ /([^\\\/]+)$/;
159 $doc_obj->add_utf8_metadata($thissection, "Title", &ghtml::dmsafe($filemeta));
160
161 return 1;
162}
163
164
165sub extract_macronames {
166
167 my $self = shift(@_);
168
169 my ($textref, $doc_obj, $thissection, $file) = @_;
170 my $outhandle = $self->{'outhandle'};
171
172 print $outhandle " extracting macronames ...\n"
173 if ($self->{'verbosity'}>3);
174
175 my @textarray = split ("\n", $$$textref)
176 if ($self->{'verbosity'}>3);
177
178 my $macro_text = "";
179 my $image_macro = "false";
180 my $norm_macro = "false";
181
182 #print STDERR "FILE@@@@@ $file\n";
183
184 #foreach my $ta (@textarray) {
185
186 for ($k = 0; $k < scalar(@textarray); $k++) {
187
188 $ta = $textarray[$k];
189
190 #print STDERR "$ta\n" if ($file =~ m/port/);
191
192 if ($ta =~ m/^package /) {
193 $currpackage = $ta;
194 chomp($currpackage);
195 $currpackage =~ s/^package //;
196 }
197
198
199 elsif ($image_macro eq "true") {
200 unless ($ta =~ m/\S+/) {
201 $image_macro = "false";
202
203 $macro_text =~ s/\_/\\_/osg; # for dm (we have a convention of starting macros with _
204
205 #print STDERR "$macro_text\n\n" if ($file =~ m/spanish/);
206
207 $doc_obj->add_utf8_text($thissection, $macro_text);
208
209 my @names = split(/\s*\#\#\s*/, $macro_text);
210
211 my ($title) = $names[1];
212
213 if (length($title) > 100) {
214 $title = substr ($title, 0, 100) . "...";
215 }
216
217 $title =~ s/</\&lt;/g;
218 $title =~ s/>/\&gt;/g;
219
220 #print STDERR "$title\n" if ($file =~ m/port/);
221
222 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
223 $macro_text = "";
224 }
225 $ta .= "<br>";
226 $macro_text .= $ta;
227
228 }
229
230 elsif ($norm_macro eq "true") {
231
232 $macro_text .= $ta;
233
234 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
235
236 $norm_macro = "false";
237
238 $macro_text =~ s/\A(_\w+_)\s//;
239
240 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
241
242 $doc_obj->add_utf8_text($thissection, $macro_text);
243
244 my ($title) = $macro_text;
245
246 $title =~ s/\_/\\_/osg;
247 $title =~ s/\{//;
248 $title =~ s/\}\s*(\#.*)*\Z//;
249 $title =~ s/\<br\>|\<p\>/\n/osg;
250 $title =~ s/\n/ /osg;
251 $title =~ s/</\&lt;/g;
252 $title =~ s/>/\&gt;/g;
253
254 $title =~ s/\(/\\\(/g;
255 $title =~ s/\)/\\\)/g;
256
257 if (length($title) > 100) {
258 $title = substr ($title, 0, 100) . "...";
259 }
260 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
261
262 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
263 $macro_text = "";
264 }
265
266 }
267 elsif(($ta =~ m/\A(_\w+_)\s*/) && ($image_macro eq "false")) {
268
269 my $new_macro = $1;
270
271 my $new_cursection = $doc_obj->get_next_child($thissection);
272 $thissection = $doc_obj->insert_section("");
273
274 $new_macro =~ s/\_/\\_/osg;
275
276 $new_macro = $currpackage . "::" . $new_macro;
277
278 #print STDERR "$new_macro\n" if ($file =~ m/spanish/);
279
280 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $new_macro);
281
282 $macro_text = $ta;
283
284 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
285
286 $macro_text =~ s/\A(_\w+_)\s//;
287 $macro_text =~ s/\n/ /osg;
288
289 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
290
291 $doc_obj->add_utf8_text($thissection, $macro_text);
292
293 my ($title) = $macro_text;
294 $title =~ s/\{//;
295 $title =~ s/\}\s*(\#.*)*\Z//;
296 $title =~ s/\<br\>|\<p\>/\n/osg;
297 $title =~ s/\n/ /osg;
298 $title =~ s/</\&lt;/g;
299 $title =~ s/>/\&gt;/g;
300
301 if (length($title) > 100) {
302 $title = substr ($title, 0, 100) . "...";
303 }
304 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
305
306 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
307 }
308 else {
309 $norm_macro = "true";
310 }
311
312 $macro_text = "";
313 $macro_text .= $ta;
314 }
315
316 #line in format ## "sometext" ## macro ## macroname ##
317 elsif ($ta =~ m/^\#\# .*/) {
318 my $macroname = $ta;
319 $image_macro = "true";
320 $macro_text = "";
321
322 #NEED S.T LIKE THIS TO PICK UP HOARY ONES OVER 2 LINES
323 unless ($macroname =~ m/^\#\# .*\#\#/) {
324 $macroname = $ta;
325 chomp($macroname);
326 $macro_text .= $macroname;
327 $ta .= $textarray[++$k];
328 $macroname .= $ta;
329 }
330
331 my @names = split(/\s*\#\#\s*/, $macroname);
332 $macroname = $names[(scalar @names) - 1];
333 my $key = $currpackage . "::" . $macroname; # key to the hash and the database
334
335 my $new_cursection = $doc_obj->get_next_child($thissection);
336 $thissection = $doc_obj->insert_section("");
337
338 #print STDERR "$key \n" if ($file =~ m/(port)|(chin)|(engl)/);
339
340 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $key);
341
342 $macro_text = $ta . "<br>";
343
344 }
345 }
346
347 print $outhandle "done extracting macros\n"
348 if ($self->{'verbosity'}>3);
349
350}
351
352
353sub get_language_encoding_old {
354 my $self = shift (@_);
355 my ($filename) = @_;
356 my $outhandle = $self->{'outhandle'};
357
358 # read in file
359 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
360 undef $/;
361 my $text = <FILE>;
362 $/ = "\n";
363 close FILE;
364
365 # remove <title>stuff</title> -- as titles tend often to be in English
366 # for foreign language documents
367 $text =~ s/<title>.*?<\/title>//i;
368
369 # remove all HTML tags
370 $text =~ s/<[^>]*>//sg;
371 my $results = [];
372
373 # get the language/encoding
374 $results = $self->{'textcat'}->classify(\$text);
375
376 foreach $r (@$results) {
377 print $outhandle "Results: $r\n";
378
379 }
380
381 # if textcat returns 3 or less possibilities we'll use the
382 # first one in the list - otherwise use the defaults
383 if (scalar @$results > 3) {
384
385 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang","package_forms",
386 "languages.log");
387 open (LANGFILE, "<$lang_fname") or die ("Unable to open $lang_fname: $!\n");
388
389 while (<LANGFILE>) {
390
391 $line = $_;
392 chomp($line);
393
394 $fn = $filename;
395 $fn =~ s/.*\/(.*)\..*/$1/;
396
397
398 if ($line eq $fn) {
399 print $outhandle "BINGO $line $fn\n";
400 $line = <LANGFILE>;
401 chomp($line);
402 print $outhandle "language code is $line\n";
403
404 foreach $r (@$results) {
405 print $outhandle "MY1 $r\n";
406
407 $roar = $& if($r =~ m/../);
408
409 if ($roar eq $line) {
410 print $outhandle "WE HAVE A WINNER $r\n";
411 }
412 }
413
414
415 }
416 else {
417 $line = <LANGFILE>;
418 }
419 }
420
421 close LANGFILE;
422
423 if ($self->{'input_encoding'} ne 'auto') {
424 if ($self->{'extract_language'} && $self->{'verbosity'}) {
425 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
426 print $outhandle "defaulting to $self->{'default_language'}\n";
427 }
428 return ($self->{'default_language'}, $self->{'input_encoding'});
429
430 } else {
431 if ($self->{'verbosity'}) {
432 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
433 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
434 }
435 return ($self->{'default_language'}, $self->{'default_encoding'});
436 }
437 }
438
439
440 # format language/encoding
441 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
442 if (!defined $language) {
443 if ($self->{'verbosity'}) {
444 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
445 print $outhandle "defaulting to $self->{'default_language'}\n";
446 }
447 $language = $self->{'default_language'};
448 }
449 if (!defined $encoding) {
450 if ($self->{'verbosity'}) {
451 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
452 print $outhandle "defaulting to $self->{'default_encoding'}\n";
453 }
454 $encoding = $self->{'default_encoding'};
455 }
456
457 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
458 !defined $encodings::encodings->{$encoding}) {
459 if ($self->{'verbosity'}) {
460 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
461 print $outhandle "using $self->{'default_encoding'}\n";
462 }
463 $encoding = $self->{'default_encoding'};
464 }
465
466 print STDERR "**** forcing encoding to be utf8\n";
467 $encoding = "utf8";
468
469 print STDERR "**** forcing language to be first two letters\n";
470 my $lfname = $filename;
471 $lfname =~ s/^.*\///;
472 $language = substr($lfname,0,2);
473
474 print $outhandle "RETURNING VALUES $language $encoding\n";
475
476 return ($language, $encoding);
477}
478
479
480sub find_language {
481 my ($self,$fn) = @_;
482
483 my $lang_table = $self->{'lang_abbr'};
484
485 if (!defined $lang_table->{$fn}) {
486
487 # try and find it with shorter string name
488
489 my $try_len = length($fn);
490
491 while ($try_len>=4) {
492 $try_fn = substr($fn,0,$try_len);
493
494 if (defined $lang_table->{$try_fn}) {
495 $fn = $try_fn;
496 last;
497 }
498 $try_len--;
499 }
500 }
501
502 return $fn;
503}
504
505
506sub get_language_encoding {
507 my $self = shift (@_);
508 my ($filename) = @_;
509 my $outhandle = $self->{'outhandle'};
510
511 my $fn = $filename;
512 $fn =~ s/.*\/(.*)\..*/$1/;
513 $fn =~ s/\d+$//; # remove any digits from end of filename
514
515 my $languge;
516 my $encoding = "utf8";
517
518 ## my $lang_lookup = $self->find_language($fn);
519
520 my $lang_table = $self->{'lang_abbr'};
521
522 if (!defined $lang_table->{$fn}) {
523
524 print $outhandle "Warning: Macro file name $filename not in list of languages.\n";
525 print $outhandle " Using default language.\n";
526 $language = $self->{'default_language'};
527 }
528 else {
529 $language = $lang_table->{$fn};
530 }
531
532 ## print $outhandle "Storing $filename as $language $encoding\n";
533
534 return ($language, $encoding);
535}
536
537
5381;
Note: See TracBrowser for help on using the repository browser.