source: trunk/gsdl/perllib/plugins/MACROPlug.pm@ 11090

Last change on this file since 11090 was 10347, checked in by kjdon, 19 years ago

removed the unneeded 'use parsargv'

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1###########################################################################
2#
3# MACROPlug plugin - to process .dm files for language translation
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MACROPlug;
27
28use BasPlug;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @MACROPlug::ISA = ('BasPlug');
35}
36
37my $arguments =
38 [ { 'name' => "process_exp",
39 'desc' => "{BasPlug.process_exp}",
40 'type' => "regexp",
41 'deft' => &get_default_process_exp(),
42 'reqd' => "no" } ];
43
44my $options = { 'name' => "MACROPlug",
45 'desc' => "{MACROPlug.desc}",
46 'abstract' => "no",
47 'inherits' => "yes",
48 'args' => $arguments };
49
50
51sub load_language_table
52{
53 my $lang_table = {};
54
55 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang",
56 "package_forms","languages.log");
57 open (LANGFILE, "<$lang_fname")
58 || die ("Unable to open $lang_fname: $!\n");
59
60 my $full_name;
61 my $abbr_name;
62
63 while (defined ($full_name=<LANGFILE>)) {
64 chomp($full_name);
65
66 $abbr_name = <LANGFILE>;
67 chomp($abbr_name);
68
69 $lang_table->{$full_name} = $abbr_name;
70
71 my $fourchar_name = substr($full_name,0,4);
72 if (!defined $lang_table->{$fourchar_name}) {
73 $lang_table->{$fourchar_name} = $abbr_name;
74 }
75 else {
76 print STDERR "Warning: Clash on four character abbreviation for language $fourchar_name\n";
77 }
78 }
79
80 close LANGFILE;
81
82 return $lang_table;
83}
84
85
86
87sub new {
88 my ($class) = shift (@_);
89 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
90 push(@$pluginlist, $class);
91
92 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
93 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
94
95 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
96
97 # $self->{'lang_abbr'} = load_language_table();
98
99 return bless $self, $class;
100}
101
102sub get_default_process_exp {
103 my $self = shift (@_);
104
105 return q^(?i)\.dm$^;
106}
107
108
109sub read {
110 my $self = shift (@_);
111 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
112
113 my $outhandle = $self->{'outhandle'};
114
115 my $lang_table = $self->{'lang_abbr'};
116 my $fn = $file;
117 $fn =~ s/.*\/(.*)\..*/$1/;
118 $fn =~ s/\d+$//; # remove any digits from end of filename
119
120 my $filename = $file;
121 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
122
123 if ((!-d $filename) && ($file !~ m/doc.xml$/) && (!defined $lang_table->{$fn})) {
124 print $outhandle "MACROPlug: blocking $file\n"
125 if $self->{'verbosity'} > 2;
126 $self->{'num_blocked'} ++;
127 return 0;
128 }
129
130 return $self->SUPER::read(@_);
131}
132
133
134# do plugin specific processing of doc_obj
135sub process {
136 my $self = shift (@_);
137 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
138 my $outhandle = $self->{'outhandle'};
139
140 print STDERR "<Processing n='$file' p='MACROPlug'>\n" if ($gli);
141 print $outhandle "MACROPlug: processing $file\n"
142 if $self->{'verbosity'} > 1;
143
144 my $thissection = $doc_obj->get_top_section();
145
146 $self->extract_macronames (\$textref, $doc_obj, $thissection, $file);
147 # we need to escape the escape character, or else mg will convert into
148 # eg literal newlines, instead of leaving the text as '\n'
149
150
151 my ($filemeta) = $file =~ /([^\\\/]+)$/;
152 $doc_obj->add_utf8_metadata($thissection, "Title", &ghtml::dmsafe($filemeta));
153
154 # FileFormat metadata
155 $doc_obj->add_metadata($thissection, "FileFormat", "MACRO");
156 $doc_obj->add_metadata($thissection, "FileSize", (-s $file));
157
158 return 1;
159}
160
161
162sub extract_macronames {
163
164 my $self = shift(@_);
165
166 my ($textref, $doc_obj, $thissection, $file) = @_;
167 my $outhandle = $self->{'outhandle'};
168
169 print $outhandle " extracting macronames ...\n"
170 if ($self->{'verbosity'}>3);
171
172 my @textarray = split ("\n", $$$textref)
173 if ($self->{'verbosity'}>3);
174
175 my $macro_text = "";
176 my $image_macro = "false";
177 my $norm_macro = "false";
178 my $currpackage = "";
179
180 #print STDERR "FILE@@@@@ $file\n";
181
182 #foreach my $ta (@textarray) {
183
184 for (my $k = 0; $k < scalar(@textarray); $k++) {
185
186 my $ta = $textarray[$k];
187
188 #print STDERR "$ta\n" if ($file =~ m/port/);
189
190 if ($ta =~ m/^package /) {
191 $currpackage = $ta;
192 chomp($currpackage);
193 $currpackage =~ s/^package //;
194 }
195
196
197 elsif ($image_macro eq "true") {
198 unless ($ta =~ m/\S+/) {
199 $image_macro = "false";
200
201 $macro_text =~ s/\_/\\_/osg; # for dm (we have a convention of starting macros with _
202
203 #print STDERR "$macro_text\n\n" if ($file =~ m/spanish/);
204
205 $doc_obj->add_utf8_text($thissection, $macro_text);
206
207 my @names = split(/\s*\#\#\s*/, $macro_text);
208
209 my ($title) = $names[1];
210
211 if (length($title) > 100) {
212 $title = substr ($title, 0, 100) . "...";
213 }
214
215 $title =~ s/</\&lt;/g;
216 $title =~ s/>/\&gt;/g;
217
218 #print STDERR "$title\n" if ($file =~ m/port/);
219
220 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
221 $macro_text = "";
222 }
223 $ta .= "<br>";
224 $macro_text .= $ta;
225
226 }
227
228 elsif ($norm_macro eq "true") {
229
230 $macro_text .= $ta;
231
232 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
233
234 $norm_macro = "false";
235
236 $macro_text =~ s/\A(_\w+_)\s//;
237
238 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
239
240 $doc_obj->add_utf8_text($thissection, $macro_text);
241
242 my ($title) = $macro_text;
243
244 $title =~ s/\_/\\_/osg;
245 $title =~ s/\{//;
246 $title =~ s/\}\s*(\#.*)*\Z//;
247 $title =~ s/\<br\>|\<p\>/\n/osg;
248 $title =~ s/\n/ /osg;
249 $title =~ s/</\&lt;/g;
250 $title =~ s/>/\&gt;/g;
251
252 $title =~ s/\(/\\\(/g;
253 $title =~ s/\)/\\\)/g;
254
255 if (length($title) > 100) {
256 $title = substr ($title, 0, 100) . "...";
257 }
258 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
259
260 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
261 $macro_text = "";
262 }
263
264 }
265 elsif(($ta =~ m/\A(_\w+_)\s*/) && ($image_macro eq "false")) {
266
267 my $new_macro = $1;
268
269 my $new_cursection = $doc_obj->get_next_child($thissection);
270 $thissection = $doc_obj->insert_section("");
271
272 $new_macro =~ s/\_/\\_/osg;
273
274 $new_macro = $currpackage . "::" . $new_macro;
275
276 #print STDERR "$new_macro\n" if ($file =~ m/spanish/);
277
278 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $new_macro);
279
280 $macro_text = $ta;
281
282 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
283
284 $macro_text =~ s/\A(_\w+_)\s//;
285 $macro_text =~ s/\n/ /osg;
286
287 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
288
289 $doc_obj->add_utf8_text($thissection, $macro_text);
290
291 my ($title) = $macro_text;
292 $title =~ s/\{//;
293 $title =~ s/\}\s*(\#.*)*\Z//;
294 $title =~ s/\<br\>|\<p\>/\n/osg;
295 $title =~ s/\n/ /osg;
296 $title =~ s/</\&lt;/g;
297 $title =~ s/>/\&gt;/g;
298
299 if (length($title) > 100) {
300 $title = substr ($title, 0, 100) . "...";
301 }
302 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
303
304 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
305 }
306 else {
307 $norm_macro = "true";
308 }
309
310 $macro_text = "";
311 $macro_text .= $ta;
312 }
313
314 #line in format ## "sometext" ## macro ## macroname ##
315 elsif ($ta =~ m/^\#\# .*/) {
316 my $macroname = $ta;
317 $image_macro = "true";
318 $macro_text = "";
319
320 #NEED S.T LIKE THIS TO PICK UP HOARY ONES OVER 2 LINES
321 unless ($macroname =~ m/^\#\# .*\#\#/) {
322 $macroname = $ta;
323 chomp($macroname);
324 $macro_text .= $macroname;
325 $ta .= $textarray[++$k];
326 $macroname .= $ta;
327 }
328
329 my @names = split(/\s*\#\#\s*/, $macroname);
330 $macroname = $names[(scalar @names) - 1];
331 my $key = $currpackage . "::" . $macroname; # key to the hash and the database
332
333 my $new_cursection = $doc_obj->get_next_child($thissection);
334 $thissection = $doc_obj->insert_section("");
335
336 #print STDERR "$key \n" if ($file =~ m/(port)|(chin)|(engl)/);
337
338 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $key);
339
340 $macro_text = $ta . "<br>";
341
342 }
343 }
344
345 print $outhandle "done extracting macros\n"
346 if ($self->{'verbosity'}>3);
347
348}
349
350
351sub get_language_encoding_old {
352 my $self = shift (@_);
353 my ($filename) = @_;
354 my $outhandle = $self->{'outhandle'};
355
356 # read in file
357 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
358 undef $/;
359 my $text = <FILE>;
360 $/ = "\n";
361 close FILE;
362
363 # remove <title>stuff</title> -- as titles tend often to be in English
364 # for foreign language documents
365 $text =~ s/<title>.*?<\/title>//i;
366
367 # remove all HTML tags
368 $text =~ s/<[^>]*>//sg;
369 my $results = [];
370
371 # get the language/encoding
372 $results = $self->{'textcat'}->classify(\$text);
373
374 foreach my $r (@$results) {
375 print $outhandle "Results: $r\n";
376
377 }
378
379 # if textcat returns 3 or less possibilities we'll use the
380 # first one in the list - otherwise use the defaults
381 if (scalar @$results > 3) {
382
383 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang","package_forms",
384 "languages.log");
385 open (LANGFILE, "<$lang_fname") or die ("Unable to open $lang_fname: $!\n");
386
387 while (<LANGFILE>) {
388
389 my $line = $_;
390 chomp($line);
391
392 my $fn = $filename;
393 $fn =~ s/.*\/(.*)\..*/$1/;
394
395
396 if ($line eq $fn) {
397 print $outhandle "BINGO $line $fn\n";
398 $line = <LANGFILE>;
399 chomp($line);
400 print $outhandle "language code is $line\n";
401
402 foreach my $r (@$results) {
403 print $outhandle "MY1 $r\n";
404
405 my $roar = $& if($r =~ m/../);
406
407 if ($roar eq $line) {
408 print $outhandle "WE HAVE A WINNER $r\n";
409 }
410 }
411
412
413 }
414 else {
415 $line = <LANGFILE>;
416 }
417 }
418
419 close LANGFILE;
420
421 if ($self->{'input_encoding'} ne 'auto') {
422 if ($self->{'extract_language'} && $self->{'verbosity'}) {
423 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
424 print $outhandle "defaulting to $self->{'default_language'}\n";
425 }
426 return ($self->{'default_language'}, $self->{'input_encoding'});
427
428 } else {
429 if ($self->{'verbosity'}) {
430 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
431 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
432 }
433 return ($self->{'default_language'}, $self->{'default_encoding'});
434 }
435 }
436
437
438 # format language/encoding
439 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
440 if (!defined $language) {
441 if ($self->{'verbosity'}) {
442 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
443 print $outhandle "defaulting to $self->{'default_language'}\n";
444 }
445 $language = $self->{'default_language'};
446 }
447 if (!defined $encoding) {
448 if ($self->{'verbosity'}) {
449 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
450 print $outhandle "defaulting to $self->{'default_encoding'}\n";
451 }
452 $encoding = $self->{'default_encoding'};
453 }
454
455 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
456 !defined $encodings::encodings->{$encoding}) {
457 if ($self->{'verbosity'}) {
458 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
459 print $outhandle "using $self->{'default_encoding'}\n";
460 }
461 $encoding = $self->{'default_encoding'};
462 }
463
464 print STDERR "**** forcing encoding to be utf8\n";
465 $encoding = "utf8";
466
467 print STDERR "**** forcing language to be first two letters\n";
468 my $lfname = $filename;
469 $lfname =~ s/^.*\///;
470 $language = substr($lfname,0,2);
471
472 print $outhandle "RETURNING VALUES $language $encoding\n";
473
474 return ($language, $encoding);
475}
476
477
478sub find_language {
479 my ($self,$fn) = @_;
480
481 my $lang_table = $self->{'lang_abbr'};
482
483 if (!defined $lang_table->{$fn}) {
484
485 # try and find it with shorter string name
486
487 my $try_len = length($fn);
488
489 while ($try_len>=4) {
490 my $try_fn = substr($fn,0,$try_len);
491
492 if (defined $lang_table->{$try_fn}) {
493 $fn = $try_fn;
494 last;
495 }
496 $try_len--;
497 }
498 }
499
500 return $fn;
501}
502
503
504sub get_language_encoding {
505 my $self = shift (@_);
506 my ($filename) = @_;
507 my $outhandle = $self->{'outhandle'};
508
509 my $fn = $filename;
510 $fn =~ s/.*\/(.*)\..*/$1/;
511 $fn =~ s/\d+$//; # remove any digits from end of filename
512
513 my $language;
514 my $encoding = "utf8";
515
516 ## my $lang_lookup = $self->find_language($fn);
517
518 my $lang_table = $self->{'lang_abbr'};
519
520 if (!defined $lang_table->{$fn}) {
521
522 print $outhandle "Warning: Macro file name $filename not in list of languages.\n";
523 print $outhandle " Using default language.\n";
524 $language = $self->{'default_language'};
525 }
526 else {
527 $language = $lang_table->{$fn};
528 }
529
530 ## print $outhandle "Storing $filename as $language $encoding\n";
531
532 return ($language, $encoding);
533}
534
535
5361;
Note: See TracBrowser for help on using the repository browser.