source: trunk/gsdl/perllib/plugins/MACROPlug.pm@ 8121

Last change on this file since 8121 was 8121, checked in by chi, 20 years ago

Add the "FileFormat" metadata to each of the Plugins.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.2 KB
Line 
1###########################################################################
2#
3# MACROPlug plugin - to process .dm files for language translation
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MACROPlug;
27
28use BasPlug;
29use parsargv;
30
31
32sub BEGIN {
33 @ISA = ('BasPlug');
34}
35
36my $arguments =
37 [ { 'name' => "process_exp",
38 'desc' => "{BasPlug.process_exp}",
39 'type' => "regexp",
40 'deft' => &get_default_process_exp(),
41 'reqd' => "no" } ];
42
43my $options = { 'name' => "MACROPlug",
44 'desc' => "{MACROPlug.desc}",
45 'abstract' => "no",
46 'inherits' => "yes",
47 'args' => $arguments };
48
49
50sub load_language_table
51{
52 my $lang_table = {};
53
54 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang",
55 "package_forms","languages.log");
56 open (LANGFILE, "<$lang_fname")
57 || die ("Unable to open $lang_fname: $!\n");
58
59 my $full_name;
60 my $abbr_name;
61
62 while (defined ($full_name=<LANGFILE>)) {
63 chomp($full_name);
64
65 $abbr_name = <LANGFILE>;
66 chomp($abbr_name);
67
68 $lang_table->{$full_name} = $abbr_name;
69
70 my $fourchar_name = substr($full_name,0,4);
71 if (!defined $lang_table->{$fourchar_name}) {
72 $lang_table->{$fourchar_name} = $abbr_name;
73 }
74 else {
75 print STDERR "Warning: Clash on four character abbreviation for language $fourchar_name\n";
76 }
77 }
78
79 close LANGFILE;
80
81 return $lang_table;
82}
83
84
85
86sub new {
87 my ($class) = @_;
88 my $self = new BasPlug ($class, @_);
89
90 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
91 my $option_list = $self->{'option_list'};
92 push( @{$option_list}, $options );
93
94 # $self->{'lang_abbr'} = load_language_table();
95
96 return bless $self, $class;
97}
98
99sub get_default_process_exp {
100 my $self = shift (@_);
101
102 return q^(?i)\.dm$^;
103}
104
105
106sub read {
107 my $self = shift (@_);
108 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
109
110 my $outhandle = $self->{'outhandle'};
111
112 my $lang_table = $self->{'lang_abbr'};
113 my $fn = $file;
114 $fn =~ s/.*\/(.*)\..*/$1/;
115 $fn =~ s/\d+$//; # remove any digits from end of filename
116
117 my $filename = $file;
118 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
119
120 if ((!-d $filename) && ($file !~ m/doc.xml$/) && (!defined $lang_table->{$fn})) {
121 print $outhandle "MACROPlug: blocking $file\n"
122 if $self->{'verbosity'} > 2;
123 $self->{'num_blocked'} ++;
124 return 0;
125 }
126
127 return $self->SUPER::read(@_);
128}
129
130
131# do plugin specific processing of doc_obj
132sub process {
133 my $self = shift (@_);
134 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
135 my $outhandle = $self->{'outhandle'};
136
137 print STDERR "<Processing n='$file' p='MACROPlug'>\n" if ($gli);
138 print $outhandle "MACROPlug: processing $file\n"
139 if $self->{'verbosity'} > 1;
140
141 my $thissection = $doc_obj->get_top_section();
142
143 $self->extract_macronames (\$textref, $doc_obj, $thissection, $file);
144 # we need to escape the escape character, or else mg will convert into
145 # eg literal newlines, instead of leaving the text as '\n'
146
147
148 my ($filemeta) = $file =~ /([^\\\/]+)$/;
149 $doc_obj->add_utf8_metadata($thissection, "Title", &ghtml::dmsafe($filemeta));
150
151 # FileFormat metadata
152 $doc_obj->add_metadata($thissection, "FileFormat", "MACRO");
153
154 return 1;
155}
156
157
158sub extract_macronames {
159
160 my $self = shift(@_);
161
162 my ($textref, $doc_obj, $thissection, $file) = @_;
163 my $outhandle = $self->{'outhandle'};
164
165 print $outhandle " extracting macronames ...\n"
166 if ($self->{'verbosity'}>3);
167
168 my @textarray = split ("\n", $$$textref)
169 if ($self->{'verbosity'}>3);
170
171 my $macro_text = "";
172 my $image_macro = "false";
173 my $norm_macro = "false";
174
175 #print STDERR "FILE@@@@@ $file\n";
176
177 #foreach my $ta (@textarray) {
178
179 for ($k = 0; $k < scalar(@textarray); $k++) {
180
181 $ta = $textarray[$k];
182
183 #print STDERR "$ta\n" if ($file =~ m/port/);
184
185 if ($ta =~ m/^package /) {
186 $currpackage = $ta;
187 chomp($currpackage);
188 $currpackage =~ s/^package //;
189 }
190
191
192 elsif ($image_macro eq "true") {
193 unless ($ta =~ m/\S+/) {
194 $image_macro = "false";
195
196 $macro_text =~ s/\_/\\_/osg; # for dm (we have a convention of starting macros with _
197
198 #print STDERR "$macro_text\n\n" if ($file =~ m/spanish/);
199
200 $doc_obj->add_utf8_text($thissection, $macro_text);
201
202 my @names = split(/\s*\#\#\s*/, $macro_text);
203
204 my ($title) = $names[1];
205
206 if (length($title) > 100) {
207 $title = substr ($title, 0, 100) . "...";
208 }
209
210 $title =~ s/</\&lt;/g;
211 $title =~ s/>/\&gt;/g;
212
213 #print STDERR "$title\n" if ($file =~ m/port/);
214
215 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
216 $macro_text = "";
217 }
218 $ta .= "<br>";
219 $macro_text .= $ta;
220
221 }
222
223 elsif ($norm_macro eq "true") {
224
225 $macro_text .= $ta;
226
227 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
228
229 $norm_macro = "false";
230
231 $macro_text =~ s/\A(_\w+_)\s//;
232
233 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
234
235 $doc_obj->add_utf8_text($thissection, $macro_text);
236
237 my ($title) = $macro_text;
238
239 $title =~ s/\_/\\_/osg;
240 $title =~ s/\{//;
241 $title =~ s/\}\s*(\#.*)*\Z//;
242 $title =~ s/\<br\>|\<p\>/\n/osg;
243 $title =~ s/\n/ /osg;
244 $title =~ s/</\&lt;/g;
245 $title =~ s/>/\&gt;/g;
246
247 $title =~ s/\(/\\\(/g;
248 $title =~ s/\)/\\\)/g;
249
250 if (length($title) > 100) {
251 $title = substr ($title, 0, 100) . "...";
252 }
253 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
254
255 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
256 $macro_text = "";
257 }
258
259 }
260 elsif(($ta =~ m/\A(_\w+_)\s*/) && ($image_macro eq "false")) {
261
262 my $new_macro = $1;
263
264 my $new_cursection = $doc_obj->get_next_child($thissection);
265 $thissection = $doc_obj->insert_section("");
266
267 $new_macro =~ s/\_/\\_/osg;
268
269 $new_macro = $currpackage . "::" . $new_macro;
270
271 #print STDERR "$new_macro\n" if ($file =~ m/spanish/);
272
273 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $new_macro);
274
275 $macro_text = $ta;
276
277 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
278
279 $macro_text =~ s/\A(_\w+_)\s//;
280 $macro_text =~ s/\n/ /osg;
281
282 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
283
284 $doc_obj->add_utf8_text($thissection, $macro_text);
285
286 my ($title) = $macro_text;
287 $title =~ s/\{//;
288 $title =~ s/\}\s*(\#.*)*\Z//;
289 $title =~ s/\<br\>|\<p\>/\n/osg;
290 $title =~ s/\n/ /osg;
291 $title =~ s/</\&lt;/g;
292 $title =~ s/>/\&gt;/g;
293
294 if (length($title) > 100) {
295 $title = substr ($title, 0, 100) . "...";
296 }
297 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
298
299 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
300 }
301 else {
302 $norm_macro = "true";
303 }
304
305 $macro_text = "";
306 $macro_text .= $ta;
307 }
308
309 #line in format ## "sometext" ## macro ## macroname ##
310 elsif ($ta =~ m/^\#\# .*/) {
311 my $macroname = $ta;
312 $image_macro = "true";
313 $macro_text = "";
314
315 #NEED S.T LIKE THIS TO PICK UP HOARY ONES OVER 2 LINES
316 unless ($macroname =~ m/^\#\# .*\#\#/) {
317 $macroname = $ta;
318 chomp($macroname);
319 $macro_text .= $macroname;
320 $ta .= $textarray[++$k];
321 $macroname .= $ta;
322 }
323
324 my @names = split(/\s*\#\#\s*/, $macroname);
325 $macroname = $names[(scalar @names) - 1];
326 my $key = $currpackage . "::" . $macroname; # key to the hash and the database
327
328 my $new_cursection = $doc_obj->get_next_child($thissection);
329 $thissection = $doc_obj->insert_section("");
330
331 #print STDERR "$key \n" if ($file =~ m/(port)|(chin)|(engl)/);
332
333 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $key);
334
335 $macro_text = $ta . "<br>";
336
337 }
338 }
339
340 print $outhandle "done extracting macros\n"
341 if ($self->{'verbosity'}>3);
342
343}
344
345
346sub get_language_encoding_old {
347 my $self = shift (@_);
348 my ($filename) = @_;
349 my $outhandle = $self->{'outhandle'};
350
351 # read in file
352 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
353 undef $/;
354 my $text = <FILE>;
355 $/ = "\n";
356 close FILE;
357
358 # remove <title>stuff</title> -- as titles tend often to be in English
359 # for foreign language documents
360 $text =~ s/<title>.*?<\/title>//i;
361
362 # remove all HTML tags
363 $text =~ s/<[^>]*>//sg;
364 my $results = [];
365
366 # get the language/encoding
367 $results = $self->{'textcat'}->classify(\$text);
368
369 foreach $r (@$results) {
370 print $outhandle "Results: $r\n";
371
372 }
373
374 # if textcat returns 3 or less possibilities we'll use the
375 # first one in the list - otherwise use the defaults
376 if (scalar @$results > 3) {
377
378 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang","package_forms",
379 "languages.log");
380 open (LANGFILE, "<$lang_fname") or die ("Unable to open $lang_fname: $!\n");
381
382 while (<LANGFILE>) {
383
384 $line = $_;
385 chomp($line);
386
387 $fn = $filename;
388 $fn =~ s/.*\/(.*)\..*/$1/;
389
390
391 if ($line eq $fn) {
392 print $outhandle "BINGO $line $fn\n";
393 $line = <LANGFILE>;
394 chomp($line);
395 print $outhandle "language code is $line\n";
396
397 foreach $r (@$results) {
398 print $outhandle "MY1 $r\n";
399
400 $roar = $& if($r =~ m/../);
401
402 if ($roar eq $line) {
403 print $outhandle "WE HAVE A WINNER $r\n";
404 }
405 }
406
407
408 }
409 else {
410 $line = <LANGFILE>;
411 }
412 }
413
414 close LANGFILE;
415
416 if ($self->{'input_encoding'} ne 'auto') {
417 if ($self->{'extract_language'} && $self->{'verbosity'}) {
418 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
419 print $outhandle "defaulting to $self->{'default_language'}\n";
420 }
421 return ($self->{'default_language'}, $self->{'input_encoding'});
422
423 } else {
424 if ($self->{'verbosity'}) {
425 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
426 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
427 }
428 return ($self->{'default_language'}, $self->{'default_encoding'});
429 }
430 }
431
432
433 # format language/encoding
434 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
435 if (!defined $language) {
436 if ($self->{'verbosity'}) {
437 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
438 print $outhandle "defaulting to $self->{'default_language'}\n";
439 }
440 $language = $self->{'default_language'};
441 }
442 if (!defined $encoding) {
443 if ($self->{'verbosity'}) {
444 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
445 print $outhandle "defaulting to $self->{'default_encoding'}\n";
446 }
447 $encoding = $self->{'default_encoding'};
448 }
449
450 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
451 !defined $encodings::encodings->{$encoding}) {
452 if ($self->{'verbosity'}) {
453 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
454 print $outhandle "using $self->{'default_encoding'}\n";
455 }
456 $encoding = $self->{'default_encoding'};
457 }
458
459 print STDERR "**** forcing encoding to be utf8\n";
460 $encoding = "utf8";
461
462 print STDERR "**** forcing language to be first two letters\n";
463 my $lfname = $filename;
464 $lfname =~ s/^.*\///;
465 $language = substr($lfname,0,2);
466
467 print $outhandle "RETURNING VALUES $language $encoding\n";
468
469 return ($language, $encoding);
470}
471
472
473sub find_language {
474 my ($self,$fn) = @_;
475
476 my $lang_table = $self->{'lang_abbr'};
477
478 if (!defined $lang_table->{$fn}) {
479
480 # try and find it with shorter string name
481
482 my $try_len = length($fn);
483
484 while ($try_len>=4) {
485 $try_fn = substr($fn,0,$try_len);
486
487 if (defined $lang_table->{$try_fn}) {
488 $fn = $try_fn;
489 last;
490 }
491 $try_len--;
492 }
493 }
494
495 return $fn;
496}
497
498
499sub get_language_encoding {
500 my $self = shift (@_);
501 my ($filename) = @_;
502 my $outhandle = $self->{'outhandle'};
503
504 my $fn = $filename;
505 $fn =~ s/.*\/(.*)\..*/$1/;
506 $fn =~ s/\d+$//; # remove any digits from end of filename
507
508 my $languge;
509 my $encoding = "utf8";
510
511 ## my $lang_lookup = $self->find_language($fn);
512
513 my $lang_table = $self->{'lang_abbr'};
514
515 if (!defined $lang_table->{$fn}) {
516
517 print $outhandle "Warning: Macro file name $filename not in list of languages.\n";
518 print $outhandle " Using default language.\n";
519 $language = $self->{'default_language'};
520 }
521 else {
522 $language = $lang_table->{$fn};
523 }
524
525 ## print $outhandle "Storing $filename as $language $encoding\n";
526
527 return ($language, $encoding);
528}
529
530
5311;
Note: See TracBrowser for help on using the repository browser.