source: trunk/gsdl/perllib/plugins/MACROPlug.pm@ 7830

Last change on this file since 7830 was 7353, checked in by kjdon, 20 years ago

removed all the old print_usage functions - they may be misleading as options change but these dont

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.1 KB
Line 
1###########################################################################
2#
3# MACROPlug plugin - to process .dm files for language translation
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MACROPlug;
27
28use BasPlug;
29use parsargv;
30
31
32sub BEGIN {
33 @ISA = ('BasPlug');
34}
35
36my $arguments =
37 [ { 'name' => "process_exp",
38 'desc' => "{BasPlug.process_exp}",
39 'type' => "regexp",
40 'deft' => &get_default_process_exp(),
41 'reqd' => "no" } ];
42
43my $options = { 'name' => "MACROPlug",
44 'desc' => "{MACROPlug.desc}",
45 'abstract' => "no",
46 'inherits' => "yes",
47 'args' => $arguments };
48
49
50sub load_language_table
51{
52 my $lang_table = {};
53
54 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang",
55 "package_forms","languages.log");
56 open (LANGFILE, "<$lang_fname")
57 || die ("Unable to open $lang_fname: $!\n");
58
59 my $full_name;
60 my $abbr_name;
61
62 while (defined ($full_name=<LANGFILE>)) {
63 chomp($full_name);
64
65 $abbr_name = <LANGFILE>;
66 chomp($abbr_name);
67
68 $lang_table->{$full_name} = $abbr_name;
69
70 my $fourchar_name = substr($full_name,0,4);
71 if (!defined $lang_table->{$fourchar_name}) {
72 $lang_table->{$fourchar_name} = $abbr_name;
73 }
74 else {
75 print STDERR "Warning: Clash on four character abbreviation for language $fourchar_name\n";
76 }
77 }
78
79 close LANGFILE;
80
81 return $lang_table;
82}
83
84
85
86sub new {
87 my ($class) = @_;
88 my $self = new BasPlug ($class, @_);
89
90 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
91 my $option_list = $self->{'option_list'};
92 push( @{$option_list}, $options );
93
94 # $self->{'lang_abbr'} = load_language_table();
95
96 return bless $self, $class;
97}
98
99sub get_default_process_exp {
100 my $self = shift (@_);
101
102 return q^(?i)\.dm$^;
103}
104
105
106sub read {
107 my $self = shift (@_);
108 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
109
110 my $outhandle = $self->{'outhandle'};
111
112 my $lang_table = $self->{'lang_abbr'};
113 my $fn = $file;
114 $fn =~ s/.*\/(.*)\..*/$1/;
115 $fn =~ s/\d+$//; # remove any digits from end of filename
116
117 my $filename = $file;
118 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
119
120 if ((!-d $filename) && ($file !~ m/doc.xml$/) && (!defined $lang_table->{$fn})) {
121 print $outhandle "MACROPlug: blocking $file\n"
122 if $self->{'verbosity'} > 2;
123 $self->{'num_blocked'} ++;
124 return 0;
125 }
126
127 return $self->SUPER::read(@_);
128}
129
130
131# do plugin specific processing of doc_obj
132sub process {
133 my $self = shift (@_);
134 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
135 my $outhandle = $self->{'outhandle'};
136
137 print STDERR "<Processing n='$file' p='MACROPlug'>\n" if ($gli);
138 print $outhandle "MACROPlug: processing $file\n"
139 if $self->{'verbosity'} > 1;
140
141 my $thissection = $doc_obj->get_top_section();
142
143 $self->extract_macronames (\$textref, $doc_obj, $thissection, $file);
144 # we need to escape the escape character, or else mg will convert into
145 # eg literal newlines, instead of leaving the text as '\n'
146
147
148 my ($filemeta) = $file =~ /([^\\\/]+)$/;
149 $doc_obj->add_utf8_metadata($thissection, "Title", &ghtml::dmsafe($filemeta));
150
151 return 1;
152}
153
154
155sub extract_macronames {
156
157 my $self = shift(@_);
158
159 my ($textref, $doc_obj, $thissection, $file) = @_;
160 my $outhandle = $self->{'outhandle'};
161
162 print $outhandle " extracting macronames ...\n"
163 if ($self->{'verbosity'}>3);
164
165 my @textarray = split ("\n", $$$textref)
166 if ($self->{'verbosity'}>3);
167
168 my $macro_text = "";
169 my $image_macro = "false";
170 my $norm_macro = "false";
171
172 #print STDERR "FILE@@@@@ $file\n";
173
174 #foreach my $ta (@textarray) {
175
176 for ($k = 0; $k < scalar(@textarray); $k++) {
177
178 $ta = $textarray[$k];
179
180 #print STDERR "$ta\n" if ($file =~ m/port/);
181
182 if ($ta =~ m/^package /) {
183 $currpackage = $ta;
184 chomp($currpackage);
185 $currpackage =~ s/^package //;
186 }
187
188
189 elsif ($image_macro eq "true") {
190 unless ($ta =~ m/\S+/) {
191 $image_macro = "false";
192
193 $macro_text =~ s/\_/\\_/osg; # for dm (we have a convention of starting macros with _
194
195 #print STDERR "$macro_text\n\n" if ($file =~ m/spanish/);
196
197 $doc_obj->add_utf8_text($thissection, $macro_text);
198
199 my @names = split(/\s*\#\#\s*/, $macro_text);
200
201 my ($title) = $names[1];
202
203 if (length($title) > 100) {
204 $title = substr ($title, 0, 100) . "...";
205 }
206
207 $title =~ s/</\&lt;/g;
208 $title =~ s/>/\&gt;/g;
209
210 #print STDERR "$title\n" if ($file =~ m/port/);
211
212 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
213 $macro_text = "";
214 }
215 $ta .= "<br>";
216 $macro_text .= $ta;
217
218 }
219
220 elsif ($norm_macro eq "true") {
221
222 $macro_text .= $ta;
223
224 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
225
226 $norm_macro = "false";
227
228 $macro_text =~ s/\A(_\w+_)\s//;
229
230 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
231
232 $doc_obj->add_utf8_text($thissection, $macro_text);
233
234 my ($title) = $macro_text;
235
236 $title =~ s/\_/\\_/osg;
237 $title =~ s/\{//;
238 $title =~ s/\}\s*(\#.*)*\Z//;
239 $title =~ s/\<br\>|\<p\>/\n/osg;
240 $title =~ s/\n/ /osg;
241 $title =~ s/</\&lt;/g;
242 $title =~ s/>/\&gt;/g;
243
244 $title =~ s/\(/\\\(/g;
245 $title =~ s/\)/\\\)/g;
246
247 if (length($title) > 100) {
248 $title = substr ($title, 0, 100) . "...";
249 }
250 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
251
252 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
253 $macro_text = "";
254 }
255
256 }
257 elsif(($ta =~ m/\A(_\w+_)\s*/) && ($image_macro eq "false")) {
258
259 my $new_macro = $1;
260
261 my $new_cursection = $doc_obj->get_next_child($thissection);
262 $thissection = $doc_obj->insert_section("");
263
264 $new_macro =~ s/\_/\\_/osg;
265
266 $new_macro = $currpackage . "::" . $new_macro;
267
268 #print STDERR "$new_macro\n" if ($file =~ m/spanish/);
269
270 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $new_macro);
271
272 $macro_text = $ta;
273
274 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
275
276 $macro_text =~ s/\A(_\w+_)\s//;
277 $macro_text =~ s/\n/ /osg;
278
279 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
280
281 $doc_obj->add_utf8_text($thissection, $macro_text);
282
283 my ($title) = $macro_text;
284 $title =~ s/\{//;
285 $title =~ s/\}\s*(\#.*)*\Z//;
286 $title =~ s/\<br\>|\<p\>/\n/osg;
287 $title =~ s/\n/ /osg;
288 $title =~ s/</\&lt;/g;
289 $title =~ s/>/\&gt;/g;
290
291 if (length($title) > 100) {
292 $title = substr ($title, 0, 100) . "...";
293 }
294 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
295
296 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
297 }
298 else {
299 $norm_macro = "true";
300 }
301
302 $macro_text = "";
303 $macro_text .= $ta;
304 }
305
306 #line in format ## "sometext" ## macro ## macroname ##
307 elsif ($ta =~ m/^\#\# .*/) {
308 my $macroname = $ta;
309 $image_macro = "true";
310 $macro_text = "";
311
312 #NEED S.T LIKE THIS TO PICK UP HOARY ONES OVER 2 LINES
313 unless ($macroname =~ m/^\#\# .*\#\#/) {
314 $macroname = $ta;
315 chomp($macroname);
316 $macro_text .= $macroname;
317 $ta .= $textarray[++$k];
318 $macroname .= $ta;
319 }
320
321 my @names = split(/\s*\#\#\s*/, $macroname);
322 $macroname = $names[(scalar @names) - 1];
323 my $key = $currpackage . "::" . $macroname; # key to the hash and the database
324
325 my $new_cursection = $doc_obj->get_next_child($thissection);
326 $thissection = $doc_obj->insert_section("");
327
328 #print STDERR "$key \n" if ($file =~ m/(port)|(chin)|(engl)/);
329
330 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $key);
331
332 $macro_text = $ta . "<br>";
333
334 }
335 }
336
337 print $outhandle "done extracting macros\n"
338 if ($self->{'verbosity'}>3);
339
340}
341
342
343sub get_language_encoding_old {
344 my $self = shift (@_);
345 my ($filename) = @_;
346 my $outhandle = $self->{'outhandle'};
347
348 # read in file
349 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
350 undef $/;
351 my $text = <FILE>;
352 $/ = "\n";
353 close FILE;
354
355 # remove <title>stuff</title> -- as titles tend often to be in English
356 # for foreign language documents
357 $text =~ s/<title>.*?<\/title>//i;
358
359 # remove all HTML tags
360 $text =~ s/<[^>]*>//sg;
361 my $results = [];
362
363 # get the language/encoding
364 $results = $self->{'textcat'}->classify(\$text);
365
366 foreach $r (@$results) {
367 print $outhandle "Results: $r\n";
368
369 }
370
371 # if textcat returns 3 or less possibilities we'll use the
372 # first one in the list - otherwise use the defaults
373 if (scalar @$results > 3) {
374
375 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang","package_forms",
376 "languages.log");
377 open (LANGFILE, "<$lang_fname") or die ("Unable to open $lang_fname: $!\n");
378
379 while (<LANGFILE>) {
380
381 $line = $_;
382 chomp($line);
383
384 $fn = $filename;
385 $fn =~ s/.*\/(.*)\..*/$1/;
386
387
388 if ($line eq $fn) {
389 print $outhandle "BINGO $line $fn\n";
390 $line = <LANGFILE>;
391 chomp($line);
392 print $outhandle "language code is $line\n";
393
394 foreach $r (@$results) {
395 print $outhandle "MY1 $r\n";
396
397 $roar = $& if($r =~ m/../);
398
399 if ($roar eq $line) {
400 print $outhandle "WE HAVE A WINNER $r\n";
401 }
402 }
403
404
405 }
406 else {
407 $line = <LANGFILE>;
408 }
409 }
410
411 close LANGFILE;
412
413 if ($self->{'input_encoding'} ne 'auto') {
414 if ($self->{'extract_language'} && $self->{'verbosity'}) {
415 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
416 print $outhandle "defaulting to $self->{'default_language'}\n";
417 }
418 return ($self->{'default_language'}, $self->{'input_encoding'});
419
420 } else {
421 if ($self->{'verbosity'}) {
422 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
423 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
424 }
425 return ($self->{'default_language'}, $self->{'default_encoding'});
426 }
427 }
428
429
430 # format language/encoding
431 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
432 if (!defined $language) {
433 if ($self->{'verbosity'}) {
434 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
435 print $outhandle "defaulting to $self->{'default_language'}\n";
436 }
437 $language = $self->{'default_language'};
438 }
439 if (!defined $encoding) {
440 if ($self->{'verbosity'}) {
441 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
442 print $outhandle "defaulting to $self->{'default_encoding'}\n";
443 }
444 $encoding = $self->{'default_encoding'};
445 }
446
447 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
448 !defined $encodings::encodings->{$encoding}) {
449 if ($self->{'verbosity'}) {
450 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
451 print $outhandle "using $self->{'default_encoding'}\n";
452 }
453 $encoding = $self->{'default_encoding'};
454 }
455
456 print STDERR "**** forcing encoding to be utf8\n";
457 $encoding = "utf8";
458
459 print STDERR "**** forcing language to be first two letters\n";
460 my $lfname = $filename;
461 $lfname =~ s/^.*\///;
462 $language = substr($lfname,0,2);
463
464 print $outhandle "RETURNING VALUES $language $encoding\n";
465
466 return ($language, $encoding);
467}
468
469
470sub find_language {
471 my ($self,$fn) = @_;
472
473 my $lang_table = $self->{'lang_abbr'};
474
475 if (!defined $lang_table->{$fn}) {
476
477 # try and find it with shorter string name
478
479 my $try_len = length($fn);
480
481 while ($try_len>=4) {
482 $try_fn = substr($fn,0,$try_len);
483
484 if (defined $lang_table->{$try_fn}) {
485 $fn = $try_fn;
486 last;
487 }
488 $try_len--;
489 }
490 }
491
492 return $fn;
493}
494
495
496sub get_language_encoding {
497 my $self = shift (@_);
498 my ($filename) = @_;
499 my $outhandle = $self->{'outhandle'};
500
501 my $fn = $filename;
502 $fn =~ s/.*\/(.*)\..*/$1/;
503 $fn =~ s/\d+$//; # remove any digits from end of filename
504
505 my $languge;
506 my $encoding = "utf8";
507
508 ## my $lang_lookup = $self->find_language($fn);
509
510 my $lang_table = $self->{'lang_abbr'};
511
512 if (!defined $lang_table->{$fn}) {
513
514 print $outhandle "Warning: Macro file name $filename not in list of languages.\n";
515 print $outhandle " Using default language.\n";
516 $language = $self->{'default_language'};
517 }
518 else {
519 $language = $lang_table->{$fn};
520 }
521
522 ## print $outhandle "Storing $filename as $language $encoding\n";
523
524 return ($language, $encoding);
525}
526
527
5281;
Note: See TracBrowser for help on using the repository browser.