source: trunk/gsdl/perllib/plugins/MACROPlug.pm@ 9853

Last change on this file since 9853 was 9853, checked in by kjdon, 19 years ago

fixed up maxdocs - now pass an extra parameter to the read function

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.3 KB
Line 
1###########################################################################
2#
3# MACROPlug plugin - to process .dm files for language translation
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MACROPlug;
27
28use BasPlug;
29use parsargv;
30
31
32sub BEGIN {
33 @ISA = ('BasPlug');
34}
35
36my $arguments =
37 [ { 'name' => "process_exp",
38 'desc' => "{BasPlug.process_exp}",
39 'type' => "regexp",
40 'deft' => &get_default_process_exp(),
41 'reqd' => "no" } ];
42
43my $options = { 'name' => "MACROPlug",
44 'desc' => "{MACROPlug.desc}",
45 'abstract' => "no",
46 'inherits' => "yes",
47 'args' => $arguments };
48
49
50sub load_language_table
51{
52 my $lang_table = {};
53
54 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang",
55 "package_forms","languages.log");
56 open (LANGFILE, "<$lang_fname")
57 || die ("Unable to open $lang_fname: $!\n");
58
59 my $full_name;
60 my $abbr_name;
61
62 while (defined ($full_name=<LANGFILE>)) {
63 chomp($full_name);
64
65 $abbr_name = <LANGFILE>;
66 chomp($abbr_name);
67
68 $lang_table->{$full_name} = $abbr_name;
69
70 my $fourchar_name = substr($full_name,0,4);
71 if (!defined $lang_table->{$fourchar_name}) {
72 $lang_table->{$fourchar_name} = $abbr_name;
73 }
74 else {
75 print STDERR "Warning: Clash on four character abbreviation for language $fourchar_name\n";
76 }
77 }
78
79 close LANGFILE;
80
81 return $lang_table;
82}
83
84
85
86sub new {
87 my ($class) = @_;
88 my $self = new BasPlug ($class, @_);
89
90 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
91 my $option_list = $self->{'option_list'};
92 push( @{$option_list}, $options );
93
94 # $self->{'lang_abbr'} = load_language_table();
95
96 return bless $self, $class;
97}
98
99sub get_default_process_exp {
100 my $self = shift (@_);
101
102 return q^(?i)\.dm$^;
103}
104
105
106sub read {
107 my $self = shift (@_);
108 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
109
110 my $outhandle = $self->{'outhandle'};
111
112 my $lang_table = $self->{'lang_abbr'};
113 my $fn = $file;
114 $fn =~ s/.*\/(.*)\..*/$1/;
115 $fn =~ s/\d+$//; # remove any digits from end of filename
116
117 my $filename = $file;
118 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
119
120 if ((!-d $filename) && ($file !~ m/doc.xml$/) && (!defined $lang_table->{$fn})) {
121 print $outhandle "MACROPlug: blocking $file\n"
122 if $self->{'verbosity'} > 2;
123 $self->{'num_blocked'} ++;
124 return 0;
125 }
126
127 return $self->SUPER::read(@_);
128}
129
130
131# do plugin specific processing of doc_obj
132sub process {
133 my $self = shift (@_);
134 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
135 my $outhandle = $self->{'outhandle'};
136
137 print STDERR "<Processing n='$file' p='MACROPlug'>\n" if ($gli);
138 print $outhandle "MACROPlug: processing $file\n"
139 if $self->{'verbosity'} > 1;
140
141 my $thissection = $doc_obj->get_top_section();
142
143 $self->extract_macronames (\$textref, $doc_obj, $thissection, $file);
144 # we need to escape the escape character, or else mg will convert into
145 # eg literal newlines, instead of leaving the text as '\n'
146
147
148 my ($filemeta) = $file =~ /([^\\\/]+)$/;
149 $doc_obj->add_utf8_metadata($thissection, "Title", &ghtml::dmsafe($filemeta));
150
151 # FileFormat metadata
152 $doc_obj->add_metadata($thissection, "FileFormat", "MACRO");
153 $doc_obj->add_metadata($thissection, "FileSize", (-s $file));
154
155 return 1;
156}
157
158
159sub extract_macronames {
160
161 my $self = shift(@_);
162
163 my ($textref, $doc_obj, $thissection, $file) = @_;
164 my $outhandle = $self->{'outhandle'};
165
166 print $outhandle " extracting macronames ...\n"
167 if ($self->{'verbosity'}>3);
168
169 my @textarray = split ("\n", $$$textref)
170 if ($self->{'verbosity'}>3);
171
172 my $macro_text = "";
173 my $image_macro = "false";
174 my $norm_macro = "false";
175
176 #print STDERR "FILE@@@@@ $file\n";
177
178 #foreach my $ta (@textarray) {
179
180 for ($k = 0; $k < scalar(@textarray); $k++) {
181
182 $ta = $textarray[$k];
183
184 #print STDERR "$ta\n" if ($file =~ m/port/);
185
186 if ($ta =~ m/^package /) {
187 $currpackage = $ta;
188 chomp($currpackage);
189 $currpackage =~ s/^package //;
190 }
191
192
193 elsif ($image_macro eq "true") {
194 unless ($ta =~ m/\S+/) {
195 $image_macro = "false";
196
197 $macro_text =~ s/\_/\\_/osg; # for dm (we have a convention of starting macros with _
198
199 #print STDERR "$macro_text\n\n" if ($file =~ m/spanish/);
200
201 $doc_obj->add_utf8_text($thissection, $macro_text);
202
203 my @names = split(/\s*\#\#\s*/, $macro_text);
204
205 my ($title) = $names[1];
206
207 if (length($title) > 100) {
208 $title = substr ($title, 0, 100) . "...";
209 }
210
211 $title =~ s/</\&lt;/g;
212 $title =~ s/>/\&gt;/g;
213
214 #print STDERR "$title\n" if ($file =~ m/port/);
215
216 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
217 $macro_text = "";
218 }
219 $ta .= "<br>";
220 $macro_text .= $ta;
221
222 }
223
224 elsif ($norm_macro eq "true") {
225
226 $macro_text .= $ta;
227
228 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
229
230 $norm_macro = "false";
231
232 $macro_text =~ s/\A(_\w+_)\s//;
233
234 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
235
236 $doc_obj->add_utf8_text($thissection, $macro_text);
237
238 my ($title) = $macro_text;
239
240 $title =~ s/\_/\\_/osg;
241 $title =~ s/\{//;
242 $title =~ s/\}\s*(\#.*)*\Z//;
243 $title =~ s/\<br\>|\<p\>/\n/osg;
244 $title =~ s/\n/ /osg;
245 $title =~ s/</\&lt;/g;
246 $title =~ s/>/\&gt;/g;
247
248 $title =~ s/\(/\\\(/g;
249 $title =~ s/\)/\\\)/g;
250
251 if (length($title) > 100) {
252 $title = substr ($title, 0, 100) . "...";
253 }
254 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
255
256 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
257 $macro_text = "";
258 }
259
260 }
261 elsif(($ta =~ m/\A(_\w+_)\s*/) && ($image_macro eq "false")) {
262
263 my $new_macro = $1;
264
265 my $new_cursection = $doc_obj->get_next_child($thissection);
266 $thissection = $doc_obj->insert_section("");
267
268 $new_macro =~ s/\_/\\_/osg;
269
270 $new_macro = $currpackage . "::" . $new_macro;
271
272 #print STDERR "$new_macro\n" if ($file =~ m/spanish/);
273
274 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $new_macro);
275
276 $macro_text = $ta;
277
278 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
279
280 $macro_text =~ s/\A(_\w+_)\s//;
281 $macro_text =~ s/\n/ /osg;
282
283 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
284
285 $doc_obj->add_utf8_text($thissection, $macro_text);
286
287 my ($title) = $macro_text;
288 $title =~ s/\{//;
289 $title =~ s/\}\s*(\#.*)*\Z//;
290 $title =~ s/\<br\>|\<p\>/\n/osg;
291 $title =~ s/\n/ /osg;
292 $title =~ s/</\&lt;/g;
293 $title =~ s/>/\&gt;/g;
294
295 if (length($title) > 100) {
296 $title = substr ($title, 0, 100) . "...";
297 }
298 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
299
300 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
301 }
302 else {
303 $norm_macro = "true";
304 }
305
306 $macro_text = "";
307 $macro_text .= $ta;
308 }
309
310 #line in format ## "sometext" ## macro ## macroname ##
311 elsif ($ta =~ m/^\#\# .*/) {
312 my $macroname = $ta;
313 $image_macro = "true";
314 $macro_text = "";
315
316 #NEED S.T LIKE THIS TO PICK UP HOARY ONES OVER 2 LINES
317 unless ($macroname =~ m/^\#\# .*\#\#/) {
318 $macroname = $ta;
319 chomp($macroname);
320 $macro_text .= $macroname;
321 $ta .= $textarray[++$k];
322 $macroname .= $ta;
323 }
324
325 my @names = split(/\s*\#\#\s*/, $macroname);
326 $macroname = $names[(scalar @names) - 1];
327 my $key = $currpackage . "::" . $macroname; # key to the hash and the database
328
329 my $new_cursection = $doc_obj->get_next_child($thissection);
330 $thissection = $doc_obj->insert_section("");
331
332 #print STDERR "$key \n" if ($file =~ m/(port)|(chin)|(engl)/);
333
334 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $key);
335
336 $macro_text = $ta . "<br>";
337
338 }
339 }
340
341 print $outhandle "done extracting macros\n"
342 if ($self->{'verbosity'}>3);
343
344}
345
346
347sub get_language_encoding_old {
348 my $self = shift (@_);
349 my ($filename) = @_;
350 my $outhandle = $self->{'outhandle'};
351
352 # read in file
353 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
354 undef $/;
355 my $text = <FILE>;
356 $/ = "\n";
357 close FILE;
358
359 # remove <title>stuff</title> -- as titles tend often to be in English
360 # for foreign language documents
361 $text =~ s/<title>.*?<\/title>//i;
362
363 # remove all HTML tags
364 $text =~ s/<[^>]*>//sg;
365 my $results = [];
366
367 # get the language/encoding
368 $results = $self->{'textcat'}->classify(\$text);
369
370 foreach $r (@$results) {
371 print $outhandle "Results: $r\n";
372
373 }
374
375 # if textcat returns 3 or less possibilities we'll use the
376 # first one in the list - otherwise use the defaults
377 if (scalar @$results > 3) {
378
379 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang","package_forms",
380 "languages.log");
381 open (LANGFILE, "<$lang_fname") or die ("Unable to open $lang_fname: $!\n");
382
383 while (<LANGFILE>) {
384
385 $line = $_;
386 chomp($line);
387
388 $fn = $filename;
389 $fn =~ s/.*\/(.*)\..*/$1/;
390
391
392 if ($line eq $fn) {
393 print $outhandle "BINGO $line $fn\n";
394 $line = <LANGFILE>;
395 chomp($line);
396 print $outhandle "language code is $line\n";
397
398 foreach $r (@$results) {
399 print $outhandle "MY1 $r\n";
400
401 $roar = $& if($r =~ m/../);
402
403 if ($roar eq $line) {
404 print $outhandle "WE HAVE A WINNER $r\n";
405 }
406 }
407
408
409 }
410 else {
411 $line = <LANGFILE>;
412 }
413 }
414
415 close LANGFILE;
416
417 if ($self->{'input_encoding'} ne 'auto') {
418 if ($self->{'extract_language'} && $self->{'verbosity'}) {
419 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
420 print $outhandle "defaulting to $self->{'default_language'}\n";
421 }
422 return ($self->{'default_language'}, $self->{'input_encoding'});
423
424 } else {
425 if ($self->{'verbosity'}) {
426 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
427 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
428 }
429 return ($self->{'default_language'}, $self->{'default_encoding'});
430 }
431 }
432
433
434 # format language/encoding
435 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
436 if (!defined $language) {
437 if ($self->{'verbosity'}) {
438 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
439 print $outhandle "defaulting to $self->{'default_language'}\n";
440 }
441 $language = $self->{'default_language'};
442 }
443 if (!defined $encoding) {
444 if ($self->{'verbosity'}) {
445 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
446 print $outhandle "defaulting to $self->{'default_encoding'}\n";
447 }
448 $encoding = $self->{'default_encoding'};
449 }
450
451 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
452 !defined $encodings::encodings->{$encoding}) {
453 if ($self->{'verbosity'}) {
454 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
455 print $outhandle "using $self->{'default_encoding'}\n";
456 }
457 $encoding = $self->{'default_encoding'};
458 }
459
460 print STDERR "**** forcing encoding to be utf8\n";
461 $encoding = "utf8";
462
463 print STDERR "**** forcing language to be first two letters\n";
464 my $lfname = $filename;
465 $lfname =~ s/^.*\///;
466 $language = substr($lfname,0,2);
467
468 print $outhandle "RETURNING VALUES $language $encoding\n";
469
470 return ($language, $encoding);
471}
472
473
474sub find_language {
475 my ($self,$fn) = @_;
476
477 my $lang_table = $self->{'lang_abbr'};
478
479 if (!defined $lang_table->{$fn}) {
480
481 # try and find it with shorter string name
482
483 my $try_len = length($fn);
484
485 while ($try_len>=4) {
486 $try_fn = substr($fn,0,$try_len);
487
488 if (defined $lang_table->{$try_fn}) {
489 $fn = $try_fn;
490 last;
491 }
492 $try_len--;
493 }
494 }
495
496 return $fn;
497}
498
499
500sub get_language_encoding {
501 my $self = shift (@_);
502 my ($filename) = @_;
503 my $outhandle = $self->{'outhandle'};
504
505 my $fn = $filename;
506 $fn =~ s/.*\/(.*)\..*/$1/;
507 $fn =~ s/\d+$//; # remove any digits from end of filename
508
509 my $languge;
510 my $encoding = "utf8";
511
512 ## my $lang_lookup = $self->find_language($fn);
513
514 my $lang_table = $self->{'lang_abbr'};
515
516 if (!defined $lang_table->{$fn}) {
517
518 print $outhandle "Warning: Macro file name $filename not in list of languages.\n";
519 print $outhandle " Using default language.\n";
520 $language = $self->{'default_language'};
521 }
522 else {
523 $language = $lang_table->{$fn};
524 }
525
526 ## print $outhandle "Storing $filename as $language $encoding\n";
527
528 return ($language, $encoding);
529}
530
531
5321;
Note: See TracBrowser for help on using the repository browser.