source: trunk/gsdl/perllib/plugins/MACROPlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.5 KB
Line 
1###########################################################################
2#
3# MACROPlug plugin - to process .dm files for language translation
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MACROPlug;
27
28use BasPlug;
29use parsargv;
30
31
32sub BEGIN {
33 @ISA = ('BasPlug');
34}
35
36my $arguments =
37 [ { 'name' => "process_exp",
38 'desc' => "{BasPlug.process_exp}",
39 'type' => "regexp",
40 'deft' => &get_default_process_exp(),
41 'reqd' => "no" } ];
42
43my $options = { 'name' => "MACROPlug",
44 'desc' => "{MACROPlug.desc}",
45 'abstract' => "no",
46 'inherits' => "yes",
47 'args' => $arguments };
48
49
50sub load_language_table
51{
52 my $lang_table = {};
53
54 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang",
55 "package_forms","languages.log");
56 open (LANGFILE, "<$lang_fname")
57 || die ("Unable to open $lang_fname: $!\n");
58
59 my $full_name;
60 my $abbr_name;
61
62 while (defined ($full_name=<LANGFILE>)) {
63 chomp($full_name);
64
65 $abbr_name = <LANGFILE>;
66 chomp($abbr_name);
67
68 $lang_table->{$full_name} = $abbr_name;
69
70 my $fourchar_name = substr($full_name,0,4);
71 if (!defined $lang_table->{$fourchar_name}) {
72 $lang_table->{$fourchar_name} = $abbr_name;
73 }
74 else {
75 print STDERR "Warning: Clash on four character abbreviation for language $fourchar_name\n";
76 }
77 }
78
79 close LANGFILE;
80
81 return $lang_table;
82}
83
84
85
86sub new {
87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
91 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
92 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
93
94 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
95
96 # $self->{'lang_abbr'} = load_language_table();
97
98 return bless $self, $class;
99}
100
101sub get_default_process_exp {
102 my $self = shift (@_);
103
104 return q^(?i)\.dm$^;
105}
106
107
108sub read {
109 my $self = shift (@_);
110 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
111
112 my $outhandle = $self->{'outhandle'};
113
114 my $lang_table = $self->{'lang_abbr'};
115 my $fn = $file;
116 $fn =~ s/.*\/(.*)\..*/$1/;
117 $fn =~ s/\d+$//; # remove any digits from end of filename
118
119 my $filename = $file;
120 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
121
122 if ((!-d $filename) && ($file !~ m/doc.xml$/) && (!defined $lang_table->{$fn})) {
123 print $outhandle "MACROPlug: blocking $file\n"
124 if $self->{'verbosity'} > 2;
125 $self->{'num_blocked'} ++;
126 return 0;
127 }
128
129 return $self->SUPER::read(@_);
130}
131
132
133# do plugin specific processing of doc_obj
134sub process {
135 my $self = shift (@_);
136 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
137 my $outhandle = $self->{'outhandle'};
138
139 print STDERR "<Processing n='$file' p='MACROPlug'>\n" if ($gli);
140 print $outhandle "MACROPlug: processing $file\n"
141 if $self->{'verbosity'} > 1;
142
143 my $thissection = $doc_obj->get_top_section();
144
145 $self->extract_macronames (\$textref, $doc_obj, $thissection, $file);
146 # we need to escape the escape character, or else mg will convert into
147 # eg literal newlines, instead of leaving the text as '\n'
148
149
150 my ($filemeta) = $file =~ /([^\\\/]+)$/;
151 $doc_obj->add_utf8_metadata($thissection, "Title", &ghtml::dmsafe($filemeta));
152
153 # FileFormat metadata
154 $doc_obj->add_metadata($thissection, "FileFormat", "MACRO");
155 $doc_obj->add_metadata($thissection, "FileSize", (-s $file));
156
157 return 1;
158}
159
160
161sub extract_macronames {
162
163 my $self = shift(@_);
164
165 my ($textref, $doc_obj, $thissection, $file) = @_;
166 my $outhandle = $self->{'outhandle'};
167
168 print $outhandle " extracting macronames ...\n"
169 if ($self->{'verbosity'}>3);
170
171 my @textarray = split ("\n", $$$textref)
172 if ($self->{'verbosity'}>3);
173
174 my $macro_text = "";
175 my $image_macro = "false";
176 my $norm_macro = "false";
177
178 #print STDERR "FILE@@@@@ $file\n";
179
180 #foreach my $ta (@textarray) {
181
182 for ($k = 0; $k < scalar(@textarray); $k++) {
183
184 $ta = $textarray[$k];
185
186 #print STDERR "$ta\n" if ($file =~ m/port/);
187
188 if ($ta =~ m/^package /) {
189 $currpackage = $ta;
190 chomp($currpackage);
191 $currpackage =~ s/^package //;
192 }
193
194
195 elsif ($image_macro eq "true") {
196 unless ($ta =~ m/\S+/) {
197 $image_macro = "false";
198
199 $macro_text =~ s/\_/\\_/osg; # for dm (we have a convention of starting macros with _
200
201 #print STDERR "$macro_text\n\n" if ($file =~ m/spanish/);
202
203 $doc_obj->add_utf8_text($thissection, $macro_text);
204
205 my @names = split(/\s*\#\#\s*/, $macro_text);
206
207 my ($title) = $names[1];
208
209 if (length($title) > 100) {
210 $title = substr ($title, 0, 100) . "...";
211 }
212
213 $title =~ s/</\&lt;/g;
214 $title =~ s/>/\&gt;/g;
215
216 #print STDERR "$title\n" if ($file =~ m/port/);
217
218 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
219 $macro_text = "";
220 }
221 $ta .= "<br>";
222 $macro_text .= $ta;
223
224 }
225
226 elsif ($norm_macro eq "true") {
227
228 $macro_text .= $ta;
229
230 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
231
232 $norm_macro = "false";
233
234 $macro_text =~ s/\A(_\w+_)\s//;
235
236 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
237
238 $doc_obj->add_utf8_text($thissection, $macro_text);
239
240 my ($title) = $macro_text;
241
242 $title =~ s/\_/\\_/osg;
243 $title =~ s/\{//;
244 $title =~ s/\}\s*(\#.*)*\Z//;
245 $title =~ s/\<br\>|\<p\>/\n/osg;
246 $title =~ s/\n/ /osg;
247 $title =~ s/</\&lt;/g;
248 $title =~ s/>/\&gt;/g;
249
250 $title =~ s/\(/\\\(/g;
251 $title =~ s/\)/\\\)/g;
252
253 if (length($title) > 100) {
254 $title = substr ($title, 0, 100) . "...";
255 }
256 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
257
258 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
259 $macro_text = "";
260 }
261
262 }
263 elsif(($ta =~ m/\A(_\w+_)\s*/) && ($image_macro eq "false")) {
264
265 my $new_macro = $1;
266
267 my $new_cursection = $doc_obj->get_next_child($thissection);
268 $thissection = $doc_obj->insert_section("");
269
270 $new_macro =~ s/\_/\\_/osg;
271
272 $new_macro = $currpackage . "::" . $new_macro;
273
274 #print STDERR "$new_macro\n" if ($file =~ m/spanish/);
275
276 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $new_macro);
277
278 $macro_text = $ta;
279
280 if ($ta =~ m/\}\s*(\#.*)*\Z/) {
281
282 $macro_text =~ s/\A(_\w+_)\s//;
283 $macro_text =~ s/\n/ /osg;
284
285 #print STDERR "$macro_text\n" if($file =~ m/spanish/i);
286
287 $doc_obj->add_utf8_text($thissection, $macro_text);
288
289 my ($title) = $macro_text;
290 $title =~ s/\{//;
291 $title =~ s/\}\s*(\#.*)*\Z//;
292 $title =~ s/\<br\>|\<p\>/\n/osg;
293 $title =~ s/\n/ /osg;
294 $title =~ s/</\&lt;/g;
295 $title =~ s/>/\&gt;/g;
296
297 if (length($title) > 100) {
298 $title = substr ($title, 0, 100) . "...";
299 }
300 #print STDERR "$title\n\n\n" if($file =~ m/spanish/i);
301
302 $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title);
303 }
304 else {
305 $norm_macro = "true";
306 }
307
308 $macro_text = "";
309 $macro_text .= $ta;
310 }
311
312 #line in format ## "sometext" ## macro ## macroname ##
313 elsif ($ta =~ m/^\#\# .*/) {
314 my $macroname = $ta;
315 $image_macro = "true";
316 $macro_text = "";
317
318 #NEED S.T LIKE THIS TO PICK UP HOARY ONES OVER 2 LINES
319 unless ($macroname =~ m/^\#\# .*\#\#/) {
320 $macroname = $ta;
321 chomp($macroname);
322 $macro_text .= $macroname;
323 $ta .= $textarray[++$k];
324 $macroname .= $ta;
325 }
326
327 my @names = split(/\s*\#\#\s*/, $macroname);
328 $macroname = $names[(scalar @names) - 1];
329 my $key = $currpackage . "::" . $macroname; # key to the hash and the database
330
331 my $new_cursection = $doc_obj->get_next_child($thissection);
332 $thissection = $doc_obj->insert_section("");
333
334 #print STDERR "$key \n" if ($file =~ m/(port)|(chin)|(engl)/);
335
336 $doc_obj->add_utf8_metadata ($thissection, "Macroname", $key);
337
338 $macro_text = $ta . "<br>";
339
340 }
341 }
342
343 print $outhandle "done extracting macros\n"
344 if ($self->{'verbosity'}>3);
345
346}
347
348
349sub get_language_encoding_old {
350 my $self = shift (@_);
351 my ($filename) = @_;
352 my $outhandle = $self->{'outhandle'};
353
354 # read in file
355 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
356 undef $/;
357 my $text = <FILE>;
358 $/ = "\n";
359 close FILE;
360
361 # remove <title>stuff</title> -- as titles tend often to be in English
362 # for foreign language documents
363 $text =~ s/<title>.*?<\/title>//i;
364
365 # remove all HTML tags
366 $text =~ s/<[^>]*>//sg;
367 my $results = [];
368
369 # get the language/encoding
370 $results = $self->{'textcat'}->classify(\$text);
371
372 foreach $r (@$results) {
373 print $outhandle "Results: $r\n";
374
375 }
376
377 # if textcat returns 3 or less possibilities we'll use the
378 # first one in the list - otherwise use the defaults
379 if (scalar @$results > 3) {
380
381 my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang","package_forms",
382 "languages.log");
383 open (LANGFILE, "<$lang_fname") or die ("Unable to open $lang_fname: $!\n");
384
385 while (<LANGFILE>) {
386
387 $line = $_;
388 chomp($line);
389
390 $fn = $filename;
391 $fn =~ s/.*\/(.*)\..*/$1/;
392
393
394 if ($line eq $fn) {
395 print $outhandle "BINGO $line $fn\n";
396 $line = <LANGFILE>;
397 chomp($line);
398 print $outhandle "language code is $line\n";
399
400 foreach $r (@$results) {
401 print $outhandle "MY1 $r\n";
402
403 $roar = $& if($r =~ m/../);
404
405 if ($roar eq $line) {
406 print $outhandle "WE HAVE A WINNER $r\n";
407 }
408 }
409
410
411 }
412 else {
413 $line = <LANGFILE>;
414 }
415 }
416
417 close LANGFILE;
418
419 if ($self->{'input_encoding'} ne 'auto') {
420 if ($self->{'extract_language'} && $self->{'verbosity'}) {
421 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
422 print $outhandle "defaulting to $self->{'default_language'}\n";
423 }
424 return ($self->{'default_language'}, $self->{'input_encoding'});
425
426 } else {
427 if ($self->{'verbosity'}) {
428 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
429 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
430 }
431 return ($self->{'default_language'}, $self->{'default_encoding'});
432 }
433 }
434
435
436 # format language/encoding
437 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
438 if (!defined $language) {
439 if ($self->{'verbosity'}) {
440 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
441 print $outhandle "defaulting to $self->{'default_language'}\n";
442 }
443 $language = $self->{'default_language'};
444 }
445 if (!defined $encoding) {
446 if ($self->{'verbosity'}) {
447 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
448 print $outhandle "defaulting to $self->{'default_encoding'}\n";
449 }
450 $encoding = $self->{'default_encoding'};
451 }
452
453 if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
454 !defined $encodings::encodings->{$encoding}) {
455 if ($self->{'verbosity'}) {
456 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
457 print $outhandle "using $self->{'default_encoding'}\n";
458 }
459 $encoding = $self->{'default_encoding'};
460 }
461
462 print STDERR "**** forcing encoding to be utf8\n";
463 $encoding = "utf8";
464
465 print STDERR "**** forcing language to be first two letters\n";
466 my $lfname = $filename;
467 $lfname =~ s/^.*\///;
468 $language = substr($lfname,0,2);
469
470 print $outhandle "RETURNING VALUES $language $encoding\n";
471
472 return ($language, $encoding);
473}
474
475
476sub find_language {
477 my ($self,$fn) = @_;
478
479 my $lang_table = $self->{'lang_abbr'};
480
481 if (!defined $lang_table->{$fn}) {
482
483 # try and find it with shorter string name
484
485 my $try_len = length($fn);
486
487 while ($try_len>=4) {
488 $try_fn = substr($fn,0,$try_len);
489
490 if (defined $lang_table->{$try_fn}) {
491 $fn = $try_fn;
492 last;
493 }
494 $try_len--;
495 }
496 }
497
498 return $fn;
499}
500
501
502sub get_language_encoding {
503 my $self = shift (@_);
504 my ($filename) = @_;
505 my $outhandle = $self->{'outhandle'};
506
507 my $fn = $filename;
508 $fn =~ s/.*\/(.*)\..*/$1/;
509 $fn =~ s/\d+$//; # remove any digits from end of filename
510
511 my $languge;
512 my $encoding = "utf8";
513
514 ## my $lang_lookup = $self->find_language($fn);
515
516 my $lang_table = $self->{'lang_abbr'};
517
518 if (!defined $lang_table->{$fn}) {
519
520 print $outhandle "Warning: Macro file name $filename not in list of languages.\n";
521 print $outhandle " Using default language.\n";
522 $language = $self->{'default_language'};
523 }
524 else {
525 $language = $lang_table->{$fn};
526 }
527
528 ## print $outhandle "Storing $filename as $language $encoding\n";
529
530 return ($language, $encoding);
531}
532
533
5341;
Note: See TracBrowser for help on using the repository browser.