########################################################################### # # MACROPlug plugin - to process .dm files for language translation # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package MACROPlug; use BasPlug; use parsargv; sub BEGIN { @ISA = ('BasPlug'); } my $arguments = [ { 'name' => "process_exp", 'desc' => "{BasPlug.process_exp}", 'type' => "regexp", 'deft' => &get_default_process_exp(), 'reqd' => "no" } ]; my $options = { 'name' => "MACROPlug", 'desc' => "{MACROPlug.desc}", 'abstract' => "no", 'inherits' => "yes", 'args' => $arguments }; sub load_language_table { my $lang_table = {}; my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang", "package_forms","languages.log"); open (LANGFILE, "<$lang_fname") || die ("Unable to open $lang_fname: $!\n"); my $full_name; my $abbr_name; while (defined ($full_name=)) { chomp($full_name); $abbr_name = ; chomp($abbr_name); $lang_table->{$full_name} = $abbr_name; my $fourchar_name = substr($full_name,0,4); if (!defined $lang_table->{$fourchar_name}) { $lang_table->{$fourchar_name} = $abbr_name; } else { print STDERR "Warning: Clash on four character abbreviation for language $fourchar_name\n"; } } close LANGFILE; return $lang_table; } sub new { my ($class) = @_; my $self = new BasPlug ($class, @_); # 14-05-02 To allow for proper inheritance of arguments - John Thompson my $option_list = $self->{'option_list'}; push( @{$option_list}, $options ); # $self->{'lang_abbr'} = load_language_table(); return bless $self, $class; } sub get_default_process_exp { my $self = shift (@_); return q^(?i)\.dm$^; } sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; my $outhandle = $self->{'outhandle'}; my $lang_table = $self->{'lang_abbr'}; my $fn = $file; $fn =~ s/.*\/(.*)\..*/$1/; $fn =~ s/\d+$//; # remove any digits from end of filename my $filename = $file; $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; if ((!-d $filename) && ($file !~ m/doc.xml$/) && (!defined $lang_table->{$fn})) { print $outhandle "MACROPlug: blocking $file\n" if $self->{'verbosity'} > 2; $self->{'num_blocked'} ++; return 0; } return $self->SUPER::read(@_); } # do plugin specific processing of doc_obj sub process { my $self = shift (@_); my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; my $outhandle = $self->{'outhandle'}; print STDERR "\n" if ($gli); print $outhandle "MACROPlug: processing $file\n" if $self->{'verbosity'} > 1; my $thissection = $doc_obj->get_top_section(); $self->extract_macronames (\$textref, $doc_obj, $thissection, $file); # we need to escape the escape character, or else mg will convert into # eg literal newlines, instead of leaving the text as '\n' my ($filemeta) = $file =~ /([^\\\/]+)$/; $doc_obj->add_utf8_metadata($thissection, "Title", &ghtml::dmsafe($filemeta)); # FileFormat metadata $doc_obj->add_metadata($thissection, "FileFormat", "MACRO"); return 1; } sub extract_macronames { my $self = shift(@_); my ($textref, $doc_obj, $thissection, $file) = @_; my $outhandle = $self->{'outhandle'}; print $outhandle " extracting macronames ...\n" if ($self->{'verbosity'}>3); my @textarray = split ("\n", $$$textref) if ($self->{'verbosity'}>3); my $macro_text = ""; my $image_macro = "false"; my $norm_macro = "false"; #print STDERR "FILE@@@@@ $file\n"; #foreach my $ta (@textarray) { for ($k = 0; $k < scalar(@textarray); $k++) { $ta = $textarray[$k]; #print STDERR "$ta\n" if ($file =~ m/port/); if ($ta =~ m/^package /) { $currpackage = $ta; chomp($currpackage); $currpackage =~ s/^package //; } elsif ($image_macro eq "true") { unless ($ta =~ m/\S+/) { $image_macro = "false"; $macro_text =~ s/\_/\\_/osg; # for dm (we have a convention of starting macros with _ #print STDERR "$macro_text\n\n" if ($file =~ m/spanish/); $doc_obj->add_utf8_text($thissection, $macro_text); my @names = split(/\s*\#\#\s*/, $macro_text); my ($title) = $names[1]; if (length($title) > 100) { $title = substr ($title, 0, 100) . "..."; } $title =~ s//\>/g; #print STDERR "$title\n" if ($file =~ m/port/); $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title); $macro_text = ""; } $ta .= "
"; $macro_text .= $ta; } elsif ($norm_macro eq "true") { $macro_text .= $ta; if ($ta =~ m/\}\s*(\#.*)*\Z/) { $norm_macro = "false"; $macro_text =~ s/\A(_\w+_)\s//; #print STDERR "$macro_text\n" if($file =~ m/spanish/i); $doc_obj->add_utf8_text($thissection, $macro_text); my ($title) = $macro_text; $title =~ s/\_/\\_/osg; $title =~ s/\{//; $title =~ s/\}\s*(\#.*)*\Z//; $title =~ s/\|\/\n/osg; $title =~ s/\n/ /osg; $title =~ s//\>/g; $title =~ s/\(/\\\(/g; $title =~ s/\)/\\\)/g; if (length($title) > 100) { $title = substr ($title, 0, 100) . "..."; } #print STDERR "$title\n\n\n" if($file =~ m/spanish/i); $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title); $macro_text = ""; } } elsif(($ta =~ m/\A(_\w+_)\s*/) && ($image_macro eq "false")) { my $new_macro = $1; my $new_cursection = $doc_obj->get_next_child($thissection); $thissection = $doc_obj->insert_section(""); $new_macro =~ s/\_/\\_/osg; $new_macro = $currpackage . "::" . $new_macro; #print STDERR "$new_macro\n" if ($file =~ m/spanish/); $doc_obj->add_utf8_metadata ($thissection, "Macroname", $new_macro); $macro_text = $ta; if ($ta =~ m/\}\s*(\#.*)*\Z/) { $macro_text =~ s/\A(_\w+_)\s//; $macro_text =~ s/\n/ /osg; #print STDERR "$macro_text\n" if($file =~ m/spanish/i); $doc_obj->add_utf8_text($thissection, $macro_text); my ($title) = $macro_text; $title =~ s/\{//; $title =~ s/\}\s*(\#.*)*\Z//; $title =~ s/\|\/\n/osg; $title =~ s/\n/ /osg; $title =~ s//\>/g; if (length($title) > 100) { $title = substr ($title, 0, 100) . "..."; } #print STDERR "$title\n\n\n" if($file =~ m/spanish/i); $doc_obj->add_utf8_metadata ($thissection, "SecTitle", $title); } else { $norm_macro = "true"; } $macro_text = ""; $macro_text .= $ta; } #line in format ## "sometext" ## macro ## macroname ## elsif ($ta =~ m/^\#\# .*/) { my $macroname = $ta; $image_macro = "true"; $macro_text = ""; #NEED S.T LIKE THIS TO PICK UP HOARY ONES OVER 2 LINES unless ($macroname =~ m/^\#\# .*\#\#/) { $macroname = $ta; chomp($macroname); $macro_text .= $macroname; $ta .= $textarray[++$k]; $macroname .= $ta; } my @names = split(/\s*\#\#\s*/, $macroname); $macroname = $names[(scalar @names) - 1]; my $key = $currpackage . "::" . $macroname; # key to the hash and the database my $new_cursection = $doc_obj->get_next_child($thissection); $thissection = $doc_obj->insert_section(""); #print STDERR "$key \n" if ($file =~ m/(port)|(chin)|(engl)/); $doc_obj->add_utf8_metadata ($thissection, "Macroname", $key); $macro_text = $ta . "
"; } } print $outhandle "done extracting macros\n" if ($self->{'verbosity'}>3); } sub get_language_encoding_old { my $self = shift (@_); my ($filename) = @_; my $outhandle = $self->{'outhandle'}; # read in file open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n"; undef $/; my $text = ; $/ = "\n"; close FILE; # remove stuff -- as titles tend often to be in English # for foreign language documents $text =~ s/.*?<\/title>//i; # remove all HTML tags $text =~ s/<[^>]*>//sg; my $results = []; # get the language/encoding $results = $self->{'textcat'}->classify(\$text); foreach $r (@$results) { print $outhandle "Results: $r\n"; } # if textcat returns 3 or less possibilities we'll use the # first one in the list - otherwise use the defaults if (scalar @$results > 3) { my $lang_fname = util::filename_cat($ENV{'GSDLHOME'},"tmp","lang","package_forms", "languages.log"); open (LANGFILE, "<$lang_fname") or die ("Unable to open $lang_fname: $!\n"); while (<LANGFILE>) { $line = $_; chomp($line); $fn = $filename; $fn =~ s/.*\/(.*)\..*/$1/; if ($line eq $fn) { print $outhandle "BINGO $line $fn\n"; $line = <LANGFILE>; chomp($line); print $outhandle "language code is $line\n"; foreach $r (@$results) { print $outhandle "MY1 $r\n"; $roar = $& if($r =~ m/../); if ($roar eq $line) { print $outhandle "WE HAVE A WINNER $r\n"; } } } else { $line = <LANGFILE>; } } close LANGFILE; if ($self->{'input_encoding'} ne 'auto') { if ($self->{'extract_language'} && $self->{'verbosity'}) { print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - "; print $outhandle "defaulting to $self->{'default_language'}\n"; } return ($self->{'default_language'}, $self->{'input_encoding'}); } else { if ($self->{'verbosity'}) { print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - "; print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n"; } return ($self->{'default_language'}, $self->{'default_encoding'}); } } # format language/encoding my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; if (!defined $language) { if ($self->{'verbosity'}) { print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - "; print $outhandle "defaulting to $self->{'default_language'}\n"; } $language = $self->{'default_language'}; } if (!defined $encoding) { if ($self->{'verbosity'}) { print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - "; print $outhandle "defaulting to $self->{'default_encoding'}\n"; } $encoding = $self->{'default_encoding'}; } if ($encoding !~ /^(ascii|utf8|unicode)$/ && !defined $encodings::encodings->{$encoding}) { if ($self->{'verbosity'}) { print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - "; print $outhandle "using $self->{'default_encoding'}\n"; } $encoding = $self->{'default_encoding'}; } print STDERR "**** forcing encoding to be utf8\n"; $encoding = "utf8"; print STDERR "**** forcing language to be first two letters\n"; my $lfname = $filename; $lfname =~ s/^.*\///; $language = substr($lfname,0,2); print $outhandle "RETURNING VALUES $language $encoding\n"; return ($language, $encoding); } sub find_language { my ($self,$fn) = @_; my $lang_table = $self->{'lang_abbr'}; if (!defined $lang_table->{$fn}) { # try and find it with shorter string name my $try_len = length($fn); while ($try_len>=4) { $try_fn = substr($fn,0,$try_len); if (defined $lang_table->{$try_fn}) { $fn = $try_fn; last; } $try_len--; } } return $fn; } sub get_language_encoding { my $self = shift (@_); my ($filename) = @_; my $outhandle = $self->{'outhandle'}; my $fn = $filename; $fn =~ s/.*\/(.*)\..*/$1/; $fn =~ s/\d+$//; # remove any digits from end of filename my $languge; my $encoding = "utf8"; ## my $lang_lookup = $self->find_language($fn); my $lang_table = $self->{'lang_abbr'}; if (!defined $lang_table->{$fn}) { print $outhandle "Warning: Macro file name $filename not in list of languages.\n"; print $outhandle " Using default language.\n"; $language = $self->{'default_language'}; } else { $language = $lang_table->{$fn}; } ## print $outhandle "Storing $filename as $language $encoding\n"; return ($language, $encoding); } 1;