Changeset 9413


Ignore:
Timestamp:
2005-03-14T14:05:28+13:00 (19 years ago)
Author:
jrm21
Message:

if we are trying to automatically determine the encoding, look for a
unicode byte order marker and use it, instead of taking textcat's guess
for an iso-8859 encoding.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r9403 r9413  
    66# University of Waikato, New Zealand.
    77#
    8 # Copyright (C) 1999 New Zealand Digital Library Project
     8# Copyright (C) 1999-2005 New Zealand Digital Library Project
    99#
    1010# This program is free software; you can redistribute it and/or modify
     
    3636$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
    3737
     38use strict; no strict 'subs';
     39
    3840use File::Basename;
    3941
     
    4951use DateExtract;
    5052use ghtml;
    51 use gsprintf;
     53use gsprintf 'gsprintf';
    5254use printusage;
    5355
    5456use GISBasPlug;
    5557
    56 @ISA = ( GISBasPlug );
     58@BasPlug::ISA = ( GISBasPlug );
    5759
    5860my $unicode_list =
     
    176178
    177179
    178 sub gsprintf
    179 {
    180     return &gsprintf::gsprintf(@_);
    181 }
    182180
    183181
     
    198196
    199197    # XML output is always in UTF-8
    200     &gsprintf::output_strings_in_UTF8;
    201 
    202     &PrintUsage::print_xml_header();
     198    gsprintf::output_strings_in_UTF8;
     199
     200    PrintUsage::print_xml_header();
    203201    $self->print_xml();
    204202}
     
    214212    return if (!defined($pluginoptions));
    215213
    216     &gsprintf(STDERR, "<PlugInfo>\n");
    217     &gsprintf(STDERR, "  <Name>$pluginoptions->{'name'}</Name>\n");
    218     my $desc = &gsprintf::lookup_string($pluginoptions->{'desc'});
     214    gsprintf(STDERR, "<PlugInfo>\n");
     215    gsprintf(STDERR, "  <Name>$pluginoptions->{'name'}</Name>\n");
     216    my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
    219217    $desc =~ s/</&amp;lt;/g; # doubly escaped
    220218    $desc =~ s/>/&amp;gt;/g;
    221219   
    222     &gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
    223     &gsprintf(STDERR, "  <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
    224     &gsprintf(STDERR, "  <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
    225     &gsprintf(STDERR, "  <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
    226     &gsprintf(STDERR, "  <Arguments>\n");
     220    gsprintf(STDERR, "  <Desc>$desc</Desc>\n");
     221    gsprintf(STDERR, "  <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
     222    gsprintf(STDERR, "  <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
     223    gsprintf(STDERR, "  <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
     224    gsprintf(STDERR, "  <Arguments>\n");
    227225    if (defined($pluginoptions->{'args'})) {
    228226    &PrintUsage::print_options_xml($pluginoptions->{'args'});
     
    232230    $self->print_xml();
    233231
    234     &gsprintf(STDERR, "  </Arguments>\n");
    235     &gsprintf(STDERR, "</PlugInfo>\n");
     232    gsprintf(STDERR, "  </Arguments>\n");
     233    gsprintf(STDERR, "</PlugInfo>\n");
    236234}
    237235
     
    291289    if ($isleafclass) {
    292290    if (defined($plugindesc)) {
    293         &gsprintf(STDERR, "$plugindesc\n\n");
    294     }
    295     &gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
     291        gsprintf(STDERR, "$plugindesc\n\n");
     292    }
     293    gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
    296294    }
    297295
     
    302300
    303301    if ($isleafclass) {
    304         &gsprintf(STDERR, " {common.specific_options}:\n");
     302        gsprintf(STDERR, " {common.specific_options}:\n");
    305303    }
    306304    else {
    307         &gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
     305        gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
    308306    }
    309307
     
    376374             "allow_extra_options")) {
    377375
    378     &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
     376    gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
    379377        bless $self, $class;
    380378    $self->print_txt_usage("");  # Use default resource bundle
     
    667665
    668666    if ($self->is_recursive()) {
    669     &gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
     667    gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
    670668    }
    671669
     
    722720    if (!length ($text)) {
    723721    my $plugin_name = ref ($self);
    724     &gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
     722    gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
    725723
    726724    my $failhandle = $self->{'failhandle'};
    727     &gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
     725    gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
    728726    # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
    729727    $self->{'num_not_processed'} ++;
     
    777775    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    778776
    779     &gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
     777    gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
    780778    # die "Basplug::process function must be implemented in sub-class\n";
    781779
     
    794792    {
    795793    my $outhandle = $self->{'outhandle'};
    796     &gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
     794    gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
    797795    # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
    798796    return;
     
    801799    $$textref = "";
    802800
    803     open (FILE, $filename) || (&gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n");
    804     # open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
     801    if (!open (FILE, $filename)) {
     802    gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
     803     die "\n";
     804     }
    805805
    806806    if ($encoding eq "ascii") {
     
    868868        my $plugin_name = ref ($self);
    869869        my $outhandle = $self->{'outhandle'};
    870         &gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
     870        gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
    871871            # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
    872872            # print $outhandle "appears to be encoded as $extracted_encoding.\n";
     
    886886    my ($filename) = @_;
    887887    my $outhandle = $self->{'outhandle'};
    888 
     888    my $unicode_format = "";
    889889    # read in file
    890     open (FILE, $filename) || (&gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
     890    open (FILE, $filename) || (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
    891891    undef $/;
    892892    my $text = <FILE>;
     
    894894    close FILE;
    895895
     896    # check if first few bytes have a Byte Order Marker
     897    my $bom=substr($text,0,2); # check 16bit unicode
     898    if ($bom eq "\xff\xfe") { # little endian 16bit unicode
     899    $unicode_format="unicode";
     900    } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
     901    $unicode_format="unicode";
     902    } else {
     903    $bom=substr($text,0,3); # check utf-8
     904    if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
     905        $unicode_format="utf8";
     906#   } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
     907#       $unicode_format="utf8";
     908    }
     909    }
     910
     911
    896912    # remove <title>stuff</title> -- as titles tend often to be in English
    897913    # for foreign language documents
     
    899915
    900916    # remove all HTML tags
    901     $text =~ s/<[^>]*>//sg;
     917    # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
     918    if (ref($self) eq 'HTMLPlug' ||
     919    (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
     920    $text =~ s/<[^>]*>//sg;
     921    }
    902922
    903923    # get the language/encoding
     
    907927    # first one in the list - otherwise use the defaults
    908928    if (scalar @$results > 3) {
    909     # changed 12 Feb 2003 by jrm21
    910     # use the most popular encoding at least... otherwise we might
    911     # generate invalid archive files!
    912     my %guessed_encodings = ();
    913     foreach my $result (@$results) {
    914         $result =~ /([^\-]+)$/;
    915         my $enc=$1;
    916         if (!defined($guessed_encodings{$enc})) {
    917         $guessed_encodings{$enc}=0;
     929    my $best_encoding="";
     930    if ($unicode_format) { # in case the first had a BOM
     931        $best_encoding=$unicode_format;
     932    } else {
     933        my %guessed_encodings = ();
     934        foreach my $result (@$results) {
     935        $result =~ /([^\-]+)$/;
     936        my $enc=$1;
     937        if (!defined($guessed_encodings{$enc})) {
     938            $guessed_encodings{$enc}=0;
     939        }
     940        $guessed_encodings{$enc}++;
    918941        }
    919         $guessed_encodings{$enc}++;
    920     }
    921     my $best_encoding="";
    922     $guessed_encodings{""}=-1;
    923     foreach my $enc (keys %guessed_encodings) {
    924         if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){
    925         $best_encoding=$enc;
     942
     943        $guessed_encodings{""}=-1; # for default best_encoding of ""
     944        foreach my $enc (keys %guessed_encodings) {
     945        if ($guessed_encodings{$enc} >
     946            $guessed_encodings{$best_encoding}){
     947            $best_encoding=$enc;
     948        }
    926949        }
    927950    }
     
    929952    if ($self->{'input_encoding'} ne 'auto') {
    930953        if ($self->{'extract_language'} && $self->{'verbosity'}) {
    931         &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
    932         # print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
    933         # print $outhandle "defaulting to $self->{'default_language'}\n";
     954        gsprintf($outhandle,
     955             "BasPlug: {BasPlug.could_not_extract_language}\n",
     956             $filename, $self->{'default_language'});
    934957        }       
    935958        return ($self->{'default_language'}, $self->{'input_encoding'});
     
    937960    } else {
    938961        if ($self->{'verbosity'}) {
    939         &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
    940         # print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";
    941         # print $outhandle "defaulting to $self->{'default_language'}.\n";
     962        gsprintf($outhandle,
     963             "BasPlug: {BasPlug.could_not_extract_language}\n",
     964             $filename, $self->{'default_language'});
    942965        }
    943966        return ($self->{'default_language'}, $best_encoding);
     
    949972    if (!defined $language) {
    950973    if ($self->{'verbosity'}) {
    951         &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
    952         # print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
    953         # print $outhandle "defaulting to $self->{'default_language'}\n";
     974        gsprintf($outhandle,
     975             "BasPlug: {BasPlug.could_not_extract_language}\n",
     976             $filename, $self->{'default_language'});
    954977    }
    955978    $language = $self->{'default_language'};
     
    957980    if (!defined $encoding) {
    958981    if ($self->{'verbosity'}) {
    959         &gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_encoding}\n", $filename, $self->{'default_encoding'});
    960         # print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
    961         # print $outhandle "defaulting to $self->{'default_encoding'}\n";
     982        gsprintf($outhandle,
     983             "BasPlug: {BasPlug.could_not_extract_encoding}\n",
     984             $filename, $self->{'default_encoding'});
    962985    }
    963986    $encoding = $self->{'default_encoding'};
     
    10781101    my $outhandle = $self->{'outhandle'};
    10791102
    1080     # print $outhandle " extracting email addresses ...\n"
    1081     &gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
     1103    gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
    10821104    if ($self->{'verbosity'} > 2);
    10831105   
     
    10901112        push @email2, $address;
    10911113        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
    1092         # print $outhandle "  extracting $address\n"
    1093         &gsprintf($outhandle, "  {BasPlug.extracting} $address\n")
     1114        gsprintf($outhandle, "  {BasPlug.extracting} $address\n")
    10941115        if ($self->{'verbosity'} > 3);
    10951116    }
    10961117    }
    1097     # print $outhandle " done extracting email addresses.\n"
    1098     &gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
     1118    gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
    10991119    if ($self->{'verbosity'} > 2);
    11001120}
     
    11281148    my $thissection = $doc_obj->get_top_section();
    11291149    my $text = "";
    1130     my @list;
     1150    my $list;
    11311151
    11321152    #loop through sections to gather whole doc
     
    11461166    if ($list){
    11471167        # if a list of kea keyphrases was returned (ie not empty)
    1148         &gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
     1168        if ($self->{'verbosity'}) {
     1169        gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
     1170        }
    11491171
    11501172        #add metadata to top section
     
    12161238
    12171239    # print $outhandle " extracting acronyms ...\n"
    1218     &gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
     1240    gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
    12191241    if ($self->{'verbosity'} > 2);
    12201242
     
    12291251        if ($thisAcro eq $acro->to_string()) {
    12301252        $seen_before = "true";
    1231         # print $outhandle "  already seen ". $acro->to_string() . "\n"
    1232         &gsprintf($outhandle, " {BasPlug.already_seen} " . $acro->to_string() . "\n")
    1233             if ($self->{'verbosity'} >= 4);
     1253        if ($self->{'verbosity'} >= 4) {
     1254            gsprintf($outhandle, " {BasPlug.already_seen} " .
     1255                 $acro->to_string() . "\n");
     1256        }
    12341257        }
    12351258    }
     
    12411264        #do the normal acronym
    12421265        $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
    1243         # print $outhandle "  adding ". $acro->to_string() . "\n"
    1244         &gsprintf($outhandle, " {BasPlug.adding} " . $acro->to_string() . "\n")
     1266        gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
    12451267        if ($self->{'verbosity'} > 3);
    12461268    }
    12471269    }
    12481270
    1249     # print $outhandle " done extracting acronyms. \n"
    1250     &gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
     1271    gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
    12511272    if ($self->{'verbosity'} > 2);
    12521273}
     
    12571278    my $outhandle = $self->{'outhandle'};
    12581279
    1259     # print $outhandle " marking up acronyms ...\n"
    1260     &gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
     1280    gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
    12611281    if ($self->{'verbosity'} > 2);
    12621282
     
    12641284    $text = &acronym::markup_acronyms($text, $self);
    12651285
    1266     # print $outhandle " done marking up acronyms. \n"
    1267     &gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
     1286    gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
    12681287    if ($self->{'verbosity'} > 2);
    12691288
     
    12851304    my ($doc_obj, $filename) = @_;
    12861305
     1306    my $top_section=$doc_obj->get_top_section();
     1307
    12871308    $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
    12881309    if (-e $filename) {
    12891310    $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
    1290     $doc_obj->add_utf8_metadata($thissection, "hascover",  1);
     1311    $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
    12911312    } else {
    12921313    $filename =~ s/jpg$/JPG/;
    12931314    if (-e $filename) {
    12941315        $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
    1295         $doc_obj->add_utf8_metadata($thissection, "hascover",  1);
     1316        $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
    12961317    }
    12971318    }
Note: See TracChangeset for help on using the changeset viewer.