Ignore:
Timestamp:
2021-02-26T19:39:51+13:00 (3 years ago)
Author:
anupama
Message:

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators were stored as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields: a regex listing the metadata fields to which join_before_split is applied, which previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of by phrase. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it, and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cpan/Image/ExifTool/Charset.pm

    r24107 r34921  
    1515use Image::ExifTool qw(:DataAccess :Utils);
    1616
    17 $VERSION = '1.07';
     17$VERSION = '1.11';
    1818
    1919my %charsetTable;   # character set tables we've loaded
     
    5858    Latin        => 0x101,
    5959    Latin2       => 0x101,
     60    DOSLatinUS   => 0x101,
     61    DOSLatin1    => 0x101,
     62    DOSCyrillic  => 0x101,
    6063    MacCroatian  => 0x101,
    6164    MacCyrillic  => 0x101,
     
    107110
    108111#------------------------------------------------------------------------------
     112# Does an array contain valid UTF-16 characters?
     113# Inputs: 0) array reference to list of UCS-2 values
     114# Returns: 0=invalid UTF-16, 1=valid UTF-16 with no surrogates, 2=valid UTF-16 with surrogates
     115sub IsUTF16($)
     116{
     117    local $_;
     118    my $uni = shift;
     119    my $surrogate;
     120    foreach (@$uni) {
     121        my $hiBits = ($_ & 0xfc00);
     122        if ($hiBits == 0xfc00) {
     123            # check for invalid values in UTF-16
     124            return 0 if $_ == 0xffff or $_ == 0xfffe or ($_ >= 0xfdd0 and $_ <= 0xfdef);
     125        } elsif ($surrogate) {
     126            return 0 if $hiBits != 0xdc00;
     127            $surrogate = 0;
     128        } else {
     129            return 0 if $hiBits == 0xdc00;
     130            $surrogate = 1 if $hiBits == 0xd800;
     131        }
     132    }
     133    return 1 if not defined $surrogate;
     134    return 2 unless $surrogate;
     135    return 0;
     136}
     137
     138#------------------------------------------------------------------------------
    109139# Decompose string with specified encoding into an array of integer code points
    110140# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
     
    115145# - byte order mark observed and then removed with UCS2 and UCS4
    116146# - no warnings are issued if ExifTool object is not provided
     147# - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong
    117148sub Decompose($$$;$)
    118149{
    119150    local $_;
    120     my ($exifTool, $val, $charset) = @_; # ($byteOrder assigned later if required)
     151    my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required)
    121152    my $type = $csType{$charset};
    122153    my (@uni, $conv);
     
    126157        unless ($conv) {
    127158            # (shouldn't happen)
    128             $exifTool->Warn("Invalid character set $charset") if $exifTool;
     159            $et->Warn("Invalid character set $charset") if $et;
    129160            return \@uni;   # error!
    130161        }
     
    141172            @uni = unpack($] < 5.010000 ? 'U0U*' : 'C0U*', $val);
    142173            # issue warning if we had errors
    143             if ($Image::ExifTool::evalWarning and $exifTool and not $$exifTool{WarnBadUTF8}) {
    144                 $exifTool->Warn('Malformed UTF-8 character(s)');
    145                 $$exifTool{WarnBadUTF8} = 1;
     174            if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) {
     175                $et->Warn('Malformed UTF-8 character(s)');
     176                $$et{WarnBadUTF8} = 1;
    146177            }
    147178        }
     
    196227                    $fmt =~ tr/nvNV/vnVN/;
    197228                    @uni = unpack($fmt, $val);
     229                    $$et{WrongByteOrder} = 1;
    198230                }
    199231            }
     
    225257                }
    226258                # use this byte order if there are fewer errors
    227                 return \@try if $e2 < $e1;
     259                if ($e2 < $e1) {
     260                    $$et{WrongByteOrder} = 1;
     261                    return \@try;
     262                }
    228263            }
    229264        } else {
     
    275310{
    276311    local $_;
    277     my ($exifTool, $uni, $charset) = @_; # ($byteOrder assigned later if required)
     312    my ($et, $uni, $charset) = @_; # ($byteOrder assigned later if required)
    278313    my ($outVal, $conv, $inv);
    279     $charset or $charset = $$exifTool{OPTIONS}{Charset};
     314    $charset or $charset = $$et{OPTIONS}{Charset};
    280315    my $csType = $csType{$charset};
    281316    if ($csType == 0x100) {     # UTF8 (also treat ASCII as UTF8)
     
    294329        $conv = LoadCharset($charset);
    295330        unless ($conv) {
    296             $exifTool->Warn("Missing charset $charset") if $exifTool;
     331            $et->Warn("Missing charset $charset") if $et;
    297332            return '';
    298333        }
     
    301336        unless ($inv) {
    302337            if (not $csType or $csType & 0x802) {
    303                 $exifTool->Warn("Invalid destination charset $charset") if $exifTool;
     338                $et->Warn("Invalid destination charset $charset") if $et;
    304339                return '';
    305340            }
     
    322357            next if $_ < 0x100 and not $$conv{$_};
    323358            $_ = ord('?');  # set invalid characters to '?'
    324             if ($exifTool and not $$exifTool{EncodingError}) {
    325                 $exifTool->Warn("Some character(s) could not be encoded in $charset");
    326                 $$exifTool{EncodingError} = 1;
     359            if ($et and not $$et{EncodingError}) {
     360                $et->Warn("Some character(s) could not be encoded in $charset");
     361                $$et{EncodingError} = 1;
    327362            }
    328363        }
     
    374409
    375410  UTF8, UTF16, UCS2, UCS4, Arabic, Baltic, Cyrillic, Greek, Hebrew, JIS,
    376   Latin, Latin2, MacArabic, MacChineseCN, MacChineseTW, MacCroatian,
    377   MacCyrillic, MacGreek, MacHebrew, MacIceland, MacJapanese, MacKorean,
    378   MacLatin2, MacRSymbol, MacRoman, MacRomanian, MacThai, MacTurkish,
    379   PDFDoc, RSymbol, ShiftJIS, Symbol, Thai, Turkish, Vietnam
     411  Latin, Latin2, DOSLatinUS, DOSLatin1, DOSCyrillic, MacArabic,
     412  MacChineseCN, MacChineseTW, MacCroatian, MacCyrillic, MacGreek, MacHebrew,
     413  MacIceland, MacJapanese, MacKorean, MacLatin2, MacRSymbol, MacRoman,
     414  MacRomanian, MacThai, MacTurkish, PDFDoc, RSymbol, ShiftJIS, Symbol, Thai,
     415  Turkish, Vietnam
    380416
    381417However, only some of these character sets are available to the user via
    382 ExifTool options; the multi-byte character sets are used only internally
     418ExifTool options -- the multi-byte character sets are used only internally
    383419when decoding certain types of information.
    384420
    385421=head1 AUTHOR
    386422
    387 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
     423Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
    388424
    389425This library is free software; you can redistribute it and/or modify it
Note: See TracChangeset for help on using the changeset viewer.