Ignore:
Timestamp:
2021-02-26T19:39:51+13:00 (3 years ago)
Author:
anupama
Message:

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cpan/Image/ExifTool/Import.pm

    r24107 r34921  
    1313use vars qw($VERSION @ISA @EXPORT_OK);
    1414
    15 $VERSION = '1.00';
     15$VERSION = '1.10';
    1616@ISA = qw(Exporter);
    1717@EXPORT_OK = qw(ReadCSV ReadJSON);
     
    1919sub ReadJSONObject($;$);
    2020
    21 my %unescapeJSON = ( 't'=>"\t", 'n'=>"\n", 'r'=>"\r" );
     21my %unescapeJSON = ( 't'=>"\t", 'n'=>"\n", 'r'=>"\r", 'b' => "\b", 'f' => "\f" );
    2222my $charset;
    2323
    2424#------------------------------------------------------------------------------
    2525# Read CSV file
    26 # Inputs: 0) CSV file name, 1) database hash ref, 2) flag to delete "-" tags
     26# Inputs: 0) CSV file name, file ref or RAF ref, 1) database hash ref,
     27#         2) missing tag value, 3) delimiter if other than ','
    2728# Returns: undef on success, or error string
    2829# Notes: There are various flavours of CSV, but here we assume that only
    2930#        double quotes are escaped, and they are escaped by doubling them
    30 sub ReadCSV($$;$)
     31sub ReadCSV($$;$$)
    3132{
    3233    local ($_, $/);
    33     my ($file, $database, $delDash) = @_;
    34     my ($buff, @tags, $found, $err);
    35 
    36     open CSVFILE, $file or return "Error opening CSV file '$file'";
    37     binmode CSVFILE;
    38     my $raf = new File::RandomAccess(\*CSVFILE);
     34    my ($file, $database, $missingValue, $delim) = @_;
     35    my ($buff, @tags, $found, $err, $raf, $openedFile);
     36
     37    if (UNIVERSAL::isa($file, 'File::RandomAccess')) {
     38        $raf = $file;
     39        $file = 'CSV file';
     40    } elsif (ref $file eq 'GLOB') {
     41        $raf = new File::RandomAccess($file);
     42        $file = 'CSV file';
     43    } else {
     44        open CSVFILE, $file or return "Error opening CSV file '${file}'";
     45        binmode CSVFILE;
     46        $openedFile = 1;
     47        $raf = new File::RandomAccess(\*CSVFILE);
     48    }
     49    $delim = ',' unless defined $delim;
    3950    # set input record separator by first newline found in the file
    4051    # (safe because first line should contain only tag names)
     
    4556    while ($raf->ReadLine($buff)) {
    4657        my (@vals, $v, $i, %fileInfo);
    47         my @toks = split ',', $buff;
     58        my @toks = split /\Q$delim/, $buff;
    4859        while (@toks) {
    4960            ($v = shift @toks) =~ s/^ +//;  # remove leading spaces
     
    5263                while ($v !~ /("+)\s*$/ or not length($1) & 1) {
    5364                    if (@toks) {
    54                         $v .= ',' . shift @toks;
     65                        $v .= $delim . shift @toks;
    5566                    } else {
    5667                        # read another line from the file
    5768                        $raf->ReadLine($buff) or last;
    58                         @toks = split ',', $buff;
     69                        @toks = split /\Q$delim/, $buff;
    5970                        last unless @toks;
    6071                        $v .= shift @toks;
     
    7182            # save values for each tag
    7283            for ($i=0; $i<@vals and $i<@tags; ++$i) {
    73                 next unless length $vals[$i];   # ignore empty entries
    74                 # delete tag if value (set value to undef) is '-' and -f option is used
    75                 $fileInfo{$tags[$i]} = ($vals[$i] eq '-' and $delDash) ? undef : $vals[$i];
     84                # ignore empty entries unless missingValue is empty too
     85                next unless length $vals[$i] or defined $missingValue and $missingValue eq '';
     86                # delete tag (set value to undef) if value is same as missing tag
     87                $fileInfo{$tags[$i]} =
     88                    (defined $missingValue and $vals[$i] eq $missingValue) ? undef : $vals[$i];
    7689            }
    7790            # figure out the file name to use
     
    8396            # the first row should be the tag names
    8497            foreach (@vals) {
    85                 /^[-\w]+(:[-\w+]+)?#?$/ or $err = "Invalid tag name '$_'", last;
     98                # terminate at first blank tag name (eg. extra comma at end of line)
     99                last unless length $_;
     100                @tags or s/^\xef\xbb\xbf//; # remove UTF-8 BOM if it exists
     101                /^[-\w]+(:[-\w+]+)?#?$/ or $err = "Invalid tag name '${_}'", last;
    86102                push(@tags, $_);
    87103            }
    88104            last if $err;
    89105            @tags or $err = 'No tags found', last;
    90         }
    91     }
    92     close CSVFILE;
     106            # fix "SourceFile" case if necessary
     107            $tags[0] = 'SourceFile' if lc $tags[0] eq 'sourcefile';
     108        }
     109    }
     110    close CSVFILE if $openedFile;
    93111    undef $raf;
    94112    $err = 'No SourceFile column' unless $found or $err;
     
    108126#------------------------------------------------------------------------------
    109127# Read JSON object from file
    110 # Inputs: 0) JSON file handle, 1) optional file buffer reference
     128# Inputs: 0) RAF reference or undef, 1) optional scalar reference for data
     129#            to read before reading from file (ie. the file read buffer)
    111130# Returns: JSON object (scalar, hash ref, or array ref), or undef on EOF or
    112131#          empty object or array (and sets $$buffPt to empty string on EOF)
     
    114133sub ReadJSONObject($;$)
    115134{
    116     my ($fp, $buffPt) = @_;
     135    my ($raf, $buffPt) = @_;
    117136    # initialize buffer if necessary
    118     my ($pos, $readMore, $rtnVal, $tok, $key);
     137    my ($pos, $readMore, $rtnVal, $tok, $key, $didBOM);
    119138    if ($buffPt) {
    120139        $pos = pos $$buffPt;
     140        $pos = pos($$buffPt) = 0 unless defined $pos;
    121141    } else {
    122142        my $buff = '';
     
    125145    }
    126146Tok: for (;;) {
     147        # (didn't spend the time to understand how $pos could be undef, but
     148        #  put a test here to be safe because one user reported this problem)
     149        last unless defined $pos;
    127150        if ($pos >= length $$buffPt or $readMore) {
     151            last unless defined $raf;
    128152            # read another 64kB and add to unparsed data
    129153            my $offset = length($$buffPt) - $pos;
    130             $$buffPt = substr($$buffPt, $pos) if $offset;
    131             read $fp, $$buffPt, 65536, $offset or $$buffPt = '', last;
     154            if ($offset) {
     155                my $buff;
     156                $raf->Read($buff, 65536) or $$buffPt = '', last;
     157                $$buffPt = substr($$buffPt, $pos) . $buff;
     158            } else {
     159                $raf->Read($$buffPt, 65536) or $$buffPt = '', last;
     160            }
     161            unless ($didBOM) {
     162                $$buffPt =~ s/^\xef\xbb\xbf//;  # remove UTF-8 BOM if it exists
     163                $didBOM = 1;
     164            }
    132165            $pos = pos($$buffPt) = 0;
    133166            $readMore = 0;
     
    145178                # read "KEY":"VALUE" pairs
    146179                unless (defined $key) {
    147                     $key = ReadJSONObject($fp, $buffPt);
     180                    $key = ReadJSONObject($raf, $buffPt);
    148181                    $pos = pos $$buffPt;
    149182                }
     
    153186                    $$buffPt =~ /(\S)/g or $readMore = 1, next Tok;
    154187                    $1 eq ':' or return undef;  # error if not a colon
    155                     my $val = ReadJSONObject($fp, $buffPt);
     188                    my $val = ReadJSONObject($raf, $buffPt);
    156189                    $pos = pos $$buffPt;
    157190                    return undef unless defined $val;
     
    167200            $rtnVal = [ ] unless defined $rtnVal;
    168201            for (;;) {
    169                 my $item = ReadJSONObject($fp, $buffPt);
     202                my $item = ReadJSONObject($raf, $buffPt);
    170203                $pos = pos $$buffPt;
    171204                # ($item may be undef for empty array)
     
    185218            $rtnVal =~ s/\\u([0-9a-f]{4})/ToUTF8(hex $1)/ige;
    186219            $rtnVal =~ s/\\(.)/$unescapeJSON{$1}||$1/sge;
     220            # decode base64 (binary data) values
     221            if ($rtnVal =~ /^base64:[A-Za-z0-9+\/]*={0,2}$/ and length($rtnVal) % 4 == 3) {
     222                require Image::ExifTool::XMP;
     223                $rtnVal = ${Image::ExifTool::XMP::DecodeBase64(substr($rtnVal,7))};
     224            }
    187225        } elsif ($tok eq ']' or $tok eq '}' or $tok eq ',') {
    188226            # return undef for empty object, array, or list item
     
    201239#------------------------------------------------------------------------------
    202240# Read JSON file
    203 # Inputs: 0) JSON file name, 1) database hash ref, 2) flag to delete "-" tags
    204 #         2) character set
     241# Inputs: 0) JSON file name, file ref or RAF ref, 1) database hash ref,
     242#         2) flag to delete "-" tags, 3) character set
    205243# Returns: undef on success, or error string
    206244sub ReadJSON($$;$$)
    207245{
    208246    local $_;
    209     my ($file, $database, $delDash, $chset) = @_;
     247    my ($file, $database, $missingValue, $chset) = @_;
     248    my ($raf, $openedFile);
    210249
    211250    # initialize character set for converting "\uHHHH" chars
    212251    $charset = $chset || 'UTF8';
    213     open JSONFILE, $file or return "Error opening JSON file '$file'";
    214     binmode JSONFILE;
    215     my $obj = ReadJSONObject(\*JSONFILE);
    216     close JSONFILE;
     252    if (UNIVERSAL::isa($file, 'File::RandomAccess')) {
     253        $raf = $file;
     254        $file = 'JSON file';
     255    } elsif (ref $file eq 'GLOB') {
     256        $raf = new File::RandomAccess($file);
     257        $file = 'JSON file';
     258    } else {
     259        open JSONFILE, $file or return "Error opening JSON file '${file}'";
     260        binmode JSONFILE;
     261        $openedFile = 1;
     262        $raf = new File::RandomAccess(\*JSONFILE);
     263    }
     264    my $obj = ReadJSONObject($raf);
     265    close JSONFILE if $openedFile;
    217266    unless (ref $obj eq 'ARRAY') {
    218         ref $obj eq 'HASH' or return "Format error in JSON file '$file'";
     267        ref $obj eq 'HASH' or return "Format error in JSON file '${file}'";
    219268        $obj = [ $obj ];
    220269    }
    221270    my ($info, $found);
    222271    foreach $info (@$obj) {
    223         next unless ref $info eq 'HASH' and $$info{SourceFile};
    224         if ($delDash) {
    225             $$info{$_} eq '-' and $$info{$_} = undef foreach keys %$info;
     272        next unless ref $info eq 'HASH';
     273        # fix "SourceFile" case, or assume '*' if SourceFile not specified
     274        unless (defined $$info{SourceFile}) {
     275            my ($key) = grep /^SourceFile$/i, keys %$info;
     276            if ($key) {
     277                $$info{SourceFile} = $$info{$key};
     278                delete $$info{$key};
     279            } else {
     280                $$info{SourceFile} = '*';
     281            }
     282        }
     283        if (defined $missingValue) {
     284            $$info{$_} eq $missingValue and $$info{$_} = undef foreach keys %$info;
    226285        }
    227286        $$database{$$info{SourceFile}} = $info;
    228287        $found = 1;
    229288    }
    230     return $found ? undef : "No SourceFile entries in '$file'";
     289    return $found ? undef : "No valid JSON objects in '${file}'";
    231290}
    232291
     
    267326=item Inputs:
    268327
    269 0) CSV file name.
     3280) CSV file name or file reference.
    270329
    2713301) Hash reference for database object.
    272331
    273 2) Optional flag to set '-' values to undef in the database.  (Used for
    274 deleting tags.)
    275 
    276 3) [ReadJSON only] Optional character set for converting Unicode escape
    277 sequences in strings.  Defaults to "UTF8".  See the ExifTool Charset option
    278 for a list of valid settings.
     3322) Optional string used to represent an undefined (missing) tag value.
     333(Used for deleting tags.)
     334
     3353) For ReadCSV this gives the delimiter for CSV entries, with a default of
     336",".  For ReadJSON this is the character set for converting Unicode escape
     337sequences in strings, with a default of "UTF8".  See the ExifTool Charset
     338option for a list of valid character sets.
    279339
    280340=item Return Value:
     
    289349=head1 AUTHOR
    290350
    291 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
     351Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
    292352
    293353This library is free software; you can redistribute it and/or modify it
Note: See TracChangeset for help on using the changeset viewer.