Ignore:
Timestamp:
2021-02-26T19:39:51+13:00 (3 years ago)
Author:
anupama
Message:

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators were stored as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. Kathy and Dr Bainbridge also came up with a new option, which I added, called apply_join_before_split_to_metafields: it is a regex listing the metadata fields to which join_before_split should be applied. Previously, join_before_split was always applied to all metadata fields. Now it is applied to any *Keywords metafield by default, as that is the one metafield we have experience of that behaves differently from the others, in that it stores values by word instead of by phrase. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it, and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cpan/Image/ExifTool/DjVu.pm

    r24107 r34921  
    1919use Image::ExifTool qw(:DataAccess :Utils);
    2020
    21 $VERSION = '1.03';
     21$VERSION = '1.06';
    2222
    2323sub ParseAnt($);
     
    2929%Image::ExifTool::DjVu::Main = (
    3030    GROUPS => { 2 => 'Image' },
    31     NOTES => 'Information is extracted from the following chunks in DjVu images.',
     31    NOTES => q{
     32        Information is extracted from the following chunks in DjVu images. See
     33        L<http://www.djvu.org/> for the DjVu specification.
     34    },
    3235    INFO => {
    3336        SubDirectory => { TagTable => 'Image::ExifTool::DjVu::Info' },
     
    136139        documentation endorses tags borrowed from two standards: 1) BibTeX
    137140        bibliography system tags (all lowercase Tag ID's in the table below), and 2)
    138         PDF DocInfo tags (uppercase Tag ID's).
     141        PDF DocInfo tags (capitalized Tag ID's).
    139142    },
    140143    # BibTeX tags (ref http://en.wikipedia.org/wiki/BibTeX)
     
    225228                $tok .= '"';    # quote is part of the string
    226229            }
     230            # must protect unescaped "$" and "@" symbols, and "\" at end of string
     231            $tok =~ s{\\(.)|([\$\@]|\\$)}{'\\'.($2 || $1)}sge;
    227232            # convert C escape sequences (allowed in quoted text)
    228233            $tok = eval qq{"$tok"};
     
    231236            # allow anything in key but whitespace, braces and double quotes
    232237            # (this is one of those assumptions I mentioned)
    233             $$dataPt =~ /([^\s()"]+)/sg;
    234             $tok = $1;
     238            $tok = $$dataPt =~ /([^\s()"]+)/sg ? $1 : undef;
    235239        }
    236240        push @toks, $tok if defined $tok;
     
    247251sub ProcessAnt($$$)
    248252{
    249     my ($exifTool, $dirInfo, $tagTablePtr) = @_;
     253    my ($et, $dirInfo, $tagTablePtr) = @_;
    250254    my $dataPt = $$dirInfo{DataPt};
    251255
     
    265269        if ($tag eq 'metadata') {
    266270            # ProcessMeta() takes array reference
    267             $exifTool->HandleTag($tagTablePtr, $tag, $ant);
     271            $et->HandleTag($tagTablePtr, $tag, $ant);
    268272        } else {
    269273            next if ref $$ant[0];   # only process simple values
    270             $exifTool->HandleTag($tagTablePtr, $tag, $$ant[0]);
     274            $et->HandleTag($tagTablePtr, $tag, $$ant[0]);
    271275        }
    272276    }
     
    281285sub ProcessMeta($$$)
    282286{
    283     my ($exifTool, $dirInfo, $tagTablePtr) = @_;
     287    my ($et, $dirInfo, $tagTablePtr) = @_;
    284288    my $dataPt = $$dirInfo{DataPt};
    285289    return 0 unless ref $$dataPt eq 'ARRAY';
    286     $exifTool->VerboseDir('Metadata', scalar @$$dataPt);
     290    $et->VerboseDir('Metadata', scalar @$$dataPt);
    287291    my ($item, $err);
    288292    foreach $item (@$$dataPt) {
     
    295299            $name =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters
    296300            length $name or $err = 1, next;
    297             Image::ExifTool::AddTagToTable($tagTablePtr, $$item[0], { Name => ucfirst($name) });
     301            AddTagToTable($tagTablePtr, $$item[0], { Name => ucfirst($name) });
    298302        }
    299         $exifTool->HandleTag($tagTablePtr, $$item[0], $$item[1]);
    300     }
    301     $err and $exifTool->Warn('Ignored invalid metadata entry(s)');
     303        $et->HandleTag($tagTablePtr, $$item[0], $$item[1]);
     304    }
     305    $err and $et->Warn('Ignored invalid metadata entry(s)');
    302306    return 1;
    303307}
     
    309313sub ProcessBZZ($$$)
    310314{
    311     my ($exifTool, $dirInfo, $tagTablePtr) = @_;
     315    my ($et, $dirInfo, $tagTablePtr) = @_;
    312316    require Image::ExifTool::BZZ;
    313317    my $buff = Image::ExifTool::BZZ::Decode($$dirInfo{DataPt});
    314318    unless (defined $buff) {
    315         $exifTool->Warn("Error decoding $$dirInfo{DirName}");
     319        $et->Warn("Error decoding $$dirInfo{DirName}");
    316320        return 0;
    317321    }
    318     my $verbose = $exifTool->Options('Verbose');
     322    my $verbose = $et->Options('Verbose');
    319323    if ($verbose >= 3) {
    320324        # dump the decoded data in very verbose mode
    321         $exifTool->VerboseDir("Decoded $$dirInfo{DirName}", 0, length $buff);
    322         $exifTool->VerboseDump(\$buff);
     325        $et->VerboseDir("Decoded $$dirInfo{DirName}", 0, length $buff);
     326        $et->VerboseDump(\$buff);
    323327    }
    324328    $$dirInfo{DataPt} = \$buff;
     
    326330    # process the data using the default process proc for this table
    327331    my $processProc = $$tagTablePtr{PROCESS_PROC} or return 0;
    328     return &$processProc($exifTool, $dirInfo, $tagTablePtr);
     332    return &$processProc($et, $dirInfo, $tagTablePtr);
    329333}
    330334
     
    349353=head1 AUTHOR
    350354
    351 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
     355Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
    352356
    353357This library is free software; you can redistribute it and/or modify it
Note: See TracChangeset for help on using the changeset viewer.