Ignore:
Timestamp:
2021-02-26T19:39:51+13:00 (3 years ago)
Author:
anupama
Message:

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators were stored as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. Kathy and Dr Bainbridge also came up with a new option, which I added, called apply_join_before_split_to_metafields: a regex listing the metadata fields to apply join_before_split to, which previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, since it stores by word instead of by phrase. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it, and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cpan/Image/ExifTool/iWork.pm

    r24107 r34921  
    1515use Image::ExifTool::ZIP;
    1616
    17 $VERSION = '1.02';
    18 
    19 # test for recognized iWork document extensions and outter XML elements
     17$VERSION = '1.06';
     18
     19# test for recognized iWork document extensions and outer XML elements
    2020my %iWorkType = (
    2121    # file extensions
    22     NUMBERS => 'Apple Numbers',
    23     PAGES   => 'Apple Pages',
    24     KEY     => 'Apple Keynote',
    25     KTH     => 'Apple Keynote Theme',
    26     NMBTEMPLATE => 'Apple Numbers Template',
     22    NUMBERS => 'NUMBERS',
     23    PAGES   => 'PAGES',
     24    KEY     => 'KEY',
     25    KTH     => 'KTH',
     26    NMBTEMPLATE => 'NMBTEMPLATE',
    2727    # we don't support double extensions --
    2828    # "PAGES.TEMPLATE" => 'Apple Pages Template',
    29     # outter XML elements
    30     'ls:document' => 'Apple Numbers',
    31     'sl:document' => 'Apple Pages',
    32     'key:presentation' => 'Apple Keynote',
     29    # outer XML elements
     30    'ls:document' => 'NUMBERS',
     31    'sl:document' => 'PAGES',
     32    'key:presentation' => 'KEY',
    3333);
    3434
     
    3737# indicates, but I think it refers to the new "flattened" package format)
    3838my %mimeType = (
    39     'Apple Numbers' => 'application/x-iwork-numbers-sffnumbers',
    40     'Apple Pages'   => 'application/x-iwork-pages-sffpages',
    41     'Apple Keynote' => 'application/x-iWork-keynote-sffkey',
    42     'Apple Numbers Template' => 'application/x-iwork-numbers-sfftemplate',
    43     'Apple Pages Template'   => 'application/x-iwork-pages-sfftemplate',
    44     'Apple Keynote Theme'    => 'application/x-iWork-keynote-sffkth',
     39    'NUMBERS'      => 'application/x-iwork-numbers-sffnumbers',
     40    'PAGES'         => 'application/x-iwork-pages-sffpages',
     41    'KEY'          => 'application/x-iWork-keynote-sffkey',
     42    'NMBTEMPLATE'  => 'application/x-iwork-numbers-sfftemplate',
     43    'PAGES.TEMPLATE'=> 'application/x-iwork-pages-sfftemplate',
     44    'KTH'           => 'application/x-iWork-keynote-sffkth',
    4545);
    4646
     
    7171    my $props = shift;
    7272    return 0 if $$props[-1] =~ /^\w+:ID$/;  # ignore ID tags
    73     return ($$props[0] =~ /.*?:(.*)/) ? $1 : $$props[0];
     73    return $$props[0] =~ /^.*?:(.*)/ ? $1 : $$props[0];
    7474}
    7575
     
    8282sub FoundTag($$$$;$)
    8383{
    84     my ($exifTool, $tagTablePtr, $props, $val, $attrs) = @_;
     84    my ($et, $tagTablePtr, $props, $val, $attrs) = @_;
    8585    return 0 unless @$props;
    86     my $verbose = $exifTool->Options('Verbose');
    87 
    88     $exifTool->VPrint(0, "  | - Tag '", join('/',@$props), "'\n") if $verbose > 1;
     86    my $verbose = $et->Options('Verbose');
     87
     88    $et->VPrint(0, "  | - Tag '", join('/',@$props), "'\n") if $verbose > 1;
    8989
    9090    # un-escape XML character entities
    9191    $val = Image::ExifTool::XMP::UnescapeXML($val);
    9292    # convert from UTF8 to ExifTool Charset
    93     $val = $exifTool->Decode($val, 'UTF8');
     93    $val = $et->Decode($val, 'UTF8');
    9494    my $tag = GetTagID($props) or return 0;
    9595
    9696    # add any unknown tags to table
    9797    unless ($$tagTablePtr{$tag}) {
    98         $exifTool->VPrint(0, "  [adding $tag]\n") if $verbose;
    99         Image::ExifTool::AddTagToTable($tagTablePtr, $tag, { Name => ucfirst $tag });
     98        $et->VPrint(0, "  [adding $tag]\n") if $verbose;
     99        AddTagToTable($tagTablePtr, $tag, { Name => ucfirst $tag });
    100100    }
    101101    # save the tag
    102     $exifTool->HandleTag($tagTablePtr, $tag, $val);
     102    $et->HandleTag($tagTablePtr, $tag, $val);
    103103
    104104    return 1;
     
    113113sub Process_iWork($$)
    114114{
    115     my ($exifTool, $dirInfo) = @_;
     115    my ($et, $dirInfo) = @_;
    116116    my $zip = $$dirInfo{ZIP};
    117117    my ($type, $index, $indexFile, $status);
     
    120120    local $SIG{'__WARN__'} = \&Image::ExifTool::ZIP::WarnProc;
    121121    # trust type given by file extension if available
    122     $type = $iWorkType{$$exifTool{FILE_EXT}} if $$exifTool{FILE_EXT};
     122    $type = $iWorkType{$$et{FILE_EXT}} if $$et{FILE_EXT};
    123123    unless ($type) {
    124124        # read the index file
     
    132132                }
    133133            }
     134        } else {
     135            @members = $zip->membersMatching('(?i)^.*\.(pages|numbers|key)/Index.*');
     136            if (@members) {
     137                my $tmp = $members[0]->fileName();
     138                $type = $iWorkType{uc $1} if $tmp =~ /\.(pages|numbers|key)/i;
     139            }
    134140        }
    135141        $type or $type = 'ZIP';     # assume ZIP by default
    136142    }
    137     $exifTool->SetFileType($type, $mimeType{$type});
     143    $et->SetFileType($type, $mimeType{$type});
    138144
    139145    my @members = $zip->members();
     
    144150        my $file = $member->fileName();
    145151        next unless defined $file;
    146         $exifTool->VPrint(0, "File: $file\n");
     152        $et->VPrint(0, "File: $file\n");
    147153        # set the document number and extract ZIP tags
    148         $$exifTool{DOC_NUM} = ++$docNum;
    149         Image::ExifTool::ZIP::HandleMember($exifTool, $member);
    150 
    151         # process only the index XML and JPEG thumbnail files
    152         next unless $file =~ m{^(index\.(xml|apxl)|QuickLook/Thumbnail\.jpg)$}i;
     154        $$et{DOC_NUM} = ++$docNum;
     155        Image::ExifTool::ZIP::HandleMember($et, $member);
     156
     157        # process only the index XML and JPEG thumbnail/preview files
     158        next unless $file =~ m{^(index\.(xml|apxl)|QuickLook/Thumbnail\.jpg|[^/]+/preview(-micro|-web)?.jpg)$}i;
    153159        # get the file contents if necessary
    154160        # (CAREFUL! $buff MUST be local since we hand off a value ref to PreviewImage)
     
    159165        } else {
    160166            ($buff, $status) = $zip->contents($member);
    161             $status and $exifTool->Warn("Error extracting $file"), next;
     167            $status and $et->Warn("Error extracting $file"), next;
    162168            $buffPt = \$buff;
    163169        }
    164170        # extract JPEG as PreviewImage (should only be QuickLook/Thumbnail.jpg)
    165171        if ($file =~ /\.jpg$/) {
    166             $exifTool->FoundTag('PreviewImage', $buffPt);
     172            my $type = ($file =~ /preview-(\w+)/) ? ($1 eq 'web' ? 'Other' : 'Thumbnail') : 'Preview';
     173            $et->FoundTag($type . 'Image', $buffPt);
    167174            next;
    168175        }
     
    183190        );
    184191        my $tagTablePtr = GetTagTable('Image::ExifTool::iWork::Main');
    185         $exifTool->ProcessDirectory(\%dirInfo, $tagTablePtr);
     192        $et->ProcessDirectory(\%dirInfo, $tagTablePtr);
    186193        undef $$buffPt; # (free memory now)
    187194    }
    188     delete $$exifTool{DOC_NUM};
     195    delete $$et{DOC_NUM};
    189196    return 1;
    190197}
     
    209216=head1 AUTHOR
    210217
    211 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
     218Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
    212219
    213220This library is free software; you can redistribute it and/or modify it
Note: See TracChangeset for help on using the changeset viewer.