Ignore:
Timestamp:
2021-02-26T19:39:51+13:00 (3 years ago)
Author:
anupama
Message:

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. Kathy and Dr Bainbridge also came up with a new option, which I added, called apply_join_before_split_to_metafields: a regex that lists the metadata fields to apply join_before_split to. Previously, join_before_split was always applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, since it stores values by word instead of by phrase. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it, and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cpan/Image/ExifTool/OOXML.pm

    r24107 r34921  
    1515use Image::ExifTool::ZIP;
    1616
    17 $VERSION = '1.05';
     17$VERSION = '1.08';
    1818
    1919# test for recognized OOXML document extensions
     
    2222    DOTX => 1,  DOTM => 1,
    2323    POTX => 1,  POTM => 1,
     24    PPAX => 1,  PPAM => 1,
    2425    PPSX => 1,  PPSM => 1,
    2526    PPTX => 1,  PPTM => 1,  THMX => 1,
     
    6465
    6566        B<Tips:>
    66        
     67
    6768        1) Structural ZIP tags may be ignored (if desired) with C<--ZIP:all> on the
    6869        command line.
    69        
     70
    7071        2) Tags may be grouped by their document number in the ZIP archive with the
    7172        C<-g3> or C<-G3> option.
     
    135136    MMClips     => { },
    136137    modified    => {
    137         Name => 'ModifyDate', 
     138        Name => 'ModifyDate',
    138139        Groups => { 2 => 'Time' },
    139140        Format => 'date',
     
    209210sub FoundTag($$$$;$)
    210211{
    211     my ($exifTool, $tagTablePtr, $props, $val, $attrs) = @_;
     212    my ($et, $tagTablePtr, $props, $val, $attrs) = @_;
    212213    return 0 unless @$props;
    213     my $verbose = $exifTool->Options('Verbose');
     214    my $verbose = $et->Options('Verbose');
    214215
    215216    my $tag = $$props[-1];
    216     $exifTool->VPrint(0, "  | - Tag '", join('/',@$props), "'\n") if $verbose > 1;
     217    $et->VPrint(0, "  | - Tag '", join('/',@$props), "'\n") if $verbose > 1;
    217218
    218219    # un-escape XML character entities
    219220    $val = Image::ExifTool::XMP::UnescapeXML($val);
    220     # convert OOXML-escaped characters (ie. "_x0000d_" is a newline)
     221    # convert OOXML-escaped characters (eg. "_x0000d_" is a newline)
    221222    $val =~ s/_x([0-9a-f]{4})_/Image::ExifTool::PackUTF8(hex($1))/gie;
    222223    # convert from UTF8 to ExifTool Charset
    223     $val = $exifTool->Decode($val, 'UTF8');
     224    $val = $et->Decode($val, 'UTF8');
    224225    # queue this attribute for later if necessary
    225226    if ($queueAttrs{$tag}) {
     
    247248                $tagInfo{PrintConv} = '$self->ConvertDateTime($val)';
    248249            }
    249             $exifTool->VPrint(0, "  | [adding $tag]\n") if $verbose;
    250             Image::ExifTool::AddTagToTable($tagTablePtr, $tag, \%tagInfo);
     250            $et->VPrint(0, "  | [adding $tag]\n") if $verbose;
     251            AddTagToTable($tagTablePtr, $tag, \%tagInfo);
    251252        }
    252253    } elsif ($tag eq 'xmlns') {
     
    289290        }
    290291    } else {
    291         $exifTool->VPrint(0, "  [adding $tag]\n") if $verbose;
    292         Image::ExifTool::AddTagToTable($tagTablePtr, $tag, { Name => ucfirst $tag });
     292        $et->VPrint(0, "  [adding $tag]\n") if $verbose;
     293        AddTagToTable($tagTablePtr, $tag, { Name => ucfirst $tag });
    293294    }
    294295    # save the tag
    295     $exifTool->HandleTag($tagTablePtr, $tag, $val);
     296    $et->HandleTag($tagTablePtr, $tag, $val);
    296297
    297298    # start fresh for next tag
     
    312313sub ProcessDOCX($$)
    313314{
    314     my ($exifTool, $dirInfo) = @_;
     315    my ($et, $dirInfo) = @_;
    315316    my $zip = $$dirInfo{ZIP};
    316317    my $tagTablePtr = GetTagTable('Image::ExifTool::OOXML::Main');
     
    321322    if ($fileType) {
    322323        # THMX is a special case because its contents.main MIME types is PPTX
    323         if ($fileType eq 'PPTX' and $$exifTool{FILE_EXT} and $$exifTool{FILE_EXT} eq 'THMX') {
     324        if ($fileType eq 'PPTX' and $$et{FILE_EXT} and $$et{FILE_EXT} eq 'THMX') {
    324325            $fileType = 'THMX';
    325326        }
    326327    } else {
    327         $exifTool->VPrint(0, "Unrecognized MIME type: $mime\n");
     328        $et->VPrint(0, "Unrecognized MIME type: $mime\n");
    328329        # get MIME type according to file extension
    329         $fileType = $$exifTool{FILE_EXT};
     330        $fileType = $$et{FILE_EXT};
    330331        # default to 'DOCX' if this isn't a known OOXML extension
    331332        $fileType = 'DOCX' unless $fileType and $isOOXML{$fileType};
    332333    }
    333     $exifTool->SetFileType($fileType);
     334    $et->SetFileType($fileType);
    334335
    335336    # must catch all Archive::Zip warnings
     
    343344        my $file = $member->fileName();
    344345        next unless defined $file;
    345         $exifTool->VPrint(0, "File: $file\n");
     346        $et->VPrint(0, "File: $file\n");
    346347        # set the document number and extract ZIP tags
    347         $$exifTool{DOC_NUM} = ++$docNum;
    348         Image::ExifTool::ZIP::HandleMember($exifTool, $member);
    349         # process only XML and JPEG files in "docProps" directory
    350         next unless $file =~ m{^docProps/.*\.(xml|jpe?g)$}i;
     348        $$et{DOC_NUM} = ++$docNum;
     349        Image::ExifTool::ZIP::HandleMember($et, $member);
     350        # process only XML and JPEG/WMF thumbnail images in "docProps" directory
     351        next unless $file =~ m{^docProps/(.*\.xml|(thumbnail\.(jpe?g|wmf)))$}i;
    351352        # get the file contents (CAREFUL! $buff MUST be local since we hand off a value ref)
    352353        my ($buff, $status) = $zip->contents($member);
    353         $status and $exifTool->Warn("Error extracting $file"), next;
    354         # extract JPEG as PreviewImage (should only be docProps/thumbnail.jpeg)
    355         if ($file =~ /\.jpe?g/i) {
    356             $exifTool->FoundTag('PreviewImage', \$buff);
     354        $status and $et->Warn("Error extracting $file"), next;
     355        # extract docProps/thumbnail.(jpg|mwf) as PreviewImage|PreviewMWF
     356        if ($file =~ /\.(jpe?g|wmf)$/i) {
     357            my $tag = $file =~ /\.wmf$/i ? 'PreviewWMF' : 'PreviewImage';
     358            $et->FoundTag($tag, \$buff);
    357359            next;
    358360        }
     
    366368            },
    367369        );
    368         $exifTool->ProcessDirectory(\%dirInfo, $tagTablePtr);
     370        $et->ProcessDirectory(\%dirInfo, $tagTablePtr);
    369371        undef $buff;    # (free memory now)
    370372    }
    371     delete $$exifTool{DOC_NUM};
     373    delete $$et{DOC_NUM};
    372374    return 1;
    373375}
     
    394396=head1 AUTHOR
    395397
    396 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
     398Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
    397399
    398400This library is free software; you can redistribute it and/or modify it
Note: See TracChangeset for help on using the changeset viewer.