Ignore:
Timestamp:
2021-02-26T19:39:51+13:00 (3 years ago)
Author:
anupama
Message:

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cpan/Image/ExifTool/WritePDF.pl

    r24107 r34921  
    3232);
    3333
     34# map for directories that we can add
     35my %pdfMap = (
     36    XMP => 'PDF',
     37);
     38
    3439#------------------------------------------------------------------------------
    3540# Validate raw PDF values for writing (string date integer real boolean name)
     
    3843sub CheckPDF($$$)
    3944{
    40     my ($exifTool, $tagInfo, $valPtr) = @_;
     45    my ($et, $tagInfo, $valPtr) = @_;
    4146    my $format = $$tagInfo{Writable} || $tagInfo->{Table}->{WRITABLE};
    4247    if (not $format) {
    4348        return 'No writable format';
    4449    } elsif ($format eq 'string') {
    45         # convert to Unicode if necessary
    46         if ($$valPtr =~ /[\x80-\xff]/) {
    47             $$valPtr = "\xfe\xff" . $exifTool->Encode($$valPtr,'UCS2','MM');
    48         }
     50        # (encode later because list-type string tags need to be encoded as a unit)
    4951    } elsif ($format eq 'date') {
    5052        # be flexible about this for now
     
    5961        return 'Invalid PDF name' if $$valPtr =~ /\0/;
    6062    } else {
    61         return "Invalid PDF format '$format'";
     63        return "Invalid PDF format '${format}'";
    6264    }
    6365    return undef;   # value is OK
     
    6668#------------------------------------------------------------------------------
    6769# Format value for writing to PDF file
    68 # Inputs: 0) value, 1) format string (string,date,integer,real,boolean,name)
     70# Inputs: 0) ExifTool ref, 1) value, 2) format string (string,date,integer,real,boolean,name)
    6971# Returns: formatted value or undef on error
    7072# Notes: Called at write time, so $pdfVer may be checked
    71 sub WritePDFValue($$)
     73sub WritePDFValue($$$)
    7274{
    73     my ($val, $format) = @_;
     75    my ($et, $val, $format) = @_;
    7476    if (not $format) {
    7577        return undef;
    7678    } elsif ($format eq 'string') {
     79        # encode as UCS2 if it contains any special characters
     80        $val = "\xfe\xff" . $et->Encode($val,'UCS2','MM') if $val =~ /[\x80-\xff]/;
    7781        EncodeString(\$val);
    7882    } elsif ($format eq 'date') {
    7983        # convert date to "D:YYYYmmddHHMMSS+-HH'MM'" format
    80         $val =~ s/([-+]\d{2}):(\d{2})/$1'$2'/;  # change timezone delimiters if necessary
     84        $val =~ s/([-+]\d{2}):(\d{2})/${1}'${2}'/;  # change timezone delimiters if necessary
    8185        $val =~ tr/ ://d;                       # remove spaces and colons
    8286        $val =  "D:$val";                       # add leading "D:"
     
    273277sub WritePDF($$)
    274278{
    275     my ($exifTool, $dirInfo) = @_;
     279    my ($et, $dirInfo) = @_;
    276280    my $raf = $$dirInfo{RAF};
    277281    my $outfile = $$dirInfo{OutFile};
     
    281285    # make sure this is a PDF file
    282286    my $pos = $raf->Tell();
    283     $raf->Read($buff, 10) >= 8 or return 0;
    284     $buff =~ /^%PDF-(\d+\.\d+)/ or return 0;
     287    $raf->Read($buff, 1024) >= 8 or return 0;
     288    $buff =~ /^(\s*)%PDF-(\d+\.\d+)/ or return 0;
     289    $$et{PDFBase} = length $1;
    285290    $raf->Seek($pos, 0);
    286291
     
    288293    my $newTool = new Image::ExifTool;
    289294    $newTool->Options(List => 1);
    290     $newTool->Options(Password => $exifTool->Options('Password'));
     295    $newTool->Options(Password => $et->Options('Password'));
    291296    $$newTool{PDF_CAPTURE} = \%capture;
    292297    my $info = $newTool->ImageInfo($raf, 'XMP', 'PDF:*', 'Error', 'Warning');
     
    294299    # (note: can't just check $$info{PDFVersion} due to possibility of XMP-pdf:PDFVersion)
    295300    my $vers = $newTool->GetInfo('PDF:PDFVersion');
    296     ($pdfVer) = values %$vers;
    297     $pdfVer or $exifTool->Error('Missing PDF:PDFVersion'), return 0;
     301    # take highest version number if multiple versions in an incremental save
     302    ($pdfVer) = sort { $b <=> $a } values %$vers;
     303    $pdfVer or $et->Error('Missing PDF:PDFVersion'), return 0;
    298304    # check version number
    299305    if ($pdfVer > 1.7) {
    300         if ($pdfVer >= 2.0) {
    301             $exifTool->Error("Can't yet write PDF version $pdfVer"); # (future major version changes)
    302             return 1;
    303         }
    304         $exifTool->Warn("ExifTool is untested with PDF version $pdfVer files", 1);
     306        $et->Warn("The PDF $pdfVer specification is not freely available", 1);
     307        # (so writing by ExifTool is based on trial and error)
    305308    }
    306309    # fail if we had any serious errors while extracting information
    307310    if ($capture{Error} or $$info{Error}) {
    308         $exifTool->Error($capture{Error} || $$info{Error});
     311        $et->Error($capture{Error} || $$info{Error});
    309312        return 1;
    310313    }
     
    313316        next if $capture{$obj};
    314317        # any warning we received may give a clue about why this object is missing
    315         $exifTool->Error($$info{Warning}) if $$info{Warning};
    316         $exifTool->Error("Can't find $obj object");
     318        $et->Error($$info{Warning}) if $$info{Warning};
     319        $et->Error("Can't find $obj object");
    317320        return 1;
    318321    }
     322    $et->InitWriteDirs(\%pdfMap, 'XMP');
    319323
    320324    # copy file up to start of previous exiftool update or end of file
     
    327331        $prevUpdate = $1;
    328332        # rewrite the file up to the original EOF
    329         Image::ExifTool::CopyBlock($raf, $outfile, $prevUpdate) or $rtn = -1;
     333        Image::ExifTool::CopyBlock($raf, $outfile, $prevUpdate + $$et{PDFBase}) or $rtn = -1;
    330334        # verify that we are now at the start of an ExifTool update
    331335        unless ($raf->Read($buff, length $beginComment) and $buff eq $beginComment) {
    332             $exifTool->Error('Previous ExifTool update is corrupted');
     336            $et->Error('Previous ExifTool update is corrupted');
    333337            return $rtn;
    334338        }
    335         $raf->Seek($prevUpdate, 0) or $rtn = -1;
    336         if ($exifTool->{DEL_GROUP}->{'PDF-update'}) {
    337             $exifTool->VPrint(0, "  Reverted previous ExifTool updates\n");
    338             ++$$exifTool{CHANGED};
     339        $raf->Seek($prevUpdate+$$et{PDFBase}, 0) or $rtn = -1;
     340        if ($$et{DEL_GROUP}{'PDF-update'}) {
     341            $et->VPrint(0, "  Reverted previous ExifTool updates\n");
     342            ++$$et{CHANGED};
    339343            return $rtn;
    340344        }
    341     } elsif ($exifTool->{DEL_GROUP}->{'PDF-update'}) {
    342         $exifTool->Error('File contains no previous ExifTool update');
     345    } elsif ($$et{DEL_GROUP}{'PDF-update'}) {
     346        $et->Error('File contains no previous ExifTool update');
    343347        return $rtn;
    344348    } else {
     
    348352        }
    349353    }
    350     $out = $exifTool->Options('TextOut') if $exifTool->Options('Verbose');
     354    $out = $et->Options('TextOut') if $et->Options('Verbose');
    351355#
    352356# create our new PDF objects to write
     
    362366    if ($prevUpdate) {
    363367        unless ($capture{Prev}) {
    364             $exifTool->Error("Can't locate trailer dictionary prior to last edit");
     368            $et->Error("Can't locate trailer dictionary prior to last edit");
    365369            return $rtn;
    366370        }
     
    378382    # delete entire PDF group if specified
    379383    my $infoChanged = 0;
    380     if ($exifTool->{DEL_GROUP}->{PDF} and $capture{Info}) {
     384    if ($$et{DEL_GROUP}{PDF} and $capture{Info}) {
    381385        delete $capture{Info};
    382386        $info = { XMP => $$info{XMP} }; # remove extracted PDF tags
     
    395399    # must encrypt all values in dictionary if they came from an encrypted stream
    396400    CryptObject($infoDict) if $$infoDict{_needCrypt};
    397    
     401
    398402    # must set line separator before calling WritePDFValue()
    399403    local $/ = $capture{newline};
    400404
    401405    # rewrite PDF Info tags
    402     my $newTags = $exifTool->GetNewTagInfoHash(\%Image::ExifTool::PDF::Info);
     406    my $newTags = $et->GetNewTagInfoHash(\%Image::ExifTool::PDF::Info);
    403407    my $tagID;
    404408    foreach $tagID (sort keys %$newTags) {
    405409        my $tagInfo = $$newTags{$tagID};
    406         my $nvHash = $exifTool->GetNewValueHash($tagInfo);
     410        my $nvHash = $et->GetNewValueHash($tagInfo);
    407411        my (@vals, $deleted);
    408412        my $tag = $$tagInfo{Name};
     
    421425            }
    422426            for (;;) {
    423                 if (Image::ExifTool::IsOverwriting($nvHash, $val) > 0) {
     427                if ($et->IsOverwriting($nvHash, $val) > 0) {
    424428                    $deleted = 1;
    425                     $exifTool->VerboseValue("- PDF:$tag", $val);
     429                    $et->VerboseValue("- PDF:$tag", $val);
    426430                    ++$infoChanged;
    427431                } else {
     
    433437            # don't write this out if we deleted all values
    434438            delete $$infoDict{$tagID} unless @vals;
     439        } elsif ($$nvHash{EditOnly}) {
     440            next;
    435441        }
    436442        # decide whether we want to write this tag
    437         # (always create native PDF information, so don't check IsCreating())
     443        # (native PDF information is always preferred, so don't check IsCreating)
    438444        next unless $deleted or $$tagInfo{List} or not exists $$infoDict{$tagID};
    439445
    440446        # add new values to existing ones
    441         my @newVals = Image::ExifTool::GetNewValues($nvHash);
     447        my @newVals = $et->GetNewValue($nvHash);
    442448        if (@newVals) {
    443449            push @vals, @newVals;
     
    445451            if ($out) {
    446452                foreach $val (@newVals) {
    447                     $exifTool->VerboseValue("+ PDF:$tag", $val);
     453                    $et->VerboseValue("+ PDF:$tag", $val);
    448454                }
    449455            }
     
    457463        my $writable = $$tagInfo{Writable} || $Image::ExifTool::PDF::Info{WRITABLE};
    458464        if (not $$tagInfo{List}) {
    459             $val = WritePDFValue(shift @vals, $writable);
     465            $val = WritePDFValue($et, shift(@vals), $writable);
    460466        } elsif ($$tagInfo{List} eq 'array') {
    461467            foreach $val (@vals) {
    462                 $val = WritePDFValue($val, $writable);
     468                $val = WritePDFValue($et, $val, $writable);
    463469                defined $val or undef(@vals), last;
    464470            }
    465471            $val = @vals ? \@vals : undef;
    466472        } else {
    467             $val = WritePDFValue(join($exifTool->Options('ListSep'), @vals), $writable);
     473            $val = WritePDFValue($et, join($et->Options('ListSep'), @vals), $writable);
    468474        }
    469475        if (defined $val) {
     
    471477            ++$infoChanged;
    472478        } else {
    473             $exifTool->Warn("Error converting $$tagInfo{Name} value");
     479            $et->Warn("Error converting $$tagInfo{Name} value");
    474480        }
    475481    }
    476482    if ($infoChanged) {
    477         $$exifTool{CHANGED} += $infoChanged;
     483        $$et{CHANGED} += $infoChanged;
    478484    } elsif ($prevUpdate) {
    479485        # must still write Info dictionary if it was previously updated
     
    503509    );
    504510    my $xmpTable = Image::ExifTool::GetTagTable('Image::ExifTool::XMP::Main');
    505     my $oldChanged = $$exifTool{CHANGED};
    506     my $newXMP = $exifTool->WriteDirectory(\%xmpInfo, $xmpTable);
     511    my $oldChanged = $$et{CHANGED};
     512    my $newXMP = $et->WriteDirectory(\%xmpInfo, $xmpTable);
    507513    $newXMP = $$info{XMP} ? ${$$info{XMP}} : '' unless defined $newXMP;
    508514
     
    510516    # XMP is deleted as a block -- so check for this
    511517    unless ($newXMP or $$info{XMP}) {
    512         $$exifTool{CHANGED} = $oldChanged;
    513         $exifTool->VPrint(0, "  (XMP not changed -- still empty)\n");
     518        $$et{CHANGED} = $oldChanged;
     519        $et->VPrint(0, "  (XMP not changed -- still empty)\n");
    514520    }
    515521    my ($metaChanged, $rootChanged);
    516522
    517     if ($$exifTool{CHANGED} != $oldChanged and defined $newXMP) {
     523    if ($$et{CHANGED} != $oldChanged and defined $newXMP) {
    518524        $metaChanged = 1;
    519525    } elsif ($prevUpdate and $capture{Root}->{Metadata}) {
     
    550556    my $rootRef = $$mainDict{Root};
    551557    unless ($rootRef) {
    552         $exifTool->Error("Can't find Root dictionary");
     558        $et->Error("Can't find Root dictionary");
    553559        return $rtn;
    554560    }
     
    562568# write incremental update if anything was changed
    563569#
    564     if ($$exifTool{CHANGED}) {
     570    if ($$et{CHANGED}) {
    565571        # remember position of original EOF
    566         my $oldEOF = Tell($outfile);
     572        my $oldEOF = Tell($outfile) - $$et{PDFBase};
    567573        Write($outfile, $beginComment) or $rtn = -1;
    568574
     
    578584            }
    579585            # create new entry for xref table
    580             $newXRef{$id} = [ Tell($outfile) + length($/), $gen, 'n' ];
     586            $newXRef{$id} = [ Tell($outfile) - $$et{PDFBase} + length($/), $gen, 'n' ];
    581587            $keyExt = "$id $gen obj";  # (must set for stream encryption)
    582588            Write($outfile, $/, $keyExt) or $rtn = -1;
     
    629635                my $byte = unpack('H2',chr((hex($1) + 1) & 0xff));
    630636                substr($id, 1, 2) = $byte;
    631             } elsif ($id =~ /^\((.)/s) {
    632                 substr($id, 1, 1) = chr((ord($1) + 1) & 0xff);
     637            } elsif ($id =~ /^\((.)/s and $1 ne '\\' and $1 ne ')' and $1 ne '(') {
     638                my $ch = chr((ord($1) + 1) & 0xff);
     639                # avoid generating characters that could cause problems
     640                $ch = 'a' if $ch =~ /[()\\\x00-\x08\x0a-\x1f\x7f\xff]/;
     641                substr($id, 1, 1) = $ch;
    633642            }
    634643            $mainDict->{ID}->[1] = $id;
     
    636645
    637646        # remember position of xref table in file (we will write this next)
    638         my $startxref = Tell($outfile) + length($/);
     647        my $startxref = Tell($outfile) - $$et{PDFBase} + length($/);
    639648
    640649        # must write xref as a stream in xref-stream-only files
     
    642651
    643652            # create entry for the xref stream object itself
    644             $newXRef{$nextObject++} = [ Tell($outfile) + length($/), 0, 'n' ];
     653            $newXRef{$nextObject++} = [ Tell($outfile) - $$et{PDFBase} + length($/), 0, 'n' ];
    645654            $$mainDict{Size} = $nextObject;
    646655            # create xref stream and Index entry
     
    655664                    my ($pos, $gen, $type) = @{$newXRef{$id}};
    656665                    if ($pos > 0xffffffff) {
    657                         $exifTool->Error('Huge files not yet supported');
     666                        $et->Error('Huge files not yet supported');
    658667                        last;
    659668                    }
     
    702711
    703712        # nothing new changed, so copy over previous incremental update
    704         $raf->Seek($prevUpdate, 0) or $rtn = -1;
     713        $raf->Seek($prevUpdate+$$et{PDFBase}, 0) or $rtn = -1;
    705714        while ($raf->Read($buff, 65536)) {
    706715            Write($outfile, $buff) or $rtn = -1;
    707716        }
     717    }
     718    if ($rtn > 0 and $$et{CHANGED} and ($$et{DEL_GROUP}{PDF} or $$et{DEL_GROUP}{XMP})) {
     719        $et->Warn('ExifTool PDF edits are reversible. Deleted tags may be recovered!', 1);
    708720    }
    709721    undef $newTool;
     
    738750=head1 AUTHOR
    739751
    740 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
     752Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
    741753
    742754This library is free software; you can redistribute it and/or modify it
Note: See TracChangeset for help on using the changeset viewer.