Ignore:
Timestamp:
2021-02-26T19:39:51+13:00 (3 years ago)
Author:
anupama
Message:

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cpan/Image/ExifTool/PostScript.pm

    r24107 r34921  
    1717use Image::ExifTool qw(:DataAccess :Utils);
    1818
    19 $VERSION = '1.33';
     19$VERSION = '1.44';
    2020
    2121sub WritePS($$);
     
    3939        Writable => 'string',
    4040        PrintConv => '$self->ConvertDateTime($val)',
     41        PrintConvInv => '$self->InverseDateTime($val)',
    4142    },
    4243    Creator     => { Priority => 0, Writable => 'string' },
     
    5051        Writable => 'string',
    5152        PrintConv => '$self->ConvertDateTime($val)',
     53        PrintConvInv => '$self->InverseDateTime($val)',
    5254    },
    5355    Pages       => { Priority => 0 },
     
    7678    },
    7779    TIFFPreview => {
     80        Groups => { 2 => 'Preview' },
    7881        Binary => 1,
    7982        Notes => q{
     
    8790            TagTable => 'Image::ExifTool::PostScript::Main',
    8891        },
    89         Notes => 'extracted with ExtractEmbedded option',
     92        Notes => 'extracted with L<ExtractEmbedded|../ExifTool.html#ExtractEmbedded> option',
    9093    },
    9194    EmbeddedFileName => {
    9295        Notes => q{
    9396            not a real tag ID, but the file name from a BeginDocument statement.
    94             Extracted with document metadata when ExtractEmbedded option is used
    95         },
    96     },
     97            Extracted with document metadata when L<ExtractEmbedded|../ExifTool.html#ExtractEmbedded> option is used
     98        },
     99    },
     100    # AI metadata (most with a single leading '%')
     101    AI9_ColorModel => {
     102        Name => 'AIColorModel',
     103        PrintConv => {
     104            1 => 'RGB',
     105            2 => 'CMYK',
     106        },
     107    },
     108    AI3_ColorUsage       => { Name => 'AIColorUsage' },
     109    AI5_RulerUnits       => {
     110        Name => 'AIRulerUnits',
     111        PrintConv => {
     112            0 => 'Inches',
     113            1 => 'Millimeters',
     114            2 => 'Points',
     115            3 => 'Picas',
     116            4 => 'Centimeters',
     117            6 => 'Pixels',
     118        },
     119    },
     120    AI5_TargetResolution => { Name => 'AITargetResolution' },
     121    AI5_NumLayers        => { Name => 'AINumLayers' },
     122    AI5_FileFormat       => { Name => 'AIFileFormat' },
     123    AI8_CreatorVersion   => { Name => 'AICreatorVersion' }, # (double leading '%')
     124    AI12_BuildNumber     => { Name => 'AIBuildNumber' },
    97125);
    98126
     
    159187sub PSErr($$)
    160188{
    161     my ($exifTool, $str) = @_;
     189    my ($et, $str) = @_;
    162190    # set file type if not done already
    163     my $ext = $$exifTool{FILE_EXT};
    164     $exifTool->SetFileType(($ext and $ext eq 'AI') ? 'AI' : 'PS');
    165     $exifTool->Warn("PostScript format error ($str)");
     191    my $ext = $$et{FILE_EXT};
     192    $et->SetFileType(($ext and $ext eq 'AI') ? 'AI' : 'PS');
     193    $et->Warn("PostScript format error ($str)");
    166194    return 1;
    167195}
     
    181209    $d = pos($data) if $data =~ /\x0d/g;
    182210    my $diff = $a - $d;
    183     if ($diff eq 1) {
     211    if ($diff == 1) {
    184212        $sep = "\x0d\x0a";
    185     } elsif ($diff eq -1) {
     213    } elsif ($diff == -1) {
    186214        $sep = "\x0a\x0d";
    187215    } elsif ($diff > 0) {
     
    195223
    196224#------------------------------------------------------------------------------
     225# Split into lines ending in any CR, LF or CR+LF combination
     226# (this is annoying, and could be avoided if EPS files didn't mix linefeeds!)
     227# Inputs: 0) data pointer, 1) reference to lines array
     228# Notes: Fills @$lines with lines from splitting $$dataPt
     229sub SplitLine($$)
     230{
     231    my ($dataPt, $lines) = @_;
     232    for (;;) {
     233        my $endl;
     234        # find the position of the first LF (\x0a)
     235        $endl = pos($$dataPt), pos($$dataPt) = 0 if $$dataPt =~ /\x0a/g;
     236        if ($$dataPt =~ /\x0d/g) { # find the first CR (\x0d)
     237            if (defined $endl) {
     238                # (remember, CR+LF is a DOS newline...)
     239                $endl = pos($$dataPt) if pos($$dataPt) < $endl - 1;
     240            } else {
     241                $endl = pos($$dataPt);
     242            }
     243        } elsif (not defined $endl) {
     244            push @$lines, $$dataPt;
     245            last;
     246        }
     247        if (length $$dataPt == $endl) {
     248            push @$lines, $$dataPt;
     249            last;
     250        } else {
     251            # continue to split into separate lines
     252            push @$lines, substr($$dataPt, 0, $endl);
     253            $$dataPt = substr($$dataPt, $endl);
     254        }
     255    }
     256}
     257
     258#------------------------------------------------------------------------------
     259# check to be sure we haven't read past end of PS data in DOS-style file
     260# Inputs: 0) RAF ref (with PSEnd member), 1) data ref
     261# - modifies data and sets RAF to EOF if end of PS is reached
     262sub CheckPSEnd($$)
     263{
     264    my ($raf, $dataPt) = @_;
     265    my $pos = $raf->Tell();
     266    if ($pos >= $$raf{PSEnd}) {
     267        $raf->Seek(0, 2);   # seek to end of file so we can't read any more
     268        $$dataPt = substr($$dataPt, 0, length($$dataPt) - $pos + $$raf{PSEnd}) if $pos > $$raf{PSEnd};
     269    }
     270}
     271
     272#------------------------------------------------------------------------------
     273# Read next line from EPS file
     274# Inputs: 0) RAF ref (with PSEnd member if Postscript ends before end of file)
     275#         1) array of lines from file
     276# Returns: true on success
     277sub GetNextLine($$)
     278{
     279    my ($raf, $lines) = @_;
     280    my ($data, $changedNL);
     281    my $altnl = ($/ eq "\x0d") ? "\x0a" : "\x0d";
     282    for (;;) {
     283        $raf->ReadLine($data) or last;
     284        $$raf{PSEnd} and CheckPSEnd($raf, \$data);
     285        # split line if it contains other newline sequences
     286        if ($data =~ /$altnl/) {
     287            if (length($data) > 500000 and IsPC()) {
     288                # patch for Windows memory problem
     289                unless ($changedNL) {
     290                    $changedNL = $/;
     291                    $/ = $altnl;
     292                    $altnl = $changedNL;
     293                    $raf->Seek(-length($data), 1);
     294                    next;
     295                }
     296            } else {
     297                    # split into separate lines
     298                #    push @$lines, split /$altnl/, $data, -1;
     299                #    if (@$lines == 2 and $$lines[1] eq $/) {
     300                #        # handle case of DOS newline data inside file using Unix newlines
     301                #        $$lines[0] .= pop @$lines;
     302                #    }
     303                # split into separate lines if necessary
     304               SplitLine(\$data, $lines);
     305            }
     306        } else {
     307            push @$lines, $data;
     308        }
     309        $/ = $changedNL if $changedNL;
     310        return 1;
     311    }
     312    return 0;
     313}
     314
     315#------------------------------------------------------------------------------
    197316# Decode comment from PostScript file
    198317# Inputs: 0) comment string, 1) RAF ref, 2) reference to lines array
     
    206325    # check for continuation comments
    207326    for (;;) {
    208         unless (@$lines) {
    209             my $buff;
    210             $raf->ReadLine($buff) or last;
    211             my $altnl = $/ eq "\x0d" ? "\x0a" : "\x0d";
    212             if ($buff =~ /$altnl/) {
    213                 # split into separate lines
    214                 @$lines = split /$altnl/, $buff, -1;
    215                 # handle case of DOS newline data inside file using Unix newlines
    216                 @$lines = ( $$lines[0] . $$lines[1] ) if @$lines == 2 and $$lines[1] eq $/;
    217             } else {
    218                 push @$lines, $buff;
    219             }
    220         }
     327        @$lines or GetNextLine($raf, $lines) or last;
    221328        last unless $$lines[0] =~ /^%%\+/;  # is the next line a continuation?
    222329        $$dataPt .= $$lines[0] if $dataPt;  # add to data if necessary
     
    317424sub ProcessPS($$;$)
    318425{
    319     my ($exifTool, $dirInfo, $tagTablePtr) = @_;
     426    my ($et, $dirInfo, $tagTablePtr) = @_;
    320427    my $raf = $$dirInfo{RAF};
    321     my $embedded = $exifTool->Options('ExtractEmbedded');
     428    my $embedded = $et->Options('ExtractEmbedded');
    322429    my ($data, $dos, $endDoc, $fontTable, $comment);
    323430
    324431    # allow read from data
    325     $raf = new File::RandomAccess($$dirInfo{DataPt}) unless $raf;
     432    unless ($raf) {
     433        $raf = new File::RandomAccess($$dirInfo{DataPt});
     434        $et->VerboseDir('PostScript');
     435    }
    326436#
    327437# determine if this is a postscript file
     
    338448        $raf->Read($dos, 26) == 26 or return 0;
    339449        SetByteOrder('II');
    340         unless ($raf->Seek(Get32u(\$dos, 0), 0) and
     450        my $psStart = Get32u(\$dos, 0);
     451        unless ($raf->Seek($psStart, 0) and
    341452                $raf->Read($data, 4) == 4 and $data eq '%!PS')
    342453        {
    343             return PSErr($exifTool, 'invalid header');
    344         }
     454            return PSErr($et, 'invalid header');
     455        }
     456        $$raf{PSEnd} = $psStart + Get32u(\$dos, 4); # set end of PostScript data in RAF
    345457    } else {
    346458        # check for PostScript font file (PFA or PFB)
     
    348460        $data .= $d2 if $raf->Read($d2,12);
    349461        if ($data =~ /^%!(PS-(AdobeFont-|Bitstream )|FontType1-)/) {
    350             $exifTool->SetFileType('PFA');  # PostScript ASCII font file
     462            $et->SetFileType('PFA');  # PostScript ASCII font file
    351463            $fontTable = GetTagTable('Image::ExifTool::Font::PSInfo');
    352464            # PostScript font files may contain an unformatted comments which may
     
    360472#
    361473    local $/ = GetInputRecordSeparator($raf);
    362     $/ or return PSErr($exifTool, 'invalid PS data');
     474    $/ or return PSErr($et, 'invalid PS data');
    363475
    364476    # set file type (PostScript or EPS)
     
    378490        $raf->Seek($pos, 0);
    379491    }
    380     $exifTool->SetFileType($type);
     492    $et->SetFileType($type);
     493    return 1 if $$et{OPTIONS}{FastScan} and $$et{OPTIONS}{FastScan} == 3;
    381494#
    382495# extract TIFF information from DOS header
     
    389502            # extract the TIFF preview
    390503            my $len = Get32u(\$dos, 20);
    391             my $val = $exifTool->ExtractBinary($base, $len, 'TIFFPreview');
     504            my $val = $et->ExtractBinary($base, $len, 'TIFFPreview');
    392505            if (defined $val and $val =~ /^(MM\0\x2a|II\x2a\0|Binary)/) {
    393                 $exifTool->HandleTag($tagTablePtr, 'TIFFPreview', $val);
     506                $et->HandleTag($tagTablePtr, 'TIFFPreview', $val);
    394507            } else {
    395                 $exifTool->Warn('Bad TIFF preview image');
     508                $et->Warn('Bad TIFF preview image');
    396509            }
    397510            # extract information from TIFF in DOS header
     
    402515                Base => $base,
    403516            );
    404             $exifTool->ProcessTIFF(\%dirInfo) or $exifTool->Warn('Bad embedded TIFF');
     517            $et->ProcessTIFF(\%dirInfo) or $et->Warn('Bad embedded TIFF');
    405518            # position file pointer to extract PS information
    406519            $raf->Seek($pos, 0);
     
    453566            if (not $endToken) {
    454567                $buff .= $data;
    455                 next unless $data =~ m{<\?xpacket end=.(w|r).\?>($/|$)};
     568                next unless $data =~ m{<\?xpacket end=.(w|r).\?>(\n|\r|$)};
    456569            } elsif ($data !~ /^$endToken/i) {
    457570                if ($mode eq 'XMP') {
     
    475588            $docNum =~ s/-?(\d+)$//;        # decrement nesting level
    476589            $subDocNum = $1;                # remember our last sub-document number
    477             $$exifTool{DOC_NUM} = $docNum;
     590            $$et{DOC_NUM} = $docNum;
    478591            undef $endDoc unless $docNum;   # done with document if top level
    479592            next;
     
    504617                } else {
    505618                    # this is the Nth document
    506                     $docNum = $$exifTool{DOC_COUNT} + 1;
     619                    $docNum = $$et{DOC_COUNT} + 1;
    507620                }
    508621                $subDocNum = 0; # new level, so reset subDocNum
    509622                next unless $embedded;  # skip over this document
    510623                # set document number for family 4-7 group names
    511                 $$exifTool{DOC_NUM} = $docNum;
    512                 $$exifTool{LIST_TAGS} = { };  # don't build lists across different documents
    513                 $exifTool->{PROCESSED} = { }; # re-initialize processed directory lookup too
     624                $$et{DOC_NUM} = $docNum;
     625                $$et{LIST_TAGS} = { };  # don't build lists across different documents
     626                $$et{PROCESSED} = { }; # re-initialize processed directory lookup too
    514627                $endDoc = $endToken;          # parse to EndDocument token
    515628                # reset mode to allow parsing into sub-directories
     
    521634                    # remove brackets if necessary
    522635                    $docName = $1 if $docName =~ /^\((.*)\)$/;
    523                     $exifTool->HandleTag($tagTablePtr, 'EmbeddedFileName', $docName);
     636                    $et->HandleTag($tagTablePtr, 'EmbeddedFileName', $docName);
    524637                }
    525638            }
     
    531644            undef $endToken;    # no end token (just look for xpacket end)
    532645            # XMP could be contained in a single line (if newlines are different)
    533             next unless $data =~ m{<\?xpacket end=.(w|r).\?>($/|$)};
     646            next unless $data =~ m{<\?xpacket end=.(w|r).\?>(\n|\r|$)};
    534647        } elsif ($data =~ /^%%?(\w+): ?(.*)/s and $$tagTablePtr{$1}) {
    535648            my ($tag, $val) = ($1, $2);
    536             # only allow 'ImageData' to have single leading '%'
    537             next unless $data =~ /^%%/ or $1 eq 'ImageData';
     649            # only allow 'ImageData' and AI tags to have single leading '%'
     650            next unless $data =~ /^%(%|AI\d+_)/ or $tag eq 'ImageData';
    538651            # decode comment string (reading continuation lines if necessary)
    539652            $val = DecodeComment($val, $raf, \@lines);
    540             $exifTool->HandleTag($tagTablePtr, $tag, $val);
     653            $et->HandleTag($tagTablePtr, $tag, $val);
    541654            next;
    542655        } elsif ($embedded and $data =~ /^%AI12_CompressedData/) {
    543656            # the rest of the file is compressed
    544             unless (eval 'require Compress::Zlib') {
    545                 $exifTool->Warn('Install Compress::Zlib to extract compressed embedded data');
     657            unless (eval { require Compress::Zlib }) {
     658                $et->Warn('Install Compress::Zlib to extract compressed embedded data');
    546659                last;
    547660            }
     
    554667            last unless $data =~ s/.*?%AI12_CompressedData//;
    555668            my $inflate = Compress::Zlib::inflateInit();
    556             $inflate or $exifTool->Warn('Error initializing inflate'), last;
     669            $inflate or $et->Warn('Error initializing inflate'), last;
    557670            # generate a PS-like file in memory from the compressed data
    558             my $verbose = $exifTool->Options('Verbose');
     671            my $verbose = $et->Options('Verbose');
    559672            if ($verbose > 1) {
    560                 $exifTool->VerboseDir('AI12_CompressedData (first 4kB)');
    561                 $exifTool->VerboseDump(\$data);
     673                $et->VerboseDir('AI12_CompressedData (first 4kB)');
     674                $et->VerboseDump(\$data);
    562675            }
    563676            # remove header if it exists (Windows AI files only)
     
    578691                $raf->Read($data, 65536) or last;
    579692            }
    580             defined $val or $exifTool->Warn('Error inflating AI compressed data'), last;
     693            defined $val or $et->Warn('Error inflating AI compressed data'), last;
    581694            if ($verbose > 1) {
    582                 $exifTool->VerboseDir('Uncompressed AI12 Data');
    583                 $exifTool->VerboseDump(\$val);
     695                $et->VerboseDir('Uncompressed AI12 Data');
     696                $et->VerboseDump(\$val);
    584697            }
    585698            # extract information from embedded images in the uncompressed data
    586699            $val =  # add PS header in case it needs one
    587             ProcessPS($exifTool, { DataPt => \$val });
     700            ProcessPS($et, { DataPt => \$val });
    588701            last;
    589702        } elsif ($fontTable) {
     
    596709                } elsif ($data !~ /^%/) {
    597710                    # stop extracting comments at the first non-comment line
    598                     $exifTool->FoundTag('Comment', $comment) if length $comment;
     711                    $et->FoundTag('Comment', $comment) if length $comment;
    599712                    undef $comment;
    600713                }
     
    607720                    $val = $1;
    608721                }
    609                 $exifTool->HandleTag($fontTable, $tag, $val);
     722                $et->HandleTag($fontTable, $tag, $val);
    610723            } elsif ($data =~ /^currentdict end/) {
    611724                # only extract tags from initial FontInfo dict
     
    625738        );
    626739        my $subTablePtr = GetTagTable("Image::ExifTool::${mode}::Main");
    627         unless ($exifTool->ProcessDirectory(\%dirInfo, $subTablePtr)) {
    628             $exifTool->Warn("Error processing $mode information in PostScript file");
     740        unless ($et->ProcessDirectory(\%dirInfo, $subTablePtr)) {
     741            $et->Warn("Error processing $mode information in PostScript file");
    629742        }
    630743        undef $buff;
     
    632745    }
    633746    $mode = 'Document' if $endDoc and not $mode;
    634     $mode and PSErr($exifTool, "unterminated $mode data");
     747    $mode and PSErr($et, "unterminated $mode data");
    635748    return 1;
    636749}
     
    665778=head1 AUTHOR
    666779
    667 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
     780Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
    668781
    669782This library is free software; you can redistribute it and/or modify it
Note: See TracChangeset for help on using the changeset viewer.