Ignore:
Timestamp:
2021-02-26T19:39:51+13:00 (3 years ago)
Author:
anupama
Message:

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators were stored as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. Kathy and Dr Bainbridge also came up with a new option, which I added, called apply_join_before_split_to_metafields: a regex that lists the metadata fields to which join_before_split is applied. Previously join_before_split was always applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, in that it stores values by word instead of by phrase. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it, and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cpan/File/RandomAccess.pm

    r24107 r34921  
    1717#                            scalar with a multi-character newline
    1818#               01/24/2009 - PH Protect against reading too much at once
     19#               10/04/2018 - PH Added NoBuffer option
    1920#
    2021# Notes:        Calls the normal file i/o routines unless SeekTest() fails, in
     
    2526#               May also be used for string i/o (just pass a scalar reference)
    2627#
    27 # Legal:        Copyright (c) 2003-2010 Phil Harvey (phil at owl.phy.queensu.ca)
     28# Legal:        Copyright (c) 2003-2021 Phil Harvey (philharvey66 at gmail.com)
    2829#               This library is free software; you can redistribute it and/or
    2930#               modify it under the same terms as Perl itself.
     
    3738
    3839use vars qw($VERSION @ISA @EXPORT_OK);
    39 $VERSION = '1.10';
     40$VERSION = '1.11';
    4041@ISA = qw(Exporter);
    4142
     
    4445# constants
    4546my $CHUNK_SIZE = 8192;  # size of chunks to read from file (must be power of 2)
     47my $SKIP_SIZE = 65536;  # size to skip when fast-forwarding over sequential data
    4648my $SLURP_CHUNKS = 16;  # read this many chunks at a time when slurping
    4749
     
    6163        $self = {
    6264            BUFF_PT => $filePt,
     65            BASE => 0,
    6366            POS => 0,
    6467            LEN => length($$filePt),
     
    7275            FILE_PT => $filePt, # file pointer
    7376            BUFF_PT => \$buff,  # reference to file data
    74             POS => 0,           # current position in file
    75             LEN => 0,           # data length
     77            BASE => 0,          # location of start of buffer in file
     78            POS => 0,           # current position in buffer
     79            LEN => 0,           # length of data in buffer
    7680            TESTED => 0,        # 0=untested, 1=passed, -1=failed (requires buffering)
    7781        };
     
    119123    my $rtnVal;
    120124    if ($self->{TESTED} < 0) {
    121         $rtnVal = $self->{POS};
     125        $rtnVal = $self->{POS} + $self->{BASE};
    122126    } else {
    123127        $rtnVal = tell($self->{FILE_PT});
     
    142146        my $newPos;
    143147        if ($whence == 0) {
    144             $newPos = $num;                # from start of file
     148            $newPos = $num - $self->{BASE}; # from start of file
    145149        } elsif ($whence == 1) {
    146150            $newPos = $num + $self->{POS};  # relative to current position
     151        } elsif ($self->{NoBuffer} and $self->{FILE_PT}) {
     152            $newPos = -1;   # (can't seek relative to end if no buffering)
    147153        } else {
    148154            $self->Slurp();                 # read whole file into buffer
     
    193199    # read through our buffer if necessary
    194200    if ($self->{TESTED} < 0) {
     201        # purge old data before reading in NoBuffer mode
     202        $self->Purge() or return 0 if $self->{NoBuffer};
    195203        my $buff;
    196204        my $newPos = $self->{POS} + $len;
     
    245253    if ($self->{TESTED} < 0) {
    246254        my ($num, $buff);
     255        $self->Purge() or return 0 if $self->{NoBuffer};
    247256        my $pos = $self->{POS};
    248257        if ($fp) {
     
    312321}
    313322
    314 
    315 #------------------------------------------------------------------------------
    316 # set binary mode
     323#------------------------------------------------------------------------------
     324# Purge internal buffer [internal use only]
     325# Inputs: 0) reference to RandomAccess object
     326# Returns: 1 on success, or 0 if current buffer position is negative
     327# Notes: This is called only in NoBuffer mode
     328sub Purge($)
     329{
     330    my $self = shift;
     331    return 1 unless $self->{FILE_PT};
     332    return 0 if $self->{POS} < 0;   # error if we can't read from here
     333    if ($self->{POS} > $CHUNK_SIZE) {
     334        my $purge = $self->{POS} - ($self->{POS} % $CHUNK_SIZE);
     335        if ($purge >= $self->{LEN}) {
     336            # read up to current position in 64k chunks, discarding as we go
     337            while ($self->{POS} > $self->{LEN}) {
     338                $self->{BASE} += $self->{LEN};
     339                $self->{POS} -= $self->{LEN};
     340                ${$self->{BUFF_PT}} = '';
     341                $self->{LEN} = read($self->{FILE_PT}, ${$self->{BUFF_PT}}, $SKIP_SIZE);
     342                last if $self->{LEN} < $SKIP_SIZE;
     343            }
     344        } elsif ($purge > 0) {
     345            ${$self->{BUFF_PT}} = substr ${$self->{BUFF_PT}}, $purge;
     346            $self->{BASE} += $purge;
     347            $self->{POS} -= $purge;
     348            $self->{LEN} -= $purge;
     349        }
     350    }
     351    return 1;
     352}
     353
     354#------------------------------------------------------------------------------
     355# Set binary mode
    317356# Inputs: 0) reference to RandomAccess object
    318357sub BinMode($)
     
    323362
    324363#------------------------------------------------------------------------------
    325 # close the file and free the buffer
     364# Close the file and free the buffer
    326365# Inputs: 0) reference to RandomAccess object
    327366sub Close($)
     
    371410    my $emptyBuff = '';
    372411    $self->{BUFF_PT} = \$emptyBuff;
     412    $self->{BASE} = 0;
    373413    $self->{LEN} = 0;
    374414    $self->{POS} = 0;
Note: See TracChangeset for help on using the changeset viewer.