source: main/trunk/greenstone2/perllib/cpan/File/RandomAccess.pm

Last change on this file was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File size: 14.3 KB
Line 
1#------------------------------------------------------------------------------
2# File: RandomAccess.pm
3#
4# Description: Buffer to support random access reading of sequential file
5#
6# Revisions: 02/11/2004 - P. Harvey Created
7# 02/20/2004 - P. Harvey Added flag to disable SeekTest in new()
8# 11/18/2004 - P. Harvey Fixed bug with seek relative to end of file
9# 01/02/2005 - P. Harvey Added DEBUG code
10# 01/09/2006 - P. Harvey Fixed bug in ReadLine() when using
11# multi-character EOL sequences
12# 02/20/2006 - P. Harvey Fixed bug where seek past end of file could
13# generate "substr outside string" warning
14# 06/10/2006 - P. Harvey Decreased $CHUNK_SIZE from 64k to 8k
15# 11/23/2006 - P. Harvey Limit reads to < 0x80000000 bytes
16# 11/26/2008 - P. Harvey Fixed bug in ReadLine when reading from a
17# scalar with a multi-character newline
18# 01/24/2009 - PH Protect against reading too much at once
19# 10/04/2018 - PH Added NoBuffer option
20#
21# Notes: Calls the normal file i/o routines unless SeekTest() fails, in
22# which case the file is buffered in memory to allow random access.
23# SeekTest() is called automatically when the object is created
24# unless specified.
25#
26# May also be used for string i/o (just pass a scalar reference)
27#
28# Legal: Copyright (c) 2003-2021 Phil Harvey (philharvey66 at gmail.com)
29# This library is free software; you can redistribute it and/or
30# modify it under the same terms as Perl itself.
31#------------------------------------------------------------------------------
32
33package File::RandomAccess;
34
35use strict;
36require 5.002;
37require Exporter;
38
39use vars qw($VERSION @ISA @EXPORT_OK);
40$VERSION = '1.11';
41@ISA = qw(Exporter);
42
43sub Read($$$);
44
45# constants
46my $CHUNK_SIZE = 8192; # size of chunks to read from file (must be power of 2)
47my $SKIP_SIZE = 65536; # size to skip when fast-forwarding over sequential data
48my $SLURP_CHUNKS = 16; # read this many chunks at a time when slurping
49
50#------------------------------------------------------------------------------
51# Create new RandomAccess object
52# Inputs: 0) reference to RandomAccess object or RandomAccess class name
53# 1) file reference or scalar reference
54# 2) flag set if file is already random access (disables automatic SeekTest)
55sub new($$;$)
56{
57 my ($that, $filePt, $isRandom) = @_;
58 my $class = ref($that) || $that;
59 my $self;
60
61 if (ref $filePt eq 'SCALAR') {
62 # string i/o
63 $self = {
64 BUFF_PT => $filePt,
65 BASE => 0,
66 POS => 0,
67 LEN => length($$filePt),
68 TESTED => -1,
69 };
70 bless $self, $class;
71 } else {
72 # file i/o
73 my $buff = '';
74 $self = {
75 FILE_PT => $filePt, # file pointer
76 BUFF_PT => \$buff, # reference to file data
77 BASE => 0, # location of start of buffer in file
78 POS => 0, # current position in buffer
79 LEN => 0, # length of data in buffer
80 TESTED => 0, # 0=untested, 1=passed, -1=failed (requires buffering)
81 };
82 bless $self, $class;
83 $self->SeekTest() unless $isRandom;
84 }
85 return $self;
86}
87
88#------------------------------------------------------------------------------
89# Enable DEBUG code
90# Inputs: 0) reference to RandomAccess object
91sub Debug($)
92{
93 my $self = shift;
94 $self->{DEBUG} = { };
95}
96
97#------------------------------------------------------------------------------
98# Perform seek test and turn on buffering if necessary
99# Inputs: 0) reference to RandomAccess object
100# Returns: 1 if seek test passed (ie. no buffering required)
101# Notes: Must be done before any other i/o
102sub SeekTest($)
103{
104 my $self = shift;
105 unless ($self->{TESTED}) {
106 my $fp = $self->{FILE_PT};
107 if (seek($fp, 1, 1) and seek($fp, -1, 1)) {
108 $self->{TESTED} = 1; # test passed
109 } else {
110 $self->{TESTED} = -1; # test failed (requires buffering)
111 }
112 }
113 return $self->{TESTED} == 1 ? 1 : 0;
114}
115
116#------------------------------------------------------------------------------
117# Get current position in file
118# Inputs: 0) reference to RandomAccess object
119# Returns: current position in file
120sub Tell($)
121{
122 my $self = shift;
123 my $rtnVal;
124 if ($self->{TESTED} < 0) {
125 $rtnVal = $self->{POS} + $self->{BASE};
126 } else {
127 $rtnVal = tell($self->{FILE_PT});
128 }
129 return $rtnVal;
130}
131
132#------------------------------------------------------------------------------
133# Seek to position in file
134# Inputs: 0) reference to RandomAccess object
135# 1) position, 2) whence (0 or undef=from start, 1=from cur pos, 2=from end)
136# Returns: 1 on success
137# Notes: When buffered, this doesn't quite behave like seek() since it will return
138# success even if you seek outside the limits of the file. However if you
139# do this, you will get an error on your next Read().
140sub Seek($$;$)
141{
142 my ($self, $num, $whence) = @_;
143 $whence = 0 unless defined $whence;
144 my $rtnVal;
145 if ($self->{TESTED} < 0) {
146 my $newPos;
147 if ($whence == 0) {
148 $newPos = $num - $self->{BASE}; # from start of file
149 } elsif ($whence == 1) {
150 $newPos = $num + $self->{POS}; # relative to current position
151 } elsif ($self->{NoBuffer} and $self->{FILE_PT}) {
152 $newPos = -1; # (can't seek relative to end if no buffering)
153 } else {
154 $self->Slurp(); # read whole file into buffer
155 $newPos = $num + $self->{LEN}; # relative to end of file
156 }
157 if ($newPos >= 0) {
158 $self->{POS} = $newPos;
159 $rtnVal = 1;
160 }
161 } else {
162 $rtnVal = seek($self->{FILE_PT}, $num, $whence);
163 }
164 return $rtnVal;
165}
166
167#------------------------------------------------------------------------------
168# Read from the file
169# Inputs: 0) reference to RandomAccess object, 1) buffer, 2) bytes to read
170# Returns: Number of bytes read
171sub Read($$$)
172{
173 my $self = shift;
174 my $len = $_[1];
175 my $rtnVal;
176
177 # protect against reading too much at once
178 # (also from dying with a "Negative length" error)
179 if ($len & 0xf8000000) {
180 return 0 if $len < 0;
181 # read in smaller blocks because Windows attempts to pre-allocate
182 # memory for the full size, which can lead to an out-of-memory error
183 my $maxLen = 0x4000000; # (MUST be less than bitmask in "if" above)
184 my $num = Read($self, $_[0], $maxLen);
185 return $num if $num < $maxLen;
186 for (;;) {
187 $len -= $maxLen;
188 last if $len <= 0;
189 my $l = $len < $maxLen ? $len : $maxLen;
190 my $buff;
191 my $n = Read($self, $buff, $l);
192 last unless $n;
193 $_[0] .= $buff;
194 $num += $n;
195 last if $n < $l;
196 }
197 return $num;
198 }
199 # read through our buffer if necessary
200 if ($self->{TESTED} < 0) {
201 # purge old data before reading in NoBuffer mode
202 $self->Purge() or return 0 if $self->{NoBuffer};
203 my $buff;
204 my $newPos = $self->{POS} + $len;
205 # number of bytes to read from file
206 my $num = $newPos - $self->{LEN};
207 if ($num > 0 and $self->{FILE_PT}) {
208 # read data from file in multiples of $CHUNK_SIZE
209 $num = (($num - 1) | ($CHUNK_SIZE - 1)) + 1;
210 $num = read($self->{FILE_PT}, $buff, $num);
211 if ($num) {
212 ${$self->{BUFF_PT}} .= $buff;
213 $self->{LEN} += $num;
214 }
215 }
216 # number of bytes left in data buffer
217 $num = $self->{LEN} - $self->{POS};
218 if ($len <= $num) {
219 $rtnVal = $len;
220 } elsif ($num <= 0) {
221 $_[0] = '';
222 return 0;
223 } else {
224 $rtnVal = $num;
225 }
226 # return data from our buffer
227 $_[0] = substr(${$self->{BUFF_PT}}, $self->{POS}, $rtnVal);
228 $self->{POS} += $rtnVal;
229 } else {
230 # read directly from file
231 $_[0] = '' unless defined $_[0];
232 $rtnVal = read($self->{FILE_PT}, $_[0], $len) || 0;
233 }
234 if ($self->{DEBUG}) {
235 my $pos = $self->Tell() - $rtnVal;
236 unless ($self->{DEBUG}->{$pos} and $self->{DEBUG}->{$pos} > $rtnVal) {
237 $self->{DEBUG}->{$pos} = $rtnVal;
238 }
239 }
240 return $rtnVal;
241}
242
243#------------------------------------------------------------------------------
244# Read a line from file (end of line is $/)
245# Inputs: 0) reference to RandomAccess object, 1) buffer
246# Returns: Number of bytes read
247sub ReadLine($$)
248{
249 my $self = shift;
250 my $rtnVal;
251 my $fp = $self->{FILE_PT};
252
253 if ($self->{TESTED} < 0) {
254 my ($num, $buff);
255 $self->Purge() or return 0 if $self->{NoBuffer};
256 my $pos = $self->{POS};
257 if ($fp) {
258 # make sure we have some data after the current position
259 while ($self->{LEN} <= $pos) {
260 $num = read($fp, $buff, $CHUNK_SIZE);
261 return 0 unless $num;
262 ${$self->{BUFF_PT}} .= $buff;
263 $self->{LEN} += $num;
264 }
265 # scan and read until we find the EOL (or hit EOF)
266 for (;;) {
267 $pos = index(${$self->{BUFF_PT}}, $/, $pos);
268 if ($pos >= 0) {
269 $pos += length($/);
270 last;
271 }
272 $pos = $self->{LEN}; # have scanned to end of buffer
273 $num = read($fp, $buff, $CHUNK_SIZE) or last;
274 ${$self->{BUFF_PT}} .= $buff;
275 $self->{LEN} += $num;
276 }
277 } else {
278 # string i/o
279 $pos = index(${$self->{BUFF_PT}}, $/, $pos);
280 if ($pos < 0) {
281 $pos = $self->{LEN};
282 $self->{POS} = $pos if $self->{POS} > $pos;
283 } else {
284 $pos += length($/);
285 }
286 }
287 # read the line from our buffer
288 $rtnVal = $pos - $self->{POS};
289 $_[0] = substr(${$self->{BUFF_PT}}, $self->{POS}, $rtnVal);
290 $self->{POS} = $pos;
291 } else {
292 $_[0] = <$fp>;
293 if (defined $_[0]) {
294 $rtnVal = length($_[0]);
295 } else {
296 $rtnVal = 0;
297 }
298 }
299 if ($self->{DEBUG}) {
300 my $pos = $self->Tell() - $rtnVal;
301 unless ($self->{DEBUG}->{$pos} and $self->{DEBUG}->{$pos} > $rtnVal) {
302 $self->{DEBUG}->{$pos} = $rtnVal;
303 }
304 }
305 return $rtnVal;
306}
307
308#------------------------------------------------------------------------------
309# Read whole file into buffer (without changing read pointer)
310# Inputs: 0) reference to RandomAccess object
311sub Slurp($)
312{
313 my $self = shift;
314 my $fp = $self->{FILE_PT} || return;
315 # read whole file into buffer (in large chunks)
316 my ($buff, $num);
317 while (($num = read($fp, $buff, $CHUNK_SIZE * $SLURP_CHUNKS)) != 0) {
318 ${$self->{BUFF_PT}} .= $buff;
319 $self->{LEN} += $num;
320 }
321}
322
323#------------------------------------------------------------------------------
324# Purge internal buffer [internal use only]
325# Inputs: 0) reference to RandomAccess object
326# Returns: 1 on success, or 0 if current buffer position is negative
327# Notes: This is called only in NoBuffer mode
328sub Purge($)
329{
330 my $self = shift;
331 return 1 unless $self->{FILE_PT};
332 return 0 if $self->{POS} < 0; # error if we can't read from here
333 if ($self->{POS} > $CHUNK_SIZE) {
334 my $purge = $self->{POS} - ($self->{POS} % $CHUNK_SIZE);
335 if ($purge >= $self->{LEN}) {
336 # read up to current position in 64k chunks, discarding as we go
337 while ($self->{POS} > $self->{LEN}) {
338 $self->{BASE} += $self->{LEN};
339 $self->{POS} -= $self->{LEN};
340 ${$self->{BUFF_PT}} = '';
341 $self->{LEN} = read($self->{FILE_PT}, ${$self->{BUFF_PT}}, $SKIP_SIZE);
342 last if $self->{LEN} < $SKIP_SIZE;
343 }
344 } elsif ($purge > 0) {
345 ${$self->{BUFF_PT}} = substr ${$self->{BUFF_PT}}, $purge;
346 $self->{BASE} += $purge;
347 $self->{POS} -= $purge;
348 $self->{LEN} -= $purge;
349 }
350 }
351 return 1;
352}
353
354#------------------------------------------------------------------------------
355# Set binary mode
356# Inputs: 0) reference to RandomAccess object
357sub BinMode($)
358{
359 my $self = shift;
360 binmode($self->{FILE_PT}) if $self->{FILE_PT};
361}
362
363#------------------------------------------------------------------------------
364# Close the file and free the buffer
365# Inputs: 0) reference to RandomAccess object
366sub Close($)
367{
368 my $self = shift;
369
370 if ($self->{DEBUG}) {
371 local $_;
372 if ($self->Seek(0,2)) {
373 $self->{DEBUG}->{$self->Tell()} = 0; # set EOF marker
374 my $last;
375 my $tot = 0;
376 my $bad = 0;
377 foreach (sort { $a <=> $b } keys %{$self->{DEBUG}}) {
378 my $pos = $_;
379 my $len = $self->{DEBUG}->{$_};
380 if (defined $last and $last < $pos) {
381 my $bytes = $pos - $last;
382 $tot += $bytes;
383 $self->Seek($last);
384 my $buff;
385 $self->Read($buff, $bytes);
386 my $warn = '';
387 if ($buff =~ /[^\0]/) {
388 $bad += ($pos - $last);
389 $warn = ' - NON-ZERO!';
390 }
391 printf "0x%.8x - 0x%.8x (%d bytes)$warn\n", $last, $pos, $bytes;
392 }
393 my $cur = $pos + $len;
394 $last = $cur unless defined $last and $last > $cur;
395 }
396 print "$tot bytes missed";
397 $bad and print ", $bad non-zero!";
398 print "\n";
399 } else {
400 warn "File::RandomAccess DEBUG not working (file already closed?)\n";
401 }
402 delete $self->{DEBUG};
403 }
404 # close the file
405 if ($self->{FILE_PT}) {
406 close($self->{FILE_PT});
407 delete $self->{FILE_PT};
408 }
409 # reset the buffer
410 my $emptyBuff = '';
411 $self->{BUFF_PT} = \$emptyBuff;
412 $self->{BASE} = 0;
413 $self->{LEN} = 0;
414 $self->{POS} = 0;
415}
416
417#------------------------------------------------------------------------------
4181; # end
Note: See TracBrowser for help on using the repository browser.