source: trunk/gsdl/perllib/plugins/RecPlug.pm@ 9706

Last change on this file since 9706 was 9703, checked in by mdewsnip, 19 years ago

Improvement to previous change so "file not processed" messages are seen in Expert mode.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.6 KB
Line 
1###########################################################################
2#
3# RecPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# RecPlug is a plugin which recurses through directories processing
27# each file it finds.
28
29# RecPlug has one option: use_metadata_files. When this is set, it will
30# check each directory for an XML file called "metadata.xml" that specifies
31# metadata for the files (and subdirectories) in the directory.
32#
33# Here's an example of a metadata file that uses three FileSet structures
34# (ignore the # characters):
35
36#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
37#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
38#<DirectoryMetadata>
39# <FileSet>
40# <FileName>nugget.*</FileName>
41# <Description>
42# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
43# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
44# </Description>
45# </FileSet>
46# <FileSet>
47# <FileName>nugget-point-1.jpg</FileName>
48# <Description>
49# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
50# <Metadata name="Subject">Lighthouse</Metadata>
51# </Description>
52# </FileSet>
53# <FileSet>
54# <FileName>kaka-point-dir</FileName>
55# <Description>
56# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
57# </Description>
58# </FileSet>
59#</DirectoryMetadata>
60
61# Metadata elements are read and applied to files in the order they appear
62# in the file.
63#
64# The FileName element describes the subfiles in the directory that the
65# metadata applies to as a perl regular expression (a FileSet group may
66# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
67# indicates that the metadata records in the following Description block
68# apply to every subfile that starts with "nugget". For these files, a
69# Title metadata element is set, overriding any old value that the Title
70# might have had.
71#
72# Occasionally, we want to have multiple metadata values applied to a
73# document; in this case we use the "mode=accumulate" attribute of the
74# particular Metadata element. In the second metadata element of the first
75# FileSet above, the "Place" metadata is accumulating, and may therefore be
76# given several values. If we wanted to override these values and use a
77# single metadata element again, we could set the mode attribute to
78# "override" instead. Remember: every element is assumed to be in override
79# mode unless you specify otherwise, so if you want to accumulate metadata
80# for some field, every occurance must have "mode=accumulate" specified.
81#
82# The second FileSet element above applies to a specific file, called
83# nugget-point-1.jpg. This element overrides the Title metadata set in the
84# first FileSet, and adds a "Subject" metadata field.
85#
86# The third and final FileSet sets metadata for a subdirectory rather than
87# a file. The metadata specified (a Title) will be passed into the
88# subdirectory and applied to every file that occurs in the subdirectory
89# (and to every subsubdirectory and its contents, and so on) unless the
90# metadata is explictly overridden later in the import.
91
92
93
94package RecPlug;
95
96use BasPlug;
97use plugin;
98use util;
99
100use File::Basename;
101
102
103BEGIN {
104 @RecPlug::ISA = ('BasPlug');
105 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
106}
107
108use XMLParser;
109
110my $arguments =
111 [ { 'name' => "block_exp",
112 'desc' => "{BasPlug.block_exp}",
113 'type' => "regexp",
114 'deft' => &get_default_block_exp(),
115 'reqd' => "no" },
116 { 'name' => "use_metadata_files",
117 'desc' => "{RecPlug.use_metadata_files}",
118 'type' => "flag",
119 'reqd' => "no" },
120 { 'name' => "recheck_directories",
121 'desc' => "{RecPlug.recheck_directories}",
122 'type' => "flag",
123 'reqd' => "no" } ];
124
125my $options = { 'name' => "RecPlug",
126 'desc' => "{RecPlug.desc}",
127 'abstract' => "no",
128 'inherits' => "yes",
129 'args' => $arguments };
130
131
132my ($self);
133sub new {
134 my $class = shift (@_);
135
136 # $self is global for use within subroutines called by XML::Parser
137 $self = new BasPlug ($class, @_);
138
139 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
140 my $option_list = $self->{'option_list'};
141 push( @{$option_list}, $options );
142
143 if (!parsargv::parse(\@_,
144 q^use_metadata_files^, \$self->{'use_metadata_files'},
145 q^recheck_directories^, \$self->{'recheck_directories'},
146 "allow_extra_options")) {
147 print STDERR "\nRecPlug uses an incorrect option.\n";
148 print STDERR "Check your collect.cfg configuration file.\n\n";
149 $self->print_txt_usage(""); # Use default resource bundle
150 die "\n";
151 }
152
153 if ($self->{'use_metadata_files'}) {
154 # create XML::Parser object for parsing metadata.xml files
155 my $parser = new XML::Parser('Style' => 'Stream',
156 'Handlers' => {'Char' => \&Char,
157 'Doctype' => \&Doctype
158 });
159 $self->{'parser'} = $parser;
160 $self->{'in_filename'} = 0;
161 }
162
163 $self->{'subdir_extrametakeys'} = {};
164
165 return bless $self, $class;
166}
167
168# return 1 if this class might recurse using $pluginfo
169sub is_recursive {
170 my $self = shift (@_);
171
172 return 1;
173}
174
175sub get_default_block_exp {
176 my $self = shift (@_);
177
178 return 'CVS';
179}
180
181# return number of files processed, undef if can't process
182# Note that $base_dir might be "" and that $file might
183# include directories
184
185# This function passes around metadata hash structures. Metadata hash
186# structures are hashes that map from a (scalar) key (the metadata element
187# name) to either a scalar metadata value or a reference to an array of
188# such values.
189
190sub read {
191 my $self = shift (@_);
192 my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs, $gli) = @_;
193
194 my $outhandle = $self->{'outhandle'};
195 my $verbosity = $self->{'verbosity'};
196 my $read_metadata_files = $self->{'use_metadata_files'};
197
198 # Calculate the directory name and ensure it is a directory and
199 # that it is not explicitly blocked.
200 my $dirname = $file;
201 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
202 return undef unless (-d $dirname);
203 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
204
205 # check to make sure we're not reading the archives or index directory
206 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
207 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
208 print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
209 return 0;
210 }
211
212 # check to see we haven't got a cyclic path...
213 if ($dirname =~ m%(/.*){,41}%) {
214 print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n";
215 return 0;
216 }
217
218 # check to see we haven't got a cyclic path...
219 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
220 print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n";
221 return 0;
222 }
223
224 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
225 print $outhandle "RecPlug: metadata passed in: ",
226 join(", ", keys %$in_metadata), "\n";
227 }
228
229 # Recur over directory contents.
230 my (@dir, $subfile);
231 my $count = 0;
232
233 print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
234
235 # find all the files in the directory
236 if (!opendir (DIR, $dirname)) {
237 if ($gli) {
238 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
239 }
240 print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
241 return -1; # error in processing
242 }
243 @dir = readdir (DIR);
244 closedir (DIR);
245
246 # Re-order the files in the list so any directories ending with .all are moved to the end
247 for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
248 if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
249 push(@dir, splice(@dir, $i, 1));
250 }
251 }
252
253 # read XML metadata files (if supplied)
254 my $additionalmetadata = 0; # is there extra metadata available?
255 my %extrametadata; # maps from filespec to extra metadata keys
256 my @extrametakeys; # keys of %extrametadata in order read
257
258 my $dirsepre = &util::get_re_dirsep();
259 my $dirsep = &util::get_dirsep();
260 my $local_dirname = $dirname;
261 $local_dirname =~ s/^$base_dir($dirsepre)//;
262 $local_dirname .= $dirsep;
263
264 if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
265 my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
266 foreach my $ek (@$extrakeys) {
267 my $extrakeys_re = $ek->{'re'};
268 my $extrakeys_md = $ek->{'md'};
269 push(@extrametakeys,$extrakeys_re);
270 $extrametadata{$extrakeys_re} = $extrakeys_md;
271 }
272 delete($self->{'subdir_extrametakeys'}->{$local_dirname});
273 }
274
275 if ($read_metadata_files) {
276 #read the directory "metadata.xml" file
277 my $metadatafile = &util::filename_cat ($dirname, 'metadata.xml');
278 if (-e $metadatafile) {
279 print $outhandle "RecPlug: found metadata in $metadatafile\n"
280 if ($verbosity);
281 $self->read_metadata_xml_file($metadatafile, \%extrametadata, \@extrametakeys);
282 $additionalmetadata = 1;
283 }
284 }
285
286 # apply metadata pass for each of the files in the directory
287 my $out_metadata;
288 my $num_files = scalar(@dir);
289 for (my $i = 0; $i < scalar(@dir); $i++) {
290 my $subfile = $dir[$i];
291 my $this_file_base_dir = $base_dir;
292 last if ($maxdocs != -1 && $count >= $maxdocs);
293 next if ($subfile =~ m/^\.\.?$/);
294 #next if ($read_metadata_files && $subfile =~ /metadata\.xml$/);
295
296 # Recursively read each $subfile
297 print $outhandle "RecPlug metadata recurring: $subfile\n" if ($verbosity > 2);
298
299 $count += &plugin::metadata_read ($pluginfo, $this_file_base_dir,
300 &util::filename_cat($file, $subfile),
301 $out_metadata, \@extrametakeys, \%extrametadata,
302 $processor, $maxdocs, $gli);
303 $additionalmetadata = 1;
304 }
305
306 # filter out any extrametakeys that mention subdirectories and store
307 # for later use (i.e. when that sub-directory is being processed)
308
309 foreach my $ek (@extrametakeys) {
310 my ($subdir_re,$extrakey_dir) = &File::Basename::fileparse($ek);
311 $extrakey_dir =~ s/\\\./\./g; # remove RE syntax
312
313 my $dirsep_re = &util::get_re_dirsep();
314
315 if ($ek =~ m/$dirsep_re/) { # specifies at least one directory
316 my $md = $extrametadata{$ek};
317
318 my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};
319
320 my $subdir_rec = { 're' => $subdir_re, 'md' => $md };
321 push(@{$subdir_extrametakeys->{$extrakey_dir}},$subdir_rec);
322 }
323 }
324
325 # import each of the files in the directory
326 $count=0;
327 for (my $i = 0; $i <= scalar(@dir); $i++) {
328 # When every file in the directory has been done, pause for a moment (figuratively!)
329 # If the -recheck_directories argument hasn't been provided, stop now (default)
330 # Otherwise, re-read the contents of the directory to check for new files
331 # Any new files are added to the @dir list and are processed as normal
332 # This is necessary when documents to be indexed are specified in bibliographic DBs
333 # These files are copied/downloaded and stored in a new folder at import time
334 if ($i == $num_files) {
335 last unless $self->{'recheck_directories'};
336
337 # Re-read the files in the directory to see if there are any new files
338 last if (!opendir (DIR, $dirname));
339 my @dirnow = readdir (DIR);
340 closedir (DIR);
341
342 # We're only interested if there are more files than there were before
343 last if (scalar(@dirnow) <= scalar(@dir));
344
345 # Any new files are added to the end of @dir to get processed by the loop
346 my $j;
347 foreach my $subfilenow (@dirnow) {
348 for ($j = 0; $j < $num_files; $j++) {
349 last if ($subfilenow eq $dir[$j]);
350 }
351 if ($j == $num_files) {
352 # New file
353 push(@dir, $subfilenow);
354 }
355 }
356 # When the new files have been processed, check again
357 $num_files = scalar(@dir);
358 }
359
360 my $subfile = $dir[$i];
361 my $this_file_base_dir = $base_dir;
362 last if ($maxdocs != -1 && $count >= $maxdocs);
363 next if ($subfile =~ /^\.\.?$/);
364 next if ($read_metadata_files && $subfile =~ /metadata\.xml$/);
365
366 # Follow Windows shortcuts
367 if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
368 require Win32::Shortcut;
369 my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
370 if ($shortcut) {
371 # The file to be processed is now the target of the shortcut
372 $this_file_base_dir = "";
373 $file = "";
374 $subfile = $shortcut->Path;
375 }
376 }
377
378 # check for a symlink pointing back to a leading directory
379 if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
380 # readlink gives a "fatal error" on systems that don't implement
381 # symlinks. This assumes the the -l test above would fail on those.
382 my $linkdest=readlink "$dirname/$subfile";
383 if (!defined ($linkdest)) {
384 # system error - file not found?
385 warn "RecPlug: symlink problem - $!";
386 } else {
387 # see if link points to current or a parent directory
388 if ($linkdest =~ m@^[\./\\]+$@ ||
389 index($dirname, $linkdest) != -1) {
390 warn "RecPlug: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
391 next;
392 ;
393 }
394 }
395 }
396
397 print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
398
399 # Make a copy of $in_metadata to pass to $subfile
400 $out_metadata = {};
401 &combine_metadata_structures($out_metadata, $in_metadata);
402
403 # Next add metadata read in XML files (if it is supplied)
404 if ($additionalmetadata == 1) {
405 my ($filespec, $mdref);
406 foreach $filespec (@extrametakeys) {
407 if ($subfile =~ /^$filespec$/) {
408 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
409 if ($verbosity > 2);
410 $mdref = $extrametadata{$filespec};
411 &combine_metadata_structures($out_metadata, $mdref);
412 }
413 }
414 }
415
416 # Recursively read each $subfile
417 print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2);
418
419 $count += &plugin::read ($pluginfo, $this_file_base_dir,
420 &util::filename_cat($file, $subfile),
421 $out_metadata, $processor, $maxdocs, $gli);
422 }
423
424 return $count;
425}
426
427
428
429# Read a manually-constructed metadata file and store the data
430# it contains in the $metadataref structure.
431#
432# (metadataref is a reference to a hash whose keys are filenames
433# and whose values are metadata hash structures.)
434
435sub read_metadata_xml_file {
436 my $self = shift(@_);
437 my ($filename, $metadataref, $metakeysref) = @_;
438 $self->{'metadataref'} = $metadataref;
439 $self->{'metakeysref'} = $metakeysref;
440
441 eval {
442 $self->{'parser'}->parsefile($filename);
443 };
444
445 if ($@) {
446 die "RecPlug: ERROR $filename is not a well formed metadata.xml file ($@)\n";
447 }
448}
449
450sub Doctype {
451 my ($expat, $name, $sysid, $pubid, $internal) = @_;
452
453 # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
454 # to be processed as well as the "DirectoryMetadata" files which should now
455 # be created by import.pl
456 die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
457}
458
459sub StartTag {
460 my ($expat, $element) = @_;
461
462 if ($element eq "FileSet") {
463 $self->{'saved_targets'} = [];
464 $self->{'saved_metadata'} = {};
465 }
466 elsif ($element eq "FileName") {
467 $self->{'in_filename'} = 1;
468 }
469 elsif ($element eq "Metadata") {
470 $self->{'metadata_name'} = $_{'name'};
471 if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
472 $self->{'metadata_accumulate'} = 1;
473 } else {
474 $self->{'metadata_accumulate'} = 0;
475 }
476 }
477}
478
479sub EndTag {
480 my ($expat, $element) = @_;
481
482 if ($element eq "FileSet") {
483 push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}});
484 foreach my $target (@{$self->{'saved_targets'}}) {
485 my $file_metadata = $self->{'metadataref'}->{$target};
486 my $saved_metadata = $self->{'saved_metadata'};
487 if (!defined $file_metadata) {
488 $self->{'metadataref'}->{$target} = $saved_metadata;
489 }
490 else {
491 $self->combine_metadata_structures($file_metadata,$saved_metadata);
492 }
493 }
494 }
495 elsif ($element eq "FileName") {
496 $self->{'in_filename'} = 0;
497 }
498 elsif ($element eq "Metadata") {
499 $self->{'metadata_name'} = "";
500 }
501
502}
503
504sub store_saved_metadata
505{
506 my $self = shift(@_);
507 my ($mname,$mvalue,$md_accumulate) = @_;
508
509 if (defined $self->{'saved_metadata'}->{$mname}) {
510 if ($md_accumulate) {
511 # accumulate mode - add value to existing value(s)
512 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
513 push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue);
514 } else {
515 $self->{'saved_metadata'}->{$mname} =
516 [$self->{'saved_metadata'}->{$mname}, $mvalue];
517 }
518 } else {
519 # override mode
520 $self->{'saved_metadata'}->{$mname} = $mvalue;
521 }
522 } else {
523 if ($md_accumulate) {
524 # accumulate mode - add value into (currently empty) array
525 $self->{'saved_metadata'}->{$mname} = [$mvalue];
526 } else {
527 # override mode
528 $self->{'saved_metadata'}->{$mname} = $mvalue;
529 }
530 }
531}
532
533
534sub Text {
535
536 if ($self->{'in_filename'}) {
537 # $_ == FileName content
538 push (@{$self->{'saved_targets'}}, $_);
539 }
540 elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
541 # $_ == Metadata content
542 my $mname = $self->{'metadata_name'};
543 my $mvalue = $_;
544 my $md_accumulate = $self->{'metadata_accumulate'};
545 $self->store_saved_metadata($mname,$mvalue,$md_accumulate);
546 }
547}
548
549# This Char function overrides the one in XML::Parser::Stream to overcome a
550# problem where $expat->{Text} is treated as the return value, slowing
551# things down significantly in some cases.
552sub Char {
553 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
554 $_[0]->{'Text'} .= $_[1];
555 return undef;
556}
557
558# Combine two metadata structures. Given two references to metadata
559# element structures, add every field of the second ($mdref2) to the first
560# ($mdref1).
561#
562# Afterwards $mdref1 will be updated, and $mdref2 will be unchanged.
563#
564# We have to be careful about the way we merge metadata when one metadata
565# structure is in "override" mode and one is in "merge" mode. In fact, we
566# use the mode from the second structure, $mdref2, because it is generally
567# defined later (lower in the directory structure) and is therefore more
568# "local" to the document concerned.
569#
570# Another issue is the use of references to pass metadata around. If we
571# simply copy one metadata structure reference to another, then we're
572# effectively just copyinga pointer, and changes to the new referene
573# will affect the old (copied) one also. This also applies to ARRAY
574# references used as metadata element values (hence the "clonedata"
575# function below).
576
577sub combine_metadata_structures {
578 my ($mdref1, $mdref2) = @_;
579 my ($key, $value1, $value2);
580
581 foreach $key (keys %$mdref2) {
582
583 $value1 = $mdref1->{$key};
584 $value2 = $mdref2->{$key};
585
586 # If there is no existing value for this metadata field in
587 # $mdref1, so we simply copy the value from $mdref2 over.
588 if (!defined $value1) {
589 $mdref1->{$key} = &clonedata($value2);
590 }
591 # Otherwise we have to add the new values to the existing ones.
592 # If the second structure is accumulated, then acculate all the
593 # values into the first structure
594 elsif ((ref $value2) eq "ARRAY") {
595 # If the first metadata element is a scalar we have to
596 # convert it into an array before we add anything more.
597 if ((ref $value1) ne 'ARRAY') {
598 $mdref1->{$key} = [$value1];
599 $value1 = $mdref1->{$key};
600 }
601 # Now add the value(s) from the second array to the first
602 $value2 = &clonedata($value2);
603 push @$value1, @$value2;
604 }
605 # Finally, If the second structure is not an array erference, we
606 # know it is in override mode, so override the first structure.
607 else {
608 $mdref1->{$key} = &clonedata($value2);
609 }
610 }
611}
612
613
614# Make a "cloned" copy of a metadata value.
615# This is trivial for a simple scalar value,
616# but not for an array reference.
617
618sub clonedata {
619 my ($value) = @_;
620 my $result;
621
622 if ((ref $value) eq 'ARRAY') {
623 $result = [];
624 foreach my $item (@$value) {
625 push @$result, $item;
626 }
627 } else {
628 $result = $value;
629 }
630 return $result;
631}
632
633
6341;
Note: See TracBrowser for help on using the repository browser.