root/main/trunk/greenstone2/perllib/plugins/MetadataXMLPlugin.pm @ 32159

Revision 32159, 13.6 KB (checked in by ak19, 20 months ago)

incremental building was not being incremental when no metadata was assigned to any of the files (as happens with our docs for quick test collections). A default metadata.xml is present, but 'empty' in that it contains no FileSet? elements with metadata elements assigned to FileName? elements. But we still want incremental behaviour. The idea was to write out an entry into archiveinf-src.db for each metadata.xml processed, not just for each meta.xml file actually referencing a doc, as BasePlugout? was doing so far on a per doc basis. Kathy come up with the actual infrastructure that can make it work (to ensure all the necessary objects are available), Dr Bainbridge approved this, and it's now been added into the code.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# MetadataXMLPlugin process metadata.xml files in a collection
27
28# Here's an example of a metadata file that uses three FileSet structures
29# (ignore the # characters):
30
31#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
32#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
33#<DirectoryMetadata>
34#  <FileSet>
35#    <FileName>nugget.*</FileName>
36#    <Description>
37#      <Metadata name="Title">Nugget Point, The Catlins</Metadata>
38#      <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
39#    </Description>
40#  </FileSet>
41#  <FileSet>
42#    <FileName>nugget-point-1.jpg</FileName>
43#    <Description>
44#      <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
45#      <Metadata name="Subject">Lighthouse</Metadata>
46#    </Description>
47#  </FileSet>
48#  <FileSet>
49#    <FileName>kaka-point-dir</FileName>
50#    <Description>
51#      <Metadata name="Title">Kaka Point, The Catlins</Metadata>
52#    </Description>
53#  </FileSet>
54#</DirectoryMetadata>
55
56# Metadata elements are read and applied to files in the order they appear
57# in the file.
58#
59# The FileName element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression (a FileSet group may
61# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
62# indicates that the metadata records in the following Description block
63# apply to every subfile that starts with "nugget".  For these files, a
64# Title metadata element is set, overriding any old value that the Title
65# might have had.
66#
67# Occasionally, we want to have multiple metadata values applied to a
68# document; in this case we use the "mode=accumulate" attribute of the
69# particular Metadata element.  In the second metadata element of the first
70# FileSet above, the "Place" metadata is accumulating, and may therefore be
71# given several values.  If we wanted to override these values and use a
72# single metadata element again, we could set the mode attribute to
73# "override" instead.  Remember: every element is assumed to be in override
74# mode unless you specify otherwise, so if you want to accumulate metadata
75# for some field, every occurance must have "mode=accumulate" specified.
76#
77# The second FileSet element above applies to a specific file, called
78# nugget-point-1.jpg.  This element overrides the Title metadata set in the
79# first FileSet, and adds a "Subject" metadata field.
80#
81# The third and final FileSet sets metadata for a subdirectory rather than
82# a file.  The metadata specified (a Title) will be passed into the
83# subdirectory and applied to every file that occurs in the subdirectory
84# (and to every subsubdirectory and its contents, and so on) unless the
85# metadata is explictly overridden later in the import.
86
87package MetadataXMLPlugin;
88
89use strict;
90no strict 'refs';
91
92use Encode;
93
94use BaseImporter;
95use extrametautil;
96use util;
97use FileUtils;
98use metadatautil;
99
100sub BEGIN {
101    @MetadataXMLPlugin::ISA = ('BaseImporter');
102    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
103}
104
105use XMLParser;
106
107my $arguments = [
108      { 'name' => "process_exp",
109    'desc' => "{BaseImporter.process_exp}",
110    'type' => "regexp",
111    'reqd' => "no",
112    'deft' => &get_default_process_exp() }
113
114];
115
116my $options = { 'name'     => "MetadataXMLPlugin",
117        'desc'     => "{MetadataXMLPlugin.desc}",
118        'abstract' => "no",
119        'inherits' => "yes",
120        'args'     => $arguments };
121
122sub new {
123    my ($class) = shift (@_);
124    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
125    push(@$pluginlist, $class);
126
127    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
128    push(@{$hashArgOptLists->{"OptList"}},$options);
129
130    my $self = new BaseImporter($pluginlist, $inputargs, $hashArgOptLists);
131
132    if ($self->{'info_only'}) {
133    # don't worry about any options or initialisations etc
134    return bless $self, $class;
135    }
136   
137    # The following used to be passed in as a parameter to XML::Parser,
138    # if the version of perl was greater than or equal to 5.8.
139    # The svn commit comment explaining the reason for adding this was
140    # not very clear and also said that it was quick fix and hadn't
141    # been tested under windows.
142    # More recent work has been to make strings in Perl "Unicode-aware"
143    # and so this line might actually be potentially harmful, however
144    # it is not the case that we encountered an actual error leading to
145    # its removal, rather it has been eliminated in an attempt to tighten
146    # up the code. For example, this protocol encoding is not used in
147    # ReadXMLFile.
148    # 'ProtocolEncoding' => 'ISO-8859-1',
149
150    # create XML::Parser object for parsing metadata.xml files
151    my $parser = new XML::Parser('Style' => 'Stream',                             
152                                  'Pkg' => 'MetadataXMLPlugin',
153                                  'PluginObj' => $self,
154                    'Handlers' => {'Char' => \&Char,
155                         'Doctype' => \&Doctype
156                         });
157
158    $self->{'parser'} = $parser;
159    $self->{'in_filename'} = 0;
160   
161    return bless $self, $class;
162}
163
164
165sub get_default_process_exp
166{
167    return q^metadata\.xml$^;
168}
169
170sub get_doctype {
171    my $self = shift(@_);
172   
173    return "(Greenstone)?DirectoryMetadata"
174}
175
176sub can_process_this_file {
177    my $self = shift(@_);
178    my ($filename) = @_;
179
180    if (-f $filename && $self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
181       return 1; # its a file for us
182    }
183    return 0;
184}
185
186sub check_doctype {
187    my $self = shift (@_);
188   
189    my ($filename) = @_;
190
191    if (open(XMLIN,"<$filename")) {
192    my $doctype = $self->get_doctype();
193    ## check whether the doctype has the same name as the root element tag
194    while (defined (my $line = <XMLIN>)) {
195        ## find the root element
196        if ($line =~ /<([\w\d:]+)[\s\/>]/){
197        my $root = $1;
198        if ($root !~ $doctype){
199            close(XMLIN);
200            return 0;
201        }
202        else {
203            close(XMLIN);
204            return 1;
205        }
206        }
207    }
208    close(XMLIN);
209    }
210   
211    return undef; # haven't found a valid line
212   
213}
214
215sub file_block_read {
216    my $self = shift (@_);
217    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
218   
219    my $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file);
220    return undef unless $self->can_process_this_file($filename_full_path);   
221
222    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
223        # convert to full name - paths stored in block hash are long filenames
224    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
225    my $lower_drive = $filename_full_path;
226    $lower_drive =~ s/^([A-Z]):/\l$1:/i;
227   
228    my $upper_drive = $filename_full_path;
229    $upper_drive =~ s/^([A-Z]):/\u$1:/i;
230   
231    $block_hash->{'metadata_files'}->{$lower_drive} = 1;
232    $block_hash->{'metadata_files'}->{$upper_drive} = 1;
233       
234    }
235    else {
236    $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
237    }
238
239    return 1;
240}
241
242sub metadata_read
243{
244    my $self = shift (@_);
245    my ($pluginfo, $base_dir, $file, $block_hash,
246    $extrametakeys, $extrametadata,$extrametafile,
247    $processor, $gli, $aux) = @_;
248
249    my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
250    return undef unless $self->can_process_this_file($filename);   
251   
252    $self->{'metadata-file'} = $file;
253    $self->{'metadata-filename'} = $filename;
254   
255    my $outhandle = $self->{'outhandle'};
256   
257    print STDERR "\n<Processing n='$file' p='MetadataXMLPlugin'>\n" if ($gli);
258    print $outhandle "MetadataXMLPlugin: processing $file\n" if ($self->{'verbosity'})> 1;
259
260    # In order to prevent blind reprocessing of the same old docs upon *incremental* building
261    # whenever we encounter a default empty metadata.xml that has no content defined (attaches
262    # no meta), we write an entry for *each* metadata.xml into archiveinf-src.db
263    print $outhandle "MetadataXMLPlugin: writing an entry for this metadata.xml into archiveinf-src.db\n" if ($self->{'verbosity'})> 1;
264    $processor->add_metaxml_file_entry_to_archiveinfsrc($filename); # pass in the full filename, like BasePlugout::archiveinf_db() does
265
266   
267    # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
268    $self->block_raw_filename($block_hash,$filename);
269
270    $self->{'metadataref'} = $extrametadata;
271    $self->{'metafileref'} = $extrametafile;
272    $self->{'metakeysref'} = $extrametakeys;
273   
274    eval {
275    $self->{'parser'}->parsefile($filename);
276    };
277
278    if ($@) {
279    print STDERR "**** Error is: $@\n";
280    my $plugin_name = ref ($self);
281    my $failhandle = $self->{'failhandle'};
282    print $outhandle "$plugin_name failed to process $file ($@)\n";
283    print $failhandle "$plugin_name failed to process $file ($@)\n";
284    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
285    return -1; #error
286    }
287   
288    return 1;
289
290}
291
292
293# Updated by Jeffrey 2010/04/16 @ DL Consulting Ltd.
294# Get rid off the global $self as it cause problems when there are 2+ MetadataXMLPlugin in your collect.cfg...
295# For example when you have an OAIMetadataXMLPlugin that is a child of MetadataXMLPlugin
296sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
297sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
298sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
299sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
300
301
302sub xml_doctype {
303    my $self = shift(@_);
304    my ($expat, $name, $sysid, $pubid, $internal) = @_;
305
306    # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
307    # to be processed as well as the "DirectoryMetadata" files which should now
308    # be created by import.pl
309    die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
310}
311
312sub xml_start_tag {
313    my $self = shift(@_);
314    my ($expat, $element) = @_;
315   
316    if ($element eq "FileSet") {
317    $self->{'saved_targets'} = [];
318    $self->{'saved_metadata'} = {};
319    }
320    elsif ($element eq "FileName") {
321    $self->{'in_filename'} = 1;
322    }
323    elsif ($element eq "Metadata") {
324    $self->{'metadata_name'} = $_{'name'};
325    $self->{'metadata_value'} = "";
326    if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
327        $self->{'metadata_accumulate'} = 1;
328    } else {
329        $self->{'metadata_accumulate'} = 0;
330    }
331    }
332}
333
334sub xml_end_tag {
335    my $self = shift(@_);
336    my ($expat, $element) = @_;
337
338    if ($element eq "FileSet") {
339    foreach my $target (@{$self->{'saved_targets'}}) {
340   
341        # FileNames must be regex, but we allow \\ for path separator on windows. convert to /
342        $target = &util::filepath_regex_to_url_format($target);
343
344        # we want proper unicode for the regex, so convert url-encoded chars
345        if (&unicode::is_url_encoded($target)) {
346        $target = &unicode::url_decode($target);
347        }
348
349        my $file_metadata = &extrametautil::getmetadata($self->{'metadataref'}, $target);
350        my $saved_metadata = $self->{'saved_metadata'};
351
352        if (!defined $file_metadata) {
353        &extrametautil::setmetadata($self->{'metadataref'}, $target, $saved_metadata);
354
355        # not had target before
356        &extrametautil::addmetakey($self->{'metakeysref'}, $target);
357        }
358        else {
359        &metadatautil::combine_metadata_structures($file_metadata,$saved_metadata);
360        }
361
362       
363        # now record which metadata.xml file it came from
364
365        my $file = $self->{'metadata-file'};
366        my $filename = $self->{'metadata-filename'};
367
368        if (!defined &extrametautil::getmetafile($self->{'metafileref'}, $target)) {
369            &extrametautil::setmetafile($self->{'metafileref'}, $target, {});
370        }
371
372        &extrametautil::setmetafile_for_named_file($self->{'metafileref'}, $target, $file, $filename);
373    }
374    }
375    elsif ($element eq "FileName") {
376    $self->{'in_filename'} = 0;
377    }
378    elsif ($element eq "Metadata") {
379    # text read in by XML::Parser is in Perl's binary byte value
380    # form ... need to explicitly make it UTF-8
381   
382    my $metadata_name = $self->{'metadata_name'};
383    my $metadata_value = $self->{'metadata_value'};
384    #my $metadata_name = decode("utf-8",$self->{'metadata_name'});
385    #my $metadata_value = decode("utf-8",$self->{'metadata_value'});
386   
387    &metadatautil::store_saved_metadata($self,
388                        $metadata_name, $metadata_value,
389                        $self->{'metadata_accumulate'});
390    $self->{'metadata_name'} = "";
391    }
392
393}
394
395sub xml_text {
396    my $self = shift(@_);
397
398    if ($self->{'in_filename'}) {
399    # $_ == FileName content
400    push (@{$self->{'saved_targets'}}, $_);
401    }
402    elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
403    # $_ == Metadata content
404    $self->{'metadata_value'} = $_;
405    }
406}
407
408# This Char function overrides the one in XML::Parser::Stream to overcome a
409# problem where $expat->{Text} is treated as the return value, slowing
410# things down significantly in some cases.
411sub Char {
412#    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
413
414#    if ($]<5.008) {
415#   use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
416#    }
417    $_[0]->{'Text'} .= $_[1];
418    return undef;
419}
420
421
422
4231;
Note: See TracBrowser for help on using the browser.