source: main/trunk/greenstone2/perllib/plugins/MetadataXMLPlugin.pm@ 36372

Last change on this file since 36372 was 36372, checked in by kjdon, 21 months ago

tidy up of extrametautil, renaming some methods to make them easier to understand, removing anything unused. then modifying plugins to use new methods. Also, moved some common code to MetadataRead function, can call this from several plugins instead of duplicating code. This is an interim commit, where I have left in the old code to make it easier to track changes. Next commit will have everything tidied up.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.8 KB
Line 
1###########################################################################
2#
3# MetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# MetadataXMLPlugin process metadata.xml files in a collection
27
28# Here's an example of a metadata file that uses three FileSet structures
29# (ignore the # characters):
30
31#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
32#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
33#<DirectoryMetadata>
34# <FileSet>
35# <FileName>nugget.*</FileName>
36# <Description>
37# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
38# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
39# </Description>
40# </FileSet>
41# <FileSet>
42# <FileName>nugget-point-1.jpg</FileName>
43# <Description>
44# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
45# <Metadata name="Subject">Lighthouse</Metadata>
46# </Description>
47# </FileSet>
48# <FileSet>
49# <FileName>kaka-point-dir</FileName>
50# <Description>
51# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
52# </Description>
53# </FileSet>
54#</DirectoryMetadata>
55
56# Metadata elements are read and applied to files in the order they appear
57# in the file.
58#
59# The FileName element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression (a FileSet group may
61# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
62# indicates that the metadata records in the following Description block
63# apply to every subfile that starts with "nugget". For these files, a
64# Title metadata element is set, overriding any old value that the Title
65# might have had.
66#
67# Occasionally, we want to have multiple metadata values applied to a
68# document; in this case we use the "mode=accumulate" attribute of the
69# particular Metadata element. In the second metadata element of the first
70# FileSet above, the "Place" metadata is accumulating, and may therefore be
71# given several values. If we wanted to override these values and use a
72# single metadata element again, we could set the mode attribute to
73# "override" instead. Remember: every element is assumed to be in override
74# mode unless you specify otherwise, so if you want to accumulate metadata
75# for some field, every occurance must have "mode=accumulate" specified.
76#
77# The second FileSet element above applies to a specific file, called
78# nugget-point-1.jpg. This element overrides the Title metadata set in the
79# first FileSet, and adds a "Subject" metadata field.
80#
81# The third and final FileSet sets metadata for a subdirectory rather than
82# a file. The metadata specified (a Title) will be passed into the
83# subdirectory and applied to every file that occurs in the subdirectory
84# (and to every subsubdirectory and its contents, and so on) unless the
85# metadata is explictly overridden later in the import.
86
87package MetadataXMLPlugin;
88
89use strict;
90no strict 'refs';
91
92use Encode;
93
94use BaseImporter;
95use extrametautil;
96use util;
97use FileUtils;
98use metadatautil;
99
100sub BEGIN {
101 @MetadataXMLPlugin::ISA = ('BaseImporter');
102 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
103}
104
105use XMLParser;
106
107my $arguments = [
108 { 'name' => "process_exp",
109 'desc' => "{BaseImporter.process_exp}",
110 'type' => "regexp",
111 'reqd' => "no",
112 'deft' => &get_default_process_exp() }
113
114];
115
116my $options = { 'name' => "MetadataXMLPlugin",
117 'desc' => "{MetadataXMLPlugin.desc}",
118 'abstract' => "no",
119 'inherits' => "yes",
120 'args' => $arguments };
121
122sub new {
123 my ($class) = shift (@_);
124 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
125 push(@$pluginlist, $class);
126
127 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
128 push(@{$hashArgOptLists->{"OptList"}},$options);
129
130 my $self = new BaseImporter($pluginlist, $inputargs, $hashArgOptLists);
131
132 if ($self->{'info_only'}) {
133 # don't worry about any options or initialisations etc
134 return bless $self, $class;
135 }
136
137 # The following used to be passed in as a parameter to XML::Parser,
138 # if the version of perl was greater than or equal to 5.8.
139 # The svn commit comment explaining the reason for adding this was
140 # not very clear and also said that it was quick fix and hadn't
141 # been tested under windows.
142 # More recent work has been to make strings in Perl "Unicode-aware"
143 # and so this line might actually be potentially harmful, however
144 # it is not the case that we encountered an actual error leading to
145 # its removal, rather it has been eliminated in an attempt to tighten
146 # up the code. For example, this protocol encoding is not used in
147 # ReadXMLFile.
148 # 'ProtocolEncoding' => 'ISO-8859-1',
149
150 # create XML::Parser object for parsing metadata.xml files
151 my $parser = new XML::Parser('Style' => 'Stream',
152 'Pkg' => 'MetadataXMLPlugin',
153 'PluginObj' => $self,
154 'Handlers' => {'Char' => \&Char,
155 'Doctype' => \&Doctype
156 });
157
158 $self->{'parser'} = $parser;
159 $self->{'in_filename'} = 0;
160
161 return bless $self, $class;
162}
163
164
165sub get_default_process_exp
166{
167 return q^metadata\.xml$^;
168}
169
170sub get_doctype {
171 my $self = shift(@_);
172
173 return "(Greenstone)?DirectoryMetadata"
174}
175
176sub can_process_this_file {
177 my $self = shift(@_);
178 my ($filename) = @_;
179
180 if (-f $filename && $self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
181 return 1; # its a file for us
182 }
183 return 0;
184}
185
186sub check_doctype {
187 my $self = shift (@_);
188
189 my ($filename) = @_;
190
191 if (open(XMLIN,"<$filename")) {
192 my $doctype = $self->get_doctype();
193 ## check whether the doctype has the same name as the root element tag
194 while (defined (my $line = <XMLIN>)) {
195 ## find the root element
196 if ($line =~ /<([\w\d:]+)[\s\/>]/){
197 my $root = $1;
198 if ($root !~ $doctype){
199 close(XMLIN);
200 return 0;
201 }
202 else {
203 close(XMLIN);
204 return 1;
205 }
206 }
207 }
208 close(XMLIN);
209 }
210
211 return undef; # haven't found a valid line
212
213}
214
215sub file_block_read {
216 my $self = shift (@_);
217 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
218
219 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file);
220 return undef unless $self->can_process_this_file($filename_full_path);
221
222 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
223 # convert to full name - paths stored in block hash are long filenames
224 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
225 }
226# kjdon - upgrade method converts everyhting to lower case drive letter.
227# so would we need the following stuff???
228# my $lower_drive = $filename_full_path;
229# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
230
231# my $upper_drive = $filename_full_path;
232# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
233
234# $block_hash->{'metadata_files'}->{$lower_drive} = 1;
235# $block_hash->{'metadata_files'}->{$upper_drive} = 1;
236
237# }
238# else {
239 $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
240 # }
241
242 return 1;
243}
244
245sub metadata_read
246{
247 my $self = shift (@_);
248 my ($pluginfo, $base_dir, $file, $block_hash,
249 $extrametakeys, $extrametadata,$extrametafile,
250 $processor, $gli, $aux) = @_;
251
252 my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
253 return undef unless $self->can_process_this_file($filename);
254
255 $self->{'metadata-file'} = $file;
256 $self->{'metadata-filename'} = $filename;
257
258 my $outhandle = $self->{'outhandle'};
259
260 print STDERR "\n<Processing n='$file' p='MetadataXMLPlugin'>\n" if ($gli);
261 print $outhandle "MetadataXMLPlugin: processing $file\n" if ($self->{'verbosity'})> 1;
262
263 # In order to prevent blind reprocessing of the same old docs upon *incremental* building
264 # whenever we encounter a default empty metadata.xml that has no content defined (attaches
265 # no meta), we write an entry for *each* metadata.xml into archiveinf-src.db
266 print $outhandle "MetadataXMLPlugin: writing an entry for this metadata.xml into archiveinf-src.db\n" if ($self->{'verbosity'})> 1;
267 $processor->add_metaxml_file_entry_to_archiveinfsrc($filename); # pass in the full filename, like BasePlugout::archiveinf_db() does
268
269
270 # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
271 $self->block_raw_filename($block_hash,$filename);
272
273 $self->{'metadataref'} = $extrametadata;
274 $self->{'metafileref'} = $extrametafile;
275 $self->{'metakeysref'} = $extrametakeys;
276
277 eval {
278 $self->{'parser'}->parsefile($filename);
279 };
280
281 if ($@) {
282 print STDERR "**** Error is: $@\n";
283 my $plugin_name = ref ($self);
284 my $failhandle = $self->{'failhandle'};
285 print $outhandle "$plugin_name failed to process $file ($@)\n";
286 print $failhandle "$plugin_name failed to process $file ($@)\n";
287 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
288 return -1; #error
289 }
290
291 return 1;
292
293}
294
295
296# Updated by Jeffrey 2010/04/16 @ DL Consulting Ltd.
297# Get rid off the global $self as it cause problems when there are 2+ MetadataXMLPlugin in your collect.cfg...
298# For example when you have an OAIMetadataXMLPlugin that is a child of MetadataXMLPlugin
299sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
300sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
301sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
302sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
303
304
305sub xml_doctype {
306 my $self = shift(@_);
307 my ($expat, $name, $sysid, $pubid, $internal) = @_;
308
309 # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
310 # to be processed as well as the "DirectoryMetadata" files which should now
311 # be created by import.pl
312 die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
313}
314
315sub xml_start_tag {
316 my $self = shift(@_);
317 my ($expat, $element) = @_;
318
319 if ($element eq "FileSet") {
320 $self->{'saved_targets'} = [];
321 $self->{'saved_metadata'} = {};
322 }
323 elsif ($element eq "FileName") {
324 $self->{'in_filename'} = 1;
325 }
326 elsif ($element eq "Metadata") {
327 $self->{'metadata_name'} = $_{'name'};
328 $self->{'metadata_value'} = "";
329 if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
330 $self->{'metadata_accumulate'} = 1;
331 } else {
332 $self->{'metadata_accumulate'} = 0;
333 }
334 }
335}
336
337sub xml_end_tag {
338 my $self = shift(@_);
339 my ($expat, $element) = @_;
340
341 if ($element eq "FileSet") {
342 foreach my $target (@{$self->{'saved_targets'}}) {
343
344 # FileNames must be regex, but we allow \\ for path separator on windows. convert to /
345 $target = &util::filepath_regex_to_url_format($target);
346
347 # we want proper unicode for the regex, so convert url-encoded chars
348 if (&unicode::is_url_encoded($target)) {
349 $target = &unicode::url_decode($target);
350 }
351
352 my $file_metadata = &extrametautil::getmetadata($self->{'metadataref'}, $target);
353 my $saved_metadata = $self->{'saved_metadata'};
354
355 if (!defined $file_metadata) {
356 &extrametautil::setmetadata($self->{'metadataref'}, $target, $saved_metadata);
357
358 # not had target before
359 &extrametautil::addmetakey($self->{'metakeysref'}, $target);
360 }
361 else {
362 &metadatautil::combine_metadata_structures($file_metadata,$saved_metadata);
363 }
364
365
366 # now record which metadata.xml file it came from
367
368 my $file = $self->{'metadata-file'};
369 my $filename = $self->{'metadata-filename'};
370
371# if (!defined &extrametautil::getmetafile($self->{'metafileref'}, $target)) {
372# &extrametautil::setmetafile($self->{'metafileref'}, $target, {});
373# }
374
375# &extrametautil::setmetafile_for_named_file($self->{'metafileref'}, $target, $file, $filename);
376 &extrametautil::addmetafile($self->{'metafileref'}, $target, $file, $filename);
377 }
378 }
379 elsif ($element eq "FileName") {
380 $self->{'in_filename'} = 0;
381 }
382 elsif ($element eq "Metadata") {
383 # text read in by XML::Parser is in Perl's binary byte value
384 # form ... need to explicitly make it UTF-8
385
386 my $metadata_name = $self->{'metadata_name'};
387 my $metadata_value = $self->{'metadata_value'};
388 #my $metadata_name = decode("utf-8",$self->{'metadata_name'});
389 #my $metadata_value = decode("utf-8",$self->{'metadata_value'});
390
391 &metadatautil::store_saved_metadata($self,
392 $metadata_name, $metadata_value,
393 $self->{'metadata_accumulate'});
394 $self->{'metadata_name'} = "";
395 }
396
397}
398
399sub xml_text {
400 my $self = shift(@_);
401
402 if ($self->{'in_filename'}) {
403 # $_ == FileName content
404 push (@{$self->{'saved_targets'}}, $_);
405 }
406 elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
407 # $_ == Metadata content
408 $self->{'metadata_value'} = $_;
409 }
410}
411
412# This Char function overrides the one in XML::Parser::Stream to overcome a
413# problem where $expat->{Text} is treated as the return value, slowing
414# things down significantly in some cases.
415sub Char {
416# use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
417
418# if ($]<5.008) {
419# use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
420# }
421 $_[0]->{'Text'} .= $_[1];
422 return undef;
423}
424
425
426
4271;
Note: See TracBrowser for help on using the repository browser.