source: main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm@ 36372

Last change on this file since 36372 was 36372, checked in by kjdon, 21 months ago

Tidy up of extrametautil: renamed some methods to make them easier to understand and removed anything unused, then modified the plugins to use the new methods. Also moved some common code into a MetadataRead function, which several plugins can now call instead of duplicating the code. This is an interim commit in which the old code has been left in to make the changes easier to track; the next commit will have everything tidied up.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.8 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29# Devel::Peek's Dump($var) function is useful for debugging encoding issues.
30#use Devel::Peek;
31use Encode;
32use extrametautil;
33use unicode;
34use util;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39use ReadXMLFile;
40use ReadTextFile; # needed for subroutine textcat_get_language_encoding
41use metadatautil;
42use MetadataRead;
43use util;
44
# Parent classes, set at compile time. When several parents define a method
# with the same signature, the one listed first in @ISA takes precedence.
BEGIN {
    @OAIPlugin::ISA = qw(MetadataRead ReadXMLFile ReadTextFile);
}
49
# Allowed values for the "metadata_set" argument.
my $set_list = [
    { name => 'auto', desc => '{OAIPlugin.metadata_set.auto}' },
    { name => 'dc',   desc => '{OAIPlugin.metadata_set.dc}' },
];

# Arguments accepted by this plugin; appended to the shared "ArgList" in new().
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
    {
        name => 'metadata_set',
        desc => '{OAIPlugin.metadata_set}',
        type => 'enumstring',
        reqd => 'no',
        list => $set_list,
        deft => 'dc',
    },
    {
        name => 'document_field',
        desc => '{OAIPlugin.document_field}',
        type => 'metadata',
        reqd => 'no',
        deft => 'gi.Sourcedoc',
    },
];

# Plugin description record; appended to the shared "OptList" in new().
my $options = {
    name     => 'OAIPlugin',
    desc     => '{OAIPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    explodes => 'yes',
    args     => $arguments,
};
82
83
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{'ArgList'} }, @{$arguments};
    push @{ $hashArgOptLists->{'OptList'} }, $options;

    # The ReadTextFile object itself is discarded; the call is made only for
    # whatever it does to the shared lists.
    # NOTE(review): the meaning of the trailing 1 is defined by ReadTextFile - confirm there.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin info is wanted the options are left as they are.
    unless ($self->{'info_only'}) {
        # trim any ex. from document field iff it's the only metadata namespace prefix
        $self->{'document_field'} =~ s/^ex\.([^.]+)$/$1/;
    }

    return bless $self, $class;
}
103
# Returns the default regular expression (as a string) used to pick the files
# this plugin processes: names ending in ".oai", matched case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)(\.oai)$';
}
109
# Returns the document type name associated with this plugin.
sub get_doctype {
    my ($self) = @_;

    return 'OAI-PMH';
}
115
# Handler for the start of an XML document: resets the per-document state
# (the "inside <metadata>" flag, the accumulated raw XML and the table of
# extracted metadata).
sub xml_start_document {
    my ($self) = @_;

    @{$self}{ 'in_metadata_node', 'rawxml' } = (0, '');
    $self->{'saved_metadata'} = {};
}
122
# Handler for the end of an XML document. Nothing needs doing here.
sub xml_end_document {
    return;
}
125
# Handler for the document type declaration. The declaration itself is not
# checked; the handler only reports which file is being processed:
#   - on the plugin's output handle when verbosity is greater than 1
#   - on STDERR, in the <Processing ...> form, when the 'gli' flag is set
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    my $current_file = $self->{'file'};

    if ($self->{'verbosity'} > 1) {
        print { $self->{'outhandle'} } "OAIPlugin: processing $current_file\n";
    }
    print STDERR "<Processing n='$current_file' p='OAIPlugin'>\n" if $self->{'gli'};
}
138
139
# Handler for an opening tag. Re-serialises the tag and appends it to the raw
# XML of the whole file and, while inside a <metadata> element, to the
# metadata XML that extract_oai_metadata() later parses.
#
# Parameters:
#   $expat   - parser object (unused here)
#   $element - name of the element being opened
#
# The element's attributes are read from the global hash %_.
# NOTE(review): presumably filled in by the XML parser before this handler
# runs - confirm in ReadXMLFile.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    # Copy first: $_ is re-used as the loop variable of the map below.
    my %attr_hash = %_;

    # Attributes are written in sorted order so that the generated text is the
    # same on every run (hash key order is not stable). The name=value form
    # without quotes is kept as-is because the result is stored as document
    # text and re-parsed by regular expressions that expect this form.
    my $attr = join('', map { " $_=$attr_hash{$_}" } sort keys %attr_hash);
    my $tag  = "<$element$attr>";

    $self->{'rawxml'} .= $tag;

    # A <metadata> element starts a fresh metadata buffer.
    if ($element eq 'metadata') {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'}     = '';
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $tag;
    }
}
160
# Handler for a closing tag. Appends the closing tag to the raw XML and, while
# inside a <metadata> element, to the metadata XML. When the element being
# closed is <metadata> itself, the collected metadata XML is handed to
# extract_oai_metadata(), which fills $self->{'saved_metadata'}, and the
# "inside <metadata>" flag is cleared.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $closing_tag = "</$element>";

    $self->{'rawxml'} .= $closing_tag;
    $self->{'metadata_xml'} .= $closing_tag if $self->{'in_metadata_node'};

    if ($element eq 'metadata') {
        # extracted values go into 'saved_metadata' (not 'metadata')
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'saved_metadata'});

        $self->{'in_metadata_node'} = 0;
    }
}
182
# Handler for character data. The text is taken from $_ and appended to the
# raw XML and, while inside a <metadata> element, to the metadata XML.
sub xml_text {
    my ($self, $expat) = @_;

    my $text = $_;

    $self->{'rawxml'} .= $text;
    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $text;
    }
}
193
194
# Metadata pass over an OAI file.
#
# Parses the file, collects the metadata extracted by the XML handlers and
# stores it in the extrametadata structures via store_meta_in_extrametadata().
# It also records, per file, whether a source document named by the
# 'document_field' metadata exists on disk next to the OAI file:
#   - if it does, the metadata is stored against that source document and the
#     OAI file is passed along as the metadata file;
#   - if not, the OAI file is itself the document: its raw XML is kept in
#     $self->{'oai-files'} for read(), and no metadata file is passed.
#
# Parameters (as supplied by the caller): $pluginfo, $base_dir, $file,
# $block_hash, $extrametakeys, $extrametadata, $extrametafile, $processor,
# $gli, $aux. Only $base_dir, $file, the three extrameta structures and $gli
# are used here.
#
# Returns undef if the file cannot be processed or fails to parse, 1 otherwise.
sub metadata_read {
    my $self = shift;

    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    print STDERR "\n<Processing n='$file' p='OAIPlugin'>\n" if $gli;
    print STDERR "OAIPlugin: processing $file\n" if $self->{'verbosity'} > 1;

    if (!$self->parse_file($filename_full_path, $file, $gli)) {
        $self->{'saved_metadata'} = undef;
        return undef;
    }

    # Take ownership of what the XML handlers collected during parsing.
    my $new_metadata = $self->{'saved_metadata'};
    $self->{'saved_metadata'} = undef;

    # add the pretty metadata table as metadata
    $new_metadata->{'prettymd'} = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;

    # Values of the document field, with or without the ex. prefix.
    my $document_metadata_field = $self->{'document_field'};
    my $url_array = $new_metadata->{$document_metadata_field};
    if (!defined $url_array) {
        $url_array = $new_metadata->{"ex.$document_metadata_field"};
    }

    my $srcdoc_exists = 0;
    my $filename_dir  = &util::filename_head($filename_full_path);

    # filenames in extrametadata must be relative to current dir, as
    # DirectoryPlugin adds path info on itself
    # (this assumes there will only be one record per oai file - is this always the case??)
    my ($filename_for_metadata) = $file =~ /([^\\\/]+)$/;

    # The first non-URL value that names an existing file is the source document.
    foreach my $candidate (defined $url_array ? @{$url_array} : ()) {
        next if $candidate =~ m/^(https?|ftp):/;

        my $src_filename = &util::filename_cat($filename_dir, $candidate);
        next unless -e $src_filename;

        $srcdoc_exists = 1;
        # get the slashes the right way, use filename_cat
        $filename_for_metadata = &util::filename_cat($candidate);
        last;
    }

    my $file_info = ($self->{'oai-files'}->{$file} ||= {});
    if ($srcdoc_exists) {
        $file_info->{'srcdoc_exists'} = 1;
    }
    else {
        # save the rawxml for the source document
        $file_info->{'srcdoc_exists'} = 0;
        $file_info->{'rawxml'}        = $self->{'rawxml'};
        $self->{'rawxml'}             = "";
    }

    # Store the metadata for later in extrameta; the caller passes it back in
    # at read time, so it does not need to be extracted again.
    # If we have a srcdoc, this file is a metadata file and is passed to the
    # store method. If there is no srcdoc, this is the actual doc, so it must
    # not be treated as a metadata file - pass in undef.
    my ($metadata_file, $metadata_file_path) =
        $srcdoc_exists ? ($file, $filename_full_path) : (undef, undef);
    $self->store_meta_in_extrametadata($filename_for_metadata, $new_metadata,
                                       $metadata_file, $metadata_file_path,
                                       $extrametakeys, $extrametadata, $extrametafile);

    return 1;
}
312
313
# Document pass over an OAI file previously seen by metadata_read().
#
# Parameters (as supplied by the caller): $pluginfo, $base_dir, $file,
# $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli.
#
# Returns:
#   undef - the file was not recorded by metadata_read()
#   0     - a source document exists for this file; all the metadata has
#           already been associated with it, so nothing is built here
#   -1    - process() failed
#   1     - a document was built from the OAI file and sent to $processor
sub read {
    my $self = shift;

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    my $file_info = $self->{'oai-files'}->{$file};
    return undef unless defined $file_info;

    if ($file_info->{'srcdoc_exists'}) {
        # do nothing more - all the metadata has been extracted and associated with the srcdoc
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = $file;
    $filename = &util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    # create a new document
    # (explicit class-method call rather than indirect-object "new doc (...)")
    my $doc_obj     = 'doc'->new($filename, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    my $filename_encoding =
        $self->deduce_filename_encoding($file, $metadata, $self->{'filename_encoding'});
    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj, using the raw XML saved by
    # metadata_read(); the per-file record is no longer needed after this
    my $text = $file_info->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    unless (defined($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'}++;

    return 1; # processed the file
}
381
382
# do plugin specific processing of doc_obj
#
# Escapes the markup characters < > [ ] in the text referenced by $textref
# (the referenced string is modified in place) and adds the result as the
# text of the document's top section. Progress is reported on STDERR when
# $gli is set and on the plugin's output handle when verbosity is above 1.
#
# Always returns 1.
sub process {
    my $self = shift;
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if ($gli);
    if ($self->{'verbosity'} > 1) {
        print { $self->{'outhandle'} } "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    # add text to document object, with markup characters replaced by entities
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    ${$textref} =~ s/([<>\[\]])/$entity_for{$1}/g;

    $doc_obj->add_utf8_text($cursection, ${$textref});

    return 1;
}
409
410
# Improvement is to merge this with newer version in MetadataPass

# Starts a new pretty-print metadata table: $self->{'ppmd_table'} is replaced
# by the opening <table> tag, discarding any previous content.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attributes = (
        'width=100% cellspacing=2',
        q{style='border-bottom: 4px solid #000080'},
    );

    $self->{'ppmd_table'} = "\n<table " . join(' ', @table_attributes) . '>';
}
422
# Appends one row (metadata name, metadata value) to the pretty-print
# metadata table in $self->{'ppmd_table'}. The value is first passed through
# util::hyperlink_text().
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    my $linked_value = &util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join('',
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $linked_value\n",
        " </td>\n",
        " </tr>\n",
    );
}
440
# Finishes the pretty-print metadata table by appending the closing tag to
# $self->{'ppmd_table'}.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    $self->{'ppmd_table'} .= "</table>\n";
}
447
# Maps qualified Dublin Core term names to the dc element under which they
# are stored (see remap_dc_metadata below).
my $qualified_dc_mapping = {
    "alternative"           => "dc.title",
    "tableOfContents"       => "dc.description",
    "abstract"              => "dc.description",
    "created"               => "dc.date",
    "valid"                 => "dc.date",
    "available"             => "dc.date",
    "issued"                => "dc.date",
    "modified"              => "dc.date",
    "dateAccepted"          => "dc.date",
    "dateCopyrighted"       => "dc.date",
    "dateSubmitted"         => "dc.date",
    "extent"                => "dc.format",
    "medium"                => "dc.format",
    "isVersionOf"           => "dc.relation",
    "hasVersion"            => "dc.relation",
    "isReplacedBy"          => "dc.relation",
    "replaces"              => "dc.relation",
    "isRequiredBy"          => "dc.relation",
    "requires"              => "dc.relation",
    "isPartOf"              => "dc.relation",
    "hasPart"               => "dc.relation",
    "isReferencedBy"        => "dc.relation",
    "references"            => "dc.relation",
    "isFormatOf"            => "dc.relation",
    "hasFormat"             => "dc.relation",
    "conformsTo"            => "dc.relation",
    "spatial"               => "dc.coverage",
    "temporal"              => "dc.coverage",
# these are now top level elements in our qualified dc metadata set
#   "audience"              => "dc.any",
#   "accrualMethod"         => "dc.any",
#   "accrualPeriodicity"    => "dc.any",
#   "accrualPolicy"         => "dc.any",
#   "instructionalMethod"   => "dc.any",
#   "provenance"            => "dc.any",
#   "rightsHolder"          => "dc.any",
    "mediator"              => "dc.audience",
    "educationLevel"        => "dc.audience",
    "accessRights"          => "dc.rights",
    "license"               => "dc.rights",
    "bibliographicCitation" => "dc.identifier"
};

# Converts a qualified dc term to its Greenstone form.
#
# Parameter: $metaname - metadata name of the form "prefix.name".
#
# Returns "<mapped element>^<name>" when <name> (the part after the first
# dot) is a known qualified term, e.g. "dc.spatial" gives
# "dc.coverage^spatial". Any other input, including a name without a dot,
# is returned unchanged.
sub remap_dc_metadata
{
    my ($self, $metaname) = @_;

    # $name stays undefined when $metaname contains no dot.
    my ($name) = ($metaname =~ m/^.*?\.(.*?)$/);

    if (defined $name && defined $qualified_dc_mapping->{$name}) {
        return $qualified_dc_mapping->{$name} . "^" . $name;
    }

    return $metaname; # didn't get a match, return param passed in unchanged
}
508
509
# Extracts metadata from the XML of a <metadata> element.
#
# Parameters:
#   $textref  - reference to the XML text; the referenced string is replaced
#               by its utf-8 decoded form
#   $metadata - hash ref to fill: each extracted value is appended to the
#               array stored under "ex.<metadata name>"
#
# As a side effect the pretty-print metadata table in $self is rebuilt, with
# one row per extracted value (named without the ex. prefix).
sub extract_oai_metadata {
    my $self = shift;
    my ($textref, $metadata) = @_;

    $self->open_prettyprint_metadata_table();

    # need to decode the string, else it will be double-encoded at this point
    $$textref = decode("utf-8", $$textref);

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s) {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($outer_tagname, $inner_metadata_text) =
            ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # Nothing to extract when <metadata> has no wrapping element; skipping
        # here avoids matching against undefined values below.
        if (defined $outer_tagname) {

            # split tag into namespace and tag name
            # sometimes, the dc namespace is not specified as the prefix in each element (like <dc:title>)
            # but is rather defined in the wrapper element containing the various dc meta elements,
            # like <dc><title></title><creator></creator></dc>.
            # In such a case, we use this wrapper element as the top_level_prefix
            my (undef, $top_level_prefix) = ($outer_tagname =~ m/^(.*?):(.*?)$/);

            # if there was no prefix, then the tag itself becomes the top_level_prefix
            $top_level_prefix = $outer_tagname unless defined $top_level_prefix;

            # process each element one by one; each match consumes the first
            # remaining element and keeps the rest for the next iteration
            while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s) {
                my ($metaname, $metavalue) = ($1, $2);
                $inner_metadata_text = $3;

                # greenstone uses . for namespace, while oai uses :
                $metaname =~ s/:/\./;

                # if there is no namespace, then we use the outer tag name or
                # namespace for this element
                if ($metaname !~ m/\./) {
                    $metaname = "$top_level_prefix.$metaname";
                }

                # if metadata set is auto, leave as is, otherwise convert to
                # specified namespace
                if ($self->{'metadata_set'} ne "auto") {
                    if ($metaname !~ /^gi\./) { # hack to not overwrite gi metadata
                        $metaname =~ s/^([^\.]*)\./$self->{'metadata_set'}\./;
                        if ($self->{'metadata_set'} eq "dc") {
                            # convert qualified dc terms to gs version, e.g.
                            # spatial becomes coverage^spatial
                            $metaname = $self->remap_dc_metadata($metaname);
                        }
                    }
                }

                # uppercase the first char of the name
                $metaname =~ s/\.(.)/\.\u$1/;
                $metavalue =~ s/\[/&#91;/g;
                $metavalue =~ s/\]/&#93;/g;

                # so that GLI can see this metadata, store here as ex.dc.Title etc
                my $ex_metaname = $metaname;
                $ex_metaname =~ s/^ex\.//;         # remove any pre-existing ex. prefix
                $ex_metaname = "ex.$ex_metaname";  # at last can prefix ex.

                # creates the list on first use
                push(@{ $metadata->{$ex_metaname} }, $metavalue);

                # but don't add ex to the pretty print line
                $self->add_prettyprint_metadata_line($metaname, $metavalue);
            }
        }
    }

    $self->close_prettyprint_metadata_table();
}
602
## The file extension already identifies the file as OAI, so the doctype does
## not need checking: every document is accepted.
sub check_doctype { 1 }
608
6091;
Note: See TracBrowser for help on using the repository browser.