root/main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm @ 24951

Revision 24951, 16.4 KB (checked in by ak19, 8 years ago)

All perlcode that accesses extrametakeys, extrametadata, extrametafile data structures has been moved into a new perl module called extrametautil.pm. The next step will be to ensure that the file_regexes used to index into these data structures are consistent (using consistent slashes, like URL style slashes).

  • Property svn:keywords set to Author Date Id Revision
RevLine 
[4726]1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[15872]27package OAIPlugin;
[4726]28
[24951]29use extrametautil;
[4726]30use unicode;
31use util;
32
[10254]33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
[15872]36use ReadXMLFile;
[17066]37use ReadTextFile; # needed for subroutine textcat_get_language_encoding
[17216]38use metadatautil;
[24547]39use MetadataRead;
[9958]40
# Parent classes, set up at compile time.  Order matters: when more than one
# parent defines a method with the same signature, the parent listed first
# in @ISA is the one whose method gets called.
sub BEGIN {
    @OAIPlugin::ISA = qw(MetadataRead ReadXMLFile ReadTextFile);
}
45
# Values accepted by the "metadata_set" option below.
my $set_list = [
    {
        'name' => "auto",
        'desc' => "{OAIPlugin.metadata_set.auto}",
    },
    {
        'name' => "dc",
        'desc' => "{OAIPlugin.metadata_set.dc}",
    },
];

# Options this plugin adds to the shared argument list (see new()).
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => &get_default_process_exp(),
    },
    {
        'name' => "metadata_set",
        'desc' => "{OAIPlugin.metadata_set}",
        'type' => "enumstring",
        'reqd' => "no",
        'list' => $set_list,
        'deft' => "dc",
    },
    {
        'name' => "document_field",
        'desc' => "{OAIPlugin.document_field}",
        'type' => "metadata",
        'reqd' => "no",
        'deft' => "gi.Sourcedoc",
    },
];

# Plugin description block appended to the shared option list (see new()).
my $options = {
    'name'     => "OAIPlugin",
    'desc'     => "{OAIPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
[4747]78
[10254]79
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class's name is appended to it
#   $inputargs       - handed unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument table is appended to
#                      its "ArgList" entry and its option block to "OptList"
#
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Parent constructors are called as Class->new(...) rather than with the
    # indirect-object form "new Class(...)", whose parsing depends on
    # compile-time heuristics.
    # The ReadTextFile object itself is discarded; only the call matters.
    # NOTE(review): the trailing 1 is a flag interpreted by ReadTextFile,
    # which is not visible from this file - confirm its meaning there.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted, leave the options untouched.
    unless ($self->{'info_only'}) {
        # trim any ex. from document field iff it's the only metadata
        # namespace prefix (e.g. "ex.Sourcedoc" -> "Sourcedoc", while
        # "ex.dc.Identifier" is left alone)
        $self->{'document_field'} =~ s/^ex\.([^.]+)$/$1/;
    }

    return bless $self, $class;
}
99
# Default value of the "process_exp" option: a regular expression, returned
# as a string, matching names that end in ".oai" regardless of case.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)(\.oai)$';
}
105
# Name of the XML document type this plugin deals with.
sub get_doctype {
    my ($self) = @_;

    return 'OAI-PMH';
}
111
# Reset the per-file parsing state: no <metadata> element is open, no raw
# XML has been collected, and the metadata store is a fresh empty hash.
sub xml_start_document {
    my ($self) = @_;

    $self->{'rawxml'} = "";
    $self->{'in_metadata_node'} = 0;
    $self->{'saved_metadata'} = {};
}
[4726]118
# Deliberately does nothing: this plugin has no end-of-document work.
sub xml_end_document {
    return;
}
[4726]121
# Announce the file currently being parsed.
#
# Writes a progress line to the plugin's output handle when verbosity is
# above 1, and a <Processing> marker to STDERR when the 'gli' flag is set.
# The doctype details passed in are not examined.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    my $current_file = $self->{'file'};

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $current_file\n";
    }
    print STDERR "<Processing n='$current_file' p='OAIPlugin'>\n" if $self->{'gli'};
}
[4726]134
[9958]135
# Record an opening tag.
#
# The tag is rebuilt as "<name attr=value ...>" (attribute values are not
# quoted) and appended to the raw XML.  An element called "metadata" opens
# a fresh metadata buffer; while that element is open every tag is copied
# into the buffer as well.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    # The element's attributes are read from the global %_.
    # NOTE(review): presumably filled in by the XML parser - confirm.
    my %attr_hash = %_;
    my $attr = join("", map { " $_=$attr_hash{$_}" } keys %attr_hash);

    my $open_tag = "<$element$attr>";
    $self->{'rawxml'} .= $open_tag;

    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $open_tag;
    }
}
[4726]156
# Record a closing tag.
#
# The tag is appended to the raw XML and, while a <metadata> element is
# open, to the metadata buffer.  When the element being closed is
# "metadata" itself, the buffered text is passed to extract_oai_metadata(),
# which fills $self->{'saved_metadata'}, and the "inside metadata" flag is
# cleared.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";
    $self->{'rawxml'} .= $close_tag;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $close_tag;
    }

    if ($element eq "metadata") {
        # results go into saved_metadata rather than $self->{'metadata'}
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'saved_metadata'});

        $self->{'in_metadata_node'} = 0;
    }
}
[4726]178
# Record character data.
#
# The text is read from the global $_ and appended to the raw XML and,
# while a <metadata> element is open, to the metadata buffer.
# NOTE(review): $_ is presumably set by the XML parser - confirm.
sub xml_text {
    my ($self, $expat) = @_;

    my $chunk = $_;
    $self->{'rawxml'} .= $chunk;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $chunk;
    }
}
[4726]189
[8121]190
# Parse an OAI record file and hand the metadata it contains to the caller.
#
# Arguments used here:
#   $base_dir, $file - location of the OAI file
#   $extrametakeys, $extrametadata, $extrametafile
#                    - caller-owned structures, updated through extrametautil
#   $gli             - passed on to parse_file()
# ($pluginfo, $block_hash, $processor and $aux are accepted but not used.)
#
# The extracted metadata is registered under a regular expression built from
# a file name.  If one of the values of the configured document field names
# a file that exists next to the OAI file, that name is used and the OAI file
# is recorded as the metadata file for it.  Otherwise the OAI file's own name
# is used and the raw XML is kept in $self->{'oai-files'} for read().
#
# Returns undef if the file is not one this plugin handles or cannot be
# parsed, and 1 otherwise.
sub metadata_read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    if (!$self->parse_file($filename_full_path, $file, $gli)) {
        $self->{'saved_metadata'} = undef;
        return undef;
    }

    # take over what the parse collected and clear the per-file slots
    my $new_metadata = $self->{'saved_metadata'};
    $self->{'saved_metadata'} = undef;

    # add the pretty metadata table as metadata
    $new_metadata->{'prettymd'} = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;

    # values of the document field, with or without the "ex." prefix
    my $document_metadata_field = $self->{'document_field'};
    my $url_array = $new_metadata->{$document_metadata_field};
    if (!defined $url_array) {
        # try ex.
        $url_array = $new_metadata->{"ex.$document_metadata_field"};
    }
    my @candidate_urls = (defined $url_array) ? @$url_array : ();
    ##print STDERR scalar(@candidate_urls) . " urls for $file\n";

    my $srcdoc_exists = 0;
    my $filename_dir = &util::filename_head($filename_full_path);

    # filenames in extrametadata must be relative to current dir, as
    # DirectoryPlugin adds path info on itself
    # this assumes there will only be one record per oai file - is this always the case??
    my ($filename_for_metadata) = $file =~ /([^\\\/]+)$/;

    # the first non-remote value that names an existing file wins
    foreach my $url (@candidate_urls) {
        next if $url =~ m/^(https?|ftp):/;

        my $src_filename = &util::filename_cat($filename_dir, $url);
        next unless -e $src_filename;

        $srcdoc_exists = 1;
        # get the slashes the right way, use filename_cat
        $filename_for_metadata = &util::filename_cat($url);
        last;
    }

    if ($srcdoc_exists) {
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;
    }
    else {
        # save the rawxml for the source document
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0;
        $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'};
        $self->{'rawxml'} = "";
    }

    # return all the metadata we have extracted to the caller.
    # Directory plug will pass it back in at read time, so we don't need to extract it again.
    # extrametadata keys should be regular expressions
    $filename_for_metadata = &util::filename_to_regex($filename_for_metadata);
    &extrametautil::setmetadata($extrametadata, $filename_for_metadata, $new_metadata);
    &extrametautil::addmetakey($extrametakeys, $filename_for_metadata);
    if ($srcdoc_exists) {
        if (!defined &extrametautil::getmetafile($extrametafile, $filename_for_metadata)) {
            &extrametautil::setmetafile($extrametafile, $filename_for_metadata, {});
        }
        # maps the file to full path
        &extrametautil::setmetafile_for_named_file($extrametafile, $filename_for_metadata, $file, $filename_full_path);
    }

    return 1;
}
272
273
# Turn an OAI file that has no accompanying source document into a document
# of its own.
#
# Arguments used here: $pluginfo, $base_dir, $file, $metadata, $processor
# and $gli.  ($block_hash, $maxdocs and $total_count are accepted but not
# used.)
#
# Returns:
#   undef - metadata_read() never recorded this file, so it is not ours
#   0     - a source document exists; its metadata has already been handed
#           over, so nothing is built here
#   -1    - process() failed
#   1     - a document was built and passed to $processor
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    if (!defined $self->{'oai-files'}->{$file}) {
        return undef;
    }

    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};
    if ($srcdoc_exists) {
        # do nothing more - all the metadata has been extracted and associated with the srcdoc
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # create a new document
    # (Class->new form: the indirect-object form "new doc (...)" is parsed
    # by heuristics that take into account this package's own sub new.)
    my $doc_obj = doc->new($filename, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj, using the raw XML that
    # metadata_read() saved; the saved entry is no longer needed afterwards
    my $text = $self->{'oai-files'}->{$file}->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
341
342
# Plugin-specific processing of doc_obj: store the raw XML as the text of
# the document's top section.
#
# $textref is a reference to the text, which is modified in place: "<", ">",
# "[" and "]" are replaced by "&lt;", "&gt;", "&#91;" and "&#93;".
# NOTE(review): the brackets are presumably escaped because they have a
# special meaning further down the pipeline - confirm.
#
# Progress is reported on STDERR when $gli is true and on the plugin's
# output handle when verbosity is above 1.  Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    ##    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);

    # One pass is enough: none of the replacements contains a character
    # that is itself replaced.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$entity_for{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
369
370
[9958]371# Improvement is to merge this with newer version in MetadataPass
[4726]372
# Start a new pretty-print metadata table: $self->{'ppmd_table'} is replaced
# by a newline followed by the opening <table> tag.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attributes = (
        "width=100% cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attributes) . ">";
}
382
# Append one two-cell row (metadata name, metadata value) to the
# pretty-print table in $self->{'ppmd_table'}.  The value is first passed
# through util::hyperlink_text().
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);

    my @row = (
        "  <tr bgcolor=#b5d3cd>\n",
        "    <td width=30%>\n",
        "      $metaname\n",
        "    </td>\n",
        "    <td>\n",
        "      $metavalue_utf8\n",
        "    </td>\n",
        "  </tr>\n",
    );

    $self->{'ppmd_table'} .= join("", @row);
}
400
# Finish the pretty-print table by appending the closing </table> tag.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    $self->{'ppmd_table'} .= "</table>\n";
}
407
# Qualified Dublin Core terms and the dc element each one refines.
my $qualified_dc_mapping = {
    "alternative" => "dc.title",
    "tableOfContents" => "dc.description",
    "abstract" => "dc.description",
    "created" => "dc.date",
    "valid" => "dc.date",
    "available" => "dc.date",
    "issued" => "dc.date",
    "modified" => "dc.date",
    "dateAccepted" => "dc.date",
    "dateCopyrighted" => "dc.date",
    "dateSubmitted" => "dc.date",
    "extent" => "dc.format",
    "medium" => "dc.format",
    "isVersionOf" => "dc.relation",
    "hasVersion" => "dc.relation",
    "isReplacedBy" => "dc.relation",
    "replaces" => "dc.relation",
    "isRequiredBy" => "dc.relation",
    "requires" => "dc.relation",
    "isPartOf" => "dc.relation",
    "hasPart" => "dc.relation",
    "isReferencedBy" => "dc.relation",
    "references" => "dc.relation",
    "isFormatOf" => "dc.relation",
    "hasFormat" => "dc.relation",
    "conformsTo" => "dc.relation",
    "spatial" => "dc.coverage",
    "temporal" => "dc.coverage",
# these are now top level elements in our qualified dc metadata set
#   "audience" => "dc.any",
#   "accrualMethod" => "dc.any",
#   "accrualPeriodicity" => "dc.any",
#   "accrualPolicy" => "dc.any",
#   "instructionalMethod" => "dc.any",
#   "provenance" => "dc.any",
#   "rightsHolder" => "dc.any",
    "mediator" => "dc.audience",
    "educationLevel" => "dc.audience",
    "accessRights" => "dc.rights",
    "license" => "dc.rights",
    "bibliographicCitation" => "dc.identifier"
    };

# Convert a qualified dc term to its "element^qualifier" form.
#
# $metaname is expected to look like "prefix.name".  If the part after the
# first "." is a key of $qualified_dc_mapping, the mapped element followed by
# "^name" is returned, e.g. "dc.spatial" becomes "dc.coverage^spatial".
# Anything else, including a name with no "." at all, is returned unchanged.
sub remap_dc_metadata
{
    my $self = shift(@_);

    my ($metaname) = @_;

    # only the part after the first "." matters; the prefix is ignored
    my ($name) = ($metaname =~ m/^.*?\.(.*?)$/);

    # $name is undefined when $metaname has no "."; never use it as a key then
    if (defined $name && defined $qualified_dc_mapping->{$name}) {
        return $qualified_dc_mapping->{$name}."^".$name;
    }

    return $metaname; # didn't get a match, return param passed in unchanged
}
468
469
# Pull the metadata elements out of a serialised <metadata> element.
#
#   $textref  - reference to text containing "<metadata>...</metadata>"
#   $metadata - hash ref; each value found is pushed onto the array stored
#               under "ex.<set>.<Name>"
#
# The pretty-print table in $self->{'ppmd_table'} is rebuilt on every call,
# with one row per value found.  If the text has no <metadata> element, or
# that element has no child element, nothing is added and the table is
# left empty.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
    {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($outer_tagname,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # without a wrapper element there is nothing to walk through
        if (defined $outer_tagname) {

            # The part of the wrapper's tag name after the ":" is used as the
            # prefix for children that carry no prefix of their own.
            # Sometimes the dc namespace is not given on each element (like
            # <dc:title>) but only by the wrapper, as in
            # <dc><title></title><creator></creator></dc>; if the wrapper has
            # no ":" either, its whole tag name becomes the prefix.
            my ($top_level_prefix) = ($outer_tagname =~ m/^.*?:(.*?)$/);
            if (!defined $top_level_prefix) {
                $top_level_prefix = $outer_tagname;
            }

            # process each element one by one; /g resumes after the previous
            # match instead of copying the remaining text on every iteration
            while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/sg)
            {
                my $metaname = $1;
                my $metavalue = $2;

                # greenstone uses . for namespace, while oai uses :
                $metaname =~ s/:/\./;
                # if there is no namespace, then we use the outer tag name or
                # namespace for this element
                if ($metaname !~ m/\./)
                {
                    $metaname = "$top_level_prefix.$metaname";
                }

                # if metadata set is auto, leave as is, otherwise convert to
                # specified namespace
                if ($self->{'metadata_set'} ne "auto") {
                    if ($metaname !~ /^gi\./) { # hack to not overwrite gi metadata
                        $metaname =~ s/^([^\.]*)\./$self->{'metadata_set'}\./;
                        if ($self->{'metadata_set'} eq "dc") {
                            # convert qualified dc terms to gs version, e.g.
                            # spatial becomes coverage^spatial
                            $metaname = $self->remap_dc_metadata($metaname);
                        }
                    }
                }

                # uppercase the first char of the name
                $metaname =~ s/\.(.)/\.\u$1/;
                $metavalue =~ s/\[/&#91;/g;
                $metavalue =~ s/\]/&#93;/g;

                # so that GLI can see this metadata, store here as ex.dc.Title etc
                my $ex_metaname = $metaname;
                $ex_metaname =~ s/^ex\.//; # remove any pre-existing ex. prefix
                $ex_metaname = "ex.$ex_metaname"; # at last can prefix ex.

                # push creates the array on first use
                push(@{$metadata->{$ex_metaname}}, $metavalue);

                # but don't add ex to the pretty print line
                $self->add_prettyprint_metadata_line($metaname, $metavalue);
            }
        }
    }

    $self->close_prettyprint_metadata_table();
}
553
## The file extension already identifies the files this plugin handles, so
## the doctype is not inspected: every document is accepted.
sub check_doctype {
    return 1;
}
559
[4726]5601;
Note: See TracBrowser for help on using the browser.