root/gsdl/trunk/perllib/plugins/OAIPlugin.pm @ 17216

Revision 17216, 17.2 KB (checked in by kjdon, 11 years ago)

trying to get OAI files exploding. Have copied in some code from one of David's obsolete files. I think it works but haven't tested fully yet. Wanted to get the code committed though.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37use metadatautil;
38
# Set up inheritance at compile time.  ReadXMLFile comes first in the
# method resolution order; ReadTextFile is included for its
# textcat_get_language_encoding subroutine.
BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
43
# Argument descriptions contributed by this plugin; new() appends them to
# the caller's "ArgList".  The only argument is process_exp, whose default
# comes from get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => "OAIPlugin",
    'desc'     => "{OAIPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
58
59
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # The ReadTextFile constructor is run only for its effect on the shared
    # lists; its return value is discarded.
    # NOTE(review): the trailing 1 presumably marks it as an auxiliary
    # (non-primary) parent -- confirm against ReadTextFile::new.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
73
# Default regular expression (as a string) selecting the files this plugin
# handles: any name ending in ".oai", matched case-insensitively.
# Usable both as a method and as a plain function; no arguments are read.
sub get_default_process_exp {
    return '(?i)(\.oai)$';
}
79
# Name of the XML document type this plugin is written for.
sub get_doctype {
    return "OAI-PMH";
}
85
# Start-of-document callback: reset the per-document parse state.
#   in_metadata_node - flag, true while inside a <metadata> element
#   rawxml           - running reconstruction of the document text
sub xml_start_document {
    my ($self) = @_;

    $self->{'in_metadata_node'} = 0;
    $self->{'rawxml'}           = "";
}
91
# End-of-document callback: intentionally does nothing.
sub xml_end_document {
    return;
}
94
# DOCTYPE callback: announce which file is being processed.
# The doctype details ($name, $sysid, $pubid, $internal) are accepted but
# not examined -- the check on $name is deliberately disabled.
# Writes a progress line to the plugin's outhandle when verbosity > 1, and
# a <Processing> marker to STDERR when the 'gli' flag is set.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    my $current_file = $self->{'file'};

    if ($self->{'verbosity'} > 1) {
        my $out = $self->{'outhandle'};
        print {$out} "OAIPlugin: processing $current_file\n";
    }

    print STDERR "<Processing n='$current_file' p='OAIPlugin'>\n" if $self->{'gli'};
}
107
108
# Start-tag callback: append a textual copy of the opening tag to 'rawxml'
# and, while inside a <metadata> element, to 'metadata_xml' as well.
# Seeing <metadata> itself switches on 'in_metadata_node' and starts a
# fresh 'metadata_xml' buffer (which then receives the <metadata> tag).
#
# The element's attributes are read from the global hash %_.
# NOTE(review): presumably filled in by the XML parser before this callback
# runs -- confirm in ReadXMLFile.
# NOTE(review): attribute values are written back unquoted and unescaped,
# so the reconstructed text is not guaranteed to be well-formed XML.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    my %attributes = %_;

    my $attr_text = "";
    foreach my $attr_name (keys %attributes) {
        $attr_text .= " $attr_name=$attributes{$attr_name}";
    }

    my $open_tag = "<$element$attr_text>";
    $self->{'rawxml'} .= $open_tag;

    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'}     = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $open_tag;
    }
}
129
# End-tag callback: append the closing tag to 'rawxml' and, while inside a
# <metadata> element, to 'metadata_xml'.  When </metadata> itself closes,
# the collected 'metadata_xml' is handed to extract_oai_metadata together
# with $self->{'metadata'}, and the in-metadata flag is cleared.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";

    $self->{'rawxml'} .= $close_tag;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $close_tag;
    }

    if ($element eq "metadata") {
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'metadata'});

        $self->{'in_metadata_node'} = 0;
    }
}
150
# Character-data callback: append the current text chunk to 'rawxml' and,
# while inside a <metadata> element, to 'metadata_xml'.
# The text itself arrives in the global $_.
# NOTE(review): presumably set by the XML parser before this callback
# runs -- confirm in ReadXMLFile.
sub xml_text {
    my ($self, $expat) = @_;

    # Copy $_ immediately so nothing below can disturb it.
    my $chunk = $_;

    $self->{'rawxml'} .= $chunk;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $chunk;
    }
}
161
162
# First pass over an OAI record file: parse it for metadata and note
# whether one of its dc.Identifier values names a local source document.
#
# Parameters (positional): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli.
# $extrametakeys and $extrametadata are accepted but not used here.
#
# Returns:
#   undef - the file is not one this plugin handles, or the parent's read
#           reported failure
#   1     - the file was parsed and the result recorded in
#           $self->{'oai-files'}->{$file}->{'srcdoc_exists'} for read()
#
# The success path previously fell off the end of the sub, so the value
# returned depended on which branch ran last (undef when a source document
# existed, 0 otherwise).  It now returns 1 explicitly.
# NOTE(review): assumes callers only distinguish undef ("not recognised")
# from a defined value -- confirm against the plugin dispatcher.
sub metadata_read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    my $total_count = 0; # is total count used?

    # Calling SUPER::read at this point fills in the $metadata structure.
    # Later, OAIPlugin::read decides whether that metadata sticks to an
    # accompanying file or needs a new doc object holding only metadata.
    unless ($self->SUPER::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli)) {
        return undef;
    }

    $self->{'metadata'} = undef;

    #my $url_array = $metadata->{'gi.Sourcedoc'};
    my $url_array   = $metadata->{'dc.Identifier'};
    my @identifiers = (defined $url_array) ? @$url_array : ();

    my $filename_dir = util::filename_head($filename_full_path);

    # Look for the first identifier that is not a remote URL and that names
    # an existing file relative to the OAI file's directory.
    my $srcdoc_exists = 0;
    foreach my $identifier (@identifiers) {
        next if $identifier =~ m/^(https?|ftp):/;

        my $src_filename = util::filename_cat($filename_dir, $identifier);
        if (-e $src_filename) {
            $srcdoc_exists = 1;
            last;
        }
    }

    $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = $srcdoc_exists;

    if ($srcdoc_exists) {
        # Make pretty print metadata table stick with src filename
        $metadata->{'prettymd'} = [ $self->{'ppmd_table'} ];
        $self->{'ppmd_table'} = undef;
    }

    return 1;
}
227
228
# Second pass over an OAI record file previously seen by metadata_read.
#
# Parameters (positional): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli.
#
# Returns:
#   undef - $file was not recorded by metadata_read
#   0     - a local source document exists for this record, so no document
#           is created here (now returned explicitly; the sub previously
#           fell off the end and yielded an implicit false value)
#   -1    - process() failed
#   1     - a metadata-only document was built and passed to $processor
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    return undef unless defined $self->{'oai-files'}->{$file};

    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};

    # no more need to access details of this $file => tidy up as you go
    delete $self->{'oai-files'}->{$file};

    # Nothing to build when the metadata belongs to an accompanying file.
    return 0 if $srcdoc_exists;

    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    # create a new document
    my $doc_obj     = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    # ($gli is deliberately not forwarded, exactly as before)
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # NOTE(review): nothing visible in this file assigns $self->{'prettymd'};
    # the pretty-print table is built in $self->{'ppmd_table'} -- confirm
    # which key is intended.
    my $prettymds = $self->{'prettymd'};
    foreach my $prettymd (@$prettymds) {
        $doc_obj->add_utf8_metadata($top_section, "prettymd", $prettymd);
    }
    $self->{'prettymd'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
300
301
# Earlier single-pass implementation of read(), kept for reference.
#
# Parameters (positional): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli.
#
# Returns:
#   0     - $file is a directory whose name ends in "srcdocs" (blocked)
#   undef - the parent's read reported failure
#   the result of plugin::read - when a dc.Identifier names an existing
#           local file, which is then read with this record's metadata
#   -1    - process() failed
#   1     - a metadata-only document was built and passed to $processor
#
# Fix: the identifier that was actually found on disk is now the one passed
# on (previously element 0 was always used, whatever the search located).
# The search stops at the first hit and also treats "https:" as a remote
# URL, matching metadata_read.
sub read_old {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    # block the srcdocs dir - we will process files in them when we find an OAI record for them
    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    return undef unless $self->SUPER::read(@_);

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    my $url_array = $metadata->{'dc.Identifier'};
    my $num_urls  = (defined $url_array) ? scalar(@$url_array) : 0;

    my $srcdoc_exists = 0;
    my $srcdoc_pos    = 0;
    my $filename_dir  = util::filename_head($filename);

    for (my $i = 0; $i < $num_urls; $i++) {
        next if $url_array->[$i] =~ m/^(https?|ftp):/;

        my $src_filename = util::filename_cat($filename_dir, $url_array->[$i]);
        if (-e $src_filename) {
            $srcdoc_pos    = $i;
            $srcdoc_exists = 1;
            last;
        }
    }

    if ($srcdoc_exists) {
        my $srcdoc = $url_array->[$srcdoc_pos];

        print {$outhandle} "OAIPlugin: passing metadata on to $srcdoc\n"
            if ($self->{'verbosity'} > 1);

        # Make pretty print metadata table stick with src filename
        $metadata->{'prettymd'} = [ $self->{'ppmd_table'} ];
        $self->{'ppmd_table'} = undef;

        return plugin::read($pluginfo, $filename_dir, $srcdoc,
                            $block_hash, $metadata, $processor, $maxdocs,
                            $total_count, $gli);
    }

    # create a new document
    my $doc_obj     = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    my $ppmd_table = $self->{'ppmd_table'};
    $doc_obj->add_utf8_metadata($top_section, "prettymd", $ppmd_table);
    $self->{'ppmd_table'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
397
398
# Plugin specific processing of doc_obj: store the record's XML as the
# text of the document's top section.
#
#   $textref - reference to the text; it is escaped IN PLACE, so the
#              caller's string is modified
#   $file    - used only in progress messages
#   $doc_obj - receives the escaped text via add_utf8_text
#   $gli     - when true, a <Processing> marker is written to STDERR
# $pluginfo, $base_dir and $metadata are accepted but not used.
# Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if ($gli);

    if ($self->{'verbosity'} > 1) {
        my $out = $self->{'outhandle'};
        print {$out} "OAIPlugin: processing $file\n";
    }

    my $top_section = $doc_obj->get_top_section();

    # Escape angle and square brackets.  None of the replacement strings
    # contains a character from the set being replaced, so a single pass
    # gives the same result as four separate substitutions.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$entity_for{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($top_section, $$textref);

    return 1;
}
427
428
429# Improvement is to merge this with newer version in MetadataPass
430
# Start a new pretty-print HTML table in $self->{'ppmd_table'}, replacing
# whatever was there before.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attrs = (
        "width=100% cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attrs) . ">";
}
440
# Append one name/value row to the pretty-print table held in
# $self->{'ppmd_table'}.  The value is first passed through
# util::hyperlink_text; the name is inserted as given.
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

###    $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
    $metavalue_utf8 = util::hyperlink_text($metavalue_utf8);

    my $row = join("",
        "  <tr bgcolor=#b5d3cd>\n",
        "    <td width=30%>\n",
        "      $metaname\n",
        "    </td>\n",
        "    <td>\n",
        "      $metavalue_utf8\n",
        "    </td>\n",
        "  </tr>\n",
    );

    $self->{'ppmd_table'} .= $row;
}
459
# Finish the pretty-print table by appending its closing tag to
# $self->{'ppmd_table'}.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    $self->{'ppmd_table'} .= "</table>\n";
}
466
467
# Translate a "dcterms.<term>" metadata name into "<element>^<term>",
# where <element> is looked up in the table below.
#
#   $metaname - a name of the form "prefix.name"
# Returns the remapped name when the prefix is exactly "dcterms" and the
# term is in the table; otherwise returns $metaname unchanged.
# Term lookup is case-sensitive.
sub remap_dcterms_metadata
{
    my ($self, $metaname) = @_;

    # Target element => the dcterms terms that map onto it.
    my %terms_for = (
        "dc.title"       => [qw(alternative)],
        "dc.description" => [qw(tableOfContents abstract)],
        "dc.date"        => [qw(created valid available issued modified
                                dateAccepted dateCopyrighted dateSubmitted)],
        "dc.format"      => [qw(extent medium)],
        "dc.relation"    => [qw(isVersionOf hasVersion isReplacedBy replaces
                                isRequiredBy requires isPartOf hasPart
                                isReferencedBy references isFormatOf hasFormat
                                conformsTo)],
        "dc.coverage"    => [qw(spatial temporal)],
        "dc.any"         => [qw(audience accrualMethod accrualPeriodicity
                                accrualPolicy instructionalMethod provenance
                                rightsHolder)],
        "audience"       => [qw(mediator educationLevel)],
        "dc.rights"      => [qw(accessRights license)],
        "dc.identifier"  => [qw(bibliographicCitation)],
    );

    # Invert into term => element.  Every term appears exactly once above,
    # so the iteration order does not matter.
    my %element_for;
    foreach my $element (keys %terms_for) {
        foreach my $term (@{ $terms_for{$element} }) {
            $element_for{$term} = $element;
        }
    }

    my ($prefix, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    if ($prefix eq "dcterms") {
        my $element = $element_for{$name};
        return $element . "^" . $name if defined $element;
    }

    return $metaname; # didn't get a match, return param passed in unchanged
}
529
530
# Pull name/value pairs out of a <metadata>...</metadata> fragment.
#
#   $textref  - reference to text containing the <metadata> element
#   $metadata - hash ref; each value found is pushed onto the array stored
#               under its metadata name
#
# Only one level of <tag>value</tag> children inside the wrapper element
# is examined.  For each child:
#   - the first ":" in the tag becomes "."
#   - an unprefixed tag gets the wrapper's prefix
#   - "dcterms.<term>" names are remapped via remap_dcterms_metadata
#   - the first character after the first "." is upper-cased
#   - "[" and "]" in the value are replaced by numeric character entities
# A row is also added to the pretty-print table for every pair.  The table
# is opened and closed even when no <metadata> element is found.
#
# Fixes relative to the earlier version:
#   - the dcterms remap now runs BEFORE the name is capitalised.  The
#     lookup table holds lower-case-initial terms, so running it afterwards
#     could never match.
#     NOTE(review): this changes the names produced for dcterms elements
#     (e.g. "dcterms:abstract" now yields "dc.Description^abstract") --
#     confirm that this is the intended Greenstone naming.
#   - a missing wrapper element or prefix no longer produces operations on
#     undefined values; they are treated as empty strings.
#   - children are scanned with a /g match instead of re-copying the
#     remaining text after every match.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;
    my $outhandle = $self->{'outhandle'};

    # Only handles DC metadata

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
    {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($wrapper_metadata_xml, $inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);
        $wrapper_metadata_xml = "" unless defined $wrapper_metadata_xml;
        $inner_metadata_text  = "" unless defined $inner_metadata_text;

        # split tag into namespace and tag name
        my ($namespace, $top_level_prefix) = ($wrapper_metadata_xml =~ m/^(.*?):(.*?)$/);

        # sometimes, the dc namespace is not specified as the prefix in each element (like <dc:title>)
        # but is rather defined in the wrapper element containing the various dc meta elements,
        # like <dc><title></title><creator></creator></dc>.
        # In such a case, we use this wrapper element as the top_level_prefix
        if (!defined $top_level_prefix && $wrapper_metadata_xml =~ m/dc$/) {
            $top_level_prefix = $wrapper_metadata_xml;
        }
        $top_level_prefix = "" unless defined $top_level_prefix;

        if ($top_level_prefix !~ m/dc$/) {
            print {$outhandle} "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n";
            print {$outhandle} "         This recorded metadata section '$top_level_prefix' does not appear to match.\n";
            print {$outhandle} "         Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n";
            print {$outhandle} "         into Greenstone metadata as prefix.tag = value\n";
        }

        while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/sg)
        {
            # Copy the captures at once; the calls below run their own matches.
            my ($metaname, $metavalue) = ($1, $2);

            $metaname =~ s/:/\./;
            if ($metaname !~ m/\./)
            {
                $metaname = "$top_level_prefix.$metaname";
            }

            # Remap while the term still has its original capitalisation,
            # then upper-case the first character after the first dot.
            $metaname = $self->remap_dcterms_metadata($metaname);
            $metaname =~ s/\.(.)/\.\u$1/;

            $metavalue =~ s/\[/&#91;/g;
            $metavalue =~ s/\]/&#93;/g;

            push(@{ $metadata->{$metaname} }, $metavalue);

            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}
615
## Files are selected by their extension, so the doctype does not need to
## be checked: every document is accepted.
sub check_doctype {
    return 1;
}
621
6221;
Note: See TracBrowser for help on using the browser.