Changeset 19213

Show
Ignore:
Timestamp:
23.04.2009 10:03:26 (10 years ago)
Author:
kjdon
Message:

Added metadata_set option. If set to auto, then metadata will be extracted from the record and left with the namespace it had originally. If set to something else, then all elements will get that namespace. If set to dc, it will additionally do a mapping to the Greenstone form of qualified dc, e.g. spatial becomes dc.Coverage^spatial.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r18901 r19213  
    4141} 
    4242 
     43my $set_list = 
     44    [ { 'name' => "auto", 
     45    'desc' => "{OAIPlugin.metadata_set.auto}" }, 
     46      { 'name' => "dc", 
     47    'desc' => "{OAIPlugin.metadata_set.dc}" }  
     48      ]; 
    4349 
    4450my $arguments =  
     
    4854    'reqd' => "no", 
    4955    'deft' => &get_default_process_exp() }, 
     56      { 'name' => "metadata_set", 
     57    'desc' => "{OAIPlugin.metadata_set}", 
     58    'type' => "enumstring", 
     59    'reqd' => "no", 
     60    'list' => $set_list, 
     61    'deft' => "dc" }, 
    5062      { 'name' => "document_field", 
    5163    'desc' => "{OAIPlugin.document_field}", 
     
    369381} 
    370382 
    371  
    372 sub remap_dcterms_metadata 
    373 { 
    374     my $self = shift(@_); 
    375  
    376     my ($metaname) = @_; 
    377  
    378     my $dcterm_mapping = { 
    379     "alternative" => "dc.title", 
    380     "tableOfContents" => "dc.description", 
    381     "abstract" => "dc.description", 
    382     "created" => "dc.date", 
    383     "valid" => "dc.date", 
    384     "available" => "dc.date", 
    385     "issued" => "dc.date", 
    386     "modified" => "dc.date", 
    387     "dateAccepted" => "dc.date", 
    388     "dateCopyrighted" => "dc.date", 
    389     "dateSubmitted" => "dc.date", 
    390     "extent" => "dc.format", 
    391     "medium" => "dc.format", 
    392     "isVersionOf" => "dc.relation", 
    393     "hasVersion" => "dc.relation", 
    394     "isReplacedBy" => "dc.relation", 
    395     "replaces" => "dc.relation", 
    396     "isRequiredBy" => "dc.relation", 
    397     "requires" => "dc.relation", 
    398     "isPartOf" => "dc.relation", 
    399     "hasPart" => "dc.relation", 
    400     "isReferencedBy" => "dc.relation", 
    401     "references" => "dc.relation", 
    402     "isFormatOf" => "dc.relation", 
    403     "hasFormat" => "dc.relation", 
    404     "conformsTo" => "dc.relation", 
    405     "spatial" => "dc.coverage", 
    406     "temporal" => "dc.coverage", 
    407 # these are top level elements in our qualified dc metadata set 
     383my $qualified_dc_mapping = { 
     384    "alternative" => "dc.title", 
     385    "tableOfContents" => "dc.description", 
     386    "abstract" => "dc.description", 
     387    "created" => "dc.date", 
     388    "valid" => "dc.date", 
     389    "available" => "dc.date", 
     390    "issued" => "dc.date", 
     391    "modified" => "dc.date", 
     392    "dateAccepted" => "dc.date", 
     393    "dateCopyrighted" => "dc.date", 
     394    "dateSubmitted" => "dc.date", 
     395    "extent" => "dc.format", 
     396    "medium" => "dc.format", 
     397    "isVersionOf" => "dc.relation", 
     398    "hasVersion" => "dc.relation", 
     399    "isReplacedBy" => "dc.relation", 
     400    "replaces" => "dc.relation", 
     401    "isRequiredBy" => "dc.relation", 
     402    "requires" => "dc.relation", 
     403    "isPartOf" => "dc.relation", 
     404    "hasPart" => "dc.relation", 
     405    "isReferencedBy" => "dc.relation", 
     406    "references" => "dc.relation", 
     407    "isFormatOf" => "dc.relation", 
     408    "hasFormat" => "dc.relation", 
     409    "conformsTo" => "dc.relation", 
     410    "spatial" => "dc.coverage", 
     411    "temporal" => "dc.coverage", 
     412# these are now top level elements in our qualified dc metadata set 
    408413#   "audience" => "dc.any", 
    409414#   "accrualMethod" => "dc.any", 
     
    413418#   "provenance" => "dc.any", 
    414419#   "rightsHolder" => "dc.any", 
    415     "mediator" => "dc.audience", 
    416     "educationLevel" => "dc.audience", 
    417     "accessRights" => "dc.rights", 
    418     "license" => "dc.rights", 
    419     "bibliographicCitation" => "dc.identifier" 
    420     }; 
     420    "mediator" => "dc.audience", 
     421    "educationLevel" => "dc.audience", 
     422    "accessRights" => "dc.rights", 
     423    "license" => "dc.rights", 
     424    "bibliographicCitation" => "dc.identifier" 
     425    }; 
     426 
     427sub remap_dc_metadata 
     428{ 
     429    my $self = shift(@_); 
     430 
     431    my ($metaname) = @_; 
    421432 
    422433    my ($prefix,$name) = ($metaname =~ m/^(.*?)\.(.*?)$/); 
    423434     
    424     if ($prefix eq "dcterms" || $prefix eq "dc") 
    425     { 
    426     if (defined $dcterm_mapping->{$name})  
    427     { 
    428         return $dcterm_mapping->{$name}."^".$name; 
    429     } 
    430  
    431     } 
     435    if (defined $qualified_dc_mapping->{$name}) { 
     436     
     437    return $qualified_dc_mapping->{$name}."^".$name; 
     438    } 
     439     
     440     
    432441    return $metaname; # didn't get a match, return param passed in unchanged 
    433442} 
     
    439448    my $outhandle = $self->{'outhandle'}; 
    440449 
    441     # Only handles DC metadata 
    442  
    443450    $self->open_prettyprint_metadata_table(); 
    444451 
     
    448455 
    449456    # locate and remove outermost tag (ignoring any attribute information in top-level tag) 
    450     my ($wrapper_metadata_xml,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s); 
     457    my ($outer_tagname,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s); 
    451458    # split tag into namespace and tag name 
    452     my($namespace,$top_level_prefix) = ($wrapper_metadata_xml =~ m/^(.*?):(.*?)$/); 
     459    my($namespace,$top_level_prefix) = ($outer_tagname =~ m/^(.*?):(.*?)$/); 
    453460    # sometimes, the dc namespace is not specified as the prefix in each element (like <dc:title>) 
    454461    # but is rather defined in the wrapper element containing the various dc meta elements, 
    455462    # like <dc><title></title><creator></creator></dc>. 
    456463    # In such a case, we use this wrapper element as the top_level_prefix 
    457     if(!defined $top_level_prefix && defined $wrapper_metadata_xml && $wrapper_metadata_xml =~ m/dc$/) { 
    458         $top_level_prefix = $wrapper_metadata_xml; 
     464     
     465    # if there was no prefix, then the tag itself becomes the top_level_prefix 
     466    if(!defined $top_level_prefix && defined $outer_tagname) { 
     467        $top_level_prefix = $outer_tagname; 
    459468    } 
    460469 
    461     if ($top_level_prefix !~ m/dc$/) { 
    462         print $outhandle "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n"; 
    463         print $outhandle "         This recorded metadata section '$top_level_prefix' does not appear to match.\n"; 
    464         print $outhandle "         Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n"; 
    465         print $outhandle "         into Greenstone metadata as prefix.tag = value\n"; 
    466     } 
    467  
     470    #process each element one by one 
    468471    while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s) 
    469472    { 
     
    473476        $inner_metadata_text = $3; 
    474477 
    475         # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip off optional prefix and uppercase first letter 
     478        # greenstone uses . for namespace, while oai uses : 
    476479        $metaname =~ s/:/\./; 
     480        # if there is no namespace, then we use the outer tag name or  
     481        # namespace for this element 
    477482        if ($metaname !~ m/\./) 
    478483        { 
    479484        $metaname = "$top_level_prefix.$metaname"; 
    480485        } 
    481  
    482         $metaname = $self->remap_dcterms_metadata($metaname); 
     486         
     487        # if metadata set is auto, leave as is, otherwise convert to  
     488        # specified namespace 
     489        if ($self->{'metadata_set'} ne "auto") { 
     490        $metaname =~ s/^([^\.]*)\./$self->{'metadata_set'}\./; 
     491        if ($self->{'metadata_set'} eq "dc") { 
     492            # convert qualified dc terms to gs version, e.g. 
     493            # spatial becomes coverage^spatial 
     494            $metaname = $self->remap_dc_metadata($metaname); 
     495        } 
     496        } 
    483497 
    484498        # uppercase the first char of the name