Changeset 19213 for gsdl


Ignore:
Timestamp:
2009-04-23T10:03:26+12:00 (15 years ago)
Author:
kjdon
Message:

Added a metadata_set option. If set to auto, metadata is extracted from the record and left in the namespace it originally had. If set to anything else, all elements are given that namespace. If set to dc, qualified DC terms are additionally mapped to the Greenstone form of qualified DC, e.g. spatial becomes dc.Coverage^spatial.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/OAIPlugin.pm

    r18901 r19213  
    4141}
    4242
     43my $set_list =
     44    [ { 'name' => "auto",
     45    'desc' => "{OAIPlugin.metadata_set.auto}" },
     46      { 'name' => "dc",
     47    'desc' => "{OAIPlugin.metadata_set.dc}" }
     48      ];
    4349
    4450my $arguments =
     
    4854    'reqd' => "no",
    4955    'deft' => &get_default_process_exp() },
     56      { 'name' => "metadata_set",
     57    'desc' => "{OAIPlugin.metadata_set}",
     58    'type' => "enumstring",
     59    'reqd' => "no",
     60    'list' => $set_list,
     61    'deft' => "dc" },
    5062      { 'name' => "document_field",
    5163    'desc' => "{OAIPlugin.document_field}",
     
    369381}
    370382
    371 
    372 sub remap_dcterms_metadata
    373 {
    374     my $self = shift(@_);
    375 
    376     my ($metaname) = @_;
    377 
    378     my $dcterm_mapping = {
    379     "alternative" => "dc.title",
    380     "tableOfContents" => "dc.description",
    381     "abstract" => "dc.description",
    382     "created" => "dc.date",
    383     "valid" => "dc.date",
    384     "available" => "dc.date",
    385     "issued" => "dc.date",
    386     "modified" => "dc.date",
    387     "dateAccepted" => "dc.date",
    388     "dateCopyrighted" => "dc.date",
    389     "dateSubmitted" => "dc.date",
    390     "extent" => "dc.format",
    391     "medium" => "dc.format",
    392     "isVersionOf" => "dc.relation",
    393     "hasVersion" => "dc.relation",
    394     "isReplacedBy" => "dc.relation",
    395     "replaces" => "dc.relation",
    396     "isRequiredBy" => "dc.relation",
    397     "requires" => "dc.relation",
    398     "isPartOf" => "dc.relation",
    399     "hasPart" => "dc.relation",
    400     "isReferencedBy" => "dc.relation",
    401     "references" => "dc.relation",
    402     "isFormatOf" => "dc.relation",
    403     "hasFormat" => "dc.relation",
    404     "conformsTo" => "dc.relation",
    405     "spatial" => "dc.coverage",
    406     "temporal" => "dc.coverage",
    407 # these are top level elements in our qualified dc metadata set
     383my $qualified_dc_mapping = {
     384    "alternative" => "dc.title",
     385    "tableOfContents" => "dc.description",
     386    "abstract" => "dc.description",
     387    "created" => "dc.date",
     388    "valid" => "dc.date",
     389    "available" => "dc.date",
     390    "issued" => "dc.date",
     391    "modified" => "dc.date",
     392    "dateAccepted" => "dc.date",
     393    "dateCopyrighted" => "dc.date",
     394    "dateSubmitted" => "dc.date",
     395    "extent" => "dc.format",
     396    "medium" => "dc.format",
     397    "isVersionOf" => "dc.relation",
     398    "hasVersion" => "dc.relation",
     399    "isReplacedBy" => "dc.relation",
     400    "replaces" => "dc.relation",
     401    "isRequiredBy" => "dc.relation",
     402    "requires" => "dc.relation",
     403    "isPartOf" => "dc.relation",
     404    "hasPart" => "dc.relation",
     405    "isReferencedBy" => "dc.relation",
     406    "references" => "dc.relation",
     407    "isFormatOf" => "dc.relation",
     408    "hasFormat" => "dc.relation",
     409    "conformsTo" => "dc.relation",
     410    "spatial" => "dc.coverage",
     411    "temporal" => "dc.coverage",
     412# these are now top level elements in our qualified dc metadata set
    408413#   "audience" => "dc.any",
    409414#   "accrualMethod" => "dc.any",
     
    413418#   "provenance" => "dc.any",
    414419#   "rightsHolder" => "dc.any",
    415     "mediator" => "dc.audience",
    416     "educationLevel" => "dc.audience",
    417     "accessRights" => "dc.rights",
    418     "license" => "dc.rights",
    419     "bibliographicCitation" => "dc.identifier"
    420     };
     420    "mediator" => "dc.audience",
     421    "educationLevel" => "dc.audience",
     422    "accessRights" => "dc.rights",
     423    "license" => "dc.rights",
     424    "bibliographicCitation" => "dc.identifier"
     425    };
     426
     427sub remap_dc_metadata
     428{
     429    my $self = shift(@_);
     430
     431    my ($metaname) = @_;
    421432
    422433    my ($prefix,$name) = ($metaname =~ m/^(.*?)\.(.*?)$/);
    423434   
    424     if ($prefix eq "dcterms" || $prefix eq "dc")
    425     {
    426     if (defined $dcterm_mapping->{$name})
    427     {
    428         return $dcterm_mapping->{$name}."^".$name;
    429     }
    430 
    431     }
     435    if (defined $qualified_dc_mapping->{$name}) {
     436   
     437    return $qualified_dc_mapping->{$name}."^".$name;
     438    }
     439   
     440   
    432441    return $metaname; # didn't get a match, return param passed in unchanged
    433442}
     
    439448    my $outhandle = $self->{'outhandle'};
    440449
    441     # Only handles DC metadata
    442 
    443450    $self->open_prettyprint_metadata_table();
    444451
     
    448455
    449456    # locate and remove outermost tag (ignoring any attribute information in top-level tag)
    450     my ($wrapper_metadata_xml,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);
     457    my ($outer_tagname,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);
    451458    # split tag into namespace and tag name
    452     my($namespace,$top_level_prefix) = ($wrapper_metadata_xml =~ m/^(.*?):(.*?)$/);
     459    my($namespace,$top_level_prefix) = ($outer_tagname =~ m/^(.*?):(.*?)$/);
    453460    # sometimes, the dc namespace is not specified as the prefix in each element (like <dc:title>)
    454461    # but is rather defined in the wrapper element containing the various dc meta elements,
    455462    # like <dc><title></title><creator></creator></dc>.
    456463    # In such a case, we use this wrapper element as the top_level_prefix
    457     if(!defined $top_level_prefix && defined $wrapper_metadata_xml && $wrapper_metadata_xml =~ m/dc$/) {
    458         $top_level_prefix = $wrapper_metadata_xml;
     464   
     465    # if there was no prefix, then the tag itself becomes the top_level_prefix
     466    if(!defined $top_level_prefix && defined $outer_tagname) {
     467        $top_level_prefix = $outer_tagname;
    459468    }
    460469
    461     if ($top_level_prefix !~ m/dc$/) {
    462         print $outhandle "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n";
    463         print $outhandle "         This recorded metadata section '$top_level_prefix' does not appear to match.\n";
    464         print $outhandle "         Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n";
    465         print $outhandle "         into Greenstone metadata as prefix.tag = value\n";
    466     }
    467 
     470    #process each element one by one
    468471    while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)
    469472    {
     
    473476        $inner_metadata_text = $3;
    474477
    475         # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip off optional prefix and uppercase first letter
     478        # greenstone uses . for namespace, while oai uses :
    476479        $metaname =~ s/:/\./;
     480        # if there is no namespace, then we use the outer tag name or
     481        # namespace for this element
    477482        if ($metaname !~ m/\./)
    478483        {
    479484        $metaname = "$top_level_prefix.$metaname";
    480485        }
    481 
    482         $metaname = $self->remap_dcterms_metadata($metaname);
     486       
     487        # if metadata set is auto, leave as is, otherwise convert to
     488        # specified namespace
     489        if ($self->{'metadata_set'} ne "auto") {
     490        $metaname =~ s/^([^\.]*)\./$self->{'metadata_set'}\./;
     491        if ($self->{'metadata_set'} eq "dc") {
     492            # convert qualified dc terms to gs version, e.g.
     493            # spatial becomes coverage^spatial
     494            $metaname = $self->remap_dc_metadata($metaname);
     495        }
     496        }
    483497
    484498        # uppercase the first char of the name
Note: See TracChangeset for help on using the changeset viewer.