Changeset 21716 for main


Ignore:
Timestamp:
2010-02-05T15:30:50+13:00 (14 years ago)
Author:
mdewsnip
Message:

Improvements to metadataaction.pm made by Jeffrey Ke and Shane Taylor at DL Consulting Ltd:

  1. Added new insert-metadata action - basically combined the set_import_metadata and set_metadata functions
  2. Fixed the XML::Rules handlers in the set_archive_metadata and set_import_metadata functions - a "raw" rule should return a reference to the [tagname => attrs] pair, not the pair itself.

'raw' = the [tagname => attrs] is pushed to the parent tag's _content.

You would use this style if you wanted to be able to print
the parent tag as XML preserving the whitespace or other textual content
sub { [$_[0] => $_[1]]}

  3. Added a FileSet check in set_import_metadata - the system now checks that it is in the right FileSet before performing the override/accumulate action.
  4. Added a hack to set_import_metadata - after metadata.xml is filtered by XML::Rules, it loses its DTD and encoding information. I am not sure whether there is a flag to keep them or whether this is just a bug. I have hard-coded it to replace the generated header: [<?xml version="1.0"?>] with [<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">]
File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cgiactions/metadataaction.pm

    r21715 r21716  
    7272
    7373    "remove-metadata"       => { 'compulsory-args' => [ "d", "metaname" ],
    74                      'optional-args'   => [ "metapos" ] }
     74                     'optional-args'   => [ "metapos" ] },
     75
     76    "insert-metadata"       => { 'compulsory-args' => [ "d", "metaname", "metavalue" ],
     77                     'optional-args'   => [ ]
     78                   }
    7579};
    7680
     
    383387    }
    384388
    385     # raw extended
    386     # Someone please write some comments on why adding ':'.$tagname => $attrHash
    387     return (':'.$tagname => $attrHash, [$tagname => $attrHash]);
     389    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
     390    return [$tagname => $attrHash];
    388391}
    389392
     
    412415    }
    413416
    414     # raw
    415     return $tagname => $attrHash;
     417
     418    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
     419    return [$tagname => $attrHash];
    416420}
    417421
     
    461465    # Set the call back functions
    462466    my @rules =
    463     ( _default => 'raw extended',
     467    ( _default => 'raw',
    464468      'Metadata' => \&dxml_metadata,
    465469      'Description' => \&dxml_description );
     
    530534    my $metamode = $parser->{'parameters'}->{'metamode'};
    531535
     536    # Report error if we don't see FileName tag before this
     537    die "Fatel Error: Unexpected metadata.xml structure. Undefind current_file, possiblely encountered Description before FileName" if (!defined($parser->{'parameters'}->{'current_file'}));
     538   
     539    # Don't do anything if we are not in the right FileSet
     540    my $file_regexp = $parser->{'parameters'}->{'current_file'};
     541    return [$tagname => $attrHash] if (!($parser->{'parameters'}->{'src_file'} =~ /$file_regexp/));
     542   
    532543    # Find the right metadata tag and checks if we are going to override it
    533544    # Note: This over writes the first metadata block it encountered even if it doesn't belong to the source file we specified
     
    542553    }
    543554
    544     # raw extended
    545     # Someone please write some comments on why adding ':'.$tagname => $attrHash
    546     return (':'.$tagname => $attrHash, [$tagname => $attrHash]);
     555    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
     556    return [$tagname => $attrHash];
    547557}
    548558
     
    551561{
    552562    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
    553     my $metamode = $parser->{'parameters'}->{'metamode'};
     563    my $metamode = $parser->{'parameters'}->{'metamode'};   
     564
     565    # Failed... Report error if we don't see FileName tag before this
     566    die "Fatel Error: Unexpected metadata.xml structure. Undefind current_file, possiblely encountered Description before FileName" if (!defined($parser->{'parameters'}->{'current_file'}));
     567
     568    # Don't do anything if we are not in the right FileSet
     569    my $file_regexp = $parser->{'parameters'}->{'current_file'};
     570    return [$tagname => $attrHash] if (!($parser->{'parameters'}->{'src_file'} =~ /$file_regexp/));
    554571
    555572    # Accumulate the metadata block to the end of the description block
     
    571588    }
    572589
    573     # raw
    574     return $tagname => $attrHash;
     590    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
     591    return [$tagname => $attrHash];
     592}
     593
     594
     595sub mxml_filename
     596{
     597    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
     598
     599    # Store the filename of the Current Fileset
     600    # Note: According to http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd
     601    # FileName tag must come before Description tag
     602    $parser->{'parameters'}->{'current_file'} = $attrHash->{'_content'};
     603
     604    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
     605    return [$tagname => $attrHash];
     606}
     607
     608
     609sub mxml_fileset
     610{
     611    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
     612
     613    # Initilise the current_file
     614    # Note: According to http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd
     615    # FileName tag must come before Description tag
     616    $parser->{'parameters'}->{'current_file'} = "";
     617
     618    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
     619    return [$tagname => $attrHash];
    575620}
    576621
     
    579624{
    580625    my $self = shift @_;
    581     my ($gsdl_cgi, $metadata_xml_filename, $metaname, $metavalue, $metamode) = @_;
     626    my ($gsdl_cgi, $metadata_xml_filename, $metaname, $metavalue, $metamode, $src_file) = @_;
    582627
    583628    # Set the call-back functions for the metadata tags
    584629    my @rules =
    585     ( _default => 'raw extended',
     630    ( _default => 'raw',
     631          'FileName' => \&mxml_filename,
    586632      'Metadata' => \&mxml_metadata,
    587       'Description' => \&mxml_description );
     633      'Description' => \&mxml_description,
     634          'FileSet' => \&mxml_fileset);
    588635
    589636    # use XML::Rules to add it in (read in and out again)
    590637    my $parser = XML::Rules->new(rules => \@rules,
    591                  style => 'filter' );
     638                 style => 'filter',
     639                                 output_encoding => 'utf8');
    592640
    593641    my $xml_in = "";
     
    607655    $parser->filter($xml_in,\$xml_out, { metaname => $metaname,
    608656                         metavalue => $metavalue,
    609                                              metamode => $metamode } );
    610        
     657                                             metamode => $metamode,
     658                         src_file => $src_file,
     659                         current_file => undef} );
     660
    611661    if (!open(MOUT,">$metadata_xml_filename")) {
    612662        $gsdl_cgi->generate_error("Unable to write out to $metadata_xml_filename: $!");
    613663    }
    614664    else {
     665            # Some wise person please find out how to keep the DTD and encode lines in after it gets filtered by this XML::Rules
     666            # At the moment, I will just hack it!
     667            my $header_with_utf8_dtd = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">";
     668            $xml_out =~ s/\<\?xml\sversion\=\"1.0\"\?\>/$header_with_utf8_dtd/;
    615669        print MOUT $xml_out;
    616670        close(MOUT);       
     
    641695    # Make sure the collection isn't locked by someone else
    642696    $self->lock_collection($username, $collect);
    643 
     697   
    644698    # look up additional args
    645699    # want either d= or f=
     
    675729        $import_filename = &util::filename_cat($collect_dir,$collect,$import_file);
    676730    }
    677    
    678    
     731       
    679732    # figure out correct metadata.xml file [?]
    680733    # Assuming the metadata.xml file is next to the source file
     
    685738
    686739    # Edit the metadata.xml
    687     # Note: At moment it doesn't correctly on metadata.xml with multiple FileSets
    688     # "accumulate" mode will add new metadata block to all FileSets
    689     # "override" mode will over write the first encountered metadata block, even if it doesn't have the right source
    690     $self->edit_metadata_xml($gsdl_cgi,$metadata_xml_filename,
    691                  $metaname,$metavalue,$metamode);
     740    # Modified by Jeffrey from DL Consulting
     741    # Handle the case where there is one metadata.xml file for multiple FileSets
     742    # The XML filter needs to know whether it is in the right FileSet
     743    # TODO: This doesn't fix the problem where the metadata.xml is not next to the src file.
     744    # TODO: This doesn't handle the common metadata (where FileName doesn't point to a single file)
     745    $self->edit_metadata_xml($gsdl_cgi, $metadata_xml_filename,
     746                             $metaname, $metavalue, $metamode, $import_tailname);
    692747
    693748    # Release the lock once it is done
     
    859914
    860915
     916# Was trying to reused the codes, but the functions need to be broken down more before they can be reused, otherwise there will be too much overhead and duplicate process...
     917sub insert_metadata
     918{
     919    my $self = shift @_;
     920   
     921    my $username  = $self->{'username'};
     922    my $collect   = $self->{'collect'};
     923    my $gsdl_cgi  = $self->{'gsdl_cgi'};
     924    my $gsdlhome  = $self->{'gsdlhome'};
     925
     926    # If the import metadata and gdbm database have been updated, we need to insert some notification to warn user that the the text they see at the moment is not indexed and require a rebuild.
     927    my $rebuild_pending_macro = "_rebuildpendingmessage_";
     928
     929    # don't user authenticate for now
     930    if ($baseaction::authentication_enabled) {
     931    # Ensure the user is allowed to edit this collection
     932    $self->authenticate_user($username, $collect);
     933    }
     934
     935    # Obtain the collect and archive dir   
     936    my $collect_dir = &util::filename_cat($gsdlhome, "collect");
     937    my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");
     938
     939    # Make sure the collection isn't locked by someone else
     940    $self->lock_collection($username, $collect);
     941   
     942    # Check additional args
     943    my $docid = $self->{'d'};
     944    if (!defined($docid)) {
     945    $gsdl_cgi->generate_error("No document id is specified: d=...");
     946    }
     947    my $metaname = $self->{'metaname'};
     948    if (!defined($metaname)) {
     949    $gsdl_cgi->generate_error("No metaname is specified: metadataname=...");
     950    }
     951    my $metavalue = $self->{'metavalue'};
     952    if (!defined($metavalue) || $metavalue eq "") {
     953    $gsdl_cgi->generate_error("No metavalue or empty metavalue is specified: metadataname=...");
     954    }
     955    # make "accumulate" the default (less destructive, as won't actually
     956    # delete any existing values)
     957    my $metamode = "accumulate";
     958
     959    #=======================================================================#
     960    # set_import_metadata [START]
     961    #=======================================================================#
     962    # Obtain where the metadata.xml is from the archiveinfo-doc.gdb file
     963    # If the doc oid is not specified, we assume the metadata.xml is next to the specified "f"
     964    my $metadata_xml_file;
     965    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path("gdbm", "archiveinf-doc", $archive_dir);
     966    my $archive_doc_rec_string = &dbutil::read_infodb_entry("gdbm", $arcinfo_doc_filename, $docid);
     967    my $archive_doc_rec = &dbutil::convert_infodb_string_to_hash($archive_doc_rec_string);
     968   
     969    # This now stores the full pathname
     970    my $import_filename = $archive_doc_rec->{'src-file'}->[0];
     971   
     972    # figure out correct metadata.xml file [?]
     973    # Assuming the metadata.xml file is next to the source file
     974    # Note: This will not work if it is using the inherited metadata from the parent folder
     975    my ($import_tailname, $import_dirname)
     976    = File::Basename::fileparse($import_filename);
     977    my $metadata_xml_filename = &util::filename_cat($import_dirname,"metadata.xml");
     978
     979    # Shane's escape characters
     980    $metavalue = pack "U0C*", unpack "C*", $metavalue;
     981    $metavalue =~ s/\,/&#44;/g;
     982    $metavalue =~ s/\:/&#58;/g;
     983    $metavalue =~ s/\|/&#124;/g;
     984    $metavalue =~ s/\(/&#40;/g;
     985    $metavalue =~ s/\)/&#41;/g;
     986    $metavalue =~ s/\[/&#91;/g;
     987    $metavalue =~ s/\\/&#92;/g;
     988    $metavalue =~ s/\]/&#93;/g;
     989    $metavalue =~ s/\{/&#123;/g;
     990    $metavalue =~ s/\}/&#125;/g;
     991    $metavalue =~ s/\"/&#34;/g;
     992    $metavalue =~ s/\`/&#96;/g;
     993    $metavalue =~ s/\n/_newline_/g;
     994
     995    # Edit the metadata.xml
     996    # Modified by Jeffrey from DL Consulting
     997    # Handle the case where there is one metadata.xml file for multiple FileSets
     998    # The XML filter needs to know whether it is in the right FileSet
     999    # TODO: This doesn't fix the problem where the metadata.xml is not next to the src file.
     1000    # TODO: This doesn't handle the common metadata (where FileName doesn't point to a single file)
     1001    $self->edit_metadata_xml($gsdl_cgi, $metadata_xml_filename,
     1002                             $metaname, $metavalue, $metamode, $import_tailname);
     1003    #=======================================================================#
     1004    # set_import_metadata [END]
     1005    #=======================================================================#
     1006
     1007
     1008    #=======================================================================#
     1009    # set_metadata (accumulate version) [START]
     1010    #=======================================================================#
     1011    # To people who know $collect_tail please add some comments
     1012    # Obtain path to the database
     1013    my $collect_tail = $collect;
     1014    $collect_tail =~ s/^.*[\/\\]//;
     1015    my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
     1016    my $infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $collect_tail, $index_text_directory);
     1017
     1018    # Read the docid entry
     1019    my $doc_rec_string = &dbutil::read_infodb_entry("gdbm", $infodb_file_path, $docid);
     1020    my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
     1021    foreach my $k (keys %$doc_rec) {
     1022    my @escaped_v = ();
     1023    foreach my $v (@{$doc_rec->{$k}}) {
     1024        if ($k eq "contains") {
     1025        # protect quotes in ".2;".3 etc
     1026        $v =~ s/\"/\\\"/g;
     1027        push(@escaped_v, $v);
     1028        }
     1029        else {
     1030        my $ev = &ghtml::unescape_html($v);
     1031        $ev =~ s/\"/\\\"/g;
     1032        push(@escaped_v, $ev);
     1033        }
     1034    }
     1035    $doc_rec->{$k} = \@escaped_v;
     1036    }
     1037
     1038    # Protect the quotes
     1039    $metavalue =~ s/\"/\\\"/g;
     1040
     1041    # Adds the pending macro
     1042    my $macro_metavalue = $rebuild_pending_macro . $metavalue;
     1043
     1044    # If the metadata doesn't exist, create a new one
     1045    if (!defined($doc_rec->{$metaname})){   
     1046    $doc_rec->{$metaname} = [ $macro_metavalue ];
     1047    }
     1048    # Else, let's acculumate the values
     1049    else {
     1050        push(@{$doc_rec->{$metaname}},$macro_metavalue);
     1051    }
     1052
     1053    # Generate the record string
     1054    my $serialized_doc_rec = &dbutil::convert_infodb_hash_to_string($doc_rec);
     1055
     1056    # Store it into GDBM
     1057    my $cmd = "gdbmset \"$infodb_file_path\" \"$docid\" \"$serialized_doc_rec\"";
     1058    my $status = system($cmd);
     1059    if ($status != 0) {
     1060        # Catch error if gdbmget failed
     1061    my $mess = "Failed to set metadata key: $docid\n";
     1062   
     1063    $mess .= "PATH: $ENV{'PATH'}\n";
     1064    $mess .= "cmd = $cmd\n";
     1065    $mess .= "Exit status: $status\n";
     1066    $mess .= "System Error Message: $!\n";
     1067
     1068    $gsdl_cgi->generate_error($mess);
     1069    }
     1070    else {
     1071    my $mess = "insert-metadata successful: Key[$docid]\n";
     1072    $mess .= "  [In metadata.xml] $metaname";
     1073    $mess .= " = $metavalue\n";
     1074    $mess .= "  [In database] $metaname";
     1075    $mess .= " = $macro_metavalue\n";
     1076    $mess .= "  The new text has not been indexed, rebuilding collection is required\n";
     1077        $gsdl_cgi->generate_ok_message($mess);
     1078    }   
     1079    #=======================================================================#
     1080    # set_metadata (accumulate version) [END]
     1081    #=======================================================================#
     1082
     1083    # Release the lock once it is done
     1084    $self->unlock_collection($username, $collect);
     1085}
     1086
    86110871;
Note: See TracChangeset for help on using the changeset viewer.