Changeset 21716

Show
Ignore:
Timestamp:
05.02.2010 15:30:50 (9 years ago)
Author:
mdewsnip
Message:

Improvements to metadataaction.pm made by Jeffrey Ke and Shane Taylor at DL Consulting Ltd:

1. Added a new insert-metadata action - basically a combination of the set_import_metadata and set_metadata functions
2. Fixed the XML::Rules handlers in the set_archive_metadata and set_import_metadata functions - a "raw" rule must return a reference to the [tagname => attrs] pair, not the pair itself.

'raw' = the [tagname => attrs] is pushed to the parent tag's _content.

You would use this style if you wanted to be able to print
the parent tag as XML preserving the whitespace or other textual content
sub { [$_[0] => $_[1]]}

3. Added a FileSet check in set_import_metadata - the system now checks that it is in the right FileSet before performing the override/accumulate actions.
4. Added a hack to set_import_metadata - after metadata.xml is filtered by XML::Rules, it loses its DTD and encoding information. I am not sure whether there is a flag to keep them or whether it is just a bug. I have hard-coded it to replace the generated header: [<?xml version="1.0"?>] with [<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">]

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cgiactions/metadataaction.pm

    r21715 r21716  
    7272 
    7373    "remove-metadata"       => { 'compulsory-args' => [ "d", "metaname" ], 
    74                      'optional-args'   => [ "metapos" ] } 
     74                     'optional-args'   => [ "metapos" ] }, 
     75 
     76    "insert-metadata"       => { 'compulsory-args' => [ "d", "metaname", "metavalue" ], 
     77                     'optional-args'   => [ ] 
     78                   } 
    7579}; 
    7680 
     
    383387    } 
    384388 
    385     # raw extended 
    386     # Someone please write some comments on why adding ':'.$tagname => $attrHash 
    387     return (':'.$tagname => $attrHash, [$tagname => $attrHash]); 
     389    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!! 
     390    return [$tagname => $attrHash]; 
    388391} 
    389392 
     
    412415    } 
    413416 
    414     # raw 
    415     return $tagname => $attrHash; 
     417 
     418    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!! 
     419    return [$tagname => $attrHash]; 
    416420} 
    417421 
     
    461465    # Set the call back functions 
    462466    my @rules =  
    463     ( _default => 'raw extended', 
     467    ( _default => 'raw', 
    464468      'Metadata' => \&dxml_metadata, 
    465469      'Description' => \&dxml_description ); 
     
    530534    my $metamode = $parser->{'parameters'}->{'metamode'}; 
    531535 
     536    # Report error if we don't see FileName tag before this 
     537    die "Fatel Error: Unexpected metadata.xml structure. Undefind current_file, possiblely encountered Description before FileName" if (!defined($parser->{'parameters'}->{'current_file'})); 
     538     
     539    # Don't do anything if we are not in the right FileSet 
     540    my $file_regexp = $parser->{'parameters'}->{'current_file'}; 
     541    return [$tagname => $attrHash] if (!($parser->{'parameters'}->{'src_file'} =~ /$file_regexp/)); 
     542     
    532543    # Find the right metadata tag and checks if we are going to override it 
    533544    # Note: This over writes the first metadata block it encountered even if it doesn't belong to the source file we specified 
     
    542553    } 
    543554 
    544     # raw extended 
    545     # Someone please write some comments on why adding ':'.$tagname => $attrHash 
    546     return (':'.$tagname => $attrHash, [$tagname => $attrHash]); 
     555    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!! 
     556    return [$tagname => $attrHash]; 
    547557} 
    548558 
     
    551561{ 
    552562    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_; 
    553     my $metamode = $parser->{'parameters'}->{'metamode'}; 
     563    my $metamode = $parser->{'parameters'}->{'metamode'};     
     564 
     565    # Failed... Report error if we don't see FileName tag before this 
     566    die "Fatel Error: Unexpected metadata.xml structure. Undefind current_file, possiblely encountered Description before FileName" if (!defined($parser->{'parameters'}->{'current_file'})); 
     567 
     568    # Don't do anything if we are not in the right FileSet 
     569    my $file_regexp = $parser->{'parameters'}->{'current_file'}; 
     570    return [$tagname => $attrHash] if (!($parser->{'parameters'}->{'src_file'} =~ /$file_regexp/)); 
    554571 
    555572    # Accumulate the metadata block to the end of the description block 
     
    571588    } 
    572589 
    573     # raw 
    574     return $tagname => $attrHash; 
     590    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!! 
     591    return [$tagname => $attrHash]; 
     592} 
     593 
     594 
     595sub mxml_filename 
     596{ 
     597    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_; 
     598 
     599    # Store the filename of the Current Fileset 
     600    # Note: According to http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd 
     601    # FileName tag must come before Description tag 
     602    $parser->{'parameters'}->{'current_file'} = $attrHash->{'_content'}; 
     603 
     604    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!! 
     605    return [$tagname => $attrHash]; 
     606} 
     607 
     608 
     609sub mxml_fileset 
     610{ 
     611    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_; 
     612 
     613    # Initilise the current_file 
     614    # Note: According to http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd 
     615    # FileName tag must come before Description tag 
     616    $parser->{'parameters'}->{'current_file'} = ""; 
     617 
     618    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!! 
     619    return [$tagname => $attrHash]; 
    575620} 
    576621 
     
    579624{ 
    580625    my $self = shift @_; 
    581     my ($gsdl_cgi, $metadata_xml_filename, $metaname, $metavalue, $metamode) = @_; 
     626    my ($gsdl_cgi, $metadata_xml_filename, $metaname, $metavalue, $metamode, $src_file) = @_; 
    582627 
    583628    # Set the call-back functions for the metadata tags 
    584629    my @rules =  
    585     ( _default => 'raw extended', 
     630    ( _default => 'raw', 
     631          'FileName' => \&mxml_filename, 
    586632      'Metadata' => \&mxml_metadata, 
    587       'Description' => \&mxml_description ); 
     633      'Description' => \&mxml_description, 
     634          'FileSet' => \&mxml_fileset); 
    588635 
    589636    # use XML::Rules to add it in (read in and out again) 
    590637    my $parser = XML::Rules->new(rules => \@rules,  
    591                  style => 'filter' ); 
     638                 style => 'filter', 
     639                                 output_encoding => 'utf8'); 
    592640 
    593641    my $xml_in = ""; 
     
    607655    $parser->filter($xml_in,\$xml_out, { metaname => $metaname, 
    608656                         metavalue => $metavalue, 
    609                                              metamode => $metamode } ); 
    610          
     657                                             metamode => $metamode, 
     658                         src_file => $src_file, 
     659                         current_file => undef} ); 
     660 
    611661    if (!open(MOUT,">$metadata_xml_filename")) { 
    612662        $gsdl_cgi->generate_error("Unable to write out to $metadata_xml_filename: $!"); 
    613663    } 
    614664    else { 
     665            # Some wise person please find out how to keep the DTD and encode lines in after it gets filtered by this XML::Rules 
     666            # At the moment, I will just hack it! 
     667            my $header_with_utf8_dtd = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">"; 
     668            $xml_out =~ s/\<\?xml\sversion\=\"1.0\"\?\>/$header_with_utf8_dtd/; 
    615669        print MOUT $xml_out; 
    616670        close(MOUT);         
     
    641695    # Make sure the collection isn't locked by someone else 
    642696    $self->lock_collection($username, $collect); 
    643  
     697     
    644698    # look up additional args 
    645699    # want either d= or f= 
     
    675729        $import_filename = &util::filename_cat($collect_dir,$collect,$import_file); 
    676730    } 
    677      
    678      
     731         
    679732    # figure out correct metadata.xml file [?] 
    680733    # Assuming the metadata.xml file is next to the source file 
     
    685738 
    686739    # Edit the metadata.xml 
    687     # Note: At moment it doesn't correctly on metadata.xml with multiple FileSets 
    688     # "accumulate" mode will add new metadata block to all FileSets 
    689     # "override" mode will over write the first encountered metadata block, even if it doesn't have the right source 
    690     $self->edit_metadata_xml($gsdl_cgi,$metadata_xml_filename, 
    691                  $metaname,$metavalue,$metamode); 
     740    # Modified by Jeffrey from DL Consulting 
     741    # Handle the case where there is one metadata.xml file for multiple FileSets 
     742    # The XML filter needs to know whether it is in the right FileSet 
     743    # TODO: This doesn't fix the problem where the metadata.xml is not next to the src file. 
     744    # TODO: This doesn't handle the common metadata (where FileName doesn't point to a single file) 
     745    $self->edit_metadata_xml($gsdl_cgi, $metadata_xml_filename, 
     746                             $metaname, $metavalue, $metamode, $import_tailname); 
    692747 
    693748    # Release the lock once it is done 
     
    859914 
    860915 
     916# Was trying to reused the codes, but the functions need to be broken down more before they can be reused, otherwise there will be too much overhead and duplicate process... 
     917sub insert_metadata 
     918{ 
     919    my $self = shift @_; 
     920     
     921    my $username  = $self->{'username'}; 
     922    my $collect   = $self->{'collect'}; 
     923    my $gsdl_cgi  = $self->{'gsdl_cgi'}; 
     924    my $gsdlhome  = $self->{'gsdlhome'}; 
     925 
     926    # If the import metadata and gdbm database have been updated, we need to insert some notification to warn user that the the text they see at the moment is not indexed and require a rebuild. 
     927    my $rebuild_pending_macro = "_rebuildpendingmessage_"; 
     928 
     929    # don't user authenticate for now 
     930    if ($baseaction::authentication_enabled) { 
     931    # Ensure the user is allowed to edit this collection 
     932    $self->authenticate_user($username, $collect); 
     933    } 
     934 
     935    # Obtain the collect and archive dir     
     936    my $collect_dir = &util::filename_cat($gsdlhome, "collect"); 
     937    my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives"); 
     938 
     939    # Make sure the collection isn't locked by someone else 
     940    $self->lock_collection($username, $collect); 
     941     
     942    # Check additional args 
     943    my $docid = $self->{'d'}; 
     944    if (!defined($docid)) { 
     945    $gsdl_cgi->generate_error("No document id is specified: d=..."); 
     946    }  
     947    my $metaname = $self->{'metaname'}; 
     948    if (!defined($metaname)) { 
     949    $gsdl_cgi->generate_error("No metaname is specified: metadataname=..."); 
     950    }  
     951    my $metavalue = $self->{'metavalue'}; 
     952    if (!defined($metavalue) || $metavalue eq "") { 
     953    $gsdl_cgi->generate_error("No metavalue or empty metavalue is specified: metadataname=..."); 
     954    }  
     955    # make "accumulate" the default (less destructive, as won't actually  
     956    # delete any existing values) 
     957    my $metamode = "accumulate"; 
     958 
     959    #=======================================================================# 
     960    # set_import_metadata [START] 
     961    #=======================================================================# 
     962    # Obtain where the metadata.xml is from the archiveinfo-doc.gdb file 
     963    # If the doc oid is not specified, we assume the metadata.xml is next to the specified "f" 
     964    my $metadata_xml_file; 
     965    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path("gdbm", "archiveinf-doc", $archive_dir); 
     966    my $archive_doc_rec_string = &dbutil::read_infodb_entry("gdbm", $arcinfo_doc_filename, $docid); 
     967    my $archive_doc_rec = &dbutil::convert_infodb_string_to_hash($archive_doc_rec_string); 
     968     
     969    # This now stores the full pathname 
     970    my $import_filename = $archive_doc_rec->{'src-file'}->[0]; 
     971     
     972    # figure out correct metadata.xml file [?] 
     973    # Assuming the metadata.xml file is next to the source file 
     974    # Note: This will not work if it is using the inherited metadata from the parent folder 
     975    my ($import_tailname, $import_dirname)  
     976    = File::Basename::fileparse($import_filename); 
     977    my $metadata_xml_filename = &util::filename_cat($import_dirname,"metadata.xml"); 
     978 
     979    # Shane's escape characters 
     980    $metavalue = pack "U0C*", unpack "C*", $metavalue; 
     981    $metavalue =~ s/\,/&#44;/g; 
     982    $metavalue =~ s/\:/&#58;/g; 
     983    $metavalue =~ s/\|/&#124;/g; 
     984    $metavalue =~ s/\(/&#40;/g; 
     985    $metavalue =~ s/\)/&#41;/g; 
     986    $metavalue =~ s/\[/&#91;/g; 
     987    $metavalue =~ s/\\/&#92;/g; 
     988    $metavalue =~ s/\]/&#93;/g; 
     989    $metavalue =~ s/\{/&#123;/g; 
     990    $metavalue =~ s/\}/&#125;/g; 
     991    $metavalue =~ s/\"/&#34;/g; 
     992    $metavalue =~ s/\`/&#96;/g; 
     993    $metavalue =~ s/\n/_newline_/g; 
     994 
     995    # Edit the metadata.xml 
     996    # Modified by Jeffrey from DL Consulting 
     997    # Handle the case where there is one metadata.xml file for multiple FileSets 
     998    # The XML filter needs to know whether it is in the right FileSet 
     999    # TODO: This doesn't fix the problem where the metadata.xml is not next to the src file. 
     1000    # TODO: This doesn't handle the common metadata (where FileName doesn't point to a single file) 
     1001    $self->edit_metadata_xml($gsdl_cgi, $metadata_xml_filename, 
     1002                             $metaname, $metavalue, $metamode, $import_tailname); 
     1003    #=======================================================================# 
     1004    # set_import_metadata [END] 
     1005    #=======================================================================# 
     1006 
     1007 
     1008    #=======================================================================# 
     1009    # set_metadata (accumulate version) [START] 
     1010    #=======================================================================# 
     1011    # To people who know $collect_tail please add some comments 
     1012    # Obtain path to the database 
     1013    my $collect_tail = $collect; 
     1014    $collect_tail =~ s/^.*[\/\\]//; 
     1015    my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text"); 
     1016    my $infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $collect_tail, $index_text_directory); 
     1017 
     1018    # Read the docid entry 
     1019    my $doc_rec_string = &dbutil::read_infodb_entry("gdbm", $infodb_file_path, $docid); 
     1020    my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string); 
     1021    foreach my $k (keys %$doc_rec) { 
     1022    my @escaped_v = (); 
     1023    foreach my $v (@{$doc_rec->{$k}}) { 
     1024        if ($k eq "contains") { 
     1025        # protect quotes in ".2;".3 etc 
     1026        $v =~ s/\"/\\\"/g; 
     1027        push(@escaped_v, $v); 
     1028        } 
     1029        else { 
     1030        my $ev = &ghtml::unescape_html($v); 
     1031        $ev =~ s/\"/\\\"/g; 
     1032        push(@escaped_v, $ev); 
     1033        } 
     1034    } 
     1035    $doc_rec->{$k} = \@escaped_v; 
     1036    } 
     1037 
     1038    # Protect the quotes 
     1039    $metavalue =~ s/\"/\\\"/g; 
     1040 
     1041    # Adds the pending macro 
     1042    my $macro_metavalue = $rebuild_pending_macro . $metavalue; 
     1043 
     1044    # If the metadata doesn't exist, create a new one 
     1045    if (!defined($doc_rec->{$metaname})){     
     1046    $doc_rec->{$metaname} = [ $macro_metavalue ]; 
     1047    } 
     1048    # Else, let's acculumate the values 
     1049    else { 
     1050        push(@{$doc_rec->{$metaname}},$macro_metavalue); 
     1051    } 
     1052 
     1053    # Generate the record string 
     1054    my $serialized_doc_rec = &dbutil::convert_infodb_hash_to_string($doc_rec); 
     1055 
     1056    # Store it into GDBM 
     1057    my $cmd = "gdbmset \"$infodb_file_path\" \"$docid\" \"$serialized_doc_rec\""; 
     1058    my $status = system($cmd); 
     1059    if ($status != 0) { 
     1060        # Catch error if gdbmget failed 
     1061    my $mess = "Failed to set metadata key: $docid\n"; 
     1062     
     1063    $mess .= "PATH: $ENV{'PATH'}\n"; 
     1064    $mess .= "cmd = $cmd\n"; 
     1065    $mess .= "Exit status: $status\n"; 
     1066    $mess .= "System Error Message: $!\n"; 
     1067 
     1068    $gsdl_cgi->generate_error($mess); 
     1069    } 
     1070    else { 
     1071    my $mess = "insert-metadata successful: Key[$docid]\n"; 
     1072    $mess .= "  [In metadata.xml] $metaname"; 
     1073    $mess .= " = $metavalue\n"; 
     1074    $mess .= "  [In database] $metaname"; 
     1075    $mess .= " = $macro_metavalue\n"; 
     1076    $mess .= "  The new text has not been indexed, rebuilding collection is required\n"; 
     1077        $gsdl_cgi->generate_ok_message($mess); 
     1078    }     
     1079    #=======================================================================# 
     1080    # set_metadata (accumulate version) [END] 
     1081    #=======================================================================# 
     1082 
     1083    # Release the lock once it is done 
     1084    $self->unlock_collection($username, $collect); 
     1085} 
     1086 
    86110871;