root/main/trunk/greenstone2/perllib/cgiactions/metadataaction.pm @ 33183

Revision 33183, 35.0 KB (checked in by ak19, 14 months ago)

1. Added the untested erase_archives/index/live/import_metadata() subroutines to modmetadataaction, which will remove all metadata with matching metadata name from the specified docid. They generate no perl interpreter errors when running metadata-server.pl in authenticated mode, but otherwise they're untested. 2. Fixed an issue in baseaction.pm with running metadata-server.pl. 3. Commented out debug messages from metadataaction.pm. Second commit phase will need to introduce matching javascript methods to erase metadata.

Line 
1##########################################################################
2#
3# metadataaction.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2009 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package metadataaction;
27
28use strict;
29
30use cgiactions::baseaction;
31
32use dbutil;
33use ghtml;
34
35use JSON;
36
37# This class is conditionally expanded with set-metadata, remove-metadata and insert-metadata subroutines
38# defined in modmetadataaction.pm. The BEGIN code block determines whether the condition holds.
39# See
40# http://stackoverflow.com/questions/3998619/what-is-the-role-of-the-begin-block-in-perl
41# http://www.perlmonks.org/?node_id=881761 - splitting module into multiple files
42# http://www.perlmonks.org/?node_id=524456 - merging hashes
43
44our $modmeta_action_table; # don't init to empty hash here, else it will overwrite whatever BEGIN sets this to
45                  # see http://stackoverflow.com/questions/3998619/what-is-the-role-of-the-begin-block-in-perl
46
47BEGIN {
48#    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan/perl-5.8");
49    require XML::Rules;
50
51    # if we're GS3, then GS3_AUTHENTICATED must be defined and set to true
52    # in order to have access to subroutines that modify metadata (the set-
53    # and remove- metadata subroutines).
54    # TODO: if we're GS2, then we continue to behave as before?
55
56    if(!defined $ENV{'GSDL3HOME'} || (defined $ENV{'GS3_AUTHENTICATED'} && $ENV{'GS3_AUTHENTICATED'} eq "true")) {
57    require modmetadataaction;
58    }
59    else {
60    $modmeta_action_table = {};
61    }
62}
63
64@metadataaction::ISA = ('baseaction');
65
66
67my $getmeta_action_table =
68{
69    # unused and untested
70    # when DocEdit=1, need to retrieve a doc's full text (or doc section's full text) from archives
71    "get-archives-text" => {
72    'compulsory-args' => [ "d" ] },
73    #'compulsory-args' => [ "d" ],
74    #'optional-args' => [ "section" ] },
75   
76    #GET METHODS
77    "get-import-metadata" => {
78        'compulsory-args' => [ "d", "metaname" ],
79        'optional-args'   => [ "metapos" ] },
80
81    "get-archives-metadata" => {
82        'compulsory-args' => [ "d", "metaname" ],
83        'optional-args'   => [ "metapos" ] },
84   
85    "get-index-metadata" => {
86        'compulsory-args' => [ "d", "metaname" ],
87        'optional-args'   => [ "metapos" ] },
88
89    "get-metadata" => { # alias for get-index-metadata
90        'compulsory-args' => [ "d", "metaname" ],
91        'optional-args'   => [ "metapos" ] },
92
93    "get-live-metadata" => {
94        'compulsory-args' => [ "d", "metaname" ],
95        'optional-args'   => [ ] },
96
97    "get-metadata-array" => { # where param can be ONE of: index (default), import, archives, live
98        'compulsory-args' => [ "json" ],
99        'optional-args'   => [ "where" ],
100        'help-string' => [
101        'metadata-server.pl?a=get-metadata-array&c=demo&where=index&json=[{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metapos":"all"},{"metaname":"usertimestamp","metapos":"all"}, {"metaname":"usercomment","metapos":"all"}]}]'
102        ]}
103};
104
105# To get the final action_table of all available subroutines in this class,
106# merge the get- and mod-metadata hashes. See http://www.perlmonks.org/?node_id=524456
107# Note that modmeta_action_table will be empty of subroutines if the user does not have permissions
108# to modify metadata.
109my $action_table = { %$getmeta_action_table, %$modmeta_action_table };
110
111
112sub new
113{
114    my $class = shift (@_);
115    my ($gsdl_cgi,$iis6_mode) = @_;
116
117    # Treat metavalue specially.  To transmit this through a GET request
118    # the Javascript side has url-encoded it, so here we need to decode
119    # it before proceeding
120
121    my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
122    my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
123#$gsdl_cgi->generate_message("@@@ metaaction new - DEBUG before utf82unicode: " . &unicode::debug_unicode_string($url_decoded_metavalue));
124   
125    my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);
126
127    $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
128
129#$gsdl_cgi->generate_message("@@@ metaaction new - DEBUG after utf82unicode: " . &unicode::debug_unicode_string($url_decoded_metavalue));
130   
131    $gsdl_cgi->param("metavalue",$url_decoded_metavalue);
132
133    # need to do the same with prevmetavalue
134    my $url_encoded_prevmetavalue = $gsdl_cgi->param("prevmetavalue");
135    my $url_decoded_prevmetavalue = &unicode::url_decode($url_encoded_prevmetavalue,1);
136    my $prevunicode_array = &unicode::utf82unicode($url_decoded_prevmetavalue);
137
138    $url_decoded_prevmetavalue = join("",map(chr($_),@$prevunicode_array));
139    $gsdl_cgi->param("prevmetavalue",$url_decoded_prevmetavalue);
140
141    my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);
142
143    return bless $self, $class;
144}
145
146
147sub get_live_metadata
148{
149    my $self = shift @_;
150
151    my $username  = $self->{'username'};
152    my $collect   = $self->{'collect'};
153    my $gsdl_cgi  = $self->{'gsdl_cgi'};
154    my $gsdlhome  = $self->{'gsdlhome'};
155    my $infodbtype = $self->{'infodbtype'};
156   
157    # live metadata gets/saves value scoped (prefixed) by the current usename
158    # so (for now) let's not bother to enforce authentication
159
160    # Obtain the collect dir
161    my $site = $self->{'site'};
162    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
163    ## my $collect_dir = &util::filename_cat($gsdlhome, "collect");
164
165    # No locking collection when getting metadata, only when setting it
166#    $self->lock_collection($username, $collect); # Make sure the collection isn't locked by someone else
167
168    # look up additional args
169    my $docid  = $self->{'d'};
170    if ((!defined $docid) || ($docid =~ m/^\s*$/)) {
171       $gsdl_cgi->generate_error("No docid (d=...) specified.");
172    }
173
174    # Generate the dbkey
175    my $metaname  = $self->{'metaname'};
176    my $dbkey = "$docid.$metaname";
177
178    # To people who know $collect_tail please add some comments
179    # Obtain path to the database
180    my $collect_tail = $collect;
181    $collect_tail =~ s/^.*[\/|\\]//;
182    my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
183    my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, "live-$collect_tail", $index_text_directory);
184   
185    # Obtain the content of the key
186    my $cmd = "gdbmget $infodb_file_path $dbkey";
187    if (open(GIN,"$cmd |") == 0) {
188        # Catch error if gdbmget failed
189    my $mess = "Failed to get metadata key: $metaname\n";
190    $mess .= "$!\n";
191
192    $gsdl_cgi->generate_error($mess);
193    }
194    else {
195    binmode(GIN,":utf8");
196        # Read everything in and concatenate them into $metavalue
197    my $metavalue = "";
198    my $line;
199    while (defined ($line=<GIN>)) {
200        $metavalue .= $line;
201    }
202    close(GIN);
203    chomp($metavalue); # Get rid off the tailing newlines
204    $gsdl_cgi->generate_ok_message("$metavalue");
205    }
206
207    # Release the lock once it is done
208#    $self->unlock_collection($username, $collect);
209}
210
211# just calls the index version
212sub get_metadata
213{
214    my $self = shift @_;
215    $self->get_index_metadata(@_);
216}
217
218# JSON version that will get the requested metadata values
219# from the requested source (index, import, archives or live)
220# One of the params is a JSON string and the return value is JSON too
221# http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
222sub get_metadata_array
223{
224    my $self = shift @_;
225
226    my $where = $self->{'where'};
227    if (!$where || ($where =~ m/^\s*$/)) { # 0, "0", "" and undef are all false. All else is true.
228    # What is truth in perl: http://www.berkeleyinternet.com/perl/node11.html
229    # and http://www.perlmonks.org/?node_id=33638
230
231    $where = "index"; # default behaviour is to get the values from index
232    }
233
234    # Only when setting metadata do we perform authentication and do we lock the collection,
235    # not when getting metadata
236
237    # for get_meta_array, the where param can only be ONE of import, archives, index, live
238    if($where =~ m/index/) {
239    $self->_get_index_metadata_array(@_);
240    }
241    elsif($where =~ m/archives/) {
242    $self->_get_archives_metadata_array(@_);
243    }
244    elsif($where =~ m/import/) {
245    $self->_get_import_metadata_array(@_);
246    }
247    elsif($where =~ m/live/) {
248        $self->_get_live_metadata_array(@_);
249    }
250}
251
252# Unused at present. Added for completion. Tested.
253sub _get_import_metadata_array {
254   
255    my $self = shift @_;
256
257    my $collect   = $self->{'collect'};
258    my $gsdl_cgi  = $self->{'gsdl_cgi'};
259    my $site = $self->{'site'};
260    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
261   
262    # look up additional args
263    my $infodbtype = $self->{'infodbtype'};
264   
265    my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
266    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
267    my $json_str      = $self->{'json'};
268    my $doc_array = decode_json $json_str;
269
270    my $json_result_str = "[";
271    my $first_doc_rec = 1;
272    foreach my $doc_array_rec ( @$doc_array ) {
273   
274    my $docid = $doc_array_rec->{'docid'}; # no subsection metadata support in metadata.xml, only toplevel meta
275   
276    if($first_doc_rec) {
277        $first_doc_rec = 0;
278    } else {
279        $json_result_str .= ",";
280    }
281    $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\""; 
282
283    my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
284    $json_result_str = $json_result_str . ",\"metatable\":[";
285
286    my $first_rec = 1;
287    foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps       
288
289        # Read the docid entry     
290        my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
291        # This now stores the full pathname
292        my $import_filename = $doc_rec->{'src-file'}->[0];
293        $import_filename = &util::placeholders_to_abspath($import_filename);
294
295        # figure out correct metadata.xml file [?]
296        # Assuming the metadata.xml file is next to the source file
297        # Note: This will not work if it is using the inherited metadata from the parent folder
298        my ($import_tailname, $import_dirname) = File::Basename::fileparse($import_filename);
299        my $metadata_xml_filename = &util::filename_cat($import_dirname, "metadata.xml");
300
301
302        if($first_rec) {
303        $first_rec = 0;
304        } else {
305        $json_result_str .= ",";       
306        }
307       
308        my $metaname  = $metatable_rec->{'metaname'};
309        $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
310
311        my $metapos   = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
312        if(!defined $metapos) {
313        $metapos = 0;
314        }
315
316        # Obtain the specified metadata value(s)
317        my $metavalue;
318
319        if($metapos ne "all") { # get the value at a single metapos
320        $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
321
322        #print STDERR "**** Metafilename, metaname, metapos, sec_num: $metadata_xml_filename, $metaname, $metapos, $import_tailname\n";
323       
324        $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
325
326        } else {
327        my $first_metaval = 1;
328        $metapos = 0;
329        $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
330
331        while (defined $metavalue && $metavalue ne "") {
332            if($first_metaval) {       
333            $first_metaval = 0;
334            } else {
335            $json_result_str .= ",";
336            }
337       
338            $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
339
340            $metapos++;
341            $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
342        }
343        }
344
345        $json_result_str .= "]}"; # close metavals array and metatable record
346    }
347   
348    $json_result_str .= "]}"; # close metatable array and docid record
349    }
350
351    $json_result_str .= "]"; # close array of docids
352    $gsdl_cgi->generate_ok_message($json_result_str."\n");
353}
354
355# Unused method, but included for completion. Tested, works. Takes a JSON string and returns a JSON string.
356# For more information on the format of the output, see _get_index_metadata_array, which is in use.
357sub _get_archives_metadata_array {
358
359    my $self = shift @_;
360
361    my $collect   = $self->{'collect'};
362    my $gsdl_cgi  = $self->{'gsdl_cgi'};
363    my $site = $self->{'site'};
364    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
365
366    # look up additional args   
367    my $infodbtype = $self->{'infodbtype'};
368
369    my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
370    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
371
372    my $json_str      = $self->{'json'};
373    my $doc_array = decode_json $json_str;
374
375    my $json_result_str = "[";
376    my $first_doc_rec = 1;
377    foreach my $doc_array_rec ( @$doc_array ) {
378   
379    my $docid     = $doc_array_rec->{'docid'};
380   
381    if($first_doc_rec) {
382        $first_doc_rec = 0;
383    } else {
384        $json_result_str .= ",";
385    }
386    $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\""; 
387
388    my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
389    $json_result_str = $json_result_str . ",\"metatable\":[";
390
391    my $first_rec = 1;
392    foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps       
393
394        my ($docid, $docid_secnum) = ($doc_array_rec->{'docid'} =~ m/^(.*?)(\..*)?$/);
395        $docid_secnum = "" if (!defined $docid_secnum);
396
397        # Read the docid entry     
398        my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
399        # This now stores the full pathname
400        my $doc_filename = $doc_rec->{'doc-file'}->[0];
401        $doc_filename = &util::filename_cat($archive_dir, $doc_filename);
402
403        if($first_rec) {
404        $first_rec = 0;
405        } else {
406        $json_result_str .= ",";       
407        }
408       
409        my $metaname  = $metatable_rec->{'metaname'};
410        $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
411
412        my $metapos   = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
413        if(!defined $metapos) {
414        $metapos = 0;
415        }
416
417
418        # Obtain the specified metadata value(s)
419        my $metavalue;
420
421        if($metapos ne "all") { # get the value at a single metapos
422
423        $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
424        #print STDERR "**** Docname, metaname, metapos, sec_num: $doc_filename, $metaname, $metapos, $docid_secnum\n";
425       
426        $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
427
428        } else {
429        my $first_metaval = 1;
430        $metapos = 0;
431        $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
432
433        while (defined $metavalue && $metavalue ne "") {
434            if($first_metaval) {       
435            $first_metaval = 0;
436            } else {
437            $json_result_str .= ",";
438            }
439       
440            $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
441
442            $metapos++;
443            $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
444        }
445        }
446
447        $json_result_str .= "]}"; # close metavals array and metatable record
448    }
449   
450    $json_result_str .= "]}"; # close metatable array and docid record
451    }
452
453    $json_result_str .= "]"; # close array of docids
454    $gsdl_cgi->generate_ok_message($json_result_str."\n");
455}
456
457
458# Unused at present. Added for completion. Tested, but not sure if it retrieves metadata in the manner it's expected to.
459sub _get_live_metadata_array
460{
461    my $self = shift @_;
462
463    my $collect   = $self->{'collect'};
464    my $gsdl_cgi  = $self->{'gsdl_cgi'};
465    my $site = $self->{'site'};
466    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
467
468    # look up additional args   
469    my $infodbtype = $self->{'infodbtype'};
470   
471    # To people who know $collect_tail please add some comments
472    # Obtain the path to the database
473    my $collect_tail = $collect;
474    $collect_tail =~ s/^.*[\/|\\]//;
475    my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
476    my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, "live-$collect_tail", $index_text_directory);
477
478    my $json_str      = $self->{'json'};
479    my $doc_array = decode_json $json_str;
480
481    my $json_result_str = "[";
482    my $first_doc_rec = 1;
483
484    foreach my $doc_array_rec ( @$doc_array ) {
485   
486    my $docid     = $doc_array_rec->{'docid'};
487   
488    if($first_doc_rec) {
489        $first_doc_rec = 0;
490    } else {
491        $json_result_str .= ",";
492    }
493    $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\""; 
494   
495    my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
496    $json_result_str = $json_result_str . ",\"metatable\":[";
497   
498    my $first_rec = 1;
499    foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps       
500        if($first_rec) {
501        $first_rec = 0;
502        } else {
503        $json_result_str .= ",";       
504        }
505       
506        my $metaname  = $metatable_rec->{'metaname'};
507        $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
508       
509        # Generate the dbkey
510        my $dbkey = "$docid.$metaname";
511       
512        # metapos for get_live_metadata is always assumed to be "all".
513        # It's always going to get all the lines of metavalues associated with a metaname
514        # (It's the metaname itself that should contain an increment number, if there are to be multiple values).
515        #my $metapos = "all";
516        my $metapos = $metatable_rec->{'metapos'} || 0; # Can be 0... 1|all|undefined. Defaults to 0 if undefined/false
517        my $metavalue = "";
518       
519        # Obtain the content of the key
520        my $cmd = "gdbmget $infodb_file_path $dbkey";
521        if (open(GIN,"$cmd |") != 0) { # Success.
522       
523        binmode(GIN,":utf8");
524        # Read everything in and concatenate them into $metavalue       
525        my $line;
526        my $first_metaval = 1;
527        my $pos = 0;
528        while (defined ($line=<GIN>)) {
529            chomp($line); # Get rid off the tailing newlines
530           
531            if($metapos eq "all") {
532            if($first_metaval) {       
533                $first_metaval = 0;
534            } else {
535                $json_result_str .= ",";
536            }           
537            $metavalue = $line;
538            $json_result_str .= "{\"metapos\":\"$pos\",\"metavalue\":\"$metavalue\"}";
539            } elsif($metapos == $pos) {
540            $metavalue = $line;
541            $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
542            last;
543            } # else, the current $pos is not the required $metapos
544            $pos += 1;
545        }
546        close(GIN);
547        } # else open cmd == 0 (failed) and metavals array will be empty [] for this metaname
548       
549        $json_result_str .= "]}"; # close metavals array and metatable record
550    }
551   
552    $json_result_str .= "]}"; # close metatable array and docid record
553    }
554
555    $json_result_str .= "]"; # close array of docids
556   
557    $gsdl_cgi->generate_ok_message($json_result_str."\n");   
558}
559
560
561# Takes a JSON string and returns a JSON string
562# Request string is of the form:
563# http://localhost:8283/greenstone/cgi-bin/metadata-server.pl?a=get-metadata-array&c=demo&where=index&json=[{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metapos":"all"},{"metaname":"usertimestamp","metapos":"all"}, {"metaname":"usercomment","metapos":"all"}]}]
564# Resulting string is of the form:
565# [{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metavals":[{"metapos":"0","metavalue":"me"},{"metapos":"1","metavalue":"admin"}]},{"metaname":"usertimestamp","metavals":[{"metapos":"0","metavalue":"1367900586888"},{"metapos":"1","metavalue":"1367900616574"}]},{"metaname":"usercomment","metavals":[{"metapos":"0","metavalue":"Hi"},{"metapos":"1","metavalue":"Hello"}]}]}]
566sub _get_index_metadata_array
567{
568    my $self = shift @_;
569
570    my $collect   = $self->{'collect'};
571    my $gsdl_cgi  = $self->{'gsdl_cgi'};
572    my $site = $self->{'site'};
573    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
574
575    # look up additional args   
576    my $infodbtype = $self->{'infodbtype'};
577   
578    # To people who know $collect_tail please add some comments
579    # Obtain the path to the database
580    my $collect_tail = $collect;
581    $collect_tail =~ s/^.*[\/|\\]//;
582    my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
583    my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collect_tail, $index_text_directory);
584
585    my $json_str      = $self->{'json'};
586    my $doc_array = decode_json $json_str;
587
588    my $json_result_str = "[";
589    my $first_doc_rec = 1;
590
591    foreach my $doc_array_rec ( @$doc_array ) {
592   
593    my $docid     = $doc_array_rec->{'docid'};
594   
595    if($first_doc_rec) {
596        $first_doc_rec = 0;
597    } else {
598        $json_result_str .= ",";
599    }
600    $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\""; 
601
602    my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
603    $json_result_str = $json_result_str . ",\"metatable\":[";
604
605    my $first_rec = 1;
606    foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps       
607        if($first_rec) {
608        $first_rec = 0;
609        } else {
610        $json_result_str .= ",";       
611        }
612       
613        my $metaname  = $metatable_rec->{'metaname'};
614        $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
615
616        my $metapos   = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
617        if(!defined $metapos) {
618        $metapos = 0;
619        }
620
621         # Read the docid entry
622        my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $infodb_file_path, $docid);
623 
624        # Basically loop through and unescape_html the values
625        foreach my $k (keys %$doc_rec) {
626        my @escaped_v = ();
627        foreach my $v (@{$doc_rec->{$k}}) {
628            my $ev = &ghtml::unescape_html($v);
629            push(@escaped_v, $ev);
630        }
631        $doc_rec->{$k} = \@escaped_v;
632        }
633
634        # Obtain the specified metadata value(s)
635        my $metavalue;
636
637        if($metapos ne "all") { # get the value at a single metapos
638
639        $metavalue = $doc_rec->{$metaname}->[$metapos];
640
641        # protect any double quotes and colons in the metavalue before putting it into JSON
642        $metavalue =~ s/\"/&quot;/g if defined $metavalue;
643        $metavalue =~ s/\:/&58;/g if defined $metavalue;
644
645        $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
646
647        } else {
648        my $first_metaval = 1;
649        $metapos = 0;
650        $metavalue = $doc_rec->{$metaname}->[$metapos];
651
652        while (defined $metavalue) {
653
654            # protect any double quotes and colons in the metavalue before putting it into JSON
655            $metavalue =~ s/\"/&quot;/g;
656            $metavalue =~ s/\:/&58;/g;
657
658            if($first_metaval) {       
659            $first_metaval = 0;
660            } else {
661            $json_result_str .= ",";
662            }
663       
664            $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
665
666            $metapos++;
667            $metavalue = $doc_rec->{$metaname}->[$metapos];
668        }
669        }
670
671        $json_result_str .= "]}"; # close metavals array and metatable record
672    }
673   
674    $json_result_str .= "]}"; # close metatable array and docid record
675    }
676
677    $json_result_str .= "]"; # close array of docids
678
679    $gsdl_cgi->generate_ok_message($json_result_str."\n");   
680}
681
682
683sub get_index_metadata
684{
685    my $self = shift @_;
686
687    my $username  = $self->{'username'};
688    my $collect   = $self->{'collect'};
689    my $gsdl_cgi  = $self->{'gsdl_cgi'};
690    my $gsdlhome  = $self->{'gsdlhome'};
691
692    # Obtain the collect dir
693    my $site = $self->{'site'};
694    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
695    ##my $collect_dir = &util::filename_cat($gsdlhome, "collect");
696
697    # look up additional args
698    my $docid     = $self->{'d'};
699    my $metaname  = $self->{'metaname'};
700    my $metapos   = $self->{'metapos'};
701    my $infodbtype = $self->{'infodbtype'};
702
703    # To people who know $collect_tail please add some comments
704    # Obtain path to the database
705    my $collect_tail = $collect;
706    $collect_tail =~ s/^.*[\/|\\]//;
707    my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
708    my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collect_tail, $index_text_directory);
709
710    # Read the docid entry
711    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $infodb_file_path, $docid);
712 
713    # Basically loop through and unescape_html the values
714    foreach my $k (keys %$doc_rec) {
715    my @escaped_v = ();
716    foreach my $v (@{$doc_rec->{$k}}) {
717        my $ev = &ghtml::unescape_html($v);
718        push(@escaped_v, $ev);
719    }
720    $doc_rec->{$k} = \@escaped_v;
721    }
722
723    # Obtain the specified metadata value
724    $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/));
725    my $metavalue = $doc_rec->{$metaname}->[$metapos];
726    $gsdl_cgi->generate_ok_message("$metavalue");
727   
728}
729
730
731sub get_import_metadata
732{
733    my $self = shift @_;
734
735    my $username  = $self->{'username'};
736    my $collect   = $self->{'collect'};
737    my $gsdl_cgi  = $self->{'gsdl_cgi'};
738    my $gsdlhome  = $self->{'gsdlhome'};
739
740    # Obtain the collect dir
741    my $site = $self->{'site'};
742    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
743    ##my $collect_dir = &util::filename_cat($gsdlhome, "collect");
744
745    # look up additional args
746    my $docid     = $self->{'d'};
747    my $metaname  = $self->{'metaname'};
748    my $metapos = $self->{'metapos'};
749    $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/)); # gets the first value by default since metapos defaults to 0
750
751    my $infodbtype = $self->{'infodbtype'};
752    if (!defined $docid)
753    {
754        $gsdl_cgi->generate_error("No docid (d=...) specified.\n");
755    }
756
757    # Obtain where the metadata.xml is from the archiveinfo-doc.gdb file
758    # If the doc oid is not specified, we assume the metadata.xml is next to the specified "f"
759    my $metadata_xml_file;
760    my $import_filename = undef;
761   
762
763    my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
764    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
765    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
766
767    # This now stores the full pathname
768    $import_filename = $doc_rec->{'src-file'}->[0];
769    $import_filename = &util::placeholders_to_abspath($import_filename);
770
771    # figure out correct metadata.xml file [?]
772    # Assuming the metadata.xml file is next to the source file
773    # Note: This will not work if it is using the inherited metadata from the parent folder
774    my ($import_tailname, $import_dirname) = File::Basename::fileparse($import_filename);
775    my $metadata_xml_filename = &util::filename_cat($import_dirname, "metadata.xml");
776
777    $gsdl_cgi->generate_ok_message($self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname));
778
779}
780
781sub get_metadata_from_metadata_xml
782{
783    my $self = shift @_;
784    my ($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $src_file) = @_;
785   
786    my @rules =
787    (
788        _default => 'raw',
789        'Metadata' => \&gfmxml_metadata,
790        'FileName' => \&mxml_filename
791    );
792       
793    my $parser = XML::Rules->new
794    (
795        rules => \@rules,
796        output_encoding => 'utf8'
797    );
798   
799    my $xml_in = "";
800    if (!open(MIN,"<$metadata_xml_filename"))
801    {
802        $gsdl_cgi->generate_error("Unable to read in $metadata_xml_filename: $!");
803    }
804    else
805    {
806        # Read them in
807        my $line;
808        while (defined ($line=<MIN>)) {
809            $xml_in .= $line;
810        }
811        close(MIN);
812
813        $parser->parse($xml_in, {metaname => $metaname, metapos => $metapos, src_file => $src_file});
814       
815        if(defined $parser->{'pad'}->{'metavalue'})
816        {
817            return $parser->{'pad'}->{'metavalue'};
818        }
819        else
820        {
821            return "";
822        }
823    }
824}
825
826sub gfmxml_metadata
827{
828    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
829
830    # no subsection support yet in metadata.xml
831
832    if (($parser->{'parameters'}->{'src_file'} eq $parser->{'parameters'}->{'current_file'}) && $parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'})
833    {
834        if (!defined $parser->{'parameters'}->{'poscount'})
835        {
836            $parser->{'parameters'}->{'poscount'} = 0;
837        }
838        else
839        {
840            $parser->{'parameters'}->{'poscount'}++;
841        }
842   
843        # gets the first value by default, since metapos defaults to 0
844        if (($parser->{'parameters'}->{'poscount'} == $parser->{'parameters'}->{'metapos'}))
845        {
846            if($parser->{'parameters'}->{'metapos'} > 0) {
847            print STDERR "@@@@ WARNING: non-zero metapos.\n";
848            print STDERR "@@@@ Assuming SIMPLE collection and proceeding to retrieve the import meta at position: ".$parser->{'parameters'}->{'metapos'}.".\n";
849            }
850            $parser->{'pad'}->{'metavalue'} = $attrHash->{'_content'};
851        }
852    }
853}
854
855sub mxml_filename
856{
857    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
858
859    # Store the filename of the Current Fileset
860    # Note: According to http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd
861    # FileName tag must come before Description tag
862    $parser->{'parameters'}->{'current_file'} = $attrHash->{'_content'};
863
864    # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
865    return [$tagname => $attrHash];
866}
867
868sub get_archives_metadata
869{
870    my $self = shift @_;
871
872    my $username  = $self->{'username'};
873    my $collect   = $self->{'collect'};
874    my $gsdl_cgi  = $self->{'gsdl_cgi'};
875#   my $gsdlhome  = $self->{'gsdlhome'};
876    my $infodbtype = $self->{'infodbtype'};
877
878    # Obtain the collect dir
879    my $site = $self->{'site'};
880    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
881   
882    my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
883
884    # look up additional args
885    my ($docid, $docid_secnum) = ($self->{'d'} =~ m/^(.*?)(\..*)?$/);
886    $docid_secnum = "" if (!defined $docid_secnum);
887   
888    my $metaname = $self->{'metaname'};
889    my $metapos = $self->{'metapos'};
890    $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/));
891   
892    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
893    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
894
895    # This now stores the full pathname
896    my $doc_filename = $doc_rec->{'doc-file'}->[0];
897
898    $gsdl_cgi->generate_ok_message($self->get_metadata_from_archive_xml($gsdl_cgi, &util::filename_cat($archive_dir, $doc_filename), $metaname, $metapos, $docid_secnum));
899
900}
901
902# unused and untested
903sub get_archives_text
904{
905    my $self = shift @_;
906
907    my $username  = $self->{'username'};
908    my $collect   = $self->{'collect'};
909    my $gsdl_cgi  = $self->{'gsdl_cgi'};
910#   my $gsdlhome  = $self->{'gsdlhome'};
911    my $infodbtype = $self->{'infodbtype'};
912
913    # Obtain the collect dir
914    my $site = $self->{'site'};
915    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
916   
917    my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
918
919    # look up additional args
920    my ($docid, $docid_secnum) = ($self->{'d'} =~ m/^(.*?)(\..*)?$/);
921    $docid_secnum = "" if (!defined $docid_secnum);
922   
923    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
924    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
925
926    # This now stores the full pathname
927    my $doc_filename = $doc_rec->{'doc-file'}->[0];
928   
929    my $metaname = undef;
930    my $metapos = -1;
931   
932    $gsdl_cgi->generate_ok_message($self->get_metadata_from_archive_xml($gsdl_cgi, &util::filename_cat($archive_dir, $doc_filename), $metaname, $metapos, $docid_secnum));
933
934}
935
936sub get_metadata_from_archive_xml
937{
938    my $self = shift @_;
939    my ($gsdl_cgi, $doc_xml_filename, $metaname, $metapos, $secid) = @_;
940   
941    my @start_rules = ('Section' => \&dxml_start_section);
942   
943    my @rules =
944    (
945        _default => 'raw',
946        'Metadata' => \&gfdxml_metadata
947    );
948       
949    my $parser = XML::Rules->new
950    (
951        start_rules => \@start_rules,
952        rules => \@rules,
953        output_encoding => 'utf8'
954    );
955   
956    my $xml_in = "";
957    if (!open(MIN,"<$doc_xml_filename"))
958    {
959        $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
960    }
961    else
962    {
963        # Read them in
964        my $line;
965        while (defined ($line=<MIN>)) {
966            $xml_in .= $line;
967        }
968        close(MIN);
969
970        $parser->parse($xml_in, {metaname => $metaname, metapos => $metapos, secid => $secid});
971       
972        if(defined $parser->{'pad'}->{'metavalue'})
973        {
974            return $parser->{'pad'}->{'metavalue'};
975        }
976        else
977        {
978            return "";
979        }
980    }
981}
982
983# unused and untested
984sub get_text_from_archive_xml
985{
986    my $self = shift @_;
987    my ($gsdl_cgi, $doc_xml_filename, $secid) = @_;
988
989    # To monitor which section/subsection number we are in
990    my @start_rules = ('Section' => \&dxml_start_section);
991
992    # set the callback functions for the elements in doc.xml we're interested in, <Content>
993    my @rules =
994    (
995        _default => 'raw',
996        'Content' => \&gfdxml_text # gfdxml = get from doc xml?
997    );
998       
999    my $parser = XML::Rules->new
1000    (
1001        start_rules => \@start_rules,
1002        rules => \@rules,
1003        output_encoding => 'utf8'
1004    );
1005   
1006    my $xml_in = "";
1007    if (!open(MIN,"<$doc_xml_filename"))
1008    {
1009        $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
1010    }
1011    else
1012    {
1013        # Read them in
1014        my $line;
1015        while (defined ($line=<MIN>)) {
1016            $xml_in .= $line;
1017        }
1018        close(MIN);
1019
1020        $parser->parse($xml_in, {secid => $secid});
1021       
1022        if(defined $parser->{'pad'}->{'textcontent'})
1023        {
1024            return $parser->{'pad'}->{'textcontent'};
1025        }
1026        else
1027        {
1028            return "";
1029        }
1030    }
1031}
1032
1033sub gfdxml_metadata
1034{
1035    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1036   
1037    if(!($parser->{'parameters'}->{'secid'} eq $parser->{'parameters'}->{'curr_section_num'}))
1038    {
1039        return;
1040    }
1041
1042    if ($parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'})
1043    {
1044        if (!defined $parser->{'parameters'}->{'poscount'})
1045        {
1046            $parser->{'parameters'}->{'poscount'} = 0;
1047        }
1048        else
1049        {
1050            $parser->{'parameters'}->{'poscount'}++;
1051        }
1052    }
1053
1054    if (($parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'}) && ($parser->{'parameters'}->{'poscount'} == $parser->{'parameters'}->{'metapos'}))
1055    {   
1056        $parser->{'pad'}->{'metavalue'} = $attrHash->{'_content'};
1057    }
1058}
1059
1060# unused and untested - for get_archives_text
1061sub gfdxml_text
1062{
1063    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1064   
1065    if($parser->{'parameters'}->{'secid'} eq $parser->{'parameters'}->{'curr_section_num'})
1066    {
1067        $parser->{'pad'}->{'textcontent'} = $attrHash->{'_content'}; # the textnode content
1068
1069    }
1070    else {
1071        return;
1072    }
1073}
1074
1075sub dxml_start_section
1076{
1077    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1078
1079    my $new_depth = scalar(@$contextArray);
1080
1081#   print STDERR "**** START SECTION \n";
1082   
1083    if ($new_depth == 1) {
1084    $parser->{'parameters'}->{'curr_section_depth'} = 1;
1085    $parser->{'parameters'}->{'curr_section_num'}   = "";
1086    }
1087
1088    my $old_depth  = $parser->{'parameters'}->{'curr_section_depth'};
1089    my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};
1090
1091    my $new_secnum;
1092
1093    if ($new_depth > $old_depth) {
1094    # child subsection
1095    $new_secnum = "$old_secnum.1";
1096    }
1097    elsif ($new_depth == $old_depth) {
1098    # sibling section => increase it's value by 1
1099    my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
1100    $tail_num++;
1101    $new_secnum = $old_secnum;
1102    $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
1103    }
1104    else {
1105    # back up to parent section => lop off tail
1106    $new_secnum = $old_secnum;
1107    $new_secnum =~ s/\.\d+$//;
1108    }
1109
1110    $parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
1111    $parser->{'parameters'}->{'curr_section_num'}   = $new_secnum;
1112   
1113    1;
1114}
1115
11161;
Note: See TracBrowser for help on using the browser.