source: main/trunk/greenstone2/perllib/cgiactions/metadataaction.pm@ 37207

Last change on this file since 37207 was 37207, checked in by davidb, 15 months ago

Support added in for optional cgi param 'dv' specifying the file-level document-version history (_fldv_history) folder to use

File size: 36.6 KB
Line 
1##########################################################################
2#
3# metadataaction.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2009 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package metadataaction;
27
28use strict;
29
30use cgiactions::baseaction;
31use cgiactions::metadataactioncommon;
32
33use dbutil;
34use ghtml;
35
36use JSON;
37
38# convenient to have shorter version around
39my $FLDV_HISTORY_DIR = $metadataactioncommon::FLDV_HISTORY_DIR;
40
41# This class is conditionally expanded with set-metadata, remove-metadata and insert-metadata subroutines
42# defined in modmetadataaction.pm. The BEGIN code block determines whether the condition holds.
43# See
44# http://stackoverflow.com/questions/3998619/what-is-the-role-of-the-begin-block-in-perl
45# http://www.perlmonks.org/?node_id=881761 - splitting module into multiple files
46# http://www.perlmonks.org/?node_id=524456 - merging hashes
47
48our $modmeta_action_table; # don't init to empty hash here, else it will overwrite whatever BEGIN sets this to
49 # see http://stackoverflow.com/questions/3998619/what-is-the-role-of-the-begin-block-in-perl
50
51BEGIN {
52# unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan/perl-5.8");
53 require XML::Rules;
54
55 # if we're GS3, then GS3_AUTHENTICATED must be defined and set to true
56 # in order to have access to subroutines that modify metadata (the set-
57 # and remove- metadata subroutines).
58 # TODO: if we're GS2, then we continue to behave as before?
59
60 if(!defined $ENV{'GSDL3HOME'} || (defined $ENV{'GS3_AUTHENTICATED'} && $ENV{'GS3_AUTHENTICATED'} eq "true")) {
61 require modmetadataaction;
62 }
63 else {
64 $modmeta_action_table = {};
65 }
66}
67
68@metadataaction::ISA = ('baseaction');
69
70
71my $getmeta_action_table =
72{
73 # unused and untested
74 # when DocEdit=1, need to retrieve a doc's full text (or doc section's full text) from archives
75 "get-archives-text" => {
76 'compulsory-args' => [ "d" ],
77 'optional-args' => [ "dv" ] },
78 #'compulsory-args' => [ "d" ],
79 #'optional-args' => [ "section" ] },
80
81 #GET METHODS
82 "get-import-metadata" => {
83 'compulsory-args' => [ "d", "metaname" ],
84 'optional-args' => [ "metapos" ] },
85
86 "get-archives-metadata" => {
87 'compulsory-args' => [ "d", "metaname" ],
88 'optional-args' => [ "dv", "metapos" ] },
89
90 "get-index-metadata" => {
91 'compulsory-args' => [ "d", "metaname" ],
92 'optional-args' => [ "metapos" ] },
93
94 "get-metadata" => { # alias for get-index-metadata
95 'compulsory-args' => [ "d", "metaname" ],
96 'optional-args' => [ "metapos" ] },
97
98 "get-live-metadata" => {
99 'compulsory-args' => [ "d", "metaname" ],
100 'optional-args' => [ ] },
101
102 "get-metadata-array" => { # where param can be ONE of: index (default), import, archives, live
103 'compulsory-args' => [ "json" ],
104 'optional-args' => [ "where" ],
105 'help-string' => [
106 'metadata-server.pl?a=get-metadata-array&c=demo&where=index&json=[{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metapos":"all"},{"metaname":"usertimestamp","metapos":"all"}, {"metaname":"usercomment","metapos":"all"}]}]'
107 ]}
108};
109
110# To get the final action_table of all available subroutines in this class,
111# merge the get- and mod-metadata hashes. See http://www.perlmonks.org/?node_id=524456
112# Note that modmeta_action_table will be empty of subroutines if the user does not have permissions
113# to modify metadata.
114my $action_table = { %$getmeta_action_table, %$modmeta_action_table };
115
116
117sub new
118{
119 my $class = shift (@_);
120 my ($gsdl_cgi,$iis6_mode) = @_;
121
122 # Treat metavalue specially. To transmit this through a GET request
123 # the Javascript side has url-encoded it, so here we need to decode
124 # it before proceeding
125
126 my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
127 my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
128#$gsdl_cgi->generate_message("@@@ metaaction new - DEBUG before utf82unicode: " . &unicode::debug_unicode_string($url_decoded_metavalue));
129
130 my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);
131
132 $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
133
134#$gsdl_cgi->generate_message("@@@ metaaction new - DEBUG after utf82unicode: " . &unicode::debug_unicode_string($url_decoded_metavalue));
135
136 $gsdl_cgi->param("metavalue",$url_decoded_metavalue);
137
138 # need to do the same with prevmetavalue
139 my $url_encoded_prevmetavalue = $gsdl_cgi->param("prevmetavalue");
140 my $url_decoded_prevmetavalue = &unicode::url_decode($url_encoded_prevmetavalue,1);
141 my $prevunicode_array = &unicode::utf82unicode($url_decoded_prevmetavalue);
142
143 $url_decoded_prevmetavalue = join("",map(chr($_),@$prevunicode_array));
144 $gsdl_cgi->param("prevmetavalue",$url_decoded_prevmetavalue);
145
146 my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);
147
148 return bless $self, $class;
149}
150
151
152sub get_live_metadata
153{
154 my $self = shift @_;
155
156 my $username = $self->{'username'};
157 my $collect = $self->{'collect'};
158 my $gsdl_cgi = $self->{'gsdl_cgi'};
159 #my $gsdlhome = $self->{'gsdlhome'};
160 my $infodbtype = $self->{'infodbtype'};
161
162 # live metadata gets/saves value scoped (prefixed) by the current usename
163 # so (for now) let's not bother to enforce authentication
164
165 # Obtain the collect dir
166 my $site = $self->{'site'};
167 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
168 ## my $collect_dir = &util::filename_cat($gsdlhome, "collect");
169
170 # No locking collection when getting metadata, only when setting it
171# $self->lock_collection($username, $collect); # Make sure the collection isn't locked by someone else
172
173 # look up additional args
174 my $docid = $self->{'d'};
175 if ((!defined $docid) || ($docid =~ m/^\s*$/)) {
176 $gsdl_cgi->generate_error("No docid (d=...) specified.");
177 }
178
179 # Generate the dbkey
180 my $metaname = $self->{'metaname'};
181 my $dbkey = "$docid.$metaname";
182
183 # To people who know $collect_tail please add some comments
184 # Obtain path to the database
185 my $collect_tail = $collect;
186 $collect_tail =~ s/^.*[\/|\\]//;
187 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
188 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, "live-$collect_tail", $index_text_directory);
189
190 # Obtain the content of the key
191 my $cmd = "gdbmget $infodb_file_path $dbkey";
192 if (open(GIN,"$cmd |") == 0) {
193 # Catch error if gdbmget failed
194 my $mess = "Failed to get metadata key: $metaname\n";
195 $mess .= "$!\n";
196
197 $gsdl_cgi->generate_error($mess);
198 }
199 else {
200 binmode(GIN,":utf8");
201 # Read everything in and concatenate them into $metavalue
202 my $metavalue = "";
203 my $line;
204 while (defined ($line=<GIN>)) {
205 $metavalue .= $line;
206 }
207 close(GIN);
208 chomp($metavalue); # Get rid off the tailing newlines
209 $gsdl_cgi->generate_ok_message("$metavalue");
210 }
211
212 # Release the lock once it is done
213# $self->unlock_collection($username, $collect);
214}
215
216# just calls the index version
217sub get_metadata
218{
219 my $self = shift @_;
220 $self->get_index_metadata(@_);
221}
222
223# JSON version that will get the requested metadata values
224# from the requested source (index, import, archives or live)
225# One of the params is a JSON string and the return value is JSON too
226# http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
227sub get_metadata_array
228{
229 my $self = shift @_;
230
231 my $where = $self->{'where'};
232 if (!$where || ($where =~ m/^\s*$/)) { # 0, "0", "" and undef are all false. All else is true.
233 # What is truth in perl: http://www.berkeleyinternet.com/perl/node11.html
234 # and http://www.perlmonks.org/?node_id=33638
235
236 $where = "index"; # default behaviour is to get the values from index
237 }
238
239 # Only when setting metadata do we perform authentication and do we lock the collection,
240 # not when getting metadata
241
242 # for get_meta_array, the where param can only be ONE of import, archives, index, live
243 if($where =~ m/index/) {
244 $self->_get_index_metadata_array(@_);
245 }
246 elsif($where =~ m/archives/) {
247 $self->_get_archives_metadata_array(@_);
248 }
249 elsif($where =~ m/import/) {
250 $self->_get_import_metadata_array(@_);
251 }
252 elsif($where =~ m/live/) {
253 $self->_get_live_metadata_array(@_);
254 }
255}
256
257# Unused at present. Added for completion. Tested.
258sub _get_import_metadata_array {
259
260 my $self = shift @_;
261
262 my $collect = $self->{'collect'};
263 my $gsdl_cgi = $self->{'gsdl_cgi'};
264 my $site = $self->{'site'};
265 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
266
267 # look up additional args
268 my $infodbtype = $self->{'infodbtype'};
269
270 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
271 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
272 my $json_str = $self->{'json'};
273 my $doc_array = decode_json $json_str;
274
275 my $json_result_str = "[";
276 my $first_doc_rec = 1;
277 foreach my $doc_array_rec ( @$doc_array ) {
278
279 my $docid = $doc_array_rec->{'docid'}; # no subsection metadata support in metadata.xml, only toplevel meta
280
281 if($first_doc_rec) {
282 $first_doc_rec = 0;
283 } else {
284 $json_result_str .= ",";
285 }
286 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
287
288 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
289 $json_result_str = $json_result_str . ",\"metatable\":[";
290
291 my $first_rec = 1;
292 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
293
294 # Read the docid entry
295 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
296 # This now stores the full pathname
297 my $import_filename = $doc_rec->{'src-file'}->[0];
298 $import_filename = &util::placeholders_to_abspath($import_filename);
299
300 # figure out correct metadata.xml file [?]
301 # Assuming the metadata.xml file is next to the source file
302 # Note: This will not work if it is using the inherited metadata from the parent folder
303 my ($import_tailname, $import_dirname) = File::Basename::fileparse($import_filename);
304 my $metadata_xml_filename = &util::filename_cat($import_dirname, "metadata.xml");
305
306
307 if($first_rec) {
308 $first_rec = 0;
309 } else {
310 $json_result_str .= ",";
311 }
312
313 my $metaname = $metatable_rec->{'metaname'};
314 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
315
316 my $metapos = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
317 if(!defined $metapos) {
318 $metapos = 0;
319 }
320
321 # Obtain the specified metadata value(s)
322 my $metavalue;
323
324 if($metapos ne "all") { # get the value at a single metapos
325 $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
326
327 #print STDERR "**** Metafilename, metaname, metapos, sec_num: $metadata_xml_filename, $metaname, $metapos, $import_tailname\n";
328
329 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
330
331 } else {
332 my $first_metaval = 1;
333 $metapos = 0;
334 $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
335
336 while (defined $metavalue && $metavalue ne "") {
337 if($first_metaval) {
338 $first_metaval = 0;
339 } else {
340 $json_result_str .= ",";
341 }
342
343 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
344
345 $metapos++;
346 $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
347 }
348 }
349
350 $json_result_str .= "]}"; # close metavals array and metatable record
351 }
352
353 $json_result_str .= "]}"; # close metatable array and docid record
354 }
355
356 $json_result_str .= "]"; # close array of docids
357 $gsdl_cgi->generate_ok_message($json_result_str."\n");
358}
359
360# Unused method, but included for completion. Tested, works. Takes a JSON string and returns a JSON string.
361# For more information on the format of the output, see _get_index_metadata_array, which is in use.
362sub _get_archives_metadata_array {
363
364 my $self = shift @_;
365
366 my $collect = $self->{'collect'};
367 my $gsdl_cgi = $self->{'gsdl_cgi'};
368 my $site = $self->{'site'};
369 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
370
371 # look up additional args
372 my $infodbtype = $self->{'infodbtype'};
373
374 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
375 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
376
377 my $json_str = $self->{'json'};
378 my $doc_array = decode_json $json_str;
379
380 my $json_result_str = "[";
381 my $first_doc_rec = 1;
382 foreach my $doc_array_rec ( @$doc_array ) {
383
384 my $docid = $doc_array_rec->{'docid'};
385
386 if($first_doc_rec) {
387 $first_doc_rec = 0;
388 } else {
389 $json_result_str .= ",";
390 }
391 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
392
393 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
394 $json_result_str = $json_result_str . ",\"metatable\":[";
395
396 my $first_rec = 1;
397 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
398
399 my ($docid, $docid_secnum) = ($doc_array_rec->{'docid'} =~ m/^(.*?)(\..*)?$/);
400 $docid_secnum = "" if (!defined $docid_secnum);
401
402 # Read the docid entry
403 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
404 # This now stores the full pathname
405 my $doc_file = $doc_rec->{'doc-file'}->[0];
406
407 # check if request if for file-level doc-version history 'nminus-<n>' version
408 my $dv = $self->{'dv'};
409 if (defined $dv && ($dv ne "")) {
410 # Need to insert '_fldv_history/nminus-<n>' into doc_filename
411
412 my ($doc_tailname, $doc_dirname) = File::Basename::fileparse($doc_file);
413 $doc_file = &util::filename_cat($doc_dirname,$FLDV_HISTORY_DIR,$dv,$doc_tailname);
414 }
415
416 my $doc_filename = &util::filename_cat($archive_dir, $doc_file);
417
418 if($first_rec) {
419 $first_rec = 0;
420 } else {
421 $json_result_str .= ",";
422 }
423
424 my $metaname = $metatable_rec->{'metaname'};
425 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
426
427 my $metapos = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
428 if(!defined $metapos) {
429 $metapos = 0;
430 }
431
432
433 # Obtain the specified metadata value(s)
434 my $metavalue;
435
436 if($metapos ne "all") { # get the value at a single metapos
437
438 $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
439 #print STDERR "**** Docname, metaname, metapos, sec_num: $doc_filename, $metaname, $metapos, $docid_secnum\n";
440
441 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
442
443 } else {
444 my $first_metaval = 1;
445 $metapos = 0;
446 $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
447
448 while (defined $metavalue && $metavalue ne "") {
449 if($first_metaval) {
450 $first_metaval = 0;
451 } else {
452 $json_result_str .= ",";
453 }
454
455 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
456
457 $metapos++;
458 $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
459 }
460 }
461
462 $json_result_str .= "]}"; # close metavals array and metatable record
463 }
464
465 $json_result_str .= "]}"; # close metatable array and docid record
466 }
467
468 $json_result_str .= "]"; # close array of docids
469 $gsdl_cgi->generate_ok_message($json_result_str."\n");
470}
471
472
473# Unused at present. Added for completion. Tested, but not sure if it retrieves metadata in the manner it's expected to.
474sub _get_live_metadata_array
475{
476 my $self = shift @_;
477
478 my $collect = $self->{'collect'};
479 my $gsdl_cgi = $self->{'gsdl_cgi'};
480 my $site = $self->{'site'};
481 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
482
483 # look up additional args
484 my $infodbtype = $self->{'infodbtype'};
485
486 # To people who know $collect_tail please add some comments
487 # Obtain the path to the database
488 my $collect_tail = $collect;
489 $collect_tail =~ s/^.*[\/|\\]//;
490 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
491 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, "live-$collect_tail", $index_text_directory);
492
493 my $json_str = $self->{'json'};
494 my $doc_array = decode_json $json_str;
495
496 my $json_result_str = "[";
497 my $first_doc_rec = 1;
498
499 foreach my $doc_array_rec ( @$doc_array ) {
500
501 my $docid = $doc_array_rec->{'docid'};
502
503 if($first_doc_rec) {
504 $first_doc_rec = 0;
505 } else {
506 $json_result_str .= ",";
507 }
508 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
509
510 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
511 $json_result_str = $json_result_str . ",\"metatable\":[";
512
513 my $first_rec = 1;
514 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
515 if($first_rec) {
516 $first_rec = 0;
517 } else {
518 $json_result_str .= ",";
519 }
520
521 my $metaname = $metatable_rec->{'metaname'};
522 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
523
524 # Generate the dbkey
525 my $dbkey = "$docid.$metaname";
526
527 # metapos for get_live_metadata is always assumed to be "all".
528 # It's always going to get all the lines of metavalues associated with a metaname
529 # (It's the metaname itself that should contain an increment number, if there are to be multiple values).
530 #my $metapos = "all";
531 my $metapos = $metatable_rec->{'metapos'} || 0; # Can be 0... 1|all|undefined. Defaults to 0 if undefined/false
532 my $metavalue = "";
533
534 # Obtain the content of the key
535 my $cmd = "gdbmget $infodb_file_path $dbkey";
536 if (open(GIN,"$cmd |") != 0) { # Success.
537
538 binmode(GIN,":utf8");
539 # Read everything in and concatenate them into $metavalue
540 my $line;
541 my $first_metaval = 1;
542 my $pos = 0;
543 while (defined ($line=<GIN>)) {
544 chomp($line); # Get rid off the tailing newlines
545
546 if($metapos eq "all") {
547 if($first_metaval) {
548 $first_metaval = 0;
549 } else {
550 $json_result_str .= ",";
551 }
552 $metavalue = $line;
553 $json_result_str .= "{\"metapos\":\"$pos\",\"metavalue\":\"$metavalue\"}";
554 } elsif($metapos == $pos) {
555 $metavalue = $line;
556 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
557 last;
558 } # else, the current $pos is not the required $metapos
559 $pos += 1;
560 }
561 close(GIN);
562 } # else open cmd == 0 (failed) and metavals array will be empty [] for this metaname
563
564 $json_result_str .= "]}"; # close metavals array and metatable record
565 }
566
567 $json_result_str .= "]}"; # close metatable array and docid record
568 }
569
570 $json_result_str .= "]"; # close array of docids
571
572 $gsdl_cgi->generate_ok_message($json_result_str."\n");
573}
574
575
576# Takes a JSON string and returns a JSON string
577# Request string is of the form:
578# http://localhost:8283/greenstone/cgi-bin/metadata-server.pl?a=get-metadata-array&c=demo&where=index&json=[{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metapos":"all"},{"metaname":"usertimestamp","metapos":"all"}, {"metaname":"usercomment","metapos":"all"}]}]
579# Resulting string is of the form:
580# [{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metavals":[{"metapos":"0","metavalue":"me"},{"metapos":"1","metavalue":"admin"}]},{"metaname":"usertimestamp","metavals":[{"metapos":"0","metavalue":"1367900586888"},{"metapos":"1","metavalue":"1367900616574"}]},{"metaname":"usercomment","metavals":[{"metapos":"0","metavalue":"Hi"},{"metapos":"1","metavalue":"Hello"}]}]}]
581sub _get_index_metadata_array
582{
583 my $self = shift @_;
584
585 my $collect = $self->{'collect'};
586 my $gsdl_cgi = $self->{'gsdl_cgi'};
587 my $site = $self->{'site'};
588 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
589
590 # look up additional args
591 my $infodbtype = $self->{'infodbtype'};
592
593 # To people who know $collect_tail please add some comments
594 # Obtain the path to the database
595 my $collect_tail = $collect;
596 $collect_tail =~ s/^.*[\/|\\]//;
597 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
598 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collect_tail, $index_text_directory);
599
600 my $json_str = $self->{'json'};
601 my $doc_array = decode_json $json_str;
602
603 my $json_result_str = "[";
604 my $first_doc_rec = 1;
605
606 foreach my $doc_array_rec ( @$doc_array ) {
607
608 my $docid = $doc_array_rec->{'docid'};
609
610 if($first_doc_rec) {
611 $first_doc_rec = 0;
612 } else {
613 $json_result_str .= ",";
614 }
615 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
616
617 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
618 $json_result_str = $json_result_str . ",\"metatable\":[";
619
620 my $first_rec = 1;
621 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
622 if($first_rec) {
623 $first_rec = 0;
624 } else {
625 $json_result_str .= ",";
626 }
627
628 my $metaname = $metatable_rec->{'metaname'};
629 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
630
631 my $metapos = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
632 if(!defined $metapos) {
633 $metapos = 0;
634 }
635
636 # Read the docid entry
637 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $infodb_file_path, $docid);
638
639 # Basically loop through and unescape_html the values
640 foreach my $k (keys %$doc_rec) {
641 my @escaped_v = ();
642 foreach my $v (@{$doc_rec->{$k}}) {
643 my $ev = &ghtml::unescape_html($v);
644 push(@escaped_v, $ev);
645 }
646 $doc_rec->{$k} = \@escaped_v;
647 }
648
649 # Obtain the specified metadata value(s)
650 my $metavalue;
651
652 if($metapos ne "all") { # get the value at a single metapos
653
654 $metavalue = $doc_rec->{$metaname}->[$metapos];
655
656 # protect any double quotes and colons in the metavalue before putting it into JSON
657 $metavalue =~ s/\"/&quot;/g if defined $metavalue;
658 $metavalue =~ s/\:/&58;/g if defined $metavalue;
659
660 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
661
662 } else {
663 my $first_metaval = 1;
664 $metapos = 0;
665 $metavalue = $doc_rec->{$metaname}->[$metapos];
666
667 while (defined $metavalue) {
668
669 # protect any double quotes and colons in the metavalue before putting it into JSON
670 $metavalue =~ s/\"/&quot;/g;
671 $metavalue =~ s/\:/&58;/g;
672
673 if($first_metaval) {
674 $first_metaval = 0;
675 } else {
676 $json_result_str .= ",";
677 }
678
679 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
680
681 $metapos++;
682 $metavalue = $doc_rec->{$metaname}->[$metapos];
683 }
684 }
685
686 $json_result_str .= "]}"; # close metavals array and metatable record
687 }
688
689 $json_result_str .= "]}"; # close metatable array and docid record
690 }
691
692 $json_result_str .= "]"; # close array of docids
693
694 $gsdl_cgi->generate_ok_message($json_result_str."\n");
695}
696
697
698sub get_index_metadata
699{
700 my $self = shift @_;
701
702 my $username = $self->{'username'};
703 my $collect = $self->{'collect'};
704 my $gsdl_cgi = $self->{'gsdl_cgi'};
705 #my $gsdlhome = $self->{'gsdlhome'};
706
707 # Obtain the collect dir
708 my $site = $self->{'site'};
709 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
710 ##my $collect_dir = &util::filename_cat($gsdlhome, "collect");
711
712 # look up additional args
713 my $docid = $self->{'d'};
714 my $metaname = $self->{'metaname'};
715 my $metapos = $self->{'metapos'};
716 my $infodbtype = $self->{'infodbtype'};
717
718 # To people who know $collect_tail please add some comments
719 # Obtain path to the database
720 my $collect_tail = $collect;
721 $collect_tail =~ s/^.*[\/|\\]//;
722 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
723 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collect_tail, $index_text_directory);
724
725 # Read the docid entry
726 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $infodb_file_path, $docid);
727
728 # Basically loop through and unescape_html the values
729 foreach my $k (keys %$doc_rec) {
730 my @escaped_v = ();
731 foreach my $v (@{$doc_rec->{$k}}) {
732 my $ev = &ghtml::unescape_html($v);
733 push(@escaped_v, $ev);
734 }
735 $doc_rec->{$k} = \@escaped_v;
736 }
737
738 # Obtain the specified metadata value
739 $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/));
740 my $metavalue = $doc_rec->{$metaname}->[$metapos];
741 $gsdl_cgi->generate_ok_message("$metavalue");
742
743}
744
745
746sub get_import_metadata
747{
748 my $self = shift @_;
749
750 my $username = $self->{'username'};
751 my $collect = $self->{'collect'};
752 my $gsdl_cgi = $self->{'gsdl_cgi'};
753 #my $gsdlhome = $self->{'gsdlhome'};
754
755 # Obtain the collect dir
756 my $site = $self->{'site'};
757 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
758 ##my $collect_dir = &util::filename_cat($gsdlhome, "collect");
759
760 # look up additional args
761 my $docid = $self->{'d'};
762 my $metaname = $self->{'metaname'};
763 my $metapos = $self->{'metapos'};
764 $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/)); # gets the first value by default since metapos defaults to 0
765
766 my $infodbtype = $self->{'infodbtype'};
767 if (!defined $docid)
768 {
769 $gsdl_cgi->generate_error("No docid (d=...) specified.\n");
770 }
771
772 # Obtain where the metadata.xml is from the archiveinfo-doc.gdb file
773 # If the doc oid is not specified, we assume the metadata.xml is next to the specified "f"
774 my $metadata_xml_file;
775 my $import_filename = undef;
776
777
778 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
779 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
780 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
781
782 # This now stores the full pathname
783 $import_filename = $doc_rec->{'src-file'}->[0];
784 $import_filename = &util::placeholders_to_abspath($import_filename);
785
786 # figure out correct metadata.xml file [?]
787 # Assuming the metadata.xml file is next to the source file
788 # Note: This will not work if it is using the inherited metadata from the parent folder
789 my ($import_tailname, $import_dirname) = File::Basename::fileparse($import_filename);
790 my $metadata_xml_filename = &util::filename_cat($import_dirname, "metadata.xml");
791
792 $gsdl_cgi->generate_ok_message($self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname));
793
794}
795
796sub get_metadata_from_metadata_xml
797{
798 my $self = shift @_;
799 my ($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $src_file) = @_;
800
801 my @rules =
802 (
803 _default => 'raw',
804 'Metadata' => \&gfmxml_metadata,
805 'FileName' => \&mxml_filename
806 );
807
808 my $parser = XML::Rules->new
809 (
810 rules => \@rules,
811 output_encoding => 'utf8'
812 );
813
814 my $xml_in = "";
815 if (!open(MIN,"<$metadata_xml_filename"))
816 {
817 $gsdl_cgi->generate_error("Unable to read in $metadata_xml_filename: $!");
818 }
819 else
820 {
821 # Read them in
822 my $line;
823 while (defined ($line=<MIN>)) {
824 $xml_in .= $line;
825 }
826 close(MIN);
827
828 $parser->parse($xml_in, {metaname => $metaname, metapos => $metapos, src_file => $src_file});
829
830 if(defined $parser->{'pad'}->{'metavalue'})
831 {
832 return $parser->{'pad'}->{'metavalue'};
833 }
834 else
835 {
836 return "";
837 }
838 }
839}
840
841sub gfmxml_metadata
842{
843 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
844
845 # no subsection support yet in metadata.xml
846
847 if (($parser->{'parameters'}->{'src_file'} eq $parser->{'parameters'}->{'current_file'}) && $parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'})
848 {
849 if (!defined $parser->{'parameters'}->{'poscount'})
850 {
851 $parser->{'parameters'}->{'poscount'} = 0;
852 }
853 else
854 {
855 $parser->{'parameters'}->{'poscount'}++;
856 }
857
858 # gets the first value by default, since metapos defaults to 0
859 if (($parser->{'parameters'}->{'poscount'} == $parser->{'parameters'}->{'metapos'}))
860 {
861 if($parser->{'parameters'}->{'metapos'} > 0) {
862 print STDERR "@@@@ WARNING: non-zero metapos.\n";
863 print STDERR "@@@@ Assuming SIMPLE collection and proceeding to retrieve the import meta at position: ".$parser->{'parameters'}->{'metapos'}.".\n";
864 }
865 $parser->{'pad'}->{'metavalue'} = $attrHash->{'_content'};
866 }
867 }
868}
869
870sub mxml_filename
871{
872 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
873
874 # Store the filename of the Current Fileset
875 # Note: According to http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd
876 # FileName tag must come before Description tag
877 $parser->{'parameters'}->{'current_file'} = $attrHash->{'_content'};
878
879 # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
880 return [$tagname => $attrHash];
881}
882
883sub get_archives_metadata
884{
885 my $self = shift @_;
886
887 my $username = $self->{'username'};
888 my $collect = $self->{'collect'};
889 my $gsdl_cgi = $self->{'gsdl_cgi'};
890 my $infodbtype = $self->{'infodbtype'};
891
892 # Obtain the collect dir
893 my $site = $self->{'site'};
894 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
895
896 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
897
898 # look up additional args
899 my ($docid, $docid_secnum) = ($self->{'d'} =~ m/^(.*?)(\..*)?$/);
900 $docid_secnum = "" if (!defined $docid_secnum);
901
902 my $metaname = $self->{'metaname'};
903 my $metapos = $self->{'metapos'};
904 $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/));
905
906 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
907 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
908
909 # This now stores the full pathname
910 my $doc_file = $doc_rec->{'doc-file'}->[0];
911
912 # check if request if for file-level doc-version history 'nminus-<n>' version
913 my $dv = $self->{'dv'};
914 if (defined $dv && ($dv ne "")) {
915 # Need to insert '_fldv_history/nminus-<n>' into doc_filename
916
917 my ($doc_tailname, $doc_dirname) = File::Basename::fileparse($doc_file);
918 $doc_file = &util::filename_cat($doc_dirname,$FLDV_HISTORY_DIR,$dv,$doc_tailname);
919 }
920
921 my $doc_filename = &util::filename_cat($archive_dir, $doc_file);
922
923 $gsdl_cgi->generate_ok_message($self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum));
924
925}
926
927# unused and untested
928sub get_archives_text
929{
930 my $self = shift @_;
931
932 my $username = $self->{'username'};
933 my $collect = $self->{'collect'};
934 my $gsdl_cgi = $self->{'gsdl_cgi'};
935 my $infodbtype = $self->{'infodbtype'};
936
937 # Obtain the collect dir
938 my $site = $self->{'site'};
939 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
940
941 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
942
943 # look up additional args
944 my ($docid, $docid_secnum) = ($self->{'d'} =~ m/^(.*?)(\..*)?$/);
945 $docid_secnum = "" if (!defined $docid_secnum);
946
947 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
948 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
949
950 # This now stores the full pathname
951 my $doc_file = $doc_rec->{'doc-file'}->[0];
952
953 # check if request if for file-level doc-version history 'nminus-<n>' version
954 my $dv = $self->{'dv'};
955 if (defined $dv && ($dv ne "")) {
956 # Need to insert '_fldv_history/nminus-<n>' into doc_filename
957
958 my ($doc_tailname, $doc_dirname) = File::Basename::fileparse($doc_file);
959 $doc_file = &util::filename_cat($doc_dirname,$FLDV_HISTORY_DIR,$dv,$doc_tailname);
960 }
961
962 my $doc_filename = &util::filename_cat($archive_dir, $doc_file);
963
964 my $metaname = undef;
965 my $metapos = -1;
966
967 $gsdl_cgi->generate_ok_message($self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum));
968
969}
970
971sub get_metadata_from_archive_xml
972{
973 my $self = shift @_;
974 my ($gsdl_cgi, $doc_xml_filename, $metaname, $metapos, $secid) = @_;
975
976 my @start_rules = ('Section' => \&dxml_start_section);
977
978 my @rules =
979 (
980 _default => 'raw',
981 'Metadata' => \&gfdxml_metadata
982 );
983
984 my $parser = XML::Rules->new
985 (
986 start_rules => \@start_rules,
987 rules => \@rules,
988 output_encoding => 'utf8'
989 );
990
991 my $xml_in = "";
992 if (!open(MIN,"<$doc_xml_filename"))
993 {
994 $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
995 }
996 else
997 {
998 # Read them in
999 my $line;
1000 while (defined ($line=<MIN>)) {
1001 $xml_in .= $line;
1002 }
1003 close(MIN);
1004
1005 $parser->parse($xml_in, {metaname => $metaname, metapos => $metapos, secid => $secid});
1006
1007 if(defined $parser->{'pad'}->{'metavalue'})
1008 {
1009 return $parser->{'pad'}->{'metavalue'};
1010 }
1011 else
1012 {
1013 return "";
1014 }
1015 }
1016}
1017
1018# unused and untested
1019sub get_text_from_archive_xml
1020{
1021 my $self = shift @_;
1022 my ($gsdl_cgi, $doc_xml_filename, $secid) = @_;
1023
1024 # To monitor which section/subsection number we are in
1025 my @start_rules = ('Section' => \&dxml_start_section);
1026
1027 # set the callback functions for the elements in doc.xml we're interested in, <Content>
1028 my @rules =
1029 (
1030 _default => 'raw',
1031 'Content' => \&gfdxml_text # gfdxml = get from doc xml?
1032 );
1033
1034 my $parser = XML::Rules->new
1035 (
1036 start_rules => \@start_rules,
1037 rules => \@rules,
1038 output_encoding => 'utf8'
1039 );
1040
1041 my $xml_in = "";
1042 if (!open(MIN,"<$doc_xml_filename"))
1043 {
1044 $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
1045 }
1046 else
1047 {
1048 # Read them in
1049 my $line;
1050 while (defined ($line=<MIN>)) {
1051 $xml_in .= $line;
1052 }
1053 close(MIN);
1054
1055 $parser->parse($xml_in, {secid => $secid});
1056
1057 if(defined $parser->{'pad'}->{'textcontent'})
1058 {
1059 return $parser->{'pad'}->{'textcontent'};
1060 }
1061 else
1062 {
1063 return "";
1064 }
1065 }
1066}
1067
1068sub gfdxml_metadata
1069{
1070 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1071
1072 if(!($parser->{'parameters'}->{'secid'} eq $parser->{'parameters'}->{'curr_section_num'}))
1073 {
1074 return;
1075 }
1076
1077 if ($parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'})
1078 {
1079 if (!defined $parser->{'parameters'}->{'poscount'})
1080 {
1081 $parser->{'parameters'}->{'poscount'} = 0;
1082 }
1083 else
1084 {
1085 $parser->{'parameters'}->{'poscount'}++;
1086 }
1087 }
1088
1089 if (($parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'}) && ($parser->{'parameters'}->{'poscount'} == $parser->{'parameters'}->{'metapos'}))
1090 {
1091 $parser->{'pad'}->{'metavalue'} = $attrHash->{'_content'};
1092 }
1093}
1094
1095# unused and untested - for get_archives_text
1096sub gfdxml_text
1097{
1098 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1099
1100 if($parser->{'parameters'}->{'secid'} eq $parser->{'parameters'}->{'curr_section_num'})
1101 {
1102 $parser->{'pad'}->{'textcontent'} = $attrHash->{'_content'}; # the textnode content
1103
1104 }
1105 else {
1106 return;
1107 }
1108}
1109
1110sub dxml_start_section
1111{
1112 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1113
1114 my $new_depth = scalar(@$contextArray);
1115
1116# print STDERR "**** START SECTION \n";
1117
1118 if ($new_depth == 1) {
1119 $parser->{'parameters'}->{'curr_section_depth'} = 1;
1120 $parser->{'parameters'}->{'curr_section_num'} = "";
1121 }
1122
1123 my $old_depth = $parser->{'parameters'}->{'curr_section_depth'};
1124 my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};
1125
1126 my $new_secnum;
1127
1128 if ($new_depth > $old_depth) {
1129 # child subsection
1130 $new_secnum = "$old_secnum.1";
1131 }
1132 elsif ($new_depth == $old_depth) {
1133 # sibling section => increase its value by 1
1134 my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
1135 $tail_num++;
1136 $new_secnum = $old_secnum;
1137 $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
1138 }
1139 else {
1140 # back up to parent section => lop off tail
1141 $new_secnum = $old_secnum;
1142 $new_secnum =~ s/\.\d+$//;
1143 # remember to increment from parent section
1144 my ($tail_num) = ($new_secnum =~ m/\.(\d+)$/);
1145 $tail_num++;
1146 $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
1147 }
1148
1149 $parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
1150 $parser->{'parameters'}->{'curr_section_num'} = $new_secnum;
1151
1152 #print STDERR "**** START SECTION - $new_secnum\n************************\n\n";
1153 1;
1154}
1155
11561;
Note: See TracBrowser for help on using the repository browser.