source: main/trunk/greenstone2/perllib/cgiactions/metadataaction.pm@ 32352

Last change on this file since 32352 was 32062, checked in by ak19, 7 years ago

Committing unused and untested new subroutines, in case they may prove useful in future.

File size: 34.7 KB
RevLine 
[28159]1##########################################################################
[19293]2#
3# metadataaction.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2009 New Zealand Digital Library Project
9#
[29099]10# This program is free software; you can redistribute it and/or modify
[19293]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package metadataaction;
27
28use strict;
29
30use cgiactions::baseaction;
31
[21551]32use dbutil;
[19499]33use ghtml;
[19293]34
[24071]35use JSON;
[21563]36
[31617]37# This class is conditionally expanded with set-metadata, remove-metadata and insert-metadata subroutines
38# defined in modmetadataaction.pm. The BEGIN code block determines whether the condition holds.
39# See
[31602]40# http://stackoverflow.com/questions/3998619/what-is-the-role-of-the-begin-block-in-perl
41# http://www.perlmonks.org/?node_id=881761 - splitting module into multiple files
42# http://www.perlmonks.org/?node_id=524456 - merging hashes
[24071]43
[31602]44our $modmeta_action_table; # don't init to empty hash here, else it will overwrite whatever BEGIN sets this to
45 # see http://stackoverflow.com/questions/3998619/what-is-the-role-of-the-begin-block-in-perl
46
[19293]47BEGIN {
[22331]48# unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan/perl-5.8");
[19293]49 require XML::Rules;
[31602]50
51 # if we're GS3, then GS3_AUTHENTICATED must be defined and set to true
52 # in order to have access to subroutines that modify metadata (the set-
53 # and remove- metadata subroutines).
54 # TODO: if we're GS2, then we continue to behave as before?
55
56 if(!defined $ENV{'GSDL3HOME'} || (defined $ENV{'GS3_AUTHENTICATED'} && $ENV{'GS3_AUTHENTICATED'} eq "true")) {
57 require modmetadataaction;
58 }
59 else {
60 $modmeta_action_table = {};
61 }
[19293]62}
63
64@metadataaction::ISA = ('baseaction');
65
[31602]66
67my $getmeta_action_table =
[25097]68{
[32062]69 # unused and untested
70 # when DocEdit=1, need to retrieve a doc's full text (or doc section's full text) from archives
71 "get-archives-text" => {
72 'compulsory-args' => [ "d" ] },
73 #'compulsory-args' => [ "d" ],
74 #'optional-args' => [ "section" ] },
75
[25097]76 #GET METHODS
77 "get-import-metadata" => {
78 'compulsory-args' => [ "d", "metaname" ],
[27176]79 'optional-args' => [ "metapos" ] },
[19499]80
[25097]81 "get-archives-metadata" => {
82 'compulsory-args' => [ "d", "metaname" ],
83 'optional-args' => [ "metapos" ] },
84
[27157]85 "get-index-metadata" => {
[25097]86 'compulsory-args' => [ "d", "metaname" ],
87 'optional-args' => [ "metapos" ] },
[19499]88
[27157]89 "get-metadata" => { # alias for get-index-metadata
90 'compulsory-args' => [ "d", "metaname" ],
91 'optional-args' => [ "metapos" ] },
92
[25097]93 "get-live-metadata" => {
94 'compulsory-args' => [ "d", "metaname" ],
95 'optional-args' => [ ] },
[19499]96
[27312]97 "get-metadata-array" => { # where param can be ONE of: index (default), import, archives, live
98 'compulsory-args' => [ "json" ],
99 'optional-args' => [ "where" ],
100 'help-string' => [
101 'metadata-server.pl?a=get-metadata-array&c=demo&where=index&json=[{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metapos":"all"},{"metaname":"usertimestamp","metapos":"all"}, {"metaname":"usercomment","metapos":"all"}]}]'
[31602]102 ]}
103};
[27312]104
[31617]105# To get the final action_table of all available subroutines in this class,
106# merge the get- and mod-metadata hashes. See http://www.perlmonks.org/?node_id=524456
107# Note that modmeta_action_table will be empty of subroutines if the user does not have permissions
108# to modify metadata.
[31602]109my $action_table = { %$getmeta_action_table, %$modmeta_action_table };
[19499]110
[27261]111
[19293]112sub new
113{
114 my $class = shift (@_);
115 my ($gsdl_cgi,$iis6_mode) = @_;
116
[23761]117 # Treat metavalue specially. To transmit this through a GET request
118 # the Javascript side has url-encoded it, so here we need to decode
119 # it before proceeding
120
121 my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
122 my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
123 my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);
124
125 $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
126 $gsdl_cgi->param("metavalue",$url_decoded_metavalue);
127
[29086]128 # need to do the same with prevmetavalue
129 my $url_encoded_prevmetavalue = $gsdl_cgi->param("prevmetavalue");
130 my $url_decoded_prevmetavalue = &unicode::url_decode($url_encoded_prevmetavalue,1);
131 my $prevunicode_array = &unicode::utf82unicode($url_decoded_prevmetavalue);
132
133 $url_decoded_prevmetavalue = join("",map(chr($_),@$prevunicode_array));
134 $gsdl_cgi->param("prevmetavalue",$url_decoded_prevmetavalue);
135
[19293]136 my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);
137
138 return bless $self, $class;
139}
140
141
142sub get_live_metadata
143{
144 my $self = shift @_;
145
146 my $username = $self->{'username'};
147 my $collect = $self->{'collect'};
148 my $gsdl_cgi = $self->{'gsdl_cgi'};
149 my $gsdlhome = $self->{'gsdlhome'};
[23478]150 my $infodbtype = $self->{'infodbtype'};
[27180]151
[23447]152 # live metadata gets/saves value scoped (prefixed) by the current usename
[23761]153 # so (for now) let's not bother to enforce authentication
[21715]154
155 # Obtain the collect dir
[23766]156 my $site = $self->{'site'};
157 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
158 ## my $collect_dir = &util::filename_cat($gsdlhome, "collect");
[19293]159
[27314]160 # No locking collection when getting metadata, only when setting it
161# $self->lock_collection($username, $collect); # Make sure the collection isn't locked by someone else
[19293]162
163 # look up additional args
164 my $docid = $self->{'d'};
165 if ((!defined $docid) || ($docid =~ m/^\s*$/)) {
[21715]166 $gsdl_cgi->generate_error("No docid (d=...) specified.");
[19293]167 }
168
[21715]169 # Generate the dbkey
[19293]170 my $metaname = $self->{'metaname'};
171 my $dbkey = "$docid.$metaname";
172
[21715]173 # To people who know $collect_tail please add some comments
174 # Obtain path to the database
[19293]175 my $collect_tail = $collect;
176 $collect_tail =~ s/^.*[\/|\\]//;
[21564]177 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
[23400]178 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, "live-$collect_tail", $index_text_directory);
[21715]179
180 # Obtain the content of the key
[21569]181 my $cmd = "gdbmget $infodb_file_path $dbkey";
[19293]182 if (open(GIN,"$cmd |") == 0) {
[21715]183 # Catch error if gdbmget failed
[19293]184 my $mess = "Failed to get metadata key: $metaname\n";
185 $mess .= "$!\n";
186
187 $gsdl_cgi->generate_error($mess);
188 }
189 else {
[23761]190 binmode(GIN,":utf8");
[21715]191 # Read everything in and concatenate them into $metavalue
[19293]192 my $metavalue = "";
193 my $line;
194 while (defined ($line=<GIN>)) {
195 $metavalue .= $line;
196 }
197 close(GIN);
[21715]198 chomp($metavalue); # Get rid off the tailing newlines
[19293]199 $gsdl_cgi->generate_ok_message("$metavalue");
200 }
[21715]201
202 # Release the lock once it is done
[27314]203# $self->unlock_collection($username, $collect);
[19499]204}
[19293]205
[27157]206# just calls the index version
[19499]207sub get_metadata
208{
209 my $self = shift @_;
[27157]210 $self->get_index_metadata(@_);
211}
[19499]212
[27312]213# JSON version that will get the requested metadata values
214# from the requested source (index, import, archives or live)
215# One of the params is a JSON string and the return value is JSON too
216# http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
217sub get_metadata_array
218{
219 my $self = shift @_;
220
221 my $where = $self->{'where'};
[31589]222 if (!$where || ($where =~ m/^\s*$/)) { # 0, "0", "" and undef are all false. All else is true.
223 # What is truth in perl: http://www.berkeleyinternet.com/perl/node11.html
224 # and http://www.perlmonks.org/?node_id=33638
225
[27312]226 $where = "index"; # default behaviour is to get the values from index
227 }
228
[27336]229 # Only when setting metadata do we perform authentication and do we lock the collection,
[27312]230 # not when getting metadata
231
232 # for get_meta_array, the where param can only be ONE of import, archives, index, live
233 if($where =~ m/index/) {
234 $self->_get_index_metadata_array(@_);
235 }
[27315]236 elsif($where =~ m/archives/) {
[27324]237 $self->_get_archives_metadata_array(@_);
[27315]238 }
239 elsif($where =~ m/import/) {
[27333]240 $self->_get_import_metadata_array(@_);
[27315]241 }
242 elsif($where =~ m/live/) {
[27336]243 $self->_get_live_metadata_array(@_);
[27315]244 }
[27312]245}
246
[27336]247# Unused at present. Added for completion. Tested.
[27333]248sub _get_import_metadata_array {
249
250 my $self = shift @_;
[27324]251
[27333]252 my $collect = $self->{'collect'};
253 my $gsdl_cgi = $self->{'gsdl_cgi'};
254 my $site = $self->{'site'};
255 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
256
257 # look up additional args
258 my $infodbtype = $self->{'infodbtype'};
259
260 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
261 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
262 my $json_str = $self->{'json'};
263 my $doc_array = decode_json $json_str;
264
265 my $json_result_str = "[";
266 my $first_doc_rec = 1;
267 foreach my $doc_array_rec ( @$doc_array ) {
268
269 my $docid = $doc_array_rec->{'docid'}; # no subsection metadata support in metadata.xml, only toplevel meta
270
271 if($first_doc_rec) {
272 $first_doc_rec = 0;
273 } else {
274 $json_result_str .= ",";
275 }
276 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
277
278 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
279 $json_result_str = $json_result_str . ",\"metatable\":[";
280
281 my $first_rec = 1;
282 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
283
284 # Read the docid entry
285 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
286 # This now stores the full pathname
287 my $import_filename = $doc_rec->{'src-file'}->[0];
[28211]288 $import_filename = &util::placeholders_to_abspath($import_filename);
[27333]289
290 # figure out correct metadata.xml file [?]
291 # Assuming the metadata.xml file is next to the source file
292 # Note: This will not work if it is using the inherited metadata from the parent folder
293 my ($import_tailname, $import_dirname) = File::Basename::fileparse($import_filename);
294 my $metadata_xml_filename = &util::filename_cat($import_dirname, "metadata.xml");
295
296
297 if($first_rec) {
298 $first_rec = 0;
299 } else {
300 $json_result_str .= ",";
301 }
302
303 my $metaname = $metatable_rec->{'metaname'};
304 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
305
306 my $metapos = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
307 if(!defined $metapos) {
308 $metapos = 0;
309 }
310
311 # Obtain the specified metadata value(s)
312 my $metavalue;
313
314 if($metapos ne "all") { # get the value at a single metapos
315 $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
316
317 #print STDERR "**** Metafilename, metaname, metapos, sec_num: $metadata_xml_filename, $metaname, $metapos, $import_tailname\n";
318
319 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
320
321 } else {
322 my $first_metaval = 1;
323 $metapos = 0;
324 $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
325
326 while (defined $metavalue && $metavalue ne "") {
327 if($first_metaval) {
328 $first_metaval = 0;
329 } else {
330 $json_result_str .= ",";
331 }
332
333 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
334
335 $metapos++;
336 $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
337 }
338 }
339
340 $json_result_str .= "]}"; # close metavals array and metatable record
341 }
342
343 $json_result_str .= "]}"; # close metatable array and docid record
344 }
345
346 $json_result_str .= "]"; # close array of docids
347 $gsdl_cgi->generate_ok_message($json_result_str."\n");
348}
349
[27324]350# Unused method, but included for completion. Tested, works. Takes a JSON string and returns a JSON string.
351# For more information on the format of the output, see _get_index_metadata_array, which is in use.
352sub _get_archives_metadata_array {
353
354 my $self = shift @_;
355
356 my $collect = $self->{'collect'};
357 my $gsdl_cgi = $self->{'gsdl_cgi'};
358 my $site = $self->{'site'};
359 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
360
361 # look up additional args
362 my $infodbtype = $self->{'infodbtype'};
363
364 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
365 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
366
367 my $json_str = $self->{'json'};
368 my $doc_array = decode_json $json_str;
369
370 my $json_result_str = "[";
371 my $first_doc_rec = 1;
372 foreach my $doc_array_rec ( @$doc_array ) {
373
374 my $docid = $doc_array_rec->{'docid'};
375
376 if($first_doc_rec) {
377 $first_doc_rec = 0;
378 } else {
379 $json_result_str .= ",";
380 }
381 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
382
383 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
384 $json_result_str = $json_result_str . ",\"metatable\":[";
385
386 my $first_rec = 1;
387 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
388
389 my ($docid, $docid_secnum) = ($doc_array_rec->{'docid'} =~ m/^(.*?)(\..*)?$/);
390 $docid_secnum = "" if (!defined $docid_secnum);
391
392 # Read the docid entry
393 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
394 # This now stores the full pathname
[28211]395 my $doc_filename = $doc_rec->{'doc-file'}->[0];
[27324]396 $doc_filename = &util::filename_cat($archive_dir, $doc_filename);
397
398 if($first_rec) {
399 $first_rec = 0;
400 } else {
401 $json_result_str .= ",";
402 }
403
404 my $metaname = $metatable_rec->{'metaname'};
405 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
406
407 my $metapos = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
408 if(!defined $metapos) {
409 $metapos = 0;
410 }
411
412
413 # Obtain the specified metadata value(s)
414 my $metavalue;
415
416 if($metapos ne "all") { # get the value at a single metapos
417
418 $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
419 #print STDERR "**** Docname, metaname, metapos, sec_num: $doc_filename, $metaname, $metapos, $docid_secnum\n";
420
421 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
422
423 } else {
424 my $first_metaval = 1;
425 $metapos = 0;
426 $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
427
428 while (defined $metavalue && $metavalue ne "") {
429 if($first_metaval) {
430 $first_metaval = 0;
431 } else {
432 $json_result_str .= ",";
433 }
434
435 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
436
437 $metapos++;
438 $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
439 }
440 }
441
442 $json_result_str .= "]}"; # close metavals array and metatable record
443 }
444
445 $json_result_str .= "]}"; # close metatable array and docid record
446 }
447
448 $json_result_str .= "]"; # close array of docids
449 $gsdl_cgi->generate_ok_message($json_result_str."\n");
450}
451
[27336]452
453# Unused at present. Added for completion. Tested, but not sure if it retrieves metadata in the manner it's expected to.
454sub _get_live_metadata_array
455{
456 my $self = shift @_;
457
458 my $collect = $self->{'collect'};
459 my $gsdl_cgi = $self->{'gsdl_cgi'};
460 my $site = $self->{'site'};
461 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
462
463 # look up additional args
464 my $infodbtype = $self->{'infodbtype'};
465
466 # To people who know $collect_tail please add some comments
467 # Obtain the path to the database
468 my $collect_tail = $collect;
469 $collect_tail =~ s/^.*[\/|\\]//;
470 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
471 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, "live-$collect_tail", $index_text_directory);
472
473 my $json_str = $self->{'json'};
474 my $doc_array = decode_json $json_str;
475
476 my $json_result_str = "[";
477 my $first_doc_rec = 1;
478
479 foreach my $doc_array_rec ( @$doc_array ) {
480
481 my $docid = $doc_array_rec->{'docid'};
482
483 if($first_doc_rec) {
484 $first_doc_rec = 0;
485 } else {
486 $json_result_str .= ",";
487 }
488 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
489
490 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
491 $json_result_str = $json_result_str . ",\"metatable\":[";
492
493 my $first_rec = 1;
494 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
495 if($first_rec) {
496 $first_rec = 0;
497 } else {
498 $json_result_str .= ",";
499 }
500
501 my $metaname = $metatable_rec->{'metaname'};
502 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
503
504 # Generate the dbkey
505 my $dbkey = "$docid.$metaname";
506
507 # metapos for get_live_metadata is always assumed to be "all".
508 # It's always going to get all the lines of metavalues associated with a metaname
509 # (It's the metaname itself that should contain an increment number, if there are to be multiple values).
510 #my $metapos = "all";
511 my $metapos = $metatable_rec->{'metapos'} || 0; # Can be 0... 1|all|undefined. Defaults to 0 if undefined/false
512 my $metavalue = "";
513
514 # Obtain the content of the key
515 my $cmd = "gdbmget $infodb_file_path $dbkey";
516 if (open(GIN,"$cmd |") != 0) { # Success.
517
518 binmode(GIN,":utf8");
519 # Read everything in and concatenate them into $metavalue
520 my $line;
521 my $first_metaval = 1;
522 my $pos = 0;
523 while (defined ($line=<GIN>)) {
524 chomp($line); # Get rid off the tailing newlines
525
526 if($metapos eq "all") {
527 if($first_metaval) {
528 $first_metaval = 0;
529 } else {
530 $json_result_str .= ",";
531 }
532 $metavalue = $line;
533 $json_result_str .= "{\"metapos\":\"$pos\",\"metavalue\":\"$metavalue\"}";
534 } elsif($metapos == $pos) {
535 $metavalue = $line;
536 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
537 last;
538 } # else, the current $pos is not the required $metapos
539 $pos += 1;
540 }
541 close(GIN);
542 } # else open cmd == 0 (failed) and metavals array will be empty [] for this metaname
543
544 $json_result_str .= "]}"; # close metavals array and metatable record
545 }
546
547 $json_result_str .= "]}"; # close metatable array and docid record
548 }
549
550 $json_result_str .= "]"; # close array of docids
551
552 $gsdl_cgi->generate_ok_message($json_result_str."\n");
553}
554
555
556# Takes a JSON string and returns a JSON string
557# Request string is of the form:
558# http://localhost:8283/greenstone/cgi-bin/metadata-server.pl?a=get-metadata-array&c=demo&where=index&json=[{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metapos":"all"},{"metaname":"usertimestamp","metapos":"all"}, {"metaname":"usercomment","metapos":"all"}]}]
559# Resulting string is of the form:
[27313]560# [{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metavals":[{"metapos":"0","metavalue":"me"},{"metapos":"1","metavalue":"admin"}]},{"metaname":"usertimestamp","metavals":[{"metapos":"0","metavalue":"1367900586888"},{"metapos":"1","metavalue":"1367900616574"}]},{"metaname":"usercomment","metavals":[{"metapos":"0","metavalue":"Hi"},{"metapos":"1","metavalue":"Hello"}]}]}]
[27312]561sub _get_index_metadata_array
562{
563 my $self = shift @_;
564
565 my $collect = $self->{'collect'};
566 my $gsdl_cgi = $self->{'gsdl_cgi'};
567 my $site = $self->{'site'};
568 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
569
570 # look up additional args
571 my $infodbtype = $self->{'infodbtype'};
572
573 # To people who know $collect_tail please add some comments
574 # Obtain the path to the database
575 my $collect_tail = $collect;
[27336]576 $collect_tail =~ s/^.*[\/|\\]//;
[27312]577 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
578 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collect_tail, $index_text_directory);
579
580 my $json_str = $self->{'json'};
581 my $doc_array = decode_json $json_str;
582
583 my $json_result_str = "[";
584 my $first_doc_rec = 1;
585
586 foreach my $doc_array_rec ( @$doc_array ) {
587
588 my $docid = $doc_array_rec->{'docid'};
589
590 if($first_doc_rec) {
591 $first_doc_rec = 0;
592 } else {
593 $json_result_str .= ",";
594 }
[27313]595 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
[27312]596
597 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
[27313]598 $json_result_str = $json_result_str . ",\"metatable\":[";
[27312]599
600 my $first_rec = 1;
601 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
602 if($first_rec) {
603 $first_rec = 0;
604 } else {
605 $json_result_str .= ",";
606 }
607
608 my $metaname = $metatable_rec->{'metaname'};
[27313]609 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
[27312]610
611 my $metapos = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
612 if(!defined $metapos) {
[27313]613 $metapos = 0;
[27312]614 }
615
616 # Read the docid entry
617 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $infodb_file_path, $docid);
618
619 # Basically loop through and unescape_html the values
620 foreach my $k (keys %$doc_rec) {
621 my @escaped_v = ();
622 foreach my $v (@{$doc_rec->{$k}}) {
623 my $ev = &ghtml::unescape_html($v);
624 push(@escaped_v, $ev);
625 }
626 $doc_rec->{$k} = \@escaped_v;
627 }
628
629 # Obtain the specified metadata value(s)
630 my $metavalue;
631
632 if($metapos ne "all") { # get the value at a single metapos
633
[27366]634 $metavalue = $doc_rec->{$metaname}->[$metapos];
635
636 # protect any double quotes and colons in the metavalue before putting it into JSON
637 $metavalue =~ s/\"/&quot;/g if defined $metavalue;
638 $metavalue =~ s/\:/&58;/g if defined $metavalue;
639
[27313]640 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
[27312]641
642 } else {
643 my $first_metaval = 1;
644 $metapos = 0;
645 $metavalue = $doc_rec->{$metaname}->[$metapos];
646
647 while (defined $metavalue) {
[27366]648
649 # protect any double quotes and colons in the metavalue before putting it into JSON
650 $metavalue =~ s/\"/&quot;/g;
651 $metavalue =~ s/\:/&58;/g;
652
[27312]653 if($first_metaval) {
654 $first_metaval = 0;
655 } else {
656 $json_result_str .= ",";
657 }
658
[27313]659 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
[27312]660
661 $metapos++;
[27366]662 $metavalue = $doc_rec->{$metaname}->[$metapos];
[27312]663 }
664 }
665
666 $json_result_str .= "]}"; # close metavals array and metatable record
667 }
668
669 $json_result_str .= "]}"; # close metatable array and docid record
670 }
671
672 $json_result_str .= "]"; # close array of docids
673
[27324]674 $gsdl_cgi->generate_ok_message($json_result_str."\n");
[27312]675}
676
677
[27157]678sub get_index_metadata
679{
680 my $self = shift @_;
681
[19499]682 my $username = $self->{'username'};
683 my $collect = $self->{'collect'};
684 my $gsdl_cgi = $self->{'gsdl_cgi'};
685 my $gsdlhome = $self->{'gsdlhome'};
686
[21715]687 # Obtain the collect dir
[23766]688 my $site = $self->{'site'};
689 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
690 ##my $collect_dir = &util::filename_cat($gsdlhome, "collect");
[19499]691
692 # look up additional args
693 my $docid = $self->{'d'};
694 my $metaname = $self->{'metaname'};
695 my $metapos = $self->{'metapos'};
[23400]696 my $infodbtype = $self->{'infodbtype'};
[19499]697
[21715]698 # To people who know $collect_tail please add some comments
699 # Obtain path to the database
[19499]700 my $collect_tail = $collect;
[27336]701 $collect_tail =~ s/^.*[\/|\\]//;
[21564]702 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
[23400]703 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collect_tail, $index_text_directory);
[21715]704
705 # Read the docid entry
[23400]706 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $infodb_file_path, $docid);
707
[21715]708 # Basically loop through and unescape_html the values
[19499]709 foreach my $k (keys %$doc_rec) {
710 my @escaped_v = ();
711 foreach my $v (@{$doc_rec->{$k}}) {
712 my $ev = &ghtml::unescape_html($v);
713 push(@escaped_v, $ev);
714 }
715 $doc_rec->{$k} = \@escaped_v;
716 }
717
[21715]718 # Obtain the specified metadata value
[31589]719 $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/));
[19499]720 my $metavalue = $doc_rec->{$metaname}->[$metapos];
721 $gsdl_cgi->generate_ok_message("$metavalue");
[21715]722
[19293]723}
724
725
[25097]726sub get_import_metadata
727{
728 my $self = shift @_;
729
730 my $username = $self->{'username'};
731 my $collect = $self->{'collect'};
732 my $gsdl_cgi = $self->{'gsdl_cgi'};
733 my $gsdlhome = $self->{'gsdlhome'};
734
735 # Obtain the collect dir
736 my $site = $self->{'site'};
737 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
738 ##my $collect_dir = &util::filename_cat($gsdlhome, "collect");
739
740 # look up additional args
741 my $docid = $self->{'d'};
742 my $metaname = $self->{'metaname'};
[27176]743 my $metapos = $self->{'metapos'};
[31589]744 $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/)); # gets the first value by default since metapos defaults to 0
[27176]745
[25097]746 my $infodbtype = $self->{'infodbtype'};
[27176]747 if (!defined $docid)
[25097]748 {
[27173]749 $gsdl_cgi->generate_error("No docid (d=...) specified.\n");
[25097]750 }
751
752 # Obtain where the metadata.xml is from the archiveinfo-doc.gdb file
753 # If the doc oid is not specified, we assume the metadata.xml is next to the specified "f"
754 my $metadata_xml_file;
755 my $import_filename = undef;
756
757
758 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
759 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
760 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
761
762 # This now stores the full pathname
763 $import_filename = $doc_rec->{'src-file'}->[0];
[28211]764 $import_filename = &util::placeholders_to_abspath($import_filename);
[25097]765
766 # figure out correct metadata.xml file [?]
767 # Assuming the metadata.xml file is next to the source file
768 # Note: This will not work if it is using the inherited metadata from the parent folder
769 my ($import_tailname, $import_dirname) = File::Basename::fileparse($import_filename);
770 my $metadata_xml_filename = &util::filename_cat($import_dirname, "metadata.xml");
771
[27176]772 $gsdl_cgi->generate_ok_message($self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname));
[25097]773
774}
775
776sub get_metadata_from_metadata_xml
777{
778 my $self = shift @_;
[27176]779 my ($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $src_file) = @_;
[25097]780
781 my @rules =
782 (
783 _default => 'raw',
784 'Metadata' => \&gfmxml_metadata,
785 'FileName' => \&mxml_filename
786 );
787
788 my $parser = XML::Rules->new
789 (
790 rules => \@rules,
791 output_encoding => 'utf8'
792 );
793
794 my $xml_in = "";
795 if (!open(MIN,"<$metadata_xml_filename"))
796 {
797 $gsdl_cgi->generate_error("Unable to read in $metadata_xml_filename: $!");
798 }
799 else
800 {
801 # Read them in
802 my $line;
803 while (defined ($line=<MIN>)) {
804 $xml_in .= $line;
805 }
806 close(MIN);
807
[27176]808 $parser->parse($xml_in, {metaname => $metaname, metapos => $metapos, src_file => $src_file});
[25097]809
810 if(defined $parser->{'pad'}->{'metavalue'})
811 {
812 return $parser->{'pad'}->{'metavalue'};
813 }
814 else
815 {
816 return "";
817 }
818 }
819}
820
821sub gfmxml_metadata
822{
823 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
824
[27176]825 # no subsection support yet in metadata.xml
826
827 if (($parser->{'parameters'}->{'src_file'} eq $parser->{'parameters'}->{'current_file'}) && $parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'})
[25097]828 {
[27176]829 if (!defined $parser->{'parameters'}->{'poscount'})
830 {
831 $parser->{'parameters'}->{'poscount'} = 0;
832 }
833 else
834 {
835 $parser->{'parameters'}->{'poscount'}++;
836 }
837
838 # gets the first value by default, since metapos defaults to 0
839 if (($parser->{'parameters'}->{'poscount'} == $parser->{'parameters'}->{'metapos'}))
840 {
841 if($parser->{'parameters'}->{'metapos'} > 0) {
842 print STDERR "@@@@ WARNING: non-zero metapos.\n";
[27333]843 print STDERR "@@@@ Assuming SIMPLE collection and proceeding to retrieve the import meta at position: ".$parser->{'parameters'}->{'metapos'}.".\n";
[27176]844 }
845 $parser->{'pad'}->{'metavalue'} = $attrHash->{'_content'};
846 }
[25097]847 }
848}
849
[32061]850sub mxml_filename
851{
852 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
853
854 # Store the filename of the Current Fileset
855 # Note: According to http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd
856 # FileName tag must come before Description tag
857 $parser->{'parameters'}->{'current_file'} = $attrHash->{'_content'};
858
859 # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
860 return [$tagname => $attrHash];
861}
862
[25097]863sub get_archives_metadata
864{
865 my $self = shift @_;
866
867 my $username = $self->{'username'};
868 my $collect = $self->{'collect'};
869 my $gsdl_cgi = $self->{'gsdl_cgi'};
[27324]870# my $gsdlhome = $self->{'gsdlhome'};
[25097]871 my $infodbtype = $self->{'infodbtype'};
872
873 # Obtain the collect dir
874 my $site = $self->{'site'};
875 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
876
877 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
878
879 # look up additional args
880 my ($docid, $docid_secnum) = ($self->{'d'} =~ m/^(.*?)(\..*)?$/);
881 $docid_secnum = "" if (!defined $docid_secnum);
882
883 my $metaname = $self->{'metaname'};
884 my $metapos = $self->{'metapos'};
[31589]885 $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/));
[25097]886
887 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
888 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
889
890 # This now stores the full pathname
891 my $doc_filename = $doc_rec->{'doc-file'}->[0];
892
893 $gsdl_cgi->generate_ok_message($self->get_metadata_from_archive_xml($gsdl_cgi, &util::filename_cat($archive_dir, $doc_filename), $metaname, $metapos, $docid_secnum));
894
895}
896
[32062]897# unused and untested
898sub get_archives_text
899{
900 my $self = shift @_;
901
902 my $username = $self->{'username'};
903 my $collect = $self->{'collect'};
904 my $gsdl_cgi = $self->{'gsdl_cgi'};
905# my $gsdlhome = $self->{'gsdlhome'};
906 my $infodbtype = $self->{'infodbtype'};
907
908 # Obtain the collect dir
909 my $site = $self->{'site'};
910 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
911
912 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
913
914 # look up additional args
915 my ($docid, $docid_secnum) = ($self->{'d'} =~ m/^(.*?)(\..*)?$/);
916 $docid_secnum = "" if (!defined $docid_secnum);
917
918 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
919 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
920
921 # This now stores the full pathname
922 my $doc_filename = $doc_rec->{'doc-file'}->[0];
923
924 my $metaname = undef;
925 my $metapos = -1;
926
927 $gsdl_cgi->generate_ok_message($self->get_metadata_from_archive_xml($gsdl_cgi, &util::filename_cat($archive_dir, $doc_filename), $metaname, $metapos, $docid_secnum));
928
929}
930
[25097]931sub get_metadata_from_archive_xml
932{
933 my $self = shift @_;
934 my ($gsdl_cgi, $doc_xml_filename, $metaname, $metapos, $secid) = @_;
935
936 my @start_rules = ('Section' => \&dxml_start_section);
937
938 my @rules =
939 (
940 _default => 'raw',
941 'Metadata' => \&gfdxml_metadata
942 );
943
944 my $parser = XML::Rules->new
945 (
946 start_rules => \@start_rules,
947 rules => \@rules,
948 output_encoding => 'utf8'
949 );
950
951 my $xml_in = "";
952 if (!open(MIN,"<$doc_xml_filename"))
953 {
954 $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
955 }
956 else
957 {
958 # Read them in
959 my $line;
960 while (defined ($line=<MIN>)) {
961 $xml_in .= $line;
962 }
963 close(MIN);
964
965 $parser->parse($xml_in, {metaname => $metaname, metapos => $metapos, secid => $secid});
966
967 if(defined $parser->{'pad'}->{'metavalue'})
968 {
969 return $parser->{'pad'}->{'metavalue'};
970 }
971 else
972 {
973 return "";
974 }
975 }
976}
977
[32062]978# unused and untested
979sub get_text_from_archive_xml
980{
981 my $self = shift @_;
982 my ($gsdl_cgi, $doc_xml_filename, $secid) = @_;
983
984 # To monitor which section/subsection number we are in
985 my @start_rules = ('Section' => \&dxml_start_section);
986
987 # set the callback functions for the elements in doc.xml we're interested in, <Content>
988 my @rules =
989 (
990 _default => 'raw',
991 'Content' => \&gfdxml_text # gfdxml = get from doc xml?
992 );
993
994 my $parser = XML::Rules->new
995 (
996 start_rules => \@start_rules,
997 rules => \@rules,
998 output_encoding => 'utf8'
999 );
1000
1001 my $xml_in = "";
1002 if (!open(MIN,"<$doc_xml_filename"))
1003 {
1004 $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
1005 }
1006 else
1007 {
1008 # Read them in
1009 my $line;
1010 while (defined ($line=<MIN>)) {
1011 $xml_in .= $line;
1012 }
1013 close(MIN);
1014
1015 $parser->parse($xml_in, {secid => $secid});
1016
1017 if(defined $parser->{'pad'}->{'textcontent'})
1018 {
1019 return $parser->{'pad'}->{'textcontent'};
1020 }
1021 else
1022 {
1023 return "";
1024 }
1025 }
1026}
1027
[25097]1028sub gfdxml_metadata
1029{
1030 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1031
1032 if(!($parser->{'parameters'}->{'secid'} eq $parser->{'parameters'}->{'curr_section_num'}))
1033 {
1034 return;
1035 }
1036
1037 if ($parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'})
1038 {
1039 if (!defined $parser->{'parameters'}->{'poscount'})
1040 {
1041 $parser->{'parameters'}->{'poscount'} = 0;
1042 }
1043 else
1044 {
1045 $parser->{'parameters'}->{'poscount'}++;
1046 }
1047 }
1048
1049 if (($parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'}) && ($parser->{'parameters'}->{'poscount'} == $parser->{'parameters'}->{'metapos'}))
1050 {
1051 $parser->{'pad'}->{'metavalue'} = $attrHash->{'_content'};
1052 }
1053}
1054
[32062]1055# unused and untested - for get_archives_text
1056sub gfdxml_text
1057{
1058 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1059
1060 if($parser->{'parameters'}->{'secid'} eq $parser->{'parameters'}->{'curr_section_num'})
1061 {
1062 $parser->{'pad'}->{'textcontent'} = $attrHash->{'_content'}; # the textnode content
[32061]1063
[32062]1064 }
1065 else {
1066 return;
1067 }
1068}
1069
[32061]1070sub dxml_start_section
1071{
1072 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1073
1074 my $new_depth = scalar(@$contextArray);
1075
1076# print STDERR "**** START SECTION \n";
1077
1078 if ($new_depth == 1) {
1079 $parser->{'parameters'}->{'curr_section_depth'} = 1;
1080 $parser->{'parameters'}->{'curr_section_num'} = "";
1081 }
1082
1083 my $old_depth = $parser->{'parameters'}->{'curr_section_depth'};
1084 my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};
1085
1086 my $new_secnum;
1087
1088 if ($new_depth > $old_depth) {
1089 # child subsection
1090 $new_secnum = "$old_secnum.1";
1091 }
1092 elsif ($new_depth == $old_depth) {
1093 # sibling section => increase it's value by 1
1094 my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
1095 $tail_num++;
1096 $new_secnum = $old_secnum;
1097 $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
1098 }
1099 else {
[32062]1100 # back up to parent section => lop off tail
[32061]1101 $new_secnum = $old_secnum;
1102 $new_secnum =~ s/\.\d+$//;
1103 }
1104
1105 $parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
1106 $parser->{'parameters'}->{'curr_section_num'} = $new_secnum;
1107
[32062]1108 1;
[32061]1109}
1110
[19293]11111;
Note: See TracBrowser for help on using the repository browser.