source: main/trunk/greenstone2/perllib/cgiactions/metadataaction.pm@ 33311

Last change on this file since 33311 was 33311, checked in by ak19, 5 years ago

Bugfix to metadataaction, which hadn't been calculating subsections correctly past 1.x. If any section has subsections, then the next section wasn't being incremented at the correct time with the increment lagging. For example, 3, 3.1, 3.2 ... 3.6, then 3 again instead of being immediately changed to 4 when necessary.

File size: 35.2 KB
Line 
1##########################################################################
2#
3# metadataaction.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2009 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package metadataaction;
27
28use strict;
29
30use cgiactions::baseaction;
31
32use dbutil;
33use ghtml;
34
35use JSON;
36
37# This class is conditionally expanded with set-metadata, remove-metadata and insert-metadata subroutines
38# defined in modmetadataaction.pm. The BEGIN code block determines whether the condition holds.
39# See
40# http://stackoverflow.com/questions/3998619/what-is-the-role-of-the-begin-block-in-perl
41# http://www.perlmonks.org/?node_id=881761 - splitting module into multiple files
42# http://www.perlmonks.org/?node_id=524456 - merging hashes
43
44our $modmeta_action_table; # don't init to empty hash here, else it will overwrite whatever BEGIN sets this to
45 # see http://stackoverflow.com/questions/3998619/what-is-the-role-of-the-begin-block-in-perl
46
47BEGIN {
48# unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan/perl-5.8");
49 require XML::Rules;
50
51 # if we're GS3, then GS3_AUTHENTICATED must be defined and set to true
52 # in order to have access to subroutines that modify metadata (the set-
53 # and remove- metadata subroutines).
54 # TODO: if we're GS2, then we continue to behave as before?
55
56 if(!defined $ENV{'GSDL3HOME'} || (defined $ENV{'GS3_AUTHENTICATED'} && $ENV{'GS3_AUTHENTICATED'} eq "true")) {
57 require modmetadataaction;
58 }
59 else {
60 $modmeta_action_table = {};
61 }
62}
63
64@metadataaction::ISA = ('baseaction');
65
66
67my $getmeta_action_table =
68{
69 # unused and untested
70 # when DocEdit=1, need to retrieve a doc's full text (or doc section's full text) from archives
71 "get-archives-text" => {
72 'compulsory-args' => [ "d" ] },
73 #'compulsory-args' => [ "d" ],
74 #'optional-args' => [ "section" ] },
75
76 #GET METHODS
77 "get-import-metadata" => {
78 'compulsory-args' => [ "d", "metaname" ],
79 'optional-args' => [ "metapos" ] },
80
81 "get-archives-metadata" => {
82 'compulsory-args' => [ "d", "metaname" ],
83 'optional-args' => [ "metapos" ] },
84
85 "get-index-metadata" => {
86 'compulsory-args' => [ "d", "metaname" ],
87 'optional-args' => [ "metapos" ] },
88
89 "get-metadata" => { # alias for get-index-metadata
90 'compulsory-args' => [ "d", "metaname" ],
91 'optional-args' => [ "metapos" ] },
92
93 "get-live-metadata" => {
94 'compulsory-args' => [ "d", "metaname" ],
95 'optional-args' => [ ] },
96
97 "get-metadata-array" => { # where param can be ONE of: index (default), import, archives, live
98 'compulsory-args' => [ "json" ],
99 'optional-args' => [ "where" ],
100 'help-string' => [
101 'metadata-server.pl?a=get-metadata-array&c=demo&where=index&json=[{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metapos":"all"},{"metaname":"usertimestamp","metapos":"all"}, {"metaname":"usercomment","metapos":"all"}]}]'
102 ]}
103};
104
105# To get the final action_table of all available subroutines in this class,
106# merge the get- and mod-metadata hashes. See http://www.perlmonks.org/?node_id=524456
107# Note that modmeta_action_table will be empty of subroutines if the user does not have permissions
108# to modify metadata.
109my $action_table = { %$getmeta_action_table, %$modmeta_action_table };
110
111
112sub new
113{
114 my $class = shift (@_);
115 my ($gsdl_cgi,$iis6_mode) = @_;
116
117 # Treat metavalue specially. To transmit this through a GET request
118 # the Javascript side has url-encoded it, so here we need to decode
119 # it before proceeding
120
121 my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
122 my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
123#$gsdl_cgi->generate_message("@@@ metaaction new - DEBUG before utf82unicode: " . &unicode::debug_unicode_string($url_decoded_metavalue));
124
125 my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);
126
127 $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
128
129#$gsdl_cgi->generate_message("@@@ metaaction new - DEBUG after utf82unicode: " . &unicode::debug_unicode_string($url_decoded_metavalue));
130
131 $gsdl_cgi->param("metavalue",$url_decoded_metavalue);
132
133 # need to do the same with prevmetavalue
134 my $url_encoded_prevmetavalue = $gsdl_cgi->param("prevmetavalue");
135 my $url_decoded_prevmetavalue = &unicode::url_decode($url_encoded_prevmetavalue,1);
136 my $prevunicode_array = &unicode::utf82unicode($url_decoded_prevmetavalue);
137
138 $url_decoded_prevmetavalue = join("",map(chr($_),@$prevunicode_array));
139 $gsdl_cgi->param("prevmetavalue",$url_decoded_prevmetavalue);
140
141 my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);
142
143 return bless $self, $class;
144}
145
146
147sub get_live_metadata
148{
149 my $self = shift @_;
150
151 my $username = $self->{'username'};
152 my $collect = $self->{'collect'};
153 my $gsdl_cgi = $self->{'gsdl_cgi'};
154 my $gsdlhome = $self->{'gsdlhome'};
155 my $infodbtype = $self->{'infodbtype'};
156
157 # live metadata gets/saves value scoped (prefixed) by the current usename
158 # so (for now) let's not bother to enforce authentication
159
160 # Obtain the collect dir
161 my $site = $self->{'site'};
162 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
163 ## my $collect_dir = &util::filename_cat($gsdlhome, "collect");
164
165 # No locking collection when getting metadata, only when setting it
166# $self->lock_collection($username, $collect); # Make sure the collection isn't locked by someone else
167
168 # look up additional args
169 my $docid = $self->{'d'};
170 if ((!defined $docid) || ($docid =~ m/^\s*$/)) {
171 $gsdl_cgi->generate_error("No docid (d=...) specified.");
172 }
173
174 # Generate the dbkey
175 my $metaname = $self->{'metaname'};
176 my $dbkey = "$docid.$metaname";
177
178 # To people who know $collect_tail please add some comments
179 # Obtain path to the database
180 my $collect_tail = $collect;
181 $collect_tail =~ s/^.*[\/|\\]//;
182 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
183 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, "live-$collect_tail", $index_text_directory);
184
185 # Obtain the content of the key
186 my $cmd = "gdbmget $infodb_file_path $dbkey";
187 if (open(GIN,"$cmd |") == 0) {
188 # Catch error if gdbmget failed
189 my $mess = "Failed to get metadata key: $metaname\n";
190 $mess .= "$!\n";
191
192 $gsdl_cgi->generate_error($mess);
193 }
194 else {
195 binmode(GIN,":utf8");
196 # Read everything in and concatenate them into $metavalue
197 my $metavalue = "";
198 my $line;
199 while (defined ($line=<GIN>)) {
200 $metavalue .= $line;
201 }
202 close(GIN);
203 chomp($metavalue); # Get rid off the tailing newlines
204 $gsdl_cgi->generate_ok_message("$metavalue");
205 }
206
207 # Release the lock once it is done
208# $self->unlock_collection($username, $collect);
209}
210
211# just calls the index version
212sub get_metadata
213{
214 my $self = shift @_;
215 $self->get_index_metadata(@_);
216}
217
218# JSON version that will get the requested metadata values
219# from the requested source (index, import, archives or live)
220# One of the params is a JSON string and the return value is JSON too
221# http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
222sub get_metadata_array
223{
224 my $self = shift @_;
225
226 my $where = $self->{'where'};
227 if (!$where || ($where =~ m/^\s*$/)) { # 0, "0", "" and undef are all false. All else is true.
228 # What is truth in perl: http://www.berkeleyinternet.com/perl/node11.html
229 # and http://www.perlmonks.org/?node_id=33638
230
231 $where = "index"; # default behaviour is to get the values from index
232 }
233
234 # Only when setting metadata do we perform authentication and do we lock the collection,
235 # not when getting metadata
236
237 # for get_meta_array, the where param can only be ONE of import, archives, index, live
238 if($where =~ m/index/) {
239 $self->_get_index_metadata_array(@_);
240 }
241 elsif($where =~ m/archives/) {
242 $self->_get_archives_metadata_array(@_);
243 }
244 elsif($where =~ m/import/) {
245 $self->_get_import_metadata_array(@_);
246 }
247 elsif($where =~ m/live/) {
248 $self->_get_live_metadata_array(@_);
249 }
250}
251
252# Unused at present. Added for completion. Tested.
253sub _get_import_metadata_array {
254
255 my $self = shift @_;
256
257 my $collect = $self->{'collect'};
258 my $gsdl_cgi = $self->{'gsdl_cgi'};
259 my $site = $self->{'site'};
260 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
261
262 # look up additional args
263 my $infodbtype = $self->{'infodbtype'};
264
265 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
266 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
267 my $json_str = $self->{'json'};
268 my $doc_array = decode_json $json_str;
269
270 my $json_result_str = "[";
271 my $first_doc_rec = 1;
272 foreach my $doc_array_rec ( @$doc_array ) {
273
274 my $docid = $doc_array_rec->{'docid'}; # no subsection metadata support in metadata.xml, only toplevel meta
275
276 if($first_doc_rec) {
277 $first_doc_rec = 0;
278 } else {
279 $json_result_str .= ",";
280 }
281 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
282
283 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
284 $json_result_str = $json_result_str . ",\"metatable\":[";
285
286 my $first_rec = 1;
287 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
288
289 # Read the docid entry
290 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
291 # This now stores the full pathname
292 my $import_filename = $doc_rec->{'src-file'}->[0];
293 $import_filename = &util::placeholders_to_abspath($import_filename);
294
295 # figure out correct metadata.xml file [?]
296 # Assuming the metadata.xml file is next to the source file
297 # Note: This will not work if it is using the inherited metadata from the parent folder
298 my ($import_tailname, $import_dirname) = File::Basename::fileparse($import_filename);
299 my $metadata_xml_filename = &util::filename_cat($import_dirname, "metadata.xml");
300
301
302 if($first_rec) {
303 $first_rec = 0;
304 } else {
305 $json_result_str .= ",";
306 }
307
308 my $metaname = $metatable_rec->{'metaname'};
309 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
310
311 my $metapos = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
312 if(!defined $metapos) {
313 $metapos = 0;
314 }
315
316 # Obtain the specified metadata value(s)
317 my $metavalue;
318
319 if($metapos ne "all") { # get the value at a single metapos
320 $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
321
322 #print STDERR "**** Metafilename, metaname, metapos, sec_num: $metadata_xml_filename, $metaname, $metapos, $import_tailname\n";
323
324 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
325
326 } else {
327 my $first_metaval = 1;
328 $metapos = 0;
329 $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
330
331 while (defined $metavalue && $metavalue ne "") {
332 if($first_metaval) {
333 $first_metaval = 0;
334 } else {
335 $json_result_str .= ",";
336 }
337
338 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
339
340 $metapos++;
341 $metavalue = $self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname);
342 }
343 }
344
345 $json_result_str .= "]}"; # close metavals array and metatable record
346 }
347
348 $json_result_str .= "]}"; # close metatable array and docid record
349 }
350
351 $json_result_str .= "]"; # close array of docids
352 $gsdl_cgi->generate_ok_message($json_result_str."\n");
353}
354
355# Unused method, but included for completion. Tested, works. Takes a JSON string and returns a JSON string.
356# For more information on the format of the output, see _get_index_metadata_array, which is in use.
357sub _get_archives_metadata_array {
358
359 my $self = shift @_;
360
361 my $collect = $self->{'collect'};
362 my $gsdl_cgi = $self->{'gsdl_cgi'};
363 my $site = $self->{'site'};
364 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
365
366 # look up additional args
367 my $infodbtype = $self->{'infodbtype'};
368
369 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
370 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
371
372 my $json_str = $self->{'json'};
373 my $doc_array = decode_json $json_str;
374
375 my $json_result_str = "[";
376 my $first_doc_rec = 1;
377 foreach my $doc_array_rec ( @$doc_array ) {
378
379 my $docid = $doc_array_rec->{'docid'};
380
381 if($first_doc_rec) {
382 $first_doc_rec = 0;
383 } else {
384 $json_result_str .= ",";
385 }
386 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
387
388 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
389 $json_result_str = $json_result_str . ",\"metatable\":[";
390
391 my $first_rec = 1;
392 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
393
394 my ($docid, $docid_secnum) = ($doc_array_rec->{'docid'} =~ m/^(.*?)(\..*)?$/);
395 $docid_secnum = "" if (!defined $docid_secnum);
396
397 # Read the docid entry
398 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
399 # This now stores the full pathname
400 my $doc_filename = $doc_rec->{'doc-file'}->[0];
401 $doc_filename = &util::filename_cat($archive_dir, $doc_filename);
402
403 if($first_rec) {
404 $first_rec = 0;
405 } else {
406 $json_result_str .= ",";
407 }
408
409 my $metaname = $metatable_rec->{'metaname'};
410 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
411
412 my $metapos = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
413 if(!defined $metapos) {
414 $metapos = 0;
415 }
416
417
418 # Obtain the specified metadata value(s)
419 my $metavalue;
420
421 if($metapos ne "all") { # get the value at a single metapos
422
423 $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
424 #print STDERR "**** Docname, metaname, metapos, sec_num: $doc_filename, $metaname, $metapos, $docid_secnum\n";
425
426 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
427
428 } else {
429 my $first_metaval = 1;
430 $metapos = 0;
431 $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
432
433 while (defined $metavalue && $metavalue ne "") {
434 if($first_metaval) {
435 $first_metaval = 0;
436 } else {
437 $json_result_str .= ",";
438 }
439
440 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
441
442 $metapos++;
443 $metavalue = $self->get_metadata_from_archive_xml($gsdl_cgi, $doc_filename, $metaname, $metapos, $docid_secnum);
444 }
445 }
446
447 $json_result_str .= "]}"; # close metavals array and metatable record
448 }
449
450 $json_result_str .= "]}"; # close metatable array and docid record
451 }
452
453 $json_result_str .= "]"; # close array of docids
454 $gsdl_cgi->generate_ok_message($json_result_str."\n");
455}
456
457
458# Unused at present. Added for completion. Tested, but not sure if it retrieves metadata in the manner it's expected to.
459sub _get_live_metadata_array
460{
461 my $self = shift @_;
462
463 my $collect = $self->{'collect'};
464 my $gsdl_cgi = $self->{'gsdl_cgi'};
465 my $site = $self->{'site'};
466 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
467
468 # look up additional args
469 my $infodbtype = $self->{'infodbtype'};
470
471 # To people who know $collect_tail please add some comments
472 # Obtain the path to the database
473 my $collect_tail = $collect;
474 $collect_tail =~ s/^.*[\/|\\]//;
475 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
476 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, "live-$collect_tail", $index_text_directory);
477
478 my $json_str = $self->{'json'};
479 my $doc_array = decode_json $json_str;
480
481 my $json_result_str = "[";
482 my $first_doc_rec = 1;
483
484 foreach my $doc_array_rec ( @$doc_array ) {
485
486 my $docid = $doc_array_rec->{'docid'};
487
488 if($first_doc_rec) {
489 $first_doc_rec = 0;
490 } else {
491 $json_result_str .= ",";
492 }
493 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
494
495 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
496 $json_result_str = $json_result_str . ",\"metatable\":[";
497
498 my $first_rec = 1;
499 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
500 if($first_rec) {
501 $first_rec = 0;
502 } else {
503 $json_result_str .= ",";
504 }
505
506 my $metaname = $metatable_rec->{'metaname'};
507 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
508
509 # Generate the dbkey
510 my $dbkey = "$docid.$metaname";
511
512 # metapos for get_live_metadata is always assumed to be "all".
513 # It's always going to get all the lines of metavalues associated with a metaname
514 # (It's the metaname itself that should contain an increment number, if there are to be multiple values).
515 #my $metapos = "all";
516 my $metapos = $metatable_rec->{'metapos'} || 0; # Can be 0... 1|all|undefined. Defaults to 0 if undefined/false
517 my $metavalue = "";
518
519 # Obtain the content of the key
520 my $cmd = "gdbmget $infodb_file_path $dbkey";
521 if (open(GIN,"$cmd |") != 0) { # Success.
522
523 binmode(GIN,":utf8");
524 # Read everything in and concatenate them into $metavalue
525 my $line;
526 my $first_metaval = 1;
527 my $pos = 0;
528 while (defined ($line=<GIN>)) {
529 chomp($line); # Get rid off the tailing newlines
530
531 if($metapos eq "all") {
532 if($first_metaval) {
533 $first_metaval = 0;
534 } else {
535 $json_result_str .= ",";
536 }
537 $metavalue = $line;
538 $json_result_str .= "{\"metapos\":\"$pos\",\"metavalue\":\"$metavalue\"}";
539 } elsif($metapos == $pos) {
540 $metavalue = $line;
541 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
542 last;
543 } # else, the current $pos is not the required $metapos
544 $pos += 1;
545 }
546 close(GIN);
547 } # else open cmd == 0 (failed) and metavals array will be empty [] for this metaname
548
549 $json_result_str .= "]}"; # close metavals array and metatable record
550 }
551
552 $json_result_str .= "]}"; # close metatable array and docid record
553 }
554
555 $json_result_str .= "]"; # close array of docids
556
557 $gsdl_cgi->generate_ok_message($json_result_str."\n");
558}
559
560
561# Takes a JSON string and returns a JSON string
562# Request string is of the form:
563# http://localhost:8283/greenstone/cgi-bin/metadata-server.pl?a=get-metadata-array&c=demo&where=index&json=[{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metapos":"all"},{"metaname":"usertimestamp","metapos":"all"}, {"metaname":"usercomment","metapos":"all"}]}]
564# Resulting string is of the form:
565# [{"docid":"HASHc5bce2d6d3e5b04e470ec9","metatable":[{"metaname":"username","metavals":[{"metapos":"0","metavalue":"me"},{"metapos":"1","metavalue":"admin"}]},{"metaname":"usertimestamp","metavals":[{"metapos":"0","metavalue":"1367900586888"},{"metapos":"1","metavalue":"1367900616574"}]},{"metaname":"usercomment","metavals":[{"metapos":"0","metavalue":"Hi"},{"metapos":"1","metavalue":"Hello"}]}]}]
566sub _get_index_metadata_array
567{
568 my $self = shift @_;
569
570 my $collect = $self->{'collect'};
571 my $gsdl_cgi = $self->{'gsdl_cgi'};
572 my $site = $self->{'site'};
573 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
574
575 # look up additional args
576 my $infodbtype = $self->{'infodbtype'};
577
578 # To people who know $collect_tail please add some comments
579 # Obtain the path to the database
580 my $collect_tail = $collect;
581 $collect_tail =~ s/^.*[\/|\\]//;
582 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
583 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collect_tail, $index_text_directory);
584
585 my $json_str = $self->{'json'};
586 my $doc_array = decode_json $json_str;
587
588 my $json_result_str = "[";
589 my $first_doc_rec = 1;
590
591 foreach my $doc_array_rec ( @$doc_array ) {
592
593 my $docid = $doc_array_rec->{'docid'};
594
595 if($first_doc_rec) {
596 $first_doc_rec = 0;
597 } else {
598 $json_result_str .= ",";
599 }
600 $json_result_str = $json_result_str . "{\"docid\":\"" . $docid . "\"";
601
602 my $metatable = $doc_array_rec->{'metatable'}; # a subarray, or need to generate an error saying JSON structure is wrong
603 $json_result_str = $json_result_str . ",\"metatable\":[";
604
605 my $first_rec = 1;
606 foreach my $metatable_rec ( @$metatable ) { # the subarray metatable is an array of hashmaps
607 if($first_rec) {
608 $first_rec = 0;
609 } else {
610 $json_result_str .= ",";
611 }
612
613 my $metaname = $metatable_rec->{'metaname'};
614 $json_result_str .= "{\"metaname\":\"$metaname\",\"metavals\":[";
615
616 my $metapos = $metatable_rec->{'metapos'}; # 0... 1|all|undefined
617 if(!defined $metapos) {
618 $metapos = 0;
619 }
620
621 # Read the docid entry
622 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $infodb_file_path, $docid);
623
624 # Basically loop through and unescape_html the values
625 foreach my $k (keys %$doc_rec) {
626 my @escaped_v = ();
627 foreach my $v (@{$doc_rec->{$k}}) {
628 my $ev = &ghtml::unescape_html($v);
629 push(@escaped_v, $ev);
630 }
631 $doc_rec->{$k} = \@escaped_v;
632 }
633
634 # Obtain the specified metadata value(s)
635 my $metavalue;
636
637 if($metapos ne "all") { # get the value at a single metapos
638
639 $metavalue = $doc_rec->{$metaname}->[$metapos];
640
641 # protect any double quotes and colons in the metavalue before putting it into JSON
642 $metavalue =~ s/\"/&quot;/g if defined $metavalue;
643 $metavalue =~ s/\:/&58;/g if defined $metavalue;
644
645 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
646
647 } else {
648 my $first_metaval = 1;
649 $metapos = 0;
650 $metavalue = $doc_rec->{$metaname}->[$metapos];
651
652 while (defined $metavalue) {
653
654 # protect any double quotes and colons in the metavalue before putting it into JSON
655 $metavalue =~ s/\"/&quot;/g;
656 $metavalue =~ s/\:/&58;/g;
657
658 if($first_metaval) {
659 $first_metaval = 0;
660 } else {
661 $json_result_str .= ",";
662 }
663
664 $json_result_str .= "{\"metapos\":\"$metapos\",\"metavalue\":\"$metavalue\"}";
665
666 $metapos++;
667 $metavalue = $doc_rec->{$metaname}->[$metapos];
668 }
669 }
670
671 $json_result_str .= "]}"; # close metavals array and metatable record
672 }
673
674 $json_result_str .= "]}"; # close metatable array and docid record
675 }
676
677 $json_result_str .= "]"; # close array of docids
678
679 $gsdl_cgi->generate_ok_message($json_result_str."\n");
680}
681
682
683sub get_index_metadata
684{
685 my $self = shift @_;
686
687 my $username = $self->{'username'};
688 my $collect = $self->{'collect'};
689 my $gsdl_cgi = $self->{'gsdl_cgi'};
690 my $gsdlhome = $self->{'gsdlhome'};
691
692 # Obtain the collect dir
693 my $site = $self->{'site'};
694 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
695 ##my $collect_dir = &util::filename_cat($gsdlhome, "collect");
696
697 # look up additional args
698 my $docid = $self->{'d'};
699 my $metaname = $self->{'metaname'};
700 my $metapos = $self->{'metapos'};
701 my $infodbtype = $self->{'infodbtype'};
702
703 # To people who know $collect_tail please add some comments
704 # Obtain path to the database
705 my $collect_tail = $collect;
706 $collect_tail =~ s/^.*[\/|\\]//;
707 my $index_text_directory = &util::filename_cat($collect_dir,$collect,"index","text");
708 my $infodb_file_path = &dbutil::get_infodb_file_path($infodbtype, $collect_tail, $index_text_directory);
709
710 # Read the docid entry
711 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $infodb_file_path, $docid);
712
713 # Basically loop through and unescape_html the values
714 foreach my $k (keys %$doc_rec) {
715 my @escaped_v = ();
716 foreach my $v (@{$doc_rec->{$k}}) {
717 my $ev = &ghtml::unescape_html($v);
718 push(@escaped_v, $ev);
719 }
720 $doc_rec->{$k} = \@escaped_v;
721 }
722
723 # Obtain the specified metadata value
724 $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/));
725 my $metavalue = $doc_rec->{$metaname}->[$metapos];
726 $gsdl_cgi->generate_ok_message("$metavalue");
727
728}
729
730
731sub get_import_metadata
732{
733 my $self = shift @_;
734
735 my $username = $self->{'username'};
736 my $collect = $self->{'collect'};
737 my $gsdl_cgi = $self->{'gsdl_cgi'};
738 my $gsdlhome = $self->{'gsdlhome'};
739
740 # Obtain the collect dir
741 my $site = $self->{'site'};
742 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
743 ##my $collect_dir = &util::filename_cat($gsdlhome, "collect");
744
745 # look up additional args
746 my $docid = $self->{'d'};
747 my $metaname = $self->{'metaname'};
748 my $metapos = $self->{'metapos'};
749 $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/)); # gets the first value by default since metapos defaults to 0
750
751 my $infodbtype = $self->{'infodbtype'};
752 if (!defined $docid)
753 {
754 $gsdl_cgi->generate_error("No docid (d=...) specified.\n");
755 }
756
757 # Obtain where the metadata.xml is from the archiveinfo-doc.gdb file
758 # If the doc oid is not specified, we assume the metadata.xml is next to the specified "f"
759 my $metadata_xml_file;
760 my $import_filename = undef;
761
762
763 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
764 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
765 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
766
767 # This now stores the full pathname
768 $import_filename = $doc_rec->{'src-file'}->[0];
769 $import_filename = &util::placeholders_to_abspath($import_filename);
770
771 # figure out correct metadata.xml file [?]
772 # Assuming the metadata.xml file is next to the source file
773 # Note: This will not work if it is using the inherited metadata from the parent folder
774 my ($import_tailname, $import_dirname) = File::Basename::fileparse($import_filename);
775 my $metadata_xml_filename = &util::filename_cat($import_dirname, "metadata.xml");
776
777 $gsdl_cgi->generate_ok_message($self->get_metadata_from_metadata_xml($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $import_tailname));
778
779}
780
781sub get_metadata_from_metadata_xml
782{
783 my $self = shift @_;
784 my ($gsdl_cgi, $metadata_xml_filename, $metaname, $metapos, $src_file) = @_;
785
786 my @rules =
787 (
788 _default => 'raw',
789 'Metadata' => \&gfmxml_metadata,
790 'FileName' => \&mxml_filename
791 );
792
793 my $parser = XML::Rules->new
794 (
795 rules => \@rules,
796 output_encoding => 'utf8'
797 );
798
799 my $xml_in = "";
800 if (!open(MIN,"<$metadata_xml_filename"))
801 {
802 $gsdl_cgi->generate_error("Unable to read in $metadata_xml_filename: $!");
803 }
804 else
805 {
806 # Read them in
807 my $line;
808 while (defined ($line=<MIN>)) {
809 $xml_in .= $line;
810 }
811 close(MIN);
812
813 $parser->parse($xml_in, {metaname => $metaname, metapos => $metapos, src_file => $src_file});
814
815 if(defined $parser->{'pad'}->{'metavalue'})
816 {
817 return $parser->{'pad'}->{'metavalue'};
818 }
819 else
820 {
821 return "";
822 }
823 }
824}
825
826sub gfmxml_metadata
827{
828 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
829
830 # no subsection support yet in metadata.xml
831
832 if (($parser->{'parameters'}->{'src_file'} eq $parser->{'parameters'}->{'current_file'}) && $parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'})
833 {
834 if (!defined $parser->{'parameters'}->{'poscount'})
835 {
836 $parser->{'parameters'}->{'poscount'} = 0;
837 }
838 else
839 {
840 $parser->{'parameters'}->{'poscount'}++;
841 }
842
843 # gets the first value by default, since metapos defaults to 0
844 if (($parser->{'parameters'}->{'poscount'} == $parser->{'parameters'}->{'metapos'}))
845 {
846 if($parser->{'parameters'}->{'metapos'} > 0) {
847 print STDERR "@@@@ WARNING: non-zero metapos.\n";
848 print STDERR "@@@@ Assuming SIMPLE collection and proceeding to retrieve the import meta at position: ".$parser->{'parameters'}->{'metapos'}.".\n";
849 }
850 $parser->{'pad'}->{'metavalue'} = $attrHash->{'_content'};
851 }
852 }
853}
854
855sub mxml_filename
856{
857 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
858
859 # Store the filename of the Current Fileset
860 # Note: According to http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd
861 # FileName tag must come before Description tag
862 $parser->{'parameters'}->{'current_file'} = $attrHash->{'_content'};
863
864 # RAW is [$tagname => $attrHash] not $tagname => $attrHash!!
865 return [$tagname => $attrHash];
866}
867
868sub get_archives_metadata
869{
870 my $self = shift @_;
871
872 my $username = $self->{'username'};
873 my $collect = $self->{'collect'};
874 my $gsdl_cgi = $self->{'gsdl_cgi'};
875# my $gsdlhome = $self->{'gsdlhome'};
876 my $infodbtype = $self->{'infodbtype'};
877
878 # Obtain the collect dir
879 my $site = $self->{'site'};
880 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
881
882 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
883
884 # look up additional args
885 my ($docid, $docid_secnum) = ($self->{'d'} =~ m/^(.*?)(\..*)?$/);
886 $docid_secnum = "" if (!defined $docid_secnum);
887
888 my $metaname = $self->{'metaname'};
889 my $metapos = $self->{'metapos'};
890 $metapos = 0 if (!defined $metapos || ($metapos =~ m/^\s*$/));
891
892 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
893 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
894
895 # This now stores the full pathname
896 my $doc_filename = $doc_rec->{'doc-file'}->[0];
897
898 $gsdl_cgi->generate_ok_message($self->get_metadata_from_archive_xml($gsdl_cgi, &util::filename_cat($archive_dir, $doc_filename), $metaname, $metapos, $docid_secnum));
899
900}
901
902# unused and untested
903sub get_archives_text
904{
905 my $self = shift @_;
906
907 my $username = $self->{'username'};
908 my $collect = $self->{'collect'};
909 my $gsdl_cgi = $self->{'gsdl_cgi'};
910# my $gsdlhome = $self->{'gsdlhome'};
911 my $infodbtype = $self->{'infodbtype'};
912
913 # Obtain the collect dir
914 my $site = $self->{'site'};
915 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
916
917 my $archive_dir = &util::filename_cat($collect_dir, $collect, "archives");
918
919 # look up additional args
920 my ($docid, $docid_secnum) = ($self->{'d'} =~ m/^(.*?)(\..*)?$/);
921 $docid_secnum = "" if (!defined $docid_secnum);
922
923 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
924 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
925
926 # This now stores the full pathname
927 my $doc_filename = $doc_rec->{'doc-file'}->[0];
928
929 my $metaname = undef;
930 my $metapos = -1;
931
932 $gsdl_cgi->generate_ok_message($self->get_metadata_from_archive_xml($gsdl_cgi, &util::filename_cat($archive_dir, $doc_filename), $metaname, $metapos, $docid_secnum));
933
934}
935
936sub get_metadata_from_archive_xml
937{
938 my $self = shift @_;
939 my ($gsdl_cgi, $doc_xml_filename, $metaname, $metapos, $secid) = @_;
940
941 my @start_rules = ('Section' => \&dxml_start_section);
942
943 my @rules =
944 (
945 _default => 'raw',
946 'Metadata' => \&gfdxml_metadata
947 );
948
949 my $parser = XML::Rules->new
950 (
951 start_rules => \@start_rules,
952 rules => \@rules,
953 output_encoding => 'utf8'
954 );
955
956 my $xml_in = "";
957 if (!open(MIN,"<$doc_xml_filename"))
958 {
959 $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
960 }
961 else
962 {
963 # Read them in
964 my $line;
965 while (defined ($line=<MIN>)) {
966 $xml_in .= $line;
967 }
968 close(MIN);
969
970 $parser->parse($xml_in, {metaname => $metaname, metapos => $metapos, secid => $secid});
971
972 if(defined $parser->{'pad'}->{'metavalue'})
973 {
974 return $parser->{'pad'}->{'metavalue'};
975 }
976 else
977 {
978 return "";
979 }
980 }
981}
982
983# unused and untested
984sub get_text_from_archive_xml
985{
986 my $self = shift @_;
987 my ($gsdl_cgi, $doc_xml_filename, $secid) = @_;
988
989 # To monitor which section/subsection number we are in
990 my @start_rules = ('Section' => \&dxml_start_section);
991
992 # set the callback functions for the elements in doc.xml we're interested in, <Content>
993 my @rules =
994 (
995 _default => 'raw',
996 'Content' => \&gfdxml_text # gfdxml = get from doc xml?
997 );
998
999 my $parser = XML::Rules->new
1000 (
1001 start_rules => \@start_rules,
1002 rules => \@rules,
1003 output_encoding => 'utf8'
1004 );
1005
1006 my $xml_in = "";
1007 if (!open(MIN,"<$doc_xml_filename"))
1008 {
1009 $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
1010 }
1011 else
1012 {
1013 # Read them in
1014 my $line;
1015 while (defined ($line=<MIN>)) {
1016 $xml_in .= $line;
1017 }
1018 close(MIN);
1019
1020 $parser->parse($xml_in, {secid => $secid});
1021
1022 if(defined $parser->{'pad'}->{'textcontent'})
1023 {
1024 return $parser->{'pad'}->{'textcontent'};
1025 }
1026 else
1027 {
1028 return "";
1029 }
1030 }
1031}
1032
1033sub gfdxml_metadata
1034{
1035 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1036
1037 if(!($parser->{'parameters'}->{'secid'} eq $parser->{'parameters'}->{'curr_section_num'}))
1038 {
1039 return;
1040 }
1041
1042 if ($parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'})
1043 {
1044 if (!defined $parser->{'parameters'}->{'poscount'})
1045 {
1046 $parser->{'parameters'}->{'poscount'} = 0;
1047 }
1048 else
1049 {
1050 $parser->{'parameters'}->{'poscount'}++;
1051 }
1052 }
1053
1054 if (($parser->{'parameters'}->{'metaname'} eq $attrHash->{'name'}) && ($parser->{'parameters'}->{'poscount'} == $parser->{'parameters'}->{'metapos'}))
1055 {
1056 $parser->{'pad'}->{'metavalue'} = $attrHash->{'_content'};
1057 }
1058}
1059
1060# unused and untested - for get_archives_text
1061sub gfdxml_text
1062{
1063 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1064
1065 if($parser->{'parameters'}->{'secid'} eq $parser->{'parameters'}->{'curr_section_num'})
1066 {
1067 $parser->{'pad'}->{'textcontent'} = $attrHash->{'_content'}; # the textnode content
1068
1069 }
1070 else {
1071 return;
1072 }
1073}
1074
1075sub dxml_start_section
1076{
1077 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
1078
1079 my $new_depth = scalar(@$contextArray);
1080
1081# print STDERR "**** START SECTION \n";
1082
1083 if ($new_depth == 1) {
1084 $parser->{'parameters'}->{'curr_section_depth'} = 1;
1085 $parser->{'parameters'}->{'curr_section_num'} = "";
1086 }
1087
1088 my $old_depth = $parser->{'parameters'}->{'curr_section_depth'};
1089 my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};
1090
1091 my $new_secnum;
1092
1093 if ($new_depth > $old_depth) {
1094 # child subsection
1095 $new_secnum = "$old_secnum.1";
1096 }
1097 elsif ($new_depth == $old_depth) {
1098 # sibling section => increase its value by 1
1099 my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
1100 $tail_num++;
1101 $new_secnum = $old_secnum;
1102 $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
1103 }
1104 else {
1105 # back up to parent section => lop off tail
1106 $new_secnum = $old_secnum;
1107 $new_secnum =~ s/\.\d+$//;
1108 # remember to increment from parent section
1109 my ($tail_num) = ($new_secnum =~ m/\.(\d+)$/);
1110 $tail_num++;
1111 $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
1112 }
1113
1114 $parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
1115 $parser->{'parameters'}->{'curr_section_num'} = $new_secnum;
1116
1117 #print STDERR "**** START SECTION - $new_secnum\n************************\n\n";
1118 1;
1119}
1120
11211;
Note: See TracBrowser for help on using the repository browser.