source: main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm@ 28268

Last change on this file since 28268 was 28268, checked in by davidb, 11 years ago

Existing doc.xml maked with R for reindex, new file I to index

File size: 13.0 KB
Line 
1##########################################################################
2#
3# docextractaction.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2009 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package docextractaction;
27
28use strict;
29
30use cgiactions::baseaction;
31
32use dbutil;
33use ghtml;
34
35use JSON;
36
37
38BEGIN {
39 require XML::Rules;
40}
41
42@docextractaction::ISA = ('baseaction');
43
44my $action_table =
45{
46 "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
47 'compulsory-args' => [ "d", "json-sections" ],
48 'optional-args' => [ "json-metadata", "newd",
49 "keep-parent-metadata", "keep-parent-content" ],
50# 'optional-args' => [ "where" ],
51 'help-string' => [
52 'document-extract.pl?a=extract-archives-doc&c=demo&d=HASH0123456789ABC&json-sections=["1.2","1.3.2","2.1","2.2"]&json-metadata=[{"metaname":"dc.Title","metavalue":"All Black Rugy Success","metamode":"accumulate"]'
53 ."\n\n Add site=xxxx if a Greenstone3 installation"
54 ]}
55
56};
57
58
59sub new
60{
61 my $class = shift (@_);
62 my ($gsdl_cgi,$iis6_mode) = @_;
63
64 # Treat metavalue specially. To transmit this through a GET request
65 # the Javascript side has url-encoded it, so here we need to decode
66 # it before proceeding
67
68# my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
69# my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
70# my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);
71
72# $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
73# $gsdl_cgi->param("metavalue",$url_decoded_metavalue);
74
75 my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);
76
77 return bless $self, $class;
78}
79
80
81
82sub dxml_start_section
83{
84 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
85
86 my $new_depth = scalar(@$contextArray);
87
88 if ($new_depth == 1) {
89 $parser->{'parameters'}->{'curr_section_depth'} = 1;
90 $parser->{'parameters'}->{'curr_section_num'} = "";
91 }
92
93 my $old_depth = $parser->{'parameters'}->{'curr_section_depth'};
94 my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};
95
96 my $new_secnum;
97
98 if ($new_depth > $old_depth) {
99 # first child subsection
100 $new_secnum = "$old_secnum.1";
101 }
102 elsif ($new_depth == $old_depth) {
103 # sibling section => increase it's value by 1
104 my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
105 $tail_num++;
106 $new_secnum = $old_secnum;
107 $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
108 }
109 else {
110 # back up to parent section => lopp off tail
111# $new_secnum = $old_secnum;
112# $new_secnum =~ s/\.\d+$//;
113 }
114
115 $parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
116 $parser->{'parameters'}->{'curr_section_num'} = $new_secnum;
117
118 1;
119}
120
121
122
123sub dxml_metadata
124{
125 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
126
127 my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};
128
129 my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
130 my $keep_parent_content = $parser->{'parameters'}->{'keep_parent_content'};
131
132 my $mode = $parser->{'parameters'}->{'mode'};
133
134 if ($mode eq "extract") {
135
136 my $new_docid = $parser->{'parameters'}->{'new_docid'};
137 if ($attrHash->{'name'} eq "Identifier") {
138 $attrHash->{'_content'} = $new_docid;
139 }
140 }
141
142 return [ $tagname => $attrHash ];
143}
144
145
146sub dxml_section
147{
148 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
149
150 my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef;
151
152 my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'};
153 my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};
154
155 my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
156 my $keep_parent_content = $parser->{'parameters'}->{'keep_parent_content'};
157
158 my $mode = $parser->{'parameters'}->{'mode'};
159
160 my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'};
161 my $live_depth = scalar(@$contextArray);
162
163 if ($live_depth < $prev_depth) {
164 # In a closing-sections poping off situation:
165 # </Section>
166 # </Section>
167
168 # => Back up to parent section => lopp off tail
169
170 $curr_sec_num =~ s/\.\d+$//;
171 $parser->{'parameters'}->{'curr_section_depth'} = $live_depth;
172 $parser->{'parameters'}->{'curr_section_num'} = $curr_sec_num;
173 }
174
175
176 if ($live_depth == 1) {
177 # root sectin tag, which must always exist
178 return [$tagname => $attrHash];
179 }
180 elsif ($mode eq "delete") {
181 if (defined $sec_num_hash->{$curr_sec_num}) {
182 # remove it
183 return undef
184 }
185 else {
186 # keep it
187 return [$tagname => $attrHash];
188 }
189 }
190 else {
191 # mode is extract
192
193 if (defined $sec_num_hash->{$curr_sec_num}) {
194 # keep it
195## print STDERR "**** Asked to keep: sec num = $curr_sec_num\n";
196
197 return [$tagname => $attrHash];
198 }
199 elsif (defined $parent_sec_num_hash->{$curr_sec_num}) {
200 # want this element, but cut down to just the child <Section>
201
202 my $section_child = undef;
203
204## print STDERR "**** Parent match: sec num = $curr_sec_num\n";
205
206 my $filtered_elems = [];
207
208 foreach my $elem ( @{$attrHash->{'_content'}}) {
209 if (ref $elem eq "ARRAY") {
210 my $child_tagname = $elem->[0];
211## print STDERR "***## elem name $child_tagname\n";
212
213
214 if ($child_tagname eq "Description") {
215 if ($keep_parent_metadata) {
216 push(@$filtered_elems,$elem);
217 }
218 }
219 elsif ($child_tagname eq "Content") {
220 if ($keep_parent_content) {
221 push(@$filtered_elems,$elem);
222 }
223 }
224 else {
225 push(@$filtered_elems,$elem);
226 }
227 }
228 else {
229 push(@$filtered_elems,$elem);
230 }
231 }
232
233 $attrHash->{'_content'} = $filtered_elems;
234
235 return [$tagname => $attrHash];
236
237 }
238 else {
239 # not in our list => remove it
240 return undef;
241 }
242 }
243}
244
245
246sub remove_from_doc_xml
247{
248 my $self = shift @_;
249 my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename,
250 $sec_num_hash, $parent_sec_num_hash, $mode) = @_;
251
252 my @start_rules = ('Section' => \&dxml_start_section);
253
254 # Set the call-back functions for the metadata tags
255 my @rules =
256 (
257 _default => 'raw',
258 'Section' => \&dxml_section,
259 'Metadata' => \&dxml_metadata
260 );
261
262 my $parser = XML::Rules->new
263 (
264 start_rules => \@start_rules,
265 rules => \@rules,
266 style => 'filter',
267 output_encoding => 'utf8',
268## normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm
269# stripspaces => 2|0|0 # ineffectual
270 );
271
272 my $status = 0;
273 my $xml_in = "";
274 if (!open(MIN,"<$doc_xml_filename"))
275 {
276 $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
277 $status = 1;
278 }
279 else
280 {
281 # Read them in
282 my $line;
283 while (defined ($line=<MIN>)) {
284 $xml_in .= $line;
285 }
286 close(MIN);
287
288 # Filter with the call-back functions
289 my $xml_out = "";
290
291 my $MOUT;
292 if (!open($MOUT,">$newdoc_xml_filename")) {
293 $gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
294 $status = 1;
295 }
296 else {
297 binmode($MOUT,":utf8");
298
299 my $options = { sec_num_hash => $sec_num_hash,
300 parent_sec_num_hash => $parent_sec_num_hash,
301 keep_parent_metadata => $self->{'keep-parent-metadata'},
302 keep_parent_content => $self->{'keep-parent-content'},
303 new_docid => $self->{'new_docid'},
304 mode => $mode };
305
306 $parser->filter($xml_in, $MOUT, $options);
307 close($MOUT);
308 }
309 }
310 return $status;
311}
312
313sub sections_as_hash
314{
315 my $self = shift @_;
316
317 my ($json_sections_array) = @_;
318
319 my $sec_num_hash = {};
320
321 foreach my $sn ( @$json_sections_array ) {
322
323 # our XML parser curr_sec_num puts '.' at the root
324 # Need to do the same here, so things can be matched up
325 $sec_num_hash->{".$sn"} = 1;
326 }
327
328 return $sec_num_hash;
329}
330
331
332sub parent_sections_as_hash
333{
334 my $self = shift @_;
335
336 my ($json_sections_array) = @_;
337
338 my $sec_num_hash = {};
339
340 foreach my $sn ( @$json_sections_array ) {
341
342 # needs to make a copy, otherwise version stored in json_sections gets changed
343 my $sn_copy = $sn;
344 while ($sn_copy =~ s/\.\d+$//) {
345 # our XML parser curr_sec_num puts '.' at the root
346 # Need to do the same here, so things can be matched up
347
348 $sec_num_hash->{".$sn_copy"} = 1;
349 }
350 }
351
352 return $sec_num_hash;
353}
354
355sub parse_flag
356{
357 my $self = shift @_;
358
359 my ($arg_name) = @_;
360
361 my $flag = $self->{$arg_name} || 0;
362
363 $flag =~ s/^true/1/i;
364 $flag =~ s/^false/0/i;
365
366 return $flag;
367}
368
369sub _extract_archives_doc
370{
371 my $self = shift @_;
372
373 my $collect = $self->{'collect'};
374 my $gsdl_cgi = $self->{'gsdl_cgi'};
375 my $infodbtype = $self->{'infodbtype'};
376
377 my $site = $self->{'site'};
378
379 # Obtain the collect and archive dir
380 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
381
382 my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");
383
384 # look up additional args
385 my $docid = $self->{'d'};
386
387 my $timestamp = time();
388 my $new_docid = $self->{'newd'} || "HASH$timestamp";
389 $self->{'new_docid'} = $new_docid;
390
391 $self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata");
392 $self->{'keep-parent-content'} = $self->parse_flag("keep-parent-content");
393
394 my $json_sections_str = $self->{'json-sections'};
395 my $json_sections_array = decode_json($json_sections_str);
396
397
398 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
399 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
400
401 my $doc_file = $doc_rec->{'doc-file'}->[0];
402 my $doc_filename = &util::filename_cat($archive_dir, $doc_file);
403
404 my $new_doc_file = $doc_file;
405 $new_doc_file =~ s/doc(-\d+)?.xml$/doc-$timestamp.xml/;
406
407 my $newdoc_filename = &util::filename_cat($archive_dir, $new_doc_file);
408
409 my $sec_num_hash = $self->sections_as_hash($json_sections_array);
410 my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array);
411
412 my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract");
413
414 if ($extract_status == 0)
415 {
416 my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");
417
418 my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $sec_num_hash, undef, "delete");
419
420 if ($delete_status == 0) {
421
422 # Existing doc record needs to be reindexed
423 $doc_rec->{'index-status'} = ["R"];
424 &dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid, $doc_rec);
425
426 # Create doc-record entry for the newly extracted document
427
428 my $new_doc_rec = $doc_rec;
429 $new_doc_rec->{'index-status'} = ["I"];
430 #### Need to cut this down to just the assoc files the new document references
431
432 $new_doc_rec->{'doc-file'} = [$new_doc_file];
433
434 &dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $new_docid, $new_doc_rec);
435
436 #### Also need to update the archivesinf-src database!!!!
437 # For all the assoc and src files, retrieve record, and add in new_docid
438
439 my $mess = "document-extract successful: Key[$docid]\n";
440
441 $gsdl_cgi->generate_ok_message($mess);
442 }
443 else {
444 my $mess .= "Failed to extract identified section numbers for key: $docid\n";
445 $mess .= "Exit status: $delete_status\n";
446 $mess .= "System Error Message: $!\n";
447 $mess .= "-" x 20 . "\n";
448
449 $gsdl_cgi->generate_error($mess);
450 }
451 }
452 else
453 {
454 my $mess .= "Failed to remove identified section numbers for key: $docid\n";
455 $mess .= "Exit status: $extract_status\n";
456 $mess .= "System Error Message: $!\n";
457 $mess .= "-" x 20 . "\n";
458
459 $gsdl_cgi->generate_error($mess);
460 }
461
462 #return $status; # in case calling functions have a use for this
463}
464
465
466
467# JSON version that will get the requested metadata values
468# from the requested source (index, import, archives or live)
469# One of the params is a JSON string and the return value is JSON too
470# http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
471sub extract_archives_doc
472{
473 my $self = shift @_;
474
475 my $username = $self->{'username'};
476 my $collect = $self->{'collect'};
477 my $gsdl_cgi = $self->{'gsdl_cgi'};
478
479 if ($baseaction::authentication_enabled)
480 {
481 # Ensure the user is allowed to edit this collection
482 $self->authenticate_user($username, $collect);
483 }
484
485 # Make sure the collection isn't locked by someone else
486 $self->lock_collection($username, $collect);
487
488 $self->_extract_archives_doc(@_);
489
490 # Release the lock once it is done
491 $self->unlock_collection($username, $collect);
492
493
494}
495
496
4971;
Note: See TracBrowser for help on using the repository browser.