source: main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm@ 28261

Last change on this file since 28261 was 28261, checked in by davidb, 11 years ago

This action now saves the newly created document in the same archives folder as the original, and updates the archiveinf-doc database appropriately

File size: 12.2 KB
##########################################################################
#
# docextractaction.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2009 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package docextractaction;

use strict;

use cgiactions::baseaction;

use dbutil;
use ghtml;

use JSON;


BEGIN {
    require XML::Rules;
}

@docextractaction::ISA = ('baseaction');

my $action_table =
{
    "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
        'compulsory-args' => [ "d", "json-sections" ],
        'optional-args'   => [ "json-metadata", "newd",
                               "keep-parent-metadata", "keep-parent-content" ],
#       'optional-args'   => [ "where" ],
        'help-string' => [
            'document-extract.pl?a=extract-archives-doc&c=demo&d=HASH0123456789ABC&json-sections=["1.2","1.3.2","2.1","2.2"]&json-metadata=[{"metaname":"dc.Title","metavalue":"All Black Rugby Success","metamode":"accumulate"}]'
            ."\n\n Add site=xxxx if a Greenstone3 installation"
        ]}

};
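
# Illustrative sketch only (added comment, values are made up): once the CGI
# arguments above have been parsed, a typical extract-archives-doc request
# boils down to values along these lines:
#
#   d                      "HASH0123456789ABC"   # docid of the source document
#   json-sections          '["1.2","1.3.2"]'     # decoded below with decode_json()
#   newd                   "HASH1385683200"      # optional; defaults to "HASH" . time()
#   keep-parent-metadata   "true"                # normalised to 1/0 by parse_flag()
#   keep-parent-content    "false"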


sub new
{
    my $class = shift (@_);
    my ($gsdl_cgi,$iis6_mode) = @_;

    # Treat metavalue specially. To transmit this through a GET request
    # the Javascript side has url-encoded it, so here we need to decode
    # it before proceeding

#   my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
#   my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
#   my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);

#   $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
#   $gsdl_cgi->param("metavalue",$url_decoded_metavalue);

    my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);

    return bless $self, $class;
}


sub dxml_start_section
{
    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;

    my $new_depth = scalar(@$contextArray);

    if ($new_depth == 1) {
        $parser->{'parameters'}->{'curr_section_depth'} = 1;
        $parser->{'parameters'}->{'curr_section_num'}   = "";
    }

    my $old_depth  = $parser->{'parameters'}->{'curr_section_depth'};
    my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};

    my $new_secnum;

    if ($new_depth > $old_depth) {
        # first child subsection
        $new_secnum = "$old_secnum.1";
    }
    elsif ($new_depth == $old_depth) {
        # sibling section => increase its value by 1
        my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
        $tail_num++;
        $new_secnum = $old_secnum;
        $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
    }
    else {
        # back up to parent section => lop off the tail
#       $new_secnum = $old_secnum;
#       $new_secnum =~ s/\.\d+$//;
    }

    $parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
    $parser->{'parameters'}->{'curr_section_num'}   = $new_secnum;

    1;
}
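
# Illustrative numbering sketch (added comment): starting from the root
# <Section> (depth 1, curr_section_num ""), the first child Section becomes
# ".1", its first child ".1.1", the next sibling at that depth ".1.2", and
# so on.  Popping back up to a shallower depth is finished off by the
# end-tag handler dxml_section below.  So a section listed as "1.3.2" in
# the json-sections argument corresponds to curr_section_num ".1.3.2" here.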


sub dxml_section
{
    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;

    my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef;

    my $sec_num_hash        = $parser->{'parameters'}->{'sec_num_hash'};
    my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};

    my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
    my $keep_parent_content  = $parser->{'parameters'}->{'keep_parent_content'};

    my $mode = $parser->{'parameters'}->{'mode'};

    my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'};
    my $live_depth = scalar(@$contextArray);

    if ($live_depth < $prev_depth) {
        # In a closing-sections popping-off situation:
        # </Section>
        # </Section>

        # => Back up to parent section => lop off the tail

        $curr_sec_num =~ s/\.\d+$//;
        $parser->{'parameters'}->{'curr_section_depth'} = $live_depth;
        $parser->{'parameters'}->{'curr_section_num'}   = $curr_sec_num;
    }


    if ($live_depth == 1) {
        # root Section tag, which must always exist
        return [$tagname => $attrHash];
    }
    elsif ($mode eq "delete") {
        if (defined $sec_num_hash->{$curr_sec_num}) {
            # remove it
            return undef;
        }
        else {
            # keep it
            return [$tagname => $attrHash];
        }
    }
    else {
        # mode is extract

        if (defined $sec_num_hash->{$curr_sec_num}) {
            # keep it
##          print STDERR "**** Asked to keep: sec num = $curr_sec_num\n";

            return [$tagname => $attrHash];
        }
        elsif (defined $parent_sec_num_hash->{$curr_sec_num}) {
            # want this element, but cut it down to just the child <Section>

            my $section_child = undef;

##          print STDERR "**** Parent match: sec num = $curr_sec_num\n";

            my $filtered_elems = [];

            foreach my $elem ( @{$attrHash->{'_content'}} ) {
                if (ref $elem eq "ARRAY") {
                    my $child_tagname = $elem->[0];
##                  print STDERR "***## elem name $child_tagname\n";


                    if ($child_tagname eq "Description") {
                        if ($keep_parent_metadata) {
                            push(@$filtered_elems,$elem);
                        }
                    }
                    elsif ($child_tagname eq "Content") {
                        if ($keep_parent_content) {
                            push(@$filtered_elems,$elem);
                        }
                    }
                    else {
                        push(@$filtered_elems,$elem);
                    }
                }
                else {
                    push(@$filtered_elems,$elem);
                }
            }

            $attrHash->{'_content'} = $filtered_elems;

            return [$tagname => $attrHash];

        }
        else {
            # not in our list => remove it
            return undef;
        }
    }
}
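
# Added note (illustrative): in "extract" mode a parent Section that is not
# itself listed in json-sections is kept only as a frame for its extracted
# children -- its <Description> (metadata) and <Content> children are dropped
# unless keep-parent-metadata / keep-parent-content were passed as true.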


sub remove_from_doc_xml
{
    my $self = shift @_;
    my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename,
        $sec_num_hash, $parent_sec_num_hash, $mode) = @_;

    my @start_rules = ('Section' => \&dxml_start_section);

    # Set the call-back functions for the Section tags
    my @rules =
        (
          _default  => 'raw',
          'Section' => \&dxml_section
        );

    my $parser = XML::Rules->new
        (
          start_rules     => \@start_rules,
          rules           => \@rules,
          style           => 'filter',
          output_encoding => 'utf8',
##        normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm
#         stripspaces => 2|0|0 # ineffectual
        );

    my $status = 0;
    my $xml_in = "";
    if (!open(MIN,"<$doc_xml_filename"))
    {
        $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
        $status = 1;
    }
    else
    {
        # Read the file in
        my $line;
        while (defined ($line=<MIN>)) {
            $xml_in .= $line;
        }
        close(MIN);

        # Filter with the call-back functions
        my $xml_out = "";

        my $MOUT;
        if (!open($MOUT,">$newdoc_xml_filename")) {
            $gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
            $status = 1;
        }
        else {
            binmode($MOUT,":utf8");

            my $options = { sec_num_hash         => $sec_num_hash,
                            parent_sec_num_hash  => $parent_sec_num_hash,
                            keep_parent_metadata => $self->{'keep-parent-metadata'},
                            keep_parent_content  => $self->{'keep-parent-content'},
                            mode                 => $mode };

            $parser->filter($xml_in, $MOUT, $options);
            close($MOUT);
        }
    }
    return $status;
}
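
# Usage sketch (added comment): the same filter is run twice by
# _extract_archives_doc() below -- first in "extract" mode, writing the
# chosen sections (plus their parent framing) out to a new doc XML file,
# then in "delete" mode, rewriting the original doc XML file in place with
# those sections removed.  A non-zero return value signals an I/O failure.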

sub sections_as_hash
{
    my $self = shift @_;

    my ($json_sections_array) = @_;

    my $sec_num_hash = {};

    foreach my $sn ( @$json_sections_array ) {

        # our XML parser's curr_sec_num puts '.' at the root
        # Need to do the same here, so things can be matched up
        $sec_num_hash->{".$sn"} = 1;
    }

    return $sec_num_hash;
}
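
# Illustrative example (added comment): sections_as_hash(["1.2","1.3.2"])
# returns { ".1.2" => 1, ".1.3.2" => 1 }, matching the '.'-rooted numbering
# produced by dxml_start_section above.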


sub parent_sections_as_hash
{
    my $self = shift @_;

    my ($json_sections_array) = @_;

    my $sec_num_hash = {};

    foreach my $sn ( @$json_sections_array ) {

        # need to make a copy, otherwise the version stored in json_sections gets changed
        my $sn_copy = $sn;
        while ($sn_copy =~ s/\.\d+$//) {
            # our XML parser's curr_sec_num puts '.' at the root
            # Need to do the same here, so things can be matched up

            $sec_num_hash->{".$sn_copy"} = 1;
        }
    }

    return $sec_num_hash;
}
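
# Illustrative example (added comment): parent_sections_as_hash(["1.3.2"])
# returns { ".1.3" => 1, ".1" => 1 } -- every ancestor of the requested
# section, again with the leading '.' added so it matches curr_section_num.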

sub parse_flag
{
    my $self = shift @_;

    my ($arg_name) = @_;

    my $flag = $self->{$arg_name} || 0;

    $flag =~ s/^true/1/i;
    $flag =~ s/^false/0/i;

    return $flag;
}
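
# Illustrative behaviour (added comment): parse_flag() maps a CGI flag such
# as "true"/"True" to 1 and "false"/"False" to 0; an absent argument also
# comes back as 0.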

sub _extract_archives_doc
{
    my $self = shift @_;

    my $collect    = $self->{'collect'};
    my $gsdl_cgi   = $self->{'gsdl_cgi'};
    my $infodbtype = $self->{'infodbtype'};

    my $site = $self->{'site'};

    # Obtain the collect and archive dir
    my $collect_dir = $gsdl_cgi->get_collection_dir($site);

    my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");

    # look up additional args
    my $docid = $self->{'d'};

    my $timestamp = time();
    my $new_docid = $self->{'newd'} || "HASH$timestamp";

    $self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata");
    $self->{'keep-parent-content'}  = $self->parse_flag("keep-parent-content");

    my $json_sections_str   = $self->{'json-sections'};
    my $json_sections_array = decode_json($json_sections_str);


    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);

    my $doc_file     = $doc_rec->{'doc-file'}->[0];
    my $doc_filename = &util::filename_cat($archive_dir, $doc_file);

    my $new_doc_file = $doc_file;
    $new_doc_file =~ s/doc(-\d+)?\.xml$/doc-$timestamp.xml/;

    my $newdoc_filename = &util::filename_cat($archive_dir, $new_doc_file);

    my $sec_num_hash        = $self->sections_as_hash($json_sections_array);
    my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array);

    my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename, $newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract");

    if ($extract_status == 0)
    {
        my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");

        my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename, $doc_filename, $sec_num_hash, undef, "delete");

        if ($delete_status == 0) {

            # Existing doc record needs to be reindexed
            $doc_rec->{'index-status'} = ["I"];
            &dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid, $doc_rec);

            # Create doc-record entry for the newly extracted document

            my $new_doc_rec = $doc_rec;
            #### Need to cut this down to just the assoc files the new document references

            &dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $new_docid, $new_doc_rec);

            #### Also need to update the archiveinf-src database!!!!
            # For all the assoc and src files, retrieve record, and add in new_docid

            my $mess = "document-extract successful: Key[$docid]\n";

            $gsdl_cgi->generate_ok_message($mess);
        }
        else {
            my $mess = "Failed to remove identified section numbers for key: $docid\n";
            $mess .= "Exit status: $delete_status\n";
            $mess .= "System Error Message: $!\n";
            $mess .= "-" x 20 . "\n";

            $gsdl_cgi->generate_error($mess);
        }
    }
    else
    {
        my $mess = "Failed to extract identified section numbers for key: $docid\n";
        $mess .= "Exit status: $extract_status\n";
        $mess .= "System Error Message: $!\n";
        $mess .= "-" x 20 . "\n";

        $gsdl_cgi->generate_error($mess);
    }

    #return $status; # in case calling functions have a use for this
}
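
# Added note (illustrative filenames, assuming an archiveinf-doc entry whose
# doc-file is "HASH0123/doc.xml"): a successful run leaves the original
# archives/HASH0123/doc.xml rewritten in place without the requested
# sections, a new archives/HASH0123/doc-<timestamp>.xml holding just those
# sections, and two archiveinf-doc records -- the original docid flagged for
# reindexing ("I") and a new record keyed by newd (or HASH<timestamp>).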



# Public entry point for the extract-archives-doc action.  One of the
# params (json-sections) is a JSON string.  This wrapper handles
# authentication and collection locking, then delegates the real work to
# _extract_archives_doc() above.
sub extract_archives_doc
{
    my $self = shift @_;

    my $username = $self->{'username'};
    my $collect  = $self->{'collect'};
    my $gsdl_cgi = $self->{'gsdl_cgi'};

    if ($baseaction::authentication_enabled)
    {
        # Ensure the user is allowed to edit this collection
        $self->authenticate_user($username, $collect);
    }

    # Make sure the collection isn't locked by someone else
    $self->lock_collection($username, $collect);

    $self->_extract_archives_doc(@_);

    # Release the lock once it is done
    $self->unlock_collection($username, $collect);


}


1;