source: main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm@ 28251

Last change on this file since 28251 was 28251, checked in by davidb, 11 years ago

Change to calculation of curr_sec_num so it correctly pops off values as a series of </Section> tags are encountered

File size: 11.7 KB
Line 
1##########################################################################
2#
3# docextractaction.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2009 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package docextractaction;
27
28use strict;
29
30use cgiactions::baseaction;
31
32use dbutil;
33use ghtml;
34
35use JSON;
36
37
38BEGIN {
39 require XML::Rules;
40}
41
42@docextractaction::ISA = ('baseaction');
43
44my $action_table =
45{
46 "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
47 'compulsory-args' => [ "d", "json-sections" ],
48 'optional-args' => [ "json-metadata", "newd",
49 "keep-parent-metadata", "keep-parent-content" ],
50# 'optional-args' => [ "where" ],
51 'help-string' => [
52 'document-extract.pl?a=extract-archives-doc&c=demo&d=HASH0123456789ABC&json-sections=["1.2","1.3.2","2.1","2.2"]&json-metadata=[{"metaname":"dc.Title","metavalue":"All Black Rugy Success","metamode":"accumulate"]'
53 ."\n\n Add site=xxxx if a Greenstone3 installation"
54 ]}
55
56};
57
58
59sub new
60{
61 my $class = shift (@_);
62 my ($gsdl_cgi,$iis6_mode) = @_;
63
64 # Treat metavalue specially. To transmit this through a GET request
65 # the Javascript side has url-encoded it, so here we need to decode
66 # it before proceeding
67
68# my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
69# my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
70# my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);
71
72# $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
73# $gsdl_cgi->param("metavalue",$url_decoded_metavalue);
74
75 my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);
76
77 return bless $self, $class;
78}
79
80
81
82sub dxml_start_section
83{
84 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
85
86 my $new_depth = scalar(@$contextArray);
87
88 if ($new_depth == 1) {
89 $parser->{'parameters'}->{'curr_section_depth'} = 1;
90 $parser->{'parameters'}->{'curr_section_num'} = "";
91 }
92
93 my $old_depth = $parser->{'parameters'}->{'curr_section_depth'};
94 my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};
95
96 my $new_secnum;
97
98 if ($new_depth > $old_depth) {
99 # first child subsection
100 $new_secnum = "$old_secnum.1";
101 }
102 elsif ($new_depth == $old_depth) {
103 # sibling section => increase it's value by 1
104 my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
105 $tail_num++;
106 $new_secnum = $old_secnum;
107 $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
108 }
109 else {
110 # back up to parent section => lopp off tail
111# $new_secnum = $old_secnum;
112# $new_secnum =~ s/\.\d+$//;
113 }
114
115 $parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
116 $parser->{'parameters'}->{'curr_section_num'} = $new_secnum;
117
118 1;
119}
120
121
122sub dxml_section
123{
124 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
125
126 my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef;
127
128 my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'};
129 my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};
130
131 my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
132 my $keep_parent_content = $parser->{'parameters'}->{'keep_parent_content'};
133
134 my $mode = $parser->{'parameters'}->{'mode'};
135
136 my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'};
137 my $live_depth = scalar(@$contextArray);
138
139 if ($live_depth < $prev_depth) {
140 # In a closing-sections poping off situation:
141 # </Section>
142 # </Section>
143
144 # => Back up to parent section => lopp off tail
145
146 $curr_sec_num =~ s/\.\d+$//;
147 $parser->{'parameters'}->{'curr_section_depth'} = $live_depth;
148 $parser->{'parameters'}->{'curr_section_num'} = $curr_sec_num;
149 }
150
151
152 if ($live_depth == 1) {
153 # root sectin tag, which must always exist
154 return [$tagname => $attrHash];
155 }
156 elsif ($mode eq "delete") {
157 if (defined $sec_num_hash->{$curr_sec_num}) {
158 # remove it
159 return undef
160 }
161 else {
162 # keep it
163 return [$tagname => $attrHash];
164 }
165 }
166 else {
167 # mode is extract
168
169 if (defined $sec_num_hash->{$curr_sec_num}) {
170 # keep it
171## print STDERR "**** Asked to keep: sec num = $curr_sec_num\n";
172
173 return [$tagname => $attrHash];
174 }
175 elsif (defined $parent_sec_num_hash->{$curr_sec_num}) {
176 # want this element, but cut down to just the child <Section>
177
178 my $section_child = undef;
179
180## print STDERR "**** Parent match: sec num = $curr_sec_num\n";
181
182 my $filtered_elems = [];
183
184 foreach my $elem ( @{$attrHash->{'_content'}}) {
185 if (ref $elem eq "ARRAY") {
186 my $child_tagname = $elem->[0];
187## print STDERR "***## elem name $child_tagname\n";
188
189
190 if ($child_tagname eq "Description") {
191 if ($keep_parent_metadata) {
192 push(@$filtered_elems,$elem);
193 }
194 }
195 elsif ($child_tagname eq "Content") {
196 if ($keep_parent_content) {
197 push(@$filtered_elems,$elem);
198 }
199 }
200 else {
201 push(@$filtered_elems,$elem);
202 }
203 }
204 else {
205 push(@$filtered_elems,$elem);
206 }
207 }
208
209 $attrHash->{'_content'} = $filtered_elems;
210
211 return [$tagname => $attrHash];
212
213 }
214 else {
215 # not in our list => remove it
216 return undef;
217 }
218 }
219}
220
221
222sub remove_from_doc_xml
223{
224 my $self = shift @_;
225 my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename,
226 $sec_num_hash, $parent_sec_num_hash, $mode) = @_;
227
228 my @start_rules = ('Section' => \&dxml_start_section);
229
230 # Set the call-back functions for the metadata tags
231 my @rules =
232 (
233 _default => 'raw',
234 'Section' => \&dxml_section
235 );
236
237 my $parser = XML::Rules->new
238 (
239 start_rules => \@start_rules,
240 rules => \@rules,
241 style => 'filter',
242 output_encoding => 'utf8',
243## normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm
244# stripspaces => 2|0|0 # ineffectual
245 );
246
247 my $status = 0;
248 my $xml_in = "";
249 if (!open(MIN,"<$doc_xml_filename"))
250 {
251 $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
252 $status = 1;
253 }
254 else
255 {
256 # Read them in
257 my $line;
258 while (defined ($line=<MIN>)) {
259 $xml_in .= $line;
260 }
261 close(MIN);
262
263 # Filter with the call-back functions
264 my $xml_out = "";
265
266 my $MOUT;
267 if (!open($MOUT,">$newdoc_xml_filename")) {
268 $gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
269 $status = 1;
270 }
271 else {
272 binmode($MOUT,":utf8");
273
274 my $options = { sec_num_hash => $sec_num_hash,
275 parent_sec_num_hash => $parent_sec_num_hash,
276 keep_parent_metadata => $self->{'keep-parent-metadata'},
277 keep_parent_content => $self->{'keep-parent-content'},
278 mode => $mode };
279
280 $parser->filter($xml_in, $MOUT, $options);
281 close($MOUT);
282 }
283 }
284 return $status;
285}
286
287sub sections_as_hash
288{
289 my $self = shift @_;
290
291 my ($json_sections_array) = @_;
292
293 my $sec_num_hash = {};
294
295 foreach my $sn ( @$json_sections_array ) {
296
297 # our XML parser curr_sec_num puts '.' at the root
298 # Need to do the same here, so things can be matched up
299 $sec_num_hash->{".$sn"} = 1;
300 }
301
302 return $sec_num_hash;
303}
304
305
306sub parent_sections_as_hash
307{
308 my $self = shift @_;
309
310 my ($json_sections_array) = @_;
311
312 my $sec_num_hash = {};
313
314 foreach my $sn ( @$json_sections_array ) {
315
316 # needs to make a copy, otherwise version stored in json_sections gets changed
317 my $sn_copy = $sn;
318 while ($sn_copy =~ s/\.\d+$//) {
319 # our XML parser curr_sec_num puts '.' at the root
320 # Need to do the same here, so things can be matched up
321
322 $sec_num_hash->{".$sn_copy"} = 1;
323 }
324 }
325
326 return $sec_num_hash;
327}
328
329sub parse_flag
330{
331 my $self = shift @_;
332
333 my ($arg_name) = @_;
334
335 my $flag = $self->{$arg_name} || 0;
336
337 $flag =~ s/^true/1/i;
338 $flag =~ s/^false/0/i;
339
340 return $flag;
341}
342
343sub _extract_archives_doc
344{
345 my $self = shift @_;
346
347 my $collect = $self->{'collect'};
348 my $gsdl_cgi = $self->{'gsdl_cgi'};
349 my $infodbtype = $self->{'infodbtype'};
350
351 my $site = $self->{'site'};
352
353 # Obtain the collect and archive dir
354 my $collect_dir = $gsdl_cgi->get_collection_dir($site);
355
356 my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");
357
358 # look up additional args
359 my $docid = $self->{'d'};
360 my $new_docid = $self->{'newd'} || "HASH" . localtime(time);
361
362 $self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata");
363 $self->{'keep-parent-content'} = $self->parse_flag("keep-parent-content");
364
365 my $json_sections_str = $self->{'json-sections'};
366 my $json_sections_array = decode_json($json_sections_str);
367
368
369 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
370 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
371
372 my $doc_file = $doc_rec->{'doc-file'}->[0];
373 my $doc_filename = &util::filename_cat($archive_dir, $doc_file);
374
375 my $newdoc_filename = &util::filename_cat($archive_dir, "test.xml");
376
377# # This now stores the full pathname
378# my $doc_filename = $doc_rec->{'doc-file'}->[0];
379
380 my $sec_num_hash = $self->sections_as_hash($json_sections_array);
381 my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array);
382
383 my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract");
384
385 if ($extract_status == 0)
386 {
387 my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");
388
389 my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $sec_num_hash, undef, "delete");
390
391 if ($delete_status == 0) {
392
393 my $mess = "document-extract successful: Key[$docid]\n";
394
395 $gsdl_cgi->generate_ok_message($mess);
396 }
397 else {
398 my $mess .= "Failed to extract identified section numbers for key: $docid\n";
399 $mess .= "Exit status: $delete_status\n";
400 $mess .= "System Error Message: $!\n";
401 $mess .= "-" x 20 . "\n";
402
403 $gsdl_cgi->generate_error($mess);
404 }
405 }
406 else
407 {
408 my $mess .= "Failed to remove identified section numbers for key: $docid\n";
409 $mess .= "Exit status: $extract_status\n";
410 $mess .= "System Error Message: $!\n";
411 $mess .= "-" x 20 . "\n";
412
413 $gsdl_cgi->generate_error($mess);
414 }
415
416 #return $status; # in case calling functions have a use for this
417}
418
419
420
421# JSON version that will get the requested metadata values
422# from the requested source (index, import, archives or live)
423# One of the params is a JSON string and the return value is JSON too
424# http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
425sub extract_archives_doc
426{
427 my $self = shift @_;
428
429 my $username = $self->{'username'};
430 my $collect = $self->{'collect'};
431 my $gsdl_cgi = $self->{'gsdl_cgi'};
432
433 if ($baseaction::authentication_enabled)
434 {
435 # Ensure the user is allowed to edit this collection
436 $self->authenticate_user($username, $collect);
437 }
438
439 # Make sure the collection isn't locked by someone else
440 $self->lock_collection($username, $collect);
441
442 $self->_extract_archives_doc(@_);
443
444 # Release the lock once it is done
445 $self->unlock_collection($username, $collect);
446
447
448}
449
450
4511;
Note: See TracBrowser for help on using the repository browser.