Context Navigation

source: main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm@ 28251

Last change on this file since 28251 was 28251, checked in by davidb, 11 years ago
Change to calculation of curr_sec_num so it correctly pops off values as a series of </Section> tags are encountered
File size: 11.7 KB

Line
1	##########################################################################
2	#
3	# docextractaction.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 2009 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistr te it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package docextractaction;
27
28	use strict;
29
30	use cgiactions::baseaction;
31
32	use dbutil;
33	use ghtml;
34
35	use JSON;
36
37
38	BEGIN {
39	require XML::Rules;
40	}
41
42	@docextractaction::ISA = ('baseaction');
43
44	my $action_table =
45	{
46	"extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
47	'compulsory-args' => [ "d", "json-sections" ],
48	'optional-args' => [ "json-metadata", "newd",
49	"keep-parent-metadata", "keep-parent-content" ],
50	# 'optional-args' => [ "where" ],
51	'help-string' => [
52	'document-extract.pl?a=extract-archives-doc&c=demo&d=HASH0123456789ABC&json-sections=["1.2","1.3.2","2.1","2.2"]&json-metadata=[{"metaname":"dc.Title","metavalue":"All Black Rugy Success","metamode":"accumulate"]'
53	."\n\n Add site=xxxx if a Greenstone3 installation"
54	]}
55
56	};
57
58
59	sub new
60	{
61	my $class = shift (@_);
62	my ($gsdl_cgi,$iis6_mode) = @_;
63
64	# Treat metavalue specially. To transmit this through a GET request
65	# the Javascript side has url-encoded it, so here we need to decode
66	# it before proceeding
67
68	# my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
69	# my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
70	# my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);
71
72	# $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
73	# $gsdl_cgi->param("metavalue",$url_decoded_metavalue);
74
75	my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);
76
77	return bless $self, $class;
78	}
79
80
81
82	sub dxml_start_section
83	{
84	my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
85
86	my $new_depth = scalar(@$contextArray);
87
88	if ($new_depth == 1) {
89	$parser->{'parameters'}->{'curr_section_depth'} = 1;
90	$parser->{'parameters'}->{'curr_section_num'} = "";
91	}
92
93	my $old_depth = $parser->{'parameters'}->{'curr_section_depth'};
94	my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};
95
96	my $new_secnum;
97
98	if ($new_depth > $old_depth) {
99	# first child subsection
100	$new_secnum = "$old_secnum.1";
101	}
102	elsif ($new_depth == $old_depth) {
103	# sibling section => increase it's value by 1
104	my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
105	$tail_num++;
106	$new_secnum = $old_secnum;
107	$new_secnum =~ s/\.(\d+)$/\.$tail_num/;
108	}
109	else {
110	# back up to parent section => lopp off tail
111	# $new_secnum = $old_secnum;
112	# $new_secnum =~ s/\.\d+$//;
113	}
114
115	$parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
116	$parser->{'parameters'}->{'curr_section_num'} = $new_secnum;
117
118	1;
119	}
120
121
122	sub dxml_section
123	{
124	my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
125
126	my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} \|\| undef;
127
128	my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'};
129	my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};
130
131	my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
132	my $keep_parent_content = $parser->{'parameters'}->{'keep_parent_content'};
133
134	my $mode = $parser->{'parameters'}->{'mode'};
135
136	my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'};
137	my $live_depth = scalar(@$contextArray);
138
139	if ($live_depth < $prev_depth) {
140	# In a closing-sections poping off situation:
141	# </Section>
142	# </Section>
143
144	# => Back up to parent section => lopp off tail
145
146	$curr_sec_num =~ s/\.\d+$//;
147	$parser->{'parameters'}->{'curr_section_depth'} = $live_depth;
148	$parser->{'parameters'}->{'curr_section_num'} = $curr_sec_num;
149	}
150
151
152	if ($live_depth == 1) {
153	# root sectin tag, which must always exist
154	return [$tagname => $attrHash];
155	}
156	elsif ($mode eq "delete") {
157	if (defined $sec_num_hash->{$curr_sec_num}) {
158	# remove it
159	return undef
160	}
161	else {
162	# keep it
163	return [$tagname => $attrHash];
164	}
165	}
166	else {
167	# mode is extract
168
169	if (defined $sec_num_hash->{$curr_sec_num}) {
170	# keep it
171	## print STDERR "**** Asked to keep: sec num = $curr_sec_num\n";
172
173	return [$tagname => $attrHash];
174	}
175	elsif (defined $parent_sec_num_hash->{$curr_sec_num}) {
176	# want this element, but cut down to just the child <Section>
177
178	my $section_child = undef;
179
180	## print STDERR "**** Parent match: sec num = $curr_sec_num\n";
181
182	my $filtered_elems = [];
183
184	foreach my $elem ( @{$attrHash->{'_content'}}) {
185	if (ref $elem eq "ARRAY") {
186	my $child_tagname = $elem->[0];
187	## print STDERR "***## elem name $child_tagname\n";
188
189
190	if ($child_tagname eq "Description") {
191	if ($keep_parent_metadata) {
192	push(@$filtered_elems,$elem);
193	}
194	}
195	elsif ($child_tagname eq "Content") {
196	if ($keep_parent_content) {
197	push(@$filtered_elems,$elem);
198	}
199	}
200	else {
201	push(@$filtered_elems,$elem);
202	}
203	}
204	else {
205	push(@$filtered_elems,$elem);
206	}
207	}
208
209	$attrHash->{'_content'} = $filtered_elems;
210
211	return [$tagname => $attrHash];
212
213	}
214	else {
215	# not in our list => remove it
216	return undef;
217	}
218	}
219	}
220
221
222	sub remove_from_doc_xml
223	{
224	my $self = shift @_;
225	my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename,
226	$sec_num_hash, $parent_sec_num_hash, $mode) = @_;
227
228	my @start_rules = ('Section' => \&dxml_start_section);
229
230	# Set the call-back functions for the metadata tags
231	my @rules =
232	(
233	_default => 'raw',
234	'Section' => \&dxml_section
235	);
236
237	my $parser = XML::Rules->new
238	(
239	start_rules => \@start_rules,
240	rules => \@rules,
241	style => 'filter',
242	output_encoding => 'utf8',
243	## normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm
244	# stripspaces => 2\|0\|0 # ineffectual
245	);
246
247	my $status = 0;
248	my $xml_in = "";
249	if (!open(MIN,"<$doc_xml_filename"))
250	{
251	$gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
252	$status = 1;
253	}
254	else
255	{
256	# Read them in
257	my $line;
258	while (defined ($line=<MIN>)) {
259	$xml_in .= $line;
260	}
261	close(MIN);
262
263	# Filter with the call-back functions
264	my $xml_out = "";
265
266	my $MOUT;
267	if (!open($MOUT,">$newdoc_xml_filename")) {
268	$gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
269	$status = 1;
270	}
271	else {
272	binmode($MOUT,":utf8");
273
274	my $options = { sec_num_hash => $sec_num_hash,
275	parent_sec_num_hash => $parent_sec_num_hash,
276	keep_parent_metadata => $self->{'keep-parent-metadata'},
277	keep_parent_content => $self->{'keep-parent-content'},
278	mode => $mode };
279
280	$parser->filter($xml_in, $MOUT, $options);
281	close($MOUT);
282	}
283	}
284	return $status;
285	}
286
287	sub sections_as_hash
288	{
289	my $self = shift @_;
290
291	my ($json_sections_array) = @_;
292
293	my $sec_num_hash = {};
294
295	foreach my $sn ( @$json_sections_array ) {
296
297	# our XML parser curr_sec_num puts '.' at the root
298	# Need to do the same here, so things can be matched up
299	$sec_num_hash->{".$sn"} = 1;
300	}
301
302	return $sec_num_hash;
303	}
304
305
306	sub parent_sections_as_hash
307	{
308	my $self = shift @_;
309
310	my ($json_sections_array) = @_;
311
312	my $sec_num_hash = {};
313
314	foreach my $sn ( @$json_sections_array ) {
315
316	# needs to make a copy, otherwise version stored in json_sections gets changed
317	my $sn_copy = $sn;
318	while ($sn_copy =~ s/\.\d+$//) {
319	# our XML parser curr_sec_num puts '.' at the root
320	# Need to do the same here, so things can be matched up
321
322	$sec_num_hash->{".$sn_copy"} = 1;
323	}
324	}
325
326	return $sec_num_hash;
327	}
328
329	sub parse_flag
330	{
331	my $self = shift @_;
332
333	my ($arg_name) = @_;
334
335	my $flag = $self->{$arg_name} \|\| 0;
336
337	$flag =~ s/^true/1/i;
338	$flag =~ s/^false/0/i;
339
340	return $flag;
341	}
342
343	sub _extract_archives_doc
344	{
345	my $self = shift @_;
346
347	my $collect = $self->{'collect'};
348	my $gsdl_cgi = $self->{'gsdl_cgi'};
349	my $infodbtype = $self->{'infodbtype'};
350
351	my $site = $self->{'site'};
352
353	# Obtain the collect and archive dir
354	my $collect_dir = $gsdl_cgi->get_collection_dir($site);
355
356	my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");
357
358	# look up additional args
359	my $docid = $self->{'d'};
360	my $new_docid = $self->{'newd'} \|\| "HASH" . localtime(time);
361
362	$self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata");
363	$self->{'keep-parent-content'} = $self->parse_flag("keep-parent-content");
364
365	my $json_sections_str = $self->{'json-sections'};
366	my $json_sections_array = decode_json($json_sections_str);
367
368
369	my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
370	my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
371
372	my $doc_file = $doc_rec->{'doc-file'}->[0];
373	my $doc_filename = &util::filename_cat($archive_dir, $doc_file);
374
375	my $newdoc_filename = &util::filename_cat($archive_dir, "test.xml");
376
377	# # This now stores the full pathname
378	# my $doc_filename = $doc_rec->{'doc-file'}->[0];
379
380	my $sec_num_hash = $self->sections_as_hash($json_sections_array);
381	my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array);
382
383	my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract");
384
385	if ($extract_status == 0)
386	{
387	my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");
388
389	my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $sec_num_hash, undef, "delete");
390
391	if ($delete_status == 0) {
392
393	my $mess = "document-extract successful: Key[$docid]\n";
394
395	$gsdl_cgi->generate_ok_message($mess);
396	}
397	else {
398	my $mess .= "Failed to extract identified section numbers for key: $docid\n";
399	$mess .= "Exit status: $delete_status\n";
400	$mess .= "System Error Message: $!\n";
401	$mess .= "-" x 20 . "\n";
402
403	$gsdl_cgi->generate_error($mess);
404	}
405	}
406	else
407	{
408	my $mess .= "Failed to remove identified section numbers for key: $docid\n";
409	$mess .= "Exit status: $extract_status\n";
410	$mess .= "System Error Message: $!\n";
411	$mess .= "-" x 20 . "\n";
412
413	$gsdl_cgi->generate_error($mess);
414	}
415
416	#return $status; # in case calling functions have a use for this
417	}
418
419
420
421	# JSON version that will get the requested metadata values
422	# from the requested source (index, import, archives or live)
423	# One of the params is a JSON string and the return value is JSON too
424	# http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
425	sub extract_archives_doc
426	{
427	my $self = shift @_;
428
429	my $username = $self->{'username'};
430	my $collect = $self->{'collect'};
431	my $gsdl_cgi = $self->{'gsdl_cgi'};
432
433	if ($baseaction::authentication_enabled)
434	{
435	# Ensure the user is allowed to edit this collection
436	$self->authenticate_user($username, $collect);
437	}
438
439	# Make sure the collection isn't locked by someone else
440	$self->lock_collection($username, $collect);
441
442	$self->_extract_archives_doc(@_);
443
444	# Release the lock once it is done
445	$self->unlock_collection($username, $collect);
446
447
448	}
449
450
451	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: