source: main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm@ 28261

Last change on this file since 28261 was 28261, checked in by davidb, 11 years ago

This action now saves the newly created document in the same archives folder as the original, and updates the archiveinf-doc database appropriately

File size: 12.2 KB
##########################################################################
#
# docextractaction.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2009 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package docextractaction;

use strict;

use cgiactions::baseaction;

use dbutil;
use ghtml;

use JSON;


BEGIN {
    require XML::Rules;
}

@docextractaction::ISA = ('baseaction');

my $action_table =
{
    "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
        'compulsory-args' => [ "d", "json-sections" ],
        'optional-args'   => [ "json-metadata", "newd",
                               "keep-parent-metadata", "keep-parent-content" ],
#       'optional-args'   => [ "where" ],
        'help-string' => [
            'document-extract.pl?a=extract-archives-doc&c=demo&d=HASH0123456789ABC&json-sections=["1.2","1.3.2","2.1","2.2"]&json-metadata=[{"metaname":"dc.Title","metavalue":"All Black Rugby Success","metamode":"accumulate"}]'
            ."\n\n Add site=xxxx if a Greenstone3 installation"
        ]}

};
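
# Illustrative sketch only (added comment, values are made up): once the CGI
# arguments above have been parsed, a typical extract-archives-doc request
# boils down to values along these lines:
#
#   d                      "HASH0123456789ABC"   # docid of the source document
#   json-sections          '["1.2","1.3.2"]'     # decoded below with decode_json()
#   newd                   "HASH1385683200"      # optional; defaults to "HASH" . time()
#   keep-parent-metadata   "true"                # normalised to 1/0 by parse_flag()
#   keep-parent-content    "false"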


sub new
{
    my $class = shift (@_);
    my ($gsdl_cgi,$iis6_mode) = @_;

    # Treat metavalue specially. To transmit this through a GET request
    # the Javascript side has url-encoded it, so here we need to decode
    # it before proceeding

#   my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
#   my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
#   my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);

#   $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
#   $gsdl_cgi->param("metavalue",$url_decoded_metavalue);

    my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);

    return bless $self, $class;
}


sub dxml_start_section
{
    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;

    my $new_depth = scalar(@$contextArray);

    if ($new_depth == 1) {
        $parser->{'parameters'}->{'curr_section_depth'} = 1;
        $parser->{'parameters'}->{'curr_section_num'}   = "";
    }

    my $old_depth  = $parser->{'parameters'}->{'curr_section_depth'};
    my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};

    my $new_secnum;

    if ($new_depth > $old_depth) {
        # first child subsection
        $new_secnum = "$old_secnum.1";
    }
    elsif ($new_depth == $old_depth) {
        # sibling section => increase its value by 1
        my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
        $tail_num++;
        $new_secnum = $old_secnum;
        $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
    }
    else {
        # back up to parent section => lop off the tail
#       $new_secnum = $old_secnum;
#       $new_secnum =~ s/\.\d+$//;
    }

    $parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
    $parser->{'parameters'}->{'curr_section_num'}   = $new_secnum;

    1;
}
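
# Illustrative numbering sketch (added comment): starting from the root
# <Section> (depth 1, curr_section_num ""), the first child Section becomes
# ".1", its first child ".1.1", the next sibling at that depth ".1.2", and
# so on.  Popping back up to a shallower depth is finished off by the
# end-tag handler dxml_section below.  So a section listed as "1.3.2" in
# the json-sections argument corresponds to curr_section_num ".1.3.2" here.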


sub dxml_section
{
    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;

    my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef;

    my $sec_num_hash        = $parser->{'parameters'}->{'sec_num_hash'};
    my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};

    my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
    my $keep_parent_content  = $parser->{'parameters'}->{'keep_parent_content'};

    my $mode = $parser->{'parameters'}->{'mode'};

    my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'};
    my $live_depth = scalar(@$contextArray);

    if ($live_depth < $prev_depth) {
        # In a closing-sections popping-off situation:
        # </Section>
        # </Section>

        # => Back up to parent section => lop off the tail

        $curr_sec_num =~ s/\.\d+$//;
        $parser->{'parameters'}->{'curr_section_depth'} = $live_depth;
        $parser->{'parameters'}->{'curr_section_num'}   = $curr_sec_num;
    }


    if ($live_depth == 1) {
        # root Section tag, which must always exist
        return [$tagname => $attrHash];
    }
    elsif ($mode eq "delete") {
        if (defined $sec_num_hash->{$curr_sec_num}) {
            # remove it
            return undef;
        }
        else {
            # keep it
            return [$tagname => $attrHash];
        }
    }
    else {
        # mode is extract

        if (defined $sec_num_hash->{$curr_sec_num}) {
            # keep it
##          print STDERR "**** Asked to keep: sec num = $curr_sec_num\n";

            return [$tagname => $attrHash];
        }
        elsif (defined $parent_sec_num_hash->{$curr_sec_num}) {
            # want this element, but cut it down to just the child <Section>

            my $section_child = undef;

##          print STDERR "**** Parent match: sec num = $curr_sec_num\n";

            my $filtered_elems = [];

            foreach my $elem ( @{$attrHash->{'_content'}} ) {
                if (ref $elem eq "ARRAY") {
                    my $child_tagname = $elem->[0];
##                  print STDERR "***## elem name $child_tagname\n";


                    if ($child_tagname eq "Description") {
                        if ($keep_parent_metadata) {
                            push(@$filtered_elems,$elem);
                        }
                    }
                    elsif ($child_tagname eq "Content") {
                        if ($keep_parent_content) {
                            push(@$filtered_elems,$elem);
                        }
                    }
                    else {
                        push(@$filtered_elems,$elem);
                    }
                }
                else {
                    push(@$filtered_elems,$elem);
                }
            }

            $attrHash->{'_content'} = $filtered_elems;

            return [$tagname => $attrHash];

        }
        else {
            # not in our list => remove it
            return undef;
        }
    }
}
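
# Added note (illustrative): in "extract" mode a parent Section that is not
# itself listed in json-sections is kept only as a frame for its extracted
# children -- its <Description> (metadata) and <Content> children are dropped
# unless keep-parent-metadata / keep-parent-content were passed as true.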


sub remove_from_doc_xml
{
    my $self = shift @_;
    my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename,
        $sec_num_hash, $parent_sec_num_hash, $mode) = @_;

    my @start_rules = ('Section' => \&dxml_start_section);

    # Set the call-back functions for the Section tags
    my @rules =
        (
          _default  => 'raw',
          'Section' => \&dxml_section
        );

    my $parser = XML::Rules->new
        (
          start_rules     => \@start_rules,
          rules           => \@rules,
          style           => 'filter',
          output_encoding => 'utf8',
##        normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm
#         stripspaces => 2|0|0 # ineffectual
        );

    my $status = 0;
    my $xml_in = "";
    if (!open(MIN,"<$doc_xml_filename"))
    {
        $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
        $status = 1;
    }
    else
    {
        # Read the file in
        my $line;
        while (defined ($line=<MIN>)) {
            $xml_in .= $line;
        }
        close(MIN);

        # Filter with the call-back functions
        my $xml_out = "";

        my $MOUT;
        if (!open($MOUT,">$newdoc_xml_filename")) {
            $gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
            $status = 1;
        }
        else {
            binmode($MOUT,":utf8");

            my $options = { sec_num_hash         => $sec_num_hash,
                            parent_sec_num_hash  => $parent_sec_num_hash,
                            keep_parent_metadata => $self->{'keep-parent-metadata'},
                            keep_parent_content  => $self->{'keep-parent-content'},
                            mode                 => $mode };

            $parser->filter($xml_in, $MOUT, $options);
            close($MOUT);
        }
    }
    return $status;
}
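
# Usage sketch (added comment): the same filter is run twice by
# _extract_archives_doc() below -- first in "extract" mode, writing the
# chosen sections (plus their parent framing) out to a new doc XML file,
# then in "delete" mode, rewriting the original doc XML file in place with
# those sections removed.  A non-zero return value signals an I/O failure.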

sub sections_as_hash
{
    my $self = shift @_;

    my ($json_sections_array) = @_;

    my $sec_num_hash = {};

    foreach my $sn ( @$json_sections_array ) {

        # our XML parser's curr_sec_num puts '.' at the root
        # Need to do the same here, so things can be matched up
        $sec_num_hash->{".$sn"} = 1;
    }

    return $sec_num_hash;
}
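
# Illustrative example (added comment): sections_as_hash(["1.2","1.3.2"])
# returns { ".1.2" => 1, ".1.3.2" => 1 }, matching the '.'-rooted numbering
# produced by dxml_start_section above.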


sub parent_sections_as_hash
{
    my $self = shift @_;

    my ($json_sections_array) = @_;

    my $sec_num_hash = {};

    foreach my $sn ( @$json_sections_array ) {

        # need to make a copy, otherwise the version stored in json_sections gets changed
        my $sn_copy = $sn;
        while ($sn_copy =~ s/\.\d+$//) {
            # our XML parser's curr_sec_num puts '.' at the root
            # Need to do the same here, so things can be matched up

            $sec_num_hash->{".$sn_copy"} = 1;
        }
    }

    return $sec_num_hash;
}
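
# Illustrative example (added comment): parent_sections_as_hash(["1.3.2"])
# returns { ".1.3" => 1, ".1" => 1 } -- every ancestor of the requested
# section, again with the leading '.' added so it matches curr_section_num.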

sub parse_flag
{
    my $self = shift @_;

    my ($arg_name) = @_;

    my $flag = $self->{$arg_name} || 0;

    $flag =~ s/^true/1/i;
    $flag =~ s/^false/0/i;

    return $flag;
}
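
# Illustrative behaviour (added comment): parse_flag() maps a CGI flag such
# as "true"/"True" to 1 and "false"/"False" to 0; an absent argument also
# comes back as 0.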

sub _extract_archives_doc
{
    my $self = shift @_;

    my $collect    = $self->{'collect'};
    my $gsdl_cgi   = $self->{'gsdl_cgi'};
    my $infodbtype = $self->{'infodbtype'};

    my $site = $self->{'site'};

    # Obtain the collect and archive dir
    my $collect_dir = $gsdl_cgi->get_collection_dir($site);

    my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");

    # look up additional args
    my $docid = $self->{'d'};

    my $timestamp = time();
    my $new_docid = $self->{'newd'} || "HASH$timestamp";

    $self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata");
    $self->{'keep-parent-content'}  = $self->parse_flag("keep-parent-content");

    my $json_sections_str   = $self->{'json-sections'};
    my $json_sections_array = decode_json($json_sections_str);


    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);

    my $doc_file     = $doc_rec->{'doc-file'}->[0];
    my $doc_filename = &util::filename_cat($archive_dir, $doc_file);

    my $new_doc_file = $doc_file;
    $new_doc_file =~ s/doc(-\d+)?\.xml$/doc-$timestamp.xml/;

    my $newdoc_filename = &util::filename_cat($archive_dir, $new_doc_file);

    my $sec_num_hash        = $self->sections_as_hash($json_sections_array);
    my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array);

    my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename, $newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract");

    if ($extract_status == 0)
    {
        my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");

        my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename, $doc_filename, $sec_num_hash, undef, "delete");

        if ($delete_status == 0) {

            # Existing doc record needs to be reindexed
            $doc_rec->{'index-status'} = ["I"];
            &dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid, $doc_rec);

            # Create doc-record entry for the newly extracted document

            my $new_doc_rec = $doc_rec;
            #### Need to cut this down to just the assoc files the new document references

            &dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $new_docid, $new_doc_rec);

            #### Also need to update the archiveinf-src database!!!!
            # For all the assoc and src files, retrieve record, and add in new_docid

            my $mess = "document-extract successful: Key[$docid]\n";

            $gsdl_cgi->generate_ok_message($mess);
        }
        else {
            my $mess = "Failed to remove identified section numbers for key: $docid\n";
            $mess .= "Exit status: $delete_status\n";
            $mess .= "System Error Message: $!\n";
            $mess .= "-" x 20 . "\n";

            $gsdl_cgi->generate_error($mess);
        }
    }
    else
    {
        my $mess = "Failed to extract identified section numbers for key: $docid\n";
        $mess .= "Exit status: $extract_status\n";
        $mess .= "System Error Message: $!\n";
        $mess .= "-" x 20 . "\n";

        $gsdl_cgi->generate_error($mess);
    }

    #return $status; # in case calling functions have a use for this
}
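
# Added note (illustrative filenames, assuming an archiveinf-doc entry whose
# doc-file is "HASH0123/doc.xml"): a successful run leaves the original
# archives/HASH0123/doc.xml rewritten in place without the requested
# sections, a new archives/HASH0123/doc-<timestamp>.xml holding just those
# sections, and two archiveinf-doc records -- the original docid flagged for
# reindexing ("I") and a new record keyed by newd (or HASH<timestamp>).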



# Public entry point for the extract-archives-doc action.  One of the
# params (json-sections) is a JSON string.  This wrapper handles
# authentication and collection locking, then delegates the real work to
# _extract_archives_doc() above.
sub extract_archives_doc
{
    my $self = shift @_;

    my $username = $self->{'username'};
    my $collect  = $self->{'collect'};
    my $gsdl_cgi = $self->{'gsdl_cgi'};

    if ($baseaction::authentication_enabled)
    {
        # Ensure the user is allowed to edit this collection
        $self->authenticate_user($username, $collect);
    }

    # Make sure the collection isn't locked by someone else
    $self->lock_collection($username, $collect);

    $self->_extract_archives_doc(@_);

    # Release the lock once it is done
    $self->unlock_collection($username, $collect);


}


1;