Context Navigation

docextractaction.pm@ 29013

Last change on this file since 29013 was 28268, checked in by davidb, 11 years ago
Existing doc.xml maked with R for reindex, new file I to index
File size: 13.0 KB

Line
1	##########################################################################
2	#
3	# docextractaction.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 2009 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistr te it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package docextractaction;
27
28	use strict;
29
30	use cgiactions::baseaction;
31
32	use dbutil;
33	use ghtml;
34
35	use JSON;
36
37
38	BEGIN {
39	require XML::Rules;
40	}
41
42	@docextractaction::ISA = ('baseaction');
43
44	my $action_table =
45	{
46	"extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
47	'compulsory-args' => [ "d", "json-sections" ],
48	'optional-args' => [ "json-metadata", "newd",
49	"keep-parent-metadata", "keep-parent-content" ],
50	# 'optional-args' => [ "where" ],
51	'help-string' => [
52	'document-extract.pl?a=extract-archives-doc&c=demo&d=HASH0123456789ABC&json-sections=["1.2","1.3.2","2.1","2.2"]&json-metadata=[{"metaname":"dc.Title","metavalue":"All Black Rugy Success","metamode":"accumulate"]'
53	."\n\n Add site=xxxx if a Greenstone3 installation"
54	]}
55
56	};
57
58
59	sub new
60	{
61	my $class = shift (@_);
62	my ($gsdl_cgi,$iis6_mode) = @_;
63
64	# Treat metavalue specially. To transmit this through a GET request
65	# the Javascript side has url-encoded it, so here we need to decode
66	# it before proceeding
67
68	# my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
69	# my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
70	# my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);
71
72	# $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
73	# $gsdl_cgi->param("metavalue",$url_decoded_metavalue);
74
75	my $self = new baseaction($action_table,$gsdl_cgi,$iis6_mode);
76
77	return bless $self, $class;
78	}
79
80
81
82	sub dxml_start_section
83	{
84	my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
85
86	my $new_depth = scalar(@$contextArray);
87
88	if ($new_depth == 1) {
89	$parser->{'parameters'}->{'curr_section_depth'} = 1;
90	$parser->{'parameters'}->{'curr_section_num'} = "";
91	}
92
93	my $old_depth = $parser->{'parameters'}->{'curr_section_depth'};
94	my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};
95
96	my $new_secnum;
97
98	if ($new_depth > $old_depth) {
99	# first child subsection
100	$new_secnum = "$old_secnum.1";
101	}
102	elsif ($new_depth == $old_depth) {
103	# sibling section => increase it's value by 1
104	my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
105	$tail_num++;
106	$new_secnum = $old_secnum;
107	$new_secnum =~ s/\.(\d+)$/\.$tail_num/;
108	}
109	else {
110	# back up to parent section => lopp off tail
111	# $new_secnum = $old_secnum;
112	# $new_secnum =~ s/\.\d+$//;
113	}
114
115	$parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
116	$parser->{'parameters'}->{'curr_section_num'} = $new_secnum;
117
118	1;
119	}
120
121
122
123	sub dxml_metadata
124	{
125	my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
126
127	my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};
128
129	my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
130	my $keep_parent_content = $parser->{'parameters'}->{'keep_parent_content'};
131
132	my $mode = $parser->{'parameters'}->{'mode'};
133
134	if ($mode eq "extract") {
135
136	my $new_docid = $parser->{'parameters'}->{'new_docid'};
137	if ($attrHash->{'name'} eq "Identifier") {
138	$attrHash->{'_content'} = $new_docid;
139	}
140	}
141
142	return [ $tagname => $attrHash ];
143	}
144
145
146	sub dxml_section
147	{
148	my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
149
150	my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} \|\| undef;
151
152	my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'};
153	my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};
154
155	my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
156	my $keep_parent_content = $parser->{'parameters'}->{'keep_parent_content'};
157
158	my $mode = $parser->{'parameters'}->{'mode'};
159
160	my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'};
161	my $live_depth = scalar(@$contextArray);
162
163	if ($live_depth < $prev_depth) {
164	# In a closing-sections poping off situation:
165	# </Section>
166	# </Section>
167
168	# => Back up to parent section => lopp off tail
169
170	$curr_sec_num =~ s/\.\d+$//;
171	$parser->{'parameters'}->{'curr_section_depth'} = $live_depth;
172	$parser->{'parameters'}->{'curr_section_num'} = $curr_sec_num;
173	}
174
175
176	if ($live_depth == 1) {
177	# root sectin tag, which must always exist
178	return [$tagname => $attrHash];
179	}
180	elsif ($mode eq "delete") {
181	if (defined $sec_num_hash->{$curr_sec_num}) {
182	# remove it
183	return undef
184	}
185	else {
186	# keep it
187	return [$tagname => $attrHash];
188	}
189	}
190	else {
191	# mode is extract
192
193	if (defined $sec_num_hash->{$curr_sec_num}) {
194	# keep it
195	## print STDERR "**** Asked to keep: sec num = $curr_sec_num\n";
196
197	return [$tagname => $attrHash];
198	}
199	elsif (defined $parent_sec_num_hash->{$curr_sec_num}) {
200	# want this element, but cut down to just the child <Section>
201
202	my $section_child = undef;
203
204	## print STDERR "**** Parent match: sec num = $curr_sec_num\n";
205
206	my $filtered_elems = [];
207
208	foreach my $elem ( @{$attrHash->{'_content'}}) {
209	if (ref $elem eq "ARRAY") {
210	my $child_tagname = $elem->[0];
211	## print STDERR "***## elem name $child_tagname\n";
212
213
214	if ($child_tagname eq "Description") {
215	if ($keep_parent_metadata) {
216	push(@$filtered_elems,$elem);
217	}
218	}
219	elsif ($child_tagname eq "Content") {
220	if ($keep_parent_content) {
221	push(@$filtered_elems,$elem);
222	}
223	}
224	else {
225	push(@$filtered_elems,$elem);
226	}
227	}
228	else {
229	push(@$filtered_elems,$elem);
230	}
231	}
232
233	$attrHash->{'_content'} = $filtered_elems;
234
235	return [$tagname => $attrHash];
236
237	}
238	else {
239	# not in our list => remove it
240	return undef;
241	}
242	}
243	}
244
245
246	sub remove_from_doc_xml
247	{
248	my $self = shift @_;
249	my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename,
250	$sec_num_hash, $parent_sec_num_hash, $mode) = @_;
251
252	my @start_rules = ('Section' => \&dxml_start_section);
253
254	# Set the call-back functions for the metadata tags
255	my @rules =
256	(
257	_default => 'raw',
258	'Section' => \&dxml_section,
259	'Metadata' => \&dxml_metadata
260	);
261
262	my $parser = XML::Rules->new
263	(
264	start_rules => \@start_rules,
265	rules => \@rules,
266	style => 'filter',
267	output_encoding => 'utf8',
268	## normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm
269	# stripspaces => 2\|0\|0 # ineffectual
270	);
271
272	my $status = 0;
273	my $xml_in = "";
274	if (!open(MIN,"<$doc_xml_filename"))
275	{
276	$gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
277	$status = 1;
278	}
279	else
280	{
281	# Read them in
282	my $line;
283	while (defined ($line=<MIN>)) {
284	$xml_in .= $line;
285	}
286	close(MIN);
287
288	# Filter with the call-back functions
289	my $xml_out = "";
290
291	my $MOUT;
292	if (!open($MOUT,">$newdoc_xml_filename")) {
293	$gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
294	$status = 1;
295	}
296	else {
297	binmode($MOUT,":utf8");
298
299	my $options = { sec_num_hash => $sec_num_hash,
300	parent_sec_num_hash => $parent_sec_num_hash,
301	keep_parent_metadata => $self->{'keep-parent-metadata'},
302	keep_parent_content => $self->{'keep-parent-content'},
303	new_docid => $self->{'new_docid'},
304	mode => $mode };
305
306	$parser->filter($xml_in, $MOUT, $options);
307	close($MOUT);
308	}
309	}
310	return $status;
311	}
312
313	sub sections_as_hash
314	{
315	my $self = shift @_;
316
317	my ($json_sections_array) = @_;
318
319	my $sec_num_hash = {};
320
321	foreach my $sn ( @$json_sections_array ) {
322
323	# our XML parser curr_sec_num puts '.' at the root
324	# Need to do the same here, so things can be matched up
325	$sec_num_hash->{".$sn"} = 1;
326	}
327
328	return $sec_num_hash;
329	}
330
331
332	sub parent_sections_as_hash
333	{
334	my $self = shift @_;
335
336	my ($json_sections_array) = @_;
337
338	my $sec_num_hash = {};
339
340	foreach my $sn ( @$json_sections_array ) {
341
342	# needs to make a copy, otherwise version stored in json_sections gets changed
343	my $sn_copy = $sn;
344	while ($sn_copy =~ s/\.\d+$//) {
345	# our XML parser curr_sec_num puts '.' at the root
346	# Need to do the same here, so things can be matched up
347
348	$sec_num_hash->{".$sn_copy"} = 1;
349	}
350	}
351
352	return $sec_num_hash;
353	}
354
355	sub parse_flag
356	{
357	my $self = shift @_;
358
359	my ($arg_name) = @_;
360
361	my $flag = $self->{$arg_name} \|\| 0;
362
363	$flag =~ s/^true/1/i;
364	$flag =~ s/^false/0/i;
365
366	return $flag;
367	}
368
369	sub _extract_archives_doc
370	{
371	my $self = shift @_;
372
373	my $collect = $self->{'collect'};
374	my $gsdl_cgi = $self->{'gsdl_cgi'};
375	my $infodbtype = $self->{'infodbtype'};
376
377	my $site = $self->{'site'};
378
379	# Obtain the collect and archive dir
380	my $collect_dir = $gsdl_cgi->get_collection_dir($site);
381
382	my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");
383
384	# look up additional args
385	my $docid = $self->{'d'};
386
387	my $timestamp = time();
388	my $new_docid = $self->{'newd'} \|\| "HASH$timestamp";
389	$self->{'new_docid'} = $new_docid;
390
391	$self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata");
392	$self->{'keep-parent-content'} = $self->parse_flag("keep-parent-content");
393
394	my $json_sections_str = $self->{'json-sections'};
395	my $json_sections_array = decode_json($json_sections_str);
396
397
398	my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
399	my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
400
401	my $doc_file = $doc_rec->{'doc-file'}->[0];
402	my $doc_filename = &util::filename_cat($archive_dir, $doc_file);
403
404	my $new_doc_file = $doc_file;
405	$new_doc_file =~ s/doc(-\d+)?.xml$/doc-$timestamp.xml/;
406
407	my $newdoc_filename = &util::filename_cat($archive_dir, $new_doc_file);
408
409	my $sec_num_hash = $self->sections_as_hash($json_sections_array);
410	my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array);
411
412	my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract");
413
414	if ($extract_status == 0)
415	{
416	my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");
417
418	my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $sec_num_hash, undef, "delete");
419
420	if ($delete_status == 0) {
421
422	# Existing doc record needs to be reindexed
423	$doc_rec->{'index-status'} = ["R"];
424	&dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid, $doc_rec);
425
426	# Create doc-record entry for the newly extracted document
427
428	my $new_doc_rec = $doc_rec;
429	$new_doc_rec->{'index-status'} = ["I"];
430	#### Need to cut this down to just the assoc files the new document references
431
432	$new_doc_rec->{'doc-file'} = [$new_doc_file];
433
434	&dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $new_docid, $new_doc_rec);
435
436	#### Also need to update the archivesinf-src database!!!!
437	# For all the assoc and src files, retrieve record, and add in new_docid
438
439	my $mess = "document-extract successful: Key[$docid]\n";
440
441	$gsdl_cgi->generate_ok_message($mess);
442	}
443	else {
444	my $mess .= "Failed to extract identified section numbers for key: $docid\n";
445	$mess .= "Exit status: $delete_status\n";
446	$mess .= "System Error Message: $!\n";
447	$mess .= "-" x 20 . "\n";
448
449	$gsdl_cgi->generate_error($mess);
450	}
451	}
452	else
453	{
454	my $mess .= "Failed to remove identified section numbers for key: $docid\n";
455	$mess .= "Exit status: $extract_status\n";
456	$mess .= "System Error Message: $!\n";
457	$mess .= "-" x 20 . "\n";
458
459	$gsdl_cgi->generate_error($mess);
460	}
461
462	#return $status; # in case calling functions have a use for this
463	}
464
465
466
467	# JSON version that will get the requested metadata values
468	# from the requested source (index, import, archives or live)
469	# One of the params is a JSON string and the return value is JSON too
470	# http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
471	sub extract_archives_doc
472	{
473	my $self = shift @_;
474
475	my $username = $self->{'username'};
476	my $collect = $self->{'collect'};
477	my $gsdl_cgi = $self->{'gsdl_cgi'};
478
479	if ($baseaction::authentication_enabled)
480	{
481	# Ensure the user is allowed to edit this collection
482	$self->authenticate_user($username, $collect);
483	}
484
485	# Make sure the collection isn't locked by someone else
486	$self->lock_collection($username, $collect);
487
488	$self->_extract_archives_doc(@_);
489
490	# Release the lock once it is done
491	$self->unlock_collection($username, $collect);
492
493
494	}
495
496
497	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm@ 29013

Download in other formats: