[28243] | 1 | ##########################################################################
|
---|
| 2 | #
|
---|
| 3 | # docextractaction.pm --
|
---|
| 4 | # A component of the Greenstone digital library software
|
---|
| 5 | # from the New Zealand Digital Library Project at the
|
---|
| 6 | # University of Waikato, New Zealand.
|
---|
| 7 | #
|
---|
| 8 | # Copyright (C) 2009 New Zealand Digital Library Project
|
---|
| 9 | #
|
---|
| 10 | # This program is free software; you can redistribute it and/or modify
|
---|
| 11 | # it under the terms of the GNU General Public License as published by
|
---|
| 12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | # (at your option) any later version.
|
---|
| 14 | #
|
---|
| 15 | # This program is distributed in the hope that it will be useful,
|
---|
| 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | # GNU General Public License for more details.
|
---|
| 19 | #
|
---|
| 20 | # You should have received a copy of the GNU General Public License
|
---|
| 21 | # along with this program; if not, write to the Free Software
|
---|
| 22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | #
|
---|
| 24 | ###########################################################################
|
---|
| 25 |
|
---|
package docextractaction;

use strict;

use cgiactions::baseaction;

use dbutil;
use ghtml;

use JSON;


# XML::Rules is pulled in at compile time; it supplies the filter-style
# XML parser used by remove_from_doc_xml() below.
BEGIN {
    require XML::Rules;
}

# This action inherits the generic CGI-action machinery from baseaction.
@docextractaction::ISA = ('baseaction');

# Table of CGI sub-actions served by this module, in the format
# baseaction expects: compulsory/optional CGI argument names plus a
# help string shown to the user.
my $action_table =
{
    "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
        'compulsory-args' => [ "d", "json-sections" ],
        'optional-args'   => [ "json-metadata", "newd",
                               "keep-parent-metadata", "keep-parent-content" ],
#       'optional-args' => [ "where" ],
        'help-string' => [
            'document-extract.pl?a=extract-archives-doc&c=demo&d=HASH0123456789ABC&json-sections=["1.2","1.3.2","2.1","2.2"]&json-metadata=[{"metaname":"dc.Title","metavalue":"All Black Rugy Success","metamode":"accumulate"]'
            ."\n\n Add site=xxxx if a Greenstone3 installation"
        ]}

};
|
---|
| 57 |
|
---|
| 58 |
|
---|
# Constructor.
#
# Parameters:
#   $gsdl_cgi  - the CGI helper object carrying the request parameters
#   $iis6_mode - truthy when running under IIS 6 (passed through to baseaction)
#
# Returns a blessed docextractaction instance.
sub new
{
    my $class = shift (@_);
    my ($gsdl_cgi,$iis6_mode) = @_;

    # Treat metavalue specially. To transmit this through a GET request
    # the Javascript side has url-encoded it, so here we need to decode
    # it before proceeding

    # my $url_encoded_metavalue = $gsdl_cgi->param("metavalue");
    # my $url_decoded_metavalue = &unicode::url_decode($url_encoded_metavalue,1);
    # my $unicode_array = &unicode::utf82unicode($url_decoded_metavalue);

    # $url_decoded_metavalue = join("",map(chr($_),@$unicode_array));
    # $gsdl_cgi->param("metavalue",$url_decoded_metavalue);

    # Direct method call rather than the indirect-object form
    # ("new baseaction(...)"), which Perl can mis-parse.
    my $self = baseaction->new($action_table,$gsdl_cgi,$iis6_mode);

    return bless $self, $class;
}
|
---|
| 79 |
|
---|
| 80 |
|
---|
[28249] | 81 |
|
---|
# XML::Rules start-tag callback for <Section> elements.
#
# Maintains two counters in $parser->{'parameters'}:
#   curr_section_depth - nesting depth of the section just opened
#   curr_section_num   - dotted section number with a leading '.'
#                        (e.g. ".1.2"), or "" at the root, or undef
#                        when we have just popped back to a parent
#
# Always returns 1 (true) so XML::Rules keeps processing the element.
sub dxml_start_section
{
    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;

    my $depth_now = scalar(@$contextArray);

    # Entering the top-level element: reset the running counters.
    if ($depth_now == 1) {
        $parser->{'parameters'}->{'curr_section_depth'} = 1;
        $parser->{'parameters'}->{'curr_section_num'}   = "";
    }

    my $depth_before  = $parser->{'parameters'}->{'curr_section_depth'};
    my $secnum_before = $parser->{'parameters'}->{'curr_section_num'};

    my $secnum_now;

    if ($depth_now > $depth_before) {
        # Descended one level: this is the first child subsection.
        $secnum_now = "$secnum_before.1";
    }
    elsif ($depth_now == $depth_before) {
        # Sibling section: bump the trailing numeric component by one.
        my ($tail) = ($secnum_before =~ m/\.(\d+)$/);
        $tail++;
        ($secnum_now = $secnum_before) =~ s/\.(\d+)$/\.$tail/;
    }
    # else: depth decreased => popped back to a parent section.
    # $secnum_now is deliberately left undef here; the closing-tag
    # bookkeeping is performed in dxml_section instead.

    $parser->{'parameters'}->{'curr_section_depth'} = $depth_now;
    $parser->{'parameters'}->{'curr_section_num'}   = $secnum_now;

    1;
}
|
---|
| 120 |
|
---|
| 121 |
|
---|
[28263] | 122 |
|
---|
# XML::Rules callback for <Metadata> elements.
#
# In "extract" mode the Identifier metadata value is rewritten to the
# new document id stashed in the parser parameters, so the extracted
# copy of the document is stored under its own OID.  All other metadata
# (and all metadata in "delete" mode) passes through unchanged.
#
# Returns the [ tagname => attrHash ] pair XML::Rules expects for a
# kept element.
sub dxml_metadata
{
    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;

    # (Unused copies of parent_sec_num_hash / keep-parent-* parameters
    #  were removed; they are only consumed by dxml_section.)
    my $mode = $parser->{'parameters'}->{'mode'};

    if ($mode eq "extract") {

        my $new_docid = $parser->{'parameters'}->{'new_docid'};
        if ($attrHash->{'name'} eq "Identifier") {
            $attrHash->{'_content'} = $new_docid;
        }
    }

    return [ $tagname => $attrHash ];
}
|
---|
| 144 |
|
---|
| 145 |
|
---|
# XML::Rules end-tag callback for <Section> elements.  Decides, per
# section, whether the element is kept, kept-but-filtered, or dropped,
# based on the requested section numbers and the mode:
#
#   "delete"  - drop sections listed in sec_num_hash, keep the rest
#   "extract" - keep only sections listed in sec_num_hash; for their
#               ancestors (parent_sec_num_hash) keep the element but
#               strip Description/Content children unless the
#               keep_parent_metadata / keep_parent_content flags are set
#
# Returning [ $tagname => $attrHash ] keeps the element; returning
# undef removes it from the filtered output.
sub dxml_section
{
    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;

    my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef;

    my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'};
    my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};

    my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
    my $keep_parent_content = $parser->{'parameters'}->{'keep_parent_content'};

    my $mode = $parser->{'parameters'}->{'mode'};

    my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'};
    my $live_depth = scalar(@$contextArray);

    # dxml_start_section only fires on opening tags, so consecutive
    # closing tags (</Section></Section>) are detected here by the
    # depth having shrunk since the last recorded value.
    if ($live_depth < $prev_depth) {
        # In a closing-sections popping-off situation:
        #   </Section>
        #   </Section>

        # => Back up to parent section => lop off the tail component
        # NOTE(review): $curr_sec_num can be undef here (start handler
        # sets it to undef when depth decreases); the s/// then operates
        # on undef - presumably benign in practice, but worth confirming.

        $curr_sec_num =~ s/\.\d+$//;
        $parser->{'parameters'}->{'curr_section_depth'} = $live_depth;
        $parser->{'parameters'}->{'curr_section_num'} = $curr_sec_num;
    }


    if ($live_depth == 1) {
        # root section tag, which must always exist => always keep
        return [$tagname => $attrHash];
    }
    elsif ($mode eq "delete") {
        if (defined $sec_num_hash->{$curr_sec_num}) {
            # listed for deletion => remove it
            return undef
        }
        else {
            # keep it
            return [$tagname => $attrHash];
        }
    }
    else {
        # mode is extract

        if (defined $sec_num_hash->{$curr_sec_num}) {
            # explicitly requested section => keep it whole
            ## print STDERR "**** Asked to keep: sec num = $curr_sec_num\n";

            return [$tagname => $attrHash];
        }
        elsif (defined $parent_sec_num_hash->{$curr_sec_num}) {
            # ancestor of a requested section: want this element, but
            # cut it down to just the child <Section> structure

            my $section_child = undef;

            ## print STDERR "**** Parent match: sec num = $curr_sec_num\n";

            my $filtered_elems = [];

            # Walk the already-parsed children; ARRAY refs are child
            # elements ([tagname => attrHash] pairs), anything else is
            # raw text/whitespace and is kept as-is.
            foreach my $elem ( @{$attrHash->{'_content'}}) {
                if (ref $elem eq "ARRAY") {
                    my $child_tagname = $elem->[0];
                    ## print STDERR "***## elem name $child_tagname\n";


                    if ($child_tagname eq "Description") {
                        # parent metadata only survives when requested
                        if ($keep_parent_metadata) {
                            push(@$filtered_elems,$elem);
                        }
                    }
                    elsif ($child_tagname eq "Content") {
                        # parent text content only survives when requested
                        if ($keep_parent_content) {
                            push(@$filtered_elems,$elem);
                        }
                    }
                    else {
                        # e.g. nested <Section> elements => keep
                        push(@$filtered_elems,$elem);
                    }
                }
                else {
                    push(@$filtered_elems,$elem);
                }
            }

            $attrHash->{'_content'} = $filtered_elems;

            return [$tagname => $attrHash];

        }
        else {
            # not in our list => remove it
            return undef;
        }
    }
}
|
---|
| 244 |
|
---|
| 245 |
|
---|
# Filter a Greenstone doc.xml file through XML::Rules using the
# dxml_* callbacks above, writing the filtered document out.
#
# Parameters:
#   $gsdl_cgi            - CGI helper, used only for error reporting
#   $doc_xml_filename    - source doc.xml to read
#   $newdoc_xml_filename - destination file (may equal the source: the
#                          input is fully slurped and closed before the
#                          output is opened, so in-place filtering works)
#   $sec_num_hash        - hash of requested section numbers (".1.2" form)
#   $parent_sec_num_hash - hash of their ancestor section numbers
#   $mode                - "extract" or "delete" (see dxml_section)
#
# Returns 0 on success, 1 on failure (error already reported via
# $gsdl_cgi->generate_error).
sub remove_from_doc_xml
{
    my $self = shift @_;
    my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename,
        $sec_num_hash, $parent_sec_num_hash, $mode) = @_;

    my @start_rules = ('Section' => \&dxml_start_section);

    # Set the call-back functions for the Section/Metadata tags;
    # everything else is passed through untouched.
    my @rules =
        (
         _default => 'raw',
         'Section' => \&dxml_section,
         'Metadata' => \&dxml_metadata
        );

    my $parser = XML::Rules->new
        (
         start_rules => \@start_rules,
         rules => \@rules,
         style => 'filter',
         output_encoding => 'utf8',
         ## normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm
         # stripspaces => 2|0|0 # ineffectual
        );

    my $status = 0;
    my $xml_in = "";

    # Three-arg open with a lexical handle (the original used a 2-arg
    # open on a bareword handle).
    my $min;
    if (!open($min, '<', $doc_xml_filename))
    {
        $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
        $status = 1;
    }
    else
    {
        # Slurp the whole document in one read, then close it before
        # the output file is opened (in-place filtering relies on this).
        {
            local $/ = undef;
            $xml_in = <$min>;
        }
        close($min);

        my $mout;
        if (!open($mout, '>', $newdoc_xml_filename)) {
            $gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
            $status = 1;
        }
        else {
            binmode($mout,":utf8");

            # Context handed through to the dxml_* callbacks.
            my $options = { sec_num_hash => $sec_num_hash,
                            parent_sec_num_hash => $parent_sec_num_hash,
                            keep_parent_metadata => $self->{'keep-parent-metadata'},
                            keep_parent_content => $self->{'keep-parent-content'},
                            new_docid => $self->{'new_docid'},
                            mode => $mode };

            $parser->filter($xml_in, $mout, $options);
            close($mout);
        }
    }
    return $status;
}
|
---|
| 312 |
|
---|
# Turn an array of section numbers (e.g. ["1.2", "2.1"]) into a lookup
# hash.  Each key gets a leading '.' because the XML parser's
# curr_section_num is rooted with '.', and the two must match up.
#
# Returns a hashref mapping ".<secnum>" => 1.
sub sections_as_hash
{
    my $self = shift @_;

    my ($json_sections_array) = @_;

    my %lookup = map { (".$_" => 1) } @$json_sections_array;

    return \%lookup;
}
|
---|
[28249] | 330 |
|
---|
| 331 |
|
---|
# For each requested section number, record every ancestor section in
# a lookup hash: "1.2.3" contributes ".1.2" and ".1".  Keys carry a
# leading '.' to match the XML parser's curr_section_num format.
#
# Returns a hashref mapping ".<ancestor-secnum>" => 1.
sub parent_sections_as_hash
{
    my $self = shift @_;

    my ($json_sections_array) = @_;

    my $ancestor_hash = {};

    foreach my $secnum ( @$json_sections_array ) {

        # Work on a copy so the caller's array (also used later for the
        # delete pass) is left untouched.
        my $ancestor = $secnum;

        # Strip one trailing ".<num>" component per iteration; every
        # intermediate result is an ancestor section.
        while ($ancestor =~ s/\.\d+$//) {
            $ancestor_hash->{".$ancestor"} = 1;
        }
    }

    return $ancestor_hash;
}
|
---|
| 354 |
|
---|
# Normalise a boolean-ish CGI argument stored on $self into 1/0.
# A leading "true"/"false" (case-insensitive, as transmitted by the
# Javascript side) becomes 1/0; a missing or empty value counts as 0.
sub parse_flag
{
    my $self = shift @_;

    my ($arg_name) = @_;

    my $value = $self->{$arg_name} || 0;

    $value =~ s/^true/1/i;
    $value =~ s/^false/0/i;

    return $value;
}
|
---|
| 368 |
|
---|
# Core worker for the extract-archives-doc action (called with the
# collection already locked by extract_archives_doc).
#
# Reads CGI-derived state off $self:
#   d             - docid of the source archives document
#   json-sections - JSON array of section numbers to extract
#   newd          - optional docid for the extracted document
#                   (defaults to "HASH<timestamp>")
#   keep-parent-metadata / keep-parent-content - boolean-ish flags
#
# Two filtering passes over the archives doc.xml:
#   1. "extract": write the requested sections into a new doc-<ts>.xml
#   2. "delete":  rewrite the original in place with those sections removed
# then updates the archiveinf-doc database so the old record is marked
# for reindexing ("R") and a new record is created for the extracted
# document ("I").  Reports success/failure through $gsdl_cgi.
sub _extract_archives_doc
{
    my $self = shift @_;

    my $collect = $self->{'collect'};
    my $gsdl_cgi = $self->{'gsdl_cgi'};
    my $infodbtype = $self->{'infodbtype'};

    my $site = $self->{'site'};

    # Obtain the collect and archive dir
    my $collect_dir = $gsdl_cgi->get_collection_dir($site);

    my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");

    # look up additional args
    my $docid = $self->{'d'};

    my $timestamp = time();
    my $new_docid = $self->{'newd'} || "HASH$timestamp";
    # stash for dxml_metadata's Identifier rewrite (via remove_from_doc_xml)
    $self->{'new_docid'} = $new_docid;

    $self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata");
    $self->{'keep-parent-content'} = $self->parse_flag("keep-parent-content");

    my $json_sections_str = $self->{'json-sections'};
    my $json_sections_array = decode_json($json_sections_str);


    # Locate the doc.xml for this docid via the archiveinf-doc database
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);

    my $doc_file = $doc_rec->{'doc-file'}->[0];
    my $doc_filename = &util::filename_cat($archive_dir, $doc_file);

    # Derive the new document's filename: doc.xml (or doc-<n>.xml) becomes
    # doc-<timestamp>.xml.
    # NOTE(review): the '.' before "xml" is unescaped, so it matches any
    # character (e.g. "docAxml") - presumably harmless for Greenstone
    # filenames, but worth confirming.
    my $new_doc_file = $doc_file;
    $new_doc_file =~ s/doc(-\d+)?.xml$/doc-$timestamp.xml/;

    my $newdoc_filename = &util::filename_cat($archive_dir, $new_doc_file);

    my $sec_num_hash = $self->sections_as_hash($json_sections_array);
    my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array);

    # Pass 1: extract the requested sections into the new file
    my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract");

    if ($extract_status == 0)
    {
        # NOTE(review): the second argument is ignored by
        # sections_as_hash, and $delete_sec_num_hash is never used below
        # ($sec_num_hash is passed to the delete pass instead).
        my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");

        # Pass 2: remove those sections from the original, in place
        my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $sec_num_hash, undef, "delete");

        if ($delete_status == 0) {

            # Existing doc record needs to be reindexed
            $doc_rec->{'index-status'} = ["R"];
            &dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid, $doc_rec);

            # Create doc-record entry for the newly extracted document

            # NOTE(review): this is a reference alias, NOT a copy -
            # mutating $new_doc_rec also mutates $doc_rec.  It works only
            # because the old record was already written out above;
            # a deep copy would be safer.
            my $new_doc_rec = $doc_rec;
            $new_doc_rec->{'index-status'} = ["I"];
            #### Need to cut this down to just the assoc files the new document references

            $new_doc_rec->{'doc-file'} = [$new_doc_file];

            &dbutil::set_infodb_entry($infodbtype, $arcinfo_doc_filename, $new_docid, $new_doc_rec);

            #### Also need to update the archivesinf-src database!!!!
            # For all the assoc and src files, retrieve record, and add in new_docid

            my $mess = "document-extract successful: Key[$docid]\n";

            $gsdl_cgi->generate_ok_message($mess);
        }
        else {
            # NOTE(review): 'my $mess .=' appends to undef (warns under
            # "use warnings"); 'my $mess =' was probably intended.
            my $mess .= "Failed to extract identified section numbers for key: $docid\n";
            $mess .= "Exit status: $delete_status\n";
            $mess .= "System Error Message: $!\n";
            $mess .= "-" x 20 . "\n";

            $gsdl_cgi->generate_error($mess);
        }
    }
    else
    {
        # NOTE(review): same 'my $mess .=' quirk as above.
        my $mess .= "Failed to remove identified section numbers for key: $docid\n";
        $mess .= "Exit status: $extract_status\n";
        $mess .= "System Error Message: $!\n";
        $mess .= "-" x 20 . "\n";

        $gsdl_cgi->generate_error($mess);
    }

    #return $status; # in case calling functions have a use for this
}
|
---|
| 464 |
|
---|
| 465 |
|
---|
| 466 |
|
---|
[28243] | 467 | # JSON version that will get the requested metadata values
|
---|
| 468 | # from the requested source (index, import, archives or live)
|
---|
| 469 | # One of the params is a JSON string and the return value is JSON too
|
---|
| 470 | # http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
|
---|
# JSON version that will get the requested metadata values
# from the requested source (index, import, archives or live)
# One of the params is a JSON string and the return value is JSON too
# http://forums.asp.net/t/1844684.aspx/1 - Web api method return json in string
#
# Public entry point for the "extract-archives-doc" action: performs
# authentication (when enabled) and collection locking, then delegates
# the real work to _extract_archives_doc.  Output is produced by the
# worker via $gsdl_cgi; nothing meaningful is returned.
sub extract_archives_doc
{
    my $self = shift @_;

    my $username = $self->{'username'};
    my $collect = $self->{'collect'};
    my $gsdl_cgi = $self->{'gsdl_cgi'};

    if ($baseaction::authentication_enabled)
    {
        # Ensure the user is allowed to edit this collection
        $self->authenticate_user($username, $collect);
    }

    # Make sure the collection isn't locked by someone else
    $self->lock_collection($username, $collect);

    $self->_extract_archives_doc(@_);

    # Release the lock once it is done
    $self->unlock_collection($username, $collect);


}
|
---|
| 495 |
|
---|
| 496 |
|
---|
| 497 | 1;
|
---|