Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 23131

Last change on this file since 23131 was 23131, checked in by kjdon, 14 years ago
added a method get_total_text_length. returns the total lenght for the document, ie al the text lengths for each section
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 30.4 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# doc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
[8894]	10	# This program is free software; you can redistr te it and/or modify
[537]	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[1241]	26	# base class to hold documents
[4]	27
	28	package doc;
[3834]	29	eval {require bytes};
[4]	30
[832]	31	BEGIN {
[1241]	32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]	34	}
	35
[15894]	36	use strict;
[1241]	37	use unicode;
	38	use util;
	39	use ghtml;
[8220]	40	use File::stat;
[1241]	41	##use hashdoc;
[13172]	42	use docprint;
[1241]	43
[4]	44	# the document type may be indexed_doc, nonindexed_doc, or
	45	# classification
	46
[18528]	47	our $OIDcount = 0;
[2327]	48
[20412]	49	# rename_method can be 'url', 'none', 'base64'
[4]	50	sub new {
	51	my $class = shift (@_);
[18319]	52	my ($source_filename, $doc_type, $rename_method) = @_;
[1374]	53
[13770]	54
[1241]	55	my $self = bless {'associated_files'=>[],
	56	'subsection_order'=>[],
	57	'next_subsection'=>1,
	58	'subsections'=>{},
	59	'metadata'=>[],
[2327]	60	'text'=>"",
	61	'OIDtype'=>"hash"}, $class;
[4]	62
[10217]	63	# used to set lastmodified here, but this can screw up the HASH ids, so
	64	# the docsave processor now calls set_lastmodified
[13770]	65
	66	$self->{'source_path'} = $source_filename;
[10217]	67
[7929]	68	if (defined $source_filename) {
[15874]	69	$source_filename = &util::filename_within_collection($source_filename);
[18508]	70	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]	71	$self->set_source_filename ($source_filename, $rename_method);
[7929]	72	}
	73
[1374]	74	$self->set_doc_type ($doc_type) if defined $doc_type;
[485]	75
[4]	76	return $self;
	77	}
[10217]	78	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
	79	sub set_lastmodified {
	80	my $self = shift (@_);
[4]	81
[13770]	82	my $source_path = $self->{'source_path'};
	83
	84	if (defined $source_path && (-e $source_path)) {
	85
	86	my $file_stat = stat($source_path);
[10217]	87	my $mtime = $file_stat->mtime;
[21862]	88	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
	89	$wday, $yday, $isdst) = localtime($mtime);
	90
	91	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
	92
[10217]	93	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
[21862]	94	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]	95	}
	96	}
	97
[1241]	98	# clone the $self object
	99	sub duplicate {
	100	my $self = shift (@_);
	101
	102	my $newobj = {};
	103
[8716]	104	foreach my $k (keys %$self) {
[1241]	105	$newobj->{$k} = &clone ($self->{$k});
	106	}
	107
	108	bless $newobj, ref($self);
	109	return $newobj;
	110	}
	111
	112	sub clone {
	113	my ($from) = @_;
	114	my $type = ref ($from);
	115
	116	if ($type eq "HASH") {
	117	my $to = {};
[8716]	118	foreach my $key (keys %$from) {
[1241]	119	$to->{$key} = &clone ($from->{$key});
	120	}
	121	return $to;
	122	} elsif ($type eq "ARRAY") {
	123	my $to = [];
[8716]	124	foreach my $v (@$from) {
[1241]	125	push (@$to, &clone ($v));
	126	}
	127	return $to;
	128	} else {
	129	return $from;
	130	}
	131	}
	132
[2327]	133	sub set_OIDtype {
	134	my $self = shift (@_);
[12268]	135	my ($type, $metadata) = @_;
[1241]	136
[17025]	137	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|incremental\|dirname\|assigned)$/) {
[2327]	138	$self->{'OIDtype'} = $type;
	139	} else {
	140	$self->{'OIDtype'} = "hash";
	141	}
[16792]	142
[12268]	143	if ($type =~ /^assigned$/) {
	144	if (defined $metadata) {
	145	$self->{'OIDmetadata'} = $metadata;
	146	} else {
	147	$self->{'OIDmetadata'} = "dc.Identifier";
	148	}
	149	}
[2327]	150	}
	151
[20412]	152	# rename_method can be 'url', 'none', 'base64'
[1241]	153	sub set_source_filename {
	154	my $self = shift (@_);
[18319]	155	my ($source_filename, $rename_method) = @_;
[1241]	156
[16578]	157	# Since the gsdlsourcefilename element goes into the doc.xml it has
	158	# to be utf8. However, it should also represent the source filename
	159	# (in the import directory) which may not be utf8 at all.
[18319]	160	# For instance, if this meta element (gsdlsourcefilename) will be used
	161	# by other applications that parse doc.xml in order to locate
	162	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
	163	# encode the real filename as this is a binary-to-text encoding meaning
	164	# that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]	165
[18319]	166	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
	167
[16670]	168	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
	169	my ($srcfilename,$dirname,$suffix)
	170	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]	171	# print STDERR "-> $srcfilename -> ";
[18319]	172	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[16670]	173	$source_filename = &util::filename_cat($dirname, $srcfilename);
[16792]	174	# print STDERR "$source_filename\n";
[16670]	175
[16578]	176	$self->set_utf8_metadata_element ($self->get_top_section(),
[1241]	177	"gsdlsourcefilename",
	178	$source_filename);
	179	}
	180
[7569]	181	sub set_converted_filename {
	182	my $self = shift (@_);
	183	my ($converted_filename) = @_;
	184
[16578]	185	# we know the converted filename is utf8
	186	$self->set_utf8_metadata_element ($self->get_top_section(),
[7569]	187	"gsdlconvertedfilename",
	188	$converted_filename);
	189	}
	190
[1241]	191	# returns the source_filename as it was provided
[19829]	192	sub get_unmodified_source_filename {
	193	my $self = shift (@_);
	194
	195	return $self->{'source_path'};
	196	}
	197
	198	# returns the source_filename with whatever rename_method was given
[1241]	199	sub get_source_filename {
	200	my $self = shift (@_);
	201
	202	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	203	}
	204
[19829]	205
	206
[7569]	207	# returns converted filename if available else returns source filename
	208	sub get_filename_for_hashing {
	209	my $self = shift (@_);
	210
	211	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
	212
	213	if (!defined $filename) {
[11097]	214	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]	215	# if NULPlug processed file, then don't give a filename
[11097]	216	if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]	217	$filename = undef;
[16670]	218	} else { # returns the URL encoded source filename!
[10980]	219	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	220	}
[7569]	221	}
	222	return $filename;
	223	}
	224
[1241]	225	sub set_doc_type {
	226	my $self = shift (@_);
	227	my ($doc_type) = @_;
	228
	229	$self->set_metadata_element ($self->get_top_section(),
	230	"gsdldoctype",
	231	$doc_type);
	232	}
	233
[10217]	234	# returns the gsdldoctype as it was provided
[1241]	235	# the default of "indexed_doc" is used if no document
	236	# type was provided
	237	sub get_doc_type {
	238	my $self = shift (@_);
	239
	240	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
	241	return $doc_type if (defined $doc_type);
	242	return "indexed_doc";
	243	}
	244
	245
	246	# look up the reference to the a particular section
	247	sub _lookup_section {
	248	my $self = shift (@_);
	249	my ($section) = @_;
	250
	251	my ($num);
	252	my $sectionref = $self;
	253
	254	while (defined $section && $section ne "") {
[12327]	255
[1241]	256	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]	257
	258	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
	259
[1241]	260	$section = "" unless defined $section;
	261
[12327]	262
[1241]	263	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
	264	$sectionref = $sectionref->{'subsections'}->{$num};
	265	} else {
	266	return undef;
	267	}
	268	}
	269
	270	return $sectionref;
	271	}
	272
[2327]	273	# calculate OID by hashing the contents of the document
[1241]	274	sub _calc_OID {
	275	my $self = shift (@_);
	276	my ($filename) = @_;
	277
[22855]	278
[1241]	279	my $osexe = &util::get_os_exe();
	280
	281	my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
	282	$ENV{'GSDLOS'},"hashfile$osexe");
[8504]	283
[1241]	284	my $result = "NULL";
[22855]	285
[1679]	286
[1241]	287	if (-e "$hashfile_exe") {
[1679]	288	# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]	289	# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]	290	$result = `hashfile$osexe \"$filename\"`;
[21862]	291
[1241]	292	($result) = $result =~ /:\s*([0-9a-f]+)/i;
	293	} else {
	294	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
	295	}
	296	return "HASH$result";
	297	}
	298
	299	# methods dealing with OID, not groups of them.
	300
[2327]	301	# if $OID is not provided one is calculated
[1241]	302	sub set_OID {
	303	my $self = shift (@_);
	304	my ($OID) = @_;
[8504]	305
[8797]	306	my $use_hash_oid = 0;
[17057]	307	# if an OID wasn't provided calculate one
[1241]	308	if (!defined $OID) {
	309	$OID = "NULL";
[17025]	310	if ($self->{'OIDtype'} =~ /^hash/) {
[8797]	311	$use_hash_oid = 1;
	312	} elsif ($self->{'OIDtype'} eq "incremental") {
[2327]	313	$OID = "D" . $OIDcount;
	314	$OIDcount ++;
[8716]	315
	316	} elsif ($self->{'OIDtype'} eq "dirname") {
	317	$OID = 'J';
	318	my $filename = $self->get_source_filename();
[8797]	319	if (defined($filename)) { # && -e $filename) {
[8716]	320	$OID = &File::Basename::dirname($filename);
	321	if (defined $OID) {
	322	$OID = 'J'.&File::Basename::basename($OID);
[19617]	323	$OID = &util::tidy_up_oid($OID);
[8716]	324	} else {
[8797]	325	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
	326	$use_hash_oid = 1;
[8716]	327	}
	328	} else {
[8797]	329	print STDERR "Failed to find filename, generating hash id\n";
	330	$use_hash_oid = 1;
[8716]	331	}
[2327]	332
[8797]	333	} elsif ($self->{'OIDtype'} eq "assigned") {
[12268]	334	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]	335	if (defined $identifier && $identifier ne "") {
[17025]	336	$OID = $identifier;
[19617]	337	$OID = &util::tidy_up_oid($OID);
[8797]	338	} else {
	339	# need a hash id
[12268]	340	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]	341	$use_hash_oid = 1;
	342	}
	343
[2327]	344	} else {
[8797]	345	$use_hash_oid = 1;
	346	}
	347
	348	if ($use_hash_oid) {
[17025]	349	my $hash_on_file = 1;
	350	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
	351	$hash_on_file = 0;
	352	}
	353	if ($hash_on_file) {
	354	# "hash" OID - feed file to hashfile.exe
	355	my $filename = $self->get_filename_for_hashing();
[22855]	356
[17025]	357	# -z: don't want to hash on the file if it is zero size
	358	if (defined($filename) && -e $filename && !-z $filename) {
	359	$OID = $self->_calc_OID ($filename);
	360	} else {
	361	$hash_on_file = 0;
	362	}
	363	}
	364	if (!$hash_on_file) {
	365	my $filename = &util::get_tmp_filename();
[22855]	366	if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]	367	print STDERR "doc::set_OID could not write to $filename\n";
	368	} else {
[13172]	369	my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
	370	print OUTFILE $doc_text;
[2327]	371	close (OUTFILE);
	372	}
	373	$OID = $self->_calc_OID ($filename);
	374	&util::rm ($filename);
[1374]	375	}
[1241]	376	}
	377	}
	378	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	379	}
	380
	381	# this uses hashdoc (embedded c thingy) which is faster but still
	382	# needs a little work to be suffiently stable
	383	sub ___set_OID {
	384	my $self = shift (@_);
	385	my ($OID) = @_;
	386
	387	# if an OID wasn't provided then calculate hash value based on document
	388	if (!defined $OID)
	389	{
[13172]	390	my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]	391	my $hash_len = length($hash_text);
	392
	393	$OID = &hashdoc::buffer($hash_text,$hash_len);
	394	}
	395
	396	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	397	}
	398
	399	# returns the OID for this document
	400	sub get_OID {
	401	my $self = shift (@_);
	402	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
	403	return $OID if (defined $OID);
	404	return "NULL";
	405	}
	406
	407	sub delete_OID {
	408	my $self = shift (@_);
	409
	410	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
	411	}
	412
	413
	414	# methods for manipulating section names
	415
	416	# returns the name of the top-most section (the top
	417	# level of the document
	418	sub get_top_section {
	419	my $self = shift (@_);
	420
	421	return "";
	422	}
	423
	424	# returns a section
	425	sub get_parent_section {
	426	my $self = shift (@_);
	427	my ($section) = @_;
	428
	429	$section =~ s/(^\|\.)\d+$//;
	430
	431	return $section;
	432	}
	433
	434	# returns the first child section (or the end child
	435	# if there isn't any)
	436	sub get_begin_child {
	437	my $self = shift (@_);
	438	my ($section) = @_;
	439
	440	my $section_ptr = $self->_lookup_section($section);
	441	return "" unless defined $section_ptr;
	442
	443	if (defined $section_ptr->{'subsection_order'}->[0]) {
	444	return "$section.$section_ptr->{'subsection_order'}->[0]";
	445	}
	446
	447	return $self->get_end_child ($section);
	448	}
	449
	450	# returns the next child of a parent section
	451	sub get_next_child {
	452	my $self = shift (@_);
	453	my ($section) = @_;
	454
	455	my $parent_section = $self->get_parent_section($section);
	456	my $parent_section_ptr = $self->_lookup_section($parent_section);
	457	return undef unless defined $parent_section_ptr;
	458
	459	my ($section_num) = $section =~ /(\d+)$/;
	460	return undef unless defined $section_num;
	461
	462	my $i = 0;
	463	my $section_order = $parent_section_ptr->{'subsection_order'};
	464	while ($i < scalar(@$section_order)) {
	465	last if $section_order->[$i] eq $section_num;
	466	$i++;
	467	}
	468
	469	$i++; # the next child
	470	if ($i < scalar(@$section_order)) {
	471	return $section_order->[$i] if $parent_section eq "";
	472	return "$parent_section.$section_order->[$i]";
	473	}
	474
	475	# no more sections in this level
	476	return undef;
	477	}
	478
	479	# returns a reference to a list of children
	480	sub get_children {
	481	my $self = shift (@_);
	482	my ($section) = @_;
	483
	484	my $section_ptr = $self->_lookup_section($section);
	485	return [] unless defined $section_ptr;
	486
	487	my @children = @{$section_ptr->{'subsection_order'}};
	488
	489	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
	490	return \@children;
	491	}
	492
	493	# returns the child section one past the last one (which
	494	# is coded as "0")
	495	sub get_end_child {
	496	my $self = shift (@_);
	497	my ($section) = @_;
	498
	499	return $section . ".0" unless $section eq "";
	500	return "0";
	501	}
	502
	503	# returns the next section in book order
	504	sub get_next_section {
	505	my $self = shift (@_);
	506	my ($section) = @_;
	507
	508	return undef unless defined $section;
	509
	510	my $section_ptr = $self->_lookup_section($section);
	511	return undef unless defined $section_ptr;
	512
	513	# first try to find first child
	514	if (defined $section_ptr->{'subsection_order'}->[0]) {
	515	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
	516	return "$section.$section_ptr->{'subsection_order'}->[0]";
	517	}
	518
	519	do {
	520	# try to find sibling
	521	my $next_child = $self->get_next_child ($section);
	522	return $next_child if (defined $next_child);
	523
	524	# move up one level
	525	$section = $self->get_parent_section ($section);
	526	} while $section =~ /\d/;
	527
	528	return undef;
	529	}
	530
	531	sub is_leaf_section {
	532	my $self = shift (@_);
	533	my ($section) = @_;
	534
	535	my $section_ptr = $self->_lookup_section($section);
	536	return 1 unless defined $section_ptr;
	537
	538	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
	539	}
	540
	541	# methods for dealing with sections
	542
	543	# returns the name of the inserted section
	544	sub insert_section {
	545	my $self = shift (@_);
	546	my ($before_section) = @_;
	547
	548	# get the child to insert before and its parent section
	549	my $parent_section = "";
	550	my $before_child = "0";
	551	my @before_section = split (/\./, $before_section);
	552	if (scalar(@before_section) > 0) {
	553	$before_child = pop (@before_section);
	554	$parent_section = join (".", @before_section);
	555	}
	556
	557	my $parent_section_ptr = $self->_lookup_section($parent_section);
	558	if (!defined $parent_section_ptr) {
	559	print STDERR "doc::insert_section couldn't find parent section " .
	560	"$parent_section\n";
	561	return;
	562	}
	563
	564	# get the next section number
	565	my $section_num = $parent_section_ptr->{'next_subsection'}++;
	566
	567	my $i = 0;
	568	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
	569	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
	570	$i++;
	571	}
	572
	573	# insert the section number into the order list
	574	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
	575
	576	# add this section to the parent section
	577	my $section_ptr = {'subsection_order'=>[],
	578	'next_subsection'=>1,
	579	'subsections'=>{},
	580	'metadata'=>[],
	581	'text'=>""};
	582	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
	583
	584	# work out the full section number
	585	my $section = $parent_section;
	586	$section .= "." unless $section eq "";
	587	$section .= $section_num;
	588
	589	return $section;
	590	}
	591
	592	# creates a pre-named section
	593	sub create_named_section {
	594	my $self = shift (@_);
	595	my ($mastersection) = @_;
	596
	597	my ($num);
	598	my $section = $mastersection;
	599	my $sectionref = $self;
	600
	601	while ($section ne "") {
	602	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
	603	$num =~ s/^0+(\d)/$1/; # remove leading 0s
	604	$section = "" unless defined $section;
	605
	606	if (defined $num) {
	607	if (!defined $sectionref->{'subsections'}->{$num}) {
	608	push (@{$sectionref->{'subsection_order'}}, $num);
	609	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
	610	'next_subsection'=>1,
	611	'subsections'=>{},
	612	'metadata'=>[],
	613	'text'=>""};
	614	if ($num >= $sectionref->{'next_subsection'}) {
	615	$sectionref->{'next_subsection'} = $num + 1;
	616	}
	617	}
	618	$sectionref = $sectionref->{'subsections'}->{$num};
	619
	620	} else {
	621	print STDERR "doc::create_named_section couldn't create section ";
	622	print STDERR "$mastersection\n";
	623	last;
	624	}
	625	}
	626	}
	627
	628	# returns a reference to a list of subsections
	629	sub list_subsections {
	630	my $self = shift (@_);
	631	my ($section) = @_;
	632
	633	my $section_ptr = $self->_lookup_section ($section);
	634	if (!defined $section_ptr) {
	635	print STDERR "doc::list_subsections couldn't find section $section\n";
	636	return [];
	637	}
	638
	639	return [@{$section_ptr->{'subsection_order'}}];
	640	}
	641
	642	sub delete_section {
	643	my $self = shift (@_);
	644	my ($section) = @_;
	645
	646	# my $section_ptr = {'subsection_order'=>[],
	647	# 'next_subsection'=>1,
	648	# 'subsections'=>{},
	649	# 'metadata'=>[],
	650	# 'text'=>""};
	651
	652	# if this is the top section reset everything
	653	if ($section eq "") {
	654	$self->{'subsection_order'} = [];
	655	$self->{'subsections'} = {};
	656	$self->{'metadata'} = [];
	657	$self->{'text'} = "";
	658	return;
	659	}
	660
	661	# find the parent of the section to delete
	662	my $parent_section = "";
	663	my $child = "0";
	664	my @section = split (/\./, $section);
	665	if (scalar(@section) > 0) {
	666	$child = pop (@section);
	667	$parent_section = join (".", @section);
	668	}
	669
	670	my $parent_section_ptr = $self->_lookup_section($parent_section);
	671	if (!defined $parent_section_ptr) {
	672	print STDERR "doc::delete_section couldn't find parent section " .
	673	"$parent_section\n";
	674	return;
	675	}
	676
	677	# remove this section from the subsection_order list
	678	my $i = 0;
	679	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
	680	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
	681	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
	682	last;
	683	}
	684	$i++;
	685	}
	686
	687	# remove this section from the subsection hash
	688	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
	689	undef $parent_section_ptr->{'subsections'}->{$child};
	690	}
	691	}
	692
	693	#--
[4]	694	# methods for dealing with metadata
	695
	696	# set_metadata_element and get_metadata_element are for metadata
	697	# which should only have one value. add_meta_data and get_metadata
	698	# are for metadata which can have more than one value.
	699
[1241]	700	# returns the first metadata value which matches field
[6111]	701
	702	# This version of get metadata element works much like the one above,
	703	# except it allows for the namespace portion of a metadata element to
	704	# be ignored, thus if you are searching for dc.Title, the first piece
	705	# of matching metadata ending with the name Title (once any namespace
	706	# is removed) would be returned.
	707	# 28-11-2003 John Thompson
[1241]	708	sub get_metadata_element {
	709	my $self = shift (@_);
[6111]	710	my ($section, $field, $ignore_namespace) = @_;
[1241]	711	my ($data);
	712
[6111]	713	$ignore_namespace = 0 unless defined $ignore_namespace;
	714
[1241]	715	my $section_ptr = $self->_lookup_section($section);
	716	if (!defined $section_ptr) {
[8716]	717	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]	718	return;
	719	}
	720
[20417]	721	# Remove the namespace if we are being told to ignore them
[6111]	722	if($ignore_namespace) {
	723	$field =~ s/^\w*\.//;
	724	}
	725
[1241]	726	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	727
	728	my $data_name = $data->[0];
[14966]	729
[6111]	730	# Remove the any namespace if we are being told to ignore them
	731	if($ignore_namespace) {
	732	$data_name =~ s/^\w*\.//;
	733	}
[22294]	734	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
[6111]	735	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]	736	}
	737
	738	return undef; # was not found
	739	}
	740
	741	# returns a list of the form [value1, value2, ...]
	742	sub get_metadata {
	743	my $self = shift (@_);
[6111]	744	my ($section, $field, $ignore_namespace) = @_;
[1241]	745	my ($data);
	746
[6111]	747	$ignore_namespace = 0 unless defined $ignore_namespace;
	748
[1241]	749	my $section_ptr = $self->_lookup_section($section);
	750	if (!defined $section_ptr) {
[8716]	751	print STDERR "doc::get_metadata couldn't find section ",
	752	$section, "\n";
[1241]	753	return;
	754	}
	755
[6111]	756	# Remove the any namespace if we are being told to ignore them
	757	if($ignore_namespace) {
	758	$field =~ s/^\w*\.//;
	759	}
	760
[1241]	761	my @metadata = ();
	762	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	763
	764	my $data_name = $data->[0];
	765	# Remove the any namespace if we are being told to ignore them
	766	if($ignore_namespace) {
	767	$data_name =~ s/^\w*\.//;
	768	}
[22294]	769	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
[6111]	770	push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]	771	}
[9241]	772
[1241]	773	return \@metadata;
	774	}
	775
	776	# returns a list of the form [[field,value],[field,value],...]
	777	sub get_all_metadata {
	778	my $self = shift (@_);
	779	my ($section) = @_;
	780
	781	my $section_ptr = $self->_lookup_section($section);
	782	if (!defined $section_ptr) {
[8716]	783	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]	784	return;
	785	}
	786
	787	return $section_ptr->{'metadata'};
	788	}
	789
	790	# $value is optional
	791	sub delete_metadata {
	792	my $self = shift (@_);
	793	my ($section, $field, $value) = @_;
	794
	795	my $section_ptr = $self->_lookup_section($section);
	796	if (!defined $section_ptr) {
[8716]	797	print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
[1241]	798	return;
	799	}
	800
	801	my $i = 0;
	802	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
	803	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
	804	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
	805	splice (@{$section_ptr->{'metadata'}}, $i, 1);
	806	} else {
	807	$i++;
	808	}
	809	}
	810	}
	811
	812	sub delete_all_metadata {
	813	my $self = shift (@_);
	814	my ($section) = @_;
	815
	816	my $section_ptr = $self->_lookup_section($section);
	817	if (!defined $section_ptr) {
[8716]	818	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]	819	return;
	820	}
	821
	822	$section_ptr->{'metadata'} = [];
	823	}
	824
[4]	825	sub set_metadata_element {
	826	my $self = shift (@_);
	827	my ($section, $field, $value) = @_;
	828
[97]	829	$self->set_utf8_metadata_element ($section, $field,
[1870]	830	&unicode::ascii2utf8(\$value));
[73]	831	}
	832
	833	# set_utf8_metadata_element assumes the text has already been
	834	# converted to the UTF-8 encoding.
	835	sub set_utf8_metadata_element {
	836	my $self = shift (@_);
	837	my ($section, $field, $value) = @_;
	838
[4]	839	$self->delete_metadata ($section, $field);
[73]	840	$self->add_utf8_metadata ($section, $field, $value);
[4]	841	}
	842
	843
[73]	844	# add_metadata assumes the text is in (extended) ascii form. For
[8220]	845	# text which has already been converted to the UTF-8 format use
[73]	846	# add_utf8_metadata.
[4]	847	sub add_metadata {
	848	my $self = shift (@_);
	849	my ($section, $field, $value) = @_;
	850
[97]	851	$self->add_utf8_metadata ($section, $field,
[1870]	852	&unicode::ascii2utf8(\$value));
[73]	853	}
	854
	855	sub add_utf8_metadata {
	856	my $self = shift (@_);
	857	my ($section, $field, $value) = @_;
	858
[22950]	859	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
	860	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
	861	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
	862
[4]	863	my $section_ptr = $self->_lookup_section($section);
	864	if (!defined $section_ptr) {
[8716]	865	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]	866	return;
	867	}
[1732]	868	if (!defined $value) {
	869	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
	870	return;
	871	}
	872	if (!defined $field) {
	873	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
	874	return;
	875	}
[4]	876
[8894]	877	#print STDERR "###$field=$value\n";
[22950]	878
	879	# For now, supress this check. Given that text data read in is now
	880	# Unicode aware, then the following block of code can (ironically enough)
	881	# cause our unicode compliant string to be re-encoded (leading to
	882	# a double-encoded UTF-8 string, which we definitely don't want!).
	883
	884
[7798]	885	# double check that the value is utf-8
[22950]	886	# if (!&unicode::check_is_utf8($value)) {
	887	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
	888	# &unicode::ensure_utf8(\$value);
	889	# print STDERR " Tried converting to utf8: $value\n";
	890	# }
[7798]	891
[4]	892	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
	893	}
	894
	895
	896	# methods for dealing with text
	897
[1241]	898	# returns the text for a section
	899	sub get_text {
	900	my $self = shift (@_);
	901	my ($section) = @_;
	902
	903	my $section_ptr = $self->_lookup_section($section);
	904	if (!defined $section_ptr) {
	905	print STDERR "doc::get_text couldn't find section " .
	906	"$section\n";
	907	return "";
	908	}
	909
	910	return $section_ptr->{'text'};
	911	}
	912
	913	# returns the (utf-8 encoded) length of the text for a section
	914	sub get_text_length {
	915	my $self = shift (@_);
	916	my ($section) = @_;
	917
	918	my $section_ptr = $self->_lookup_section($section);
	919	if (!defined $section_ptr) {
	920	print STDERR "doc::get_text_length couldn't find section " .
	921	"$section\n";
	922	return 0;
	923	}
	924
	925	return length ($section_ptr->{'text'});
	926	}
	927
[23131]	928	# returns the total length for all the sections
	929	sub get_total_text_length {
	930	my $self = shift (@_);
	931
	932	my $section = $self->get_top_section();
	933	my $length = 0;
	934	while (defined $section) {
	935	$length += $self->get_text_length($section);
	936	$section = $self->get_next_section($section);
	937	}
	938	return $length;
	939	}
	940
[1241]	941	sub delete_text {
	942	my $self = shift (@_);
	943	my ($section) = @_;
	944
	945	my $section_ptr = $self->_lookup_section($section);
	946	if (!defined $section_ptr) {
	947	print STDERR "doc::delete_text couldn't find section " .
	948	"$section\n";
	949	return;
	950	}
	951
	952	$section_ptr->{'text'} = "";
	953	}
	954
[73]	955	# add_text assumes the text is in (extended) ascii form. For
	956	# text which has been already converted to the UTF-8 format
	957	# use add_utf8_text.
[4]	958	sub add_text {
	959	my $self = shift (@_);
	960	my ($section, $text) = @_;
	961
[73]	962	# convert the text to UTF-8 encoded unicode characters
	963	# and add the text
[1870]	964	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]	965	}
	966
	967
	968	# add_utf8_text assumes the text to be added has already
	969	# been converted to the UTF-8 encoding. For ascii text use
	970	# add_text
	971	sub add_utf8_text {
	972	my $self = shift (@_);
	973	my ($section, $text) = @_;
	974
[4]	975	my $section_ptr = $self->_lookup_section($section);
	976	if (!defined $section_ptr) {
[73]	977	print STDERR "doc::add_utf8_text couldn't find section " .
[4]	978	"$section\n";
	979	return;
	980	}
	981
	982	$section_ptr->{'text'} .= $text;
	983	}
	984
[16950]	985	# returns the Source meta, which is the utf8 filename generated.
	986	# Added a separate method here for convenience
	987	sub get_source {
[16924]	988	my $self = shift (@_);
[16950]	989	return $self->get_metadata_element ($self->get_top_section(), "Source");
	990	}
	991
	992	# returns the SourceFile meta, which is the url reference to the URL-encoded
	993	# version of Source (the utf8 filename). Added a separate method here for convenience
	994	sub get_sourcefile {
	995	my $self = shift (@_);
	996	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
	997	}
	998
	999	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
	1000	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
	1001	sub get_assocfile_from_sourcefile {
	1002	my $self = shift (@_);
[16924]	1003
	1004	# get the SourceFile meta, which is a URL to a file on the filesystem
	1005	my $top_section = $self->get_top_section();
	1006	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]	1007
[16924]	1008	# get the actual filename as it exists on the filesystem which this url refers to
[16928]	1009	$source_file = &unicode::url_to_filename($source_file);
[16924]	1010	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
	1011	return $assocfilename;
	1012	}
	1013
[1241]	1014	# methods for dealing with associated files
	1015
	1016	# a file is associated with a document, NOT a section.
	1017	# if section is defined it is noted in the data structure
	1018	# only so that files associated from a particular section
	1019	# may be removed later (using delete_section_assoc_files)
	1020	sub associate_file {
	1021	my $self = shift (@_);
	1022	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
	1023	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
	1024
	1025	# remove all associated files with the same name
	1026	$self->delete_assoc_file ($assoc_filename);
[8504]	1027
[1241]	1028	push (@{$self->{'associated_files'}},
	1029	[$real_filename, $assoc_filename, $mime_type, $section]);
	1030	}
	1031
	1032	# returns a list of associated files in the form
	1033	# [[real_filename, assoc_filename, mimetype], ...]
	1034	sub get_assoc_files {
	1035	my $self = shift (@_);
	1036
	1037	return $self->{'associated_files'};
	1038	}
	1039
[20775]	1040	# the following two methods used to keep track of original associated files
	1041	# for incremental building. eg a txt file used by an item file does not end
	1042	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
	1043	# file for incremental build
	1044	sub associate_source_file {
	1045	my $self = shift (@_);
	1046	my ($full_filename) = @_;
[19494]	1047
[20775]	1048	push (@{$self->{'source_assoc_files'}}, $full_filename);
	1049
	1050	}
	1051
	1052	sub get_source_assoc_files {
	1053	my $self = shift (@_);
	1054
	1055	return $self->{'source_assoc_files'};
	1056
	1057
	1058	}
[19494]	1059	sub metadata_file {
	1060	my $self = shift (@_);
	1061	my ($real_filename, $filename) = @_;
	1062
	1063	push (@{$self->{'metadata_files'}},
	1064	[$real_filename, $filename]);
	1065	}
	1066
[21566]	1067	# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]	1068	sub get_meta_files {
	1069	my $self = shift (@_);
	1070
	1071	return $self->{'metadata_files'};
	1072	}
	1073
[1241]	1074	sub delete_section_assoc_files {
	1075	my $self = shift (@_);
	1076	my ($section) = @_;
	1077
	1078	my $i=0;
	1079	while ($i < scalar (@{$self->{'associated_files'}})) {
	1080	if (defined $self->{'associated_files'}->[$i]->[3] &&
	1081	$self->{'associated_files'}->[$i]->[3] eq $section) {
	1082	splice (@{$self->{'associated_files'}}, $i, 1);
	1083	} else {
	1084	$i++;
	1085	}
	1086	}
	1087	}
	1088
	1089	sub delete_assoc_file {
	1090	my $self = shift (@_);
	1091	my ($assoc_filename) = @_;
	1092
	1093	my $i=0;
	1094	while ($i < scalar (@{$self->{'associated_files'}})) {
	1095	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
	1096	splice (@{$self->{'associated_files'}}, $i, 1);
	1097	} else {
	1098	$i++;
	1099	}
	1100	}
	1101	}
	1102
	1103	sub reset_nextsection_ptr {
	1104	my $self = shift (@_);
	1105	my ($section) = @_;
	1106
	1107	my $section_ptr = $self->_lookup_section($section);
	1108	$section_ptr->{'next_subsection'} = 1;
	1109	}
	1110
[4]	1111	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: