Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 24219

Last change on this file since 24219 was 23939, checked in by ak19, 13 years ago
GS3's OAIserver passes final official oaiserver validation tests: to do with earliestDatestamp. 1. Perl code (inexport, basebuilder, colcfg, buildconfigxml.pm perl files) write out the earliestDatestamp into GS3's buildconfig.xml. Whenever a full-build is performed, the archives directory is recreated. At this stage, inexport creates a new file in archives called earliestDatestamp containing the current time. Whenever an incremental build is performed, this file already exists in archive, so it is left untouched, preserving the time of the full-build (which is the earliestDatestamp). The other perl files are concerned with obtaining this value from the archives directory and writing it out to the build config file. 2. doc.pm and BasePlugout.pm write out the current date and time for each document processed under the new fields oailastmodified and oailastmodifieddate. Changes made in this commit are related to GS3 java src code changes that work in tandem.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 34.2 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# doc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
[8894]	10	# This program is free software; you can redistr te it and/or modify
[537]	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[1241]	26	# base class to hold documents
[4]	27
	28	package doc;
[3834]	29	eval {require bytes};
[4]	30
[832]	31	BEGIN {
[1241]	32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]	34	}
	35
[15894]	36	use strict;
[1241]	37	use unicode;
	38	use util;
	39	use ghtml;
[8220]	40	use File::stat;
[1241]	41	##use hashdoc;
[13172]	42	use docprint;
[1241]	43
[4]	44	# the document type may be indexed_doc, nonindexed_doc, or
	45	# classification
	46
[18528]	47	our $OIDcount = 0;
[2327]	48
[20412]	49	# rename_method can be 'url', 'none', 'base64'
[4]	50	sub new {
	51	my $class = shift (@_);
[18319]	52	my ($source_filename, $doc_type, $rename_method) = @_;
[1374]	53
[13770]	54
[1241]	55	my $self = bless {'associated_files'=>[],
	56	'subsection_order'=>[],
	57	'next_subsection'=>1,
	58	'subsections'=>{},
	59	'metadata'=>[],
[2327]	60	'text'=>"",
	61	'OIDtype'=>"hash"}, $class;
[4]	62
[10217]	63	# used to set lastmodified here, but this can screw up the HASH ids, so
	64	# the docsave processor now calls set_lastmodified
[13770]	65
[23362]	66	$self->set_source_path($source_filename);
[10217]	67
[7929]	68	if (defined $source_filename) {
[15874]	69	$source_filename = &util::filename_within_collection($source_filename);
[18508]	70	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]	71	$self->set_source_filename ($source_filename, $rename_method);
[7929]	72	}
	73
[1374]	74	$self->set_doc_type ($doc_type) if defined $doc_type;
[485]	75
[4]	76	return $self;
	77	}
[23362]	78
	79
	80	sub set_source_path
	81	{
	82	my $self = shift @_;
	83	my ($source_filename) = @_;
	84
	85	if (defined $source_filename) {
	86	# On Windows the source_filename can be in terse DOS format
	87	# e.g. test~1.txt
	88
	89	$self->{'terse_source_path'} = $source_filename;
	90
	91	if (-e $source_filename) {
	92	# See if we can do better for Windows with a filename
	93	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	94	require Win32;
	95	$self->{'source_path'} = Win32::GetLongPathName($source_filename);
	96	}
	97	else {
	98	# For Unix-based systems, there is no difference between the two
	99	$self->{'source_path'} = $source_filename;
	100	}
	101	}
	102	else {
	103	print STDERR "Warning: In doc::set_source_path(), file\n";
	104	print STDERR " $source_filename\n";
	105	print STDERR " does not exist\n";
	106
	107	# (default) Set it to whatever we were given
	108	$self->{'source_path'} = $source_filename;
	109	}
	110	}
	111	else {
	112	# Previous code for setting source_path allowed for
	113	# it to be undefined, so continue this practice
	114	$self->{'terse_source_path'} = undef;
	115	$self->{'source_path'} = undef;
	116	}
	117	}
	118
	119
	120	sub get_source_path
	121	{
	122	my $self = shift @_;
	123
	124	return $self->{'terse_source_path'};
	125	}
	126
[10217]	127	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
[23939]	128	sub set_oailastmodified {
	129	my $self = shift (@_);
	130
	131	my $source_path = $self->{'terse_source_path'};
	132
	133	if (defined $source_path && (-e $source_path)) {
	134	my $current_time = time;
	135
	136	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
	137	$wday, $yday, $isdst) = localtime($current_time);
	138
	139	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
	140
	141	$self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
	142	$self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
	143	}
	144	}
	145
	146	# no longer used for OAI purposes, since lastmodified is not what we want as the
	147	# Datestamp of a document. This doc metadata may be useful for general purposes.
[10217]	148	sub set_lastmodified {
	149	my $self = shift (@_);
[4]	150
[23362]	151	my $source_path = $self->{'terse_source_path'};
[13770]	152
	153	if (defined $source_path && (-e $source_path)) {
	154
	155	my $file_stat = stat($source_path);
[10217]	156	my $mtime = $file_stat->mtime;
[21862]	157	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
	158	$wday, $yday, $isdst) = localtime($mtime);
	159
	160	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
	161
[23923]	162	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
[21862]	163	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]	164	}
	165	}
	166
[1241]	167	# clone the $self object
	168	sub duplicate {
	169	my $self = shift (@_);
	170
	171	my $newobj = {};
	172
[8716]	173	foreach my $k (keys %$self) {
[1241]	174	$newobj->{$k} = &clone ($self->{$k});
	175	}
	176
	177	bless $newobj, ref($self);
	178	return $newobj;
	179	}
	180
	181	sub clone {
	182	my ($from) = @_;
	183	my $type = ref ($from);
	184
	185	if ($type eq "HASH") {
	186	my $to = {};
[8716]	187	foreach my $key (keys %$from) {
[1241]	188	$to->{$key} = &clone ($from->{$key});
	189	}
	190	return $to;
	191	} elsif ($type eq "ARRAY") {
	192	my $to = [];
[8716]	193	foreach my $v (@$from) {
[1241]	194	push (@$to, &clone ($v));
	195	}
	196	return $to;
	197	} else {
	198	return $from;
	199	}
	200	}
	201
[2327]	202	sub set_OIDtype {
	203	my $self = shift (@_);
[12268]	204	my ($type, $metadata) = @_;
[1241]	205
[17025]	206	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|incremental\|dirname\|assigned)$/) {
[2327]	207	$self->{'OIDtype'} = $type;
	208	} else {
	209	$self->{'OIDtype'} = "hash";
	210	}
[16792]	211
[12268]	212	if ($type =~ /^assigned$/) {
	213	if (defined $metadata) {
	214	$self->{'OIDmetadata'} = $metadata;
	215	} else {
	216	$self->{'OIDmetadata'} = "dc.Identifier";
	217	}
	218	}
[2327]	219	}
	220
[20412]	221	# rename_method can be 'url', 'none', 'base64'
[1241]	222	sub set_source_filename {
	223	my $self = shift (@_);
[18319]	224	my ($source_filename, $rename_method) = @_;
[1241]	225
[16578]	226	# Since the gsdlsourcefilename element goes into the doc.xml it has
	227	# to be utf8. However, it should also represent the source filename
	228	# (in the import directory) which may not be utf8 at all.
[18319]	229	# For instance, if this meta element (gsdlsourcefilename) will be used
	230	# by other applications that parse doc.xml in order to locate
	231	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
	232	# encode the real filename as this is a binary-to-text encoding meaning
	233	# that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]	234
[18319]	235	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
	236
[16670]	237	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
[23278]	238	$source_filename = $self->encode_filename($source_filename, $rename_method);
	239	# my ($srcfilename,$dirname,$suffix)
	240	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]	241	# print STDERR "-> $srcfilename -> ";
[23278]	242	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
	243	# $source_filename = &util::filename_cat($dirname, $srcfilename);
[16792]	244	# print STDERR "$source_filename\n";
[16670]	245
[16578]	246	$self->set_utf8_metadata_element ($self->get_top_section(),
[1241]	247	"gsdlsourcefilename",
	248	$source_filename);
	249	}
	250
[23278]	251	sub encode_filename {
	252	my $self = shift (@_);
	253	my ($source_filename, $rename_method) = @_;
	254
	255	my ($srcfilename,$dirname,$suffix)
	256	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
	257	# print STDERR "-> $srcfilename -> ";
	258	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
	259	$source_filename = &util::filename_cat($dirname, $srcfilename);
	260
	261	return $source_filename;
	262	}
	263
[7569]	264	sub set_converted_filename {
	265	my $self = shift (@_);
	266	my ($converted_filename) = @_;
	267
[16578]	268	# we know the converted filename is utf8
	269	$self->set_utf8_metadata_element ($self->get_top_section(),
[7569]	270	"gsdlconvertedfilename",
	271	$converted_filename);
	272	}
	273
[1241]	274	# returns the source_filename as it was provided
[19829]	275	sub get_unmodified_source_filename {
	276	my $self = shift (@_);
	277
[23362]	278	return $self->{'terse_source_path'};
[19829]	279	}
	280
	281	# returns the source_filename with whatever rename_method was given
[1241]	282	sub get_source_filename {
	283	my $self = shift (@_);
	284
	285	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	286	}
	287
[19829]	288
	289
[7569]	290	# returns converted filename if available else returns source filename
	291	sub get_filename_for_hashing {
	292	my $self = shift (@_);
	293
	294	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
	295
	296	if (!defined $filename) {
[11097]	297	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]	298	# if NULPlug processed file, then don't give a filename
[11097]	299	if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]	300	$filename = undef;
[16670]	301	} else { # returns the URL encoded source filename!
[10980]	302	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	303	}
[7569]	304	}
[23485]	305
	306	if (!&util::filename_is_absolute($filename)) {
	307	$filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
	308	}
	309
[7569]	310	return $filename;
	311	}
	312
[1241]	313	sub set_doc_type {
	314	my $self = shift (@_);
	315	my ($doc_type) = @_;
	316
	317	$self->set_metadata_element ($self->get_top_section(),
	318	"gsdldoctype",
	319	$doc_type);
	320	}
	321
[10217]	322	# returns the gsdldoctype as it was provided
[1241]	323	# the default of "indexed_doc" is used if no document
	324	# type was provided
	325	sub get_doc_type {
	326	my $self = shift (@_);
	327
	328	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
	329	return $doc_type if (defined $doc_type);
	330	return "indexed_doc";
	331	}
	332
	333
	334	# look up the reference to the a particular section
	335	sub _lookup_section {
	336	my $self = shift (@_);
	337	my ($section) = @_;
	338
	339	my ($num);
	340	my $sectionref = $self;
	341
	342	while (defined $section && $section ne "") {
[12327]	343
[1241]	344	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]	345
	346	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
	347
[1241]	348	$section = "" unless defined $section;
	349
[12327]	350
[1241]	351	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
	352	$sectionref = $sectionref->{'subsections'}->{$num};
	353	} else {
	354	return undef;
	355	}
	356	}
	357
	358	return $sectionref;
	359	}
	360
[2327]	361	# calculate OID by hashing the contents of the document
[1241]	362	sub _calc_OID {
	363	my $self = shift (@_);
	364	my ($filename) = @_;
	365
[22855]	366
[1241]	367	my $osexe = &util::get_os_exe();
	368
	369	my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
	370	$ENV{'GSDLOS'},"hashfile$osexe");
[8504]	371
[1241]	372	my $result = "NULL";
[22855]	373
[1679]	374
[1241]	375	if (-e "$hashfile_exe") {
[1679]	376	# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]	377	# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]	378	$result = `hashfile$osexe \"$filename\"`;
[21862]	379
[1241]	380	($result) = $result =~ /:\s*([0-9a-f]+)/i;
	381	} else {
	382	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
	383	}
	384	return "HASH$result";
	385	}
	386
	387	# methods dealing with OID, not groups of them.
	388
[2327]	389	# if $OID is not provided one is calculated
[1241]	390	sub set_OID {
	391	my $self = shift (@_);
	392	my ($OID) = @_;
[8504]	393
[8797]	394	my $use_hash_oid = 0;
[17057]	395	# if an OID wasn't provided calculate one
[1241]	396	if (!defined $OID) {
	397	$OID = "NULL";
[17025]	398	if ($self->{'OIDtype'} =~ /^hash/) {
[8797]	399	$use_hash_oid = 1;
	400	} elsif ($self->{'OIDtype'} eq "incremental") {
[2327]	401	$OID = "D" . $OIDcount;
	402	$OIDcount ++;
[8716]	403
	404	} elsif ($self->{'OIDtype'} eq "dirname") {
	405	$OID = 'J';
	406	my $filename = $self->get_source_filename();
[8797]	407	if (defined($filename)) { # && -e $filename) {
[8716]	408	$OID = &File::Basename::dirname($filename);
	409	if (defined $OID) {
	410	$OID = 'J'.&File::Basename::basename($OID);
[19617]	411	$OID = &util::tidy_up_oid($OID);
[8716]	412	} else {
[8797]	413	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
	414	$use_hash_oid = 1;
[8716]	415	}
	416	} else {
[8797]	417	print STDERR "Failed to find filename, generating hash id\n";
	418	$use_hash_oid = 1;
[8716]	419	}
[2327]	420
[8797]	421	} elsif ($self->{'OIDtype'} eq "assigned") {
[12268]	422	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]	423	if (defined $identifier && $identifier ne "") {
[17025]	424	$OID = $identifier;
[19617]	425	$OID = &util::tidy_up_oid($OID);
[8797]	426	} else {
	427	# need a hash id
[12268]	428	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]	429	$use_hash_oid = 1;
	430	}
	431
[2327]	432	} else {
[8797]	433	$use_hash_oid = 1;
	434	}
	435
	436	if ($use_hash_oid) {
[23562]	437	my $hash_on_file = 1;
[17025]	438	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
	439	$hash_on_file = 0;
	440	}
	441	if ($hash_on_file) {
	442	# "hash" OID - feed file to hashfile.exe
	443	my $filename = $self->get_filename_for_hashing();
[23562]	444
[17025]	445	# -z: don't want to hash on the file if it is zero size
	446	if (defined($filename) && -e $filename && !-z $filename) {
	447	$OID = $self->_calc_OID ($filename);
	448	} else {
	449	$hash_on_file = 0;
	450	}
	451	}
	452	if (!$hash_on_file) {
	453	my $filename = &util::get_tmp_filename();
[22855]	454	if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]	455	print STDERR "doc::set_OID could not write to $filename\n";
	456	} else {
[13172]	457	my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
	458	print OUTFILE $doc_text;
[2327]	459	close (OUTFILE);
	460	}
	461	$OID = $self->_calc_OID ($filename);
	462	&util::rm ($filename);
[1374]	463	}
[1241]	464	}
	465	}
	466	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	467	}
	468
	469	# this uses hashdoc (embedded c thingy) which is faster but still
	470	# needs a little work to be suffiently stable
	471	sub ___set_OID {
	472	my $self = shift (@_);
	473	my ($OID) = @_;
	474
	475	# if an OID wasn't provided then calculate hash value based on document
	476	if (!defined $OID)
	477	{
[13172]	478	my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]	479	my $hash_len = length($hash_text);
	480
	481	$OID = &hashdoc::buffer($hash_text,$hash_len);
	482	}
	483
	484	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	485	}
	486
	487	# returns the OID for this document
	488	sub get_OID {
	489	my $self = shift (@_);
	490	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
	491	return $OID if (defined $OID);
	492	return "NULL";
	493	}
	494
	495	sub delete_OID {
	496	my $self = shift (@_);
	497
	498	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
	499	}
	500
	501
	502	# methods for manipulating section names
	503
	504	# returns the name of the top-most section (the top
	505	# level of the document
	506	sub get_top_section {
	507	my $self = shift (@_);
	508
	509	return "";
	510	}
	511
	512	# returns a section
	513	sub get_parent_section {
	514	my $self = shift (@_);
	515	my ($section) = @_;
	516
	517	$section =~ s/(^\|\.)\d+$//;
	518
	519	return $section;
	520	}
	521
	522	# returns the first child section (or the end child
	523	# if there isn't any)
	524	sub get_begin_child {
	525	my $self = shift (@_);
	526	my ($section) = @_;
	527
	528	my $section_ptr = $self->_lookup_section($section);
	529	return "" unless defined $section_ptr;
	530
	531	if (defined $section_ptr->{'subsection_order'}->[0]) {
	532	return "$section.$section_ptr->{'subsection_order'}->[0]";
	533	}
	534
	535	return $self->get_end_child ($section);
	536	}
	537
	538	# returns the next child of a parent section
	539	sub get_next_child {
	540	my $self = shift (@_);
	541	my ($section) = @_;
	542
	543	my $parent_section = $self->get_parent_section($section);
	544	my $parent_section_ptr = $self->_lookup_section($parent_section);
	545	return undef unless defined $parent_section_ptr;
	546
	547	my ($section_num) = $section =~ /(\d+)$/;
	548	return undef unless defined $section_num;
	549
	550	my $i = 0;
	551	my $section_order = $parent_section_ptr->{'subsection_order'};
	552	while ($i < scalar(@$section_order)) {
	553	last if $section_order->[$i] eq $section_num;
	554	$i++;
	555	}
	556
	557	$i++; # the next child
	558	if ($i < scalar(@$section_order)) {
	559	return $section_order->[$i] if $parent_section eq "";
	560	return "$parent_section.$section_order->[$i]";
	561	}
	562
	563	# no more sections in this level
	564	return undef;
	565	}
	566
	567	# returns a reference to a list of children
	568	sub get_children {
	569	my $self = shift (@_);
	570	my ($section) = @_;
	571
	572	my $section_ptr = $self->_lookup_section($section);
	573	return [] unless defined $section_ptr;
	574
	575	my @children = @{$section_ptr->{'subsection_order'}};
	576
	577	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
	578	return \@children;
	579	}
	580
	581	# returns the child section one past the last one (which
	582	# is coded as "0")
	583	sub get_end_child {
	584	my $self = shift (@_);
	585	my ($section) = @_;
	586
	587	return $section . ".0" unless $section eq "";
	588	return "0";
	589	}
	590
	591	# returns the next section in book order
	592	sub get_next_section {
	593	my $self = shift (@_);
	594	my ($section) = @_;
	595
	596	return undef unless defined $section;
	597
	598	my $section_ptr = $self->_lookup_section($section);
	599	return undef unless defined $section_ptr;
	600
	601	# first try to find first child
	602	if (defined $section_ptr->{'subsection_order'}->[0]) {
	603	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
	604	return "$section.$section_ptr->{'subsection_order'}->[0]";
	605	}
	606
	607	do {
	608	# try to find sibling
	609	my $next_child = $self->get_next_child ($section);
	610	return $next_child if (defined $next_child);
	611
	612	# move up one level
	613	$section = $self->get_parent_section ($section);
	614	} while $section =~ /\d/;
	615
	616	return undef;
	617	}
	618
	619	sub is_leaf_section {
	620	my $self = shift (@_);
	621	my ($section) = @_;
	622
	623	my $section_ptr = $self->_lookup_section($section);
	624	return 1 unless defined $section_ptr;
	625
	626	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
	627	}
	628
	629	# methods for dealing with sections
	630
	631	# returns the name of the inserted section
	632	sub insert_section {
	633	my $self = shift (@_);
	634	my ($before_section) = @_;
	635
	636	# get the child to insert before and its parent section
	637	my $parent_section = "";
	638	my $before_child = "0";
	639	my @before_section = split (/\./, $before_section);
	640	if (scalar(@before_section) > 0) {
	641	$before_child = pop (@before_section);
	642	$parent_section = join (".", @before_section);
	643	}
	644
	645	my $parent_section_ptr = $self->_lookup_section($parent_section);
	646	if (!defined $parent_section_ptr) {
	647	print STDERR "doc::insert_section couldn't find parent section " .
	648	"$parent_section\n";
	649	return;
	650	}
	651
	652	# get the next section number
	653	my $section_num = $parent_section_ptr->{'next_subsection'}++;
	654
	655	my $i = 0;
	656	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
	657	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
	658	$i++;
	659	}
	660
	661	# insert the section number into the order list
	662	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
	663
	664	# add this section to the parent section
	665	my $section_ptr = {'subsection_order'=>[],
	666	'next_subsection'=>1,
	667	'subsections'=>{},
	668	'metadata'=>[],
	669	'text'=>""};
	670	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
	671
	672	# work out the full section number
	673	my $section = $parent_section;
	674	$section .= "." unless $section eq "";
	675	$section .= $section_num;
	676
	677	return $section;
	678	}
	679
	680	# creates a pre-named section
	681	sub create_named_section {
	682	my $self = shift (@_);
	683	my ($mastersection) = @_;
	684
	685	my ($num);
	686	my $section = $mastersection;
	687	my $sectionref = $self;
	688
	689	while ($section ne "") {
	690	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
	691	$num =~ s/^0+(\d)/$1/; # remove leading 0s
	692	$section = "" unless defined $section;
	693
	694	if (defined $num) {
	695	if (!defined $sectionref->{'subsections'}->{$num}) {
	696	push (@{$sectionref->{'subsection_order'}}, $num);
	697	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
	698	'next_subsection'=>1,
	699	'subsections'=>{},
	700	'metadata'=>[],
	701	'text'=>""};
	702	if ($num >= $sectionref->{'next_subsection'}) {
	703	$sectionref->{'next_subsection'} = $num + 1;
	704	}
	705	}
	706	$sectionref = $sectionref->{'subsections'}->{$num};
	707
	708	} else {
	709	print STDERR "doc::create_named_section couldn't create section ";
	710	print STDERR "$mastersection\n";
	711	last;
	712	}
	713	}
	714	}
	715
	716	# returns a reference to a list of subsections
	717	sub list_subsections {
	718	my $self = shift (@_);
	719	my ($section) = @_;
	720
	721	my $section_ptr = $self->_lookup_section ($section);
	722	if (!defined $section_ptr) {
	723	print STDERR "doc::list_subsections couldn't find section $section\n";
	724	return [];
	725	}
	726
	727	return [@{$section_ptr->{'subsection_order'}}];
	728	}
	729
	730	sub delete_section {
	731	my $self = shift (@_);
	732	my ($section) = @_;
	733
	734	# my $section_ptr = {'subsection_order'=>[],
	735	# 'next_subsection'=>1,
	736	# 'subsections'=>{},
	737	# 'metadata'=>[],
	738	# 'text'=>""};
	739
	740	# if this is the top section reset everything
	741	if ($section eq "") {
	742	$self->{'subsection_order'} = [];
	743	$self->{'subsections'} = {};
	744	$self->{'metadata'} = [];
	745	$self->{'text'} = "";
	746	return;
	747	}
	748
	749	# find the parent of the section to delete
	750	my $parent_section = "";
	751	my $child = "0";
	752	my @section = split (/\./, $section);
	753	if (scalar(@section) > 0) {
	754	$child = pop (@section);
	755	$parent_section = join (".", @section);
	756	}
	757
	758	my $parent_section_ptr = $self->_lookup_section($parent_section);
	759	if (!defined $parent_section_ptr) {
	760	print STDERR "doc::delete_section couldn't find parent section " .
	761	"$parent_section\n";
	762	return;
	763	}
	764
	765	# remove this section from the subsection_order list
	766	my $i = 0;
	767	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
	768	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
	769	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
	770	last;
	771	}
	772	$i++;
	773	}
	774
	775	# remove this section from the subsection hash
	776	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
	777	undef $parent_section_ptr->{'subsections'}->{$child};
	778	}
	779	}
	780
	781	#--
[4]	782	# methods for dealing with metadata
	783
	784	# set_metadata_element and get_metadata_element are for metadata
	785	# which should only have one value. add_meta_data and get_metadata
	786	# are for metadata which can have more than one value.
	787
[1241]	788	# returns the first metadata value which matches field
[6111]	789
	790	# This version of get metadata element works much like the one above,
	791	# except it allows for the namespace portion of a metadata element to
	792	# be ignored, thus if you are searching for dc.Title, the first piece
	793	# of matching metadata ending with the name Title (once any namespace
	794	# is removed) would be returned.
	795	# 28-11-2003 John Thompson
[1241]	796	sub get_metadata_element {
	797	my $self = shift (@_);
[6111]	798	my ($section, $field, $ignore_namespace) = @_;
[1241]	799	my ($data);
	800
[6111]	801	$ignore_namespace = 0 unless defined $ignore_namespace;
	802
[1241]	803	my $section_ptr = $self->_lookup_section($section);
	804	if (!defined $section_ptr) {
[8716]	805	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]	806	return;
	807	}
	808
[20417]	809	# Remove the namespace if we are being told to ignore them
[6111]	810	if($ignore_namespace) {
	811	$field =~ s/^\w*\.//;
	812	}
	813
[1241]	814	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	815
	816	my $data_name = $data->[0];
[14966]	817
[6111]	818	# Remove the any namespace if we are being told to ignore them
	819	if($ignore_namespace) {
	820	$data_name =~ s/^\w*\.//;
	821	}
[22294]	822	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
[6111]	823	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]	824	}
	825
	826	return undef; # was not found
	827	}
	828
	829	# returns a list of the form [value1, value2, ...]
	830	sub get_metadata {
	831	my $self = shift (@_);
[6111]	832	my ($section, $field, $ignore_namespace) = @_;
[1241]	833	my ($data);
	834
[6111]	835	$ignore_namespace = 0 unless defined $ignore_namespace;
	836
[1241]	837	my $section_ptr = $self->_lookup_section($section);
	838	if (!defined $section_ptr) {
[8716]	839	print STDERR "doc::get_metadata couldn't find section ",
	840	$section, "\n";
[1241]	841	return;
	842	}
	843
[6111]	844	# Remove the any namespace if we are being told to ignore them
	845	if($ignore_namespace) {
	846	$field =~ s/^\w*\.//;
	847	}
	848
[1241]	849	my @metadata = ();
	850	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	851
	852	my $data_name = $data->[0];
	853	# Remove the any namespace if we are being told to ignore them
	854	if($ignore_namespace) {
	855	$data_name =~ s/^\w*\.//;
	856	}
[22294]	857	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
[6111]	858	push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]	859	}
[9241]	860
[1241]	861	return \@metadata;
	862	}
	863
[23827]	864	sub get_metadata_hashmap {
	865	my $self = shift (@_);
	866	my ($section, $opt_namespace) = @_;
	867
	868	my $section_ptr = $self->_lookup_section($section);
	869	if (!defined $section_ptr) {
	870	print STDERR "doc::get_metadata couldn't find section ",
	871	$section, "\n";
	872	return;
	873	}
	874
	875	my $metadata_hashmap = {};
	876	foreach my $data (@{$section_ptr->{'metadata'}}) {
	877	my $metaname = $data->[0];
	878
	879	if ((!defined $opt_namespace) \|\| ($metaname =~ m/^$opt_namespace\./)) {
	880	if (!defined $metadata_hashmap->{$metaname}) {
	881	$metadata_hashmap->{$metaname} = [];
	882	}
	883	my $metaval_list = $metadata_hashmap->{$metaname};
	884	push(@$metaval_list, $data->[1]);
	885	}
	886	}
	887
	888	return $metadata_hashmap;
	889	}
	890
[1241]	891	# returns a list of the form [[field,value],[field,value],...]
	892	sub get_all_metadata {
	893	my $self = shift (@_);
	894	my ($section) = @_;
	895
	896	my $section_ptr = $self->_lookup_section($section);
	897	if (!defined $section_ptr) {
[8716]	898	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]	899	return;
	900	}
	901
	902	return $section_ptr->{'metadata'};
	903	}
	904
	905	# $value is optional
	906	sub delete_metadata {
	907	my $self = shift (@_);
	908	my ($section, $field, $value) = @_;
	909
	910	my $section_ptr = $self->_lookup_section($section);
	911	if (!defined $section_ptr) {
[23562]	912	print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
[1241]	913	return;
	914	}
	915
	916	my $i = 0;
	917	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
	918	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
	919	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
	920	splice (@{$section_ptr->{'metadata'}}, $i, 1);
	921	} else {
	922	$i++;
	923	}
	924	}
	925	}
	926
	927	sub delete_all_metadata {
	928	my $self = shift (@_);
	929	my ($section) = @_;
	930
	931	my $section_ptr = $self->_lookup_section($section);
	932	if (!defined $section_ptr) {
[8716]	933	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]	934	return;
	935	}
	936
	937	$section_ptr->{'metadata'} = [];
	938	}
	939
[4]	940	sub set_metadata_element {
	941	my $self = shift (@_);
	942	my ($section, $field, $value) = @_;
	943
[97]	944	$self->set_utf8_metadata_element ($section, $field,
[1870]	945	&unicode::ascii2utf8(\$value));
[73]	946	}
	947
	948	# set_utf8_metadata_element assumes the text has already been
	949	# converted to the UTF-8 encoding.
	950	sub set_utf8_metadata_element {
	951	my $self = shift (@_);
	952	my ($section, $field, $value) = @_;
	953
[4]	954	$self->delete_metadata ($section, $field);
[73]	955	$self->add_utf8_metadata ($section, $field, $value);
[4]	956	}
	957
	958
[73]	959	# add_metadata assumes the text is in (extended) ascii form. For
[8220]	960	# text which has already been converted to the UTF-8 format use
[73]	961	# add_utf8_metadata.
[4]	962	sub add_metadata {
	963	my $self = shift (@_);
	964	my ($section, $field, $value) = @_;
	965
[97]	966	$self->add_utf8_metadata ($section, $field,
[1870]	967	&unicode::ascii2utf8(\$value));
[73]	968	}
	969
	970	sub add_utf8_metadata {
	971	my $self = shift (@_);
	972	my ($section, $field, $value) = @_;
	973
[22950]	974	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
	975	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
	976	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
	977
[4]	978	my $section_ptr = $self->_lookup_section($section);
	979	if (!defined $section_ptr) {
[8716]	980	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]	981	return;
	982	}
[1732]	983	if (!defined $value) {
	984	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
	985	return;
	986	}
	987	if (!defined $field) {
	988	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
	989	return;
	990	}
[4]	991
[8894]	992	#print STDERR "###$field=$value\n";
[22950]	993
	994	# For now, supress this check. Given that text data read in is now
	995	# Unicode aware, then the following block of code can (ironically enough)
	996	# cause our unicode compliant string to be re-encoded (leading to
	997	# a double-encoded UTF-8 string, which we definitely don't want!).
	998
	999
[7798]	1000	# double check that the value is utf-8
[22950]	1001	# if (!&unicode::check_is_utf8($value)) {
	1002	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
	1003	# &unicode::ensure_utf8(\$value);
	1004	# print STDERR " Tried converting to utf8: $value\n";
	1005	# }
[7798]	1006
[4]	1007	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
	1008	}
	1009
	1010
	1011	# methods for dealing with text
	1012
[1241]	1013	# returns the text for a section
	1014	sub get_text {
	1015	my $self = shift (@_);
	1016	my ($section) = @_;
	1017
	1018	my $section_ptr = $self->_lookup_section($section);
	1019	if (!defined $section_ptr) {
	1020	print STDERR "doc::get_text couldn't find section " .
	1021	"$section\n";
	1022	return "";
	1023	}
	1024
	1025	return $section_ptr->{'text'};
	1026	}
	1027
	1028	# returns the (utf-8 encoded) length of the text for a section
	1029	sub get_text_length {
	1030	my $self = shift (@_);
	1031	my ($section) = @_;
	1032
	1033	my $section_ptr = $self->_lookup_section($section);
	1034	if (!defined $section_ptr) {
	1035	print STDERR "doc::get_text_length couldn't find section " .
	1036	"$section\n";
	1037	return 0;
	1038	}
	1039
	1040	return length ($section_ptr->{'text'});
	1041	}
	1042
[23131]	1043	# returns the total length for all the sections
	1044	sub get_total_text_length {
	1045	my $self = shift (@_);
	1046
	1047	my $section = $self->get_top_section();
	1048	my $length = 0;
	1049	while (defined $section) {
	1050	$length += $self->get_text_length($section);
	1051	$section = $self->get_next_section($section);
	1052	}
	1053	return $length;
	1054	}
	1055
[1241]	1056	sub delete_text {
	1057	my $self = shift (@_);
	1058	my ($section) = @_;
	1059
	1060	my $section_ptr = $self->_lookup_section($section);
	1061	if (!defined $section_ptr) {
	1062	print STDERR "doc::delete_text couldn't find section " .
	1063	"$section\n";
	1064	return;
	1065	}
	1066
	1067	$section_ptr->{'text'} = "";
	1068	}
	1069
[73]	1070	# add_text assumes the text is in (extended) ascii form. For
	1071	# text which has been already converted to the UTF-8 format
	1072	# use add_utf8_text.
[4]	1073	sub add_text {
	1074	my $self = shift (@_);
	1075	my ($section, $text) = @_;
	1076
[73]	1077	# convert the text to UTF-8 encoded unicode characters
	1078	# and add the text
[1870]	1079	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]	1080	}
	1081
	1082
	1083	# add_utf8_text assumes the text to be added has already
	1084	# been converted to the UTF-8 encoding. For ascii text use
	1085	# add_text
	1086	sub add_utf8_text {
	1087	my $self = shift (@_);
	1088	my ($section, $text) = @_;
	1089
[4]	1090	my $section_ptr = $self->_lookup_section($section);
	1091	if (!defined $section_ptr) {
[73]	1092	print STDERR "doc::add_utf8_text couldn't find section " .
[4]	1093	"$section\n";
	1094	return;
	1095	}
	1096
	1097	$section_ptr->{'text'} .= $text;
	1098	}
	1099
[16950]	1100	# returns the Source meta, which is the utf8 filename generated.
	1101	# Added a separate method here for convenience
	1102	sub get_source {
[16924]	1103	my $self = shift (@_);
[16950]	1104	return $self->get_metadata_element ($self->get_top_section(), "Source");
	1105	}
	1106
	1107	# returns the SourceFile meta, which is the url reference to the URL-encoded
	1108	# version of Source (the utf8 filename). Added a separate method here for convenience
	1109	sub get_sourcefile {
	1110	my $self = shift (@_);
	1111	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
	1112	}
	1113
	1114	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
	1115	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
	1116	sub get_assocfile_from_sourcefile {
	1117	my $self = shift (@_);
[16924]	1118
	1119	# get the SourceFile meta, which is a URL to a file on the filesystem
	1120	my $top_section = $self->get_top_section();
	1121	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]	1122
[16924]	1123	# get the actual filename as it exists on the filesystem which this url refers to
[16928]	1124	$source_file = &unicode::url_to_filename($source_file);
[16924]	1125	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
	1126	return $assocfilename;
	1127	}
	1128
[1241]	1129	# methods for dealing with associated files
	1130
	1131	# a file is associated with a document, NOT a section.
	1132	# if section is defined it is noted in the data structure
	1133	# only so that files associated from a particular section
	1134	# may be removed later (using delete_section_assoc_files)
	1135	sub associate_file {
	1136	my $self = shift (@_);
	1137	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
	1138	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
	1139
	1140	# remove all associated files with the same name
	1141	$self->delete_assoc_file ($assoc_filename);
[23413]	1142
	1143	# Too harsh a requirement
	1144	# Definitely get HTML docs, for example, with some missing
	1145	# support files
	1146	# if (!&util::fd_exists($real_filename)) {
	1147	# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
	1148	# exit -1;
	1149	# }
	1150
[23387]	1151	# print STDERR "**** is the following a UTF8 rep of real filename?\n $real_filename\n";
	1152	# print STDERR "****##### so, ensure it is before storing?!?!?\n";
	1153	## my $utf8_filename = Encode::encode("utf8",$filename);
	1154
[1241]	1155	push (@{$self->{'associated_files'}},
	1156	[$real_filename, $assoc_filename, $mime_type, $section]);
	1157	}
	1158
	1159	# returns a list of associated files in the form
	1160	# [[real_filename, assoc_filename, mimetype], ...]
	1161	sub get_assoc_files {
	1162	my $self = shift (@_);
	1163
	1164	return $self->{'associated_files'};
	1165	}
	1166
[20775]	1167	# the following two methods used to keep track of original associated files
	1168	# for incremental building. eg a txt file used by an item file does not end
	1169	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
	1170	# file for incremental build
	1171	sub associate_source_file {
	1172	my $self = shift (@_);
	1173	my ($full_filename) = @_;
[19494]	1174
[20775]	1175	push (@{$self->{'source_assoc_files'}}, $full_filename);
	1176
	1177	}
	1178
	1179	sub get_source_assoc_files {
	1180	my $self = shift (@_);
	1181
	1182	return $self->{'source_assoc_files'};
	1183
	1184
	1185	}
[19494]	1186	sub metadata_file {
	1187	my $self = shift (@_);
	1188	my ($real_filename, $filename) = @_;
	1189
	1190	push (@{$self->{'metadata_files'}},
	1191	[$real_filename, $filename]);
	1192	}
	1193
[21566]	1194	# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]	1195	sub get_meta_files {
	1196	my $self = shift (@_);
	1197
	1198	return $self->{'metadata_files'};
	1199	}
	1200
[1241]	1201	sub delete_section_assoc_files {
	1202	my $self = shift (@_);
	1203	my ($section) = @_;
	1204
	1205	my $i=0;
	1206	while ($i < scalar (@{$self->{'associated_files'}})) {
	1207	if (defined $self->{'associated_files'}->[$i]->[3] &&
	1208	$self->{'associated_files'}->[$i]->[3] eq $section) {
	1209	splice (@{$self->{'associated_files'}}, $i, 1);
	1210	} else {
	1211	$i++;
	1212	}
	1213	}
	1214	}
	1215
	1216	sub delete_assoc_file {
	1217	my $self = shift (@_);
	1218	my ($assoc_filename) = @_;
	1219
	1220	my $i=0;
	1221	while ($i < scalar (@{$self->{'associated_files'}})) {
	1222	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
	1223	splice (@{$self->{'associated_files'}}, $i, 1);
	1224	} else {
	1225	$i++;
	1226	}
	1227	}
	1228	}
	1229
	1230	sub reset_nextsection_ptr {
	1231	my $self = shift (@_);
	1232	my ($section) = @_;
	1233
	1234	my $section_ptr = $self->_lookup_section($section);
	1235	$section_ptr->{'next_subsection'} = 1;
	1236	}
	1237
[4]	1238	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: