Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 26221

Last change on this file since 26221 was 26221, checked in by kjdon, 12 years ago
new OIDtype, filename, will use the file name without any folders or file extension. Must be unique filenames in the collection. BasePlugin add_OID method returns if an id has already been set
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 35.5 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# doc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
[8894]	10	# This program is free software; you can redistr te it and/or modify
[537]	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[1241]	26	# base class to hold documents
[4]	27
	28	package doc;
[3834]	29	eval {require bytes};
[4]	30
[832]	31	BEGIN {
[1241]	32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]	34	}
	35
[15894]	36	use strict;
[1241]	37	use unicode;
	38	use util;
	39	use ghtml;
[8220]	40	use File::stat;
[1241]	41	##use hashdoc;
[13172]	42	use docprint;
[1241]	43
[4]	44	# the document type may be indexed_doc, nonindexed_doc, or
	45	# classification
	46
[18528]	47	our $OIDcount = 0;
[2327]	48
[20412]	49	# rename_method can be 'url', 'none', 'base64'
[4]	50	sub new {
	51	my $class = shift (@_);
[18319]	52	my ($source_filename, $doc_type, $rename_method) = @_;
[1374]	53
[13770]	54
[1241]	55	my $self = bless {'associated_files'=>[],
	56	'subsection_order'=>[],
	57	'next_subsection'=>1,
	58	'subsections'=>{},
	59	'metadata'=>[],
[2327]	60	'text'=>"",
	61	'OIDtype'=>"hash"}, $class;
[4]	62
[10217]	63	# used to set lastmodified here, but this can screw up the HASH ids, so
	64	# the docsave processor now calls set_lastmodified
[13770]	65
[23362]	66	$self->set_source_path($source_filename);
[10217]	67
[7929]	68	if (defined $source_filename) {
[15874]	69	$source_filename = &util::filename_within_collection($source_filename);
[18508]	70	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]	71	$self->set_source_filename ($source_filename, $rename_method);
[7929]	72	}
	73
[1374]	74	$self->set_doc_type ($doc_type) if defined $doc_type;
[485]	75
[4]	76	return $self;
	77	}
[23362]	78
	79
	80	sub set_source_path
	81	{
	82	my $self = shift @_;
	83	my ($source_filename) = @_;
	84
	85	if (defined $source_filename) {
	86	# On Windows the source_filename can be in terse DOS format
	87	# e.g. test~1.txt
	88
	89	$self->{'terse_source_path'} = $source_filename;
	90
	91	if (-e $source_filename) {
	92	# See if we can do better for Windows with a filename
	93	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	94	require Win32;
	95	$self->{'source_path'} = Win32::GetLongPathName($source_filename);
	96	}
	97	else {
	98	# For Unix-based systems, there is no difference between the two
	99	$self->{'source_path'} = $source_filename;
	100	}
	101	}
	102	else {
	103	print STDERR "Warning: In doc::set_source_path(), file\n";
	104	print STDERR " $source_filename\n";
	105	print STDERR " does not exist\n";
	106
	107	# (default) Set it to whatever we were given
	108	$self->{'source_path'} = $source_filename;
	109	}
	110	}
	111	else {
	112	# Previous code for setting source_path allowed for
	113	# it to be undefined, so continue this practice
	114	$self->{'terse_source_path'} = undef;
	115	$self->{'source_path'} = undef;
	116	}
	117	}
	118
	119
	120	sub get_source_path
	121	{
	122	my $self = shift @_;
	123
	124	return $self->{'terse_source_path'};
	125	}
	126
[10217]	127	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
[23939]	128	sub set_oailastmodified {
	129	my $self = shift (@_);
	130
	131	my $source_path = $self->{'terse_source_path'};
	132
	133	if (defined $source_path && (-e $source_path)) {
	134	my $current_time = time;
	135
	136	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
	137	$wday, $yday, $isdst) = localtime($current_time);
	138
	139	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
	140
	141	$self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
	142	$self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
	143	}
	144	}
	145
	146	# no longer used for OAI purposes, since lastmodified is not what we want as the
	147	# Datestamp of a document. This doc metadata may be useful for general purposes.
[10217]	148	sub set_lastmodified {
	149	my $self = shift (@_);
[4]	150
[23362]	151	my $source_path = $self->{'terse_source_path'};
[13770]	152
	153	if (defined $source_path && (-e $source_path)) {
	154
	155	my $file_stat = stat($source_path);
[10217]	156	my $mtime = $file_stat->mtime;
[21862]	157	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
	158	$wday, $yday, $isdst) = localtime($mtime);
	159
	160	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
	161
[23923]	162	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
[21862]	163	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]	164	}
	165	}
	166
[1241]	167	# clone the $self object
	168	sub duplicate {
	169	my $self = shift (@_);
	170
	171	my $newobj = {};
	172
[8716]	173	foreach my $k (keys %$self) {
[1241]	174	$newobj->{$k} = &clone ($self->{$k});
	175	}
	176
	177	bless $newobj, ref($self);
	178	return $newobj;
	179	}
	180
	181	sub clone {
	182	my ($from) = @_;
	183	my $type = ref ($from);
	184
	185	if ($type eq "HASH") {
	186	my $to = {};
[8716]	187	foreach my $key (keys %$from) {
[1241]	188	$to->{$key} = &clone ($from->{$key});
	189	}
	190	return $to;
	191	} elsif ($type eq "ARRAY") {
	192	my $to = [];
[8716]	193	foreach my $v (@$from) {
[1241]	194	push (@$to, &clone ($v));
	195	}
	196	return $to;
	197	} else {
	198	return $from;
	199	}
	200	}
	201
[2327]	202	sub set_OIDtype {
	203	my $self = shift (@_);
[12268]	204	my ($type, $metadata) = @_;
[1241]	205
[26221]	206	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|incremental\|filename\|dirname\|assigned)$/) {
[2327]	207	$self->{'OIDtype'} = $type;
	208	} else {
	209	$self->{'OIDtype'} = "hash";
	210	}
[16792]	211
[12268]	212	if ($type =~ /^assigned$/) {
	213	if (defined $metadata) {
	214	$self->{'OIDmetadata'} = $metadata;
	215	} else {
	216	$self->{'OIDmetadata'} = "dc.Identifier";
	217	}
	218	}
[2327]	219	}
	220
[20412]	221	# rename_method can be 'url', 'none', 'base64'
[1241]	222	sub set_source_filename {
	223	my $self = shift (@_);
[18319]	224	my ($source_filename, $rename_method) = @_;
[1241]	225
[16578]	226	# Since the gsdlsourcefilename element goes into the doc.xml it has
	227	# to be utf8. However, it should also represent the source filename
	228	# (in the import directory) which may not be utf8 at all.
[18319]	229	# For instance, if this meta element (gsdlsourcefilename) will be used
	230	# by other applications that parse doc.xml in order to locate
	231	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
	232	# encode the real filename as this is a binary-to-text encoding meaning
	233	# that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]	234
[18319]	235	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
	236
[16670]	237	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
[23278]	238	$source_filename = $self->encode_filename($source_filename, $rename_method);
	239	# my ($srcfilename,$dirname,$suffix)
	240	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]	241	# print STDERR "-> $srcfilename -> ";
[23278]	242	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
	243	# $source_filename = &util::filename_cat($dirname, $srcfilename);
[16792]	244	# print STDERR "$source_filename\n";
[16670]	245
[16578]	246	$self->set_utf8_metadata_element ($self->get_top_section(),
[1241]	247	"gsdlsourcefilename",
	248	$source_filename);
	249	}
	250
[23278]	251	sub encode_filename {
	252	my $self = shift (@_);
	253	my ($source_filename, $rename_method) = @_;
	254
	255	my ($srcfilename,$dirname,$suffix)
	256	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
	257	# print STDERR "-> $srcfilename -> ";
	258	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
	259	$source_filename = &util::filename_cat($dirname, $srcfilename);
	260
	261	return $source_filename;
	262	}
	263
[7569]	264	sub set_converted_filename {
	265	my $self = shift (@_);
	266	my ($converted_filename) = @_;
	267
[16578]	268	# we know the converted filename is utf8
	269	$self->set_utf8_metadata_element ($self->get_top_section(),
[7569]	270	"gsdlconvertedfilename",
	271	$converted_filename);
	272	}
	273
[1241]	274	# returns the source_filename as it was provided
[19829]	275	sub get_unmodified_source_filename {
	276	my $self = shift (@_);
	277
[23362]	278	return $self->{'terse_source_path'};
[19829]	279	}
	280
	281	# returns the source_filename with whatever rename_method was given
[1241]	282	sub get_source_filename {
	283	my $self = shift (@_);
	284
	285	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	286	}
	287
[19829]	288
	289
[7569]	290	# returns converted filename if available else returns source filename
	291	sub get_filename_for_hashing {
	292	my $self = shift (@_);
	293
	294	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
	295
	296	if (!defined $filename) {
[11097]	297	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]	298	# if NULPlug processed file, then don't give a filename
[11097]	299	if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]	300	$filename = undef;
[16670]	301	} else { # returns the URL encoded source filename!
[10980]	302	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	303	}
[7569]	304	}
[23485]	305
	306	if (!&util::filename_is_absolute($filename)) {
	307	$filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
	308	}
	309
[7569]	310	return $filename;
	311	}
	312
[1241]	313	sub set_doc_type {
	314	my $self = shift (@_);
	315	my ($doc_type) = @_;
	316
	317	$self->set_metadata_element ($self->get_top_section(),
	318	"gsdldoctype",
	319	$doc_type);
	320	}
	321
[10217]	322	# returns the gsdldoctype as it was provided
[1241]	323	# the default of "indexed_doc" is used if no document
	324	# type was provided
	325	sub get_doc_type {
	326	my $self = shift (@_);
	327
	328	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
	329	return $doc_type if (defined $doc_type);
	330	return "indexed_doc";
	331	}
	332
	333
	334	# look up the reference to the a particular section
	335	sub _lookup_section {
	336	my $self = shift (@_);
	337	my ($section) = @_;
	338
	339	my ($num);
	340	my $sectionref = $self;
	341
	342	while (defined $section && $section ne "") {
[12327]	343
[1241]	344	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]	345
	346	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
	347
[1241]	348	$section = "" unless defined $section;
	349
[12327]	350
[1241]	351	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
	352	$sectionref = $sectionref->{'subsections'}->{$num};
	353	} else {
	354	return undef;
	355	}
	356	}
	357
	358	return $sectionref;
	359	}
	360
[2327]	361	# calculate OID by hashing the contents of the document
[1241]	362	sub _calc_OID {
	363	my $self = shift (@_);
	364	my ($filename) = @_;
	365
[22855]	366
[1241]	367	my $osexe = &util::get_os_exe();
	368
	369	my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
	370	$ENV{'GSDLOS'},"hashfile$osexe");
[8504]	371
[1241]	372	my $result = "NULL";
[22855]	373
[1679]	374
[1241]	375	if (-e "$hashfile_exe") {
[1679]	376	# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]	377	# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]	378	$result = `hashfile$osexe \"$filename\"`;
[21862]	379
[1241]	380	($result) = $result =~ /:\s*([0-9a-f]+)/i;
	381	} else {
	382	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
	383	}
	384	return "HASH$result";
	385	}
	386
	387	# methods dealing with OID, not groups of them.
	388
[2327]	389	# if $OID is not provided one is calculated
[1241]	390	sub set_OID {
	391	my $self = shift (@_);
	392	my ($OID) = @_;
[8504]	393
[8797]	394	my $use_hash_oid = 0;
[17057]	395	# if an OID wasn't provided calculate one
[1241]	396	if (!defined $OID) {
	397	$OID = "NULL";
[17025]	398	if ($self->{'OIDtype'} =~ /^hash/) {
[8797]	399	$use_hash_oid = 1;
	400	} elsif ($self->{'OIDtype'} eq "incremental") {
[2327]	401	$OID = "D" . $OIDcount;
	402	$OIDcount ++;
[26221]	403	} elsif ($self->{'OIDtype'} eq "filename") {
	404	my $filename = $self->get_source_filename();
	405	$OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
	406	$OID = &util::tidy_up_oid($OID);
[8716]	407	} elsif ($self->{'OIDtype'} eq "dirname") {
	408	$OID = 'J';
	409	my $filename = $self->get_source_filename();
[8797]	410	if (defined($filename)) { # && -e $filename) {
[8716]	411	$OID = &File::Basename::dirname($filename);
	412	if (defined $OID) {
	413	$OID = 'J'.&File::Basename::basename($OID);
[19617]	414	$OID = &util::tidy_up_oid($OID);
[8716]	415	} else {
[8797]	416	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
	417	$use_hash_oid = 1;
[8716]	418	}
	419	} else {
[8797]	420	print STDERR "Failed to find filename, generating hash id\n";
	421	$use_hash_oid = 1;
[8716]	422	}
[2327]	423
[8797]	424	} elsif ($self->{'OIDtype'} eq "assigned") {
[12268]	425	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]	426	if (defined $identifier && $identifier ne "") {
[17025]	427	$OID = $identifier;
[19617]	428	$OID = &util::tidy_up_oid($OID);
[8797]	429	} else {
	430	# need a hash id
[12268]	431	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]	432	$use_hash_oid = 1;
	433	}
	434
[2327]	435	} else {
[8797]	436	$use_hash_oid = 1;
	437	}
	438
	439	if ($use_hash_oid) {
[23562]	440	my $hash_on_file = 1;
[17025]	441	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
	442	$hash_on_file = 0;
	443	}
	444	if ($hash_on_file) {
	445	# "hash" OID - feed file to hashfile.exe
	446	my $filename = $self->get_filename_for_hashing();
[23562]	447
[17025]	448	# -z: don't want to hash on the file if it is zero size
	449	if (defined($filename) && -e $filename && !-z $filename) {
	450	$OID = $self->_calc_OID ($filename);
	451	} else {
	452	$hash_on_file = 0;
	453	}
	454	}
	455	if (!$hash_on_file) {
	456	my $filename = &util::get_tmp_filename();
[22855]	457	if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]	458	print STDERR "doc::set_OID could not write to $filename\n";
	459	} else {
[13172]	460	my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
	461	print OUTFILE $doc_text;
[2327]	462	close (OUTFILE);
	463	}
	464	$OID = $self->_calc_OID ($filename);
	465	&util::rm ($filename);
[1374]	466	}
[1241]	467	}
	468	}
	469	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	470	}
	471
	472	# this uses hashdoc (embedded c thingy) which is faster but still
	473	# needs a little work to be suffiently stable
	474	sub ___set_OID {
	475	my $self = shift (@_);
	476	my ($OID) = @_;
	477
	478	# if an OID wasn't provided then calculate hash value based on document
	479	if (!defined $OID)
	480	{
[13172]	481	my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]	482	my $hash_len = length($hash_text);
	483
	484	$OID = &hashdoc::buffer($hash_text,$hash_len);
	485	}
	486
	487	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	488	}
	489
	490	# returns the OID for this document
	491	sub get_OID {
	492	my $self = shift (@_);
	493	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
	494	return $OID if (defined $OID);
	495	return "NULL";
	496	}
	497
	498	sub delete_OID {
	499	my $self = shift (@_);
	500
	501	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
	502	}
	503
	504
	505	# methods for manipulating section names
	506
	507	# returns the name of the top-most section (the top
	508	# level of the document
	509	sub get_top_section {
	510	my $self = shift (@_);
	511
	512	return "";
	513	}
	514
	515	# returns a section
	516	sub get_parent_section {
	517	my $self = shift (@_);
	518	my ($section) = @_;
	519
	520	$section =~ s/(^\|\.)\d+$//;
	521
	522	return $section;
	523	}
	524
	525	# returns the first child section (or the end child
	526	# if there isn't any)
	527	sub get_begin_child {
	528	my $self = shift (@_);
	529	my ($section) = @_;
	530
	531	my $section_ptr = $self->_lookup_section($section);
	532	return "" unless defined $section_ptr;
	533
	534	if (defined $section_ptr->{'subsection_order'}->[0]) {
	535	return "$section.$section_ptr->{'subsection_order'}->[0]";
	536	}
	537
	538	return $self->get_end_child ($section);
	539	}
	540
	541	# returns the next child of a parent section
	542	sub get_next_child {
	543	my $self = shift (@_);
	544	my ($section) = @_;
	545
	546	my $parent_section = $self->get_parent_section($section);
	547	my $parent_section_ptr = $self->_lookup_section($parent_section);
	548	return undef unless defined $parent_section_ptr;
	549
	550	my ($section_num) = $section =~ /(\d+)$/;
	551	return undef unless defined $section_num;
	552
	553	my $i = 0;
	554	my $section_order = $parent_section_ptr->{'subsection_order'};
	555	while ($i < scalar(@$section_order)) {
	556	last if $section_order->[$i] eq $section_num;
	557	$i++;
	558	}
	559
	560	$i++; # the next child
	561	if ($i < scalar(@$section_order)) {
	562	return $section_order->[$i] if $parent_section eq "";
	563	return "$parent_section.$section_order->[$i]";
	564	}
	565
	566	# no more sections in this level
	567	return undef;
	568	}
	569
	570	# returns a reference to a list of children
	571	sub get_children {
	572	my $self = shift (@_);
	573	my ($section) = @_;
	574
	575	my $section_ptr = $self->_lookup_section($section);
	576	return [] unless defined $section_ptr;
	577
	578	my @children = @{$section_ptr->{'subsection_order'}};
	579
	580	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
	581	return \@children;
	582	}
	583
	584	# returns the child section one past the last one (which
	585	# is coded as "0")
	586	sub get_end_child {
	587	my $self = shift (@_);
	588	my ($section) = @_;
	589
	590	return $section . ".0" unless $section eq "";
	591	return "0";
	592	}
	593
	594	# returns the next section in book order
	595	sub get_next_section {
	596	my $self = shift (@_);
	597	my ($section) = @_;
	598
	599	return undef unless defined $section;
	600
	601	my $section_ptr = $self->_lookup_section($section);
	602	return undef unless defined $section_ptr;
	603
	604	# first try to find first child
	605	if (defined $section_ptr->{'subsection_order'}->[0]) {
	606	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
	607	return "$section.$section_ptr->{'subsection_order'}->[0]";
	608	}
	609
	610	do {
	611	# try to find sibling
	612	my $next_child = $self->get_next_child ($section);
	613	return $next_child if (defined $next_child);
	614
	615	# move up one level
	616	$section = $self->get_parent_section ($section);
	617	} while $section =~ /\d/;
	618
	619	return undef;
	620	}
	621
	622	sub is_leaf_section {
	623	my $self = shift (@_);
	624	my ($section) = @_;
	625
	626	my $section_ptr = $self->_lookup_section($section);
	627	return 1 unless defined $section_ptr;
	628
	629	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
	630	}
	631
	632	# methods for dealing with sections
	633
	634	# returns the name of the inserted section
	635	sub insert_section {
	636	my $self = shift (@_);
	637	my ($before_section) = @_;
	638
	639	# get the child to insert before and its parent section
	640	my $parent_section = "";
	641	my $before_child = "0";
	642	my @before_section = split (/\./, $before_section);
	643	if (scalar(@before_section) > 0) {
	644	$before_child = pop (@before_section);
	645	$parent_section = join (".", @before_section);
	646	}
	647
	648	my $parent_section_ptr = $self->_lookup_section($parent_section);
	649	if (!defined $parent_section_ptr) {
	650	print STDERR "doc::insert_section couldn't find parent section " .
	651	"$parent_section\n";
	652	return;
	653	}
	654
	655	# get the next section number
	656	my $section_num = $parent_section_ptr->{'next_subsection'}++;
	657
	658	my $i = 0;
	659	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
	660	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
	661	$i++;
	662	}
	663
	664	# insert the section number into the order list
	665	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
	666
	667	# add this section to the parent section
	668	my $section_ptr = {'subsection_order'=>[],
	669	'next_subsection'=>1,
	670	'subsections'=>{},
	671	'metadata'=>[],
	672	'text'=>""};
	673	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
	674
	675	# work out the full section number
	676	my $section = $parent_section;
	677	$section .= "." unless $section eq "";
	678	$section .= $section_num;
	679
	680	return $section;
	681	}
	682
	683	# creates a pre-named section
	684	sub create_named_section {
	685	my $self = shift (@_);
	686	my ($mastersection) = @_;
	687
	688	my ($num);
	689	my $section = $mastersection;
	690	my $sectionref = $self;
	691
	692	while ($section ne "") {
	693	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
	694	$num =~ s/^0+(\d)/$1/; # remove leading 0s
	695	$section = "" unless defined $section;
	696
	697	if (defined $num) {
	698	if (!defined $sectionref->{'subsections'}->{$num}) {
	699	push (@{$sectionref->{'subsection_order'}}, $num);
	700	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
	701	'next_subsection'=>1,
	702	'subsections'=>{},
	703	'metadata'=>[],
	704	'text'=>""};
	705	if ($num >= $sectionref->{'next_subsection'}) {
	706	$sectionref->{'next_subsection'} = $num + 1;
	707	}
	708	}
	709	$sectionref = $sectionref->{'subsections'}->{$num};
	710
	711	} else {
	712	print STDERR "doc::create_named_section couldn't create section ";
	713	print STDERR "$mastersection\n";
	714	last;
	715	}
	716	}
	717	}
	718
	719	# returns a reference to a list of subsections
	720	sub list_subsections {
	721	my $self = shift (@_);
	722	my ($section) = @_;
	723
	724	my $section_ptr = $self->_lookup_section ($section);
	725	if (!defined $section_ptr) {
	726	print STDERR "doc::list_subsections couldn't find section $section\n";
	727	return [];
	728	}
	729
	730	return [@{$section_ptr->{'subsection_order'}}];
	731	}
	732
	733	sub delete_section {
	734	my $self = shift (@_);
	735	my ($section) = @_;
	736
	737	# my $section_ptr = {'subsection_order'=>[],
	738	# 'next_subsection'=>1,
	739	# 'subsections'=>{},
	740	# 'metadata'=>[],
	741	# 'text'=>""};
	742
	743	# if this is the top section reset everything
	744	if ($section eq "") {
	745	$self->{'subsection_order'} = [];
	746	$self->{'subsections'} = {};
	747	$self->{'metadata'} = [];
	748	$self->{'text'} = "";
	749	return;
	750	}
	751
	752	# find the parent of the section to delete
	753	my $parent_section = "";
	754	my $child = "0";
	755	my @section = split (/\./, $section);
	756	if (scalar(@section) > 0) {
	757	$child = pop (@section);
	758	$parent_section = join (".", @section);
	759	}
	760
	761	my $parent_section_ptr = $self->_lookup_section($parent_section);
	762	if (!defined $parent_section_ptr) {
	763	print STDERR "doc::delete_section couldn't find parent section " .
	764	"$parent_section\n";
	765	return;
	766	}
	767
	768	# remove this section from the subsection_order list
	769	my $i = 0;
	770	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
	771	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
	772	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
	773	last;
	774	}
	775	$i++;
	776	}
	777
	778	# remove this section from the subsection hash
	779	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
	780	undef $parent_section_ptr->{'subsections'}->{$child};
	781	}
	782	}
	783
	784	#--
[4]	785	# methods for dealing with metadata
	786
	787	# set_metadata_element and get_metadata_element are for metadata
	788	# which should only have one value. add_meta_data and get_metadata
	789	# are for metadata which can have more than one value.
	790
[1241]	791	# returns the first metadata value which matches field
[6111]	792
	793	# This version of get metadata element works much like the one above,
	794	# except it allows for the namespace portion of a metadata element to
	795	# be ignored, thus if you are searching for dc.Title, the first piece
	796	# of matching metadata ending with the name Title (once any namespace
	797	# is removed) would be returned.
	798	# 28-11-2003 John Thompson
[1241]	799	sub get_metadata_element {
	800	my $self = shift (@_);
[6111]	801	my ($section, $field, $ignore_namespace) = @_;
[1241]	802	my ($data);
	803
[6111]	804	$ignore_namespace = 0 unless defined $ignore_namespace;
	805
[1241]	806	my $section_ptr = $self->_lookup_section($section);
	807	if (!defined $section_ptr) {
[8716]	808	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]	809	return;
	810	}
	811
[24404]	812	# Remove any namespace if we are being told to ignore them
[6111]	813	if($ignore_namespace) {
[24404]	814	$field =~ s/^.\.//; #$field =~ s/^\w\.//;
[6111]	815	}
	816
[1241]	817	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	818
	819	my $data_name = $data->[0];
[14966]	820
[24404]	821	# Remove any namespace if we are being told to ignore them
[6111]	822	if($ignore_namespace) {
[24404]	823	$data_name =~ s/^.\.//; #$data_name =~ s/^\w\.//;
[6111]	824	}
[24404]	825	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
	826	$data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
	827
[6111]	828	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]	829	}
	830
	831	return undef; # was not found
	832	}
	833
	834	# returns a list of the form [value1, value2, ...]
	835	sub get_metadata {
	836	my $self = shift (@_);
[6111]	837	my ($section, $field, $ignore_namespace) = @_;
[1241]	838	my ($data);
	839
[6111]	840	$ignore_namespace = 0 unless defined $ignore_namespace;
	841
[1241]	842	my $section_ptr = $self->_lookup_section($section);
	843	if (!defined $section_ptr) {
[8716]	844	print STDERR "doc::get_metadata couldn't find section ",
	845	$section, "\n";
[1241]	846	return;
	847	}
	848
[24404]	849	# Remove any namespace if we are being told to ignore them
[6111]	850	if($ignore_namespace) {
[24404]	851	$field =~ s/^.*\.//;
[6111]	852	}
	853
[1241]	854	my @metadata = ();
	855	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	856
	857	my $data_name = $data->[0];
[24404]	858
	859	# Remove any namespace if we are being told to ignore them
[6111]	860	if($ignore_namespace) {
[24404]	861	$data_name =~ s/^.*\.//;
	862	}
	863	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
	864	$data_name =~ s/^ex\.([^.]+)$/$1/;
	865
[6111]	866	push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]	867	}
[9241]	868
[1241]	869	return \@metadata;
	870	}
	871
[23827]	872	sub get_metadata_hashmap {
	873	my $self = shift (@_);
	874	my ($section, $opt_namespace) = @_;
	875
	876	my $section_ptr = $self->_lookup_section($section);
	877	if (!defined $section_ptr) {
	878	print STDERR "doc::get_metadata couldn't find section ",
	879	$section, "\n";
	880	return;
	881	}
	882
	883	my $metadata_hashmap = {};
	884	foreach my $data (@{$section_ptr->{'metadata'}}) {
	885	my $metaname = $data->[0];
	886
	887	if ((!defined $opt_namespace) \|\| ($metaname =~ m/^$opt_namespace\./)) {
	888	if (!defined $metadata_hashmap->{$metaname}) {
	889	$metadata_hashmap->{$metaname} = [];
	890	}
	891	my $metaval_list = $metadata_hashmap->{$metaname};
	892	push(@$metaval_list, $data->[1]);
	893	}
	894	}
	895
	896	return $metadata_hashmap;
	897	}
	898
[1241]	899	# returns a list of the form [[field,value],[field,value],...]
	900	sub get_all_metadata {
	901	my $self = shift (@_);
	902	my ($section) = @_;
	903
	904	my $section_ptr = $self->_lookup_section($section);
	905	if (!defined $section_ptr) {
[8716]	906	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]	907	return;
	908	}
	909
	910	return $section_ptr->{'metadata'};
	911	}
	912
	913	# $value is optional
	914	sub delete_metadata {
	915	my $self = shift (@_);
	916	my ($section, $field, $value) = @_;
	917
	918	my $section_ptr = $self->_lookup_section($section);
	919	if (!defined $section_ptr) {
[23562]	920	print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
[1241]	921	return;
	922	}
	923
	924	my $i = 0;
	925	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
	926	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
	927	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
	928	splice (@{$section_ptr->{'metadata'}}, $i, 1);
	929	} else {
	930	$i++;
	931	}
	932	}
	933	}
	934
	935	sub delete_all_metadata {
	936	my $self = shift (@_);
	937	my ($section) = @_;
	938
	939	my $section_ptr = $self->_lookup_section($section);
	940	if (!defined $section_ptr) {
[8716]	941	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]	942	return;
	943	}
	944
	945	$section_ptr->{'metadata'} = [];
	946	}
	947
[4]	948	sub set_metadata_element {
	949	my $self = shift (@_);
	950	my ($section, $field, $value) = @_;
	951
[97]	952	$self->set_utf8_metadata_element ($section, $field,
[1870]	953	&unicode::ascii2utf8(\$value));
[73]	954	}
	955
	956	# set_utf8_metadata_element assumes the text has already been
	957	# converted to the UTF-8 encoding.
	958	sub set_utf8_metadata_element {
	959	my $self = shift (@_);
	960	my ($section, $field, $value) = @_;
[25557]	961
[4]	962	$self->delete_metadata ($section, $field);
[73]	963	$self->add_utf8_metadata ($section, $field, $value);
[4]	964	}
	965
	966
[73]	967	# add_metadata assumes the text is in (extended) ascii form. For
[8220]	968	# text which has already been converted to the UTF-8 format use
[73]	969	# add_utf8_metadata.
[4]	970	sub add_metadata {
	971	my $self = shift (@_);
	972	my ($section, $field, $value) = @_;
[25557]	973
[97]	974	$self->add_utf8_metadata ($section, $field,
[1870]	975	&unicode::ascii2utf8(\$value));
[73]	976	}
	977
	978	sub add_utf8_metadata {
	979	my $self = shift (@_);
	980	my ($section, $field, $value) = @_;
[25557]	981
	982	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
	983	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
	984	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
[73]	985
[4]	986	my $section_ptr = $self->_lookup_section($section);
	987	if (!defined $section_ptr) {
[8716]	988	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]	989	return;
	990	}
[1732]	991	if (!defined $value) {
	992	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
	993	return;
	994	}
	995	if (!defined $field) {
	996	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
	997	return;
	998	}
[4]	999
[8894]	1000	#print STDERR "###$field=$value\n";
[22950]	1001
	1002	# For now, supress this check. Given that text data read in is now
	1003	# Unicode aware, then the following block of code can (ironically enough)
	1004	# cause our unicode compliant string to be re-encoded (leading to
	1005	# a double-encoded UTF-8 string, which we definitely don't want!).
	1006
	1007
[7798]	1008	# double check that the value is utf-8
[25557]	1009	# if (!&unicode::check_is_utf8($value)) {
	1010	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
	1011	# &unicode::ensure_utf8(\$value);
	1012	# print STDERR " Tried converting to utf8: $value\n";
	1013	# }
[7798]	1014
[25557]	1015	#If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
	1016	if($field eq "Latitude" \|\| $field eq "Longitude")
	1017	{
	1018	my $direction;
	1019	if($value =~ m/^-/)
	1020	{
	1021	$direction = ($field eq "Latitude") ? "S" : "W";
	1022	}
	1023	else
	1024	{
	1025	$direction = ($field eq "Latitude") ? "N" : "E";
	1026	}
	1027
	1028	my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
	1029	if(defined $beforeDec && defined $afterDec)
	1030	{
	1031	my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
	1032	push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
	1033
	1034	for(my $i = 2; $i <= 4; $i++)
	1035	{
	1036	if(length($afterDec) >= $i)
	1037	{
	1038	push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
	1039	}
	1040	}
	1041
	1042	#Only add the metadata if it has not already been added
	1043	my $metaMap = $self->get_metadata_hashmap($section);
	1044	}
	1045	}
	1046
[4]	1047	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
	1048	}
	1049
	1050
	1051	# methods for dealing with text
	1052
[1241]	1053	# returns the text for a section
	1054	sub get_text {
	1055	my $self = shift (@_);
	1056	my ($section) = @_;
	1057
	1058	my $section_ptr = $self->_lookup_section($section);
	1059	if (!defined $section_ptr) {
	1060	print STDERR "doc::get_text couldn't find section " .
	1061	"$section\n";
	1062	return "";
	1063	}
	1064
	1065	return $section_ptr->{'text'};
	1066	}
	1067
	1068	# returns the (utf-8 encoded) length of the text for a section
	1069	sub get_text_length {
	1070	my $self = shift (@_);
	1071	my ($section) = @_;
	1072
	1073	my $section_ptr = $self->_lookup_section($section);
	1074	if (!defined $section_ptr) {
	1075	print STDERR "doc::get_text_length couldn't find section " .
	1076	"$section\n";
	1077	return 0;
	1078	}
	1079
	1080	return length ($section_ptr->{'text'});
	1081	}
	1082
[23131]	1083	# returns the total length for all the sections
	1084	sub get_total_text_length {
	1085	my $self = shift (@_);
	1086
	1087	my $section = $self->get_top_section();
	1088	my $length = 0;
	1089	while (defined $section) {
	1090	$length += $self->get_text_length($section);
	1091	$section = $self->get_next_section($section);
	1092	}
	1093	return $length;
	1094	}
	1095
[1241]	1096	sub delete_text {
	1097	my $self = shift (@_);
	1098	my ($section) = @_;
	1099
	1100	my $section_ptr = $self->_lookup_section($section);
	1101	if (!defined $section_ptr) {
	1102	print STDERR "doc::delete_text couldn't find section " .
	1103	"$section\n";
	1104	return;
	1105	}
	1106
	1107	$section_ptr->{'text'} = "";
	1108	}
	1109
[73]	1110	# add_text assumes the text is in (extended) ascii form. For
	1111	# text which has been already converted to the UTF-8 format
	1112	# use add_utf8_text.
[4]	1113	sub add_text {
	1114	my $self = shift (@_);
	1115	my ($section, $text) = @_;
	1116
[73]	1117	# convert the text to UTF-8 encoded unicode characters
	1118	# and add the text
[1870]	1119	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]	1120	}
	1121
	1122
	1123	# add_utf8_text assumes the text to be added has already
	1124	# been converted to the UTF-8 encoding. For ascii text use
	1125	# add_text
	1126	sub add_utf8_text {
	1127	my $self = shift (@_);
	1128	my ($section, $text) = @_;
	1129
[4]	1130	my $section_ptr = $self->_lookup_section($section);
	1131	if (!defined $section_ptr) {
[73]	1132	print STDERR "doc::add_utf8_text couldn't find section " .
[4]	1133	"$section\n";
	1134	return;
	1135	}
	1136
	1137	$section_ptr->{'text'} .= $text;
	1138	}
	1139
[16950]	1140	# returns the Source meta, which is the utf8 filename generated.
	1141	# Added a separate method here for convenience
	1142	sub get_source {
[16924]	1143	my $self = shift (@_);
[16950]	1144	return $self->get_metadata_element ($self->get_top_section(), "Source");
	1145	}
	1146
	1147	# returns the SourceFile meta, which is the url reference to the URL-encoded
	1148	# version of Source (the utf8 filename). Added a separate method here for convenience
	1149	sub get_sourcefile {
	1150	my $self = shift (@_);
	1151	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
	1152	}
	1153
	1154	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
	1155	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
	1156	sub get_assocfile_from_sourcefile {
	1157	my $self = shift (@_);
[16924]	1158
	1159	# get the SourceFile meta, which is a URL to a file on the filesystem
	1160	my $top_section = $self->get_top_section();
	1161	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]	1162
[16924]	1163	# get the actual filename as it exists on the filesystem which this url refers to
[16928]	1164	$source_file = &unicode::url_to_filename($source_file);
[16924]	1165	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
	1166	return $assocfilename;
	1167	}
	1168
[1241]	1169	# methods for dealing with associated files
	1170
	1171	# a file is associated with a document, NOT a section.
	1172	# if section is defined it is noted in the data structure
	1173	# only so that files associated from a particular section
	1174	# may be removed later (using delete_section_assoc_files)
	1175	sub associate_file {
	1176	my $self = shift (@_);
	1177	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
	1178	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
	1179
	1180	# remove all associated files with the same name
	1181	$self->delete_assoc_file ($assoc_filename);
[23413]	1182
	1183	# Too harsh a requirement
	1184	# Definitely get HTML docs, for example, with some missing
	1185	# support files
	1186	# if (!&util::fd_exists($real_filename)) {
	1187	# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
	1188	# exit -1;
	1189	# }
	1190
[23387]	1191	# print STDERR "**** is the following a UTF8 rep of real filename?\n $real_filename\n";
	1192	# print STDERR "****##### so, ensure it is before storing?!?!?\n";
	1193	## my $utf8_filename = Encode::encode("utf8",$filename);
	1194
[1241]	1195	push (@{$self->{'associated_files'}},
	1196	[$real_filename, $assoc_filename, $mime_type, $section]);
	1197	}
	1198
	1199	# returns a list of associated files in the form
	1200	# [[real_filename, assoc_filename, mimetype], ...]
	1201	sub get_assoc_files {
	1202	my $self = shift (@_);
	1203
	1204	return $self->{'associated_files'};
	1205	}
	1206
[20775]	1207	# the following two methods used to keep track of original associated files
	1208	# for incremental building. eg a txt file used by an item file does not end
	1209	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
	1210	# file for incremental build
	1211	sub associate_source_file {
	1212	my $self = shift (@_);
	1213	my ($full_filename) = @_;
[19494]	1214
[20775]	1215	push (@{$self->{'source_assoc_files'}}, $full_filename);
	1216
	1217	}
	1218
	1219	sub get_source_assoc_files {
	1220	my $self = shift (@_);
	1221
	1222	return $self->{'source_assoc_files'};
	1223
	1224
	1225	}
[19494]	1226	sub metadata_file {
	1227	my $self = shift (@_);
	1228	my ($real_filename, $filename) = @_;
	1229
	1230	push (@{$self->{'metadata_files'}},
	1231	[$real_filename, $filename]);
	1232	}
	1233
[21566]	1234	# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]	1235	sub get_meta_files {
	1236	my $self = shift (@_);
	1237
	1238	return $self->{'metadata_files'};
	1239	}
	1240
[1241]	1241	sub delete_section_assoc_files {
	1242	my $self = shift (@_);
	1243	my ($section) = @_;
	1244
	1245	my $i=0;
	1246	while ($i < scalar (@{$self->{'associated_files'}})) {
	1247	if (defined $self->{'associated_files'}->[$i]->[3] &&
	1248	$self->{'associated_files'}->[$i]->[3] eq $section) {
	1249	splice (@{$self->{'associated_files'}}, $i, 1);
	1250	} else {
	1251	$i++;
	1252	}
	1253	}
	1254	}
	1255
	1256	sub delete_assoc_file {
	1257	my $self = shift (@_);
	1258	my ($assoc_filename) = @_;
	1259
	1260	my $i=0;
	1261	while ($i < scalar (@{$self->{'associated_files'}})) {
	1262	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
	1263	splice (@{$self->{'associated_files'}}, $i, 1);
	1264	} else {
	1265	$i++;
	1266	}
	1267	}
	1268	}
	1269
	1270	sub reset_nextsection_ptr {
	1271	my $self = shift (@_);
	1272	my ($section) = @_;
	1273
	1274	my $section_ptr = $self->_lookup_section($section);
	1275	$section_ptr->{'next_subsection'} = 1;
	1276	}
	1277
[4]	1278	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: