Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago
Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 38.2 KB

Line
1	###########################################################################
2	#
3	# doc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistr te it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# base class to hold documents
27
28	package doc;
29	eval {require bytes};
30
31	BEGIN {
32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34	}
35
36	use strict;
37	use unicode;
38	use util;
39	use FileUtils;
40	use ghtml;
41	use File::stat;
42	##use hashdoc;
43	use docprint;
44
45	# the document type may be indexed_doc, nonindexed_doc, or
46	# classification
47
48	our $OIDcount = 0;
49
50	# rename_method can be 'url', 'none', 'base64'
51	sub new {
52	my $class = shift (@_);
53	my ($source_filename, $doc_type, $rename_method) = @_;
54
55
56	my $self = bless {'associated_files'=>[],
57	'subsection_order'=>[],
58	'next_subsection'=>1,
59	'subsections'=>{},
60	'metadata'=>[],
61	'text'=>"",
62	'OIDtype'=>"hash"}, $class;
63
64	# used to set lastmodified here, but this can screw up the HASH ids, so
65	# the docsave processor now calls set_lastmodified
66
67	$self->set_source_path($source_filename);
68
69	if (defined $source_filename) {
70	$source_filename = &util::filename_within_collection($source_filename);
71	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
72	$self->set_source_filename ($source_filename, $rename_method);
73	}
74
75	$self->set_doc_type ($doc_type) if defined $doc_type;
76
77	return $self;
78	}
79
80
81	sub set_source_path
82	{
83	my $self = shift @_;
84	my ($source_filename) = @_;
85
86	if (defined $source_filename) {
87	# On Windows the source_filename can be in terse DOS format
88	# e.g. test~1.txt
89
90	$self->{'terse_source_path'} = $source_filename;
91
92	# Use the FileUtil library methods as they are aware of more special
93	# cases such as HDFS [jmt12]
94	##if (&util::file_exists($source_filename)) {
95	if (-e $source_filename) {
96	# See if we can do better for Windows with a filename
97	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
98	require Win32;
99	$self->{'source_path'} = Win32::GetLongPathName($source_filename);
100	}
101	else {
102	# For Unix-based systems, there is no difference between the two
103	$self->{'source_path'} = $source_filename;
104	}
105	}
106	else {
107	print STDERR "Warning: In doc::set_source_path(), file\n";
108	print STDERR " $source_filename\n";
109	print STDERR " does not exist\n";
110
111	# (default) Set it to whatever we were given
112	$self->{'source_path'} = $source_filename;
113	}
114	}
115	else {
116	# Previous code for setting source_path allowed for
117	# it to be undefined, so continue this practice
118	$self->{'terse_source_path'} = undef;
119	$self->{'source_path'} = undef;
120	}
121	}
122
123
124	sub get_source_path
125	{
126	my $self = shift @_;
127
128	return $self->{'terse_source_path'};
129	}
130
131	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
132	sub set_oailastmodified {
133	my $self = shift (@_);
134
135	my $source_path = $self->{'terse_source_path'};
136
137	if (defined $source_path && (-e $source_path)) {
138	my $current_time = time;
139
140	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
141	$wday, $yday, $isdst) = localtime($current_time);
142
143	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
144
145	$self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
146	$self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
147	}
148	}
149
150	# no longer used for OAI purposes, since lastmodified is not what we want as the
151	# Datestamp of a document. This doc metadata may be useful for general purposes.
152	sub set_lastmodified {
153	my $self = shift (@_);
154
155	my $source_path = $self->{'terse_source_path'};
156
157	if (defined $source_path && (-e $source_path)) {
158
159	my $file_stat = stat($source_path);
160	my $mtime = $file_stat->mtime;
161	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
162	$wday, $yday, $isdst) = localtime($mtime);
163
164	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
165
166	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
167	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
168	}
169	}
170
171	# clone the $self object
172	sub duplicate {
173	my $self = shift (@_);
174
175	my $newobj = {};
176
177	foreach my $k (keys %$self) {
178	$newobj->{$k} = &clone ($self->{$k});
179	}
180
181	bless $newobj, ref($self);
182	return $newobj;
183	}
184
185	sub clone {
186	my ($from) = @_;
187	my $type = ref ($from);
188
189	if ($type eq "HASH") {
190	my $to = {};
191	foreach my $key (keys %$from) {
192	$to->{$key} = &clone ($from->{$key});
193	}
194	return $to;
195	} elsif ($type eq "ARRAY") {
196	my $to = [];
197	foreach my $v (@$from) {
198	push (@$to, &clone ($v));
199	}
200	return $to;
201	} else {
202	return $from;
203	}
204	}
205
206	sub set_OIDtype {
207	my $self = shift (@_);
208	my ($type, $metadata) = @_;
209
210	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|hash_on_full_filename\|incremental\|filename\|dirname\|full_filename\|assigned)$/) {
211	$self->{'OIDtype'} = $type;
212	} else {
213	$self->{'OIDtype'} = "hash";
214	}
215
216	if ($type =~ /^assigned$/) {
217	if (defined $metadata) {
218	$self->{'OIDmetadata'} = $metadata;
219	} else {
220	$self->{'OIDmetadata'} = "dc.Identifier";
221	}
222	}
223	}
224
225	# rename_method can be 'url', 'none', 'base64'
226	sub set_source_filename {
227	my $self = shift (@_);
228	my ($source_filename, $rename_method) = @_;
229
230	# Since the gsdlsourcefilename element goes into the doc.xml it has
231	# to be utf8. However, it should also represent the source filename
232	# (in the import directory) which may not be utf8 at all.
233	# For instance, if this meta element (gsdlsourcefilename) will be used
234	# by other applications that parse doc.xml in order to locate
235	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
236	# encode the real filename as this is a binary-to-text encoding meaning
237	# that the resulting string is ASCII (utf8). Decoding will give the original.
238
239	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
240
241	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
242	$source_filename = $self->encode_filename($source_filename, $rename_method);
243	# my ($srcfilename,$dirname,$suffix)
244	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
245	# print STDERR "-> $srcfilename -> ";
246	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
247	# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
248	# print STDERR "$source_filename\n";
249
250	$self->set_utf8_metadata_element ($self->get_top_section(),
251	"gsdlsourcefilename",
252	$source_filename);
253	}
254
255	sub encode_filename {
256	my $self = shift (@_);
257	my ($source_filename, $rename_method) = @_;
258
259	my ($srcfilename,$dirname,$suffix)
260	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261	# print STDERR "-> $srcfilename -> ";
262	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
263	$source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
264
265	return $source_filename;
266	}
267
268	sub set_converted_filename {
269	my $self = shift (@_);
270	my ($converted_filename) = @_;
271
272	# we know the converted filename is utf8
273	$self->set_utf8_metadata_element ($self->get_top_section(),
274	"gsdlconvertedfilename",
275	$converted_filename);
276	}
277
278	# returns the source_filename as it was provided
279	sub get_unmodified_source_filename {
280	my $self = shift (@_);
281
282	return $self->{'terse_source_path'};
283	}
284
285	# returns the source_filename with whatever rename_method was given
286	sub get_source_filename {
287	my $self = shift (@_);
288
289	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
290	}
291
292
293
294	# returns converted filename if available else returns source filename
295	sub get_filename_for_hashing {
296	my $self = shift (@_);
297
298	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
299
300	if (!defined $filename) {
301	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
302	# if NULPlug processed file, then don't give a filename
303	if (defined $plugin_name && $plugin_name eq "NULPlug") {
304	$filename = undef;
305	} else { # returns the URL encoded source filename!
306	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
307	}
308	}
309
310	if (!&FileUtils::isFilenameAbsolute($filename)) {
311	$filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
312	}
313
314	return $filename;
315	}
316
317	sub set_doc_type {
318	my $self = shift (@_);
319	my ($doc_type) = @_;
320
321	$self->set_metadata_element ($self->get_top_section(),
322	"gsdldoctype",
323	$doc_type);
324	}
325
326	# returns the gsdldoctype as it was provided
327	# the default of "indexed_doc" is used if no document
328	# type was provided
329	sub get_doc_type {
330	my $self = shift (@_);
331
332	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
333	return $doc_type if (defined $doc_type);
334	return "indexed_doc";
335	}
336
337
338	# look up the reference to the a particular section
339	sub _lookup_section {
340	my $self = shift (@_);
341	my ($section) = @_;
342
343	my ($num);
344	my $sectionref = $self;
345
346	while (defined $section && $section ne "") {
347
348	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
349
350	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
351
352	$section = "" unless defined $section;
353
354
355	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
356	$sectionref = $sectionref->{'subsections'}->{$num};
357	} else {
358	return undef;
359	}
360	}
361
362	return $sectionref;
363	}
364
365	# calculate OID by hashing the contents of the document
366	sub _calc_OID {
367	my $self = shift (@_);
368	my ($filename) = @_;
369
370
371	my $osexe = &util::get_os_exe();
372
373	my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
374	$ENV{'GSDLOS'},"hashfile$osexe");
375
376	my $result = "NULL";
377
378
379	if (-e "$hashfile_exe") {
380	# $result = `\"$hashfile_exe\" \"$filename\"`;
381	# $result = `hashfile$osexe \"$filename\" 2>&1`;
382	$result = `hashfile$osexe \"$filename\"`;
383
384	($result) = $result =~ /:\s*([0-9a-f]+)/i;
385	} else {
386	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
387	}
388	return "HASH$result";
389	}
390
391	# methods dealing with OID, not groups of them.
392
393	# if $OID is not provided one is calculated
394	sub set_OID {
395	my $self = shift (@_);
396	my ($OID) = @_;
397
398	my $use_hash_oid = 0;
399	# if an OID wasn't provided calculate one
400	if (!defined $OID) {
401	$OID = "NULL";
402	if ($self->{'OIDtype'} =~ /^hash/) {
403	$use_hash_oid = 1;
404	} elsif ($self->{'OIDtype'} eq "incremental") {
405	$OID = "D" . $OIDcount;
406	$OIDcount ++;
407	} elsif ($self->{'OIDtype'} eq "filename") {
408	my $filename = $self->get_source_filename();
409	$OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
410	$OID = &util::tidy_up_oid($OID);
411	} elsif ($self->{'OIDtype'} eq "full_filename") {
412	my $source_filename = $self->get_source_filename();
413	my $dirsep = &util::get_os_dirsep();
414
415	$source_filename =~ s/^import$dirsep//;
416	$source_filename =~ s/$dirsep/-/g;
417	$source_filename =~ s/\./_/g;
418
419	$OID = $source_filename;
420	$OID = &util::tidy_up_oid($OID);
421	} elsif ($self->{'OIDtype'} eq "dirname") {
422	$OID = 'J';
423	my $filename = $self->get_source_filename();
424	if (defined($filename)) { # && -e $filename) {
425	$OID = &File::Basename::dirname($filename);
426	if (defined $OID) {
427	$OID = 'J'.&File::Basename::basename($OID);
428	$OID = &util::tidy_up_oid($OID);
429	} else {
430	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
431	$use_hash_oid = 1;
432	}
433	} else {
434	print STDERR "Failed to find filename, generating hash id\n";
435	$use_hash_oid = 1;
436	}
437
438	} elsif ($self->{'OIDtype'} eq "assigned") {
439	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
440	if (defined $identifier && $identifier ne "") {
441	$OID = $identifier;
442	$OID = &util::tidy_up_oid($OID);
443	} else {
444	# need a hash id
445	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
446	$use_hash_oid = 1;
447	}
448
449	} else {
450	$use_hash_oid = 1;
451	}
452
453	if ($use_hash_oid) {
454	my $hash_on_file = 1;
455	my $hash_on_ga_xml = 0;
456
457	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
458	$hash_on_file = 0;
459	$hash_on_ga_xml = 1;
460	}
461
462	if ($self->{'OIDtype'} eq "hash_on_full_filename") {
463	$hash_on_file = 0;
464	$hash_on_ga_xml = 0;
465
466	my $source_filename = $self->get_source_filename();
467	my $dirsep = &util::get_os_dirsep();
468
469	$source_filename =~ s/^import$dirsep//;
470	$source_filename =~ s/$dirsep/-/g;
471	$source_filename =~ s/\./_/g;
472
473	# If the filename is very short then (handled naively)
474	# this can cause conjestion in the hash-values
475	# computed, leading documents sharing the same leading
476	# Hex values in the computed has.
477	#
478	# The solution taken here is to replace the name of
479	# the file name a sufficient number of times (up to
480	# the character limit defined in 'rep_limit' and
481	# make that the content that is hashed on
482
483	# *** Think twice before changing the following value
484	# as it will break backward compatability of computed
485	# document HASH values
486
487	my $rep_limit = 256;
488	my $hash_content = undef;
489
490	if (length($source_filename)<$rep_limit) {
491	my $rep_string = "$source_filename\|";
492	my $rs_len = length($rep_string);
493
494	my $clone_times = int(($rep_limit-1)/$rs_len) +1;
495
496	$hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
497	}
498	else {
499	$hash_content = $source_filename;
500	}
501
502	my $filename = &util::get_tmp_filename();
503	if (!open (OUTFILE, ">:utf8", $filename)) {
504	print STDERR "doc::set_OID could not write to $filename\n";
505	} else {
506	print OUTFILE $hash_content;
507	close (OUTFILE);
508	}
509	$OID = $self->_calc_OID ($filename);
510
511	&util::rm ($filename);
512	}
513
514	if ($hash_on_file) {
515	# "hash" OID - feed file to hashfile.exe
516	my $filename = $self->get_filename_for_hashing();
517
518	# -z: don't want to hash on the file if it is zero size
519	if (defined($filename) && -e $filename && !-z $filename) {
520	$OID = $self->_calc_OID ($filename);
521	} else {
522	$hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
523	}
524	}
525
526	if ($hash_on_ga_xml) {
527	# In addition being asked to explicity calculate the has based on the GA file,
528	# can also end up coming into this block is doing 'hash_on_file' but the file
529	# itself is of zero bytes (as could be the case with 'doc.nul' file
530
531	my $filename = &util::get_tmp_filename();
532	if (!open (OUTFILE, ">:utf8", $filename)) {
533	print STDERR "doc::set_OID could not write to $filename\n";
534	} else {
535	my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
536	print OUTFILE $doc_text;
537	close (OUTFILE);
538	}
539	$OID = $self->_calc_OID ($filename);
540	&util::rm ($filename);
541	}
542	}
543	}
544	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
545	}
546
547	# this uses hashdoc (embedded c thingy) which is faster but still
548	# needs a little work to be suffiently stable
549	sub ___set_OID {
550	my $self = shift (@_);
551	my ($OID) = @_;
552
553	# if an OID wasn't provided then calculate hash value based on document
554	if (!defined $OID)
555	{
556	my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
557	my $hash_len = length($hash_text);
558
559	$OID = &hashdoc::buffer($hash_text,$hash_len);
560	}
561
562	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
563	}
564
565	# returns the OID for this document
566	sub get_OID {
567	my $self = shift (@_);
568	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
569	return $OID if (defined $OID);
570	return "NULL";
571	}
572
573	sub delete_OID {
574	my $self = shift (@_);
575
576	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
577	}
578
579
580	# methods for manipulating section names
581
582	# returns the name of the top-most section (the top
583	# level of the document
584	sub get_top_section {
585	my $self = shift (@_);
586
587	return "";
588	}
589
590	# returns a section
591	sub get_parent_section {
592	my $self = shift (@_);
593	my ($section) = @_;
594
595	$section =~ s/(^\|\.)\d+$//;
596
597	return $section;
598	}
599
600	# returns the first child section (or the end child
601	# if there isn't any)
602	sub get_begin_child {
603	my $self = shift (@_);
604	my ($section) = @_;
605
606	my $section_ptr = $self->_lookup_section($section);
607	return "" unless defined $section_ptr;
608
609	if (defined $section_ptr->{'subsection_order'}->[0]) {
610	return "$section.$section_ptr->{'subsection_order'}->[0]";
611	}
612
613	return $self->get_end_child ($section);
614	}
615
616	# returns the next child of a parent section
617	sub get_next_child {
618	my $self = shift (@_);
619	my ($section) = @_;
620
621	my $parent_section = $self->get_parent_section($section);
622	my $parent_section_ptr = $self->_lookup_section($parent_section);
623	return undef unless defined $parent_section_ptr;
624
625	my ($section_num) = $section =~ /(\d+)$/;
626	return undef unless defined $section_num;
627
628	my $i = 0;
629	my $section_order = $parent_section_ptr->{'subsection_order'};
630	while ($i < scalar(@$section_order)) {
631	last if $section_order->[$i] eq $section_num;
632	$i++;
633	}
634
635	$i++; # the next child
636	if ($i < scalar(@$section_order)) {
637	return $section_order->[$i] if $parent_section eq "";
638	return "$parent_section.$section_order->[$i]";
639	}
640
641	# no more sections in this level
642	return undef;
643	}
644
645	# returns a reference to a list of children
646	sub get_children {
647	my $self = shift (@_);
648	my ($section) = @_;
649
650	my $section_ptr = $self->_lookup_section($section);
651	return [] unless defined $section_ptr;
652
653	my @children = @{$section_ptr->{'subsection_order'}};
654
655	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
656	return \@children;
657	}
658
659	# returns the child section one past the last one (which
660	# is coded as "0")
661	sub get_end_child {
662	my $self = shift (@_);
663	my ($section) = @_;
664
665	return $section . ".0" unless $section eq "";
666	return "0";
667	}
668
669	# returns the next section in book order
670	sub get_next_section {
671	my $self = shift (@_);
672	my ($section) = @_;
673
674	return undef unless defined $section;
675
676	my $section_ptr = $self->_lookup_section($section);
677	return undef unless defined $section_ptr;
678
679	# first try to find first child
680	if (defined $section_ptr->{'subsection_order'}->[0]) {
681	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
682	return "$section.$section_ptr->{'subsection_order'}->[0]";
683	}
684
685	do {
686	# try to find sibling
687	my $next_child = $self->get_next_child ($section);
688	return $next_child if (defined $next_child);
689
690	# move up one level
691	$section = $self->get_parent_section ($section);
692	} while $section =~ /\d/;
693
694	return undef;
695	}
696
697	sub is_leaf_section {
698	my $self = shift (@_);
699	my ($section) = @_;
700
701	my $section_ptr = $self->_lookup_section($section);
702	return 1 unless defined $section_ptr;
703
704	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
705	}
706
707	# methods for dealing with sections
708
709	# returns the name of the inserted section
710	sub insert_section {
711	my $self = shift (@_);
712	my ($before_section) = @_;
713
714	# get the child to insert before and its parent section
715	my $parent_section = "";
716	my $before_child = "0";
717	my @before_section = split (/\./, $before_section);
718	if (scalar(@before_section) > 0) {
719	$before_child = pop (@before_section);
720	$parent_section = join (".", @before_section);
721	}
722
723	my $parent_section_ptr = $self->_lookup_section($parent_section);
724	if (!defined $parent_section_ptr) {
725	print STDERR "doc::insert_section couldn't find parent section " .
726	"$parent_section\n";
727	return;
728	}
729
730	# get the next section number
731	my $section_num = $parent_section_ptr->{'next_subsection'}++;
732
733	my $i = 0;
734	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
735	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
736	$i++;
737	}
738
739	# insert the section number into the order list
740	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
741
742	# add this section to the parent section
743	my $section_ptr = {'subsection_order'=>[],
744	'next_subsection'=>1,
745	'subsections'=>{},
746	'metadata'=>[],
747	'text'=>""};
748	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
749
750	# work out the full section number
751	my $section = $parent_section;
752	$section .= "." unless $section eq "";
753	$section .= $section_num;
754
755	return $section;
756	}
757
758	# creates a pre-named section
759	sub create_named_section {
760	my $self = shift (@_);
761	my ($mastersection) = @_;
762
763	my ($num);
764	my $section = $mastersection;
765	my $sectionref = $self;
766
767	while ($section ne "") {
768	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
769	$num =~ s/^0+(\d)/$1/; # remove leading 0s
770	$section = "" unless defined $section;
771
772	if (defined $num) {
773	if (!defined $sectionref->{'subsections'}->{$num}) {
774	push (@{$sectionref->{'subsection_order'}}, $num);
775	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
776	'next_subsection'=>1,
777	'subsections'=>{},
778	'metadata'=>[],
779	'text'=>""};
780	if ($num >= $sectionref->{'next_subsection'}) {
781	$sectionref->{'next_subsection'} = $num + 1;
782	}
783	}
784	$sectionref = $sectionref->{'subsections'}->{$num};
785
786	} else {
787	print STDERR "doc::create_named_section couldn't create section ";
788	print STDERR "$mastersection\n";
789	last;
790	}
791	}
792	}
793
794	# returns a reference to a list of subsections
795	sub list_subsections {
796	my $self = shift (@_);
797	my ($section) = @_;
798
799	my $section_ptr = $self->_lookup_section ($section);
800	if (!defined $section_ptr) {
801	print STDERR "doc::list_subsections couldn't find section $section\n";
802	return [];
803	}
804
805	return [@{$section_ptr->{'subsection_order'}}];
806	}
807
808	sub delete_section {
809	my $self = shift (@_);
810	my ($section) = @_;
811
812	# my $section_ptr = {'subsection_order'=>[],
813	# 'next_subsection'=>1,
814	# 'subsections'=>{},
815	# 'metadata'=>[],
816	# 'text'=>""};
817
818	# if this is the top section reset everything
819	if ($section eq "") {
820	$self->{'subsection_order'} = [];
821	$self->{'subsections'} = {};
822	$self->{'metadata'} = [];
823	$self->{'text'} = "";
824	return;
825	}
826
827	# find the parent of the section to delete
828	my $parent_section = "";
829	my $child = "0";
830	my @section = split (/\./, $section);
831	if (scalar(@section) > 0) {
832	$child = pop (@section);
833	$parent_section = join (".", @section);
834	}
835
836	my $parent_section_ptr = $self->_lookup_section($parent_section);
837	if (!defined $parent_section_ptr) {
838	print STDERR "doc::delete_section couldn't find parent section " .
839	"$parent_section\n";
840	return;
841	}
842
843	# remove this section from the subsection_order list
844	my $i = 0;
845	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
846	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
847	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
848	last;
849	}
850	$i++;
851	}
852
853	# remove this section from the subsection hash
854	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
855	undef $parent_section_ptr->{'subsections'}->{$child};
856	}
857	}
858
859	#--
860	# methods for dealing with metadata
861
862	# set_metadata_element and get_metadata_element are for metadata
863	# which should only have one value. add_meta_data and get_metadata
864	# are for metadata which can have more than one value.
865
866	# returns the first metadata value which matches field
867
868	# This version of get metadata element works much like the one above,
869	# except it allows for the namespace portion of a metadata element to
870	# be ignored, thus if you are searching for dc.Title, the first piece
871	# of matching metadata ending with the name Title (once any namespace
872	# is removed) would be returned.
873	# 28-11-2003 John Thompson
874	sub get_metadata_element {
875	my $self = shift (@_);
876	my ($section, $field, $ignore_namespace) = @_;
877	my ($data);
878
879	$ignore_namespace = 0 unless defined $ignore_namespace;
880
881	my $section_ptr = $self->_lookup_section($section);
882	if (!defined $section_ptr) {
883	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
884	return;
885	}
886
887	# Remove any namespace if we are being told to ignore them
888	if($ignore_namespace) {
889	$field =~ s/^.\.//; #$field =~ s/^\w\.//;
890	}
891
892	foreach $data (@{$section_ptr->{'metadata'}}) {
893
894	my $data_name = $data->[0];
895
896	# Remove any namespace if we are being told to ignore them
897	if($ignore_namespace) {
898	$data_name =~ s/^.\.//; #$data_name =~ s/^\w\.//;
899	}
900	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
901	$data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
902
903	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
904	}
905
906	return undef; # was not found
907	}
908
909	# returns a list of the form [value1, value2, ...]
910	sub get_metadata {
911	my $self = shift (@_);
912	my ($section, $field, $ignore_namespace) = @_;
913	my ($data);
914
915	$ignore_namespace = 0 unless defined $ignore_namespace;
916
917	my $section_ptr = $self->_lookup_section($section);
918	if (!defined $section_ptr) {
919	print STDERR "doc::get_metadata couldn't find section ",
920	$section, "\n";
921	return;
922	}
923
924	# Remove any namespace if we are being told to ignore them
925	if($ignore_namespace) {
926	$field =~ s/^.*\.//;
927	}
928
929	my @metadata = ();
930	foreach $data (@{$section_ptr->{'metadata'}}) {
931
932	my $data_name = $data->[0];
933
934	# Remove any namespace if we are being told to ignore them
935	if($ignore_namespace) {
936	$data_name =~ s/^.*\.//;
937	}
938	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
939	$data_name =~ s/^ex\.([^.]+)$/$1/;
940
941	push (@metadata, $data->[1]) if ($data_name eq $field);
942	}
943
944	return \@metadata;
945	}
946
947	sub get_metadata_hashmap {
948	my $self = shift (@_);
949	my ($section, $opt_namespace) = @_;
950
951	my $section_ptr = $self->_lookup_section($section);
952	if (!defined $section_ptr) {
953	print STDERR "doc::get_metadata couldn't find section ",
954	$section, "\n";
955	return;
956	}
957
958	my $metadata_hashmap = {};
959	foreach my $data (@{$section_ptr->{'metadata'}}) {
960	my $metaname = $data->[0];
961
962	if ((!defined $opt_namespace) \|\| ($metaname =~ m/^$opt_namespace\./)) {
963	if (!defined $metadata_hashmap->{$metaname}) {
964	$metadata_hashmap->{$metaname} = [];
965	}
966	my $metaval_list = $metadata_hashmap->{$metaname};
967	push(@$metaval_list, $data->[1]);
968	}
969	}
970
971	return $metadata_hashmap;
972	}
973
974	# returns a list of the form [[field,value],[field,value],...]
975	sub get_all_metadata {
976	my $self = shift (@_);
977	my ($section) = @_;
978
979	my $section_ptr = $self->_lookup_section($section);
980	if (!defined $section_ptr) {
981	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
982	return;
983	}
984
985	return $section_ptr->{'metadata'};
986	}
987
988	# $value is optional
989	sub delete_metadata {
990	my $self = shift (@_);
991	my ($section, $field, $value) = @_;
992
993	my $section_ptr = $self->_lookup_section($section);
994	if (!defined $section_ptr) {
995	print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
996	return;
997	}
998
999	my $i = 0;
1000	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1001	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1002	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1003	splice (@{$section_ptr->{'metadata'}}, $i, 1);
1004	} else {
1005	$i++;
1006	}
1007	}
1008	}
1009
1010	sub delete_all_metadata {
1011	my $self = shift (@_);
1012	my ($section) = @_;
1013
1014	my $section_ptr = $self->_lookup_section($section);
1015	if (!defined $section_ptr) {
1016	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1017	return;
1018	}
1019
1020	$section_ptr->{'metadata'} = [];
1021	}
1022
1023	sub set_metadata_element {
1024	my $self = shift (@_);
1025	my ($section, $field, $value) = @_;
1026
1027	$self->set_utf8_metadata_element ($section, $field,
1028	&unicode::ascii2utf8(\$value));
1029	}
1030
1031	# set_utf8_metadata_element assumes the text has already been
1032	# converted to the UTF-8 encoding.
1033	sub set_utf8_metadata_element {
1034	my $self = shift (@_);
1035	my ($section, $field, $value) = @_;
1036
1037	$self->delete_metadata ($section, $field);
1038	$self->add_utf8_metadata ($section, $field, $value);
1039	}
1040
1041
1042	# add_metadata assumes the text is in (extended) ascii form. For
1043	# text which has already been converted to the UTF-8 format use
1044	# add_utf8_metadata.
1045	sub add_metadata {
1046	my $self = shift (@_);
1047	my ($section, $field, $value) = @_;
1048
1049	$self->add_utf8_metadata ($section, $field,
1050	&unicode::ascii2utf8(\$value));
1051	}
1052
1053	sub add_utf8_metadata {
1054	my $self = shift (@_);
1055	my ($section, $field, $value) = @_;
1056
1057	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1058	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1059	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1060
1061	my $section_ptr = $self->_lookup_section($section);
1062	if (!defined $section_ptr) {
1063	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1064	return;
1065	}
1066	if (!defined $value) {
1067	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1068	return;
1069	}
1070	if (!defined $field) {
1071	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1072	return;
1073	}
1074
1075	#print STDERR "###$field=$value\n";
1076
1077	# For now, supress this check. Given that text data read in is now
1078	# Unicode aware, then the following block of code can (ironically enough)
1079	# cause our unicode compliant string to be re-encoded (leading to
1080	# a double-encoded UTF-8 string, which we definitely don't want!).
1081
1082
1083	# double check that the value is utf-8
1084	# if (!&unicode::check_is_utf8($value)) {
1085	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1086	# &unicode::ensure_utf8(\$value);
1087	# print STDERR " Tried converting to utf8: $value\n";
1088	# }
1089
1090	#If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1091	if ($field =~ m/^(.+\.)?Latitude$/ \|\| $field =~ m/^(.+\.)?Longitude$/)
1092	{
1093	my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1094	if (defined $mdprefix) {
1095	# Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1096	push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1097	}
1098
1099	my $direction;
1100	if($value =~ m/^-/)
1101	{
1102	$direction = ($field eq "Latitude") ? "S" : "W";
1103	}
1104	else
1105	{
1106	$direction = ($field eq "Latitude") ? "N" : "E";
1107	}
1108
1109	my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1110	if(defined $beforeDec && defined $afterDec)
1111	{
1112	my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1113	push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1114
1115	for(my $i = 2; $i <= 4; $i++)
1116	{
1117	if(length($afterDec) >= $i)
1118	{
1119	push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1120	}
1121	}
1122
1123	#Only add the metadata if it has not already been added
1124	my $metaMap = $self->get_metadata_hashmap($section);
1125	}
1126	}
1127
1128	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1129	}
1130
1131
1132	# methods for dealing with text
1133
1134	# returns the text for a section
1135	sub get_text {
1136	my $self = shift (@_);
1137	my ($section) = @_;
1138
1139	my $section_ptr = $self->_lookup_section($section);
1140	if (!defined $section_ptr) {
1141	print STDERR "doc::get_text couldn't find section " .
1142	"$section\n";
1143	return "";
1144	}
1145
1146	return $section_ptr->{'text'};
1147	}
1148
1149	# returns the (utf-8 encoded) length of the text for a section
1150	sub get_text_length {
1151	my $self = shift (@_);
1152	my ($section) = @_;
1153
1154	my $section_ptr = $self->_lookup_section($section);
1155	if (!defined $section_ptr) {
1156	print STDERR "doc::get_text_length couldn't find section " .
1157	"$section\n";
1158	return 0;
1159	}
1160
1161	return length ($section_ptr->{'text'});
1162	}
1163
1164	# returns the total length for all the sections
1165	sub get_total_text_length {
1166	my $self = shift (@_);
1167
1168	my $section = $self->get_top_section();
1169	my $length = 0;
1170	while (defined $section) {
1171	$length += $self->get_text_length($section);
1172	$section = $self->get_next_section($section);
1173	}
1174	return $length;
1175	}
1176
1177	sub delete_text {
1178	my $self = shift (@_);
1179	my ($section) = @_;
1180
1181	my $section_ptr = $self->_lookup_section($section);
1182	if (!defined $section_ptr) {
1183	print STDERR "doc::delete_text couldn't find section " .
1184	"$section\n";
1185	return;
1186	}
1187
1188	$section_ptr->{'text'} = "";
1189	}
1190
1191	# add_text assumes the text is in (extended) ascii form. For
1192	# text which has been already converted to the UTF-8 format
1193	# use add_utf8_text.
1194	sub add_text {
1195	my $self = shift (@_);
1196	my ($section, $text) = @_;
1197
1198	# convert the text to UTF-8 encoded unicode characters
1199	# and add the text
1200	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1201	}
1202
1203
1204	# add_utf8_text assumes the text to be added has already
1205	# been converted to the UTF-8 encoding. For ascii text use
1206	# add_text
1207	sub add_utf8_text {
1208	my $self = shift (@_);
1209	my ($section, $text) = @_;
1210
1211	my $section_ptr = $self->_lookup_section($section);
1212	if (!defined $section_ptr) {
1213	print STDERR "doc::add_utf8_text couldn't find section " .
1214	"$section\n";
1215	return;
1216	}
1217
1218	$section_ptr->{'text'} .= $text;
1219	}
1220
1221	# returns the Source meta, which is the utf8 filename generated.
1222	# Added a separate method here for convenience
1223	sub get_source {
1224	my $self = shift (@_);
1225	return $self->get_metadata_element ($self->get_top_section(), "Source");
1226	}
1227
1228	# returns the SourceFile meta, which is the url reference to the URL-encoded
1229	# version of Source (the utf8 filename). Added a separate method here for convenience
1230	sub get_sourcefile {
1231	my $self = shift (@_);
1232	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1233	}
1234
1235	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1236	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1237	sub get_assocfile_from_sourcefile {
1238	my $self = shift (@_);
1239
1240	# get the SourceFile meta, which is a URL to a file on the filesystem
1241	my $top_section = $self->get_top_section();
1242	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1243
1244	# get the actual filename as it exists on the filesystem which this url refers to
1245	$source_file = &unicode::url_to_filename($source_file);
1246	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1247	return $assocfilename;
1248	}
1249
1250	# methods for dealing with associated files
1251
1252	# a file is associated with a document, NOT a section.
1253	# if section is defined it is noted in the data structure
1254	# only so that files associated from a particular section
1255	# may be removed later (using delete_section_assoc_files)
1256	sub associate_file {
1257	my $self = shift (@_);
1258	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1259	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1260
1261	# remove all associated files with the same name
1262	$self->delete_assoc_file ($assoc_filename);
1263
1264	# Too harsh a requirement
1265	# Definitely get HTML docs, for example, with some missing
1266	# support files
1267	# if (!&util::fd_exists($real_filename)) {
1268	# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1269	# exit -1;
1270	# }
1271
1272	# print STDERR "**** is the following a UTF8 rep of real filename?\n $real_filename\n";
1273	# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1274	## my $utf8_filename = Encode::encode("utf8",$filename);
1275
1276	push (@{$self->{'associated_files'}},
1277	[$real_filename, $assoc_filename, $mime_type, $section]);
1278	}
1279
1280	# returns a list of associated files in the form
1281	# [[real_filename, assoc_filename, mimetype], ...]
1282	sub get_assoc_files {
1283	my $self = shift (@_);
1284
1285	return $self->{'associated_files'};
1286	}
1287
1288	# the following two methods used to keep track of original associated files
1289	# for incremental building. eg a txt file used by an item file does not end
1290	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1291	# file for incremental build
1292	sub associate_source_file {
1293	my $self = shift (@_);
1294	my ($full_filename) = @_;
1295
1296	push (@{$self->{'source_assoc_files'}}, $full_filename);
1297
1298	}
1299
1300	sub get_source_assoc_files {
1301	my $self = shift (@_);
1302
1303	return $self->{'source_assoc_files'};
1304
1305
1306	}
1307	sub metadata_file {
1308	my $self = shift (@_);
1309	my ($real_filename, $filename) = @_;
1310
1311	push (@{$self->{'metadata_files'}},
1312	[$real_filename, $filename]);
1313	}
1314
1315	# used for writing out the archiveinf-doc info database, to list all the metadata files
1316	sub get_meta_files {
1317	my $self = shift (@_);
1318
1319	return $self->{'metadata_files'};
1320	}
1321
1322	sub delete_section_assoc_files {
1323	my $self = shift (@_);
1324	my ($section) = @_;
1325
1326	my $i=0;
1327	while ($i < scalar (@{$self->{'associated_files'}})) {
1328	if (defined $self->{'associated_files'}->[$i]->[3] &&
1329	$self->{'associated_files'}->[$i]->[3] eq $section) {
1330	splice (@{$self->{'associated_files'}}, $i, 1);
1331	} else {
1332	$i++;
1333	}
1334	}
1335	}
1336
1337	sub delete_assoc_file {
1338	my $self = shift (@_);
1339	my ($assoc_filename) = @_;
1340
1341	my $i=0;
1342	while ($i < scalar (@{$self->{'associated_files'}})) {
1343	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1344	splice (@{$self->{'associated_files'}}, $i, 1);
1345	} else {
1346	$i++;
1347	}
1348	}
1349	}
1350
1351	sub reset_nextsection_ptr {
1352	my $self = shift (@_);
1353	my ($section) = @_;
1354
1355	my $section_ptr = $self->_lookup_section($section);
1356	$section_ptr->{'next_subsection'} = 1;
1357	}
1358
1359	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: