Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 26850

Last change on this file since 26850 was 26850, checked in by davidb, 11 years ago
Additional tweak to handling Latitude and Longitude added, so versions that use metadata set prefixes also get fed in to the 'ex' version, so the runtime system can see these values
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 38.0 KB

Line
1	###########################################################################
2	#
3	# doc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistr te it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# base class to hold documents
27
28	package doc;
29	eval {require bytes};
30
31	BEGIN {
32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34	}
35
36	use strict;
37	use unicode;
38	use util;
39	use ghtml;
40	use File::stat;
41	##use hashdoc;
42	use docprint;
43
44	# the document type may be indexed_doc, nonindexed_doc, or
45	# classification
46
47	our $OIDcount = 0;
48
49	# rename_method can be 'url', 'none', 'base64'
50	sub new {
51	my $class = shift (@_);
52	my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55	my $self = bless {'associated_files'=>[],
56	'subsection_order'=>[],
57	'next_subsection'=>1,
58	'subsections'=>{},
59	'metadata'=>[],
60	'text'=>"",
61	'OIDtype'=>"hash"}, $class;
62
63	# used to set lastmodified here, but this can screw up the HASH ids, so
64	# the docsave processor now calls set_lastmodified
65
66	$self->set_source_path($source_filename);
67
68	if (defined $source_filename) {
69	$source_filename = &util::filename_within_collection($source_filename);
70	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71	$self->set_source_filename ($source_filename, $rename_method);
72	}
73
74	$self->set_doc_type ($doc_type) if defined $doc_type;
75
76	return $self;
77	}
78
79
80	sub set_source_path
81	{
82	my $self = shift @_;
83	my ($source_filename) = @_;
84
85	if (defined $source_filename) {
86	# On Windows the source_filename can be in terse DOS format
87	# e.g. test~1.txt
88
89	$self->{'terse_source_path'} = $source_filename;
90
91	if (-e $source_filename) {
92	# See if we can do better for Windows with a filename
93	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94	require Win32;
95	$self->{'source_path'} = Win32::GetLongPathName($source_filename);
96	}
97	else {
98	# For Unix-based systems, there is no difference between the two
99	$self->{'source_path'} = $source_filename;
100	}
101	}
102	else {
103	print STDERR "Warning: In doc::set_source_path(), file\n";
104	print STDERR " $source_filename\n";
105	print STDERR " does not exist\n";
106
107	# (default) Set it to whatever we were given
108	$self->{'source_path'} = $source_filename;
109	}
110	}
111	else {
112	# Previous code for setting source_path allowed for
113	# it to be undefined, so continue this practice
114	$self->{'terse_source_path'} = undef;
115	$self->{'source_path'} = undef;
116	}
117	}
118
119
120	sub get_source_path
121	{
122	my $self = shift @_;
123
124	return $self->{'terse_source_path'};
125	}
126
127	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128	sub set_oailastmodified {
129	my $self = shift (@_);
130
131	my $source_path = $self->{'terse_source_path'};
132
133	if (defined $source_path && (-e $source_path)) {
134	my $current_time = time;
135
136	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
137	$wday, $yday, $isdst) = localtime($current_time);
138
139	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
140
141	$self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
142	$self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
143	}
144	}
145
146	# no longer used for OAI purposes, since lastmodified is not what we want as the
147	# Datestamp of a document. This doc metadata may be useful for general purposes.
148	sub set_lastmodified {
149	my $self = shift (@_);
150
151	my $source_path = $self->{'terse_source_path'};
152
153	if (defined $source_path && (-e $source_path)) {
154
155	my $file_stat = stat($source_path);
156	my $mtime = $file_stat->mtime;
157	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
158	$wday, $yday, $isdst) = localtime($mtime);
159
160	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
161
162	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
163	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
164	}
165	}
166
167	# clone the $self object
168	sub duplicate {
169	my $self = shift (@_);
170
171	my $newobj = {};
172
173	foreach my $k (keys %$self) {
174	$newobj->{$k} = &clone ($self->{$k});
175	}
176
177	bless $newobj, ref($self);
178	return $newobj;
179	}
180
181	sub clone {
182	my ($from) = @_;
183	my $type = ref ($from);
184
185	if ($type eq "HASH") {
186	my $to = {};
187	foreach my $key (keys %$from) {
188	$to->{$key} = &clone ($from->{$key});
189	}
190	return $to;
191	} elsif ($type eq "ARRAY") {
192	my $to = [];
193	foreach my $v (@$from) {
194	push (@$to, &clone ($v));
195	}
196	return $to;
197	} else {
198	return $from;
199	}
200	}
201
202	sub set_OIDtype {
203	my $self = shift (@_);
204	my ($type, $metadata) = @_;
205
206	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|hash_on_full_filename\|incremental\|filename\|dirname\|full_filename\|assigned)$/) {
207	$self->{'OIDtype'} = $type;
208	} else {
209	$self->{'OIDtype'} = "hash";
210	}
211
212	if ($type =~ /^assigned$/) {
213	if (defined $metadata) {
214	$self->{'OIDmetadata'} = $metadata;
215	} else {
216	$self->{'OIDmetadata'} = "dc.Identifier";
217	}
218	}
219	}
220
221	# rename_method can be 'url', 'none', 'base64'
222	sub set_source_filename {
223	my $self = shift (@_);
224	my ($source_filename, $rename_method) = @_;
225
226	# Since the gsdlsourcefilename element goes into the doc.xml it has
227	# to be utf8. However, it should also represent the source filename
228	# (in the import directory) which may not be utf8 at all.
229	# For instance, if this meta element (gsdlsourcefilename) will be used
230	# by other applications that parse doc.xml in order to locate
231	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
232	# encode the real filename as this is a binary-to-text encoding meaning
233	# that the resulting string is ASCII (utf8). Decoding will give the original.
234
235	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
236
237	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
238	$source_filename = $self->encode_filename($source_filename, $rename_method);
239	# my ($srcfilename,$dirname,$suffix)
240	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
241	# print STDERR "-> $srcfilename -> ";
242	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
243	# $source_filename = &util::filename_cat($dirname, $srcfilename);
244	# print STDERR "$source_filename\n";
245
246	$self->set_utf8_metadata_element ($self->get_top_section(),
247	"gsdlsourcefilename",
248	$source_filename);
249	}
250
251	sub encode_filename {
252	my $self = shift (@_);
253	my ($source_filename, $rename_method) = @_;
254
255	my ($srcfilename,$dirname,$suffix)
256	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
257	# print STDERR "-> $srcfilename -> ";
258	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
259	$source_filename = &util::filename_cat($dirname, $srcfilename);
260
261	return $source_filename;
262	}
263
264	sub set_converted_filename {
265	my $self = shift (@_);
266	my ($converted_filename) = @_;
267
268	# we know the converted filename is utf8
269	$self->set_utf8_metadata_element ($self->get_top_section(),
270	"gsdlconvertedfilename",
271	$converted_filename);
272	}
273
274	# returns the source_filename as it was provided
275	sub get_unmodified_source_filename {
276	my $self = shift (@_);
277
278	return $self->{'terse_source_path'};
279	}
280
281	# returns the source_filename with whatever rename_method was given
282	sub get_source_filename {
283	my $self = shift (@_);
284
285	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
286	}
287
288
289
290	# returns converted filename if available else returns source filename
291	sub get_filename_for_hashing {
292	my $self = shift (@_);
293
294	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
295
296	if (!defined $filename) {
297	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
298	# if NULPlug processed file, then don't give a filename
299	if (defined $plugin_name && $plugin_name eq "NULPlug") {
300	$filename = undef;
301	} else { # returns the URL encoded source filename!
302	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
303	}
304	}
305
306	if (!&util::filename_is_absolute($filename)) {
307	$filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
308	}
309
310	return $filename;
311	}
312
313	sub set_doc_type {
314	my $self = shift (@_);
315	my ($doc_type) = @_;
316
317	$self->set_metadata_element ($self->get_top_section(),
318	"gsdldoctype",
319	$doc_type);
320	}
321
322	# returns the gsdldoctype as it was provided
323	# the default of "indexed_doc" is used if no document
324	# type was provided
325	sub get_doc_type {
326	my $self = shift (@_);
327
328	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
329	return $doc_type if (defined $doc_type);
330	return "indexed_doc";
331	}
332
333
334	# look up the reference to the a particular section
335	sub _lookup_section {
336	my $self = shift (@_);
337	my ($section) = @_;
338
339	my ($num);
340	my $sectionref = $self;
341
342	while (defined $section && $section ne "") {
343
344	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
345
346	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
347
348	$section = "" unless defined $section;
349
350
351	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
352	$sectionref = $sectionref->{'subsections'}->{$num};
353	} else {
354	return undef;
355	}
356	}
357
358	return $sectionref;
359	}
360
361	# calculate OID by hashing the contents of the document
362	sub _calc_OID {
363	my $self = shift (@_);
364	my ($filename) = @_;
365
366
367	my $osexe = &util::get_os_exe();
368
369	my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
370	$ENV{'GSDLOS'},"hashfile$osexe");
371
372	my $result = "NULL";
373
374
375	if (-e "$hashfile_exe") {
376	# $result = `\"$hashfile_exe\" \"$filename\"`;
377	# $result = `hashfile$osexe \"$filename\" 2>&1`;
378	$result = `hashfile$osexe \"$filename\"`;
379
380	($result) = $result =~ /:\s*([0-9a-f]+)/i;
381	} else {
382	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
383	}
384	return "HASH$result";
385	}
386
387	# methods dealing with OID, not groups of them.
388
389	# if $OID is not provided one is calculated
390	sub set_OID {
391	my $self = shift (@_);
392	my ($OID) = @_;
393
394	my $use_hash_oid = 0;
395	# if an OID wasn't provided calculate one
396	if (!defined $OID) {
397	$OID = "NULL";
398	if ($self->{'OIDtype'} =~ /^hash/) {
399	$use_hash_oid = 1;
400	} elsif ($self->{'OIDtype'} eq "incremental") {
401	$OID = "D" . $OIDcount;
402	$OIDcount ++;
403	} elsif ($self->{'OIDtype'} eq "filename") {
404	my $filename = $self->get_source_filename();
405	$OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
406	$OID = &util::tidy_up_oid($OID);
407	} elsif ($self->{'OIDtype'} eq "full_filename") {
408	my $source_filename = $self->get_source_filename();
409	my $dirsep = &util::get_os_dirsep();
410
411	$source_filename =~ s/^import$dirsep//;
412	$source_filename =~ s/$dirsep/-/g;
413	$source_filename =~ s/\./_/g;
414
415	$OID = $source_filename;
416	$OID = &util::tidy_up_oid($OID);
417	} elsif ($self->{'OIDtype'} eq "dirname") {
418	$OID = 'J';
419	my $filename = $self->get_source_filename();
420	if (defined($filename)) { # && -e $filename) {
421	$OID = &File::Basename::dirname($filename);
422	if (defined $OID) {
423	$OID = 'J'.&File::Basename::basename($OID);
424	$OID = &util::tidy_up_oid($OID);
425	} else {
426	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
427	$use_hash_oid = 1;
428	}
429	} else {
430	print STDERR "Failed to find filename, generating hash id\n";
431	$use_hash_oid = 1;
432	}
433
434	} elsif ($self->{'OIDtype'} eq "assigned") {
435	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
436	if (defined $identifier && $identifier ne "") {
437	$OID = $identifier;
438	$OID = &util::tidy_up_oid($OID);
439	} else {
440	# need a hash id
441	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
442	$use_hash_oid = 1;
443	}
444
445	} else {
446	$use_hash_oid = 1;
447	}
448
449	if ($use_hash_oid) {
450	my $hash_on_file = 1;
451	my $hash_on_ga_xml = 0;
452
453	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
454	$hash_on_file = 0;
455	$hash_on_ga_xml = 1;
456	}
457
458	if ($self->{'OIDtype'} eq "hash_on_full_filename") {
459	$hash_on_file = 0;
460	$hash_on_ga_xml = 0;
461
462	my $source_filename = $self->get_source_filename();
463	my $dirsep = &util::get_os_dirsep();
464
465	$source_filename =~ s/^import$dirsep//;
466	$source_filename =~ s/$dirsep/-/g;
467	$source_filename =~ s/\./_/g;
468
469	# If the filename is very short then (handled naively)
470	# this can cause conjestion in the hash-values
471	# computed, leading documents sharing the same leading
472	# Hex values in the computed has.
473	#
474	# The solution taken here is to replace the name of
475	# the file name a sufficient number of times (up to
476	# the character limit defined in 'rep_limit' and
477	# make that the content that is hashed on
478
479	# *** Think twice before changing the following value
480	# as it will break backward compatability of computed
481	# document HASH values
482
483	my $rep_limit = 256;
484	my $hash_content = undef;
485
486	if (length($source_filename)<$rep_limit) {
487	my $rep_string = "$source_filename\|";
488	my $rs_len = length($rep_string);
489
490	my $clone_times = int(($rep_limit-1)/$rs_len) +1;
491
492	$hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
493	}
494	else {
495	$hash_content = $source_filename;
496	}
497
498	my $filename = &util::get_tmp_filename();
499	if (!open (OUTFILE, ">:utf8", $filename)) {
500	print STDERR "doc::set_OID could not write to $filename\n";
501	} else {
502	print OUTFILE $hash_content;
503	close (OUTFILE);
504	}
505	$OID = $self->_calc_OID ($filename);
506
507	&util::rm ($filename);
508	}
509
510	if ($hash_on_file) {
511	# "hash" OID - feed file to hashfile.exe
512	my $filename = $self->get_filename_for_hashing();
513
514	# -z: don't want to hash on the file if it is zero size
515	if (defined($filename) && -e $filename && !-z $filename) {
516	$OID = $self->_calc_OID ($filename);
517	} else {
518	$hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
519	}
520	}
521
522	if ($hash_on_ga_xml) {
523	# In addition being asked to explicity calculate the has based on the GA file,
524	# can also end up coming into this block is doing 'hash_on_file' but the file
525	# itself is of zero bytes (as could be the case with 'doc.nul' file
526
527	my $filename = &util::get_tmp_filename();
528	if (!open (OUTFILE, ">:utf8", $filename)) {
529	print STDERR "doc::set_OID could not write to $filename\n";
530	} else {
531	my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
532	print OUTFILE $doc_text;
533	close (OUTFILE);
534	}
535	$OID = $self->_calc_OID ($filename);
536	&util::rm ($filename);
537	}
538	}
539	}
540	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
541	}
542
543	# this uses hashdoc (embedded c thingy) which is faster but still
544	# needs a little work to be suffiently stable
545	sub ___set_OID {
546	my $self = shift (@_);
547	my ($OID) = @_;
548
549	# if an OID wasn't provided then calculate hash value based on document
550	if (!defined $OID)
551	{
552	my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
553	my $hash_len = length($hash_text);
554
555	$OID = &hashdoc::buffer($hash_text,$hash_len);
556	}
557
558	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
559	}
560
561	# returns the OID for this document
562	sub get_OID {
563	my $self = shift (@_);
564	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
565	return $OID if (defined $OID);
566	return "NULL";
567	}
568
569	sub delete_OID {
570	my $self = shift (@_);
571
572	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
573	}
574
575
576	# methods for manipulating section names
577
578	# returns the name of the top-most section (the top
579	# level of the document
580	sub get_top_section {
581	my $self = shift (@_);
582
583	return "";
584	}
585
586	# returns a section
587	sub get_parent_section {
588	my $self = shift (@_);
589	my ($section) = @_;
590
591	$section =~ s/(^\|\.)\d+$//;
592
593	return $section;
594	}
595
596	# returns the first child section (or the end child
597	# if there isn't any)
598	sub get_begin_child {
599	my $self = shift (@_);
600	my ($section) = @_;
601
602	my $section_ptr = $self->_lookup_section($section);
603	return "" unless defined $section_ptr;
604
605	if (defined $section_ptr->{'subsection_order'}->[0]) {
606	return "$section.$section_ptr->{'subsection_order'}->[0]";
607	}
608
609	return $self->get_end_child ($section);
610	}
611
612	# returns the next child of a parent section
613	sub get_next_child {
614	my $self = shift (@_);
615	my ($section) = @_;
616
617	my $parent_section = $self->get_parent_section($section);
618	my $parent_section_ptr = $self->_lookup_section($parent_section);
619	return undef unless defined $parent_section_ptr;
620
621	my ($section_num) = $section =~ /(\d+)$/;
622	return undef unless defined $section_num;
623
624	my $i = 0;
625	my $section_order = $parent_section_ptr->{'subsection_order'};
626	while ($i < scalar(@$section_order)) {
627	last if $section_order->[$i] eq $section_num;
628	$i++;
629	}
630
631	$i++; # the next child
632	if ($i < scalar(@$section_order)) {
633	return $section_order->[$i] if $parent_section eq "";
634	return "$parent_section.$section_order->[$i]";
635	}
636
637	# no more sections in this level
638	return undef;
639	}
640
641	# returns a reference to a list of children
642	sub get_children {
643	my $self = shift (@_);
644	my ($section) = @_;
645
646	my $section_ptr = $self->_lookup_section($section);
647	return [] unless defined $section_ptr;
648
649	my @children = @{$section_ptr->{'subsection_order'}};
650
651	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
652	return \@children;
653	}
654
655	# returns the child section one past the last one (which
656	# is coded as "0")
657	sub get_end_child {
658	my $self = shift (@_);
659	my ($section) = @_;
660
661	return $section . ".0" unless $section eq "";
662	return "0";
663	}
664
665	# returns the next section in book order
666	sub get_next_section {
667	my $self = shift (@_);
668	my ($section) = @_;
669
670	return undef unless defined $section;
671
672	my $section_ptr = $self->_lookup_section($section);
673	return undef unless defined $section_ptr;
674
675	# first try to find first child
676	if (defined $section_ptr->{'subsection_order'}->[0]) {
677	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
678	return "$section.$section_ptr->{'subsection_order'}->[0]";
679	}
680
681	do {
682	# try to find sibling
683	my $next_child = $self->get_next_child ($section);
684	return $next_child if (defined $next_child);
685
686	# move up one level
687	$section = $self->get_parent_section ($section);
688	} while $section =~ /\d/;
689
690	return undef;
691	}
692
693	sub is_leaf_section {
694	my $self = shift (@_);
695	my ($section) = @_;
696
697	my $section_ptr = $self->_lookup_section($section);
698	return 1 unless defined $section_ptr;
699
700	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
701	}
702
703	# methods for dealing with sections
704
705	# returns the name of the inserted section
706	sub insert_section {
707	my $self = shift (@_);
708	my ($before_section) = @_;
709
710	# get the child to insert before and its parent section
711	my $parent_section = "";
712	my $before_child = "0";
713	my @before_section = split (/\./, $before_section);
714	if (scalar(@before_section) > 0) {
715	$before_child = pop (@before_section);
716	$parent_section = join (".", @before_section);
717	}
718
719	my $parent_section_ptr = $self->_lookup_section($parent_section);
720	if (!defined $parent_section_ptr) {
721	print STDERR "doc::insert_section couldn't find parent section " .
722	"$parent_section\n";
723	return;
724	}
725
726	# get the next section number
727	my $section_num = $parent_section_ptr->{'next_subsection'}++;
728
729	my $i = 0;
730	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
731	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
732	$i++;
733	}
734
735	# insert the section number into the order list
736	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
737
738	# add this section to the parent section
739	my $section_ptr = {'subsection_order'=>[],
740	'next_subsection'=>1,
741	'subsections'=>{},
742	'metadata'=>[],
743	'text'=>""};
744	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
745
746	# work out the full section number
747	my $section = $parent_section;
748	$section .= "." unless $section eq "";
749	$section .= $section_num;
750
751	return $section;
752	}
753
754	# creates a pre-named section
755	sub create_named_section {
756	my $self = shift (@_);
757	my ($mastersection) = @_;
758
759	my ($num);
760	my $section = $mastersection;
761	my $sectionref = $self;
762
763	while ($section ne "") {
764	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
765	$num =~ s/^0+(\d)/$1/; # remove leading 0s
766	$section = "" unless defined $section;
767
768	if (defined $num) {
769	if (!defined $sectionref->{'subsections'}->{$num}) {
770	push (@{$sectionref->{'subsection_order'}}, $num);
771	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
772	'next_subsection'=>1,
773	'subsections'=>{},
774	'metadata'=>[],
775	'text'=>""};
776	if ($num >= $sectionref->{'next_subsection'}) {
777	$sectionref->{'next_subsection'} = $num + 1;
778	}
779	}
780	$sectionref = $sectionref->{'subsections'}->{$num};
781
782	} else {
783	print STDERR "doc::create_named_section couldn't create section ";
784	print STDERR "$mastersection\n";
785	last;
786	}
787	}
788	}
789
790	# returns a reference to a list of subsections
791	sub list_subsections {
792	my $self = shift (@_);
793	my ($section) = @_;
794
795	my $section_ptr = $self->_lookup_section ($section);
796	if (!defined $section_ptr) {
797	print STDERR "doc::list_subsections couldn't find section $section\n";
798	return [];
799	}
800
801	return [@{$section_ptr->{'subsection_order'}}];
802	}
803
804	sub delete_section {
805	my $self = shift (@_);
806	my ($section) = @_;
807
808	# my $section_ptr = {'subsection_order'=>[],
809	# 'next_subsection'=>1,
810	# 'subsections'=>{},
811	# 'metadata'=>[],
812	# 'text'=>""};
813
814	# if this is the top section reset everything
815	if ($section eq "") {
816	$self->{'subsection_order'} = [];
817	$self->{'subsections'} = {};
818	$self->{'metadata'} = [];
819	$self->{'text'} = "";
820	return;
821	}
822
823	# find the parent of the section to delete
824	my $parent_section = "";
825	my $child = "0";
826	my @section = split (/\./, $section);
827	if (scalar(@section) > 0) {
828	$child = pop (@section);
829	$parent_section = join (".", @section);
830	}
831
832	my $parent_section_ptr = $self->_lookup_section($parent_section);
833	if (!defined $parent_section_ptr) {
834	print STDERR "doc::delete_section couldn't find parent section " .
835	"$parent_section\n";
836	return;
837	}
838
839	# remove this section from the subsection_order list
840	my $i = 0;
841	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
842	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
843	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
844	last;
845	}
846	$i++;
847	}
848
849	# remove this section from the subsection hash
850	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
851	undef $parent_section_ptr->{'subsections'}->{$child};
852	}
853	}
854
855	#--
856	# methods for dealing with metadata
857
858	# set_metadata_element and get_metadata_element are for metadata
859	# which should only have one value. add_meta_data and get_metadata
860	# are for metadata which can have more than one value.
861
862	# returns the first metadata value which matches field
863
864	# This version of get metadata element works much like the one above,
865	# except it allows for the namespace portion of a metadata element to
866	# be ignored, thus if you are searching for dc.Title, the first piece
867	# of matching metadata ending with the name Title (once any namespace
868	# is removed) would be returned.
869	# 28-11-2003 John Thompson
870	sub get_metadata_element {
871	my $self = shift (@_);
872	my ($section, $field, $ignore_namespace) = @_;
873	my ($data);
874
875	$ignore_namespace = 0 unless defined $ignore_namespace;
876
877	my $section_ptr = $self->_lookup_section($section);
878	if (!defined $section_ptr) {
879	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
880	return;
881	}
882
883	# Remove any namespace if we are being told to ignore them
884	if($ignore_namespace) {
885	$field =~ s/^.\.//; #$field =~ s/^\w\.//;
886	}
887
888	foreach $data (@{$section_ptr->{'metadata'}}) {
889
890	my $data_name = $data->[0];
891
892	# Remove any namespace if we are being told to ignore them
893	if($ignore_namespace) {
894	$data_name =~ s/^.\.//; #$data_name =~ s/^\w\.//;
895	}
896	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
897	$data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
898
899	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
900	}
901
902	return undef; # was not found
903	}
904
905	# returns a list of the form [value1, value2, ...]
906	sub get_metadata {
907	my $self = shift (@_);
908	my ($section, $field, $ignore_namespace) = @_;
909	my ($data);
910
911	$ignore_namespace = 0 unless defined $ignore_namespace;
912
913	my $section_ptr = $self->_lookup_section($section);
914	if (!defined $section_ptr) {
915	print STDERR "doc::get_metadata couldn't find section ",
916	$section, "\n";
917	return;
918	}
919
920	# Remove any namespace if we are being told to ignore them
921	if($ignore_namespace) {
922	$field =~ s/^.*\.//;
923	}
924
925	my @metadata = ();
926	foreach $data (@{$section_ptr->{'metadata'}}) {
927
928	my $data_name = $data->[0];
929
930	# Remove any namespace if we are being told to ignore them
931	if($ignore_namespace) {
932	$data_name =~ s/^.*\.//;
933	}
934	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
935	$data_name =~ s/^ex\.([^.]+)$/$1/;
936
937	push (@metadata, $data->[1]) if ($data_name eq $field);
938	}
939
940	return \@metadata;
941	}
942
943	sub get_metadata_hashmap {
944	my $self = shift (@_);
945	my ($section, $opt_namespace) = @_;
946
947	my $section_ptr = $self->_lookup_section($section);
948	if (!defined $section_ptr) {
949	print STDERR "doc::get_metadata couldn't find section ",
950	$section, "\n";
951	return;
952	}
953
954	my $metadata_hashmap = {};
955	foreach my $data (@{$section_ptr->{'metadata'}}) {
956	my $metaname = $data->[0];
957
958	if ((!defined $opt_namespace) \|\| ($metaname =~ m/^$opt_namespace\./)) {
959	if (!defined $metadata_hashmap->{$metaname}) {
960	$metadata_hashmap->{$metaname} = [];
961	}
962	my $metaval_list = $metadata_hashmap->{$metaname};
963	push(@$metaval_list, $data->[1]);
964	}
965	}
966
967	return $metadata_hashmap;
968	}
969
970	# returns a list of the form [[field,value],[field,value],...]
971	sub get_all_metadata {
972	my $self = shift (@_);
973	my ($section) = @_;
974
975	my $section_ptr = $self->_lookup_section($section);
976	if (!defined $section_ptr) {
977	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
978	return;
979	}
980
981	return $section_ptr->{'metadata'};
982	}
983
984	# $value is optional
985	sub delete_metadata {
986	my $self = shift (@_);
987	my ($section, $field, $value) = @_;
988
989	my $section_ptr = $self->_lookup_section($section);
990	if (!defined $section_ptr) {
991	print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
992	return;
993	}
994
995	my $i = 0;
996	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
997	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
998	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
999	splice (@{$section_ptr->{'metadata'}}, $i, 1);
1000	} else {
1001	$i++;
1002	}
1003	}
1004	}
1005
1006	sub delete_all_metadata {
1007	my $self = shift (@_);
1008	my ($section) = @_;
1009
1010	my $section_ptr = $self->_lookup_section($section);
1011	if (!defined $section_ptr) {
1012	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1013	return;
1014	}
1015
1016	$section_ptr->{'metadata'} = [];
1017	}
1018
1019	sub set_metadata_element {
1020	my $self = shift (@_);
1021	my ($section, $field, $value) = @_;
1022
1023	$self->set_utf8_metadata_element ($section, $field,
1024	&unicode::ascii2utf8(\$value));
1025	}
1026
1027	# set_utf8_metadata_element assumes the text has already been
1028	# converted to the UTF-8 encoding.
1029	sub set_utf8_metadata_element {
1030	my $self = shift (@_);
1031	my ($section, $field, $value) = @_;
1032
1033	$self->delete_metadata ($section, $field);
1034	$self->add_utf8_metadata ($section, $field, $value);
1035	}
1036
1037
1038	# add_metadata assumes the text is in (extended) ascii form. For
1039	# text which has already been converted to the UTF-8 format use
1040	# add_utf8_metadata.
1041	sub add_metadata {
1042	my $self = shift (@_);
1043	my ($section, $field, $value) = @_;
1044
1045	$self->add_utf8_metadata ($section, $field,
1046	&unicode::ascii2utf8(\$value));
1047	}
1048
1049	sub add_utf8_metadata {
1050	my $self = shift (@_);
1051	my ($section, $field, $value) = @_;
1052
1053	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1054	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1055	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1056
1057	my $section_ptr = $self->_lookup_section($section);
1058	if (!defined $section_ptr) {
1059	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1060	return;
1061	}
1062	if (!defined $value) {
1063	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1064	return;
1065	}
1066	if (!defined $field) {
1067	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1068	return;
1069	}
1070
1071	#print STDERR "###$field=$value\n";
1072
1073	# For now, supress this check. Given that text data read in is now
1074	# Unicode aware, then the following block of code can (ironically enough)
1075	# cause our unicode compliant string to be re-encoded (leading to
1076	# a double-encoded UTF-8 string, which we definitely don't want!).
1077
1078
1079	# double check that the value is utf-8
1080	# if (!&unicode::check_is_utf8($value)) {
1081	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1082	# &unicode::ensure_utf8(\$value);
1083	# print STDERR " Tried converting to utf8: $value\n";
1084	# }
1085
1086	#If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1087	if ($field =~ m/^(.+\.)?Latitude$/ \|\| $field =~ m/^(.+\.)?Longitude$/)
1088	{
1089	my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1090	if (defined $mdprefix) {
1091	# Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1092	push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1093	}
1094
1095	my $direction;
1096	if($value =~ m/^-/)
1097	{
1098	$direction = ($field eq "Latitude") ? "S" : "W";
1099	}
1100	else
1101	{
1102	$direction = ($field eq "Latitude") ? "N" : "E";
1103	}
1104
1105	my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1106	if(defined $beforeDec && defined $afterDec)
1107	{
1108	my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1109	push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1110
1111	for(my $i = 2; $i <= 4; $i++)
1112	{
1113	if(length($afterDec) >= $i)
1114	{
1115	push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1116	}
1117	}
1118
1119	#Only add the metadata if it has not already been added
1120	my $metaMap = $self->get_metadata_hashmap($section);
1121	}
1122	}
1123
1124	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1125	}
1126
1127
1128	# methods for dealing with text
1129
1130	# returns the text for a section
1131	sub get_text {
1132	my $self = shift (@_);
1133	my ($section) = @_;
1134
1135	my $section_ptr = $self->_lookup_section($section);
1136	if (!defined $section_ptr) {
1137	print STDERR "doc::get_text couldn't find section " .
1138	"$section\n";
1139	return "";
1140	}
1141
1142	return $section_ptr->{'text'};
1143	}
1144
1145	# returns the (utf-8 encoded) length of the text for a section
1146	sub get_text_length {
1147	my $self = shift (@_);
1148	my ($section) = @_;
1149
1150	my $section_ptr = $self->_lookup_section($section);
1151	if (!defined $section_ptr) {
1152	print STDERR "doc::get_text_length couldn't find section " .
1153	"$section\n";
1154	return 0;
1155	}
1156
1157	return length ($section_ptr->{'text'});
1158	}
1159
1160	# returns the total length for all the sections
1161	sub get_total_text_length {
1162	my $self = shift (@_);
1163
1164	my $section = $self->get_top_section();
1165	my $length = 0;
1166	while (defined $section) {
1167	$length += $self->get_text_length($section);
1168	$section = $self->get_next_section($section);
1169	}
1170	return $length;
1171	}
1172
1173	sub delete_text {
1174	my $self = shift (@_);
1175	my ($section) = @_;
1176
1177	my $section_ptr = $self->_lookup_section($section);
1178	if (!defined $section_ptr) {
1179	print STDERR "doc::delete_text couldn't find section " .
1180	"$section\n";
1181	return;
1182	}
1183
1184	$section_ptr->{'text'} = "";
1185	}
1186
1187	# add_text assumes the text is in (extended) ascii form. For
1188	# text which has been already converted to the UTF-8 format
1189	# use add_utf8_text.
1190	sub add_text {
1191	my $self = shift (@_);
1192	my ($section, $text) = @_;
1193
1194	# convert the text to UTF-8 encoded unicode characters
1195	# and add the text
1196	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1197	}
1198
1199
1200	# add_utf8_text assumes the text to be added has already
1201	# been converted to the UTF-8 encoding. For ascii text use
1202	# add_text
1203	sub add_utf8_text {
1204	my $self = shift (@_);
1205	my ($section, $text) = @_;
1206
1207	my $section_ptr = $self->_lookup_section($section);
1208	if (!defined $section_ptr) {
1209	print STDERR "doc::add_utf8_text couldn't find section " .
1210	"$section\n";
1211	return;
1212	}
1213
1214	$section_ptr->{'text'} .= $text;
1215	}
1216
1217	# returns the Source meta, which is the utf8 filename generated.
1218	# Added a separate method here for convenience
1219	sub get_source {
1220	my $self = shift (@_);
1221	return $self->get_metadata_element ($self->get_top_section(), "Source");
1222	}
1223
1224	# returns the SourceFile meta, which is the url reference to the URL-encoded
1225	# version of Source (the utf8 filename). Added a separate method here for convenience
1226	sub get_sourcefile {
1227	my $self = shift (@_);
1228	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1229	}
1230
1231	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1232	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1233	sub get_assocfile_from_sourcefile {
1234	my $self = shift (@_);
1235
1236	# get the SourceFile meta, which is a URL to a file on the filesystem
1237	my $top_section = $self->get_top_section();
1238	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1239
1240	# get the actual filename as it exists on the filesystem which this url refers to
1241	$source_file = &unicode::url_to_filename($source_file);
1242	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1243	return $assocfilename;
1244	}
1245
1246	# methods for dealing with associated files
1247
1248	# a file is associated with a document, NOT a section.
1249	# if section is defined it is noted in the data structure
1250	# only so that files associated from a particular section
1251	# may be removed later (using delete_section_assoc_files)
1252	sub associate_file {
1253	my $self = shift (@_);
1254	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1255	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1256
1257	# remove all associated files with the same name
1258	$self->delete_assoc_file ($assoc_filename);
1259
1260	# Too harsh a requirement
1261	# Definitely get HTML docs, for example, with some missing
1262	# support files
1263	# if (!&util::fd_exists($real_filename)) {
1264	# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1265	# exit -1;
1266	# }
1267
1268	# print STDERR "**** is the following a UTF8 rep of real filename?\n $real_filename\n";
1269	# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1270	## my $utf8_filename = Encode::encode("utf8",$filename);
1271
1272	push (@{$self->{'associated_files'}},
1273	[$real_filename, $assoc_filename, $mime_type, $section]);
1274	}
1275
1276	# returns a list of associated files in the form
1277	# [[real_filename, assoc_filename, mimetype], ...]
1278	sub get_assoc_files {
1279	my $self = shift (@_);
1280
1281	return $self->{'associated_files'};
1282	}
1283
1284	# the following two methods used to keep track of original associated files
1285	# for incremental building. eg a txt file used by an item file does not end
1286	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1287	# file for incremental build
1288	sub associate_source_file {
1289	my $self = shift (@_);
1290	my ($full_filename) = @_;
1291
1292	push (@{$self->{'source_assoc_files'}}, $full_filename);
1293
1294	}
1295
1296	sub get_source_assoc_files {
1297	my $self = shift (@_);
1298
1299	return $self->{'source_assoc_files'};
1300
1301
1302	}
1303	sub metadata_file {
1304	my $self = shift (@_);
1305	my ($real_filename, $filename) = @_;
1306
1307	push (@{$self->{'metadata_files'}},
1308	[$real_filename, $filename]);
1309	}
1310
1311	# used for writing out the archiveinf-doc info database, to list all the metadata files
1312	sub get_meta_files {
1313	my $self = shift (@_);
1314
1315	return $self->{'metadata_files'};
1316	}
1317
1318	sub delete_section_assoc_files {
1319	my $self = shift (@_);
1320	my ($section) = @_;
1321
1322	my $i=0;
1323	while ($i < scalar (@{$self->{'associated_files'}})) {
1324	if (defined $self->{'associated_files'}->[$i]->[3] &&
1325	$self->{'associated_files'}->[$i]->[3] eq $section) {
1326	splice (@{$self->{'associated_files'}}, $i, 1);
1327	} else {
1328	$i++;
1329	}
1330	}
1331	}
1332
1333	sub delete_assoc_file {
1334	my $self = shift (@_);
1335	my ($assoc_filename) = @_;
1336
1337	my $i=0;
1338	while ($i < scalar (@{$self->{'associated_files'}})) {
1339	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1340	splice (@{$self->{'associated_files'}}, $i, 1);
1341	} else {
1342	$i++;
1343	}
1344	}
1345	}
1346
1347	sub reset_nextsection_ptr {
1348	my $self = shift (@_);
1349	my ($section) = @_;
1350
1351	my $section_ptr = $self->_lookup_section($section);
1352	$section_ptr->{'next_subsection'} = 1;
1353	}
1354
1355	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: