Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 23278

Last change on this file since 23278 was 23278, checked in by kjdon, 13 years ago
split out the encoding filename bit from set_source_metadata so that other places can use it
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 30.9 KB

Line
1	###########################################################################
2	#
3	# doc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistr te it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# base class to hold documents
27
28	package doc;
29	eval {require bytes};
30
31	BEGIN {
32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34	}
35
36	use strict;
37	use unicode;
38	use util;
39	use ghtml;
40	use File::stat;
41	##use hashdoc;
42	use docprint;
43
44	# the document type may be indexed_doc, nonindexed_doc, or
45	# classification
46
47	our $OIDcount = 0;
48
49	# rename_method can be 'url', 'none', 'base64'
50	sub new {
51	my $class = shift (@_);
52	my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55	my $self = bless {'associated_files'=>[],
56	'subsection_order'=>[],
57	'next_subsection'=>1,
58	'subsections'=>{},
59	'metadata'=>[],
60	'text'=>"",
61	'OIDtype'=>"hash"}, $class;
62
63	# used to set lastmodified here, but this can screw up the HASH ids, so
64	# the docsave processor now calls set_lastmodified
65
66	$self->{'source_path'} = $source_filename;
67
68	if (defined $source_filename) {
69	$source_filename = &util::filename_within_collection($source_filename);
70	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71	$self->set_source_filename ($source_filename, $rename_method);
72	}
73
74	$self->set_doc_type ($doc_type) if defined $doc_type;
75
76	return $self;
77	}
78	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79	sub set_lastmodified {
80	my $self = shift (@_);
81
82	my $source_path = $self->{'source_path'};
83
84	if (defined $source_path && (-e $source_path)) {
85
86	my $file_stat = stat($source_path);
87	my $mtime = $file_stat->mtime;
88	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
89	$wday, $yday, $isdst) = localtime($mtime);
90
91	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
92
93	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
94	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
95	}
96	}
97
98	# clone the $self object
99	sub duplicate {
100	my $self = shift (@_);
101
102	my $newobj = {};
103
104	foreach my $k (keys %$self) {
105	$newobj->{$k} = &clone ($self->{$k});
106	}
107
108	bless $newobj, ref($self);
109	return $newobj;
110	}
111
112	sub clone {
113	my ($from) = @_;
114	my $type = ref ($from);
115
116	if ($type eq "HASH") {
117	my $to = {};
118	foreach my $key (keys %$from) {
119	$to->{$key} = &clone ($from->{$key});
120	}
121	return $to;
122	} elsif ($type eq "ARRAY") {
123	my $to = [];
124	foreach my $v (@$from) {
125	push (@$to, &clone ($v));
126	}
127	return $to;
128	} else {
129	return $from;
130	}
131	}
132
133	sub set_OIDtype {
134	my $self = shift (@_);
135	my ($type, $metadata) = @_;
136
137	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|incremental\|dirname\|assigned)$/) {
138	$self->{'OIDtype'} = $type;
139	} else {
140	$self->{'OIDtype'} = "hash";
141	}
142
143	if ($type =~ /^assigned$/) {
144	if (defined $metadata) {
145	$self->{'OIDmetadata'} = $metadata;
146	} else {
147	$self->{'OIDmetadata'} = "dc.Identifier";
148	}
149	}
150	}
151
152	# rename_method can be 'url', 'none', 'base64'
153	sub set_source_filename {
154	my $self = shift (@_);
155	my ($source_filename, $rename_method) = @_;
156
157	# Since the gsdlsourcefilename element goes into the doc.xml it has
158	# to be utf8. However, it should also represent the source filename
159	# (in the import directory) which may not be utf8 at all.
160	# For instance, if this meta element (gsdlsourcefilename) will be used
161	# by other applications that parse doc.xml in order to locate
162	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
163	# encode the real filename as this is a binary-to-text encoding meaning
164	# that the resulting string is ASCII (utf8). Decoding will give the original.
165
166	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
167
168	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
169	$source_filename = $self->encode_filename($source_filename, $rename_method);
170	# my ($srcfilename,$dirname,$suffix)
171	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
172	# print STDERR "-> $srcfilename -> ";
173	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
174	# $source_filename = &util::filename_cat($dirname, $srcfilename);
175	# print STDERR "$source_filename\n";
176
177	$self->set_utf8_metadata_element ($self->get_top_section(),
178	"gsdlsourcefilename",
179	$source_filename);
180	}
181
182	sub encode_filename {
183	my $self = shift (@_);
184	my ($source_filename, $rename_method) = @_;
185
186	my ($srcfilename,$dirname,$suffix)
187	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
188	# print STDERR "-> $srcfilename -> ";
189	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
190	$source_filename = &util::filename_cat($dirname, $srcfilename);
191
192	return $source_filename;
193	}
194
195	sub set_converted_filename {
196	my $self = shift (@_);
197	my ($converted_filename) = @_;
198
199	# we know the converted filename is utf8
200	$self->set_utf8_metadata_element ($self->get_top_section(),
201	"gsdlconvertedfilename",
202	$converted_filename);
203	}
204
205	# returns the source_filename as it was provided
206	sub get_unmodified_source_filename {
207	my $self = shift (@_);
208
209	return $self->{'source_path'};
210	}
211
212	# returns the source_filename with whatever rename_method was given
213	sub get_source_filename {
214	my $self = shift (@_);
215
216	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
217	}
218
219
220
221	# returns converted filename if available else returns source filename
222	sub get_filename_for_hashing {
223	my $self = shift (@_);
224
225	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
226
227	if (!defined $filename) {
228	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
229	# if NULPlug processed file, then don't give a filename
230	if (defined $plugin_name && $plugin_name eq "NULPlug") {
231	$filename = undef;
232	} else { # returns the URL encoded source filename!
233	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
234	}
235	}
236	return $filename;
237	}
238
239	sub set_doc_type {
240	my $self = shift (@_);
241	my ($doc_type) = @_;
242
243	$self->set_metadata_element ($self->get_top_section(),
244	"gsdldoctype",
245	$doc_type);
246	}
247
248	# returns the gsdldoctype as it was provided
249	# the default of "indexed_doc" is used if no document
250	# type was provided
251	sub get_doc_type {
252	my $self = shift (@_);
253
254	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
255	return $doc_type if (defined $doc_type);
256	return "indexed_doc";
257	}
258
259
260	# look up the reference to the a particular section
261	sub _lookup_section {
262	my $self = shift (@_);
263	my ($section) = @_;
264
265	my ($num);
266	my $sectionref = $self;
267
268	while (defined $section && $section ne "") {
269
270	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
271
272	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
273
274	$section = "" unless defined $section;
275
276
277	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
278	$sectionref = $sectionref->{'subsections'}->{$num};
279	} else {
280	return undef;
281	}
282	}
283
284	return $sectionref;
285	}
286
287	# calculate OID by hashing the contents of the document
288	sub _calc_OID {
289	my $self = shift (@_);
290	my ($filename) = @_;
291
292
293	my $osexe = &util::get_os_exe();
294
295	my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
296	$ENV{'GSDLOS'},"hashfile$osexe");
297
298	my $result = "NULL";
299
300
301	if (-e "$hashfile_exe") {
302	# $result = `\"$hashfile_exe\" \"$filename\"`;
303	# $result = `hashfile$osexe \"$filename\" 2>&1`;
304	$result = `hashfile$osexe \"$filename\"`;
305
306	($result) = $result =~ /:\s*([0-9a-f]+)/i;
307	} else {
308	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
309	}
310	return "HASH$result";
311	}
312
313	# methods dealing with OID, not groups of them.
314
315	# if $OID is not provided one is calculated
316	sub set_OID {
317	my $self = shift (@_);
318	my ($OID) = @_;
319
320	my $use_hash_oid = 0;
321	# if an OID wasn't provided calculate one
322	if (!defined $OID) {
323	$OID = "NULL";
324	if ($self->{'OIDtype'} =~ /^hash/) {
325	$use_hash_oid = 1;
326	} elsif ($self->{'OIDtype'} eq "incremental") {
327	$OID = "D" . $OIDcount;
328	$OIDcount ++;
329
330	} elsif ($self->{'OIDtype'} eq "dirname") {
331	$OID = 'J';
332	my $filename = $self->get_source_filename();
333	if (defined($filename)) { # && -e $filename) {
334	$OID = &File::Basename::dirname($filename);
335	if (defined $OID) {
336	$OID = 'J'.&File::Basename::basename($OID);
337	$OID = &util::tidy_up_oid($OID);
338	} else {
339	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
340	$use_hash_oid = 1;
341	}
342	} else {
343	print STDERR "Failed to find filename, generating hash id\n";
344	$use_hash_oid = 1;
345	}
346
347	} elsif ($self->{'OIDtype'} eq "assigned") {
348	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
349	if (defined $identifier && $identifier ne "") {
350	$OID = $identifier;
351	$OID = &util::tidy_up_oid($OID);
352	} else {
353	# need a hash id
354	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
355	$use_hash_oid = 1;
356	}
357
358	} else {
359	$use_hash_oid = 1;
360	}
361
362	if ($use_hash_oid) {
363	my $hash_on_file = 1;
364	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
365	$hash_on_file = 0;
366	}
367	if ($hash_on_file) {
368	# "hash" OID - feed file to hashfile.exe
369	my $filename = $self->get_filename_for_hashing();
370
371	# -z: don't want to hash on the file if it is zero size
372	if (defined($filename) && -e $filename && !-z $filename) {
373	$OID = $self->_calc_OID ($filename);
374	} else {
375	$hash_on_file = 0;
376	}
377	}
378	if (!$hash_on_file) {
379	my $filename = &util::get_tmp_filename();
380	if (!open (OUTFILE, ">:utf8", $filename)) {
381	print STDERR "doc::set_OID could not write to $filename\n";
382	} else {
383	my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
384	print OUTFILE $doc_text;
385	close (OUTFILE);
386	}
387	$OID = $self->_calc_OID ($filename);
388	&util::rm ($filename);
389	}
390	}
391	}
392	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
393	}
394
395	# this uses hashdoc (embedded c thingy) which is faster but still
396	# needs a little work to be suffiently stable
397	sub ___set_OID {
398	my $self = shift (@_);
399	my ($OID) = @_;
400
401	# if an OID wasn't provided then calculate hash value based on document
402	if (!defined $OID)
403	{
404	my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
405	my $hash_len = length($hash_text);
406
407	$OID = &hashdoc::buffer($hash_text,$hash_len);
408	}
409
410	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
411	}
412
413	# returns the OID for this document
414	sub get_OID {
415	my $self = shift (@_);
416	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
417	return $OID if (defined $OID);
418	return "NULL";
419	}
420
421	sub delete_OID {
422	my $self = shift (@_);
423
424	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
425	}
426
427
428	# methods for manipulating section names
429
430	# returns the name of the top-most section (the top
431	# level of the document
432	sub get_top_section {
433	my $self = shift (@_);
434
435	return "";
436	}
437
438	# returns a section
439	sub get_parent_section {
440	my $self = shift (@_);
441	my ($section) = @_;
442
443	$section =~ s/(^\|\.)\d+$//;
444
445	return $section;
446	}
447
448	# returns the first child section (or the end child
449	# if there isn't any)
450	sub get_begin_child {
451	my $self = shift (@_);
452	my ($section) = @_;
453
454	my $section_ptr = $self->_lookup_section($section);
455	return "" unless defined $section_ptr;
456
457	if (defined $section_ptr->{'subsection_order'}->[0]) {
458	return "$section.$section_ptr->{'subsection_order'}->[0]";
459	}
460
461	return $self->get_end_child ($section);
462	}
463
464	# returns the next child of a parent section
465	sub get_next_child {
466	my $self = shift (@_);
467	my ($section) = @_;
468
469	my $parent_section = $self->get_parent_section($section);
470	my $parent_section_ptr = $self->_lookup_section($parent_section);
471	return undef unless defined $parent_section_ptr;
472
473	my ($section_num) = $section =~ /(\d+)$/;
474	return undef unless defined $section_num;
475
476	my $i = 0;
477	my $section_order = $parent_section_ptr->{'subsection_order'};
478	while ($i < scalar(@$section_order)) {
479	last if $section_order->[$i] eq $section_num;
480	$i++;
481	}
482
483	$i++; # the next child
484	if ($i < scalar(@$section_order)) {
485	return $section_order->[$i] if $parent_section eq "";
486	return "$parent_section.$section_order->[$i]";
487	}
488
489	# no more sections in this level
490	return undef;
491	}
492
493	# returns a reference to a list of children
494	sub get_children {
495	my $self = shift (@_);
496	my ($section) = @_;
497
498	my $section_ptr = $self->_lookup_section($section);
499	return [] unless defined $section_ptr;
500
501	my @children = @{$section_ptr->{'subsection_order'}};
502
503	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
504	return \@children;
505	}
506
507	# returns the child section one past the last one (which
508	# is coded as "0")
509	sub get_end_child {
510	my $self = shift (@_);
511	my ($section) = @_;
512
513	return $section . ".0" unless $section eq "";
514	return "0";
515	}
516
517	# returns the next section in book order
518	sub get_next_section {
519	my $self = shift (@_);
520	my ($section) = @_;
521
522	return undef unless defined $section;
523
524	my $section_ptr = $self->_lookup_section($section);
525	return undef unless defined $section_ptr;
526
527	# first try to find first child
528	if (defined $section_ptr->{'subsection_order'}->[0]) {
529	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
530	return "$section.$section_ptr->{'subsection_order'}->[0]";
531	}
532
533	do {
534	# try to find sibling
535	my $next_child = $self->get_next_child ($section);
536	return $next_child if (defined $next_child);
537
538	# move up one level
539	$section = $self->get_parent_section ($section);
540	} while $section =~ /\d/;
541
542	return undef;
543	}
544
545	sub is_leaf_section {
546	my $self = shift (@_);
547	my ($section) = @_;
548
549	my $section_ptr = $self->_lookup_section($section);
550	return 1 unless defined $section_ptr;
551
552	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
553	}
554
555	# methods for dealing with sections
556
557	# returns the name of the inserted section
558	sub insert_section {
559	my $self = shift (@_);
560	my ($before_section) = @_;
561
562	# get the child to insert before and its parent section
563	my $parent_section = "";
564	my $before_child = "0";
565	my @before_section = split (/\./, $before_section);
566	if (scalar(@before_section) > 0) {
567	$before_child = pop (@before_section);
568	$parent_section = join (".", @before_section);
569	}
570
571	my $parent_section_ptr = $self->_lookup_section($parent_section);
572	if (!defined $parent_section_ptr) {
573	print STDERR "doc::insert_section couldn't find parent section " .
574	"$parent_section\n";
575	return;
576	}
577
578	# get the next section number
579	my $section_num = $parent_section_ptr->{'next_subsection'}++;
580
581	my $i = 0;
582	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
583	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
584	$i++;
585	}
586
587	# insert the section number into the order list
588	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
589
590	# add this section to the parent section
591	my $section_ptr = {'subsection_order'=>[],
592	'next_subsection'=>1,
593	'subsections'=>{},
594	'metadata'=>[],
595	'text'=>""};
596	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
597
598	# work out the full section number
599	my $section = $parent_section;
600	$section .= "." unless $section eq "";
601	$section .= $section_num;
602
603	return $section;
604	}
605
606	# creates a pre-named section
607	sub create_named_section {
608	my $self = shift (@_);
609	my ($mastersection) = @_;
610
611	my ($num);
612	my $section = $mastersection;
613	my $sectionref = $self;
614
615	while ($section ne "") {
616	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
617	$num =~ s/^0+(\d)/$1/; # remove leading 0s
618	$section = "" unless defined $section;
619
620	if (defined $num) {
621	if (!defined $sectionref->{'subsections'}->{$num}) {
622	push (@{$sectionref->{'subsection_order'}}, $num);
623	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
624	'next_subsection'=>1,
625	'subsections'=>{},
626	'metadata'=>[],
627	'text'=>""};
628	if ($num >= $sectionref->{'next_subsection'}) {
629	$sectionref->{'next_subsection'} = $num + 1;
630	}
631	}
632	$sectionref = $sectionref->{'subsections'}->{$num};
633
634	} else {
635	print STDERR "doc::create_named_section couldn't create section ";
636	print STDERR "$mastersection\n";
637	last;
638	}
639	}
640	}
641
642	# returns a reference to a list of subsections
643	sub list_subsections {
644	my $self = shift (@_);
645	my ($section) = @_;
646
647	my $section_ptr = $self->_lookup_section ($section);
648	if (!defined $section_ptr) {
649	print STDERR "doc::list_subsections couldn't find section $section\n";
650	return [];
651	}
652
653	return [@{$section_ptr->{'subsection_order'}}];
654	}
655
656	sub delete_section {
657	my $self = shift (@_);
658	my ($section) = @_;
659
660	# my $section_ptr = {'subsection_order'=>[],
661	# 'next_subsection'=>1,
662	# 'subsections'=>{},
663	# 'metadata'=>[],
664	# 'text'=>""};
665
666	# if this is the top section reset everything
667	if ($section eq "") {
668	$self->{'subsection_order'} = [];
669	$self->{'subsections'} = {};
670	$self->{'metadata'} = [];
671	$self->{'text'} = "";
672	return;
673	}
674
675	# find the parent of the section to delete
676	my $parent_section = "";
677	my $child = "0";
678	my @section = split (/\./, $section);
679	if (scalar(@section) > 0) {
680	$child = pop (@section);
681	$parent_section = join (".", @section);
682	}
683
684	my $parent_section_ptr = $self->_lookup_section($parent_section);
685	if (!defined $parent_section_ptr) {
686	print STDERR "doc::delete_section couldn't find parent section " .
687	"$parent_section\n";
688	return;
689	}
690
691	# remove this section from the subsection_order list
692	my $i = 0;
693	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
694	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
695	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
696	last;
697	}
698	$i++;
699	}
700
701	# remove this section from the subsection hash
702	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
703	undef $parent_section_ptr->{'subsections'}->{$child};
704	}
705	}
706
707	#--
708	# methods for dealing with metadata
709
710	# set_metadata_element and get_metadata_element are for metadata
711	# which should only have one value. add_meta_data and get_metadata
712	# are for metadata which can have more than one value.
713
714	# returns the first metadata value which matches field
715
716	# This version of get metadata element works much like the one above,
717	# except it allows for the namespace portion of a metadata element to
718	# be ignored, thus if you are searching for dc.Title, the first piece
719	# of matching metadata ending with the name Title (once any namespace
720	# is removed) would be returned.
721	# 28-11-2003 John Thompson
722	sub get_metadata_element {
723	my $self = shift (@_);
724	my ($section, $field, $ignore_namespace) = @_;
725	my ($data);
726
727	$ignore_namespace = 0 unless defined $ignore_namespace;
728
729	my $section_ptr = $self->_lookup_section($section);
730	if (!defined $section_ptr) {
731	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
732	return;
733	}
734
735	# Remove the namespace if we are being told to ignore them
736	if($ignore_namespace) {
737	$field =~ s/^\w*\.//;
738	}
739
740	foreach $data (@{$section_ptr->{'metadata'}}) {
741
742	my $data_name = $data->[0];
743
744	# Remove the any namespace if we are being told to ignore them
745	if($ignore_namespace) {
746	$data_name =~ s/^\w*\.//;
747	}
748	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
749	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
750	}
751
752	return undef; # was not found
753	}
754
755	# returns a list of the form [value1, value2, ...]
756	sub get_metadata {
757	my $self = shift (@_);
758	my ($section, $field, $ignore_namespace) = @_;
759	my ($data);
760
761	$ignore_namespace = 0 unless defined $ignore_namespace;
762
763	my $section_ptr = $self->_lookup_section($section);
764	if (!defined $section_ptr) {
765	print STDERR "doc::get_metadata couldn't find section ",
766	$section, "\n";
767	return;
768	}
769
770	# Remove the any namespace if we are being told to ignore them
771	if($ignore_namespace) {
772	$field =~ s/^\w*\.//;
773	}
774
775	my @metadata = ();
776	foreach $data (@{$section_ptr->{'metadata'}}) {
777
778	my $data_name = $data->[0];
779	# Remove the any namespace if we are being told to ignore them
780	if($ignore_namespace) {
781	$data_name =~ s/^\w*\.//;
782	}
783	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
784	push (@metadata, $data->[1]) if ($data_name eq $field);
785	}
786
787	return \@metadata;
788	}
789
790	# returns a list of the form [[field,value],[field,value],...]
791	sub get_all_metadata {
792	my $self = shift (@_);
793	my ($section) = @_;
794
795	my $section_ptr = $self->_lookup_section($section);
796	if (!defined $section_ptr) {
797	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
798	return;
799	}
800
801	return $section_ptr->{'metadata'};
802	}
803
804	# $value is optional
805	sub delete_metadata {
806	my $self = shift (@_);
807	my ($section, $field, $value) = @_;
808
809	my $section_ptr = $self->_lookup_section($section);
810	if (!defined $section_ptr) {
811	print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
812	return;
813	}
814
815	my $i = 0;
816	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
817	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
818	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
819	splice (@{$section_ptr->{'metadata'}}, $i, 1);
820	} else {
821	$i++;
822	}
823	}
824	}
825
826	sub delete_all_metadata {
827	my $self = shift (@_);
828	my ($section) = @_;
829
830	my $section_ptr = $self->_lookup_section($section);
831	if (!defined $section_ptr) {
832	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
833	return;
834	}
835
836	$section_ptr->{'metadata'} = [];
837	}
838
839	sub set_metadata_element {
840	my $self = shift (@_);
841	my ($section, $field, $value) = @_;
842
843	$self->set_utf8_metadata_element ($section, $field,
844	&unicode::ascii2utf8(\$value));
845	}
846
847	# set_utf8_metadata_element assumes the text has already been
848	# converted to the UTF-8 encoding.
849	sub set_utf8_metadata_element {
850	my $self = shift (@_);
851	my ($section, $field, $value) = @_;
852
853	$self->delete_metadata ($section, $field);
854	$self->add_utf8_metadata ($section, $field, $value);
855	}
856
857
858	# add_metadata assumes the text is in (extended) ascii form. For
859	# text which has already been converted to the UTF-8 format use
860	# add_utf8_metadata.
861	sub add_metadata {
862	my $self = shift (@_);
863	my ($section, $field, $value) = @_;
864
865	$self->add_utf8_metadata ($section, $field,
866	&unicode::ascii2utf8(\$value));
867	}
868
869	sub add_utf8_metadata {
870	my $self = shift (@_);
871	my ($section, $field, $value) = @_;
872
873	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
874	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
875	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
876
877	my $section_ptr = $self->_lookup_section($section);
878	if (!defined $section_ptr) {
879	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
880	return;
881	}
882	if (!defined $value) {
883	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
884	return;
885	}
886	if (!defined $field) {
887	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
888	return;
889	}
890
891	#print STDERR "###$field=$value\n";
892
893	# For now, supress this check. Given that text data read in is now
894	# Unicode aware, then the following block of code can (ironically enough)
895	# cause our unicode compliant string to be re-encoded (leading to
896	# a double-encoded UTF-8 string, which we definitely don't want!).
897
898
899	# double check that the value is utf-8
900	# if (!&unicode::check_is_utf8($value)) {
901	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
902	# &unicode::ensure_utf8(\$value);
903	# print STDERR " Tried converting to utf8: $value\n";
904	# }
905
906	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
907	}
908
909
910	# methods for dealing with text
911
912	# returns the text for a section
913	sub get_text {
914	my $self = shift (@_);
915	my ($section) = @_;
916
917	my $section_ptr = $self->_lookup_section($section);
918	if (!defined $section_ptr) {
919	print STDERR "doc::get_text couldn't find section " .
920	"$section\n";
921	return "";
922	}
923
924	return $section_ptr->{'text'};
925	}
926
927	# returns the (utf-8 encoded) length of the text for a section
928	sub get_text_length {
929	my $self = shift (@_);
930	my ($section) = @_;
931
932	my $section_ptr = $self->_lookup_section($section);
933	if (!defined $section_ptr) {
934	print STDERR "doc::get_text_length couldn't find section " .
935	"$section\n";
936	return 0;
937	}
938
939	return length ($section_ptr->{'text'});
940	}
941
942	# returns the total length for all the sections
943	sub get_total_text_length {
944	my $self = shift (@_);
945
946	my $section = $self->get_top_section();
947	my $length = 0;
948	while (defined $section) {
949	$length += $self->get_text_length($section);
950	$section = $self->get_next_section($section);
951	}
952	return $length;
953	}
954
955	sub delete_text {
956	my $self = shift (@_);
957	my ($section) = @_;
958
959	my $section_ptr = $self->_lookup_section($section);
960	if (!defined $section_ptr) {
961	print STDERR "doc::delete_text couldn't find section " .
962	"$section\n";
963	return;
964	}
965
966	$section_ptr->{'text'} = "";
967	}
968
969	# add_text assumes the text is in (extended) ascii form. For
970	# text which has been already converted to the UTF-8 format
971	# use add_utf8_text.
972	sub add_text {
973	my $self = shift (@_);
974	my ($section, $text) = @_;
975
976	# convert the text to UTF-8 encoded unicode characters
977	# and add the text
978	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
979	}
980
981
982	# add_utf8_text assumes the text to be added has already
983	# been converted to the UTF-8 encoding. For ascii text use
984	# add_text
985	sub add_utf8_text {
986	my $self = shift (@_);
987	my ($section, $text) = @_;
988
989	my $section_ptr = $self->_lookup_section($section);
990	if (!defined $section_ptr) {
991	print STDERR "doc::add_utf8_text couldn't find section " .
992	"$section\n";
993	return;
994	}
995
996	$section_ptr->{'text'} .= $text;
997	}
998
999	# returns the Source meta, which is the utf8 filename generated.
1000	# Added a separate method here for convenience
1001	sub get_source {
1002	my $self = shift (@_);
1003	return $self->get_metadata_element ($self->get_top_section(), "Source");
1004	}
1005
1006	# returns the SourceFile meta, which is the url reference to the URL-encoded
1007	# version of Source (the utf8 filename). Added a separate method here for convenience
1008	sub get_sourcefile {
1009	my $self = shift (@_);
1010	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1011	}
1012
1013	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1014	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1015	sub get_assocfile_from_sourcefile {
1016	my $self = shift (@_);
1017
1018	# get the SourceFile meta, which is a URL to a file on the filesystem
1019	my $top_section = $self->get_top_section();
1020	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1021
1022	# get the actual filename as it exists on the filesystem which this url refers to
1023	$source_file = &unicode::url_to_filename($source_file);
1024	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1025	return $assocfilename;
1026	}
1027
1028	# methods for dealing with associated files
1029
1030	# a file is associated with a document, NOT a section.
1031	# if section is defined it is noted in the data structure
1032	# only so that files associated from a particular section
1033	# may be removed later (using delete_section_assoc_files)
1034	sub associate_file {
1035	my $self = shift (@_);
1036	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1037	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1038
1039	# remove all associated files with the same name
1040	$self->delete_assoc_file ($assoc_filename);
1041
1042	push (@{$self->{'associated_files'}},
1043	[$real_filename, $assoc_filename, $mime_type, $section]);
1044	}
1045
1046	# returns a list of associated files in the form
1047	# [[real_filename, assoc_filename, mimetype], ...]
1048	sub get_assoc_files {
1049	my $self = shift (@_);
1050
1051	return $self->{'associated_files'};
1052	}
1053
1054	# the following two methods used to keep track of original associated files
1055	# for incremental building. eg a txt file used by an item file does not end
1056	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1057	# file for incremental build
1058	sub associate_source_file {
1059	my $self = shift (@_);
1060	my ($full_filename) = @_;
1061
1062	push (@{$self->{'source_assoc_files'}}, $full_filename);
1063
1064	}
1065
1066	sub get_source_assoc_files {
1067	my $self = shift (@_);
1068
1069	return $self->{'source_assoc_files'};
1070
1071
1072	}
1073	sub metadata_file {
1074	my $self = shift (@_);
1075	my ($real_filename, $filename) = @_;
1076
1077	push (@{$self->{'metadata_files'}},
1078	[$real_filename, $filename]);
1079	}
1080
1081	# used for writing out the archiveinf-doc info database, to list all the metadata files
1082	sub get_meta_files {
1083	my $self = shift (@_);
1084
1085	return $self->{'metadata_files'};
1086	}
1087
1088	sub delete_section_assoc_files {
1089	my $self = shift (@_);
1090	my ($section) = @_;
1091
1092	my $i=0;
1093	while ($i < scalar (@{$self->{'associated_files'}})) {
1094	if (defined $self->{'associated_files'}->[$i]->[3] &&
1095	$self->{'associated_files'}->[$i]->[3] eq $section) {
1096	splice (@{$self->{'associated_files'}}, $i, 1);
1097	} else {
1098	$i++;
1099	}
1100	}
1101	}
1102
1103	sub delete_assoc_file {
1104	my $self = shift (@_);
1105	my ($assoc_filename) = @_;
1106
1107	my $i=0;
1108	while ($i < scalar (@{$self->{'associated_files'}})) {
1109	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1110	splice (@{$self->{'associated_files'}}, $i, 1);
1111	} else {
1112	$i++;
1113	}
1114	}
1115	}
1116
1117	sub reset_nextsection_ptr {
1118	my $self = shift (@_);
1119	my ($section) = @_;
1120
1121	my $section_ptr = $self->_lookup_section($section);
1122	$section_ptr->{'next_subsection'} = 1;
1123	}
1124
1125	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: