Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 22950

Last change on this file since 22950 was 22950, checked in by davidb, 14 years ago
Old routine used to work on raw binary strings that just happened to be UTF-8 compliant. Now strings are (in the Perl sense) Unicode aware, we don't want to check for them being valid utf8. In fact, things can start to go wrong if we do
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 30.1 KB

Line
1	###########################################################################
2	#
3	# doc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistr te it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# base class to hold documents
27
28	package doc;
29	eval {require bytes};
30
31	BEGIN {
32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34	}
35
36	use strict;
37	use unicode;
38	use util;
39	use ghtml;
40	use File::stat;
41	##use hashdoc;
42	use docprint;
43
44	# the document type may be indexed_doc, nonindexed_doc, or
45	# classification
46
47	our $OIDcount = 0;
48
49	# rename_method can be 'url', 'none', 'base64'
50	sub new {
51	my $class = shift (@_);
52	my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55	my $self = bless {'associated_files'=>[],
56	'subsection_order'=>[],
57	'next_subsection'=>1,
58	'subsections'=>{},
59	'metadata'=>[],
60	'text'=>"",
61	'OIDtype'=>"hash"}, $class;
62
63	# used to set lastmodified here, but this can screw up the HASH ids, so
64	# the docsave processor now calls set_lastmodified
65
66	$self->{'source_path'} = $source_filename;
67
68	if (defined $source_filename) {
69	$source_filename = &util::filename_within_collection($source_filename);
70	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71	$self->set_source_filename ($source_filename, $rename_method);
72	}
73
74	$self->set_doc_type ($doc_type) if defined $doc_type;
75
76	return $self;
77	}
78	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79	sub set_lastmodified {
80	my $self = shift (@_);
81
82	my $source_path = $self->{'source_path'};
83
84	if (defined $source_path && (-e $source_path)) {
85
86	my $file_stat = stat($source_path);
87	my $mtime = $file_stat->mtime;
88	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
89	$wday, $yday, $isdst) = localtime($mtime);
90
91	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
92
93	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
94	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
95	}
96	}
97
98	# clone the $self object
99	sub duplicate {
100	my $self = shift (@_);
101
102	my $newobj = {};
103
104	foreach my $k (keys %$self) {
105	$newobj->{$k} = &clone ($self->{$k});
106	}
107
108	bless $newobj, ref($self);
109	return $newobj;
110	}
111
112	sub clone {
113	my ($from) = @_;
114	my $type = ref ($from);
115
116	if ($type eq "HASH") {
117	my $to = {};
118	foreach my $key (keys %$from) {
119	$to->{$key} = &clone ($from->{$key});
120	}
121	return $to;
122	} elsif ($type eq "ARRAY") {
123	my $to = [];
124	foreach my $v (@$from) {
125	push (@$to, &clone ($v));
126	}
127	return $to;
128	} else {
129	return $from;
130	}
131	}
132
133	sub set_OIDtype {
134	my $self = shift (@_);
135	my ($type, $metadata) = @_;
136
137	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|incremental\|dirname\|assigned)$/) {
138	$self->{'OIDtype'} = $type;
139	} else {
140	$self->{'OIDtype'} = "hash";
141	}
142
143	if ($type =~ /^assigned$/) {
144	if (defined $metadata) {
145	$self->{'OIDmetadata'} = $metadata;
146	} else {
147	$self->{'OIDmetadata'} = "dc.Identifier";
148	}
149	}
150	}
151
152	# rename_method can be 'url', 'none', 'base64'
153	sub set_source_filename {
154	my $self = shift (@_);
155	my ($source_filename, $rename_method) = @_;
156
157	# Since the gsdlsourcefilename element goes into the doc.xml it has
158	# to be utf8. However, it should also represent the source filename
159	# (in the import directory) which may not be utf8 at all.
160	# For instance, if this meta element (gsdlsourcefilename) will be used
161	# by other applications that parse doc.xml in order to locate
162	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
163	# encode the real filename as this is a binary-to-text encoding meaning
164	# that the resulting string is ASCII (utf8). Decoding will give the original.
165
166	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
167
168	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
169	my ($srcfilename,$dirname,$suffix)
170	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
171	# print STDERR "-> $srcfilename -> ";
172	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
173	$source_filename = &util::filename_cat($dirname, $srcfilename);
174	# print STDERR "$source_filename\n";
175
176	$self->set_utf8_metadata_element ($self->get_top_section(),
177	"gsdlsourcefilename",
178	$source_filename);
179	}
180
181	sub set_converted_filename {
182	my $self = shift (@_);
183	my ($converted_filename) = @_;
184
185	# we know the converted filename is utf8
186	$self->set_utf8_metadata_element ($self->get_top_section(),
187	"gsdlconvertedfilename",
188	$converted_filename);
189	}
190
191	# returns the source_filename as it was provided
192	sub get_unmodified_source_filename {
193	my $self = shift (@_);
194
195	return $self->{'source_path'};
196	}
197
198	# returns the source_filename with whatever rename_method was given
199	sub get_source_filename {
200	my $self = shift (@_);
201
202	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203	}
204
205
206
207	# returns converted filename if available else returns source filename
208	sub get_filename_for_hashing {
209	my $self = shift (@_);
210
211	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
212
213	if (!defined $filename) {
214	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
215	# if NULPlug processed file, then don't give a filename
216	if (defined $plugin_name && $plugin_name eq "NULPlug") {
217	$filename = undef;
218	} else { # returns the URL encoded source filename!
219	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
220	}
221	}
222	return $filename;
223	}
224
225	sub set_doc_type {
226	my $self = shift (@_);
227	my ($doc_type) = @_;
228
229	$self->set_metadata_element ($self->get_top_section(),
230	"gsdldoctype",
231	$doc_type);
232	}
233
234	# returns the gsdldoctype as it was provided
235	# the default of "indexed_doc" is used if no document
236	# type was provided
237	sub get_doc_type {
238	my $self = shift (@_);
239
240	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
241	return $doc_type if (defined $doc_type);
242	return "indexed_doc";
243	}
244
245
246	# look up the reference to the a particular section
247	sub _lookup_section {
248	my $self = shift (@_);
249	my ($section) = @_;
250
251	my ($num);
252	my $sectionref = $self;
253
254	while (defined $section && $section ne "") {
255
256	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
257
258	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
259
260	$section = "" unless defined $section;
261
262
263	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
264	$sectionref = $sectionref->{'subsections'}->{$num};
265	} else {
266	return undef;
267	}
268	}
269
270	return $sectionref;
271	}
272
273	# calculate OID by hashing the contents of the document
274	sub _calc_OID {
275	my $self = shift (@_);
276	my ($filename) = @_;
277
278
279	my $osexe = &util::get_os_exe();
280
281	my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
282	$ENV{'GSDLOS'},"hashfile$osexe");
283
284	my $result = "NULL";
285
286
287	if (-e "$hashfile_exe") {
288	# $result = `\"$hashfile_exe\" \"$filename\"`;
289	# $result = `hashfile$osexe \"$filename\" 2>&1`;
290	$result = `hashfile$osexe \"$filename\"`;
291
292	($result) = $result =~ /:\s*([0-9a-f]+)/i;
293	} else {
294	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
295	}
296	return "HASH$result";
297	}
298
299	# methods dealing with OID, not groups of them.
300
301	# if $OID is not provided one is calculated
302	sub set_OID {
303	my $self = shift (@_);
304	my ($OID) = @_;
305
306	my $use_hash_oid = 0;
307	# if an OID wasn't provided calculate one
308	if (!defined $OID) {
309	$OID = "NULL";
310	if ($self->{'OIDtype'} =~ /^hash/) {
311	$use_hash_oid = 1;
312	} elsif ($self->{'OIDtype'} eq "incremental") {
313	$OID = "D" . $OIDcount;
314	$OIDcount ++;
315
316	} elsif ($self->{'OIDtype'} eq "dirname") {
317	$OID = 'J';
318	my $filename = $self->get_source_filename();
319	if (defined($filename)) { # && -e $filename) {
320	$OID = &File::Basename::dirname($filename);
321	if (defined $OID) {
322	$OID = 'J'.&File::Basename::basename($OID);
323	$OID = &util::tidy_up_oid($OID);
324	} else {
325	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
326	$use_hash_oid = 1;
327	}
328	} else {
329	print STDERR "Failed to find filename, generating hash id\n";
330	$use_hash_oid = 1;
331	}
332
333	} elsif ($self->{'OIDtype'} eq "assigned") {
334	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
335	if (defined $identifier && $identifier ne "") {
336	$OID = $identifier;
337	$OID = &util::tidy_up_oid($OID);
338	} else {
339	# need a hash id
340	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
341	$use_hash_oid = 1;
342	}
343
344	} else {
345	$use_hash_oid = 1;
346	}
347
348	if ($use_hash_oid) {
349	my $hash_on_file = 1;
350	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
351	$hash_on_file = 0;
352	}
353	if ($hash_on_file) {
354	# "hash" OID - feed file to hashfile.exe
355	my $filename = $self->get_filename_for_hashing();
356
357	# -z: don't want to hash on the file if it is zero size
358	if (defined($filename) && -e $filename && !-z $filename) {
359	$OID = $self->_calc_OID ($filename);
360	} else {
361	$hash_on_file = 0;
362	}
363	}
364	if (!$hash_on_file) {
365	my $filename = &util::get_tmp_filename();
366	if (!open (OUTFILE, ">:utf8", $filename)) {
367	print STDERR "doc::set_OID could not write to $filename\n";
368	} else {
369	my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
370	print OUTFILE $doc_text;
371	close (OUTFILE);
372	}
373	$OID = $self->_calc_OID ($filename);
374	&util::rm ($filename);
375	}
376	}
377	}
378	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
379	}
380
381	# this uses hashdoc (embedded c thingy) which is faster but still
382	# needs a little work to be suffiently stable
383	sub ___set_OID {
384	my $self = shift (@_);
385	my ($OID) = @_;
386
387	# if an OID wasn't provided then calculate hash value based on document
388	if (!defined $OID)
389	{
390	my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
391	my $hash_len = length($hash_text);
392
393	$OID = &hashdoc::buffer($hash_text,$hash_len);
394	}
395
396	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
397	}
398
399	# returns the OID for this document
400	sub get_OID {
401	my $self = shift (@_);
402	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
403	return $OID if (defined $OID);
404	return "NULL";
405	}
406
407	sub delete_OID {
408	my $self = shift (@_);
409
410	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
411	}
412
413
414	# methods for manipulating section names
415
416	# returns the name of the top-most section (the top
417	# level of the document
418	sub get_top_section {
419	my $self = shift (@_);
420
421	return "";
422	}
423
424	# returns a section
425	sub get_parent_section {
426	my $self = shift (@_);
427	my ($section) = @_;
428
429	$section =~ s/(^\|\.)\d+$//;
430
431	return $section;
432	}
433
434	# returns the first child section (or the end child
435	# if there isn't any)
436	sub get_begin_child {
437	my $self = shift (@_);
438	my ($section) = @_;
439
440	my $section_ptr = $self->_lookup_section($section);
441	return "" unless defined $section_ptr;
442
443	if (defined $section_ptr->{'subsection_order'}->[0]) {
444	return "$section.$section_ptr->{'subsection_order'}->[0]";
445	}
446
447	return $self->get_end_child ($section);
448	}
449
450	# returns the next child of a parent section
451	sub get_next_child {
452	my $self = shift (@_);
453	my ($section) = @_;
454
455	my $parent_section = $self->get_parent_section($section);
456	my $parent_section_ptr = $self->_lookup_section($parent_section);
457	return undef unless defined $parent_section_ptr;
458
459	my ($section_num) = $section =~ /(\d+)$/;
460	return undef unless defined $section_num;
461
462	my $i = 0;
463	my $section_order = $parent_section_ptr->{'subsection_order'};
464	while ($i < scalar(@$section_order)) {
465	last if $section_order->[$i] eq $section_num;
466	$i++;
467	}
468
469	$i++; # the next child
470	if ($i < scalar(@$section_order)) {
471	return $section_order->[$i] if $parent_section eq "";
472	return "$parent_section.$section_order->[$i]";
473	}
474
475	# no more sections in this level
476	return undef;
477	}
478
479	# returns a reference to a list of children
480	sub get_children {
481	my $self = shift (@_);
482	my ($section) = @_;
483
484	my $section_ptr = $self->_lookup_section($section);
485	return [] unless defined $section_ptr;
486
487	my @children = @{$section_ptr->{'subsection_order'}};
488
489	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
490	return \@children;
491	}
492
493	# returns the child section one past the last one (which
494	# is coded as "0")
495	sub get_end_child {
496	my $self = shift (@_);
497	my ($section) = @_;
498
499	return $section . ".0" unless $section eq "";
500	return "0";
501	}
502
503	# returns the next section in book order
504	sub get_next_section {
505	my $self = shift (@_);
506	my ($section) = @_;
507
508	return undef unless defined $section;
509
510	my $section_ptr = $self->_lookup_section($section);
511	return undef unless defined $section_ptr;
512
513	# first try to find first child
514	if (defined $section_ptr->{'subsection_order'}->[0]) {
515	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
516	return "$section.$section_ptr->{'subsection_order'}->[0]";
517	}
518
519	do {
520	# try to find sibling
521	my $next_child = $self->get_next_child ($section);
522	return $next_child if (defined $next_child);
523
524	# move up one level
525	$section = $self->get_parent_section ($section);
526	} while $section =~ /\d/;
527
528	return undef;
529	}
530
531	sub is_leaf_section {
532	my $self = shift (@_);
533	my ($section) = @_;
534
535	my $section_ptr = $self->_lookup_section($section);
536	return 1 unless defined $section_ptr;
537
538	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
539	}
540
541	# methods for dealing with sections
542
543	# returns the name of the inserted section
544	sub insert_section {
545	my $self = shift (@_);
546	my ($before_section) = @_;
547
548	# get the child to insert before and its parent section
549	my $parent_section = "";
550	my $before_child = "0";
551	my @before_section = split (/\./, $before_section);
552	if (scalar(@before_section) > 0) {
553	$before_child = pop (@before_section);
554	$parent_section = join (".", @before_section);
555	}
556
557	my $parent_section_ptr = $self->_lookup_section($parent_section);
558	if (!defined $parent_section_ptr) {
559	print STDERR "doc::insert_section couldn't find parent section " .
560	"$parent_section\n";
561	return;
562	}
563
564	# get the next section number
565	my $section_num = $parent_section_ptr->{'next_subsection'}++;
566
567	my $i = 0;
568	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
569	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
570	$i++;
571	}
572
573	# insert the section number into the order list
574	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
575
576	# add this section to the parent section
577	my $section_ptr = {'subsection_order'=>[],
578	'next_subsection'=>1,
579	'subsections'=>{},
580	'metadata'=>[],
581	'text'=>""};
582	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
583
584	# work out the full section number
585	my $section = $parent_section;
586	$section .= "." unless $section eq "";
587	$section .= $section_num;
588
589	return $section;
590	}
591
592	# creates a pre-named section
593	sub create_named_section {
594	my $self = shift (@_);
595	my ($mastersection) = @_;
596
597	my ($num);
598	my $section = $mastersection;
599	my $sectionref = $self;
600
601	while ($section ne "") {
602	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
603	$num =~ s/^0+(\d)/$1/; # remove leading 0s
604	$section = "" unless defined $section;
605
606	if (defined $num) {
607	if (!defined $sectionref->{'subsections'}->{$num}) {
608	push (@{$sectionref->{'subsection_order'}}, $num);
609	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
610	'next_subsection'=>1,
611	'subsections'=>{},
612	'metadata'=>[],
613	'text'=>""};
614	if ($num >= $sectionref->{'next_subsection'}) {
615	$sectionref->{'next_subsection'} = $num + 1;
616	}
617	}
618	$sectionref = $sectionref->{'subsections'}->{$num};
619
620	} else {
621	print STDERR "doc::create_named_section couldn't create section ";
622	print STDERR "$mastersection\n";
623	last;
624	}
625	}
626	}
627
628	# returns a reference to a list of subsections
629	sub list_subsections {
630	my $self = shift (@_);
631	my ($section) = @_;
632
633	my $section_ptr = $self->_lookup_section ($section);
634	if (!defined $section_ptr) {
635	print STDERR "doc::list_subsections couldn't find section $section\n";
636	return [];
637	}
638
639	return [@{$section_ptr->{'subsection_order'}}];
640	}
641
642	sub delete_section {
643	my $self = shift (@_);
644	my ($section) = @_;
645
646	# my $section_ptr = {'subsection_order'=>[],
647	# 'next_subsection'=>1,
648	# 'subsections'=>{},
649	# 'metadata'=>[],
650	# 'text'=>""};
651
652	# if this is the top section reset everything
653	if ($section eq "") {
654	$self->{'subsection_order'} = [];
655	$self->{'subsections'} = {};
656	$self->{'metadata'} = [];
657	$self->{'text'} = "";
658	return;
659	}
660
661	# find the parent of the section to delete
662	my $parent_section = "";
663	my $child = "0";
664	my @section = split (/\./, $section);
665	if (scalar(@section) > 0) {
666	$child = pop (@section);
667	$parent_section = join (".", @section);
668	}
669
670	my $parent_section_ptr = $self->_lookup_section($parent_section);
671	if (!defined $parent_section_ptr) {
672	print STDERR "doc::delete_section couldn't find parent section " .
673	"$parent_section\n";
674	return;
675	}
676
677	# remove this section from the subsection_order list
678	my $i = 0;
679	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
680	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
681	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
682	last;
683	}
684	$i++;
685	}
686
687	# remove this section from the subsection hash
688	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
689	undef $parent_section_ptr->{'subsections'}->{$child};
690	}
691	}
692
693	#--
694	# methods for dealing with metadata
695
696	# set_metadata_element and get_metadata_element are for metadata
697	# which should only have one value. add_meta_data and get_metadata
698	# are for metadata which can have more than one value.
699
700	# returns the first metadata value which matches field
701
702	# This version of get metadata element works much like the one above,
703	# except it allows for the namespace portion of a metadata element to
704	# be ignored, thus if you are searching for dc.Title, the first piece
705	# of matching metadata ending with the name Title (once any namespace
706	# is removed) would be returned.
707	# 28-11-2003 John Thompson
708	sub get_metadata_element {
709	my $self = shift (@_);
710	my ($section, $field, $ignore_namespace) = @_;
711	my ($data);
712
713	$ignore_namespace = 0 unless defined $ignore_namespace;
714
715	my $section_ptr = $self->_lookup_section($section);
716	if (!defined $section_ptr) {
717	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
718	return;
719	}
720
721	# Remove the namespace if we are being told to ignore them
722	if($ignore_namespace) {
723	$field =~ s/^\w*\.//;
724	}
725
726	foreach $data (@{$section_ptr->{'metadata'}}) {
727
728	my $data_name = $data->[0];
729
730	# Remove the any namespace if we are being told to ignore them
731	if($ignore_namespace) {
732	$data_name =~ s/^\w*\.//;
733	}
734	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
735	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
736	}
737
738	return undef; # was not found
739	}
740
741	# returns a list of the form [value1, value2, ...]
742	sub get_metadata {
743	my $self = shift (@_);
744	my ($section, $field, $ignore_namespace) = @_;
745	my ($data);
746
747	$ignore_namespace = 0 unless defined $ignore_namespace;
748
749	my $section_ptr = $self->_lookup_section($section);
750	if (!defined $section_ptr) {
751	print STDERR "doc::get_metadata couldn't find section ",
752	$section, "\n";
753	return;
754	}
755
756	# Remove the any namespace if we are being told to ignore them
757	if($ignore_namespace) {
758	$field =~ s/^\w*\.//;
759	}
760
761	my @metadata = ();
762	foreach $data (@{$section_ptr->{'metadata'}}) {
763
764	my $data_name = $data->[0];
765	# Remove the any namespace if we are being told to ignore them
766	if($ignore_namespace) {
767	$data_name =~ s/^\w*\.//;
768	}
769	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
770	push (@metadata, $data->[1]) if ($data_name eq $field);
771	}
772
773	return \@metadata;
774	}
775
776	# returns a list of the form [[field,value],[field,value],...]
777	sub get_all_metadata {
778	my $self = shift (@_);
779	my ($section) = @_;
780
781	my $section_ptr = $self->_lookup_section($section);
782	if (!defined $section_ptr) {
783	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
784	return;
785	}
786
787	return $section_ptr->{'metadata'};
788	}
789
790	# $value is optional
791	sub delete_metadata {
792	my $self = shift (@_);
793	my ($section, $field, $value) = @_;
794
795	my $section_ptr = $self->_lookup_section($section);
796	if (!defined $section_ptr) {
797	print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
798	return;
799	}
800
801	my $i = 0;
802	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
803	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
804	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
805	splice (@{$section_ptr->{'metadata'}}, $i, 1);
806	} else {
807	$i++;
808	}
809	}
810	}
811
812	sub delete_all_metadata {
813	my $self = shift (@_);
814	my ($section) = @_;
815
816	my $section_ptr = $self->_lookup_section($section);
817	if (!defined $section_ptr) {
818	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
819	return;
820	}
821
822	$section_ptr->{'metadata'} = [];
823	}
824
825	sub set_metadata_element {
826	my $self = shift (@_);
827	my ($section, $field, $value) = @_;
828
829	$self->set_utf8_metadata_element ($section, $field,
830	&unicode::ascii2utf8(\$value));
831	}
832
833	# set_utf8_metadata_element assumes the text has already been
834	# converted to the UTF-8 encoding.
835	sub set_utf8_metadata_element {
836	my $self = shift (@_);
837	my ($section, $field, $value) = @_;
838
839	$self->delete_metadata ($section, $field);
840	$self->add_utf8_metadata ($section, $field, $value);
841	}
842
843
844	# add_metadata assumes the text is in (extended) ascii form. For
845	# text which has already been converted to the UTF-8 format use
846	# add_utf8_metadata.
847	sub add_metadata {
848	my $self = shift (@_);
849	my ($section, $field, $value) = @_;
850
851	$self->add_utf8_metadata ($section, $field,
852	&unicode::ascii2utf8(\$value));
853	}
854
855	sub add_utf8_metadata {
856	my $self = shift (@_);
857	my ($section, $field, $value) = @_;
858
859	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
860	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
861	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
862
863	my $section_ptr = $self->_lookup_section($section);
864	if (!defined $section_ptr) {
865	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
866	return;
867	}
868	if (!defined $value) {
869	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
870	return;
871	}
872	if (!defined $field) {
873	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
874	return;
875	}
876
877	#print STDERR "###$field=$value\n";
878
879	# For now, supress this check. Given that text data read in is now
880	# Unicode aware, then the following block of code can (ironically enough)
881	# cause our unicode compliant string to be re-encoded (leading to
882	# a double-encoded UTF-8 string, which we definitely don't want!).
883
884
885	# double check that the value is utf-8
886	# if (!&unicode::check_is_utf8($value)) {
887	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
888	# &unicode::ensure_utf8(\$value);
889	# print STDERR " Tried converting to utf8: $value\n";
890	# }
891
892	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
893	}
894
895
896	# methods for dealing with text
897
898	# returns the text for a section
899	sub get_text {
900	my $self = shift (@_);
901	my ($section) = @_;
902
903	my $section_ptr = $self->_lookup_section($section);
904	if (!defined $section_ptr) {
905	print STDERR "doc::get_text couldn't find section " .
906	"$section\n";
907	return "";
908	}
909
910	return $section_ptr->{'text'};
911	}
912
913	# returns the (utf-8 encoded) length of the text for a section
914	sub get_text_length {
915	my $self = shift (@_);
916	my ($section) = @_;
917
918	my $section_ptr = $self->_lookup_section($section);
919	if (!defined $section_ptr) {
920	print STDERR "doc::get_text_length couldn't find section " .
921	"$section\n";
922	return 0;
923	}
924
925	return length ($section_ptr->{'text'});
926	}
927
928	sub delete_text {
929	my $self = shift (@_);
930	my ($section) = @_;
931
932	my $section_ptr = $self->_lookup_section($section);
933	if (!defined $section_ptr) {
934	print STDERR "doc::delete_text couldn't find section " .
935	"$section\n";
936	return;
937	}
938
939	$section_ptr->{'text'} = "";
940	}
941
942	# add_text assumes the text is in (extended) ascii form. For
943	# text which has been already converted to the UTF-8 format
944	# use add_utf8_text.
945	sub add_text {
946	my $self = shift (@_);
947	my ($section, $text) = @_;
948
949	# convert the text to UTF-8 encoded unicode characters
950	# and add the text
951	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
952	}
953
954
955	# add_utf8_text assumes the text to be added has already
956	# been converted to the UTF-8 encoding. For ascii text use
957	# add_text
958	sub add_utf8_text {
959	my $self = shift (@_);
960	my ($section, $text) = @_;
961
962	my $section_ptr = $self->_lookup_section($section);
963	if (!defined $section_ptr) {
964	print STDERR "doc::add_utf8_text couldn't find section " .
965	"$section\n";
966	return;
967	}
968
969	$section_ptr->{'text'} .= $text;
970	}
971
972	# returns the Source meta, which is the utf8 filename generated.
973	# Added a separate method here for convenience
974	sub get_source {
975	my $self = shift (@_);
976	return $self->get_metadata_element ($self->get_top_section(), "Source");
977	}
978
979	# returns the SourceFile meta, which is the url reference to the URL-encoded
980	# version of Source (the utf8 filename). Added a separate method here for convenience
981	sub get_sourcefile {
982	my $self = shift (@_);
983	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
984	}
985
986	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
987	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
988	sub get_assocfile_from_sourcefile {
989	my $self = shift (@_);
990
991	# get the SourceFile meta, which is a URL to a file on the filesystem
992	my $top_section = $self->get_top_section();
993	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
994
995	# get the actual filename as it exists on the filesystem which this url refers to
996	$source_file = &unicode::url_to_filename($source_file);
997	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
998	return $assocfilename;
999	}
1000
1001	# methods for dealing with associated files
1002
1003	# a file is associated with a document, NOT a section.
1004	# if section is defined it is noted in the data structure
1005	# only so that files associated from a particular section
1006	# may be removed later (using delete_section_assoc_files)
1007	sub associate_file {
1008	my $self = shift (@_);
1009	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1010	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1011
1012	# remove all associated files with the same name
1013	$self->delete_assoc_file ($assoc_filename);
1014
1015	push (@{$self->{'associated_files'}},
1016	[$real_filename, $assoc_filename, $mime_type, $section]);
1017	}
1018
1019	# returns a list of associated files in the form
1020	# [[real_filename, assoc_filename, mimetype], ...]
1021	sub get_assoc_files {
1022	my $self = shift (@_);
1023
1024	return $self->{'associated_files'};
1025	}
1026
1027	# the following two methods used to keep track of original associated files
1028	# for incremental building. eg a txt file used by an item file does not end
1029	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1030	# file for incremental build
1031	sub associate_source_file {
1032	my $self = shift (@_);
1033	my ($full_filename) = @_;
1034
1035	push (@{$self->{'source_assoc_files'}}, $full_filename);
1036
1037	}
1038
1039	sub get_source_assoc_files {
1040	my $self = shift (@_);
1041
1042	return $self->{'source_assoc_files'};
1043
1044
1045	}
1046	sub metadata_file {
1047	my $self = shift (@_);
1048	my ($real_filename, $filename) = @_;
1049
1050	push (@{$self->{'metadata_files'}},
1051	[$real_filename, $filename]);
1052	}
1053
1054	# used for writing out the archiveinf-doc info database, to list all the metadata files
1055	sub get_meta_files {
1056	my $self = shift (@_);
1057
1058	return $self->{'metadata_files'};
1059	}
1060
1061	sub delete_section_assoc_files {
1062	my $self = shift (@_);
1063	my ($section) = @_;
1064
1065	my $i=0;
1066	while ($i < scalar (@{$self->{'associated_files'}})) {
1067	if (defined $self->{'associated_files'}->[$i]->[3] &&
1068	$self->{'associated_files'}->[$i]->[3] eq $section) {
1069	splice (@{$self->{'associated_files'}}, $i, 1);
1070	} else {
1071	$i++;
1072	}
1073	}
1074	}
1075
1076	sub delete_assoc_file {
1077	my $self = shift (@_);
1078	my ($assoc_filename) = @_;
1079
1080	my $i=0;
1081	while ($i < scalar (@{$self->{'associated_files'}})) {
1082	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1083	splice (@{$self->{'associated_files'}}, $i, 1);
1084	} else {
1085	$i++;
1086	}
1087	}
1088	}
1089
1090	sub reset_nextsection_ptr {
1091	my $self = shift (@_);
1092	my ($section) = @_;
1093
1094	my $section_ptr = $self->_lookup_section($section);
1095	$section_ptr->{'next_subsection'} = 1;
1096	}
1097
1098	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: