Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 23827

Last change on this file since 23827 was 23827, checked in by sjm84, 13 years ago
Phase four of commiting the files changed to extend the DSpace exporting capabilities to include more than just dublin core metadata
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 33.5 KB

Line
1	###########################################################################
2	#
3	# doc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistr te it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# base class to hold documents
27
28	package doc;
29	eval {require bytes};
30
31	BEGIN {
32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34	}
35
36	use strict;
37	use unicode;
38	use util;
39	use ghtml;
40	use File::stat;
41	##use hashdoc;
42	use docprint;
43
44	# the document type may be indexed_doc, nonindexed_doc, or
45	# classification
46
47	our $OIDcount = 0;
48
49	# rename_method can be 'url', 'none', 'base64'
50	sub new {
51	my $class = shift (@_);
52	my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55	my $self = bless {'associated_files'=>[],
56	'subsection_order'=>[],
57	'next_subsection'=>1,
58	'subsections'=>{},
59	'metadata'=>[],
60	'text'=>"",
61	'OIDtype'=>"hash"}, $class;
62
63	# used to set lastmodified here, but this can screw up the HASH ids, so
64	# the docsave processor now calls set_lastmodified
65
66	$self->set_source_path($source_filename);
67
68	if (defined $source_filename) {
69	$source_filename = &util::filename_within_collection($source_filename);
70	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71	$self->set_source_filename ($source_filename, $rename_method);
72	}
73
74	$self->set_doc_type ($doc_type) if defined $doc_type;
75
76	return $self;
77	}
78
79
80	sub set_source_path
81	{
82	my $self = shift @_;
83	my ($source_filename) = @_;
84
85	if (defined $source_filename) {
86	# On Windows the source_filename can be in terse DOS format
87	# e.g. test~1.txt
88
89	$self->{'terse_source_path'} = $source_filename;
90
91	if (-e $source_filename) {
92	# See if we can do better for Windows with a filename
93	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94	require Win32;
95	$self->{'source_path'} = Win32::GetLongPathName($source_filename);
96	}
97	else {
98	# For Unix-based systems, there is no difference between the two
99	$self->{'source_path'} = $source_filename;
100	}
101	}
102	else {
103	print STDERR "Warning: In doc::set_source_path(), file\n";
104	print STDERR " $source_filename\n";
105	print STDERR " does not exist\n";
106
107	# (default) Set it to whatever we were given
108	$self->{'source_path'} = $source_filename;
109	}
110	}
111	else {
112	# Previous code for setting source_path allowed for
113	# it to be undefined, so continue this practice
114	$self->{'terse_source_path'} = undef;
115	$self->{'source_path'} = undef;
116	}
117	}
118
119
120	sub get_source_path
121	{
122	my $self = shift @_;
123
124	return $self->{'terse_source_path'};
125	}
126
127	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128	sub set_lastmodified {
129	my $self = shift (@_);
130
131	my $source_path = $self->{'terse_source_path'};
132
133	if (defined $source_path && (-e $source_path)) {
134
135	my $file_stat = stat($source_path);
136	my $mtime = $file_stat->mtime;
137	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
138	$wday, $yday, $isdst) = localtime($mtime);
139
140	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
141
142	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
143	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
144	}
145	}
146
147	# clone the $self object
148	sub duplicate {
149	my $self = shift (@_);
150
151	my $newobj = {};
152
153	foreach my $k (keys %$self) {
154	$newobj->{$k} = &clone ($self->{$k});
155	}
156
157	bless $newobj, ref($self);
158	return $newobj;
159	}
160
161	sub clone {
162	my ($from) = @_;
163	my $type = ref ($from);
164
165	if ($type eq "HASH") {
166	my $to = {};
167	foreach my $key (keys %$from) {
168	$to->{$key} = &clone ($from->{$key});
169	}
170	return $to;
171	} elsif ($type eq "ARRAY") {
172	my $to = [];
173	foreach my $v (@$from) {
174	push (@$to, &clone ($v));
175	}
176	return $to;
177	} else {
178	return $from;
179	}
180	}
181
182	sub set_OIDtype {
183	my $self = shift (@_);
184	my ($type, $metadata) = @_;
185
186	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|incremental\|dirname\|assigned)$/) {
187	$self->{'OIDtype'} = $type;
188	} else {
189	$self->{'OIDtype'} = "hash";
190	}
191
192	if ($type =~ /^assigned$/) {
193	if (defined $metadata) {
194	$self->{'OIDmetadata'} = $metadata;
195	} else {
196	$self->{'OIDmetadata'} = "dc.Identifier";
197	}
198	}
199	}
200
201	# rename_method can be 'url', 'none', 'base64'
202	sub set_source_filename {
203	my $self = shift (@_);
204	my ($source_filename, $rename_method) = @_;
205
206	# Since the gsdlsourcefilename element goes into the doc.xml it has
207	# to be utf8. However, it should also represent the source filename
208	# (in the import directory) which may not be utf8 at all.
209	# For instance, if this meta element (gsdlsourcefilename) will be used
210	# by other applications that parse doc.xml in order to locate
211	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
212	# encode the real filename as this is a binary-to-text encoding meaning
213	# that the resulting string is ASCII (utf8). Decoding will give the original.
214
215	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
216
217	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
218	$source_filename = $self->encode_filename($source_filename, $rename_method);
219	# my ($srcfilename,$dirname,$suffix)
220	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
221	# print STDERR "-> $srcfilename -> ";
222	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
223	# $source_filename = &util::filename_cat($dirname, $srcfilename);
224	# print STDERR "$source_filename\n";
225
226	$self->set_utf8_metadata_element ($self->get_top_section(),
227	"gsdlsourcefilename",
228	$source_filename);
229	}
230
231	sub encode_filename {
232	my $self = shift (@_);
233	my ($source_filename, $rename_method) = @_;
234
235	my ($srcfilename,$dirname,$suffix)
236	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
237	# print STDERR "-> $srcfilename -> ";
238	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
239	$source_filename = &util::filename_cat($dirname, $srcfilename);
240
241	return $source_filename;
242	}
243
244	sub set_converted_filename {
245	my $self = shift (@_);
246	my ($converted_filename) = @_;
247
248	# we know the converted filename is utf8
249	$self->set_utf8_metadata_element ($self->get_top_section(),
250	"gsdlconvertedfilename",
251	$converted_filename);
252	}
253
254	# returns the source_filename as it was provided
255	sub get_unmodified_source_filename {
256	my $self = shift (@_);
257
258	return $self->{'terse_source_path'};
259	}
260
261	# returns the source_filename with whatever rename_method was given
262	sub get_source_filename {
263	my $self = shift (@_);
264
265	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
266	}
267
268
269
270	# returns converted filename if available else returns source filename
271	sub get_filename_for_hashing {
272	my $self = shift (@_);
273
274	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
275
276	if (!defined $filename) {
277	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
278	# if NULPlug processed file, then don't give a filename
279	if (defined $plugin_name && $plugin_name eq "NULPlug") {
280	$filename = undef;
281	} else { # returns the URL encoded source filename!
282	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
283	}
284	}
285
286	if (!&util::filename_is_absolute($filename)) {
287	$filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
288	}
289
290	return $filename;
291	}
292
293	sub set_doc_type {
294	my $self = shift (@_);
295	my ($doc_type) = @_;
296
297	$self->set_metadata_element ($self->get_top_section(),
298	"gsdldoctype",
299	$doc_type);
300	}
301
302	# returns the gsdldoctype as it was provided
303	# the default of "indexed_doc" is used if no document
304	# type was provided
305	sub get_doc_type {
306	my $self = shift (@_);
307
308	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
309	return $doc_type if (defined $doc_type);
310	return "indexed_doc";
311	}
312
313
314	# look up the reference to the a particular section
315	sub _lookup_section {
316	my $self = shift (@_);
317	my ($section) = @_;
318
319	my ($num);
320	my $sectionref = $self;
321
322	while (defined $section && $section ne "") {
323
324	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
325
326	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
327
328	$section = "" unless defined $section;
329
330
331	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
332	$sectionref = $sectionref->{'subsections'}->{$num};
333	} else {
334	return undef;
335	}
336	}
337
338	return $sectionref;
339	}
340
341	# calculate OID by hashing the contents of the document
342	sub _calc_OID {
343	my $self = shift (@_);
344	my ($filename) = @_;
345
346
347	my $osexe = &util::get_os_exe();
348
349	my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
350	$ENV{'GSDLOS'},"hashfile$osexe");
351
352	my $result = "NULL";
353
354
355	if (-e "$hashfile_exe") {
356	# $result = `\"$hashfile_exe\" \"$filename\"`;
357	# $result = `hashfile$osexe \"$filename\" 2>&1`;
358	$result = `hashfile$osexe \"$filename\"`;
359
360	($result) = $result =~ /:\s*([0-9a-f]+)/i;
361	} else {
362	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
363	}
364	return "HASH$result";
365	}
366
367	# methods dealing with OID, not groups of them.
368
369	# if $OID is not provided one is calculated
370	sub set_OID {
371	my $self = shift (@_);
372	my ($OID) = @_;
373
374	my $use_hash_oid = 0;
375	# if an OID wasn't provided calculate one
376	if (!defined $OID) {
377	$OID = "NULL";
378	if ($self->{'OIDtype'} =~ /^hash/) {
379	$use_hash_oid = 1;
380	} elsif ($self->{'OIDtype'} eq "incremental") {
381	$OID = "D" . $OIDcount;
382	$OIDcount ++;
383
384	} elsif ($self->{'OIDtype'} eq "dirname") {
385	$OID = 'J';
386	my $filename = $self->get_source_filename();
387	if (defined($filename)) { # && -e $filename) {
388	$OID = &File::Basename::dirname($filename);
389	if (defined $OID) {
390	$OID = 'J'.&File::Basename::basename($OID);
391	$OID = &util::tidy_up_oid($OID);
392	} else {
393	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
394	$use_hash_oid = 1;
395	}
396	} else {
397	print STDERR "Failed to find filename, generating hash id\n";
398	$use_hash_oid = 1;
399	}
400
401	} elsif ($self->{'OIDtype'} eq "assigned") {
402	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
403	if (defined $identifier && $identifier ne "") {
404	$OID = $identifier;
405	$OID = &util::tidy_up_oid($OID);
406	} else {
407	# need a hash id
408	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
409	$use_hash_oid = 1;
410	}
411
412	} else {
413	$use_hash_oid = 1;
414	}
415
416	if ($use_hash_oid) {
417	my $hash_on_file = 1;
418	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
419	$hash_on_file = 0;
420	}
421	if ($hash_on_file) {
422	# "hash" OID - feed file to hashfile.exe
423	my $filename = $self->get_filename_for_hashing();
424
425	# -z: don't want to hash on the file if it is zero size
426	if (defined($filename) && -e $filename && !-z $filename) {
427	$OID = $self->_calc_OID ($filename);
428	} else {
429	$hash_on_file = 0;
430	}
431	}
432	if (!$hash_on_file) {
433	my $filename = &util::get_tmp_filename();
434	if (!open (OUTFILE, ">:utf8", $filename)) {
435	print STDERR "doc::set_OID could not write to $filename\n";
436	} else {
437	my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
438	print OUTFILE $doc_text;
439	close (OUTFILE);
440	}
441	$OID = $self->_calc_OID ($filename);
442	&util::rm ($filename);
443	}
444	}
445	}
446	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
447	}
448
449	# this uses hashdoc (embedded c thingy) which is faster but still
450	# needs a little work to be suffiently stable
451	sub ___set_OID {
452	my $self = shift (@_);
453	my ($OID) = @_;
454
455	# if an OID wasn't provided then calculate hash value based on document
456	if (!defined $OID)
457	{
458	my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
459	my $hash_len = length($hash_text);
460
461	$OID = &hashdoc::buffer($hash_text,$hash_len);
462	}
463
464	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
465	}
466
467	# returns the OID for this document
468	sub get_OID {
469	my $self = shift (@_);
470	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
471	return $OID if (defined $OID);
472	return "NULL";
473	}
474
475	sub delete_OID {
476	my $self = shift (@_);
477
478	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
479	}
480
481
482	# methods for manipulating section names
483
484	# returns the name of the top-most section (the top
485	# level of the document
486	sub get_top_section {
487	my $self = shift (@_);
488
489	return "";
490	}
491
492	# returns a section
493	sub get_parent_section {
494	my $self = shift (@_);
495	my ($section) = @_;
496
497	$section =~ s/(^\|\.)\d+$//;
498
499	return $section;
500	}
501
502	# returns the first child section (or the end child
503	# if there isn't any)
504	sub get_begin_child {
505	my $self = shift (@_);
506	my ($section) = @_;
507
508	my $section_ptr = $self->_lookup_section($section);
509	return "" unless defined $section_ptr;
510
511	if (defined $section_ptr->{'subsection_order'}->[0]) {
512	return "$section.$section_ptr->{'subsection_order'}->[0]";
513	}
514
515	return $self->get_end_child ($section);
516	}
517
518	# returns the next child of a parent section
519	sub get_next_child {
520	my $self = shift (@_);
521	my ($section) = @_;
522
523	my $parent_section = $self->get_parent_section($section);
524	my $parent_section_ptr = $self->_lookup_section($parent_section);
525	return undef unless defined $parent_section_ptr;
526
527	my ($section_num) = $section =~ /(\d+)$/;
528	return undef unless defined $section_num;
529
530	my $i = 0;
531	my $section_order = $parent_section_ptr->{'subsection_order'};
532	while ($i < scalar(@$section_order)) {
533	last if $section_order->[$i] eq $section_num;
534	$i++;
535	}
536
537	$i++; # the next child
538	if ($i < scalar(@$section_order)) {
539	return $section_order->[$i] if $parent_section eq "";
540	return "$parent_section.$section_order->[$i]";
541	}
542
543	# no more sections in this level
544	return undef;
545	}
546
547	# returns a reference to a list of children
548	sub get_children {
549	my $self = shift (@_);
550	my ($section) = @_;
551
552	my $section_ptr = $self->_lookup_section($section);
553	return [] unless defined $section_ptr;
554
555	my @children = @{$section_ptr->{'subsection_order'}};
556
557	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
558	return \@children;
559	}
560
561	# returns the child section one past the last one (which
562	# is coded as "0")
563	sub get_end_child {
564	my $self = shift (@_);
565	my ($section) = @_;
566
567	return $section . ".0" unless $section eq "";
568	return "0";
569	}
570
571	# returns the next section in book order
572	sub get_next_section {
573	my $self = shift (@_);
574	my ($section) = @_;
575
576	return undef unless defined $section;
577
578	my $section_ptr = $self->_lookup_section($section);
579	return undef unless defined $section_ptr;
580
581	# first try to find first child
582	if (defined $section_ptr->{'subsection_order'}->[0]) {
583	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
584	return "$section.$section_ptr->{'subsection_order'}->[0]";
585	}
586
587	do {
588	# try to find sibling
589	my $next_child = $self->get_next_child ($section);
590	return $next_child if (defined $next_child);
591
592	# move up one level
593	$section = $self->get_parent_section ($section);
594	} while $section =~ /\d/;
595
596	return undef;
597	}
598
599	sub is_leaf_section {
600	my $self = shift (@_);
601	my ($section) = @_;
602
603	my $section_ptr = $self->_lookup_section($section);
604	return 1 unless defined $section_ptr;
605
606	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
607	}
608
609	# methods for dealing with sections
610
611	# returns the name of the inserted section
612	sub insert_section {
613	my $self = shift (@_);
614	my ($before_section) = @_;
615
616	# get the child to insert before and its parent section
617	my $parent_section = "";
618	my $before_child = "0";
619	my @before_section = split (/\./, $before_section);
620	if (scalar(@before_section) > 0) {
621	$before_child = pop (@before_section);
622	$parent_section = join (".", @before_section);
623	}
624
625	my $parent_section_ptr = $self->_lookup_section($parent_section);
626	if (!defined $parent_section_ptr) {
627	print STDERR "doc::insert_section couldn't find parent section " .
628	"$parent_section\n";
629	return;
630	}
631
632	# get the next section number
633	my $section_num = $parent_section_ptr->{'next_subsection'}++;
634
635	my $i = 0;
636	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
637	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
638	$i++;
639	}
640
641	# insert the section number into the order list
642	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
643
644	# add this section to the parent section
645	my $section_ptr = {'subsection_order'=>[],
646	'next_subsection'=>1,
647	'subsections'=>{},
648	'metadata'=>[],
649	'text'=>""};
650	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
651
652	# work out the full section number
653	my $section = $parent_section;
654	$section .= "." unless $section eq "";
655	$section .= $section_num;
656
657	return $section;
658	}
659
660	# creates a pre-named section
661	sub create_named_section {
662	my $self = shift (@_);
663	my ($mastersection) = @_;
664
665	my ($num);
666	my $section = $mastersection;
667	my $sectionref = $self;
668
669	while ($section ne "") {
670	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
671	$num =~ s/^0+(\d)/$1/; # remove leading 0s
672	$section = "" unless defined $section;
673
674	if (defined $num) {
675	if (!defined $sectionref->{'subsections'}->{$num}) {
676	push (@{$sectionref->{'subsection_order'}}, $num);
677	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
678	'next_subsection'=>1,
679	'subsections'=>{},
680	'metadata'=>[],
681	'text'=>""};
682	if ($num >= $sectionref->{'next_subsection'}) {
683	$sectionref->{'next_subsection'} = $num + 1;
684	}
685	}
686	$sectionref = $sectionref->{'subsections'}->{$num};
687
688	} else {
689	print STDERR "doc::create_named_section couldn't create section ";
690	print STDERR "$mastersection\n";
691	last;
692	}
693	}
694	}
695
696	# returns a reference to a list of subsections
697	sub list_subsections {
698	my $self = shift (@_);
699	my ($section) = @_;
700
701	my $section_ptr = $self->_lookup_section ($section);
702	if (!defined $section_ptr) {
703	print STDERR "doc::list_subsections couldn't find section $section\n";
704	return [];
705	}
706
707	return [@{$section_ptr->{'subsection_order'}}];
708	}
709
710	sub delete_section {
711	my $self = shift (@_);
712	my ($section) = @_;
713
714	# my $section_ptr = {'subsection_order'=>[],
715	# 'next_subsection'=>1,
716	# 'subsections'=>{},
717	# 'metadata'=>[],
718	# 'text'=>""};
719
720	# if this is the top section reset everything
721	if ($section eq "") {
722	$self->{'subsection_order'} = [];
723	$self->{'subsections'} = {};
724	$self->{'metadata'} = [];
725	$self->{'text'} = "";
726	return;
727	}
728
729	# find the parent of the section to delete
730	my $parent_section = "";
731	my $child = "0";
732	my @section = split (/\./, $section);
733	if (scalar(@section) > 0) {
734	$child = pop (@section);
735	$parent_section = join (".", @section);
736	}
737
738	my $parent_section_ptr = $self->_lookup_section($parent_section);
739	if (!defined $parent_section_ptr) {
740	print STDERR "doc::delete_section couldn't find parent section " .
741	"$parent_section\n";
742	return;
743	}
744
745	# remove this section from the subsection_order list
746	my $i = 0;
747	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
748	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
749	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
750	last;
751	}
752	$i++;
753	}
754
755	# remove this section from the subsection hash
756	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
757	undef $parent_section_ptr->{'subsections'}->{$child};
758	}
759	}
760
761	#--
762	# methods for dealing with metadata
763
764	# set_metadata_element and get_metadata_element are for metadata
765	# which should only have one value. add_meta_data and get_metadata
766	# are for metadata which can have more than one value.
767
768	# returns the first metadata value which matches field
769
770	# This version of get metadata element works much like the one above,
771	# except it allows for the namespace portion of a metadata element to
772	# be ignored, thus if you are searching for dc.Title, the first piece
773	# of matching metadata ending with the name Title (once any namespace
774	# is removed) would be returned.
775	# 28-11-2003 John Thompson
776	sub get_metadata_element {
777	my $self = shift (@_);
778	my ($section, $field, $ignore_namespace) = @_;
779	my ($data);
780
781	$ignore_namespace = 0 unless defined $ignore_namespace;
782
783	my $section_ptr = $self->_lookup_section($section);
784	if (!defined $section_ptr) {
785	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
786	return;
787	}
788
789	# Remove the namespace if we are being told to ignore them
790	if($ignore_namespace) {
791	$field =~ s/^\w*\.//;
792	}
793
794	foreach $data (@{$section_ptr->{'metadata'}}) {
795
796	my $data_name = $data->[0];
797
798	# Remove the any namespace if we are being told to ignore them
799	if($ignore_namespace) {
800	$data_name =~ s/^\w*\.//;
801	}
802	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
803	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
804	}
805
806	return undef; # was not found
807	}
808
809	# returns a list of the form [value1, value2, ...]
810	sub get_metadata {
811	my $self = shift (@_);
812	my ($section, $field, $ignore_namespace) = @_;
813	my ($data);
814
815	$ignore_namespace = 0 unless defined $ignore_namespace;
816
817	my $section_ptr = $self->_lookup_section($section);
818	if (!defined $section_ptr) {
819	print STDERR "doc::get_metadata couldn't find section ",
820	$section, "\n";
821	return;
822	}
823
824	# Remove the any namespace if we are being told to ignore them
825	if($ignore_namespace) {
826	$field =~ s/^\w*\.//;
827	}
828
829	my @metadata = ();
830	foreach $data (@{$section_ptr->{'metadata'}}) {
831
832	my $data_name = $data->[0];
833	# Remove the any namespace if we are being told to ignore them
834	if($ignore_namespace) {
835	$data_name =~ s/^\w*\.//;
836	}
837	$data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
838	push (@metadata, $data->[1]) if ($data_name eq $field);
839	}
840
841	return \@metadata;
842	}
843
844	sub get_metadata_hashmap {
845	my $self = shift (@_);
846	my ($section, $opt_namespace) = @_;
847
848	my $section_ptr = $self->_lookup_section($section);
849	if (!defined $section_ptr) {
850	print STDERR "doc::get_metadata couldn't find section ",
851	$section, "\n";
852	return;
853	}
854
855	my $metadata_hashmap = {};
856	foreach my $data (@{$section_ptr->{'metadata'}}) {
857	my $metaname = $data->[0];
858
859	if ((!defined $opt_namespace) \|\| ($metaname =~ m/^$opt_namespace\./)) {
860	if (!defined $metadata_hashmap->{$metaname}) {
861	$metadata_hashmap->{$metaname} = [];
862	}
863	my $metaval_list = $metadata_hashmap->{$metaname};
864	push(@$metaval_list, $data->[1]);
865	}
866	}
867
868	return $metadata_hashmap;
869	}
870
871	# returns a list of the form [[field,value],[field,value],...]
872	sub get_all_metadata {
873	my $self = shift (@_);
874	my ($section) = @_;
875
876	my $section_ptr = $self->_lookup_section($section);
877	if (!defined $section_ptr) {
878	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
879	return;
880	}
881
882	return $section_ptr->{'metadata'};
883	}
884
885	# $value is optional
886	sub delete_metadata {
887	my $self = shift (@_);
888	my ($section, $field, $value) = @_;
889
890	my $section_ptr = $self->_lookup_section($section);
891	if (!defined $section_ptr) {
892	print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
893	return;
894	}
895
896	my $i = 0;
897	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
898	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
899	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
900	splice (@{$section_ptr->{'metadata'}}, $i, 1);
901	} else {
902	$i++;
903	}
904	}
905	}
906
907	sub delete_all_metadata {
908	my $self = shift (@_);
909	my ($section) = @_;
910
911	my $section_ptr = $self->_lookup_section($section);
912	if (!defined $section_ptr) {
913	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
914	return;
915	}
916
917	$section_ptr->{'metadata'} = [];
918	}
919
920	sub set_metadata_element {
921	my $self = shift (@_);
922	my ($section, $field, $value) = @_;
923
924	$self->set_utf8_metadata_element ($section, $field,
925	&unicode::ascii2utf8(\$value));
926	}
927
928	# set_utf8_metadata_element assumes the text has already been
929	# converted to the UTF-8 encoding.
930	sub set_utf8_metadata_element {
931	my $self = shift (@_);
932	my ($section, $field, $value) = @_;
933
934	$self->delete_metadata ($section, $field);
935	$self->add_utf8_metadata ($section, $field, $value);
936	}
937
938
939	# add_metadata assumes the text is in (extended) ascii form. For
940	# text which has already been converted to the UTF-8 format use
941	# add_utf8_metadata.
942	sub add_metadata {
943	my $self = shift (@_);
944	my ($section, $field, $value) = @_;
945
946	$self->add_utf8_metadata ($section, $field,
947	&unicode::ascii2utf8(\$value));
948	}
949
950	sub add_utf8_metadata {
951	my $self = shift (@_);
952	my ($section, $field, $value) = @_;
953
954	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
955	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
956	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
957
958	my $section_ptr = $self->_lookup_section($section);
959	if (!defined $section_ptr) {
960	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
961	return;
962	}
963	if (!defined $value) {
964	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
965	return;
966	}
967	if (!defined $field) {
968	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
969	return;
970	}
971
972	#print STDERR "###$field=$value\n";
973
974	# For now, supress this check. Given that text data read in is now
975	# Unicode aware, then the following block of code can (ironically enough)
976	# cause our unicode compliant string to be re-encoded (leading to
977	# a double-encoded UTF-8 string, which we definitely don't want!).
978
979
980	# double check that the value is utf-8
981	# if (!&unicode::check_is_utf8($value)) {
982	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
983	# &unicode::ensure_utf8(\$value);
984	# print STDERR " Tried converting to utf8: $value\n";
985	# }
986
987	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
988	}
989
990
991	# methods for dealing with text
992
993	# returns the text for a section
994	sub get_text {
995	my $self = shift (@_);
996	my ($section) = @_;
997
998	my $section_ptr = $self->_lookup_section($section);
999	if (!defined $section_ptr) {
1000	print STDERR "doc::get_text couldn't find section " .
1001	"$section\n";
1002	return "";
1003	}
1004
1005	return $section_ptr->{'text'};
1006	}
1007
1008	# returns the (utf-8 encoded) length of the text for a section
1009	sub get_text_length {
1010	my $self = shift (@_);
1011	my ($section) = @_;
1012
1013	my $section_ptr = $self->_lookup_section($section);
1014	if (!defined $section_ptr) {
1015	print STDERR "doc::get_text_length couldn't find section " .
1016	"$section\n";
1017	return 0;
1018	}
1019
1020	return length ($section_ptr->{'text'});
1021	}
1022
1023	# returns the total length for all the sections
1024	sub get_total_text_length {
1025	my $self = shift (@_);
1026
1027	my $section = $self->get_top_section();
1028	my $length = 0;
1029	while (defined $section) {
1030	$length += $self->get_text_length($section);
1031	$section = $self->get_next_section($section);
1032	}
1033	return $length;
1034	}
1035
1036	sub delete_text {
1037	my $self = shift (@_);
1038	my ($section) = @_;
1039
1040	my $section_ptr = $self->_lookup_section($section);
1041	if (!defined $section_ptr) {
1042	print STDERR "doc::delete_text couldn't find section " .
1043	"$section\n";
1044	return;
1045	}
1046
1047	$section_ptr->{'text'} = "";
1048	}
1049
1050	# add_text assumes the text is in (extended) ascii form. For
1051	# text which has been already converted to the UTF-8 format
1052	# use add_utf8_text.
1053	sub add_text {
1054	my $self = shift (@_);
1055	my ($section, $text) = @_;
1056
1057	# convert the text to UTF-8 encoded unicode characters
1058	# and add the text
1059	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1060	}
1061
1062
1063	# add_utf8_text assumes the text to be added has already
1064	# been converted to the UTF-8 encoding. For ascii text use
1065	# add_text
1066	sub add_utf8_text {
1067	my $self = shift (@_);
1068	my ($section, $text) = @_;
1069
1070	my $section_ptr = $self->_lookup_section($section);
1071	if (!defined $section_ptr) {
1072	print STDERR "doc::add_utf8_text couldn't find section " .
1073	"$section\n";
1074	return;
1075	}
1076
1077	$section_ptr->{'text'} .= $text;
1078	}
1079
1080	# returns the Source meta, which is the utf8 filename generated.
1081	# Added a separate method here for convenience
1082	sub get_source {
1083	my $self = shift (@_);
1084	return $self->get_metadata_element ($self->get_top_section(), "Source");
1085	}
1086
1087	# returns the SourceFile meta, which is the url reference to the URL-encoded
1088	# version of Source (the utf8 filename). Added a separate method here for convenience
1089	sub get_sourcefile {
1090	my $self = shift (@_);
1091	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1092	}
1093
1094	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1095	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1096	sub get_assocfile_from_sourcefile {
1097	my $self = shift (@_);
1098
1099	# get the SourceFile meta, which is a URL to a file on the filesystem
1100	my $top_section = $self->get_top_section();
1101	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1102
1103	# get the actual filename as it exists on the filesystem which this url refers to
1104	$source_file = &unicode::url_to_filename($source_file);
1105	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1106	return $assocfilename;
1107	}
1108
1109	# methods for dealing with associated files
1110
1111	# a file is associated with a document, NOT a section.
1112	# if section is defined it is noted in the data structure
1113	# only so that files associated from a particular section
1114	# may be removed later (using delete_section_assoc_files)
1115	sub associate_file {
1116	my $self = shift (@_);
1117	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1118	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1119
1120	# remove all associated files with the same name
1121	$self->delete_assoc_file ($assoc_filename);
1122
1123	# Too harsh a requirement
1124	# Definitely get HTML docs, for example, with some missing
1125	# support files
1126	# if (!&util::fd_exists($real_filename)) {
1127	# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1128	# exit -1;
1129	# }
1130
1131	# print STDERR "**** is the following a UTF8 rep of real filename?\n $real_filename\n";
1132	# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1133	## my $utf8_filename = Encode::encode("utf8",$filename);
1134
1135	push (@{$self->{'associated_files'}},
1136	[$real_filename, $assoc_filename, $mime_type, $section]);
1137	}
1138
1139	# returns a list of associated files in the form
1140	# [[real_filename, assoc_filename, mimetype], ...]
1141	sub get_assoc_files {
1142	my $self = shift (@_);
1143
1144	return $self->{'associated_files'};
1145	}
1146
1147	# the following two methods used to keep track of original associated files
1148	# for incremental building. eg a txt file used by an item file does not end
1149	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1150	# file for incremental build
1151	sub associate_source_file {
1152	my $self = shift (@_);
1153	my ($full_filename) = @_;
1154
1155	push (@{$self->{'source_assoc_files'}}, $full_filename);
1156
1157	}
1158
1159	sub get_source_assoc_files {
1160	my $self = shift (@_);
1161
1162	return $self->{'source_assoc_files'};
1163
1164
1165	}
1166	sub metadata_file {
1167	my $self = shift (@_);
1168	my ($real_filename, $filename) = @_;
1169
1170	push (@{$self->{'metadata_files'}},
1171	[$real_filename, $filename]);
1172	}
1173
1174	# used for writing out the archiveinf-doc info database, to list all the metadata files
1175	sub get_meta_files {
1176	my $self = shift (@_);
1177
1178	return $self->{'metadata_files'};
1179	}
1180
1181	sub delete_section_assoc_files {
1182	my $self = shift (@_);
1183	my ($section) = @_;
1184
1185	my $i=0;
1186	while ($i < scalar (@{$self->{'associated_files'}})) {
1187	if (defined $self->{'associated_files'}->[$i]->[3] &&
1188	$self->{'associated_files'}->[$i]->[3] eq $section) {
1189	splice (@{$self->{'associated_files'}}, $i, 1);
1190	} else {
1191	$i++;
1192	}
1193	}
1194	}
1195
1196	sub delete_assoc_file {
1197	my $self = shift (@_);
1198	my ($assoc_filename) = @_;
1199
1200	my $i=0;
1201	while ($i < scalar (@{$self->{'associated_files'}})) {
1202	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1203	splice (@{$self->{'associated_files'}}, $i, 1);
1204	} else {
1205	$i++;
1206	}
1207	}
1208	}
1209
1210	sub reset_nextsection_ptr {
1211	my $self = shift (@_);
1212	my ($section) = @_;
1213
1214	my $section_ptr = $self->_lookup_section($section);
1215	$section_ptr->{'next_subsection'} = 1;
1216	}
1217
1218	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: