Context Navigation

source: gs2-extensions/parallel-building/trunk/src/perllib/doc.pm@ 26958

Last change on this file since 26958 was 26958, checked in by jmt12, 11 years ago
Using the util library instead of built-in perl file tests
File size: 34.5 KB

Line
1	###########################################################################
2	#
3	# doc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistr te it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# base class to hold documents
27
28	package doc;
29	eval {require bytes};
30
31	BEGIN {
32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33	# unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34	}
35
36	use strict;
37	use unicode;
38	use util;
39	use ghtml;
40	use File::stat;
41	##use hashdoc;
42	use docprint;
43
44	# the document type may be indexed_doc, nonindexed_doc, or
45	# classification
46
47	our $OIDcount = 0;
48
49	# rename_method can be 'url', 'none', 'base64'
50	sub new {
51	my $class = shift (@_);
52	my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55	my $self = bless {'associated_files'=>[],
56	'subsection_order'=>[],
57	'next_subsection'=>1,
58	'subsections'=>{},
59	'metadata'=>[],
60	'text'=>"",
61	'OIDtype'=>"hash"}, $class;
62
63	# used to set lastmodified here, but this can screw up the HASH ids, so
64	# the docsave processor now calls set_lastmodified
65
66	$self->set_source_path($source_filename);
67
68	if (defined $source_filename) {
69	$source_filename = &util::filename_within_collection($source_filename);
70	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71	$self->set_source_filename ($source_filename, $rename_method);
72	}
73
74	$self->set_doc_type ($doc_type) if defined $doc_type;
75
76	return $self;
77	}
78
79
80	sub set_source_path
81	{
82	my $self = shift @_;
83	my ($source_filename) = @_;
84
85	if (defined $source_filename) {
86	# On Windows the source_filename can be in terse DOS format
87	# e.g. test~1.txt
88
89	$self->{'terse_source_path'} = $source_filename;
90
91	# Use the Util library methods as they are aware of more special cases
92	# such as HDFS [jmt12]
93	if (&util::file_exists($source_filename)) {
94	# See if we can do better for Windows with a filename
95	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
96	require Win32;
97	$self->{'source_path'} = Win32::GetLongPathName($source_filename);
98	}
99	else {
100	# For Unix-based systems, there is no difference between the two
101	$self->{'source_path'} = $source_filename;
102	}
103	}
104	else {
105	print STDERR "Warning: In doc::set_source_path(), file\n";
106	print STDERR " $source_filename\n";
107	print STDERR " does not exist\n";
108
109	# (default) Set it to whatever we were given
110	$self->{'source_path'} = $source_filename;
111	}
112	}
113	else {
114	# Previous code for setting source_path allowed for
115	# it to be undefined, so continue this practice
116	$self->{'terse_source_path'} = undef;
117	$self->{'source_path'} = undef;
118	}
119	}
120
121
122	sub get_source_path
123	{
124	my $self = shift @_;
125
126	return $self->{'terse_source_path'};
127	}
128
129	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
130	sub set_oailastmodified {
131	my $self = shift (@_);
132
133	my $source_path = $self->{'terse_source_path'};
134
135	if (defined $source_path && (-e $source_path)) {
136	my $current_time = time;
137
138	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
139	$wday, $yday, $isdst) = localtime($current_time);
140
141	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
142
143	$self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
144	$self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
145	}
146	}
147
148	# no longer used for OAI purposes, since lastmodified is not what we want as the
149	# Datestamp of a document. This doc metadata may be useful for general purposes.
150	sub set_lastmodified {
151	my $self = shift (@_);
152
153	my $source_path = $self->{'terse_source_path'};
154
155	if (defined $source_path && (-e $source_path)) {
156
157	my $file_stat = stat($source_path);
158	my $mtime = $file_stat->mtime;
159	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
160	$wday, $yday, $isdst) = localtime($mtime);
161
162	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
163
164	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
165	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
166	}
167	}
168
169	# clone the $self object
170	sub duplicate {
171	my $self = shift (@_);
172
173	my $newobj = {};
174
175	foreach my $k (keys %$self) {
176	$newobj->{$k} = &clone ($self->{$k});
177	}
178
179	bless $newobj, ref($self);
180	return $newobj;
181	}
182
183	sub clone {
184	my ($from) = @_;
185	my $type = ref ($from);
186
187	if ($type eq "HASH") {
188	my $to = {};
189	foreach my $key (keys %$from) {
190	$to->{$key} = &clone ($from->{$key});
191	}
192	return $to;
193	} elsif ($type eq "ARRAY") {
194	my $to = [];
195	foreach my $v (@$from) {
196	push (@$to, &clone ($v));
197	}
198	return $to;
199	} else {
200	return $from;
201	}
202	}
203
204	sub set_OIDtype {
205	my $self = shift (@_);
206	my ($type, $metadata) = @_;
207
208	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|incremental\|dirname\|assigned)$/) {
209	$self->{'OIDtype'} = $type;
210	} else {
211	$self->{'OIDtype'} = "hash";
212	}
213
214	if ($type =~ /^assigned$/) {
215	if (defined $metadata) {
216	$self->{'OIDmetadata'} = $metadata;
217	} else {
218	$self->{'OIDmetadata'} = "dc.Identifier";
219	}
220	}
221	}
222
223	# rename_method can be 'url', 'none', 'base64'
224	sub set_source_filename {
225	my $self = shift (@_);
226	my ($source_filename, $rename_method) = @_;
227
228	# Since the gsdlsourcefilename element goes into the doc.xml it has
229	# to be utf8. However, it should also represent the source filename
230	# (in the import directory) which may not be utf8 at all.
231	# For instance, if this meta element (gsdlsourcefilename) will be used
232	# by other applications that parse doc.xml in order to locate
233	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
234	# encode the real filename as this is a binary-to-text encoding meaning
235	# that the resulting string is ASCII (utf8). Decoding will give the original.
236
237	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
238
239	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
240	$source_filename = $self->encode_filename($source_filename, $rename_method);
241	# my ($srcfilename,$dirname,$suffix)
242	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
243	# print STDERR "-> $srcfilename -> ";
244	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
245	# $source_filename = &util::filename_cat($dirname, $srcfilename);
246	# print STDERR "$source_filename\n";
247
248	$self->set_utf8_metadata_element ($self->get_top_section(),
249	"gsdlsourcefilename",
250	$source_filename);
251	}
252
253	sub encode_filename {
254	my $self = shift (@_);
255	my ($source_filename, $rename_method) = @_;
256
257	my ($srcfilename,$dirname,$suffix)
258	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
259	# print STDERR "-> $srcfilename -> ";
260	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
261	$source_filename = &util::filename_cat($dirname, $srcfilename);
262
263	return $source_filename;
264	}
265
266	sub set_converted_filename {
267	my $self = shift (@_);
268	my ($converted_filename) = @_;
269
270	# we know the converted filename is utf8
271	$self->set_utf8_metadata_element ($self->get_top_section(),
272	"gsdlconvertedfilename",
273	$converted_filename);
274	}
275
276	# returns the source_filename as it was provided
277	sub get_unmodified_source_filename {
278	my $self = shift (@_);
279
280	return $self->{'terse_source_path'};
281	}
282
283	# returns the source_filename with whatever rename_method was given
284	sub get_source_filename {
285	my $self = shift (@_);
286
287	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
288	}
289
290
291
292	# returns converted filename if available else returns source filename
293	sub get_filename_for_hashing {
294	my $self = shift (@_);
295
296	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
297
298	if (!defined $filename) {
299	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
300	# if NULPlug processed file, then don't give a filename
301	if (defined $plugin_name && $plugin_name eq "NULPlug") {
302	$filename = undef;
303	} else { # returns the URL encoded source filename!
304	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
305	}
306	}
307
308	if (!&util::filename_is_absolute($filename)) {
309	$filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
310	}
311
312	return $filename;
313	}
314
315	sub set_doc_type {
316	my $self = shift (@_);
317	my ($doc_type) = @_;
318
319	$self->set_metadata_element ($self->get_top_section(),
320	"gsdldoctype",
321	$doc_type);
322	}
323
324	# returns the gsdldoctype as it was provided
325	# the default of "indexed_doc" is used if no document
326	# type was provided
327	sub get_doc_type {
328	my $self = shift (@_);
329
330	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
331	return $doc_type if (defined $doc_type);
332	return "indexed_doc";
333	}
334
335
336	# look up the reference to the a particular section
337	sub _lookup_section {
338	my $self = shift (@_);
339	my ($section) = @_;
340
341	my ($num);
342	my $sectionref = $self;
343
344	while (defined $section && $section ne "") {
345
346	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
347
348	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
349
350	$section = "" unless defined $section;
351
352
353	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
354	$sectionref = $sectionref->{'subsections'}->{$num};
355	} else {
356	return undef;
357	}
358	}
359
360	return $sectionref;
361	}
362
363	# calculate OID by hashing the contents of the document
364	sub _calc_OID {
365	my $self = shift (@_);
366	my ($filename) = @_;
367
368
369	my $osexe = &util::get_os_exe();
370
371	my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
372	$ENV{'GSDLOS'},"hashfile$osexe");
373
374	my $result = "NULL";
375
376
377	if (-e "$hashfile_exe") {
378	# $result = `\"$hashfile_exe\" \"$filename\"`;
379	# $result = `hashfile$osexe \"$filename\" 2>&1`;
380	$result = `hashfile$osexe \"$filename\"`;
381
382	($result) = $result =~ /:\s*([0-9a-f]+)/i;
383	} else {
384	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
385	}
386	return "HASH$result";
387	}
388
389	# methods dealing with OID, not groups of them.
390
391	# if $OID is not provided one is calculated
392	sub set_OID {
393	my $self = shift (@_);
394	my ($OID) = @_;
395
396	my $use_hash_oid = 0;
397	# if an OID wasn't provided calculate one
398	if (!defined $OID) {
399	$OID = "NULL";
400	if ($self->{'OIDtype'} =~ /^hash/) {
401	$use_hash_oid = 1;
402	} elsif ($self->{'OIDtype'} eq "incremental") {
403	$OID = "D" . $OIDcount;
404	$OIDcount ++;
405
406	} elsif ($self->{'OIDtype'} eq "dirname") {
407	$OID = 'J';
408	my $filename = $self->get_source_filename();
409	if (defined($filename)) { # && -e $filename) {
410	$OID = &File::Basename::dirname($filename);
411	if (defined $OID) {
412	$OID = 'J'.&File::Basename::basename($OID);
413	$OID = &util::tidy_up_oid($OID);
414	} else {
415	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
416	$use_hash_oid = 1;
417	}
418	} else {
419	print STDERR "Failed to find filename, generating hash id\n";
420	$use_hash_oid = 1;
421	}
422
423	} elsif ($self->{'OIDtype'} eq "assigned") {
424	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
425	if (defined $identifier && $identifier ne "") {
426	$OID = $identifier;
427	$OID = &util::tidy_up_oid($OID);
428	} else {
429	# need a hash id
430	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
431	$use_hash_oid = 1;
432	}
433
434	} else {
435	$use_hash_oid = 1;
436	}
437
438	if ($use_hash_oid) {
439	my $hash_on_file = 1;
440	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
441	$hash_on_file = 0;
442	}
443	if ($hash_on_file) {
444	# "hash" OID - feed file to hashfile.exe
445	my $filename = $self->get_filename_for_hashing();
446
447	# -z: don't want to hash on the file if it is zero size
448	if (defined($filename) && -e $filename && !-z $filename) {
449	$OID = $self->_calc_OID ($filename);
450	} else {
451	$hash_on_file = 0;
452	}
453	}
454	if (!$hash_on_file) {
455	my $filename = &util::get_tmp_filename();
456	if (!open (OUTFILE, ">:utf8", $filename)) {
457	print STDERR "doc::set_OID could not write to $filename\n";
458	} else {
459	my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
460	print OUTFILE $doc_text;
461	close (OUTFILE);
462	}
463	$OID = $self->_calc_OID ($filename);
464	&util::rm ($filename);
465	}
466	}
467	}
468	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
469	}
470
471	# this uses hashdoc (embedded c thingy) which is faster but still
472	# needs a little work to be suffiently stable
473	sub ___set_OID {
474	my $self = shift (@_);
475	my ($OID) = @_;
476
477	# if an OID wasn't provided then calculate hash value based on document
478	if (!defined $OID)
479	{
480	my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
481	my $hash_len = length($hash_text);
482
483	$OID = &hashdoc::buffer($hash_text,$hash_len);
484	}
485
486	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
487	}
488
489	# returns the OID for this document
490	sub get_OID {
491	my $self = shift (@_);
492	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
493	return $OID if (defined $OID);
494	return "NULL";
495	}
496
497	sub delete_OID {
498	my $self = shift (@_);
499
500	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
501	}
502
503
504	# methods for manipulating section names
505
506	# returns the name of the top-most section (the top
507	# level of the document
508	sub get_top_section {
509	my $self = shift (@_);
510
511	return "";
512	}
513
514	# returns a section
515	sub get_parent_section {
516	my $self = shift (@_);
517	my ($section) = @_;
518
519	$section =~ s/(^\|\.)\d+$//;
520
521	return $section;
522	}
523
524	# returns the first child section (or the end child
525	# if there isn't any)
526	sub get_begin_child {
527	my $self = shift (@_);
528	my ($section) = @_;
529
530	my $section_ptr = $self->_lookup_section($section);
531	return "" unless defined $section_ptr;
532
533	if (defined $section_ptr->{'subsection_order'}->[0]) {
534	return "$section.$section_ptr->{'subsection_order'}->[0]";
535	}
536
537	return $self->get_end_child ($section);
538	}
539
540	# returns the next child of a parent section
541	sub get_next_child {
542	my $self = shift (@_);
543	my ($section) = @_;
544
545	my $parent_section = $self->get_parent_section($section);
546	my $parent_section_ptr = $self->_lookup_section($parent_section);
547	return undef unless defined $parent_section_ptr;
548
549	my ($section_num) = $section =~ /(\d+)$/;
550	return undef unless defined $section_num;
551
552	my $i = 0;
553	my $section_order = $parent_section_ptr->{'subsection_order'};
554	while ($i < scalar(@$section_order)) {
555	last if $section_order->[$i] eq $section_num;
556	$i++;
557	}
558
559	$i++; # the next child
560	if ($i < scalar(@$section_order)) {
561	return $section_order->[$i] if $parent_section eq "";
562	return "$parent_section.$section_order->[$i]";
563	}
564
565	# no more sections in this level
566	return undef;
567	}
568
569	# returns a reference to a list of children
570	sub get_children {
571	my $self = shift (@_);
572	my ($section) = @_;
573
574	my $section_ptr = $self->_lookup_section($section);
575	return [] unless defined $section_ptr;
576
577	my @children = @{$section_ptr->{'subsection_order'}};
578
579	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
580	return \@children;
581	}
582
583	# returns the child section one past the last one (which
584	# is coded as "0")
585	sub get_end_child {
586	my $self = shift (@_);
587	my ($section) = @_;
588
589	return $section . ".0" unless $section eq "";
590	return "0";
591	}
592
593	# returns the next section in book order
594	sub get_next_section {
595	my $self = shift (@_);
596	my ($section) = @_;
597
598	return undef unless defined $section;
599
600	my $section_ptr = $self->_lookup_section($section);
601	return undef unless defined $section_ptr;
602
603	# first try to find first child
604	if (defined $section_ptr->{'subsection_order'}->[0]) {
605	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
606	return "$section.$section_ptr->{'subsection_order'}->[0]";
607	}
608
609	do {
610	# try to find sibling
611	my $next_child = $self->get_next_child ($section);
612	return $next_child if (defined $next_child);
613
614	# move up one level
615	$section = $self->get_parent_section ($section);
616	} while $section =~ /\d/;
617
618	return undef;
619	}
620
621	sub is_leaf_section {
622	my $self = shift (@_);
623	my ($section) = @_;
624
625	my $section_ptr = $self->_lookup_section($section);
626	return 1 unless defined $section_ptr;
627
628	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
629	}
630
631	# methods for dealing with sections
632
633	# returns the name of the inserted section
634	sub insert_section {
635	my $self = shift (@_);
636	my ($before_section) = @_;
637
638	# get the child to insert before and its parent section
639	my $parent_section = "";
640	my $before_child = "0";
641	my @before_section = split (/\./, $before_section);
642	if (scalar(@before_section) > 0) {
643	$before_child = pop (@before_section);
644	$parent_section = join (".", @before_section);
645	}
646
647	my $parent_section_ptr = $self->_lookup_section($parent_section);
648	if (!defined $parent_section_ptr) {
649	print STDERR "doc::insert_section couldn't find parent section " .
650	"$parent_section\n";
651	return;
652	}
653
654	# get the next section number
655	my $section_num = $parent_section_ptr->{'next_subsection'}++;
656
657	my $i = 0;
658	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
659	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
660	$i++;
661	}
662
663	# insert the section number into the order list
664	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
665
666	# add this section to the parent section
667	my $section_ptr = {'subsection_order'=>[],
668	'next_subsection'=>1,
669	'subsections'=>{},
670	'metadata'=>[],
671	'text'=>""};
672	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
673
674	# work out the full section number
675	my $section = $parent_section;
676	$section .= "." unless $section eq "";
677	$section .= $section_num;
678
679	return $section;
680	}
681
682	# creates a pre-named section
683	sub create_named_section {
684	my $self = shift (@_);
685	my ($mastersection) = @_;
686
687	my ($num);
688	my $section = $mastersection;
689	my $sectionref = $self;
690
691	while ($section ne "") {
692	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
693	$num =~ s/^0+(\d)/$1/; # remove leading 0s
694	$section = "" unless defined $section;
695
696	if (defined $num) {
697	if (!defined $sectionref->{'subsections'}->{$num}) {
698	push (@{$sectionref->{'subsection_order'}}, $num);
699	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
700	'next_subsection'=>1,
701	'subsections'=>{},
702	'metadata'=>[],
703	'text'=>""};
704	if ($num >= $sectionref->{'next_subsection'}) {
705	$sectionref->{'next_subsection'} = $num + 1;
706	}
707	}
708	$sectionref = $sectionref->{'subsections'}->{$num};
709
710	} else {
711	print STDERR "doc::create_named_section couldn't create section ";
712	print STDERR "$mastersection\n";
713	last;
714	}
715	}
716	}
717
718	# returns a reference to a list of subsections
719	sub list_subsections {
720	my $self = shift (@_);
721	my ($section) = @_;
722
723	my $section_ptr = $self->_lookup_section ($section);
724	if (!defined $section_ptr) {
725	print STDERR "doc::list_subsections couldn't find section $section\n";
726	return [];
727	}
728
729	return [@{$section_ptr->{'subsection_order'}}];
730	}
731
732	sub delete_section {
733	my $self = shift (@_);
734	my ($section) = @_;
735
736	# my $section_ptr = {'subsection_order'=>[],
737	# 'next_subsection'=>1,
738	# 'subsections'=>{},
739	# 'metadata'=>[],
740	# 'text'=>""};
741
742	# if this is the top section reset everything
743	if ($section eq "") {
744	$self->{'subsection_order'} = [];
745	$self->{'subsections'} = {};
746	$self->{'metadata'} = [];
747	$self->{'text'} = "";
748	return;
749	}
750
751	# find the parent of the section to delete
752	my $parent_section = "";
753	my $child = "0";
754	my @section = split (/\./, $section);
755	if (scalar(@section) > 0) {
756	$child = pop (@section);
757	$parent_section = join (".", @section);
758	}
759
760	my $parent_section_ptr = $self->_lookup_section($parent_section);
761	if (!defined $parent_section_ptr) {
762	print STDERR "doc::delete_section couldn't find parent section " .
763	"$parent_section\n";
764	return;
765	}
766
767	# remove this section from the subsection_order list
768	my $i = 0;
769	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
770	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
771	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
772	last;
773	}
774	$i++;
775	}
776
777	# remove this section from the subsection hash
778	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
779	undef $parent_section_ptr->{'subsections'}->{$child};
780	}
781	}
782
783	#--
784	# methods for dealing with metadata
785
786	# set_metadata_element and get_metadata_element are for metadata
787	# which should only have one value. add_meta_data and get_metadata
788	# are for metadata which can have more than one value.
789
790	# returns the first metadata value which matches field
791
792	# This version of get metadata element works much like the one above,
793	# except it allows for the namespace portion of a metadata element to
794	# be ignored, thus if you are searching for dc.Title, the first piece
795	# of matching metadata ending with the name Title (once any namespace
796	# is removed) would be returned.
797	# 28-11-2003 John Thompson
798	sub get_metadata_element {
799	my $self = shift (@_);
800	my ($section, $field, $ignore_namespace) = @_;
801	my ($data);
802
803	$ignore_namespace = 0 unless defined $ignore_namespace;
804
805	my $section_ptr = $self->_lookup_section($section);
806	if (!defined $section_ptr) {
807	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
808	return;
809	}
810
811	# Remove any namespace if we are being told to ignore them
812	if($ignore_namespace) {
813	$field =~ s/^.\.//; #$field =~ s/^\w\.//;
814	}
815
816	foreach $data (@{$section_ptr->{'metadata'}}) {
817
818	my $data_name = $data->[0];
819
820	# Remove any namespace if we are being told to ignore them
821	if($ignore_namespace) {
822	$data_name =~ s/^.\.//; #$data_name =~ s/^\w\.//;
823	}
824	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
825	$data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
826
827	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
828	}
829
830	return undef; # was not found
831	}
832
833	# returns a list of the form [value1, value2, ...]
834	sub get_metadata {
835	my $self = shift (@_);
836	my ($section, $field, $ignore_namespace) = @_;
837	my ($data);
838
839	$ignore_namespace = 0 unless defined $ignore_namespace;
840
841	my $section_ptr = $self->_lookup_section($section);
842	if (!defined $section_ptr) {
843	print STDERR "doc::get_metadata couldn't find section ",
844	$section, "\n";
845	return;
846	}
847
848	# Remove any namespace if we are being told to ignore them
849	if($ignore_namespace) {
850	$field =~ s/^.*\.//;
851	}
852
853	my @metadata = ();
854	foreach $data (@{$section_ptr->{'metadata'}}) {
855
856	my $data_name = $data->[0];
857
858	# Remove any namespace if we are being told to ignore them
859	if($ignore_namespace) {
860	$data_name =~ s/^.*\.//;
861	}
862	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
863	$data_name =~ s/^ex\.([^.]+)$/$1/;
864
865	push (@metadata, $data->[1]) if ($data_name eq $field);
866	}
867
868	return \@metadata;
869	}
870
871	sub get_metadata_hashmap {
872	my $self = shift (@_);
873	my ($section, $opt_namespace) = @_;
874
875	my $section_ptr = $self->_lookup_section($section);
876	if (!defined $section_ptr) {
877	print STDERR "doc::get_metadata couldn't find section ",
878	$section, "\n";
879	return;
880	}
881
882	my $metadata_hashmap = {};
883	foreach my $data (@{$section_ptr->{'metadata'}}) {
884	my $metaname = $data->[0];
885
886	if ((!defined $opt_namespace) \|\| ($metaname =~ m/^$opt_namespace\./)) {
887	if (!defined $metadata_hashmap->{$metaname}) {
888	$metadata_hashmap->{$metaname} = [];
889	}
890	my $metaval_list = $metadata_hashmap->{$metaname};
891	push(@$metaval_list, $data->[1]);
892	}
893	}
894
895	return $metadata_hashmap;
896	}
897
898	# returns a list of the form [[field,value],[field,value],...]
899	sub get_all_metadata {
900	my $self = shift (@_);
901	my ($section) = @_;
902
903	my $section_ptr = $self->_lookup_section($section);
904	if (!defined $section_ptr) {
905	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
906	return;
907	}
908
909	return $section_ptr->{'metadata'};
910	}
911
912	# $value is optional
913	sub delete_metadata {
914	my $self = shift (@_);
915	my ($section, $field, $value) = @_;
916
917	my $section_ptr = $self->_lookup_section($section);
918	if (!defined $section_ptr) {
919	print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
920	return;
921	}
922
923	my $i = 0;
924	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
925	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
926	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
927	splice (@{$section_ptr->{'metadata'}}, $i, 1);
928	} else {
929	$i++;
930	}
931	}
932	}
933
934	sub delete_all_metadata {
935	my $self = shift (@_);
936	my ($section) = @_;
937
938	my $section_ptr = $self->_lookup_section($section);
939	if (!defined $section_ptr) {
940	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
941	return;
942	}
943
944	$section_ptr->{'metadata'} = [];
945	}
946
947	sub set_metadata_element {
948	my $self = shift (@_);
949	my ($section, $field, $value) = @_;
950
951	$self->set_utf8_metadata_element ($section, $field,
952	&unicode::ascii2utf8(\$value));
953	}
954
955	# set_utf8_metadata_element assumes the text has already been
956	# converted to the UTF-8 encoding.
957	sub set_utf8_metadata_element {
958	my $self = shift (@_);
959	my ($section, $field, $value) = @_;
960
961	$self->delete_metadata ($section, $field);
962	$self->add_utf8_metadata ($section, $field, $value);
963	}
964
965
966	# add_metadata assumes the text is in (extended) ascii form. For
967	# text which has already been converted to the UTF-8 format use
968	# add_utf8_metadata.
969	sub add_metadata {
970	my $self = shift (@_);
971	my ($section, $field, $value) = @_;
972
973	$self->add_utf8_metadata ($section, $field,
974	&unicode::ascii2utf8(\$value));
975	}
976
977	sub add_utf8_metadata {
978	my $self = shift (@_);
979	my ($section, $field, $value) = @_;
980
981	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
982	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
983	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
984
985	my $section_ptr = $self->_lookup_section($section);
986	if (!defined $section_ptr) {
987	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
988	return;
989	}
990	if (!defined $value) {
991	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
992	return;
993	}
994	if (!defined $field) {
995	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
996	return;
997	}
998
999	#print STDERR "###$field=$value\n";
1000
1001	# For now, supress this check. Given that text data read in is now
1002	# Unicode aware, then the following block of code can (ironically enough)
1003	# cause our unicode compliant string to be re-encoded (leading to
1004	# a double-encoded UTF-8 string, which we definitely don't want!).
1005
1006
1007	# double check that the value is utf-8
1008	# if (!&unicode::check_is_utf8($value)) {
1009	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1010	# &unicode::ensure_utf8(\$value);
1011	# print STDERR " Tried converting to utf8: $value\n";
1012	# }
1013
1014	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1015	}
1016
1017
1018	# methods for dealing with text
1019
1020	# returns the text for a section
1021	sub get_text {
1022	my $self = shift (@_);
1023	my ($section) = @_;
1024
1025	my $section_ptr = $self->_lookup_section($section);
1026	if (!defined $section_ptr) {
1027	print STDERR "doc::get_text couldn't find section " .
1028	"$section\n";
1029	return "";
1030	}
1031
1032	return $section_ptr->{'text'};
1033	}
1034
1035	# returns the (utf-8 encoded) length of the text for a section
1036	sub get_text_length {
1037	my $self = shift (@_);
1038	my ($section) = @_;
1039
1040	my $section_ptr = $self->_lookup_section($section);
1041	if (!defined $section_ptr) {
1042	print STDERR "doc::get_text_length couldn't find section " .
1043	"$section\n";
1044	return 0;
1045	}
1046
1047	return length ($section_ptr->{'text'});
1048	}
1049
1050	# returns the total length for all the sections
1051	sub get_total_text_length {
1052	my $self = shift (@_);
1053
1054	my $section = $self->get_top_section();
1055	my $length = 0;
1056	while (defined $section) {
1057	$length += $self->get_text_length($section);
1058	$section = $self->get_next_section($section);
1059	}
1060	return $length;
1061	}
1062
1063	sub delete_text {
1064	my $self = shift (@_);
1065	my ($section) = @_;
1066
1067	my $section_ptr = $self->_lookup_section($section);
1068	if (!defined $section_ptr) {
1069	print STDERR "doc::delete_text couldn't find section " .
1070	"$section\n";
1071	return;
1072	}
1073
1074	$section_ptr->{'text'} = "";
1075	}
1076
1077	# add_text assumes the text is in (extended) ascii form. For
1078	# text which has been already converted to the UTF-8 format
1079	# use add_utf8_text.
1080	sub add_text {
1081	my $self = shift (@_);
1082	my ($section, $text) = @_;
1083
1084	# convert the text to UTF-8 encoded unicode characters
1085	# and add the text
1086	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1087	}
1088
1089
1090	# add_utf8_text assumes the text to be added has already
1091	# been converted to the UTF-8 encoding. For ascii text use
1092	# add_text
1093	sub add_utf8_text {
1094	my $self = shift (@_);
1095	my ($section, $text) = @_;
1096
1097	my $section_ptr = $self->_lookup_section($section);
1098	if (!defined $section_ptr) {
1099	print STDERR "doc::add_utf8_text couldn't find section " .
1100	"$section\n";
1101	return;
1102	}
1103
1104	$section_ptr->{'text'} .= $text;
1105	}
1106
1107	# returns the Source meta, which is the utf8 filename generated.
1108	# Added a separate method here for convenience
1109	sub get_source {
1110	my $self = shift (@_);
1111	return $self->get_metadata_element ($self->get_top_section(), "Source");
1112	}
1113
1114	# returns the SourceFile meta, which is the url reference to the URL-encoded
1115	# version of Source (the utf8 filename). Added a separate method here for convenience
1116	sub get_sourcefile {
1117	my $self = shift (@_);
1118	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1119	}
1120
1121	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1122	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1123	sub get_assocfile_from_sourcefile {
1124	my $self = shift (@_);
1125
1126	# get the SourceFile meta, which is a URL to a file on the filesystem
1127	my $top_section = $self->get_top_section();
1128	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1129
1130	# get the actual filename as it exists on the filesystem which this url refers to
1131	$source_file = &unicode::url_to_filename($source_file);
1132	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1133	return $assocfilename;
1134	}
1135
1136	# methods for dealing with associated files
1137
1138	# a file is associated with a document, NOT a section.
1139	# if section is defined it is noted in the data structure
1140	# only so that files associated from a particular section
1141	# may be removed later (using delete_section_assoc_files)
1142	sub associate_file {
1143	my $self = shift (@_);
1144	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1145	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1146
1147	# remove all associated files with the same name
1148	$self->delete_assoc_file ($assoc_filename);
1149
1150	# Too harsh a requirement
1151	# Definitely get HTML docs, for example, with some missing
1152	# support files
1153	# if (!&util::fd_exists($real_filename)) {
1154	# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1155	# exit -1;
1156	# }
1157
1158	# print STDERR "**** is the following a UTF8 rep of real filename?\n $real_filename\n";
1159	# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1160	## my $utf8_filename = Encode::encode("utf8",$filename);
1161
1162	push (@{$self->{'associated_files'}},
1163	[$real_filename, $assoc_filename, $mime_type, $section]);
1164	}
1165
1166	# returns a list of associated files in the form
1167	# [[real_filename, assoc_filename, mimetype], ...]
1168	sub get_assoc_files {
1169	my $self = shift (@_);
1170
1171	return $self->{'associated_files'};
1172	}
1173
1174	# the following two methods used to keep track of original associated files
1175	# for incremental building. eg a txt file used by an item file does not end
1176	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1177	# file for incremental build
1178	sub associate_source_file {
1179	my $self = shift (@_);
1180	my ($full_filename) = @_;
1181
1182	push (@{$self->{'source_assoc_files'}}, $full_filename);
1183
1184	}
1185
1186	sub get_source_assoc_files {
1187	my $self = shift (@_);
1188
1189	return $self->{'source_assoc_files'};
1190
1191
1192	}
1193	sub metadata_file {
1194	my $self = shift (@_);
1195	my ($real_filename, $filename) = @_;
1196
1197	push (@{$self->{'metadata_files'}},
1198	[$real_filename, $filename]);
1199	}
1200
1201	# used for writing out the archiveinf-doc info database, to list all the metadata files
1202	sub get_meta_files {
1203	my $self = shift (@_);
1204
1205	return $self->{'metadata_files'};
1206	}
1207
1208	sub delete_section_assoc_files {
1209	my $self = shift (@_);
1210	my ($section) = @_;
1211
1212	my $i=0;
1213	while ($i < scalar (@{$self->{'associated_files'}})) {
1214	if (defined $self->{'associated_files'}->[$i]->[3] &&
1215	$self->{'associated_files'}->[$i]->[3] eq $section) {
1216	splice (@{$self->{'associated_files'}}, $i, 1);
1217	} else {
1218	$i++;
1219	}
1220	}
1221	}
1222
1223	sub delete_assoc_file {
1224	my $self = shift (@_);
1225	my ($assoc_filename) = @_;
1226
1227	my $i=0;
1228	while ($i < scalar (@{$self->{'associated_files'}})) {
1229	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1230	splice (@{$self->{'associated_files'}}, $i, 1);
1231	} else {
1232	$i++;
1233	}
1234	}
1235	}
1236
1237	sub reset_nextsection_ptr {
1238	my $self = shift (@_);
1239	my ($section) = @_;
1240
1241	my $section_ptr = $self->_lookup_section($section);
1242	$section_ptr->{'next_subsection'} = 1;
1243	}
1244
1245	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: