source: trunk/gsdl/perllib/basedoc.pm@ 900

Last change on this file since 900 was 899, checked in by sjboddie, 24 years ago

small change to doc data structure to allow for some hacking
in WebPlug

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.6 KB
Line 
1###########################################################################
2#
3# basedoc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package basedoc;
29
30BEGIN {
31 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
32 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
33}
34
35use unicode;
36use util;
37use html;
38##use hashdoc;
39
40# the document type may be indexed_doc, nonindexed_doc, or
41# classification
42
43sub new {
44 my $class = shift (@_);
45
46 my $self = bless {'associated_files'=>[],
47 'subsection_order'=>[],
48 'next_subsection'=>1,
49 'subsections'=>{},
50 'metadata'=>[],
51 'text'=>""}, $class;
52
53 return $self;
54}
55
56# clone the $self object
57sub duplicate {
58 my $self = shift (@_);
59
60 my $newobj = {};
61
62 foreach $k (keys %$self) {
63 $newobj->{$k} = &clone ($self->{$k});
64 }
65
66 bless $newobj, ref($self);
67 return $newobj;
68}
69
70sub clone {
71 my ($from) = @_;
72 my $type = ref ($from);
73
74 if ($type eq "HASH") {
75 my $to = {};
76 foreach $key (keys %$from) {
77 $to->{$key} = &clone ($from->{$key});
78 }
79 return $to;
80 } elsif ($type eq "ARRAY") {
81 my $to = [];
82 foreach $v (@$from) {
83 push (@$to, &clone ($v));
84 }
85 return $to;
86 } else {
87 return $from;
88 }
89}
90
91
92sub set_source_filename {
93 my $self = shift (@_);
94 my ($source_filename) = @_;
95
96 $self->set_metadata_element ($self->get_top_section(),
97 "gsdlsourcefilename",
98 $source_filename);
99}
100
101# returns the source_filename as it was provided
102sub get_source_filename {
103 my $self = shift (@_);
104
105 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
106}
107
108sub set_doc_type {
109 my $self = shift (@_);
110 my ($doc_type) = @_;
111
112 $self->set_metadata_element ($self->get_top_section(),
113 "gsdldoctype",
114 $doc_type);
115}
116
117# returns the source_filename as it was provided
118# the default of "indexed_doc" is used if no document
119# type was provided
120sub get_doc_type {
121 my $self = shift (@_);
122
123 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
124 return $doc_type if (defined $doc_type);
125 return "indexed_doc";
126}
127
128sub _escape_text {
129 my ($text) = @_;
130
131 # special characters in the gml encoding
132 $text =~ s/&/&/g; # this has to be first...
133 $text =~ s/</&lt;/g;
134 $text =~ s/>/&gt;/g;
135 $text =~ s/\"/&quot;/g;
136
137 return $text;
138}
139
140
141sub buffer_section {
142 my $self = shift (@_);
143 my ($section, $suppress_subject_info) = @_;
144
145 $suppress_subject_info = 0 unless defined $suppress_subject_info;
146 my ($all_text,$data, $subsection);
147
148 my $section_ptr = $self->_lookup_section ($section);
149 my ($section_num) = $section =~ /(\d+)$/;
150
151 return "" unless defined $section_ptr;
152
153 # output the section header (including the section number
154 # and metadata)
155
156 $all_text = "<gsdlsection";
157 $all_text .= " gsdlnum=\"$section_num\"" if defined $section_num;
158 foreach $data (@{$section_ptr->{'metadata'}}) {
159 $all_text .= " $data->[0]=\"" . &_escape_text($data->[1]) . "\""
160 unless $suppress_subject_info && $data->[0] eq "Subject";
161 }
162 $all_text .= ">";
163
164 # output the text
165 $all_text .= &_escape_text($section_ptr->{'text'});
166
167 # output all the subsections
168 foreach $subsection (@{$section_ptr->{'subsection_order'}}) {
169 $all_text .= $self->buffer_section("$section.$subsection", $suppress_subject_info);
170 }
171
172 # output the closing tag
173 $all_text .= "</gsdlsection>\n";
174
175 return $all_text;
176}
177
178sub output_section {
179 my $self = shift (@_);
180 my ($handle, $section, $suppress_subject_info) = @_;
181
182 my $all_text = $self->buffer_section($section, $suppress_subject_info);
183 print $handle $all_text;
184}
185
186# look up the reference to the a particular section
187sub _lookup_section {
188 my $self = shift (@_);
189 my ($section) = @_;
190
191 my ($num);
192 my $sectionref = $self;
193
194 while (defined $section && $section ne "") {
195 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
196 $num =~ s/^0+(\d)/$1/; # remove leading 0s
197 $section = "" unless defined $section;
198
199 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
200 $sectionref = $sectionref->{'subsections'}->{$num};
201 } else {
202 return undef;
203 }
204 }
205
206 return $sectionref;
207}
208
209sub _calc_OID {
210 my $self = shift (@_);
211 my ($filename) = @_;
212
213 my $osexe = &util::get_os_exe();
214
215 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
216 $ENV{'GSDLOS'},"hashfile$osexe");
217 my $result = "NULL";
218
219 if (-e "$hashfile_exe") {
220 $result = `$hashfile_exe \"$filename\"`;
221 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
222
223 } else {
224 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
225 }
226
227 return "HASH$result";
228}
229
230# methods dealing with OID, not groups of them.
231
232# if $OID is not provided one is calculated from hashing the
233# current contents of the document
234# An OID are actually stored as metadata of the document
235sub set_OID {
236 my $self = shift (@_);
237 my ($OID) = @_;
238
239 # if an OID wasn't provided then feed this document to
240 # hashfile.exe
241 if (!defined $OID) {
242 $OID = "NULL";
243 my $tmp_filename = &util::get_tmp_filename();
244 if (!open (OUTFILE, ">$tmp_filename")) {
245 print STDERR "doc::set_OID could not write to $tmp_filename\n";
246 } else {
247 $self->output_section('OUTFILE', $self->get_top_section(), 1);
248 close (OUTFILE);
249
250 $OID = $self->_calc_OID ($tmp_filename);
251 &util::rm ($tmp_filename);
252 }
253 }
254
255 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
256}
257
258# this uses hashdoc (embedded c thingy) which is faster but still
259# needs a little work to be suffiently stable
260sub ___set_OID {
261 my $self = shift (@_);
262 my ($OID) = @_;
263
264 # if an OID wasn't provided then calculate hash value based on document
265 if (!defined $OID)
266 {
267 my $hash_text = $self->buffer_section($self->get_top_section(), 1);
268 my $hash_len = length($hash_text);
269
270 $OID = &hashdoc::buffer($hash_text,$hash_len);
271 }
272
273 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
274}
275
276# returns the OID for this document
277sub get_OID {
278 my $self = shift (@_);
279 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
280 return $OID if (defined $OID);
281 return "NULL";
282}
283
284sub delete_OID {
285 my $self = shift (@_);
286
287 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
288}
289
290
291# methods for manipulating section names
292
293# returns the name of the top-most section (the top
294# level of the document
295sub get_top_section {
296 my $self = shift (@_);
297
298 return "";
299}
300
301# returns a section
302sub get_parent_section {
303 my $self = shift (@_);
304 my ($section) = @_;
305
306 $section =~ s/(^|\.)\d+$//;
307
308 return $section;
309}
310
311# returns the first child section (or the end child
312# if there isn't any)
313sub get_begin_child {
314 my $self = shift (@_);
315 my ($section) = @_;
316
317 my $section_ptr = $self->_lookup_section($section);
318 return "" unless defined $section_ptr;
319
320 if (defined $section_ptr->{'subsection_order'}->[0]) {
321 return "$section.$section_ptr->{'subsection_order'}->[0]";
322 }
323
324 return $self->get_end_child ($section);
325}
326
327# returns the next child of a parent section
328sub get_next_child {
329 my $self = shift (@_);
330 my ($section) = @_;
331
332 my $parent_section = $self->get_parent_section($section);
333 my $parent_section_ptr = $self->_lookup_section($parent_section);
334 return undef unless defined $parent_section_ptr;
335
336 my ($section_num) = $section =~ /(\d+)$/;
337 return undef unless defined $section_num;
338
339 my $i = 0;
340 my $section_order = $parent_section_ptr->{'subsection_order'};
341 while ($i < scalar(@$section_order)) {
342 last if $section_order->[$i] eq $section_num;
343 $i++;
344 }
345
346 $i++; # the next child
347 if ($i < scalar(@$section_order)) {
348 return $section_order->[$i] if $parent_section eq "";
349 return "$parent_section.$section_order->[$i]";
350 }
351
352 # no more sections in this level
353 return undef;
354}
355
356# returns a reference to a list of children
357sub get_children {
358 my $self = shift (@_);
359 my ($section) = @_;
360
361 my $section_ptr = $self->_lookup_section($section);
362 return [] unless defined $section_ptr;
363
364 my @children = @{$section_ptr->{'subsection_order'}};
365
366 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
367 return \@children;
368}
369
370# returns the child section one past the last one (which
371# is coded as "0")
372sub get_end_child {
373 my $self = shift (@_);
374 my ($section) = @_;
375
376 return $section . ".0" unless $section eq "";
377 return "0";
378}
379
380# returns the next section in book order
381sub get_next_section {
382 my $self = shift (@_);
383 my ($section) = @_;
384
385 return undef unless defined $section;
386
387 my $section_ptr = $self->_lookup_section($section);
388 return undef unless defined $section_ptr;
389
390 # first try to find first child
391 if (defined $section_ptr->{'subsection_order'}->[0]) {
392 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
393 return "$section.$section_ptr->{'subsection_order'}->[0]";
394 }
395
396 do {
397 # try to find sibling
398 my $next_child = $self->get_next_child ($section);
399 return $next_child if (defined $next_child);
400
401 # move up one level
402 $section = $self->get_parent_section ($section);
403 } while $section =~ /\d/;
404
405 return undef;
406}
407
408sub is_leaf_section {
409 my $self = shift (@_);
410 my ($section) = @_;
411
412 my $section_ptr = $self->_lookup_section($section);
413 return 1 unless defined $section_ptr;
414
415 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
416}
417
418# methods for dealing with sections
419
420# returns the name of the inserted section
421sub insert_section {
422 my $self = shift (@_);
423 my ($before_section) = @_;
424
425 # get the child to insert before and its parent section
426 my $parent_section = "";
427 my $before_child = "0";
428 my @before_section = split (/\./, $before_section);
429 if (scalar(@before_section) > 0) {
430 $before_child = pop (@before_section);
431 $parent_section = join (".", @before_section);
432 }
433
434 my $parent_section_ptr = $self->_lookup_section($parent_section);
435 if (!defined $parent_section_ptr) {
436 print STDERR "doc::insert_section couldn't find parent section " .
437 "$parent_section\n";
438 return;
439 }
440
441 # get the next section number
442 my $section_num = $parent_section_ptr->{'next_subsection'}++;
443
444 my $i = 0;
445 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
446 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
447 $i++;
448 }
449
450 # insert the section number into the order list
451 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
452
453 # add this section to the parent section
454 my $section_ptr = {'subsection_order'=>[],
455 'next_subsection'=>1,
456 'subsections'=>{},
457 'metadata'=>[],
458 'text'=>""};
459 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
460
461 # work out the full section number
462 my $section = $parent_section;
463 $section .= "." unless $section eq "";
464 $section .= $section_num;
465
466 return $section;
467}
468
469# creates a pre-named section
470sub create_named_section {
471 my $self = shift (@_);
472 my ($mastersection) = @_;
473
474 my ($num);
475 my $section = $mastersection;
476 my $sectionref = $self;
477
478#### print STDERR "*** mastersection = $mastersection\n";
479
480 while ($section ne "") {
481 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
482 $num =~ s/^0+(\d)/$1/; # remove leading 0s
483 $section = "" unless defined $section;
484
485 if (defined $num) {
486 if (!defined $sectionref->{'subsections'}->{$num}) {
487 push (@{$sectionref->{'subsection_order'}}, $num);
488 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
489 'next_subsection'=>1,
490 'subsections'=>{},
491 'metadata'=>[],
492 'text'=>""};
493 if ($num >= $sectionref->{'next_subsection'}) {
494 $sectionref->{'next_subsection'} = $num + 1;
495 }
496 }
497 $sectionref = $sectionref->{'subsections'}->{$num};
498
499 } else {
500 print STDERR "doc::create_named_section couldn't create section ";
501 print STDERR "$mastersection\n";
502 last;
503 }
504 }
505}
506
507# returns a reference to a list of subsections
508sub list_subsections {
509 my $self = shift (@_);
510 my ($section) = @_;
511
512 my $section_ptr = $self->_lookup_section ($section);
513 if (!defined $section_ptr) {
514 print STDERR "doc::list_subsections couldn't find section $section\n";
515 return [];
516 }
517
518 return [@{$section_ptr->{'subsection_order'}}];
519}
520
521sub delete_section {
522 my $self = shift (@_);
523 my ($section) = @_;
524
525# my $section_ptr = {'subsection_order'=>[],
526# 'next_subsection'=>1,
527# 'subsections'=>{},
528# 'metadata'=>[],
529# 'text'=>""};
530
531 # if this is the top section reset everything
532 if ($section eq "") {
533 $self->{'subsection_order'} = [];
534 $self->{'subsections'} = {};
535 $self->{'metadata'} = [];
536 $self->{'text'} = "";
537 return;
538 }
539
540 # find the parent of the section to delete
541 my $parent_section = "";
542 my $child = "0";
543 my @section = split (/\./, $section);
544 if (scalar(@section) > 0) {
545 $child = pop (@section);
546 $parent_section = join (".", @section);
547 }
548
549 my $parent_section_ptr = $self->_lookup_section($parent_section);
550 if (!defined $parent_section_ptr) {
551 print STDERR "doc::delete_section couldn't find parent section " .
552 "$parent_section\n";
553 return;
554 }
555
556 # remove this section from the subsection_order list
557 my $i = 0;
558 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
559 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
560 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
561 last;
562 }
563 $i++;
564 }
565
566 # remove this section from the subsection hash
567 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
568 undef $parent_section_ptr->{'subsections'}->{$child};
569 }
570}
571
572#--
573# methods for dealing with metadata
574
575# returns the first metadata value which matches field
576sub get_metadata_element {
577 my $self = shift (@_);
578 my ($section, $field) = @_;
579 my ($data);
580
581 my $section_ptr = $self->_lookup_section($section);
582 if (!defined $section_ptr) {
583 print STDERR "doc::get_metadata_element couldn't find section " .
584 "$section\n";
585 return;
586 }
587
588 foreach $data (@{$section_ptr->{'metadata'}}) {
589 return $data->[1] if (scalar(@$data) >= 2 && $data->[0] eq $field);
590 }
591
592 return undef; # was not found
593}
594
595
596# returns a list of the form [value1, value2, ...]
597sub get_metadata {
598 my $self = shift (@_);
599 my ($section, $field) = @_;
600 my ($data);
601
602 my $section_ptr = $self->_lookup_section($section);
603 if (!defined $section_ptr) {
604 print STDERR "doc::get_metadata couldn't find section " .
605 "$section\n";
606 return;
607 }
608
609 my @metadata = ();
610 foreach $data (@{$section_ptr->{'metadata'}}) {
611 push (@metadata, $data->[1]) if ($data->[0] eq $field);
612 }
613
614 return \@metadata;
615}
616
617# returns a list of the form [[field,value],[field,value],...]
618sub get_all_metadata {
619 my $self = shift (@_);
620 my ($section) = @_;
621
622 my $section_ptr = $self->_lookup_section($section);
623 if (!defined $section_ptr) {
624 print STDERR "doc::get_all_metadata couldn't find section " .
625 "$section\n";
626 return;
627 }
628
629 return $section_ptr->{'metadata'};
630}
631
632# $value is optional
633sub delete_metadata {
634 my $self = shift (@_);
635 my ($section, $field, $value) = @_;
636
637 my $section_ptr = $self->_lookup_section($section);
638 if (!defined $section_ptr) {
639 print STDERR "doc::delete_metadata couldn't find section " .
640 "$section\n";
641 return;
642 }
643
644 my $i = 0;
645 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
646 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
647 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
648 splice (@{$section_ptr->{'metadata'}}, $i, 1);
649 } else {
650 $i++;
651 }
652 }
653}
654
655sub delete_all_metadata {
656 my $self = shift (@_);
657 my ($section) = @_;
658
659 my $section_ptr = $self->_lookup_section($section);
660 if (!defined $section_ptr) {
661 print STDERR "doc::delete_all_metadata couldn't find section " .
662 "$section\n";
663 return;
664 }
665
666 $section_ptr->{'metadata'} = [];
667}
668
669# methods for dealing with text
670
671# returns the text for a section
672sub get_text {
673 my $self = shift (@_);
674 my ($section) = @_;
675
676 my $section_ptr = $self->_lookup_section($section);
677 if (!defined $section_ptr) {
678 print STDERR "doc::get_text couldn't find section " .
679 "$section\n";
680 return "";
681 }
682
683 return $section_ptr->{'text'};
684}
685
686# returns the (utf-8 encoded) length of the text for a section
687sub get_text_length {
688 my $self = shift (@_);
689 my ($section) = @_;
690
691 my $section_ptr = $self->_lookup_section($section);
692 if (!defined $section_ptr) {
693 print STDERR "doc::get_text_length couldn't find section " .
694 "$section\n";
695 return 0;
696 }
697
698 return length ($section_ptr->{'text'});
699}
700
701sub delete_text {
702 my $self = shift (@_);
703 my ($section) = @_;
704
705 my $section_ptr = $self->_lookup_section($section);
706 if (!defined $section_ptr) {
707 print STDERR "doc::delete_text couldn't find section " .
708 "$section\n";
709 return;
710 }
711
712 $section_ptr->{'text'} = "";
713}
714
715
716# methods for dealing with associated files
717
718# a file is associated with a document, NOT a section.
719# if section is defined it is noted in the data structure
720# only so that files associated from a particular section
721# may be removed later (using delete_section_assoc_files)
722sub associate_file {
723 my $self = shift (@_);
724 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
725 $mime_type = &html::guess_mime_type ($real_filename) unless defined $mime_type;
726
727 # remove all associated files with the same name
728 $self->delete_assoc_file ($assoc_filename);
729
730 push (@{$self->{'associated_files'}},
731 [$real_filename, $assoc_filename, $mime_type, $section]);
732}
733
734# returns a list of associated files in the form
735# [[real_filename, assoc_filename, mimetype], ...]
736sub get_assoc_files {
737 my $self = shift (@_);
738
739 return $self->{'associated_files'};
740}
741
742sub delete_section_assoc_files {
743 my $self = shift (@_);
744 my ($section) = @_;
745
746 my $i=0;
747 while ($i < scalar (@{$self->{'associated_files'}})) {
748 if (defined $self->{'associated_files'}->[$i]->[3] &&
749 $self->{'associated_files'}->[$i]->[3] eq $section) {
750 splice (@{$self->{'associated_files'}}, $i, 1);
751 } else {
752 $i++;
753 }
754 }
755}
756
757sub delete_assoc_file {
758 my $self = shift (@_);
759 my ($assoc_filename) = @_;
760
761 my $i=0;
762 while ($i < scalar (@{$self->{'associated_files'}})) {
763 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
764 splice (@{$self->{'associated_files'}}, $i, 1);
765 } else {
766 $i++;
767 }
768 }
769}
770
771sub reset_nextsection_ptr {
772 my $self = shift (@_);
773 my ($section) = @_;
774
775 my $section_ptr = $self->_lookup_section($section);
776 $section_ptr->{'next_subsection'} = 1;
777}
778
7791;
Note: See TracBrowser for help on using the repository browser.