source: trunk/gsdl/perllib/doc.pm@ 537

Last change on this file since 537 was 537, checked in by sjboddie, 25 years ago

added GPL headers

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.9 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# class to hold documents
27
28package doc;
29
30use unicode;
31use util;
32use html;
33
34# the document type may be indexed_doc, nonindexed_doc, or
35# classification
36
37sub new {
38 my $class = shift (@_);
39 my ($source_filename, $doc_type) = @_;
40
41 my $self = bless {'associated_files'=>[],
42 'subsection_order'=>[],
43 'next_subsection'=>1,
44 'subsections'=>{},
45 'metadata'=>[],
46 'text'=>""}, $class;
47
48
49# $self->set_source_filename ($source_filename) if defined $source_filename;
50 push (@{$self->{'metadata'}}, ["gsdlsourcefilename", &unicode::ascii2utf8($source_filename)])
51 if defined $source_filename;
52# $self->set_doc_type ($doc_type) if defined $doc_type;
53 push (@{$self->{'metadata'}}, ["gsdldoctype", &unicode::ascii2utf8($doc_type)])
54 if defined $doc_type;
55
56 return $self;
57}
58
59sub set_source_filename {
60 my $self = shift (@_);
61 my ($source_filename) = @_;
62
63 $self->set_metadata_element ($self->get_top_section(),
64 "gsdlsourcefilename",
65 $source_filename);
66}
67
68# returns the source_filename as it was provided
69sub get_source_filename {
70 my $self = shift (@_);
71
72 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
73}
74
75sub set_doc_type {
76 my $self = shift (@_);
77 my ($doc_type) = @_;
78
79 $self->set_metadata_element ($self->get_top_section(),
80 "gsdldoctype",
81 $doc_type);
82}
83
84# returns the source_filename as it was provided
85# the default of "indexed_doc" is used if no document
86# type was provided
87sub get_doc_type {
88 my $self = shift (@_);
89
90 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
91 return $doc_type if (defined $doc_type);
92 return "indexed_doc";
93}
94
95sub _escape_text {
96 my ($text) = @_;
97
98 # special characters in the gml encoding
99 $text =~ s/&/&/g; # this has to be first...
100 $text =~ s/</&lt;/g;
101 $text =~ s/>/&gt;/g;
102 $text =~ s/\"/&quot;/g;
103
104 return $text;
105}
106
107sub output_section {
108 my $self = shift (@_);
109 my ($handle, $section, $suppress_subject_info) = @_;
110 $suppress_subject_info = 0 unless defined $suppress_subject_info;
111 my ($data, $subsection);
112
113 my $section_ptr = $self->_lookup_section ($section);
114 my ($section_num) = $section =~ /(\d+)$/;
115
116 return unless defined $section_ptr;
117
118 # output the section header (including the section number
119 # and metadata)
120 print $handle "<gsdlsection";
121 print $handle " gsdlnum=\"$section_num\"" if defined $section_num;
122 foreach $data (@{$section_ptr->{'metadata'}}) {
123 print $handle " $data->[0]=\"" . &_escape_text($data->[1]) . "\""
124 unless $suppress_subject_info && $data->[0] eq "Subject";
125 }
126 print $handle ">";
127
128 # output the text
129 print $handle &_escape_text($section_ptr->{'text'});
130
131 # output all the subsections
132 foreach $subsection (@{$section_ptr->{'subsection_order'}}) {
133 $self->output_section($handle, "$section.$subsection", $suppress_subject_info);
134 }
135
136 # output the closing tag
137 print $handle "</gsdlsection>\n";
138}
139
140# look up the reference to the a particular section
141sub _lookup_section {
142 my $self = shift (@_);
143 my ($section) = @_;
144
145 my ($num);
146 my $sectionref = $self;
147
148 while (defined $section && $section ne "") {
149 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
150 $num =~ s/^0+(\d)/$1/; # remove leading 0s
151 $section = "" unless defined $section;
152
153 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
154 $sectionref = $sectionref->{'subsections'}->{$num};
155 } else {
156 return undef;
157 }
158 }
159
160 return $sectionref;
161}
162
163sub _calc_OID {
164 my $self = shift (@_);
165 my ($filename) = @_;
166
167 my $osexe = &util::get_os_exe();
168
169 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
170 $ENV{'GSDLOS'},"hashfile$osexe");
171 my $result = "NULL";
172
173 if (-e "$hashfile_exe") {
174 $result = `$hashfile_exe \"$filename\"`;
175 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
176
177 } else {
178 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
179 }
180
181 return "HASH$result";
182}
183
184# methods dealing with OID, not groups of them.
185
186# if $OID is not provided one is calculated from hashing the
187# current contents of the document
188# An OID are actually stored as metadata of the document
189sub set_OID {
190 my $self = shift (@_);
191 my ($OID) = @_;
192
193 # if an OID wasn't provided then feed this document to
194 # hashfile.exe
195 if (!defined $OID) {
196 $OID = "NULL";
197 my $tmp_filename = &util::get_tmp_filename();
198 if (!open (OUTFILE, ">$tmp_filename")) {
199 print STDERR "doc::set_OID could not write to $tmp_filename\n";
200 } else {
201 $self->output_section('OUTFILE', $self->get_top_section(), 1);
202 close (OUTFILE);
203
204 $OID = $self->_calc_OID ($tmp_filename);
205 &util::rm ($tmp_filename);
206 }
207 }
208
209 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
210}
211
212# returns the OID for this document
213sub get_OID {
214 my $self = shift (@_);
215 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
216 return $OID if (defined $OID);
217 return "NULL";
218}
219
220sub delete_OID {
221 my $self = shift (@_);
222
223 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
224}
225
226
227# methods for manipulating section names
228
229# returns the name of the top-most section (the top
230# level of the document
231sub get_top_section {
232 my $self = shift (@_);
233
234 return "";
235}
236
237# returns a section
238sub get_parent_section {
239 my $self = shift (@_);
240 my ($section) = @_;
241
242 $section =~ s/(^|\.)\d+$//;
243
244 return $section;
245}
246
247# returns the first child section (or the end child
248# if there isn't any)
249sub get_begin_child {
250 my $self = shift (@_);
251 my ($section) = @_;
252
253 my $section_ptr = $self->_lookup_section($section);
254 return "" unless defined $section_ptr;
255
256 if (defined $section_ptr->{'subsection_order'}->[0]) {
257 return "$section.$section_ptr->{'subsection_order'}->[0]";
258 }
259
260 return $self->get_end_child ($section);
261}
262
263# returns the next child of a parent section
264sub get_next_child {
265 my $self = shift (@_);
266 my ($section) = @_;
267
268 my $parent_section = $self->get_parent_section($section);
269 my $parent_section_ptr = $self->_lookup_section($parent_section);
270 return undef unless defined $parent_section_ptr;
271
272 my ($section_num) = $section =~ /(\d+)$/;
273 return undef unless defined $section_num;
274
275 my $i = 0;
276 my $section_order = $parent_section_ptr->{'subsection_order'};
277 while ($i < scalar(@$section_order)) {
278 last if $section_order->[$i] eq $section_num;
279 $i++;
280 }
281
282 $i++; # the next child
283 if ($i < scalar(@$section_order)) {
284 return $section_order->[$i] if $parent_section eq "";
285 return "$parent_section.$section_order->[$i]";
286 }
287
288 # no more sections in this level
289 return undef;
290}
291
292# returns a reference to a list of children
293sub get_children {
294 my $self = shift (@_);
295 my ($section) = @_;
296
297 my $section_ptr = $self->_lookup_section($section);
298 return [] unless defined $section_ptr;
299
300 my @children = @{$section_ptr->{'subsection_order'}};
301
302 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
303 return \@children;
304}
305
306# returns the child section one past the last one (which
307# is coded as "0")
308sub get_end_child {
309 my $self = shift (@_);
310 my ($section) = @_;
311
312 return $section . ".0" unless $section eq "";
313 return "0";
314}
315
316# returns the next section in book order
317sub get_next_section {
318 my $self = shift (@_);
319 my ($section) = @_;
320
321 return undef unless defined $section;
322
323 my $section_ptr = $self->_lookup_section($section);
324 return undef unless defined $section_ptr;
325
326 # first try to find first child
327 if (defined $section_ptr->{'subsection_order'}->[0]) {
328 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
329 return "$section.$section_ptr->{'subsection_order'}->[0]";
330 }
331
332 do {
333 # try to find sibling
334 my $next_child = $self->get_next_child ($section);
335 return $next_child if (defined $next_child);
336
337 # move up one level
338 $section = $self->get_parent_section ($section);
339 } while $section =~ /\d/;
340
341 return undef;
342}
343
344sub is_leaf_section {
345 my $self = shift (@_);
346 my ($section) = @_;
347
348 my $section_ptr = $self->_lookup_section($section);
349 return 1 unless defined $section_ptr;
350
351 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
352}
353
354# methods for dealing with sections
355
356# returns the name of the inserted section
357sub insert_section {
358 my $self = shift (@_);
359 my ($before_section) = @_;
360
361 # get the child to insert before and its parent section
362 my $parent_section = "";
363 my $before_child = "0";
364 my @before_section = split (/\./, $before_section);
365 if (scalar(@before_section) > 0) {
366 $before_child = pop (@before_section);
367 $parent_section = join (".", @before_section);
368 }
369
370 my $parent_section_ptr = $self->_lookup_section($parent_section);
371 if (!defined $parent_section_ptr) {
372 print STDERR "doc::insert_section couldn't find parent section " .
373 "$parent_section\n";
374 return;
375 }
376
377 # get the next section number
378 my $section_num = $parent_section_ptr->{'next_subsection'}++;
379
380 my $i = 0;
381 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
382 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
383 $i++;
384 }
385
386 # insert the section number into the order list
387 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
388
389 # add this section to the parent section
390 my $section_ptr = {'subsection_order'=>[],
391 'next_subsection'=>1,
392 'subsections'=>{},
393 'metadata'=>[],
394 'text'=>""};
395 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
396
397 # work out the full section number
398 my $section = $parent_section;
399 $section .= "." unless $section eq "";
400 $section .= $section_num;
401
402 return $section;
403}
404
405# creates a pre-named section
406sub create_named_section {
407 my $self = shift (@_);
408 my ($mastersection) = @_;
409
410 my ($num);
411 my $section = $mastersection;
412 my $sectionref = $self;
413
414 while ($section ne "") {
415 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
416 $num =~ s/^0+(\d)/$1/; # remove leading 0s
417 $section = "" unless defined $section;
418
419 if (defined $num) {
420 if (!defined $sectionref->{'subsections'}->{$num}) {
421 push (@{$sectionref->{'subsection_order'}}, $num);
422 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
423 'next_subsection'=>1,
424 'subsections'=>{},
425 'metadata'=>[],
426 'text'=>""};
427 if ($num >= $sectionref->{'next_subsection'}) {
428 $sectionref->{'next_subsection'} = $num + 1;
429 }
430 }
431 $sectionref = $sectionref->{'subsections'}->{$num};
432
433 } else {
434 print STDERR "doc::create_named_section couldn't create section " .
435 "$mastersection\n";
436 last;
437 }
438 }
439}
440
441# returns a reference to a list of subsections
442sub list_subsections {
443 my $self = shift (@_);
444 my ($section) = @_;
445
446 my $section_ptr = $self->_lookup_section ($section);
447 if (!defined $section_ptr) {
448 print STDERR "doc::list_subsections couldn't find section $section\n";
449 return [];
450 }
451
452 return [@{$section_ptr->{'subsection_order'}}];
453}
454
455sub delete_section {
456 my $self = shift (@_);
457 my ($section) = @_;
458
459# my $section_ptr = {'subsection_order'=>[],
460# 'next_subsection'=>1,
461# 'subsections'=>{},
462# 'metadata'=>[],
463# 'text'=>""};
464
465 # if this is the top section reset everything
466 if ($section eq "") {
467 $self->{'subsection_order'} = [];
468 $self->{'subsections'} = {};
469 $self->{'metadata'} = [];
470 $self->{'text'} = "";
471 return;
472 }
473
474 # find the parent of the section to delete
475 my $parent_section = "";
476 my $child = "0";
477 my @section = split (".", $section);
478 if (scalar(@section) > 0) {
479 $child = pop (@section);
480 $parent_section = join (".", @section);
481 }
482
483 my $parent_section_ptr = $self->_lookup_section($parent_section);
484 if (!defined $parent_section_ptr) {
485 print STDERR "doc::delete_section couldn't find parent section " .
486 "$parent_section\n";
487 return;
488 }
489
490 # remove this section from the subsection_order list
491 my $i = 0;
492 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
493 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
494 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
495 last;
496 }
497 $i++;
498 }
499
500 # remove this section from the subsection hash
501 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
502 undef $parent_section_ptr->{'subsections'}->{$child};
503 }
504}
505
506
507# methods for dealing with metadata
508
509# set_metadata_element and get_metadata_element are for metadata
510# which should only have one value. add_meta_data and get_metadata
511# are for metadata which can have more than one value.
512
513# set_metadata_element assumes the value is in (extended) ascii form.
514# For text which hash been already converted to the UTF-8 format use
515# set_utf8_metadata_element.
516sub set_metadata_element {
517 my $self = shift (@_);
518 my ($section, $field, $value) = @_;
519
520 $self->set_utf8_metadata_element ($section, $field,
521 &unicode::ascii2utf8($value));
522}
523
524# set_utf8_metadata_element assumes the text has already been
525# converted to the UTF-8 encoding.
526sub set_utf8_metadata_element {
527 my $self = shift (@_);
528 my ($section, $field, $value) = @_;
529
530 $self->delete_metadata ($section, $field);
531 $self->add_utf8_metadata ($section, $field, $value);
532}
533
534# returns the first metadata value which matches field
535sub get_metadata_element {
536 my $self = shift (@_);
537 my ($section, $field) = @_;
538 my ($data);
539
540 my $section_ptr = $self->_lookup_section($section);
541 if (!defined $section_ptr) {
542 print STDERR "doc::get_metadata_element couldn't find section " .
543 "$section\n";
544 return;
545 }
546
547 foreach $data (@{$section_ptr->{'metadata'}}) {
548 return $data->[1] if (scalar(@$data) >= 2 && $data->[0] eq $field);
549 }
550
551 return undef; # was not found
552}
553
554# add_metadata assumes the text is in (extended) ascii form. For
555# text which hash been already converted to the UTF-8 format use
556# add_utf8_metadata.
557sub add_metadata {
558 my $self = shift (@_);
559 my ($section, $field, $value) = @_;
560
561 $self->add_utf8_metadata ($section, $field,
562 &unicode::ascii2utf8($value));
563}
564
565# add_utf8_metadata assumes the text has already been converted
566# to the UTF-8 encoding.
567sub add_utf8_metadata {
568 my $self = shift (@_);
569 my ($section, $field, $value) = @_;
570
571 my $section_ptr = $self->_lookup_section($section);
572 if (!defined $section_ptr) {
573 print STDERR "doc::add_utf8_metadata couldn't find section " .
574 "$section\n";
575 return;
576 }
577
578 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
579}
580
581# returns a list of the form [value1, value2, ...]
582sub get_metadata {
583 my $self = shift (@_);
584 my ($section, $field) = @_;
585 my ($data);
586
587 my $section_ptr = $self->_lookup_section($section);
588 if (!defined $section_ptr) {
589 print STDERR "doc::get_metadata couldn't find section " .
590 "$section\n";
591 return;
592 }
593
594 my @metadata = ();
595 foreach $data (@{$section_ptr->{'metadata'}}) {
596 push (@metadata, $data->[1]) if ($data->[0] eq $field);
597 }
598
599 return \@metadata;
600}
601
602# returns a list of the form [[field,value],[field,value],...]
603sub get_all_metadata {
604 my $self = shift (@_);
605 my ($section) = @_;
606
607 my $section_ptr = $self->_lookup_section($section);
608 if (!defined $section_ptr) {
609 print STDERR "doc::get_all_metadata couldn't find section " .
610 "$section\n";
611 return;
612 }
613
614 return $section_ptr->{'metadata'};
615}
616
617# $value is optional
618sub delete_metadata {
619 my $self = shift (@_);
620 my ($section, $field, $value) = @_;
621
622 my $section_ptr = $self->_lookup_section($section);
623 if (!defined $section_ptr) {
624 print STDERR "doc::delete_metadata couldn't find section " .
625 "$section\n";
626 return;
627 }
628
629 my $i = 0;
630 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
631 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
632 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
633 splice (@{$section_ptr->{'metadata'}}, $i, 1);
634 } else {
635 $i++;
636 }
637 }
638}
639
640sub delete_all_metadata {
641 my $self = shift (@_);
642 my ($section) = @_;
643
644 my $section_ptr = $self->_lookup_section($section);
645 if (!defined $section_ptr) {
646 print STDERR "doc::delete_all_metadata couldn't find section " .
647 "$section\n";
648 return;
649 }
650
651 $section_ptr->{'metadata'} = [];
652}
653
654
655# methods for dealing with associated files
656
657sub associate_file {
658 my $self = shift (@_);
659 my ($real_filename, $assoc_filename, $mime_type) = @_;
660 $mime_type = &html::guess_mime_type ($real_filename) unless defined $mime_type;
661
662 # remove all associated files with the same name
663 $self->delete_assoc_file ($assoc_filename);
664
665 push (@{$self->{'associated_files'}},
666 [$real_filename, $assoc_filename, $mime_type]);
667}
668
669# returns a list of associated files in the form
670# [[real_filename, assoc_filename, mimetype], ...]
671sub get_assoc_files {
672 my $self = shift (@_);
673
674 return $self->{'associated_files'};
675}
676
677sub delete_assoc_file {
678 my $self = shift (@_);
679 my ($assoc_filename) = @_;
680
681 my $i=0;
682 while ($i < scalar (@{$self->{'associated_files'}})) {
683 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
684 splice (@{$self->{'associated_files'}}, $i, 1);
685 } else {
686 $i++;
687 }
688 }
689}
690
691
692# methods for dealing with text
693
694# add_text assumes the text is in (extended) ascii form. For
695# text which has been already converted to the UTF-8 format
696# use add_utf8_text.
697sub add_text {
698 my $self = shift (@_);
699 my ($section, $text) = @_;
700
701 # convert the text to UTF-8 encoded unicode characters
702 # and add the text
703 $self->add_utf8_text($section, &unicode::ascii2utf8($text));
704}
705
706
707# add_utf8_text assumes the text to be added has already
708# been converted to the UTF-8 encoding. For ascii text use
709# add_text
710sub add_utf8_text {
711 my $self = shift (@_);
712 my ($section, $text) = @_;
713
714 my $section_ptr = $self->_lookup_section($section);
715 if (!defined $section_ptr) {
716 print STDERR "doc::add_utf8_text couldn't find section " .
717 "$section\n";
718 return;
719 }
720
721 $section_ptr->{'text'} .= $text;
722}
723
724# returns the text for a section
725sub get_text {
726 my $self = shift (@_);
727 my ($section) = @_;
728
729 my $section_ptr = $self->_lookup_section($section);
730 if (!defined $section_ptr) {
731 print STDERR "doc::get_text couldn't find section " .
732 "$section\n";
733 return "";
734 }
735
736 return $section_ptr->{'text'};
737}
738
739# returns the (utf-8 encoded) length of the text for a section
740sub get_text_length {
741 my $self = shift (@_);
742 my ($section) = @_;
743
744 my $section_ptr = $self->_lookup_section($section);
745 if (!defined $section_ptr) {
746 print STDERR "doc::get_text_length couldn't find section " .
747 "$section\n";
748 return 0;
749 }
750
751 return length ($section_ptr->{'text'});
752}
753
754sub delete_text {
755 my $self = shift (@_);
756 my ($section) = @_;
757
758 my $section_ptr = $self->_lookup_section($section);
759 if (!defined $section_ptr) {
760 print STDERR "doc::delete_text couldn't find section " .
761 "$section\n";
762 return;
763 }
764
765 $section_ptr->{'text'} = "";
766}
767
768
7691;
Note: See TracBrowser for help on using the repository browser.