source: trunk/gsdl/perllib/doc.pm@ 246

Last change on this file since 246 was 215, checked in by rjmcnab, 25 years ago

Added code to build a new gdbm format.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.6 KB
Line 
1# class to hold documents
2
3package doc;
4
5use unicode;
6use util;
7use html;
8
9# the document type may be indexed_doc, nonindexed_doc, or
10# classification
11
12sub new {
13 my $class = shift (@_);
14 my ($source_filename, $doc_type) = @_;
15
16 my $self = bless {'associated_files'=>[],
17 'subsection_order'=>[],
18 'next_subsection'=>1,
19 'subsections'=>{},
20 'metadata'=>[],
21 'text'=>""}, $class;
22
23 $self->set_source_filename ($source_filename) if defined $source_filename;
24 $self->set_doc_type ($doc_type) if defined $doc_type;
25
26 return $self;
27}
28
29sub set_source_filename {
30 my $self = shift (@_);
31 my ($source_filename) = @_;
32
33 $self->set_metadata_element ($self->get_top_section(),
34 "gsdlsourcefilename",
35 $source_filename);
36}
37
38# returns the source_filename as it was provided
39sub get_source_filename {
40 my $self = shift (@_);
41
42 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
43}
44
45sub set_doc_type {
46 my $self = shift (@_);
47 my ($doc_type) = @_;
48
49 $self->set_metadata_element ($self->get_top_section(),
50 "gsdldoctype",
51 $doc_type);
52}
53
54# returns the source_filename as it was provided
55# the default of "indexed_doc" is used if no document
56# type was provided
57sub get_doc_type {
58 my $self = shift (@_);
59
60 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
61 return $doc_type if (defined $doc_type);
62 return "indexed_doc";
63}
64
65sub _escape_text {
66 my ($text) = @_;
67
68 # special characters in the gml encoding
69 $text =~ s/&/&/g; # this has to be first...
70 $text =~ s/</&lt;/g;
71 $text =~ s/>/&gt;/g;
72 $text =~ s/\"/&quot;/g;
73
74 return $text;
75}
76
77sub output_section {
78 my $self = shift (@_);
79 my ($handle, $section, $suppress_subject_info) = @_;
80 $suppress_subject_info = 0 unless defined $suppress_subject_info;
81 my ($data, $subsection);
82
83 my $section_ptr = $self->_lookup_section ($section);
84 my ($section_num) = $section =~ /(\d+)$/;
85
86 return unless defined $section_ptr;
87
88 # output the section header (including the section number
89 # and metadata)
90 print $handle "<gsdlsection";
91 print $handle " gsdlnum=\"$section_num\"" if defined $section_num;
92 foreach $data (@{$section_ptr->{'metadata'}}) {
93 print $handle " $data->[0]=\"" . &_escape_text($data->[1]) . "\""
94 unless $suppress_subject_info && $data->[0] eq "Subject";
95 }
96 print $handle ">";
97
98 # output the text
99 print $handle &_escape_text($section_ptr->{'text'});
100
101 # output all the subsections
102 foreach $subsection (@{$section_ptr->{'subsection_order'}}) {
103 $self->output_section($handle, "$section.$subsection", $suppress_subject_info);
104 }
105
106 # output the closing tag
107 print $handle "</gsdlsection>\n";
108}
109
110# look up the reference to the a particular section
111sub _lookup_section {
112 my $self = shift (@_);
113 my ($section) = @_;
114
115 my ($num);
116 my $sectionref = $self;
117
118 while (defined $section && $section ne "") {
119 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
120 $num =~ s/^0+(\d)/$1/; # remove leading 0s
121 $section = "" unless defined $section;
122
123 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
124 $sectionref = $sectionref->{'subsections'}->{$num};
125 } else {
126 return undef;
127 }
128 }
129
130 return $sectionref;
131}
132
133sub _calc_OID {
134 my $self = shift (@_);
135 my ($filename) = @_;
136
137 my $osexe = &util::get_os_exe();
138
139 my $hashfile_exe = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/hashfile$osexe";
140 my $result = "NULL";
141
142 if (-e "$hashfile_exe") {
143 $result = `$hashfile_exe \"$filename\"`;
144 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
145
146 } else {
147 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
148 }
149
150 return "HASH$result";
151}
152
153# methods dealing with OID, not groups of them.
154
155# if $OID is not provided one is calculated from hashing the
156# current contents of the document
157# An OID are actually stored as metadata of the document
158sub set_OID {
159 my $self = shift (@_);
160 my ($OID) = @_;
161
162 # if an OID wasn't provided then feed this document to
163 # hashfile.exe
164 if (!defined $OID) {
165 $OID = "NULL";
166 my $tmp_filename = &util::get_tmp_filename();
167 if (!open (OUTFILE, ">$tmp_filename")) {
168 print STDERR "doc::set_OID could not write to $tmp_filename\n";
169 } else {
170 $self->output_section('OUTFILE', $self->get_top_section(), 1);
171 close (OUTFILE);
172
173 $OID = $self->_calc_OID ($tmp_filename);
174 &util::rm ($tmp_filename);
175 }
176 }
177
178 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
179}
180
181# returns the OID for this document
182sub get_OID {
183 my $self = shift (@_);
184 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
185 return $OID if (defined $OID);
186 return "NULL";
187}
188
189sub delete_OID {
190 my $self = shift (@_);
191
192 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
193}
194
195
196# methods for manipulating section names
197
198# returns the name of the top-most section (the top
199# level of the document
200sub get_top_section {
201 my $self = shift (@_);
202
203 return "";
204}
205
206# returns a section
207sub get_parent_section {
208 my $self = shift (@_);
209 my ($section) = @_;
210
211 $section =~ s/(^|\.)\d+$//;
212
213 return $section;
214}
215
216# returns the first child section (or the end child
217# if there isn't any)
218sub get_begin_child {
219 my $self = shift (@_);
220 my ($section) = @_;
221
222 my $section_ptr = $self->_lookup_section($section);
223 return "" unless defined $section_ptr;
224
225 if (defined $section_ptr->{'subsection_order'}->[0]) {
226 return "$section.$section_ptr->{'subsection_order'}->[0]";
227 }
228
229 return $self->get_end_child ($section);
230}
231
232# returns the next child of a parent section
233sub get_next_child {
234 my $self = shift (@_);
235 my ($section) = @_;
236
237 my $parent_section = $self->get_parent_section($section);
238 my $parent_section_ptr = $self->_lookup_section($parent_section);
239 return undef unless defined $parent_section_ptr;
240
241 my ($section_num) = $section =~ /(\d+)$/;
242 return undef unless defined $section_num;
243
244 my $i = 0;
245 my $section_order = $parent_section_ptr->{'subsection_order'};
246 while ($i < scalar(@$section_order)) {
247 last if $section_order->[$i] eq $section_num;
248 $i++;
249 }
250
251 $i++; # the next child
252 if ($i < scalar(@$section_order)) {
253 return $section_order->[$i] if $parent_section eq "";
254 return "$parent_section.$section_order->[$i]";
255 }
256
257 # no more sections in this level
258 return undef;
259}
260
261# returns a reference to a list of children
262sub get_children {
263 my $self = shift (@_);
264 my ($section) = @_;
265
266 my $section_ptr = $self->_lookup_section($section);
267 return [] unless defined $section_ptr;
268
269 my @children = @{$section_ptr->{'subsection_order'}};
270
271 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
272 return \@children;
273}
274
275# returns the child section one past the last one (which
276# is coded as "0")
277sub get_end_child {
278 my $self = shift (@_);
279 my ($section) = @_;
280
281 return $section . ".0" unless $section eq "";
282 return "0";
283}
284
285# returns the next section in book order
286sub get_next_section {
287 my $self = shift (@_);
288 my ($section) = @_;
289
290 return undef unless defined $section;
291
292 my $section_ptr = $self->_lookup_section($section);
293 return undef unless defined $section_ptr;
294
295 # first try to find first child
296 if (defined $section_ptr->{'subsection_order'}->[0]) {
297 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
298 return "$section.$section_ptr->{'subsection_order'}->[0]";
299 }
300
301 do {
302 # try to find sibling
303 my $next_child = $self->get_next_child ($section);
304 return $next_child if (defined $next_child);
305
306 # move up one level
307 $section = $self->get_parent_section ($section);
308 } while $section =~ /\d/;
309
310 return undef;
311}
312
313sub is_leaf_section {
314 my $self = shift (@_);
315 my ($section) = @_;
316
317 my $section_ptr = $self->_lookup_section($section);
318 return 1 unless defined $section_ptr;
319
320 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
321}
322
323# methods for dealing with sections
324
325# returns the name of the inserted section
326sub insert_section {
327 my $self = shift (@_);
328 my ($before_section) = @_;
329
330 # get the child to insert before and its parent section
331 my $parent_section = "";
332 my $before_child = "0";
333 my @before_section = split (/\./, $before_section);
334 if (scalar(@before_section) > 0) {
335 $before_child = pop (@before_section);
336 $parent_section = join (".", @before_section);
337 }
338
339 my $parent_section_ptr = $self->_lookup_section($parent_section);
340 if (!defined $parent_section_ptr) {
341 print STDERR "doc::insert_section couldn't find parent section " .
342 "$parent_section\n";
343 return;
344 }
345
346 # get the next section number
347 my $section_num = $parent_section_ptr->{'next_subsection'}++;
348
349 my $i = 0;
350 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
351 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
352 $i++;
353 }
354
355 # insert the section number into the order list
356 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
357
358 # add this section to the parent section
359 my $section_ptr = {'subsection_order'=>[],
360 'next_subsection'=>1,
361 'subsections'=>{},
362 'metadata'=>[],
363 'text'=>""};
364 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
365
366 # work out the full section number
367 my $section = $parent_section;
368 $section .= "." unless $section eq "";
369 $section .= $section_num;
370
371 return $section;
372}
373
374# creates a pre-named section
375sub create_named_section {
376 my $self = shift (@_);
377 my ($mastersection) = @_;
378
379 my ($num);
380 my $section = $mastersection;
381 my $sectionref = $self;
382
383 while ($section ne "") {
384 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
385 $num =~ s/^0+(\d)/$1/; # remove leading 0s
386 $section = "" unless defined $section;
387
388 if (defined $num) {
389 if (!defined $sectionref->{'subsections'}->{$num}) {
390 push (@{$sectionref->{'subsection_order'}}, $num);
391 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
392 'next_subsection'=>1,
393 'subsections'=>{},
394 'metadata'=>[],
395 'text'=>""};
396 if ($num >= $sectionref->{'next_subsection'}) {
397 $sectionref->{'next_subsection'} = $num + 1;
398 }
399 }
400 $sectionref = $sectionref->{'subsections'}->{$num};
401
402 } else {
403 print STDERR "doc::create_named_section couldn't create section " .
404 "$mastersection\n";
405 last;
406 }
407 }
408}
409
410# returns a reference to a list of subsections
411sub list_subsections {
412 my $self = shift (@_);
413 my ($section) = @_;
414
415 my $section_ptr = $self->_lookup_section ($section);
416 if (!defined $section_ptr) {
417 print STDERR "doc::list_subsections couldn't find section $section\n";
418 return [];
419 }
420
421 return [@{$section_ptr->{'subsection_order'}}];
422}
423
424sub delete_section {
425 my $self = shift (@_);
426 my ($section) = @_;
427
428# my $section_ptr = {'subsection_order'=>[],
429# 'next_subsection'=>1,
430# 'subsections'=>{},
431# 'metadata'=>[],
432# 'text'=>""};
433
434 # if this is the top section reset everything
435 if ($section eq "") {
436 $self->{'subsection_order'} = [];
437 $self->{'subsections'} = {};
438 $self->{'metadata'} = [];
439 $self->{'text'} = "";
440 return;
441 }
442
443 # find the parent of the section to delete
444 my $parent_section = "";
445 my $child = "0";
446 my @section = split (".", $section);
447 if (scalar(@section) > 0) {
448 $child = pop (@section);
449 $parent_section = join (".", @section);
450 }
451
452 my $parent_section_ptr = $self->_lookup_section($parent_section);
453 if (!defined $parent_section_ptr) {
454 print STDERR "doc::delete_section couldn't find parent section " .
455 "$parent_section\n";
456 return;
457 }
458
459 # remove this section from the subsection_order list
460 my $i = 0;
461 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
462 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
463 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
464 last;
465 }
466 $i++;
467 }
468
469 # remove this section from the subsection hash
470 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
471 undef $parent_section_ptr->{'subsections'}->{$child};
472 }
473}
474
475
476# methods for dealing with metadata
477
478# set_metadata_element and get_metadata_element are for metadata
479# which should only have one value. add_meta_data and get_metadata
480# are for metadata which can have more than one value.
481
482# set_metadata_element assumes the value is in (extended) ascii form.
483# For text which hash been already converted to the UTF-8 format use
484# set_utf8_metadata_element.
485sub set_metadata_element {
486 my $self = shift (@_);
487 my ($section, $field, $value) = @_;
488
489 $self->set_utf8_metadata_element ($section, $field,
490 &unicode::ascii2utf8($value));
491}
492
493# set_utf8_metadata_element assumes the text has already been
494# converted to the UTF-8 encoding.
495sub set_utf8_metadata_element {
496 my $self = shift (@_);
497 my ($section, $field, $value) = @_;
498
499 $self->delete_metadata ($section, $field);
500 $self->add_utf8_metadata ($section, $field, $value);
501}
502
503# returns the first metadata value which matches field
504sub get_metadata_element {
505 my $self = shift (@_);
506 my ($section, $field) = @_;
507 my ($data);
508
509 my $section_ptr = $self->_lookup_section($section);
510 if (!defined $section_ptr) {
511 print STDERR "doc::get_metadata_element couldn't find section " .
512 "$section\n";
513 return;
514 }
515
516 foreach $data (@{$section_ptr->{'metadata'}}) {
517 return $data->[1] if (scalar(@$data) >= 2 && $data->[0] eq $field);
518 }
519
520 return undef; # was not found
521}
522
523# add_metadata assumes the text is in (extended) ascii form. For
524# text which hash been already converted to the UTF-8 format use
525# add_utf8_metadata.
526sub add_metadata {
527 my $self = shift (@_);
528 my ($section, $field, $value) = @_;
529
530 $self->add_utf8_metadata ($section, $field,
531 &unicode::ascii2utf8($value));
532}
533
534# add_utf8_metadata assumes the text has already been converted
535# to the UTF-8 encoding.
536sub add_utf8_metadata {
537 my $self = shift (@_);
538 my ($section, $field, $value) = @_;
539
540 my $section_ptr = $self->_lookup_section($section);
541 if (!defined $section_ptr) {
542 print STDERR "doc::add_utf8_metadata couldn't find section " .
543 "$section\n";
544 return;
545 }
546
547 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
548}
549
550# returns a list of the form [value1, value2, ...]
551sub get_metadata {
552 my $self = shift (@_);
553 my ($section, $field) = @_;
554 my ($data);
555
556 my $section_ptr = $self->_lookup_section($section);
557 if (!defined $section_ptr) {
558 print STDERR "doc::get_metadata couldn't find section " .
559 "$section\n";
560 return;
561 }
562
563 my @metadata = ();
564 foreach $data (@{$section_ptr->{'metadata'}}) {
565 push (@metadata, $data->[1]) if ($data->[0] eq $field);
566 }
567
568 return \@metadata;
569}
570
571# returns a list of the form [[field,value],[field,value],...]
572sub get_all_metadata {
573 my $self = shift (@_);
574 my ($section) = @_;
575
576 my $section_ptr = $self->_lookup_section($section);
577 if (!defined $section_ptr) {
578 print STDERR "doc::get_all_metadata couldn't find section " .
579 "$section\n";
580 return;
581 }
582
583 return $section_ptr->{'metadata'};
584}
585
586# $value is optional
587sub delete_metadata {
588 my $self = shift (@_);
589 my ($section, $field, $value) = @_;
590
591 my $section_ptr = $self->_lookup_section($section);
592 if (!defined $section_ptr) {
593 print STDERR "doc::delete_metadata couldn't find section " .
594 "$section\n";
595 return;
596 }
597
598 my $i = 0;
599 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
600 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
601 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
602 splice (@{$section_ptr->{'metadata'}}, $i, 1);
603 } else {
604 $i++;
605 }
606 }
607}
608
609sub delete_all_metadata {
610 my $self = shift (@_);
611 my ($section) = @_;
612
613 my $section_ptr = $self->_lookup_section($section);
614 if (!defined $section_ptr) {
615 print STDERR "doc::delete_all_metadata couldn't find section " .
616 "$section\n";
617 return;
618 }
619
620 $section_ptr->{'metadata'} = [];
621}
622
623
624# methods for dealing with associated files
625
626sub associate_file {
627 my $self = shift (@_);
628 my ($real_filename, $assoc_filename, $mime_type) = @_;
629 $mime_type = &html::guess_mime_type ($real_filename) unless defined $mime_type;
630
631 # remove all associated files with the same name
632 $self->delete_assoc_file ($assoc_filename);
633
634 push (@{$self->{'associated_files'}},
635 [$real_filename, $assoc_filename, $mime_type]);
636}
637
638# returns a list of associated files in the form
639# [[real_filename, assoc_filename, mimetype], ...]
640sub get_assoc_files {
641 my $self = shift (@_);
642
643 return $self->{'associated_files'};
644}
645
646sub delete_assoc_file {
647 my $self = shift (@_);
648 my ($assoc_filename) = @_;
649
650 my $i=0;
651 while ($i < scalar (@{$self->{'associated_files'}})) {
652 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
653 splice (@{$self->{'associated_files'}}, $i, 1);
654 } else {
655 $i++;
656 }
657 }
658}
659
660
661# methods for dealing with text
662
663# add_text assumes the text is in (extended) ascii form. For
664# text which has been already converted to the UTF-8 format
665# use add_utf8_text.
666sub add_text {
667 my $self = shift (@_);
668 my ($section, $text) = @_;
669
670 # convert the text to UTF-8 encoded unicode characters
671 # and add the text
672 $self->add_utf8_text($section, &unicode::ascii2utf8($text));
673}
674
675
676# add_utf8_text assumes the text to be added has already
677# been converted to the UTF-8 encoding. For ascii text use
678# add_text
679sub add_utf8_text {
680 my $self = shift (@_);
681 my ($section, $text) = @_;
682
683 my $section_ptr = $self->_lookup_section($section);
684 if (!defined $section_ptr) {
685 print STDERR "doc::add_utf8_text couldn't find section " .
686 "$section\n";
687 return;
688 }
689
690 $section_ptr->{'text'} .= $text;
691}
692
693# returns the text for a section
694sub get_text {
695 my $self = shift (@_);
696 my ($section) = @_;
697
698 my $section_ptr = $self->_lookup_section($section);
699 if (!defined $section_ptr) {
700 print STDERR "doc::get_text couldn't find section " .
701 "$section\n";
702 return "";
703 }
704
705 return $section_ptr->{'text'};
706}
707
708# returns the (utf-8 encoded) length of the text for a section
709sub get_text_length {
710 my $self = shift (@_);
711 my ($section) = @_;
712
713 my $section_ptr = $self->_lookup_section($section);
714 if (!defined $section_ptr) {
715 print STDERR "doc::get_text_length couldn't find section " .
716 "$section\n";
717 return 0;
718 }
719
720 return length ($section_ptr->{'text'});
721}
722
723sub delete_text {
724 my $self = shift (@_);
725 my ($section) = @_;
726
727 my $section_ptr = $self->_lookup_section($section);
728 if (!defined $section_ptr) {
729 print STDERR "doc::delete_text couldn't find section " .
730 "$section\n";
731 return;
732 }
733
734 $section_ptr->{'text'} = "";
735}
736
737
7381;
Note: See TracBrowser for help on using the repository browser.