source: trunk/gsdl/perllib/doc.pm@ 75

Last change on this file since 75 was 74, checked in by rjmcnab, 26 years ago

Hack to make things work as they used to (until I have finished
the rest of the software.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.5 KB
Line 
1# class to hold documents
2
3package doc;
4
5use unicode;
6use util;
7use html;
8
9# the document type may be indexed_doc, nonindexed_doc, or
10# classification
11
12sub new {
13 my $class = shift (@_);
14 my ($source_filename, $doc_type) = @_;
15
16 my $self = bless {'associated_files'=>[],
17 'subsection_order'=>[],
18 'next_subsection'=>1,
19 'subsections'=>{},
20 'metadata'=>[],
21 'text'=>""}, $class;
22
23 $self->set_source_filename ($source_filename) if defined $source_filename;
24 $self->set_doc_type ($doc_type) if defined $doc_type;
25
26 return $self;
27}
28
29sub set_source_filename {
30 my $self = shift (@_);
31 my ($source_filename) = @_;
32
33 $self->set_metadata_element ($self->get_top_section(),
34 "gsdlsourcefilename",
35 $source_filename);
36}
37
38# returns the source_filename as it was provided
39sub get_source_filename {
40 my $self = shift (@_);
41
42 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
43}
44
45sub set_doc_type {
46 my $self = shift (@_);
47 my ($doc_type) = @_;
48
49 $self->set_metadata_element ($self->get_top_section(),
50 "gsdldoctype",
51 $doc_type);
52}
53
54# returns the source_filename as it was provided
55sub get_doc_type {
56 my $self = shift (@_);
57
58 return $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
59}
60
61sub output_section {
62 my $self = shift (@_);
63 my ($handle, $section, $suppress_subject_info) = @_;
64 $suppress_subject_info = 0 unless defined $suppress_subject_info;
65 my ($data, $subsection);
66
67 my $section_ptr = $self->_lookup_section ($section);
68 my ($section_num) = $section =~ /(\d+)$/;
69
70 return unless defined $section_ptr;
71
72 # output the section header (including the section number
73 # and metadata)
74 print $handle "<gsdlsection";
75 print $handle " gsdlnum=\"$section_num\"" if defined $section_num;
76 foreach $data (@{$section_ptr->{'metadata'}}) {
77 print $handle " $data->[0]=\"$data->[1]\""
78 unless $suppress_subject_info && $data->[0] eq "Subject";
79 }
80 print $handle ">";
81
82 # output the text
83 print $handle $section_ptr->{'text'};
84
85 # output all the subsections
86 foreach $subsection (@{$section_ptr->{'subsection_order'}}) {
87 $self->output_section($handle, "$section.$subsection", $suppress_subject_info);
88 }
89
90 # output the closing tag
91 print $handle "</gsdlsection>\n";
92}
93
94# look up the reference to the a particular section
95sub _lookup_section {
96 my $self = shift (@_);
97 my ($section) = @_;
98
99 my ($num);
100 my $sectionref = $self;
101
102 while ($section ne "") {
103 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
104 $num =~ s/^0+(\d)/$1/; # remove leading 0s
105 $section = "" unless defined $section;
106
107 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
108 $sectionref = $sectionref->{'subsections'}->{$num};
109 } else {
110 return undef;
111 }
112 }
113
114 return $sectionref;
115}
116
117sub _calc_OID {
118 my $self = shift (@_);
119 my ($filename) = @_;
120
121 my $osexe = &util::get_os_exe();
122
123 my $hashfile_exe = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/hashfile$osexe";
124 my $result = "NULL";
125
126 if (-e "$hashfile_exe") {
127 $result = `$hashfile_exe \"$filename\"`;
128 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
129
130 } else {
131 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
132 }
133
134 return "HASH$result";
135}
136
137# methods dealing with OID, not groups of them.
138
139# if $OID is not provided one is calculated from hashing the
140# current contents of the document
141# An OID are actually stored as metadata of the document
142sub set_OID {
143 my $self = shift (@_);
144 my ($OID) = @_;
145
146 # if an OID wasn't provided then feed this document to
147 # hashfile.exe
148 if (!defined $OID) {
149 $OID = "NULL";
150 my $tmp_filename = &util::get_tmp_filename();
151 if (!open (OUTFILE, ">$tmp_filename")) {
152 print STDERR "doc::set_OID could not write to $tmp_filename\n";
153 } else {
154 $self->output_section('OUTFILE', $self->get_top_section(), 1);
155 close (OUTFILE);
156
157 $OID = $self->_calc_OID ($tmp_filename);
158 &util::rm ($tmp_filename);
159 }
160 }
161
162 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
163}
164
165# returns the OID for this document
166sub get_OID {
167 my $self = shift (@_);
168
169 return $self->get_metadata_element ($self->get_top_section(), "Identifier");
170}
171
172sub delete_OID {
173 my $self = shift (@_);
174
175 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
176}
177
178
179# methods for manipulating section names
180
181# returns the name of the top-most section (the top
182# level of the document
183sub get_top_section {
184 my $self = shift (@_);
185
186 return "";
187}
188
189# returns a section
190sub get_parent_section {
191 my $self = shift (@_);
192 my ($section) = @_;
193
194 $section =~ s/(^|\.)\d+$//;
195
196 return $section;
197}
198
199# returns the first child section (or the end child
200# if there isn't any)
201sub get_begin_child {
202 my $self = shift (@_);
203 my ($section) = @_;
204
205 my $section_ptr = $self->_lookup_section($section);
206 return "" unless defined $section_ptr;
207
208 if (defined $section_ptr->{'subsection_order'}->[0]) {
209 return "$section.$section_ptr->{'subsection_order'}->[0]";
210 }
211
212 return $self->get_end_child ($section);
213}
214
215# returns the next child of a parent section
216sub get_next_child {
217 my $self = shift (@_);
218 my ($section) = @_;
219
220 my $parent_section = $self->get_parent_section($section);
221 my $parent_section_ptr = $self->_lookup_section($parent_section);
222 return undef unless defined $parent_section_ptr;
223
224 my ($section_num) = $section =~ /(\d+)$/;
225 return undef unless defined $section_num;
226
227 my $i = 0;
228 my $section_order = $parent_section_ptr->{'subsection_order'};
229 while ($i < scalar(@$section_order)) {
230 last if $section_order->[$i] eq $section_num;
231 $i++;
232 }
233
234 $i++; # the next child
235 if ($i < scalar(@$section_order)) {
236 return $section_order->[$i] if $parent_section eq "";
237 return "$parent_section.$section_order->[$i]";
238 }
239
240 # no more sections in this level
241 return undef;
242}
243
244# returns a reference to a list of children
245
246sub get_children {
247 my $self = shift (@_);
248 my ($section) = @_;
249
250 my $section_ptr = $self->_lookup_section($section);
251 return [] unless defined $section_ptr;
252
253 my @children = @{$section_ptr->{'subsection_order'}};
254
255 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
256 return \@children;
257}
258
259# returns the child section one past the last one (which
260# is coded as "0")
261sub get_end_child {
262 my $self = shift (@_);
263 my ($section) = @_;
264
265 return $section . ".0" unless $section eq "";
266 return "0";
267}
268
269# returns the next section in book order
270sub get_next_section {
271 my $self = shift (@_);
272 my ($section) = @_;
273
274 return undef unless defined $section;
275
276 my $section_ptr = $self->_lookup_section($section);
277 return undef unless defined $section_ptr;
278
279 # first try to find first child
280 if (defined $section_ptr->{'subsection_order'}->[0]) {
281 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
282 return "$section.$section_ptr->{'subsection_order'}->[0]";
283 }
284
285 do {
286 # try to find sibling
287 my $next_child = $self->get_next_child ($section);
288 return $next_child if (defined $next_child);
289
290 # move up one level
291 $section = $self->get_parent_section ($section);
292 } while $section =~ /\d/;
293
294 return undef;
295}
296
297sub is_leaf_section {
298 my $self = shift (@_);
299 my ($section) = @_;
300
301 my $section_ptr = $self->_lookup_section($section);
302 return 1 unless defined $section_ptr;
303
304 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
305}
306
307# methods for dealing with sections
308
309# returns the name of the inserted section
310sub insert_section {
311 my $self = shift (@_);
312 my ($before_section) = @_;
313
314 # get the child to insert before and its parent section
315 my $parent_section = "";
316 my $before_child = "0";
317 my @before_section = split (/\./, $before_section);
318 if (scalar(@before_section) > 0) {
319 $before_child = pop (@before_section);
320 $parent_section = join (".", @before_section);
321 }
322
323 my $parent_section_ptr = $self->_lookup_section($parent_section);
324 if (!defined $parent_section_ptr) {
325 print STDERR "doc::insert_section couldn't find parent section " .
326 "$parent_section\n";
327 return;
328 }
329
330 # get the next section number
331 my $section_num = $parent_section_ptr->{'next_subsection'}++;
332
333 my $i = 0;
334 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
335 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
336 $i++;
337 }
338
339 # insert the section number into the order list
340 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
341
342 # add this section to the parent section
343 my $section_ptr = {'subsection_order'=>[],
344 'next_subsection'=>1,
345 'subsections'=>{},
346 'metadata'=>[],
347 'text'=>""};
348 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
349
350 # work out the full section number
351 my $section = $parent_section;
352 $section .= "." unless $section eq "";
353 $section .= $section_num;
354
355 return $section;
356}
357
358# creates a pre-named section
359sub create_named_section {
360 my $self = shift (@_);
361 my ($mastersection) = @_;
362
363 my ($num);
364 my $section = $mastersection;
365 my $sectionref = $self;
366
367 while ($section ne "") {
368 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
369 $num =~ s/^0+(\d)/$1/; # remove leading 0s
370 $section = "" unless defined $section;
371
372 if (defined $num) {
373 if (!defined $sectionref->{'subsections'}->{$num}) {
374 push (@{$sectionref->{'subsection_order'}}, $num);
375 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
376 'next_subsection'=>1,
377 'subsections'=>{},
378 'metadata'=>[],
379 'text'=>""};
380 if ($num >= $sectionref->{'next_subsection'}) {
381 $sectionref->{'next_subsection'} = $num + 1;
382 }
383 }
384 $sectionref = $sectionref->{'subsections'}->{$num};
385
386 } else {
387 print STDERR "doc::create_named_section couldn't create section " .
388 "$mastersection\n";
389 last;
390 }
391 }
392}
393
394# returns a reference to a list of subsections
395sub list_subsections {
396 my $self = shift (@_);
397 my ($section) = @_;
398
399 my $section_ptr = $self->_lookup_section ($section);
400 if (!defined $section_ptr) {
401 print STDERR "doc::list_subsections couldn't find section $section\n";
402 return [];
403 }
404
405 return [@{$section_ptr->{'subsection_order'}}];
406}
407
408sub delete_section {
409 my $self = shift (@_);
410 my ($section) = @_;
411
412# my $section_ptr = {'subsection_order'=>[],
413# 'next_subsection'=>1,
414# 'subsections'=>{},
415# 'metadata'=>[],
416# 'text'=>""};
417
418 # if this is the top section reset everything
419 if ($section eq "") {
420 $self->{'subsection_order'} = [];
421 $self->{'subsections'} = {};
422 $self->{'metadata'} = [];
423 $self->{'text'} = "";
424 return;
425 }
426
427 # find the parent of the section to delete
428 my $parent_section = "";
429 my $child = "0";
430 my @section = split (".", $section);
431 if (scalar(@section) > 0) {
432 $child = pop (@section);
433 $parent_section = join (".", @section);
434 }
435
436 my $parent_section_ptr = $self->_lookup_section($parent_section);
437 if (!defined $parent_section_ptr) {
438 print STDERR "doc::delete_section couldn't find parent section " .
439 "$parent_section\n";
440 return;
441 }
442
443 # remove this section from the subsection_order list
444 my $i = 0;
445 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
446 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
447 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
448 last;
449 }
450 $i++;
451 }
452
453 # remove this section from the subsection hash
454 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
455 undef $parent_section_ptr->{'subsections'}->{$child};
456 }
457}
458
459
460# methods for dealing with metadata
461
462# set_metadata_element and get_metadata_element are for metadata
463# which should only have one value. add_meta_data and get_metadata
464# are for metadata which can have more than one value.
465
466# set_metadata_element assumes the value is in (extended) ascii form.
467# For text which hash been already converted to the UTF-8 format use
468# set_utf8_metadata_element.
469sub set_metadata_element {
470 my $self = shift (@_);
471 my ($section, $field, $value) = @_;
472
473# hack till the rest of the software has changed to Unicode.
474 $self->set_utf8_metadata_element ($section, $field, $value);
475# $self->set_utf8_metadata_element ($section, $field,
476# &unicode::ascii2utf8($value));
477}
478
479# set_utf8_metadata_element assumes the text has already been
480# converted to the UTF-8 encoding.
481sub set_utf8_metadata_element {
482 my $self = shift (@_);
483 my ($section, $field, $value) = @_;
484
485 $self->delete_metadata ($section, $field);
486 $self->add_utf8_metadata ($section, $field, $value);
487}
488
489# returns the first metadata value which matches field
490sub get_metadata_element {
491 my $self = shift (@_);
492 my ($section, $field) = @_;
493 my ($data);
494
495 my $section_ptr = $self->_lookup_section($section);
496 if (!defined $section_ptr) {
497 print STDERR "doc::get_metadata_element couldn't find section " .
498 "$section\n";
499 return;
500 }
501
502 foreach $data (@{$section_ptr->{'metadata'}}) {
503 return $data->[1] if ($data->[0] eq $field);
504 }
505
506 return undef; # was not found
507}
508
509# add_metadata assumes the text is in (extended) ascii form. For
510# text which hash been already converted to the UTF-8 format use
511# add_utf8_metadata.
512sub add_metadata {
513 my $self = shift (@_);
514 my ($section, $field, $value) = @_;
515
516 # hack until the rest of the software has changed to Unicode.
517 $self->add_utf8_metadata ($section, $field, $value);
518# $self->add_utf8_metadata ($section, $field,
519# &unicode::ascii2utf8($value));
520}
521
522# add_utf8_metadata assumes the text has already been converted
523# to the UTF-8 encoding.
524sub add_utf8_metadata {
525 my $self = shift (@_);
526 my ($section, $field, $value) = @_;
527
528 my $section_ptr = $self->_lookup_section($section);
529 if (!defined $section_ptr) {
530 print STDERR "doc::add_utf8_metadata couldn't find section " .
531 "$section\n";
532 return;
533 }
534
535 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
536}
537
538# returns a list of the form [value1, value2, ...]
539sub get_metadata {
540 my $self = shift (@_);
541 my ($section, $field) = @_;
542 my ($data);
543
544 my $section_ptr = $self->_lookup_section($section);
545 if (!defined $section_ptr) {
546 print STDERR "doc::get_metadata couldn't find section " .
547 "$section\n";
548 return;
549 }
550
551 my @metadata = ();
552 foreach $data (@{$section_ptr->{'metadata'}}) {
553 push (@metadata, $data->[1]) if ($data->[0] eq $field);
554 }
555
556 return \@metadata;
557}
558
559# returns a list of the form [[field,value],[field,value],...]
560sub get_all_metadata {
561 my $self = shift (@_);
562 my ($section) = @_;
563
564 my $section_ptr = $self->_lookup_section($section);
565 if (!defined $section_ptr) {
566 print STDERR "doc::get_all_metadata couldn't find section " .
567 "$section\n";
568 return;
569 }
570
571 return $section_ptr->{'metadata'};
572}
573
574# $value is optional
575sub delete_metadata {
576 my $self = shift (@_);
577 my ($section, $field, $value) = @_;
578
579 my $section_ptr = $self->_lookup_section($section);
580 if (!defined $section_ptr) {
581 print STDERR "doc::delete_metadata couldn't find section " .
582 "$section\n";
583 return;
584 }
585
586 my $i = 0;
587 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
588 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
589 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
590 splice (@{$section_ptr->{'metadata'}}, $i, 1);
591 } else {
592 $i++;
593 }
594 }
595}
596
597sub delete_all_metadata {
598 my $self = shift (@_);
599 my ($section) = @_;
600
601 my $section_ptr = $self->_lookup_section($section);
602 if (!defined $section_ptr) {
603 print STDERR "doc::delete_all_metadata couldn't find section " .
604 "$section\n";
605 return;
606 }
607
608 $section_ptr->{'metadata'} = [];
609}
610
611
612# methods for dealing with associated files
613
614sub associate_file {
615 my $self = shift (@_);
616 my ($real_filename, $assoc_filename, $mime_type) = @_;
617 $mime_type = &html::guess_mime_type ($real_filename) unless defined $mime_type;
618
619 # remove all associated files with the same name
620 $self->delete_assoc_file ($assoc_filename);
621
622 push (@{$self->{'associated_files'}},
623 [$real_filename, $assoc_filename, $mime_type]);
624}
625
626# returns a list of associated files in the form
627# [[real_filename, assoc_filename, mimetype], ...]
628sub get_assoc_files {
629 my $self = shift (@_);
630
631 return $self->{'associated_files'};
632}
633
634sub delete_assoc_file {
635 my $self = shift (@_);
636 my ($assoc_filename) = @_;
637
638 my $i=0;
639 while ($i < scalar (@{$self->{'associated_files'}})) {
640 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
641 splice (@{$self->{'associated_files'}}, $i, 1);
642 } else {
643 $i++;
644 }
645 }
646}
647
648
649# methods for dealing with text
650
651# add_text assumes the text is in (extended) ascii form. For
652# text which has been already converted to the UTF-8 format
653# use add_utf8_text.
654sub add_text {
655 my $self = shift (@_);
656 my ($section, $text) = @_;
657
658 # convert the text to UTF-8 encoded unicode characters
659 # and add the text
660
661 # hack until the rest of the software has changed to unicode
662 $self->add_utf8_text($section, &unicode::ascii2utf8($text));
663# $self->add_utf8_text($section, &unicode::ascii2utf8($text));
664}
665
666
667# add_utf8_text assumes the text to be added has already
668# been converted to the UTF-8 encoding. For ascii text use
669# add_text
670sub add_utf8_text {
671 my $self = shift (@_);
672 my ($section, $text) = @_;
673
674 my $section_ptr = $self->_lookup_section($section);
675 if (!defined $section_ptr) {
676 print STDERR "doc::add_utf8_text couldn't find section " .
677 "$section\n";
678 return;
679 }
680
681 $section_ptr->{'text'} .= $text;
682}
683
684# returns the text for a section
685sub get_text {
686 my $self = shift (@_);
687 my ($section) = @_;
688
689 my $section_ptr = $self->_lookup_section($section);
690 if (!defined $section_ptr) {
691 print STDERR "doc::get_text couldn't find section " .
692 "$section\n";
693 return "";
694 }
695
696 return $section_ptr->{'text'};
697}
698
699# returns the (utf-8 encoded) length of the text for a section
700sub get_text_length {
701 my $self = shift (@_);
702 my ($section) = @_;
703
704 my $section_ptr = $self->_lookup_section($section);
705 if (!defined $section_ptr) {
706 print STDERR "doc::get_text_length couldn't find section " .
707 "$section\n";
708 return 0;
709 }
710
711 return length ($section_ptr->{'text'});
712}
713
714sub delete_text {
715 my $self = shift (@_);
716 my ($section) = @_;
717
718 my $section_ptr = $self->_lookup_section($section);
719 if (!defined $section_ptr) {
720 print STDERR "doc::delete_text couldn't find section " .
721 "$section\n";
722 return;
723 }
724
725 $section_ptr->{'text'} = "";
726}
727
728
7291;
Note: See TracBrowser for help on using the repository browser.