###########################################################################
#
# doc.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# base class to hold documents
package doc;
BEGIN {
die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
}
use unicode;
use util;
use ghtml;
##use hashdoc;
# the document type may be indexed_doc, nonindexed_doc, or
# classification
my $OIDcount = 0;
sub new {
my $class = shift (@_);
my ($source_filename, $doc_type) = @_;
my $self = bless {'associated_files'=>[],
'subsection_order'=>[],
'next_subsection'=>1,
'subsections'=>{},
'metadata'=>[],
'text'=>"",
'OIDtype'=>"hash"}, $class;
$self->set_source_filename ($source_filename) if defined $source_filename;
$self->set_doc_type ($doc_type) if defined $doc_type;
return $self;
}
# clone the $self object
sub duplicate {
my $self = shift (@_);
my $newobj = {};
foreach $k (keys %$self) {
$newobj->{$k} = &clone ($self->{$k});
}
bless $newobj, ref($self);
return $newobj;
}
sub clone {
my ($from) = @_;
my $type = ref ($from);
if ($type eq "HASH") {
my $to = {};
foreach $key (keys %$from) {
$to->{$key} = &clone ($from->{$key});
}
return $to;
} elsif ($type eq "ARRAY") {
my $to = [];
foreach $v (@$from) {
push (@$to, &clone ($v));
}
return $to;
} else {
return $from;
}
}
sub set_OIDtype {
my $self = shift (@_);
my ($type) = @_;
if ($type eq "incremental") {
$self->{'OIDtype'} = $type;
} else {
$self->{'OIDtype'} = "hash";
}
}
sub set_source_filename {
my $self = shift (@_);
my ($source_filename) = @_;
$self->set_metadata_element ($self->get_top_section(),
"gsdlsourcefilename",
$source_filename);
}
# returns the source_filename as it was provided
sub get_source_filename {
my $self = shift (@_);
return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
}
sub set_doc_type {
my $self = shift (@_);
my ($doc_type) = @_;
$self->set_metadata_element ($self->get_top_section(),
"gsdldoctype",
$doc_type);
}
# returns the source_filename as it was provided
# the default of "indexed_doc" is used if no document
# type was provided
sub get_doc_type {
my $self = shift (@_);
my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
return $doc_type if (defined $doc_type);
return "indexed_doc";
}
sub _escape_text {
my ($text) = @_;
# special characters in the gml encoding
$text =~ s/&/&/g; # this has to be first...
$text =~ s/</g;
$text =~ s/>/>/g;
$text =~ s/\"/"/g;
return $text;
}
sub buffer_section_gml {
my $self = shift (@_);
my ($section, $suppress_subject_info) = @_;
$suppress_subject_info = 0 unless defined $suppress_subject_info;
my ($all_text,$data, $subsection);
my $section_ptr = $self->_lookup_section ($section);
my ($section_num) = $section =~ /(\d+)$/;
return "" unless defined $section_ptr;
# output the section header (including the section number
# and metadata)
$all_text = "{'metadata'}}) {
$all_text .= " $data->[0]=\"" . &_escape_text($data->[1]) . "\""
unless $suppress_subject_info && $data->[0] eq "Subject";
}
$all_text .= ">";
# output the text
$all_text .= &_escape_text($section_ptr->{'text'});
# output all the subsections
foreach $subsection (@{$section_ptr->{'subsection_order'}}) {
$all_text .= $self->buffer_section_gml("$section.$subsection",
$suppress_subject_info);
}
# output the closing tag
$all_text .= "\n";
return $all_text;
}
sub buffer_section_xml {
my $self = shift (@_);
my ($section, $dtd_metadata, $suppress_subject_info) = @_;
$suppress_subject_info = 0 unless defined $suppress_subject_info;
my ($all_text, $data, $subsection);
my $section_ptr = $self->_lookup_section ($section);
my ($section_num) = $section =~ /(\d+)$/;
return "" unless defined $section_ptr;
# output the section header (including the section number
# and metadata)
$all_text .= "{'metadata'}}) {
my $tag_name = $data->[0];
# a tagname beginning with '/' (like ) will cause problems
# so we'll escape any leading '/'
$tag_name =~ s/^\//&\#47;/;
my $tag_value = &_escape_text($data->[1]);
unless ($suppress_subject_info && $tag_name eq "Subject")
{
if (defined $dtd_metadata)
{
$dtd_metadata->{$tag_name}++;
}
$all_text .= " <$tag_name>$tag_value$tag_name>\n";
}
}
$all_text .= " \n";
# output the text
$all_text .= &_escape_text($section_ptr->{'text'});
# output all the subsections
foreach $subsection (@{$section_ptr->{'subsection_order'}}) {
$all_text .= $self->buffer_section_xml("$section.$subsection",
$dtd_metadata,
$suppress_subject_info);
}
# output the closing tag
$all_text .= "\n";
return $all_text;
}
sub output_section {
my $self = shift (@_);
my ($handle, $section, $colname, $dtd_metadata,
$suppress_subject_info) = @_;
my $all_text = $self->buffer_section_xml($section, $dtd_metadata,
$suppress_subject_info);
# xml header
if (defined $collection)
{
my $xml_head
= ' xml version="1.0" standalone="no" encoding="UTF-8" ?>';
$xml_head .= "\n\n";
$all_text = $xml_head.$all_text;
}
print $handle $all_text;
}
# look up the reference to the a particular section
sub _lookup_section {
my $self = shift (@_);
my ($section) = @_;
my ($num);
my $sectionref = $self;
while (defined $section && $section ne "") {
($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
$num =~ s/^0+(\d)/$1/; # remove leading 0s
$section = "" unless defined $section;
if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
$sectionref = $sectionref->{'subsections'}->{$num};
} else {
return undef;
}
}
return $sectionref;
}
# calculate OID by hashing the contents of the document
sub _calc_OID {
my $self = shift (@_);
my ($filename) = @_;
my $osexe = &util::get_os_exe();
my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
$ENV{'GSDLOS'},"hashfile$osexe");
my $result = "NULL";
if (-e "$hashfile_exe") {
# $result = `\"$hashfile_exe\" \"$filename\"`;
$result = `hashfile$osexe \"$filename\"`;
($result) = $result =~ /:\s*([0-9a-f]+)/i;
} else {
print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
}
return "HASH$result";
}
# methods dealing with OID, not groups of them.
# if $OID is not provided one is calculated
sub set_OID {
my $self = shift (@_);
my ($OID) = @_;
# if an OID wasn't provided claculate one
if (!defined $OID) {
$OID = "NULL";
if ($self->{'OIDtype'} eq "incremental") {
$OID = "D" . $OIDcount;
$OIDcount ++;
} else {
# "hash" OID - feed file to hashfile.exe
my $filename = $self->get_source_filename();
if (defined($filename) && -e $filename) {
$OID = $self->_calc_OID ($filename);
} else {
$filename = &util::get_tmp_filename();
if (!open (OUTFILE, ">$filename")) {
print STDERR "doc::set_OID could not write to $filename\n";
} else {
$self->output_section('OUTFILE', $self->get_top_section(),
undef, 1);
close (OUTFILE);
}
$OID = $self->_calc_OID ($filename);
&util::rm ($filename);
}
}
}
$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
}
# this uses hashdoc (embedded c thingy) which is faster but still
# needs a little work to be suffiently stable
sub ___set_OID {
my $self = shift (@_);
my ($OID) = @_;
# if an OID wasn't provided then calculate hash value based on document
if (!defined $OID)
{
my $hash_text = $self->buffer_section_gml($self->get_top_section(),
undef, 1);
my $hash_len = length($hash_text);
$OID = &hashdoc::buffer($hash_text,$hash_len);
}
$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
}
# returns the OID for this document
sub get_OID {
my $self = shift (@_);
my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
return $OID if (defined $OID);
return "NULL";
}
sub delete_OID {
my $self = shift (@_);
$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
}
# methods for manipulating section names
# returns the name of the top-most section (the top
# level of the document
sub get_top_section {
my $self = shift (@_);
return "";
}
# returns a section
sub get_parent_section {
my $self = shift (@_);
my ($section) = @_;
$section =~ s/(^|\.)\d+$//;
return $section;
}
# returns the first child section (or the end child
# if there isn't any)
sub get_begin_child {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
return "" unless defined $section_ptr;
if (defined $section_ptr->{'subsection_order'}->[0]) {
return "$section.$section_ptr->{'subsection_order'}->[0]";
}
return $self->get_end_child ($section);
}
# returns the next child of a parent section
sub get_next_child {
my $self = shift (@_);
my ($section) = @_;
my $parent_section = $self->get_parent_section($section);
my $parent_section_ptr = $self->_lookup_section($parent_section);
return undef unless defined $parent_section_ptr;
my ($section_num) = $section =~ /(\d+)$/;
return undef unless defined $section_num;
my $i = 0;
my $section_order = $parent_section_ptr->{'subsection_order'};
while ($i < scalar(@$section_order)) {
last if $section_order->[$i] eq $section_num;
$i++;
}
$i++; # the next child
if ($i < scalar(@$section_order)) {
return $section_order->[$i] if $parent_section eq "";
return "$parent_section.$section_order->[$i]";
}
# no more sections in this level
return undef;
}
# returns a reference to a list of children
sub get_children {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
return [] unless defined $section_ptr;
my @children = @{$section_ptr->{'subsection_order'}};
map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
return \@children;
}
# returns the child section one past the last one (which
# is coded as "0")
sub get_end_child {
my $self = shift (@_);
my ($section) = @_;
return $section . ".0" unless $section eq "";
return "0";
}
# returns the next section in book order
sub get_next_section {
my $self = shift (@_);
my ($section) = @_;
return undef unless defined $section;
my $section_ptr = $self->_lookup_section($section);
return undef unless defined $section_ptr;
# first try to find first child
if (defined $section_ptr->{'subsection_order'}->[0]) {
return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
return "$section.$section_ptr->{'subsection_order'}->[0]";
}
do {
# try to find sibling
my $next_child = $self->get_next_child ($section);
return $next_child if (defined $next_child);
# move up one level
$section = $self->get_parent_section ($section);
} while $section =~ /\d/;
return undef;
}
sub is_leaf_section {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
return 1 unless defined $section_ptr;
return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
}
# methods for dealing with sections
# returns the name of the inserted section
sub insert_section {
my $self = shift (@_);
my ($before_section) = @_;
# get the child to insert before and its parent section
my $parent_section = "";
my $before_child = "0";
my @before_section = split (/\./, $before_section);
if (scalar(@before_section) > 0) {
$before_child = pop (@before_section);
$parent_section = join (".", @before_section);
}
my $parent_section_ptr = $self->_lookup_section($parent_section);
if (!defined $parent_section_ptr) {
print STDERR "doc::insert_section couldn't find parent section " .
"$parent_section\n";
return;
}
# get the next section number
my $section_num = $parent_section_ptr->{'next_subsection'}++;
my $i = 0;
while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
$i++;
}
# insert the section number into the order list
splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
# add this section to the parent section
my $section_ptr = {'subsection_order'=>[],
'next_subsection'=>1,
'subsections'=>{},
'metadata'=>[],
'text'=>""};
$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
# work out the full section number
my $section = $parent_section;
$section .= "." unless $section eq "";
$section .= $section_num;
return $section;
}
# creates a pre-named section
sub create_named_section {
my $self = shift (@_);
my ($mastersection) = @_;
my ($num);
my $section = $mastersection;
my $sectionref = $self;
while ($section ne "") {
($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
$num =~ s/^0+(\d)/$1/; # remove leading 0s
$section = "" unless defined $section;
if (defined $num) {
if (!defined $sectionref->{'subsections'}->{$num}) {
push (@{$sectionref->{'subsection_order'}}, $num);
$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
'next_subsection'=>1,
'subsections'=>{},
'metadata'=>[],
'text'=>""};
if ($num >= $sectionref->{'next_subsection'}) {
$sectionref->{'next_subsection'} = $num + 1;
}
}
$sectionref = $sectionref->{'subsections'}->{$num};
} else {
print STDERR "doc::create_named_section couldn't create section ";
print STDERR "$mastersection\n";
last;
}
}
}
# returns a reference to a list of subsections
sub list_subsections {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section ($section);
if (!defined $section_ptr) {
print STDERR "doc::list_subsections couldn't find section $section\n";
return [];
}
return [@{$section_ptr->{'subsection_order'}}];
}
sub delete_section {
my $self = shift (@_);
my ($section) = @_;
# my $section_ptr = {'subsection_order'=>[],
# 'next_subsection'=>1,
# 'subsections'=>{},
# 'metadata'=>[],
# 'text'=>""};
# if this is the top section reset everything
if ($section eq "") {
$self->{'subsection_order'} = [];
$self->{'subsections'} = {};
$self->{'metadata'} = [];
$self->{'text'} = "";
return;
}
# find the parent of the section to delete
my $parent_section = "";
my $child = "0";
my @section = split (/\./, $section);
if (scalar(@section) > 0) {
$child = pop (@section);
$parent_section = join (".", @section);
}
my $parent_section_ptr = $self->_lookup_section($parent_section);
if (!defined $parent_section_ptr) {
print STDERR "doc::delete_section couldn't find parent section " .
"$parent_section\n";
return;
}
# remove this section from the subsection_order list
my $i = 0;
while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
last;
}
$i++;
}
# remove this section from the subsection hash
if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
undef $parent_section_ptr->{'subsections'}->{$child};
}
}
#--
# methods for dealing with metadata
# set_metadata_element and get_metadata_element are for metadata
# which should only have one value. add_meta_data and get_metadata
# are for metadata which can have more than one value.
# returns the first metadata value which matches field
sub get_metadata_element {
my $self = shift (@_);
my ($section, $field) = @_;
my ($data);
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_metadata_element couldn't find section " .
"$section\n";
return;
}
foreach $data (@{$section_ptr->{'metadata'}}) {
return $data->[1] if (scalar(@$data) >= 2 && $data->[0] eq $field);
}
return undef; # was not found
}
# returns a list of the form [value1, value2, ...]
sub get_metadata {
my $self = shift (@_);
my ($section, $field) = @_;
my ($data);
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_metadata couldn't find section " .
"$section\n";
return;
}
my @metadata = ();
foreach $data (@{$section_ptr->{'metadata'}}) {
push (@metadata, $data->[1]) if ($data->[0] eq $field);
}
return \@metadata;
}
# returns a list of the form [[field,value],[field,value],...]
sub get_all_metadata {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_all_metadata couldn't find section " .
"$section\n";
return;
}
return $section_ptr->{'metadata'};
}
# $value is optional
sub delete_metadata {
my $self = shift (@_);
my ($section, $field, $value) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::delete_metadata couldn't find section " .
"$section\n";
return;
}
my $i = 0;
while ($i < scalar (@{$section_ptr->{'metadata'}})) {
if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
(!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
splice (@{$section_ptr->{'metadata'}}, $i, 1);
} else {
$i++;
}
}
}
sub delete_all_metadata {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::delete_all_metadata couldn't find section " .
"$section\n";
return;
}
$section_ptr->{'metadata'} = [];
}
sub set_metadata_element {
my $self = shift (@_);
my ($section, $field, $value) = @_;
$self->set_utf8_metadata_element ($section, $field,
&unicode::ascii2utf8(\$value));
}
# set_utf8_metadata_element assumes the text has already been
# converted to the UTF-8 encoding.
sub set_utf8_metadata_element {
my $self = shift (@_);
my ($section, $field, $value) = @_;
$self->delete_metadata ($section, $field);
$self->add_utf8_metadata ($section, $field, $value);
}
# add_metadata assumes the text is in (extended) ascii form. For
# text which hash been already converted to the UTF-8 format use
# add_utf8_metadata.
sub add_metadata {
my $self = shift (@_);
my ($section, $field, $value) = @_;
$self->add_utf8_metadata ($section, $field,
&unicode::ascii2utf8(\$value));
}
# add_utf8_metadata assumes the text has already been converted
# to the UTF-8 encoding.
sub add_utf8_metadata {
my $self = shift (@_);
my ($section, $field, $value) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::add_utf8_metadata couldn't find section " .
"$section\n";
return;
}
if (!defined $value) {
print STDERR "doc::add_utf8_metadata undefined value for $field\n";
return;
}
if (!defined $field) {
print STDERR "doc::add_utf8_metadata undefined metadata type \n";
return;
}
push (@{$section_ptr->{'metadata'}}, [$field, $value]);
}
# methods for dealing with text
# returns the text for a section
sub get_text {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_text couldn't find section " .
"$section\n";
return "";
}
return $section_ptr->{'text'};
}
# returns the (utf-8 encoded) length of the text for a section
sub get_text_length {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_text_length couldn't find section " .
"$section\n";
return 0;
}
return length ($section_ptr->{'text'});
}
sub delete_text {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::delete_text couldn't find section " .
"$section\n";
return;
}
$section_ptr->{'text'} = "";
}
# add_text assumes the text is in (extended) ascii form. For
# text which has been already converted to the UTF-8 format
# use add_utf8_text.
sub add_text {
my $self = shift (@_);
my ($section, $text) = @_;
# convert the text to UTF-8 encoded unicode characters
# and add the text
$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
}
# add_utf8_text assumes the text to be added has already
# been converted to the UTF-8 encoding. For ascii text use
# add_text
sub add_utf8_text {
my $self = shift (@_);
my ($section, $text) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::add_utf8_text couldn't find section " .
"$section\n";
return;
}
$section_ptr->{'text'} .= $text;
}
# methods for dealing with associated files
# a file is associated with a document, NOT a section.
# if section is defined it is noted in the data structure
# only so that files associated from a particular section
# may be removed later (using delete_section_assoc_files)
sub associate_file {
my $self = shift (@_);
my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
# remove all associated files with the same name
$self->delete_assoc_file ($assoc_filename);
push (@{$self->{'associated_files'}},
[$real_filename, $assoc_filename, $mime_type, $section]);
}
# returns a list of associated files in the form
# [[real_filename, assoc_filename, mimetype], ...]
sub get_assoc_files {
my $self = shift (@_);
return $self->{'associated_files'};
}
sub delete_section_assoc_files {
my $self = shift (@_);
my ($section) = @_;
my $i=0;
while ($i < scalar (@{$self->{'associated_files'}})) {
if (defined $self->{'associated_files'}->[$i]->[3] &&
$self->{'associated_files'}->[$i]->[3] eq $section) {
splice (@{$self->{'associated_files'}}, $i, 1);
} else {
$i++;
}
}
}
sub delete_assoc_file {
my $self = shift (@_);
my ($assoc_filename) = @_;
my $i=0;
while ($i < scalar (@{$self->{'associated_files'}})) {
if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
splice (@{$self->{'associated_files'}}, $i, 1);
} else {
$i++;
}
}
}
sub reset_nextsection_ptr {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
$section_ptr->{'next_subsection'} = 1;
}
1;