###########################################################################
#
# doc.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistr   te it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# base class to hold documents

package doc;
eval {require bytes};

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
}

use unicode;
use util;
use ghtml;
use File::stat;
##use hashdoc;

# the document type may be indexed_doc, nonindexed_doc, or
# classification

my $OIDcount = 0;

sub new {
    my $class = shift (@_);
    my ($source_filename, $doc_type) = @_;

    my $self = bless {'associated_files'=>[],
		      'subsection_order'=>[],
		      'next_subsection'=>1,
		      'subsections'=>{},
		      'metadata'=>[],
		      'text'=>"",
		      'OIDtype'=>"hash"}, $class;

    #GRB: Added code to set lastmodified for OAI purposes
    if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
	my $file_stat = stat($source_filename);
	my $mtime = $file_stat->mtime;
	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
    }
    #GRB: end inserted code

    if (defined $source_filename) {
	my $collect_dir = $ENV{'GSDLCOLLECTDIR'};

	if (defined $collect_dir) {
	    my $dirsep = &util::get_dirsep();
	    if ($collect_dir !~ m/$dirsep$/) {
		$collect_dir .= $dirsep;
	    }

	    $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator

	    if ($source_filename =~ /^$collect_dir(.*)$/) {
		$source_filename = $1;
	    }
	}

	$self->set_source_filename ($source_filename);
    }

    $self->set_doc_type ($doc_type) if defined $doc_type;

    return $self;
}

# clone the $self object
sub duplicate {
    my $self = shift (@_);

    my $newobj = {};
    
    foreach my $k (keys %$self) {
	$newobj->{$k} = &clone ($self->{$k});
    }

    bless $newobj, ref($self);
    return $newobj;
}

sub clone {
    my ($from) = @_;
    my $type = ref ($from);

    if ($type eq "HASH") {
	my $to = {};
	foreach my $key (keys %$from) {
	    $to->{$key} = &clone ($from->{$key});
	}
	return $to;
    } elsif ($type eq "ARRAY") {
	my $to = [];
	foreach my $v (@$from) {
	    push (@$to, &clone ($v));
	}
	return $to;
    } else {
	return $from;
    }
}

sub set_OIDtype {
    my $self = shift (@_);
    my ($type) = @_;

    if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
	$self->{'OIDtype'} = $type;
    } else {
	$self->{'OIDtype'} = "hash";
    }
}

sub set_source_filename {
    my $self = shift (@_);
    my ($source_filename) = @_;

    $self->set_metadata_element ($self->get_top_section(), 
				 "gsdlsourcefilename", 
				 $source_filename);
}

sub set_converted_filename {
    my $self = shift (@_);
    my ($converted_filename) = @_;

    $self->set_metadata_element ($self->get_top_section(), 
				 "gsdlconvertedfilename", 
				 $converted_filename);
}


# returns the source_filename as it was provided
sub get_source_filename {
    my $self = shift (@_);

    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
}

# returns converted filename if available else returns source filename
sub get_filename_for_hashing {
    my $self = shift (@_);

    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");

    if (!defined $filename) {
	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
    }
    return $filename;
}

sub set_doc_type {
    my $self = shift (@_);
    my ($doc_type) = @_;

    $self->set_metadata_element ($self->get_top_section(), 
				 "gsdldoctype", 
				 $doc_type);
}

# returns the source_filename as it was provided
# the default of "indexed_doc" is used if no document
# type was provided
sub get_doc_type {
    my $self = shift (@_);

    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
    return $doc_type if (defined $doc_type);
    return "indexed_doc";
}

sub _escape_text {
    my ($text) = @_;

    # special characters in the gml encoding
    $text =~ s/&/&amp;/g; # this has to be first...
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/\"/&quot;/g;

    return $text;
}

sub buffer_section_xml {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section ($section);
    return "" unless defined $section_ptr;

    my $all_text = "<Section>\n";
    $all_text .= "  <Description>\n";
    
    # output metadata
    foreach my $data (@{$section_ptr->{'metadata'}}) {
	my $escaped_value = &_escape_text($data->[1]);
	$all_text .= '    <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
    }

    $all_text .= "  </Description>\n";

    # output the text
    $all_text .= "  <Content>";
    $all_text .= &_escape_text($section_ptr->{'text'});
    $all_text .= "</Content>\n";
    
    # output all the subsections
    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
	$all_text .= $self->buffer_section_xml("$section.$subsection");
    }
    
    $all_text .=  "</Section>\n";

    # make sure no nasty control characters have snuck through
    # (XML::Parser will barf on anything it doesn't consider to be
    # valid UTF-8 text, including things like \c@, \cC etc.)
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}

sub buffer_txt_section_xml {
    my $self = shift(@_);
    my ($section) = @_;
 
    my $section_ptr = $self->_lookup_section ($section);
    
    return "" unless defined $section_ptr;
   
    my $all_text = "<Section>\n";
   
    ##output the text
    #$all_text .= "   <Content>";
    $all_text .= &_escape_text($section_ptr->{'text'});
    #$all_text .= "   </Content>\n";
    
   
    #output all the subsections
    foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
       $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
     }

     $all_text .= "</Section>\n";

     
     $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
     return $all_text;
}

sub buffer_mets_fileSection_section_xml() {
    my $self = shift(@_);
    my ($section) = @_;

    #$section="" unless defined $section;
    

    my $section_ptr=$self->_lookup_section($section);
    return "" unless defined $section_ptr;
 

    #**output fileSection by sections
    my $section_num ="1". $section;
  
    
    #my $filePath = $doc_Dir . '/doctxt.xml';

    my $filePath = 'doctxt.xml';

    #**output the fileSection details
    my $all_text = '  <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
    $all_text .= '    <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
    $all_text .= '      <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
    
    my $xpath = "1".$section;
 
    $xpath =~ s/\./]\/Section[/g;
   
    $all_text .=  $xpath;

    $all_text .= ']/text())" />' . "\n";
    $all_text .= "    </mets:file>\n";
    $all_text .= "  </mets:fileGrp>\n";


    foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
	$all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection");
    }
    
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}

sub buffer_mets_fileWhole_section_xml(){
    my $self = shift(@_);
    my ($section) = @_;

    my $section_ptr = $self-> _lookup_section($section);
    return "" unless defined $section_ptr;
    
    my $all_text="" unless defined $all_txt;

    my ($dirPath)="" unless defined $dirPath;
    my $fileID=0;

    #** output the fileSection for the whole section
    #*** get the sourcefile and associative file

    foreach my $data (@{$section_ptr->{'metadata'}}){
       my $escaped_value = &_escape_text($data->[1]);
       if ($data->[0] eq "gsdlsourcefilename") { 
          ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
	  
	  $all_text .= '  <mets:fileGrp ID="default">'."\n";
          ++$fileID;
          $all_text .= '    <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
        
	  $all_text .= '      <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" />'."\n";
	  
          $all_text .= "    </mets:file>\n";
       }
       
       if ($data->[0] eq "gsdlassocfile"){
	  $escaped_value =~ m/^(.*?):(.*):$/;

	  my $assfilePath = $dirPath . '/'. $1;
          ++$fileID;

	  $all_text .= '    <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
	  $all_text .= '      <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" />'."\n";
          
	  $all_text .= "    </mets:file>\n";
       }
    }
    $all_text .= "  </mets:fileGrp>\n";
                                            
                                      
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
    
    return $all_text;
}

sub buffer_mets_StruMapSection_section_xml(){
    my $self = shift(@_);
    my ($section, $order_numref) = @_;

    $section="" unless defined $section;
    
    
    my $section_ptr=$self->_lookup_section($section);
    return "" unless defined $section_ptr;


    #***output fileSection by Sections
    my $section_num ="1". $section;
    my $dmd_num = $section_num;

    ##**output the dmdSection details  
    #if ($section_num eq "1") {
    #	$dmd_num = "0";
    #}

    #**output the StruMap details
 
    my $all_text = '  <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';

    $all_text .= $section_num . '" DMDID="DM'.$dmd_num.'">'. "\n";
   
    $all_text .= '    <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";


    foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
       $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
    }
    
    $all_text .= "  </mets:div>\n";

    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}


sub buffer_mets_StruMapWhole_section_xml(){
    my $self = shift(@_);
    my ($section) = @_;
    
    my $section_ptr = $self-> _lookup_section($section);
    return "" unless defined $section_ptr;
    
    my $all_text="" unless defined $all_txt;   
    my $fileID=0;
    my $order_num = 0;

    $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
    $all_text .= '  <mets:div ID="DSAll" TYPE="Document" ORDER="'.$order_num.'" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
  

    #** output the StruMapSection for the whole section
    #*** get the sourcefile and associative file

    foreach my $data (@{$section_ptr->{'metadata'}}){
       my $escaped_value = &_escape_text($data->[1]);
   
       if ($data->[0] eq "gsdlsourcefilename") { 
          ++$fileID;
	  $all_text .= '    <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
       }
       
       if ($data->[0] eq "gsdlassocfile"){
          ++$fileID;
	  $all_text .= '    <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
       }
    }
    $all_text .= "  </mets:div>\n";
    
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
    
    return $all_text;
}


sub buffer_mets_dmdSection_section_xml(){
    my $self = shift(@_);
    my ($section) = @_;
   
    $section="" unless defined $section;
    
    my $section_ptr=$self->_lookup_section($section);
    return "" unless defined $section_ptr;

    #***convert section number
    my $section_num ="1". $section;
    my $dmd_num = $section_num;

    # #**output the dmdSection details  
    # if ($section_num eq "1") {
    #	$dmd_num = "0";
    # }
    my $all_text = '<mets:dmdSec ID="DM'.$dmd_num.'" GROUPID="'.$section_num.'">'. "\n";
    $all_text .= '  <mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="gsdl3" ID="gsdl'.$section_num.'">'."\n";
    $all_text .= "    <mets:xmlData>\n";
    foreach my $data (@{$section_ptr->{'metadata'}}){
       my $escaped_value = &_escape_text($data->[1]);
       $all_text .= '      <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
       if ($data->[0] eq "dc.Title") {
	   $all_text .= '      <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
       }
    }
   
    $all_text .= "    </mets:xmlData>\n";
    $all_text .= "  </mets:mdWrap>\n";
    $all_text .= "</mets:dmdSec>\n";

    foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
       $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
    }
    
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}

sub output_section {
    my $self = shift (@_);
    my ($handle, $section) = @_;

    print $handle $self->buffer_section_xml($section);
}

#*** print out DSpace dublin_core metadata section
sub output_dspace_section {
    my $self = shift (@_);
    my ($handle, $section) = @_;

    my $section_ptr = $self->_lookup_section ($section);
    return "" unless defined $section_ptr;

    my $all_text = "<Section>\n";
    $all_text .= "  <Description>\n";
    
    # output metadata
    foreach my $data (@{$section_ptr->{'metadata'}}) {
	my $escaped_value = &_escape_text($data->[1]);
	$all_text .= '    <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
    }

    $all_text .= "  </Description>\n";
    $all_text .=  "</Section>\n";

    # make sure no nasty control characters have snuck through
    # (XML::Parser will barf on anything it doesn't consider to be
    # valid UTF-8 text, including things like \c@, \cC etc.)
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}

#*** print out doctxt.xml file
sub output_txt_section {
    my $self = shift (@_);
    my ($handle, $section) = @_;

    print $handle $self->buffer_txt_section_xml($section);
}

#*** print out docmets.xml file
sub output_mets_section {
    my $self = shift(@_);
    my ($handle, $section) = @_;

    #***print out the dmdSection
    print $handle $self->buffer_mets_dmdSection_section_xml($section);

    #***print out the fileSection by sections
    print $handle "<mets:fileSec>\n";
    print $handle $self->buffer_mets_fileSection_section_xml($section);

    #***print out the whole fileSection
    print $handle $self->buffer_mets_fileWhole_section_xml($section); 
    print $handle "</mets:fileSec>\n";
    
    #***print out the StruMapSection by sections
    print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
    my $order_num=0;
    print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
    print $handle "</mets:structMap>\n";
    print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
    print $handle "</mets:structMap>\n";
}

#*** print out dublin_core.xml file
sub output_dc_section {
    my $self = shift(@_);
    my ($handle, $section, $doc_Dir) = @_;
    
    #***print out the dublin_core 
    $section="" unless defined $section;
    
    my $section_ptr=$self->_lookup_section($section);
    return "" unless defined $section_ptr;
    my $all_text="";
    foreach my $data (@{$section_ptr->{'metadata'}}){
	my $escaped_value = &_escape_text($data->[1]);
	if ($data->[0]=~ /^dc/) {
	    $data->[0] =~ tr/[A-Z]/[a-z]/;
	    $data->[0] =~ /^dc\.(.*)/;
	    my $dc_element =  $1;
	    #$all_text .= '   <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
	    $all_text .= '   <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
	} 
    }
    if ($all_text eq "") {
	$all_text .= "   There is no Dublin Core metatdata in this document\n";
    }	
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    print $handle $all_text;
}

# look up the reference to the a particular section
sub _lookup_section {
    my $self = shift (@_);
    my ($section) = @_;

    my ($num);
    my $sectionref = $self;

    while (defined $section && $section ne "") {
	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
	$num =~ s/^0+(\d)/$1/; # remove leading 0s
	$section = "" unless defined $section;
	
	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
	    $sectionref = $sectionref->{'subsections'}->{$num};
	} else {
	    return undef;
	}
    }
    
    return $sectionref;
}

# calculate OID by hashing the contents of the document
sub _calc_OID {
    my $self = shift (@_);
    my ($filename) = @_;

    my $osexe = &util::get_os_exe();

    my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
					   $ENV{'GSDLOS'},"hashfile$osexe");

    my $result = "NULL";
    
    if (-e "$hashfile_exe") {
#	$result = `\"$hashfile_exe\" \"$filename\"`;
	$result = `hashfile$osexe \"$filename\"`;
	($result) = $result =~ /:\s*([0-9a-f]+)/i;

    } else {
	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
    }
    return "HASH$result";
}

# methods dealing with OID, not groups of them.

# if $OID is not provided one is calculated
sub set_OID {
    my $self = shift (@_);
    my ($OID) = @_;
    
    my $use_hash_oid = 0;
    # if an OID wasn't provided claculate one
    if (!defined $OID) {
	$OID = "NULL";
	if ($self->{'OIDtype'} eq "hash") {
	    $use_hash_oid = 1;
	} elsif ($self->{'OIDtype'} eq "incremental") {
	    $OID = "D" . $OIDcount;
	    $OIDcount ++;
	  
	} elsif ($self->{'OIDtype'} eq "dirname") {
	    $OID = 'J';
	    my $filename = $self->get_source_filename();
	    if (defined($filename)) { # && -e $filename) {
		$OID = &File::Basename::dirname($filename);
		if (defined $OID) {
		    $OID = 'J'.&File::Basename::basename($OID);
		    $OID =~ s/\.//; #remove any periods
		} else {
		    print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
		    $use_hash_oid = 1;
		}
	    } else {
		print STDERR "Failed to find filename, generating hash id\n";
		$use_hash_oid = 1;
	    }    
	    
	} elsif ($self->{'OIDtype'} eq "assigned") {
	    my $identifier = $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
	    if (defined $identifier && $identifier ne "") {
		$OID = "D" . $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
		$OID =~ s/\.//; #remove any periods
	    } else {
		# need a hash id
		print STDERR "no dc.Identifier found, generating hash id\n";
		$use_hash_oid = 1;
	    }
	    	    
	} else {
	    $use_hash_oid = 1;
	}

	if ($use_hash_oid) {
	    
	    # "hash" OID - feed file to hashfile.exe
	    #my $filename = $self->get_source_filename();
	    # we want to use the converted file for hashing if available
	    # cos its quicker
	    my $filename = $self->get_filename_for_hashing();

	    # -z: don't want to hash on the file if it is zero size
	    if (defined($filename) && -e $filename && !-z $filename) {
		$OID = $self->_calc_OID ($filename);
	    } else {
		$filename = &util::get_tmp_filename();
		if (!open (OUTFILE, ">$filename")) {
		    print STDERR "doc::set_OID could not write to $filename\n";
		} else {
		    $self->output_section('OUTFILE', $self->get_top_section(), 
					  undef, 1);
		    close (OUTFILE);
		}
		$OID = $self->_calc_OID ($filename);
		&util::rm ($filename);
	    }
	}
    }
    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
}

# this uses hashdoc (embedded c thingy) which is faster but still 
# needs a little work to be suffiently stable
sub ___set_OID {
    my $self = shift (@_);
    my ($OID) = @_;

    # if an OID wasn't provided then calculate hash value based on document
    if (!defined $OID) 
    {
	my $hash_text = $self->buffer_section_gml($self->get_top_section(),
						  undef, 1);
	my $hash_len = length($hash_text);

        $OID = &hashdoc::buffer($hash_text,$hash_len);
    }

    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
}

# returns the OID for this document
sub get_OID {
    my $self = shift (@_);
    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
    return $OID if (defined $OID);
    return "NULL";
}

sub delete_OID {
    my $self = shift (@_);
    
    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
}


# methods for manipulating section names

# returns the name of the top-most section (the top
# level of the document
sub get_top_section {
    my $self = shift (@_);
    
    return "";
}

# returns a section
sub get_parent_section {
    my $self = shift (@_);
    my ($section) = @_;

    $section =~ s/(^|\.)\d+$//;

    return $section;
}

# returns the first child section (or the end child 
# if there isn't any)
sub get_begin_child {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section($section);
    return "" unless defined $section_ptr;

    if (defined $section_ptr->{'subsection_order'}->[0]) {
	return "$section.$section_ptr->{'subsection_order'}->[0]";
    }

    return $self->get_end_child ($section);
}

# returns the next child of a parent section
sub get_next_child {
    my $self = shift (@_);
    my ($section) = @_;
    
    my $parent_section = $self->get_parent_section($section);
    my $parent_section_ptr = $self->_lookup_section($parent_section);
    return undef unless defined $parent_section_ptr;

    my ($section_num) = $section =~ /(\d+)$/;
    return undef unless defined $section_num;

    my $i = 0;
    my $section_order = $parent_section_ptr->{'subsection_order'};
    while ($i < scalar(@$section_order)) {
	last if $section_order->[$i] eq $section_num;
	$i++;
    }

    $i++; # the next child
    if ($i < scalar(@$section_order)) {
	return $section_order->[$i] if $parent_section eq "";
	return "$parent_section.$section_order->[$i]";
    }

    # no more sections in this level
    return undef;
}

# returns a reference to a list of children
sub get_children {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section($section);
    return [] unless defined $section_ptr;

    my @children = @{$section_ptr->{'subsection_order'}};

    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
    return \@children;
}

# returns the child section one past the last one (which
# is coded as "0")
sub get_end_child {
    my $self = shift (@_);
    my ($section) = @_;

    return $section . ".0" unless $section eq "";
    return "0";
}

# returns the next section in book order
sub get_next_section {
    my $self = shift (@_);
    my ($section) = @_;

    return undef unless defined $section;

    my $section_ptr = $self->_lookup_section($section);
    return undef unless defined $section_ptr;

    # first try to find first child
    if (defined $section_ptr->{'subsection_order'}->[0]) {
	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
	return "$section.$section_ptr->{'subsection_order'}->[0]";
    }

    do {
	# try to find sibling
	my $next_child = $self->get_next_child ($section);
	return $next_child if (defined $next_child);

	# move up one level
	$section = $self->get_parent_section ($section);
    } while $section =~ /\d/;

    return undef;
}

sub is_leaf_section {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section($section);
    return 1 unless defined $section_ptr;

    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
}

# methods for dealing with sections

# returns the name of the inserted section
sub insert_section {
    my $self = shift (@_);
    my ($before_section) = @_;

    # get the child to insert before and its parent section
    my $parent_section = "";
    my $before_child = "0";
    my @before_section = split (/\./, $before_section);
    if (scalar(@before_section) > 0) {
	$before_child = pop (@before_section);
	$parent_section = join (".", @before_section);
    }

    my $parent_section_ptr = $self->_lookup_section($parent_section);
    if (!defined $parent_section_ptr) {
	print STDERR "doc::insert_section couldn't find parent section " .
	    "$parent_section\n";
	return;
    }

    # get the next section number
    my $section_num = $parent_section_ptr->{'next_subsection'}++;

    my $i = 0;
    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
	   $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
	$i++;
    }
    
    # insert the section number into the order list
    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);

    # add this section to the parent section
    my $section_ptr = {'subsection_order'=>[],
		       'next_subsection'=>1,
		       'subsections'=>{},
		       'metadata'=>[],
		       'text'=>""};
    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;

    # work out the full section number
    my $section = $parent_section;
    $section .= "." unless $section eq "";
    $section .= $section_num;
    
    return $section;
}

# creates a pre-named section
sub create_named_section {
    my $self = shift (@_);
    my ($mastersection) = @_;

    my ($num);
    my $section = $mastersection;
    my $sectionref = $self;

    while ($section ne "") {
	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
	$num =~ s/^0+(\d)/$1/; # remove leading 0s
	$section = "" unless defined $section;
	
	if (defined $num) {
	    if (!defined $sectionref->{'subsections'}->{$num}) {
		push (@{$sectionref->{'subsection_order'}}, $num);
		$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
							'next_subsection'=>1,
							'subsections'=>{},
							'metadata'=>[],
							'text'=>""};
		if ($num >= $sectionref->{'next_subsection'}) {
		    $sectionref->{'next_subsection'} = $num + 1;
		}
	    }
	    $sectionref = $sectionref->{'subsections'}->{$num};

	} else {
	    print STDERR "doc::create_named_section couldn't create section ";
	    print STDERR "$mastersection\n";
	    last;
	}
    }
}

# returns a reference to a list of subsections
sub list_subsections {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section ($section);
    if (!defined $section_ptr) {
	print STDERR "doc::list_subsections couldn't find section $section\n";
	return [];
    }

    return [@{$section_ptr->{'subsection_order'}}];
}

sub delete_section {
    my $self = shift (@_);
    my ($section) = @_;

#    my $section_ptr = {'subsection_order'=>[],
#		       'next_subsection'=>1,
#		       'subsections'=>{},
#		       'metadata'=>[],
#		       'text'=>""};

    # if this is the top section reset everything
    if ($section eq "") {
	$self->{'subsection_order'} = [];
	$self->{'subsections'} = {};
	$self->{'metadata'} = [];
	$self->{'text'} = "";
	return;
    }

    # find the parent of the section to delete
    my $parent_section = "";
    my $child = "0";
    my @section = split (/\./, $section);
    if (scalar(@section) > 0) {
	$child = pop (@section);
	$parent_section = join (".", @section);
    }

    my $parent_section_ptr = $self->_lookup_section($parent_section);
    if (!defined $parent_section_ptr) {
	print STDERR "doc::delete_section couldn't find parent section " .
	    "$parent_section\n";
	return;
    }

    # remove this section from the subsection_order list
    my $i = 0;
    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
	    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
	    last;
	}
	$i++;
    }

    # remove this section from the subsection hash
    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
	undef $parent_section_ptr->{'subsections'}->{$child};
    }
}

#--
# methods for dealing with metadata

# set_metadata_element and get_metadata_element are for metadata
# which should only have one value. add_meta_data and get_metadata
# are for metadata which can have more than one value.

# returns the first metadata value which matches field

# This version of get metadata element works much like the one above,
# except it allows for the namespace portion of a metadata element to
# be ignored, thus if you are searching for dc.Title, the first piece
# of matching metadata ending with the name Title (once any namespace
# is removed) would be returned.
# 28-11-2003 John Thompson
sub get_metadata_element {
    my $self = shift (@_);
    my ($section, $field, $ignore_namespace) = @_;
    my ($data);

    $ignore_namespace = 0 unless defined $ignore_namespace;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
	return;
    }

    # Remove the any namespace if we are being told to ignore them
    if($ignore_namespace) {
	$field =~ s/^\w*\.//;
    }

    foreach $data (@{$section_ptr->{'metadata'}}) {

	my $data_name = $data->[0];
	# Remove the any namespace if we are being told to ignore them
	if($ignore_namespace) {
	    $data_name =~ s/^\w*\.//;
	}
	
	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
    }
	
    return undef; # was not found
}

# returns a list of the form [value1, value2, ...]
sub get_metadata {
    my $self = shift (@_);
    my ($section, $field, $ignore_namespace) = @_;
    my ($data);

    $ignore_namespace = 0 unless defined $ignore_namespace;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
        print STDERR "doc::get_metadata couldn't find section ",
	    $section, "\n";
        return;
    }

    # Remove the any namespace if we are being told to ignore them
    if($ignore_namespace) {
	$field =~ s/^\w*\.//;
    }

    my @metadata = ();
    foreach $data (@{$section_ptr->{'metadata'}}) {

	my $data_name = $data->[0];
	# Remove the any namespace if we are being told to ignore them
	if($ignore_namespace) {
	    $data_name =~ s/^\w*\.//;
	}

        push (@metadata, $data->[1]) if ($data_name eq $field);
    }
        
    return \@metadata;
}

# returns a list of the form [[field,value],[field,value],...]
sub get_all_metadata {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
	return;
    }
    
    return $section_ptr->{'metadata'};
}

# $value is optional
sub delete_metadata {
    my $self = shift (@_);
    my ($section, $field, $value) = @_;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
	print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
	return;
    }

    my $i = 0;
    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
	    (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
	    splice (@{$section_ptr->{'metadata'}}, $i, 1);
	} else {
	    $i++;
	}
    }
}

sub delete_all_metadata {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
	return;
    }
    
    $section_ptr->{'metadata'} = [];
}

sub set_metadata_element {
    my $self = shift (@_);
    my ($section, $field, $value) = @_;

    $self->set_utf8_metadata_element ($section, $field, 
				      &unicode::ascii2utf8(\$value));
}

# set_utf8_metadata_element assumes the text has already been
# converted to the UTF-8 encoding.
sub set_utf8_metadata_element {
    my $self = shift (@_);
    my ($section, $field, $value) = @_;

    $self->delete_metadata ($section, $field);
    $self->add_utf8_metadata ($section, $field, $value);
}


# add_metadata assumes the text is in (extended) ascii form. For
# text which has already been converted to the UTF-8 format use
# add_utf8_metadata.
sub add_metadata {
    my $self = shift (@_);
    my ($section, $field, $value) = @_;

    $self->add_utf8_metadata ($section, $field,
			      &unicode::ascii2utf8(\$value));
}

sub add_utf8_metadata {
    my $self = shift (@_);
    my ($section, $field, $value) = @_;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
	return;
    }
    if (!defined $value) {
	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
	return;
    }
    if (!defined $field) {
	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
	return;
    }
    
    #print STDERR "###$field=$value\n";
    # double check that the value is utf-8
    if (unicode::ensure_utf8(\$value)) {
	print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
    }

    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
}


# methods for dealing with text

# returns the text for a section
sub get_text {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
	print STDERR "doc::get_text couldn't find section " .
	    "$section\n";
	return "";
    }

    return $section_ptr->{'text'};
}

# returns the (utf-8 encoded) length of the text for a section
sub get_text_length {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
	print STDERR "doc::get_text_length couldn't find section " .
	    "$section\n";
	return 0;
    }

    return length ($section_ptr->{'text'});
}

sub delete_text {
    my $self = shift (@_);
    my ($section) = @_;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
	print STDERR "doc::delete_text couldn't find section " .
	    "$section\n";
	return;
    }

    $section_ptr->{'text'} = "";
}

# add_text assumes the text is in (extended) ascii form. For
# text which has been already converted to the UTF-8 format
# use add_utf8_text.
sub add_text {
    my $self = shift (@_);
    my ($section, $text) = @_;

    # convert the text to UTF-8 encoded unicode characters
    # and add the text
    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
}


# add_utf8_text assumes the text to be added has already
# been converted to the UTF-8 encoding. For ascii text use
# add_text
sub add_utf8_text {
    my $self = shift (@_);
    my ($section, $text) = @_;

    my $section_ptr = $self->_lookup_section($section);
    if (!defined $section_ptr) {
	print STDERR "doc::add_utf8_text couldn't find section " .
	    "$section\n";
	return;
    }

    $section_ptr->{'text'} .= $text;
}


# methods for dealing with associated files

# a file is associated with a document, NOT a section.
# if section is defined it is noted in the data structure
# only so that files associated from a particular section
# may be removed later (using delete_section_assoc_files)
sub associate_file {
    my $self = shift (@_);
    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;

    # remove all associated files with the same name
    $self->delete_assoc_file ($assoc_filename);
    
    push (@{$self->{'associated_files'}}, 
	  [$real_filename, $assoc_filename, $mime_type, $section]);
}

# returns a list of associated files in the form
#   [[real_filename, assoc_filename, mimetype], ...]
sub get_assoc_files {
    my $self = shift (@_);

    return $self->{'associated_files'};
}

sub delete_section_assoc_files {
    my $self = shift (@_);
    my ($section) = @_;

    my $i=0;
    while ($i < scalar (@{$self->{'associated_files'}})) {
	if (defined $self->{'associated_files'}->[$i]->[3] &&
	    $self->{'associated_files'}->[$i]->[3] eq $section) {
	    splice (@{$self->{'associated_files'}}, $i, 1);
	} else {
	    $i++;
	}
    }
}

sub delete_assoc_file {
    my $self = shift (@_);
    my ($assoc_filename) = @_;

    my $i=0;
    while ($i < scalar (@{$self->{'associated_files'}})) {
	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
	    splice (@{$self->{'associated_files'}}, $i, 1);
	} else {
	    $i++;
	}
    }
}

sub reset_nextsection_ptr {
    my $self = shift (@_);
    my ($section) = @_;
    
    my $section_ptr = $self->_lookup_section($section);
    $section_ptr->{'next_subsection'} = 1;
}

1;