###########################################################################
#
# doc.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistr te it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# base class to hold documents
package doc;
eval {require bytes};
BEGIN {
die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
}
use unicode;
use util;
use ghtml;
use File::stat;
##use hashdoc;
# the document type may be indexed_doc, nonindexed_doc, or
# classification
my $OIDcount = 0;
sub new {
my $class = shift (@_);
my ($source_filename, $doc_type) = @_;
my $self = bless {'associated_files'=>[],
'subsection_order'=>[],
'next_subsection'=>1,
'subsections'=>{},
'metadata'=>[],
'text'=>"",
'OIDtype'=>"hash"}, $class;
#GRB: Added code to set lastmodified for OAI purposes
if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
my $file_stat = stat($source_filename);
my $mtime = $file_stat->mtime;
$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
}
#GRB: end inserted code
if (defined $source_filename) {
my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
if (defined $collect_dir) {
my $dirsep = &util::get_dirsep();
if ($collect_dir !~ m/$dirsep$/) {
$collect_dir .= $dirsep;
}
$collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
if ($source_filename =~ /^$collect_dir(.*)$/) {
$source_filename = $1;
}
}
$self->set_source_filename ($source_filename);
}
$self->set_doc_type ($doc_type) if defined $doc_type;
return $self;
}
# clone the $self object
sub duplicate {
my $self = shift (@_);
my $newobj = {};
foreach my $k (keys %$self) {
$newobj->{$k} = &clone ($self->{$k});
}
bless $newobj, ref($self);
return $newobj;
}
sub clone {
my ($from) = @_;
my $type = ref ($from);
if ($type eq "HASH") {
my $to = {};
foreach my $key (keys %$from) {
$to->{$key} = &clone ($from->{$key});
}
return $to;
} elsif ($type eq "ARRAY") {
my $to = [];
foreach my $v (@$from) {
push (@$to, &clone ($v));
}
return $to;
} else {
return $from;
}
}
sub set_OIDtype {
my $self = shift (@_);
my ($type) = @_;
if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
$self->{'OIDtype'} = $type;
} else {
$self->{'OIDtype'} = "hash";
}
}
sub set_source_filename {
my $self = shift (@_);
my ($source_filename) = @_;
$self->set_metadata_element ($self->get_top_section(),
"gsdlsourcefilename",
$source_filename);
}
sub set_converted_filename {
my $self = shift (@_);
my ($converted_filename) = @_;
$self->set_metadata_element ($self->get_top_section(),
"gsdlconvertedfilename",
$converted_filename);
}
# returns the source_filename as it was provided
sub get_source_filename {
my $self = shift (@_);
return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
}
# returns converted filename if available else returns source filename
sub get_filename_for_hashing {
my $self = shift (@_);
my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
if (!defined $filename) {
$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
}
return $filename;
}
sub set_doc_type {
my $self = shift (@_);
my ($doc_type) = @_;
$self->set_metadata_element ($self->get_top_section(),
"gsdldoctype",
$doc_type);
}
# returns the source_filename as it was provided
# the default of "indexed_doc" is used if no document
# type was provided
sub get_doc_type {
my $self = shift (@_);
my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
return $doc_type if (defined $doc_type);
return "indexed_doc";
}
sub _escape_text {
my ($text) = @_;
# special characters in the gml encoding
$text =~ s/&/&/g; # this has to be first...
$text =~ s/</g;
$text =~ s/>/>/g;
$text =~ s/\"/"/g;
return $text;
}
sub buffer_section_xml {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section ($section);
return "" unless defined $section_ptr;
my $all_text = "\n";
$all_text .= " \n";
# output metadata
foreach my $data (@{$section_ptr->{'metadata'}}) {
my $escaped_value = &_escape_text($data->[1]);
$all_text .= ' ' . $escaped_value . "\n";
}
$all_text .= " \n";
# output the text
$all_text .= " ";
$all_text .= &_escape_text($section_ptr->{'text'});
$all_text .= "\n";
# output all the subsections
foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
$all_text .= $self->buffer_section_xml("$section.$subsection");
}
$all_text .= "\n";
# make sure no nasty control characters have snuck through
# (XML::Parser will barf on anything it doesn't consider to be
# valid UTF-8 text, including things like \c@, \cC etc.)
$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
return $all_text;
}
sub buffer_txt_section_xml {
my $self = shift(@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section ($section);
return "" unless defined $section_ptr;
my $all_text = "\n";
##output the text
#$all_text .= " ";
$all_text .= &_escape_text($section_ptr->{'text'});
#$all_text .= " \n";
#output all the subsections
foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
$all_text .= $self->buffer_txt_section_xml("$section.$subsection");
}
$all_text .= "\n";
$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
return $all_text;
}
sub buffer_mets_fileSection_section_xml() {
my $self = shift(@_);
my ($section) = @_;
#$section="" unless defined $section;
my $section_ptr=$self->_lookup_section($section);
return "" unless defined $section_ptr;
#**output fileSection by sections
my $section_num ="1". $section;
#my $filePath = $doc_Dir . '/doctxt.xml';
my $filePath = 'doctxt.xml';
#**output the fileSection details
my $all_text = ' '. "\n";
$all_text .= ' '. "\n";
$all_text .= ' {'subsection_order'}}){
$all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection");
}
$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
return $all_text;
}
sub buffer_mets_fileWhole_section_xml(){
my $self = shift(@_);
my ($section) = @_;
my $section_ptr = $self-> _lookup_section($section);
return "" unless defined $section_ptr;
my $all_text="" unless defined $all_txt;
my ($dirPath)="" unless defined $dirPath;
my $fileID=0;
#** output the fileSection for the whole section
#*** get the sourcefile and associative file
foreach my $data (@{$section_ptr->{'metadata'}}){
my $escaped_value = &_escape_text($data->[1]);
if ($data->[0] eq "gsdlsourcefilename") {
($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
$all_text .= ' '."\n";
++$fileID;
$all_text .= ' '. "\n";
$all_text .= ' '."\n";
$all_text .= " \n";
}
if ($data->[0] eq "gsdlassocfile"){
$escaped_value =~ m/^(.*?):(.*):$/;
my $assfilePath = $dirPath . '/'. $1;
++$fileID;
$all_text .= ' '. "\n";
$all_text .= ' '."\n";
$all_text .= " \n";
}
}
$all_text .= " \n";
$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
return $all_text;
}
sub buffer_mets_StruMapSection_section_xml(){
my $self = shift(@_);
my ($section, $order_numref) = @_;
$section="" unless defined $section;
my $section_ptr=$self->_lookup_section($section);
return "" unless defined $section_ptr;
#***output fileSection by Sections
my $section_num ="1". $section;
my $dmd_num = $section_num;
##**output the dmdSection details
#if ($section_num eq "1") {
# $dmd_num = "0";
#}
#**output the StruMap details
my $all_text = ' '. "\n";
$all_text .= ' '. "\n";
foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
$all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
}
$all_text .= " \n";
$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
return $all_text;
}
sub buffer_mets_StruMapWhole_section_xml(){
my $self = shift(@_);
my ($section) = @_;
my $section_ptr = $self-> _lookup_section($section);
return "" unless defined $section_ptr;
my $all_text="" unless defined $all_txt;
my $fileID=0;
my $order_num = 0;
$all_text .= ''."\n";
$all_text .= ' ' . "\n";
#** output the StruMapSection for the whole section
#*** get the sourcefile and associative file
foreach my $data (@{$section_ptr->{'metadata'}}){
my $escaped_value = &_escape_text($data->[1]);
if ($data->[0] eq "gsdlsourcefilename") {
++$fileID;
$all_text .= ' '."\n";
}
if ($data->[0] eq "gsdlassocfile"){
++$fileID;
$all_text .= ' '. "\n";
}
}
$all_text .= " \n";
$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
return $all_text;
}
sub buffer_mets_dmdSection_section_xml(){
my $self = shift(@_);
my ($section) = @_;
$section="" unless defined $section;
my $section_ptr=$self->_lookup_section($section);
return "" unless defined $section_ptr;
#***convert section number
my $section_num ="1". $section;
my $dmd_num = $section_num;
# #**output the dmdSection details
# if ($section_num eq "1") {
# $dmd_num = "0";
# }
my $all_text = ''. "\n";
$all_text .= ' '."\n";
$all_text .= " \n";
foreach my $data (@{$section_ptr->{'metadata'}}){
my $escaped_value = &_escape_text($data->[1]);
$all_text .= ' '. $escaped_value. "\n";
if ($data->[0] eq "dc.Title") {
$all_text .= ' '. $escaped_value."\n";
}
}
$all_text .= " \n";
$all_text .= " \n";
$all_text .= "\n";
foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
$all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
}
$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
return $all_text;
}
sub output_section {
my $self = shift (@_);
my ($handle, $section) = @_;
print $handle $self->buffer_section_xml($section);
}
#*** print out DSpace dublin_core metadata section
sub output_dspace_section {
my $self = shift (@_);
my ($handle, $section) = @_;
my $section_ptr = $self->_lookup_section ($section);
return "" unless defined $section_ptr;
my $all_text = "\n";
$all_text .= " \n";
# output metadata
foreach my $data (@{$section_ptr->{'metadata'}}) {
my $escaped_value = &_escape_text($data->[1]);
$all_text .= ' ' . $escaped_value . "\n";
}
$all_text .= " \n";
$all_text .= "\n";
# make sure no nasty control characters have snuck through
# (XML::Parser will barf on anything it doesn't consider to be
# valid UTF-8 text, including things like \c@, \cC etc.)
$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
return $all_text;
}
#*** print out doctxt.xml file
sub output_txt_section {
my $self = shift (@_);
my ($handle, $section) = @_;
print $handle $self->buffer_txt_section_xml($section);
}
#*** print out docmets.xml file
sub output_mets_section {
my $self = shift(@_);
my ($handle, $section) = @_;
#***print out the dmdSection
print $handle $self->buffer_mets_dmdSection_section_xml($section);
#***print out the fileSection by sections
print $handle "\n";
print $handle $self->buffer_mets_fileSection_section_xml($section);
#***print out the whole fileSection
print $handle $self->buffer_mets_fileWhole_section_xml($section);
print $handle "\n";
#***print out the StruMapSection by sections
print $handle '' . "\n";
my $order_num=0;
print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
print $handle "\n";
print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
print $handle "\n";
}
#*** print out dublin_core.xml file
sub output_dc_section {
my $self = shift(@_);
my ($handle, $section, $doc_Dir) = @_;
#***print out the dublin_core
$section="" unless defined $section;
my $section_ptr=$self->_lookup_section($section);
return "" unless defined $section_ptr;
my $all_text="";
foreach my $data (@{$section_ptr->{'metadata'}}){
my $escaped_value = &_escape_text($data->[1]);
if ($data->[0]=~ /^dc/) {
$data->[0] =~ tr/[A-Z]/[a-z]/;
$data->[0] =~ /^dc\.(.*)/;
my $dc_element = $1;
#$all_text .= ' '. $escaped_value. "\n";
$all_text .= ' '. $escaped_value. "\n";
}
}
if ($all_text eq "") {
$all_text .= " There is no Dublin Core metatdata in this document\n";
}
$all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
print $handle $all_text;
}
# look up the reference to the a particular section
sub _lookup_section {
my $self = shift (@_);
my ($section) = @_;
my ($num);
my $sectionref = $self;
while (defined $section && $section ne "") {
($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
$num =~ s/^0+(\d)/$1/; # remove leading 0s
$section = "" unless defined $section;
if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
$sectionref = $sectionref->{'subsections'}->{$num};
} else {
return undef;
}
}
return $sectionref;
}
# calculate OID by hashing the contents of the document
sub _calc_OID {
my $self = shift (@_);
my ($filename) = @_;
my $osexe = &util::get_os_exe();
my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
$ENV{'GSDLOS'},"hashfile$osexe");
my $result = "NULL";
if (-e "$hashfile_exe") {
# $result = `\"$hashfile_exe\" \"$filename\"`;
$result = `hashfile$osexe \"$filename\"`;
($result) = $result =~ /:\s*([0-9a-f]+)/i;
} else {
print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
}
return "HASH$result";
}
# methods dealing with OID, not groups of them.
# if $OID is not provided one is calculated
sub set_OID {
my $self = shift (@_);
my ($OID) = @_;
my $use_hash_oid = 0;
# if an OID wasn't provided claculate one
if (!defined $OID) {
$OID = "NULL";
if ($self->{'OIDtype'} eq "hash") {
$use_hash_oid = 1;
} elsif ($self->{'OIDtype'} eq "incremental") {
$OID = "D" . $OIDcount;
$OIDcount ++;
} elsif ($self->{'OIDtype'} eq "dirname") {
$OID = 'J';
my $filename = $self->get_source_filename();
if (defined($filename)) { # && -e $filename) {
$OID = &File::Basename::dirname($filename);
if (defined $OID) {
$OID = 'J'.&File::Basename::basename($OID);
$OID =~ s/\.//; #remove any periods
} else {
print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
$use_hash_oid = 1;
}
} else {
print STDERR "Failed to find filename, generating hash id\n";
$use_hash_oid = 1;
}
} elsif ($self->{'OIDtype'} eq "assigned") {
my $identifier = $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
if (defined $identifier && $identifier ne "") {
$OID = "D" . $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
$OID =~ s/\.//; #remove any periods
} else {
# need a hash id
print STDERR "no dc.Identifier found, generating hash id\n";
$use_hash_oid = 1;
}
} else {
$use_hash_oid = 1;
}
if ($use_hash_oid) {
# "hash" OID - feed file to hashfile.exe
#my $filename = $self->get_source_filename();
# we want to use the converted file for hashing if available
# cos its quicker
my $filename = $self->get_filename_for_hashing();
# -z: don't want to hash on the file if it is zero size
if (defined($filename) && -e $filename && !-z $filename) {
$OID = $self->_calc_OID ($filename);
} else {
$filename = &util::get_tmp_filename();
if (!open (OUTFILE, ">$filename")) {
print STDERR "doc::set_OID could not write to $filename\n";
} else {
$self->output_section('OUTFILE', $self->get_top_section(),
undef, 1);
close (OUTFILE);
}
$OID = $self->_calc_OID ($filename);
&util::rm ($filename);
}
}
}
$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
}
# this uses hashdoc (embedded c thingy) which is faster but still
# needs a little work to be suffiently stable
sub ___set_OID {
my $self = shift (@_);
my ($OID) = @_;
# if an OID wasn't provided then calculate hash value based on document
if (!defined $OID)
{
my $hash_text = $self->buffer_section_gml($self->get_top_section(),
undef, 1);
my $hash_len = length($hash_text);
$OID = &hashdoc::buffer($hash_text,$hash_len);
}
$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
}
# returns the OID for this document
sub get_OID {
my $self = shift (@_);
my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
return $OID if (defined $OID);
return "NULL";
}
sub delete_OID {
my $self = shift (@_);
$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
}
# methods for manipulating section names
# returns the name of the top-most section (the top
# level of the document
sub get_top_section {
my $self = shift (@_);
return "";
}
# returns a section
sub get_parent_section {
my $self = shift (@_);
my ($section) = @_;
$section =~ s/(^|\.)\d+$//;
return $section;
}
# returns the first child section (or the end child
# if there isn't any)
sub get_begin_child {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
return "" unless defined $section_ptr;
if (defined $section_ptr->{'subsection_order'}->[0]) {
return "$section.$section_ptr->{'subsection_order'}->[0]";
}
return $self->get_end_child ($section);
}
# returns the next child of a parent section
sub get_next_child {
my $self = shift (@_);
my ($section) = @_;
my $parent_section = $self->get_parent_section($section);
my $parent_section_ptr = $self->_lookup_section($parent_section);
return undef unless defined $parent_section_ptr;
my ($section_num) = $section =~ /(\d+)$/;
return undef unless defined $section_num;
my $i = 0;
my $section_order = $parent_section_ptr->{'subsection_order'};
while ($i < scalar(@$section_order)) {
last if $section_order->[$i] eq $section_num;
$i++;
}
$i++; # the next child
if ($i < scalar(@$section_order)) {
return $section_order->[$i] if $parent_section eq "";
return "$parent_section.$section_order->[$i]";
}
# no more sections in this level
return undef;
}
# returns a reference to a list of children
sub get_children {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
return [] unless defined $section_ptr;
my @children = @{$section_ptr->{'subsection_order'}};
map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
return \@children;
}
# returns the child section one past the last one (which
# is coded as "0")
sub get_end_child {
my $self = shift (@_);
my ($section) = @_;
return $section . ".0" unless $section eq "";
return "0";
}
# returns the next section in book order
sub get_next_section {
my $self = shift (@_);
my ($section) = @_;
return undef unless defined $section;
my $section_ptr = $self->_lookup_section($section);
return undef unless defined $section_ptr;
# first try to find first child
if (defined $section_ptr->{'subsection_order'}->[0]) {
return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
return "$section.$section_ptr->{'subsection_order'}->[0]";
}
do {
# try to find sibling
my $next_child = $self->get_next_child ($section);
return $next_child if (defined $next_child);
# move up one level
$section = $self->get_parent_section ($section);
} while $section =~ /\d/;
return undef;
}
sub is_leaf_section {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
return 1 unless defined $section_ptr;
return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
}
# methods for dealing with sections
# returns the name of the inserted section
sub insert_section {
my $self = shift (@_);
my ($before_section) = @_;
# get the child to insert before and its parent section
my $parent_section = "";
my $before_child = "0";
my @before_section = split (/\./, $before_section);
if (scalar(@before_section) > 0) {
$before_child = pop (@before_section);
$parent_section = join (".", @before_section);
}
my $parent_section_ptr = $self->_lookup_section($parent_section);
if (!defined $parent_section_ptr) {
print STDERR "doc::insert_section couldn't find parent section " .
"$parent_section\n";
return;
}
# get the next section number
my $section_num = $parent_section_ptr->{'next_subsection'}++;
my $i = 0;
while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
$i++;
}
# insert the section number into the order list
splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
# add this section to the parent section
my $section_ptr = {'subsection_order'=>[],
'next_subsection'=>1,
'subsections'=>{},
'metadata'=>[],
'text'=>""};
$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
# work out the full section number
my $section = $parent_section;
$section .= "." unless $section eq "";
$section .= $section_num;
return $section;
}
# creates a pre-named section
sub create_named_section {
my $self = shift (@_);
my ($mastersection) = @_;
my ($num);
my $section = $mastersection;
my $sectionref = $self;
while ($section ne "") {
($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
$num =~ s/^0+(\d)/$1/; # remove leading 0s
$section = "" unless defined $section;
if (defined $num) {
if (!defined $sectionref->{'subsections'}->{$num}) {
push (@{$sectionref->{'subsection_order'}}, $num);
$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
'next_subsection'=>1,
'subsections'=>{},
'metadata'=>[],
'text'=>""};
if ($num >= $sectionref->{'next_subsection'}) {
$sectionref->{'next_subsection'} = $num + 1;
}
}
$sectionref = $sectionref->{'subsections'}->{$num};
} else {
print STDERR "doc::create_named_section couldn't create section ";
print STDERR "$mastersection\n";
last;
}
}
}
# returns a reference to a list of subsections
sub list_subsections {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section ($section);
if (!defined $section_ptr) {
print STDERR "doc::list_subsections couldn't find section $section\n";
return [];
}
return [@{$section_ptr->{'subsection_order'}}];
}
sub delete_section {
my $self = shift (@_);
my ($section) = @_;
# my $section_ptr = {'subsection_order'=>[],
# 'next_subsection'=>1,
# 'subsections'=>{},
# 'metadata'=>[],
# 'text'=>""};
# if this is the top section reset everything
if ($section eq "") {
$self->{'subsection_order'} = [];
$self->{'subsections'} = {};
$self->{'metadata'} = [];
$self->{'text'} = "";
return;
}
# find the parent of the section to delete
my $parent_section = "";
my $child = "0";
my @section = split (/\./, $section);
if (scalar(@section) > 0) {
$child = pop (@section);
$parent_section = join (".", @section);
}
my $parent_section_ptr = $self->_lookup_section($parent_section);
if (!defined $parent_section_ptr) {
print STDERR "doc::delete_section couldn't find parent section " .
"$parent_section\n";
return;
}
# remove this section from the subsection_order list
my $i = 0;
while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
last;
}
$i++;
}
# remove this section from the subsection hash
if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
undef $parent_section_ptr->{'subsections'}->{$child};
}
}
#--
# methods for dealing with metadata
# set_metadata_element and get_metadata_element are for metadata
# which should only have one value. add_meta_data and get_metadata
# are for metadata which can have more than one value.
# returns the first metadata value which matches field
# This version of get metadata element works much like the one above,
# except it allows for the namespace portion of a metadata element to
# be ignored, thus if you are searching for dc.Title, the first piece
# of matching metadata ending with the name Title (once any namespace
# is removed) would be returned.
# 28-11-2003 John Thompson
sub get_metadata_element {
my $self = shift (@_);
my ($section, $field, $ignore_namespace) = @_;
my ($data);
$ignore_namespace = 0 unless defined $ignore_namespace;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
return;
}
# Remove the any namespace if we are being told to ignore them
if($ignore_namespace) {
$field =~ s/^\w*\.//;
}
foreach $data (@{$section_ptr->{'metadata'}}) {
my $data_name = $data->[0];
# Remove the any namespace if we are being told to ignore them
if($ignore_namespace) {
$data_name =~ s/^\w*\.//;
}
return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
}
return undef; # was not found
}
# returns a list of the form [value1, value2, ...]
sub get_metadata {
my $self = shift (@_);
my ($section, $field, $ignore_namespace) = @_;
my ($data);
$ignore_namespace = 0 unless defined $ignore_namespace;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_metadata couldn't find section ",
$section, "\n";
return;
}
# Remove the any namespace if we are being told to ignore them
if($ignore_namespace) {
$field =~ s/^\w*\.//;
}
my @metadata = ();
foreach $data (@{$section_ptr->{'metadata'}}) {
my $data_name = $data->[0];
# Remove the any namespace if we are being told to ignore them
if($ignore_namespace) {
$data_name =~ s/^\w*\.//;
}
push (@metadata, $data->[1]) if ($data_name eq $field);
}
return \@metadata;
}
# returns a list of the form [[field,value],[field,value],...]
sub get_all_metadata {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
return;
}
return $section_ptr->{'metadata'};
}
# $value is optional
sub delete_metadata {
my $self = shift (@_);
my ($section, $field, $value) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
return;
}
my $i = 0;
while ($i < scalar (@{$section_ptr->{'metadata'}})) {
if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
(!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
splice (@{$section_ptr->{'metadata'}}, $i, 1);
} else {
$i++;
}
}
}
sub delete_all_metadata {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
return;
}
$section_ptr->{'metadata'} = [];
}
sub set_metadata_element {
my $self = shift (@_);
my ($section, $field, $value) = @_;
$self->set_utf8_metadata_element ($section, $field,
&unicode::ascii2utf8(\$value));
}
# set_utf8_metadata_element assumes the text has already been
# converted to the UTF-8 encoding.
sub set_utf8_metadata_element {
my $self = shift (@_);
my ($section, $field, $value) = @_;
$self->delete_metadata ($section, $field);
$self->add_utf8_metadata ($section, $field, $value);
}
# add_metadata assumes the text is in (extended) ascii form. For
# text which has already been converted to the UTF-8 format use
# add_utf8_metadata.
sub add_metadata {
my $self = shift (@_);
my ($section, $field, $value) = @_;
$self->add_utf8_metadata ($section, $field,
&unicode::ascii2utf8(\$value));
}
sub add_utf8_metadata {
my $self = shift (@_);
my ($section, $field, $value) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
return;
}
if (!defined $value) {
print STDERR "doc::add_utf8_metadata undefined value for $field\n";
return;
}
if (!defined $field) {
print STDERR "doc::add_utf8_metadata undefined metadata type \n";
return;
}
#print STDERR "###$field=$value\n";
# double check that the value is utf-8
if (unicode::ensure_utf8(\$value)) {
print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
}
push (@{$section_ptr->{'metadata'}}, [$field, $value]);
}
# methods for dealing with text
# returns the text for a section
sub get_text {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_text couldn't find section " .
"$section\n";
return "";
}
return $section_ptr->{'text'};
}
# returns the (utf-8 encoded) length of the text for a section
sub get_text_length {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::get_text_length couldn't find section " .
"$section\n";
return 0;
}
return length ($section_ptr->{'text'});
}
sub delete_text {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::delete_text couldn't find section " .
"$section\n";
return;
}
$section_ptr->{'text'} = "";
}
# add_text assumes the text is in (extended) ascii form. For
# text which has been already converted to the UTF-8 format
# use add_utf8_text.
sub add_text {
my $self = shift (@_);
my ($section, $text) = @_;
# convert the text to UTF-8 encoded unicode characters
# and add the text
$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
}
# add_utf8_text assumes the text to be added has already
# been converted to the UTF-8 encoding. For ascii text use
# add_text
sub add_utf8_text {
my $self = shift (@_);
my ($section, $text) = @_;
my $section_ptr = $self->_lookup_section($section);
if (!defined $section_ptr) {
print STDERR "doc::add_utf8_text couldn't find section " .
"$section\n";
return;
}
$section_ptr->{'text'} .= $text;
}
# methods for dealing with associated files
# a file is associated with a document, NOT a section.
# if section is defined it is noted in the data structure
# only so that files associated from a particular section
# may be removed later (using delete_section_assoc_files)
sub associate_file {
my $self = shift (@_);
my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
# remove all associated files with the same name
$self->delete_assoc_file ($assoc_filename);
push (@{$self->{'associated_files'}},
[$real_filename, $assoc_filename, $mime_type, $section]);
}
# returns a list of associated files in the form
# [[real_filename, assoc_filename, mimetype], ...]
sub get_assoc_files {
my $self = shift (@_);
return $self->{'associated_files'};
}
sub delete_section_assoc_files {
my $self = shift (@_);
my ($section) = @_;
my $i=0;
while ($i < scalar (@{$self->{'associated_files'}})) {
if (defined $self->{'associated_files'}->[$i]->[3] &&
$self->{'associated_files'}->[$i]->[3] eq $section) {
splice (@{$self->{'associated_files'}}, $i, 1);
} else {
$i++;
}
}
}
sub delete_assoc_file {
my $self = shift (@_);
my ($assoc_filename) = @_;
my $i=0;
while ($i < scalar (@{$self->{'associated_files'}})) {
if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
splice (@{$self->{'associated_files'}}, $i, 1);
} else {
$i++;
}
}
}
sub reset_nextsection_ptr {
my $self = shift (@_);
my ($section) = @_;
my $section_ptr = $self->_lookup_section($section);
$section_ptr->{'next_subsection'} = 1;
}
1;