###########################################################################
#
# incremental_build.pm -- API to assist incremental building
# 
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
# 
# Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

###########################################################################
# /** Initial versions of these functions by John Thompson, revisions by 
#  *  and turning it into a package by John Rowe. Used heavily by 
#  *  basebuilder::remove_document() and getdocument.pl
#  *  
#  *  @version 1.0 Initial version by John Thompson
#  *  @version 1.1 Addition of get_document and change of get_document_as_xml by John Rowe
#  *  @version 2.0 Package version including seperation from calling code and modularisation
#  *               by creating gdbmget, gdbmset and get_database_path by John Rowe
#  *  
#  *  @author John Thompson, DL Consulting Ltd.
#  *  @author John Rowe, DL Consulting Ltd.
#  */
###########################################################################

use util;

package incremental_build;
# Change debugging to 1 if you want verbose debugging output
$debug = 0;

# Ensure the collection specific binaries are on the search path
my $path_separator = ":";
$ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}).$path_separator.&util::filename_cat($ENV{'GSDLHOME'}, "bin", "script").$path_separator.$ENV{'PATH'};


# /** Use the gdbm get tool to retrieve and populate a doc object with data.
#  *  Then return the doc object if it was found and nothing if not.
#  *  
#  *  @param  $database The full path, including the file itself, of the gdbm
#  *                    database as a string.
#  *  @param  $oid The unique identifier of the required document as a string.
#  *  @author John Thompson, DL Consulting Ltd.
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub get_document
{
  my($collection, $oid) = @_;
  
  # Get the raw document text to create a document object out of
  $raw_document = gdbmget($collection, $oid);

  # Check for content and if some are found then we can return the created object
  if($raw_document =~ /\w+/)
  {
    # Create a new document object
    my $doc_obj = new doc();
    $doc_obj->set_OID($oid);

    &process_document_section($collection, $oid, $doc_obj, "", "", 0, 0);
    return $doc_obj;
  }
  
  # Otherwise we return nothing
}

# /** This works out the database path and returns it to the calling
#  *  calling function.
#  *  
#  *  @param $collection The current collection name
#  *  
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub get_database_path
{
  $collection = shift(@_);
  
  # Find out the database extension
  my $ext = ".bdb";
  $ext = ".ldb" if &util::is_little_endian();
  
  # Now return the full filename of the database
  return &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text", $collection.$ext);
}

# /** This wraps John T's gdbmget executable to get the gdbm database entry for
#  *  a particular OID.
#  *  
#  *  @param $collection is the collection name.
#  *  @param $oid is the internal document id.
#  *  
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub gdbmget
{
  my ($collection, $oid) = @_;
  
  # Where's the database?
  $database = &get_database_path($collection);
  
  # Are we in windows? Do we need .exe?
  $exe = "";
  $exe = ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
  
  # Retrieve the raw document content
  print STDERR "#Get document\ncmd: gdbmget$exe \"$database\" \"$oid\"\n" if $debug;
  return `gdbmget$exe "$database" "$oid"`;
}

# /** This wraps John T's gdbmset executable to set the gdbm database entry for
#  *  a particular OID. This does not yet report errors.
#  *  
#  *  @param $collection is the collection name.
#  *  @param $oid is the internal document id.
#  *  @param $value is the new value to set for the oid.
#  *  
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub gdbmset
{
  my ($collection, $oid, $value) = @_;
  
  # Where's the database?
  $database = &get_database_path($collection);
  
  # Are we in windows? Do we need .exe?
  my $exe = &util::get_os_exe();
  
  # Retrieve the raw document content
  print STDERR "#Get document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\"\n" if $debug;
  `gdbmset$exe "$database" "$oid" "$value"`;
}

# /** This uses get_document to retrieve the document object, it then outputs the 
#  *  XML text of the document to STDOUT.
#  *  
#  *  @param  $collection The collection the document exists in.
#  *                      
#  *  @param  $oid The unique identifier of the required document as a string.
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub get_document_as_xml
{
  my($collection, $oid) = @_;
  
  # Try to grab our document object
  $doc_obj = get_document($collection, $oid);
  
  # If there is an object returned then output it before we leave
  if(defined $doc_obj)
  {
      my $doc_text = &docprint::get_section_xml($doc_obj, $doc_obj->get_top_section());
      print STDOUT $doc_text;
      # Create a new document printer processor
      #my $processor = new docprint();
      # Finally process it into xml
      #$processor->process($doc_obj);
  }
}

# /** This processes the information out of the gdbm database into a document
#  *  object.
#  *  
#  *  @version 1.0 Initial version by John Thompson
#  *  @version 2.0 Modified the gdbm fetch routines to use the perl abstractions
#  *               by John Rowe
#  *  
#  *  @author John Thompson, DL Consulting Ltd.
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub process_document_section
{
  my ($collection, $oid, $doc_obj, $section, $archivedir, $assocdir, $out) = @_;
  
  my $hastxt = 0;
  my $contains = "";
  my $docnum = 0;
  
  my $srclink = "";
  # Grab the information out of the gdbm database
  my $data = gdbmget($collection, $oid);
  # Loop through the information and look at each line to add metadata to the document object
  foreach my $line (split(/\n/, $data))
  {
    next unless $line =~ /^<([^>]+)>(.*)$/;
    my $key = $1;
    my $value = $2;
    if ($key eq "hastxt" && $value eq "1") {
      $hastxt = 1;
    } elsif ($key eq "archivedir") {
      $archivedir = $value;
    } elsif ($key eq "contains") {
      $contains = $value;
    } elsif ($key eq "docnum") {
      $docnum = $value;
    } elsif ($key !~ /^(doctype|thistype|childtype)$/) {
      if ($section ne "") {
# section level metadata
        $doc_obj->add_utf8_metadata($section, $key, $value);
        
      } else {
        if (!defined($metadata->{$oid}->{$key})) {
# top level plugin derived metadata (i.e. stuff not in
# new metadata.xml file - including Language, Encoding,
# srcext, srclink, srcicon, DocExt, ContentType)
          $doc_obj->add_utf8_metadata($section, $key, $value);
          if ($key eq "srclink") {
            $srclink = $value;
          }
        }
      }
    }
  }
  
#my $adir = &util::filename_cat($assocdir, $archivedir);
  
# associate source file if required
#if ($srclink ne "") {
#	my ($srcfile) = $srclink =~ /([^\\\/]*?)[\">]*$/;
#	&associate_file($adir, $srcfile, $srcfile, $doc_obj);
#    }
  
  if ($section eq "") {
    
# top level metadata comes from metadata.xml of update package
# (except for plugin derived metadata like "Language",
# "Encoding", "srcext" etc. which is set above)
    foreach my $metaname (keys %{$metadata->{$oid}}) {
      foreach my $value (@{$metadata->{$oid}->{$metaname}}) {
        $doc_obj->add_utf8_metadata($section, $metaname, $value);
      }
    }
  }
  
# add text
#    if ($hastxt && $docnum) {
#	my $text = "";
#	&get_text($docnum, \$text);
#	$doc_obj->add_utf8_text($section, $text);
#
#	# sort out any associated files
#	$text =~ s/(_http(?:doc|coll)img(?:full)?_\/)([^\">\/]+)/$1 . $2 . &associate_file($adir, $2, $2, $doc_obj, $out)/eg;
#    }
  
# Don't process the subsections of classifiers
  if ($contains =~ /\w/) {
    if($oid =~ /^CL/) {
      $doc_obj->add_utf8_metadata($section, "contains", $contains);
    }
    else {
# process subsections
      foreach my $suboid (split(/;/, $contains)) {
        $suboid =~ s/^\"/$oid/;
        my $subsection = $doc_obj->insert_section($doc_obj->get_end_child($section));
        &process_document_section($collection, $suboid, $doc_obj, $subsection, $archivedir, $assocdir, $out);
      }
    }
  }
}

sub associate_file {
  my ($dir, $realname, $assocname, $doc_obj, $out) = @_;
  
  my $assocfile = &util::filename_cat($dir, $realname);
  if (-e $assocfile) {
    $doc_obj->associate_file($assocfile, $assocname);
  } else {
    print $out "WARNING: Associated file $assocfile could not be found\n";
  }
  
  return "";
}