########################################################################### # # incremental_build.pm -- API to assist incremental building # # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### ########################################################################### # /** Initial versions of these functions by John Thompson, revisions by # * and turning it into a package by John Rowe. Used heavily by # * basebuilder::remove_document() and getdocument.pl # * # * @version 1.0 Initial version by John Thompson # * @version 1.1 Addition of get_document and change of get_document_as_xml by John Rowe # * @version 2.0 Package version including seperation from calling code and modularisation # * by creating gdbmget, gdbmset and get_database_path by John Rowe # * # * @author John Thompson, DL Consulting Ltd. # * @author John Rowe, DL Consulting Ltd. # */ ########################################################################### use util; package incremental_build; # Change debugging to 1 if you want verbose debugging output $debug = 0; # Ensure the collection specific binaries are on the search path my $path_separator = ":"; $ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}).$path_separator.&util::filename_cat($ENV{'GSDLHOME'}, "bin", "script").$path_separator.$ENV{'PATH'}; # /** Use the gdbm get tool to retrieve and populate a doc object with data. # * Then return the doc object if it was found and nothing if not. # * # * @param $database The full path, including the file itself, of the gdbm # * database as a string. # * @param $oid The unique identifier of the required document as a string. # * @author John Thompson, DL Consulting Ltd. # * @author John Rowe, DL Consulting Ltd. # */ sub get_document { my($collection, $oid) = @_; # Get the raw document text to create a document object out of $raw_document = gdbmget($collection, $oid); # Check for content and if some are found then we can return the created object if($raw_document =~ /\w+/) { # Create a new document object my $doc_obj = new doc(); $doc_obj->set_OID($oid); &process_document_section($collection, $oid, $doc_obj, "", "", 0, 0); return $doc_obj; } # Otherwise we return nothing } # /** This works out the database path and returns it to the calling # * calling function. # * # * @param $collection The current collection name # * # * @author John Rowe, DL Consulting Ltd. # */ sub get_database_path { $collection = shift(@_); # Find out the database extension my $ext = ".bdb"; $ext = ".ldb" if &util::is_little_endian(); # Now return the full filename of the database return &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text", $collection.$ext); } # /** This wraps John T's gdbmget executable to get the gdbm database entry for # * a particular OID. # * # * @param $collection is the collection name. # * @param $oid is the internal document id. # * # * @author John Rowe, DL Consulting Ltd. # */ sub gdbmget { my ($collection, $oid) = @_; # Where's the database? $database = &get_database_path($collection); # Are we in windows? Do we need .exe? $exe = ""; $exe = ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i; # Retrieve the raw document content print STDERR "#Get document\ncmd: gdbmget$exe \"$database\" \"$oid\"\n" if $debug; return `gdbmget$exe "$database" "$oid"`; } # /** This wraps John T's gdbmset executable to set the gdbm database entry for # * a particular OID. This does not yet report errors. # * # * @param $collection is the collection name. # * @param $oid is the internal document id. # * @param $value is the new value to set for the oid. # * # * @author John Rowe, DL Consulting Ltd. # */ sub gdbmset { my ($collection, $oid, $value) = @_; # Where's the database? $database = &get_database_path($collection); # Are we in windows? Do we need .exe? my $exe = &util::get_os_exe(); # Retrieve the raw document content print STDERR "#Get document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\"\n" if $debug; `gdbmset$exe "$database" "$oid" "$value"`; } # /** This uses get_document to retrieve the document object, it then outputs the # * XML text of the document to STDOUT. # * # * @param $collection The collection the document exists in. # * # * @param $oid The unique identifier of the required document as a string. # * @author John Rowe, DL Consulting Ltd. # */ sub get_document_as_xml { my($collection, $oid) = @_; # Try to grab our document object $doc_obj = get_document($collection, $oid); # If there is an object returned then output it before we leave if(defined $doc_obj) { my $doc_text = &docprint::get_section_xml($doc_obj, $doc_obj->get_top_section()); print STDOUT $doc_text; # Create a new document printer processor #my $processor = new docprint(); # Finally process it into xml #$processor->process($doc_obj); } } # /** This processes the information out of the gdbm database into a document # * object. # * # * @version 1.0 Initial version by John Thompson # * @version 2.0 Modified the gdbm fetch routines to use the perl abstractions # * by John Rowe # * # * @author John Thompson, DL Consulting Ltd. # * @author John Rowe, DL Consulting Ltd. # */ sub process_document_section { my ($collection, $oid, $doc_obj, $section, $archivedir, $assocdir, $out) = @_; my $hastxt = 0; my $contains = ""; my $docnum = 0; my $srclink = ""; # Grab the information out of the gdbm database my $data = gdbmget($collection, $oid); # Loop through the information and look at each line to add metadata to the document object foreach my $line (split(/\n/, $data)) { next unless $line =~ /^<([^>]+)>(.*)$/; my $key = $1; my $value = $2; if ($key eq "hastxt" && $value eq "1") { $hastxt = 1; } elsif ($key eq "archivedir") { $archivedir = $value; } elsif ($key eq "contains") { $contains = $value; } elsif ($key eq "docnum") { $docnum = $value; } elsif ($key !~ /^(doctype|thistype|childtype)$/) { if ($section ne "") { # section level metadata $doc_obj->add_utf8_metadata($section, $key, $value); } else { if (!defined($metadata->{$oid}->{$key})) { # top level plugin derived metadata (i.e. stuff not in # new metadata.xml file - including Language, Encoding, # srcext, srclink, srcicon, DocExt, ContentType) $doc_obj->add_utf8_metadata($section, $key, $value); if ($key eq "srclink") { $srclink = $value; } } } } } #my $adir = &util::filename_cat($assocdir, $archivedir); # associate source file if required #if ($srclink ne "") { # my ($srcfile) = $srclink =~ /([^\\\/]*?)[\">]*$/; # &associate_file($adir, $srcfile, $srcfile, $doc_obj); # } if ($section eq "") { # top level metadata comes from metadata.xml of update package # (except for plugin derived metadata like "Language", # "Encoding", "srcext" etc. which is set above) foreach my $metaname (keys %{$metadata->{$oid}}) { foreach my $value (@{$metadata->{$oid}->{$metaname}}) { $doc_obj->add_utf8_metadata($section, $metaname, $value); } } } # add text # if ($hastxt && $docnum) { # my $text = ""; # &get_text($docnum, \$text); # $doc_obj->add_utf8_text($section, $text); # # # sort out any associated files # $text =~ s/(_http(?:doc|coll)img(?:full)?_\/)([^\">\/]+)/$1 . $2 . &associate_file($adir, $2, $2, $doc_obj, $out)/eg; # } # Don't process the subsections of classifiers if ($contains =~ /\w/) { if($oid =~ /^CL/) { $doc_obj->add_utf8_metadata($section, "contains", $contains); } else { # process subsections foreach my $suboid (split(/;/, $contains)) { $suboid =~ s/^\"/$oid/; my $subsection = $doc_obj->insert_section($doc_obj->get_end_child($section)); &process_document_section($collection, $suboid, $doc_obj, $subsection, $archivedir, $assocdir, $out); } } } } sub associate_file { my ($dir, $realname, $assocname, $doc_obj, $out) = @_; my $assocfile = &util::filename_cat($dir, $realname); if (-e $assocfile) { $doc_obj->associate_file($assocfile, $assocname); } else { print $out "WARNING: Associated file $assocfile could not be found\n"; } return ""; }