source: trunk/gsdl/perllib/incremental_build.pm@ 12844

Last change on this file since 12844 was 12844, checked in by mdewsnip, 18 years ago

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.2 KB
Line 
1###########################################################################
2#
3# incremental_build.pm -- API to assist incremental building
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27###########################################################################
28# /** Initial versions of these functions by John Thompson, revisions by
29# * and turning it into a package by John Rowe. Used heavily by
30# * basebuilder::remove_document() and getdocument.pl
31# *
32# * @version 1.0 Initial version by John Thompson
33# * @version 1.1 Addition of get_document and change of get_document_as_xml by John Rowe
34# * @version 2.0 Package version including seperation from calling code and modularisation
35# * by creating gdbmget, gdbmset and get_database_path by John Rowe
36# *
37# * @author John Thompson, DL Consulting Ltd.
38# * @author John Rowe, DL Consulting Ltd.
39# */
40###########################################################################
41
42use util;
43
44package incremental_build;
45# Change debugging to 1 if you want verbose debugging output
46$debug = 0;
47
48# Ensure the collection specific binaries are on the search path
49my $path_separator = ":";
50$ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}).$path_separator.&util::filename_cat($ENV{'GSDLHOME'}, "bin", "script").$path_separator.$ENV{'PATH'};
51
52
53
54# /** Use the gdbm get tool to retrieve and populate a doc object with data.
55# * Then return the doc object if it was found and nothing if not.
56# *
57# * @param $database The full path, including the file itself, of the gdbm
58# * database as a string.
59# * @param $oid The unique identifier of the required document as a string.
60# * @author John Thompson, DL Consulting Ltd.
61# * @author John Rowe, DL Consulting Ltd.
62# */
63sub get_document
64{
65 my($collection, $oid) = @_;
66
67 # Get the raw document text to create a document object out of
68 $raw_document = gdbmget($collection, $oid);
69
70 # Check for content and if some are found then we can return the created object
71 if($raw_document =~ /\w+/)
72 {
73 # Create a new document object
74 my $doc_obj = new doc();
75 $doc_obj->set_OID($oid);
76
77 &process_document_section($collection, $oid, $doc_obj, "", "", 0, 0);
78 return $doc_obj;
79 }
80
81 # Otherwise we return nothing
82}
83
84# /** This works out the database path and returns it to the calling
85# * calling function.
86# *
87# * @param $collection The current collection name
88# *
89# * @author John Rowe, DL Consulting Ltd.
90# */
91sub get_database_path
92{
93 $collection = shift(@_);
94
95 # Find out the database extension
96 my $ext = ".bdb";
97 $ext = ".ldb" if &util::is_little_endian();
98
99 # Now return the full filename of the database
100 return &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text", $collection.$ext);
101}
102
103# /** This wraps John T's gdbmget executable to get the gdbm database entry for
104# * a particular OID.
105# *
106# * @param $collection is the collection name.
107# * @param $oid is the internal document id.
108# *
109# * @author John Rowe, DL Consulting Ltd.
110# */
111sub gdbmget
112{
113 my ($collection, $oid) = @_;
114
115 # Where's the database?
116 $database = &get_database_path($collection);
117
118 # Are we in windows? Do we need .exe?
119 $exe = "";
120 $exe = ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
121
122 # Retrieve the raw document content
123 print STDERR "#Get document\ncmd: gdbmget$exe \"$database\" \"$oid\"\n" if $debug;
124 return `gdbmget$exe "$database" "$oid"`;
125}
126
127# /** This wraps John T's gdbmset executable to set the gdbm database entry for
128# * a particular OID. This does not yet report errors.
129# *
130# * @param $collection is the collection name.
131# * @param $oid is the internal document id.
132# * @param $value is the new value to set for the oid.
133# *
134# * @author John Rowe, DL Consulting Ltd.
135# */
136sub gdbmset
137{
138 my ($collection, $oid, $value) = @_;
139
140 # Where's the database?
141 $database = &get_database_path($collection);
142
143 # Are we in windows? Do we need .exe?
144 my $exe = &util::get_os_exe();
145
146 # Retrieve the raw document content
147 print STDERR "#Get document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\"\n" if $debug;
148 `gdbmset$exe "$database" "$oid" "$value"`;
149}
150
151# /** This uses get_document to retrieve the document object, it then outputs the
152# * XML text of the document to STDOUT.
153# *
154# * @param $collection The collection the document exists in.
155# *
156# * @param $oid The unique identifier of the required document as a string.
157# * @author John Rowe, DL Consulting Ltd.
158# */
159sub get_document_as_xml
160{
161 my($collection, $oid) = @_;
162
163 # Try to grab our document object
164 $doc_obj = get_document($collection, $oid);
165
166 # If there is an object returned then output it before we leave
167 if(defined $doc_obj)
168 {
169 # Create a new document printer processor
170 my $processor = new docprint();
171 # Finally process it into xml
172 $processor->process($doc_obj);
173 }
174}
175
176# /** This processes the information out of the gdbm database into a document
177# * object.
178# *
179# * @version 1.0 Initial version by John Thompson
180# * @version 2.0 Modified the gdbm fetch routines to use the perl abstractions
181# * by John Rowe
182# *
183# * @author John Thompson, DL Consulting Ltd.
184# * @author John Rowe, DL Consulting Ltd.
185# */
186sub process_document_section
187{
188 my ($collection, $oid, $doc_obj, $section, $archivedir, $assocdir, $out) = @_;
189
190 my $hastxt = 0;
191 my $contains = "";
192 my $docnum = 0;
193
194 my $srclink = "";
195 # Grab the information out of the gdbm database
196 my $data = gdbmget($collection, $oid);
197 # Loop through the information and look at each line to add metadata to the document object
198 foreach my $line (split(/\n/, $data))
199 {
200 next unless $line =~ /^<([^>]+)>(.*)$/;
201 my $key = $1;
202 my $value = $2;
203 if ($key eq "hastxt" && $value eq "1") {
204 $hastxt = 1;
205 } elsif ($key eq "archivedir") {
206 $archivedir = $value;
207 } elsif ($key eq "contains") {
208 $contains = $value;
209 } elsif ($key eq "docnum") {
210 $docnum = $value;
211 } elsif ($key !~ /^(doctype|thistype|childtype)$/) {
212 if ($section ne "") {
213# section level metadata
214 $doc_obj->add_utf8_metadata($section, $key, $value);
215
216 } else {
217 if (!defined($metadata->{$oid}->{$key})) {
218# top level plugin derived metadata (i.e. stuff not in
219# new metadata.xml file - including Language, Encoding,
220# srcext, srclink, srcicon, DocExt, ContentType)
221 $doc_obj->add_utf8_metadata($section, $key, $value);
222 if ($key eq "srclink") {
223 $srclink = $value;
224 }
225 }
226 }
227 }
228 }
229
230#my $adir = &util::filename_cat($assocdir, $archivedir);
231
232# associate source file if required
233#if ($srclink ne "") {
234# my ($srcfile) = $srclink =~ /([^\\\/]*?)[\">]*$/;
235# &associate_file($adir, $srcfile, $srcfile, $doc_obj);
236# }
237
238 if ($section eq "") {
239
240# top level metadata comes from metadata.xml of update package
241# (except for plugin derived metadata like "Language",
242# "Encoding", "srcext" etc. which is set above)
243 foreach my $metaname (keys %{$metadata->{$oid}}) {
244 foreach my $value (@{$metadata->{$oid}->{$metaname}}) {
245 $doc_obj->add_utf8_metadata($section, $metaname, $value);
246 }
247 }
248 }
249
250# add text
251# if ($hastxt && $docnum) {
252# my $text = "";
253# &get_text($docnum, \$text);
254# $doc_obj->add_utf8_text($section, $text);
255#
256# # sort out any associated files
257# $text =~ s/(_http(?:doc|coll)img(?:full)?_\/)([^\">\/]+)/$1 . $2 . &associate_file($adir, $2, $2, $doc_obj, $out)/eg;
258# }
259
260# Don't process the subsections of classifiers
261 if ($contains =~ /\w/) {
262 if($oid =~ /^CL/) {
263 $doc_obj->add_utf8_metadata($section, "contains", $contains);
264 }
265 else {
266# process subsections
267 foreach my $suboid (split(/;/, $contains)) {
268 $suboid =~ s/^\"/$oid/;
269 my $subsection = $doc_obj->insert_section($doc_obj->get_end_child($section));
270 &process_document_section($collection, $suboid, $doc_obj, $subsection, $archivedir, $assocdir, $out);
271 }
272 }
273 }
274}
275
276sub associate_file {
277 my ($dir, $realname, $assocname, $doc_obj, $out) = @_;
278
279 my $assocfile = &util::filename_cat($dir, $realname);
280 if (-e $assocfile) {
281 $doc_obj->associate_file($assocfile, $assocname);
282 } else {
283 print $out "WARNING: Associated file $assocfile could not be found\n";
284 }
285
286 return "";
287}
Note: See TracBrowser for help on using the repository browser.