source: gsdl/trunk/perllib/incremental_build.pm@ 14374

Last change on this file since 14374 was 13171, checked in by kjdon, 18 years ago

docprint is no longer a docproc, now need to call get_section_xml and print to STDOUT ourselves. Note, I don't know how to test this module

  • Property svn:keywords set to Author Date Id Revision
File size: 9.3 KB
Line 
1###########################################################################
2#
3# incremental_build.pm -- API to assist incremental building
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27###########################################################################
28# /** Initial versions of these functions by John Thompson, revisions by
29# * and turning it into a package by John Rowe. Used heavily by
30# * basebuilder::remove_document() and getdocument.pl
31# *
32# * @version 1.0 Initial version by John Thompson
33# * @version 1.1 Addition of get_document and change of get_document_as_xml by John Rowe
34# * @version 2.0 Package version including seperation from calling code and modularisation
35# * by creating gdbmget, gdbmset and get_database_path by John Rowe
36# *
37# * @author John Thompson, DL Consulting Ltd.
38# * @author John Rowe, DL Consulting Ltd.
39# */
40###########################################################################
41
42use util;
43
44package incremental_build;
45# Change debugging to 1 if you want verbose debugging output
46$debug = 0;
47
48# Ensure the collection specific binaries are on the search path
49my $path_separator = ":";
50$ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}).$path_separator.&util::filename_cat($ENV{'GSDLHOME'}, "bin", "script").$path_separator.$ENV{'PATH'};
51
52
53
54# /** Use the gdbm get tool to retrieve and populate a doc object with data.
55# * Then return the doc object if it was found and nothing if not.
56# *
57# * @param $database The full path, including the file itself, of the gdbm
58# * database as a string.
59# * @param $oid The unique identifier of the required document as a string.
60# * @author John Thompson, DL Consulting Ltd.
61# * @author John Rowe, DL Consulting Ltd.
62# */
63sub get_document
64{
65 my($collection, $oid) = @_;
66
67 # Get the raw document text to create a document object out of
68 $raw_document = gdbmget($collection, $oid);
69
70 # Check for content and if some are found then we can return the created object
71 if($raw_document =~ /\w+/)
72 {
73 # Create a new document object
74 my $doc_obj = new doc();
75 $doc_obj->set_OID($oid);
76
77 &process_document_section($collection, $oid, $doc_obj, "", "", 0, 0);
78 return $doc_obj;
79 }
80
81 # Otherwise we return nothing
82}
83
84# /** This works out the database path and returns it to the calling
85# * calling function.
86# *
87# * @param $collection The current collection name
88# *
89# * @author John Rowe, DL Consulting Ltd.
90# */
91sub get_database_path
92{
93 $collection = shift(@_);
94
95 # Find out the database extension
96 my $ext = ".bdb";
97 $ext = ".ldb" if &util::is_little_endian();
98
99 # Now return the full filename of the database
100 return &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text", $collection.$ext);
101}
102
103# /** This wraps John T's gdbmget executable to get the gdbm database entry for
104# * a particular OID.
105# *
106# * @param $collection is the collection name.
107# * @param $oid is the internal document id.
108# *
109# * @author John Rowe, DL Consulting Ltd.
110# */
111sub gdbmget
112{
113 my ($collection, $oid) = @_;
114
115 # Where's the database?
116 $database = &get_database_path($collection);
117
118 # Are we in windows? Do we need .exe?
119 $exe = "";
120 $exe = ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
121
122 # Retrieve the raw document content
123 print STDERR "#Get document\ncmd: gdbmget$exe \"$database\" \"$oid\"\n" if $debug;
124 return `gdbmget$exe "$database" "$oid"`;
125}
126
127# /** This wraps John T's gdbmset executable to set the gdbm database entry for
128# * a particular OID. This does not yet report errors.
129# *
130# * @param $collection is the collection name.
131# * @param $oid is the internal document id.
132# * @param $value is the new value to set for the oid.
133# *
134# * @author John Rowe, DL Consulting Ltd.
135# */
136sub gdbmset
137{
138 my ($collection, $oid, $value) = @_;
139
140 # Where's the database?
141 $database = &get_database_path($collection);
142
143 # Are we in windows? Do we need .exe?
144 my $exe = &util::get_os_exe();
145
146 # Retrieve the raw document content
147 print STDERR "#Get document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\"\n" if $debug;
148 `gdbmset$exe "$database" "$oid" "$value"`;
149}
150
151# /** This uses get_document to retrieve the document object, it then outputs the
152# * XML text of the document to STDOUT.
153# *
154# * @param $collection The collection the document exists in.
155# *
156# * @param $oid The unique identifier of the required document as a string.
157# * @author John Rowe, DL Consulting Ltd.
158# */
159sub get_document_as_xml
160{
161 my($collection, $oid) = @_;
162
163 # Try to grab our document object
164 $doc_obj = get_document($collection, $oid);
165
166 # If there is an object returned then output it before we leave
167 if(defined $doc_obj)
168 {
169 my $doc_text = &docprint::get_section_xml($doc_obj, $doc_obj->get_top_section());
170 print STDOUT $doc_text;
171 # Create a new document printer processor
172 #my $processor = new docprint();
173 # Finally process it into xml
174 #$processor->process($doc_obj);
175 }
176}
177
178# /** This processes the information out of the gdbm database into a document
179# * object.
180# *
181# * @version 1.0 Initial version by John Thompson
182# * @version 2.0 Modified the gdbm fetch routines to use the perl abstractions
183# * by John Rowe
184# *
185# * @author John Thompson, DL Consulting Ltd.
186# * @author John Rowe, DL Consulting Ltd.
187# */
188sub process_document_section
189{
190 my ($collection, $oid, $doc_obj, $section, $archivedir, $assocdir, $out) = @_;
191
192 my $hastxt = 0;
193 my $contains = "";
194 my $docnum = 0;
195
196 my $srclink = "";
197 # Grab the information out of the gdbm database
198 my $data = gdbmget($collection, $oid);
199 # Loop through the information and look at each line to add metadata to the document object
200 foreach my $line (split(/\n/, $data))
201 {
202 next unless $line =~ /^<([^>]+)>(.*)$/;
203 my $key = $1;
204 my $value = $2;
205 if ($key eq "hastxt" && $value eq "1") {
206 $hastxt = 1;
207 } elsif ($key eq "archivedir") {
208 $archivedir = $value;
209 } elsif ($key eq "contains") {
210 $contains = $value;
211 } elsif ($key eq "docnum") {
212 $docnum = $value;
213 } elsif ($key !~ /^(doctype|thistype|childtype)$/) {
214 if ($section ne "") {
215# section level metadata
216 $doc_obj->add_utf8_metadata($section, $key, $value);
217
218 } else {
219 if (!defined($metadata->{$oid}->{$key})) {
220# top level plugin derived metadata (i.e. stuff not in
221# new metadata.xml file - including Language, Encoding,
222# srcext, srclink, srcicon, DocExt, ContentType)
223 $doc_obj->add_utf8_metadata($section, $key, $value);
224 if ($key eq "srclink") {
225 $srclink = $value;
226 }
227 }
228 }
229 }
230 }
231
232#my $adir = &util::filename_cat($assocdir, $archivedir);
233
234# associate source file if required
235#if ($srclink ne "") {
236# my ($srcfile) = $srclink =~ /([^\\\/]*?)[\">]*$/;
237# &associate_file($adir, $srcfile, $srcfile, $doc_obj);
238# }
239
240 if ($section eq "") {
241
242# top level metadata comes from metadata.xml of update package
243# (except for plugin derived metadata like "Language",
244# "Encoding", "srcext" etc. which is set above)
245 foreach my $metaname (keys %{$metadata->{$oid}}) {
246 foreach my $value (@{$metadata->{$oid}->{$metaname}}) {
247 $doc_obj->add_utf8_metadata($section, $metaname, $value);
248 }
249 }
250 }
251
252# add text
253# if ($hastxt && $docnum) {
254# my $text = "";
255# &get_text($docnum, \$text);
256# $doc_obj->add_utf8_text($section, $text);
257#
258# # sort out any associated files
259# $text =~ s/(_http(?:doc|coll)img(?:full)?_\/)([^\">\/]+)/$1 . $2 . &associate_file($adir, $2, $2, $doc_obj, $out)/eg;
260# }
261
262# Don't process the subsections of classifiers
263 if ($contains =~ /\w/) {
264 if($oid =~ /^CL/) {
265 $doc_obj->add_utf8_metadata($section, "contains", $contains);
266 }
267 else {
268# process subsections
269 foreach my $suboid (split(/;/, $contains)) {
270 $suboid =~ s/^\"/$oid/;
271 my $subsection = $doc_obj->insert_section($doc_obj->get_end_child($section));
272 &process_document_section($collection, $suboid, $doc_obj, $subsection, $archivedir, $assocdir, $out);
273 }
274 }
275 }
276}
277
278sub associate_file {
279 my ($dir, $realname, $assocname, $doc_obj, $out) = @_;
280
281 my $assocfile = &util::filename_cat($dir, $realname);
282 if (-e $assocfile) {
283 $doc_obj->associate_file($assocfile, $assocname);
284 } else {
285 print $out "WARNING: Associated file $assocfile could not be found\n";
286 }
287
288 return "";
289}
Note: See TracBrowser for help on using the repository browser.