Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/perllib/incremental_build.pm@ 12844

Last change on this file since 12844 was 12844, checked in by mdewsnip, 18 years ago
Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.
Property svn:keywords set to `Author Date Id Revision`
File size: 9.2 KB

Line
1	###########################################################################
2	#
3	# incremental_build.pm -- API to assist incremental building
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	###########################################################################
28	# /** Initial versions of these functions by John Thompson, revisions by
29	# * and turning it into a package by John Rowe. Used heavily by
30	# * basebuilder::remove_document() and getdocument.pl
31	# *
32	# * @version 1.0 Initial version by John Thompson
33	# * @version 1.1 Addition of get_document and change of get_document_as_xml by John Rowe
34	# * @version 2.0 Package version including seperation from calling code and modularisation
35	# * by creating gdbmget, gdbmset and get_database_path by John Rowe
36	# *
37	# * @author John Thompson, DL Consulting Ltd.
38	# * @author John Rowe, DL Consulting Ltd.
39	# */
40	###########################################################################
41
42	use util;
43
44	package incremental_build;
45	# Change debugging to 1 if you want verbose debugging output
46	$debug = 0;
47
48	# Ensure the collection specific binaries are on the search path
49	my $path_separator = ":";
50	$ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}).$path_separator.&util::filename_cat($ENV{'GSDLHOME'}, "bin", "script").$path_separator.$ENV{'PATH'};
51
52
53
54	# /** Use the gdbm get tool to retrieve and populate a doc object with data.
55	# * Then return the doc object if it was found and nothing if not.
56	# *
57	# * @param $database The full path, including the file itself, of the gdbm
58	# * database as a string.
59	# * @param $oid The unique identifier of the required document as a string.
60	# * @author John Thompson, DL Consulting Ltd.
61	# * @author John Rowe, DL Consulting Ltd.
62	# */
63	sub get_document
64	{
65	my($collection, $oid) = @_;
66
67	# Get the raw document text to create a document object out of
68	$raw_document = gdbmget($collection, $oid);
69
70	# Check for content and if some are found then we can return the created object
71	if($raw_document =~ /\w+/)
72	{
73	# Create a new document object
74	my $doc_obj = new doc();
75	$doc_obj->set_OID($oid);
76
77	&process_document_section($collection, $oid, $doc_obj, "", "", 0, 0);
78	return $doc_obj;
79	}
80
81	# Otherwise we return nothing
82	}
83
84	# /** This works out the database path and returns it to the calling
85	# * calling function.
86	# *
87	# * @param $collection The current collection name
88	# *
89	# * @author John Rowe, DL Consulting Ltd.
90	# */
91	sub get_database_path
92	{
93	$collection = shift(@_);
94
95	# Find out the database extension
96	my $ext = ".bdb";
97	$ext = ".ldb" if &util::is_little_endian();
98
99	# Now return the full filename of the database
100	return &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text", $collection.$ext);
101	}
102
103	# /** This wraps John T's gdbmget executable to get the gdbm database entry for
104	# * a particular OID.
105	# *
106	# * @param $collection is the collection name.
107	# * @param $oid is the internal document id.
108	# *
109	# * @author John Rowe, DL Consulting Ltd.
110	# */
111	sub gdbmget
112	{
113	my ($collection, $oid) = @_;
114
115	# Where's the database?
116	$database = &get_database_path($collection);
117
118	# Are we in windows? Do we need .exe?
119	$exe = "";
120	$exe = ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
121
122	# Retrieve the raw document content
123	print STDERR "#Get document\ncmd: gdbmget$exe \"$database\" \"$oid\"\n" if $debug;
124	return `gdbmget$exe "$database" "$oid"`;
125	}
126
127	# /** This wraps John T's gdbmset executable to set the gdbm database entry for
128	# * a particular OID. This does not yet report errors.
129	# *
130	# * @param $collection is the collection name.
131	# * @param $oid is the internal document id.
132	# * @param $value is the new value to set for the oid.
133	# *
134	# * @author John Rowe, DL Consulting Ltd.
135	# */
136	sub gdbmset
137	{
138	my ($collection, $oid, $value) = @_;
139
140	# Where's the database?
141	$database = &get_database_path($collection);
142
143	# Are we in windows? Do we need .exe?
144	my $exe = &util::get_os_exe();
145
146	# Retrieve the raw document content
147	print STDERR "#Get document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\"\n" if $debug;
148	`gdbmset$exe "$database" "$oid" "$value"`;
149	}
150
151	# /** This uses get_document to retrieve the document object, it then outputs the
152	# * XML text of the document to STDOUT.
153	# *
154	# * @param $collection The collection the document exists in.
155	# *
156	# * @param $oid The unique identifier of the required document as a string.
157	# * @author John Rowe, DL Consulting Ltd.
158	# */
159	sub get_document_as_xml
160	{
161	my($collection, $oid) = @_;
162
163	# Try to grab our document object
164	$doc_obj = get_document($collection, $oid);
165
166	# If there is an object returned then output it before we leave
167	if(defined $doc_obj)
168	{
169	# Create a new document printer processor
170	my $processor = new docprint();
171	# Finally process it into xml
172	$processor->process($doc_obj);
173	}
174	}
175
176	# /** This processes the information out of the gdbm database into a document
177	# * object.
178	# *
179	# * @version 1.0 Initial version by John Thompson
180	# * @version 2.0 Modified the gdbm fetch routines to use the perl abstractions
181	# * by John Rowe
182	# *
183	# * @author John Thompson, DL Consulting Ltd.
184	# * @author John Rowe, DL Consulting Ltd.
185	# */
186	sub process_document_section
187	{
188	my ($collection, $oid, $doc_obj, $section, $archivedir, $assocdir, $out) = @_;
189
190	my $hastxt = 0;
191	my $contains = "";
192	my $docnum = 0;
193
194	my $srclink = "";
195	# Grab the information out of the gdbm database
196	my $data = gdbmget($collection, $oid);
197	# Loop through the information and look at each line to add metadata to the document object
198	foreach my $line (split(/\n/, $data))
199	{
200	next unless $line =~ /^<([^>]+)>(.*)$/;
201	my $key = $1;
202	my $value = $2;
203	if ($key eq "hastxt" && $value eq "1") {
204	$hastxt = 1;
205	} elsif ($key eq "archivedir") {
206	$archivedir = $value;
207	} elsif ($key eq "contains") {
208	$contains = $value;
209	} elsif ($key eq "docnum") {
210	$docnum = $value;
211	} elsif ($key !~ /^(doctype\|thistype\|childtype)$/) {
212	if ($section ne "") {
213	# section level metadata
214	$doc_obj->add_utf8_metadata($section, $key, $value);
215
216	} else {
217	if (!defined($metadata->{$oid}->{$key})) {
218	# top level plugin derived metadata (i.e. stuff not in
219	# new metadata.xml file - including Language, Encoding,
220	# srcext, srclink, srcicon, DocExt, ContentType)
221	$doc_obj->add_utf8_metadata($section, $key, $value);
222	if ($key eq "srclink") {
223	$srclink = $value;
224	}
225	}
226	}
227	}
228	}
229
230	#my $adir = &util::filename_cat($assocdir, $archivedir);
231
232	# associate source file if required
233	#if ($srclink ne "") {
234	# my ($srcfile) = $srclink =~ /([^\\\/]?)[\">]$/;
235	# &associate_file($adir, $srcfile, $srcfile, $doc_obj);
236	# }
237
238	if ($section eq "") {
239
240	# top level metadata comes from metadata.xml of update package
241	# (except for plugin derived metadata like "Language",
242	# "Encoding", "srcext" etc. which is set above)
243	foreach my $metaname (keys %{$metadata->{$oid}}) {
244	foreach my $value (@{$metadata->{$oid}->{$metaname}}) {
245	$doc_obj->add_utf8_metadata($section, $metaname, $value);
246	}
247	}
248	}
249
250	# add text
251	# if ($hastxt && $docnum) {
252	# my $text = "";
253	# &get_text($docnum, \$text);
254	# $doc_obj->add_utf8_text($section, $text);
255	#
256	# # sort out any associated files
257	# $text =~ s/(_http(?:doc\|coll)img(?:full)?_\/)([^\">\/]+)/$1 . $2 . &associate_file($adir, $2, $2, $doc_obj, $out)/eg;
258	# }
259
260	# Don't process the subsections of classifiers
261	if ($contains =~ /\w/) {
262	if($oid =~ /^CL/) {
263	$doc_obj->add_utf8_metadata($section, "contains", $contains);
264	}
265	else {
266	# process subsections
267	foreach my $suboid (split(/;/, $contains)) {
268	$suboid =~ s/^\"/$oid/;
269	my $subsection = $doc_obj->insert_section($doc_obj->get_end_child($section));
270	&process_document_section($collection, $suboid, $doc_obj, $subsection, $archivedir, $assocdir, $out);
271	}
272	}
273	}
274	}
275
276	sub associate_file {
277	my ($dir, $realname, $assocname, $doc_obj, $out) = @_;
278
279	my $assocfile = &util::filename_cat($dir, $realname);
280	if (-e $assocfile) {
281	$doc_obj->associate_file($assocfile, $assocname);
282	} else {
283	print $out "WARNING: Associated file $assocfile could not be found\n";
284	}
285
286	return "";
287	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: