Context Navigation

source: gsdl/tags/2.75/perllib/incremental_build.pm@ 18244

Last change on this file since 18244 was 13171, checked in by kjdon, 18 years ago
docprint is no longer a docproc, now need to call get_section_xml and print to STDOUT ourselves. Note, I don't know how to test this module
Property svn:keywords set to `Author Date Id Revision`
File size: 9.3 KB

Line
1	###########################################################################
2	#
3	# incremental_build.pm -- API to assist incremental building
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	###########################################################################
28	# /** Initial versions of these functions by John Thompson, revisions by
29	# * and turning it into a package by John Rowe. Used heavily by
30	# * basebuilder::remove_document() and getdocument.pl
31	# *
32	# * @version 1.0 Initial version by John Thompson
33	# * @version 1.1 Addition of get_document and change of get_document_as_xml by John Rowe
34	# * @version 2.0 Package version including seperation from calling code and modularisation
35	# * by creating gdbmget, gdbmset and get_database_path by John Rowe
36	# *
37	# * @author John Thompson, DL Consulting Ltd.
38	# * @author John Rowe, DL Consulting Ltd.
39	# */
40	###########################################################################
41
42	use util;
43
44	package incremental_build;
45	# Change debugging to 1 if you want verbose debugging output
46	$debug = 0;
47
48	# Ensure the collection specific binaries are on the search path
49	my $path_separator = ":";
50	$ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}).$path_separator.&util::filename_cat($ENV{'GSDLHOME'}, "bin", "script").$path_separator.$ENV{'PATH'};
51
52
53
54	# /** Use the gdbm get tool to retrieve and populate a doc object with data.
55	# * Then return the doc object if it was found and nothing if not.
56	# *
57	# * @param $database The full path, including the file itself, of the gdbm
58	# * database as a string.
59	# * @param $oid The unique identifier of the required document as a string.
60	# * @author John Thompson, DL Consulting Ltd.
61	# * @author John Rowe, DL Consulting Ltd.
62	# */
63	sub get_document
64	{
65	my($collection, $oid) = @_;
66
67	# Get the raw document text to create a document object out of
68	$raw_document = gdbmget($collection, $oid);
69
70	# Check for content and if some are found then we can return the created object
71	if($raw_document =~ /\w+/)
72	{
73	# Create a new document object
74	my $doc_obj = new doc();
75	$doc_obj->set_OID($oid);
76
77	&process_document_section($collection, $oid, $doc_obj, "", "", 0, 0);
78	return $doc_obj;
79	}
80
81	# Otherwise we return nothing
82	}
83
84	# /** This works out the database path and returns it to the calling
85	# * calling function.
86	# *
87	# * @param $collection The current collection name
88	# *
89	# * @author John Rowe, DL Consulting Ltd.
90	# */
91	sub get_database_path
92	{
93	$collection = shift(@_);
94
95	# Find out the database extension
96	my $ext = ".bdb";
97	$ext = ".ldb" if &util::is_little_endian();
98
99	# Now return the full filename of the database
100	return &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text", $collection.$ext);
101	}
102
103	# /** This wraps John T's gdbmget executable to get the gdbm database entry for
104	# * a particular OID.
105	# *
106	# * @param $collection is the collection name.
107	# * @param $oid is the internal document id.
108	# *
109	# * @author John Rowe, DL Consulting Ltd.
110	# */
111	sub gdbmget
112	{
113	my ($collection, $oid) = @_;
114
115	# Where's the database?
116	$database = &get_database_path($collection);
117
118	# Are we in windows? Do we need .exe?
119	$exe = "";
120	$exe = ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
121
122	# Retrieve the raw document content
123	print STDERR "#Get document\ncmd: gdbmget$exe \"$database\" \"$oid\"\n" if $debug;
124	return `gdbmget$exe "$database" "$oid"`;
125	}
126
127	# /** This wraps John T's gdbmset executable to set the gdbm database entry for
128	# * a particular OID. This does not yet report errors.
129	# *
130	# * @param $collection is the collection name.
131	# * @param $oid is the internal document id.
132	# * @param $value is the new value to set for the oid.
133	# *
134	# * @author John Rowe, DL Consulting Ltd.
135	# */
136	sub gdbmset
137	{
138	my ($collection, $oid, $value) = @_;
139
140	# Where's the database?
141	$database = &get_database_path($collection);
142
143	# Are we in windows? Do we need .exe?
144	my $exe = &util::get_os_exe();
145
146	# Retrieve the raw document content
147	print STDERR "#Get document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\"\n" if $debug;
148	`gdbmset$exe "$database" "$oid" "$value"`;
149	}
150
151	# /** This uses get_document to retrieve the document object, it then outputs the
152	# * XML text of the document to STDOUT.
153	# *
154	# * @param $collection The collection the document exists in.
155	# *
156	# * @param $oid The unique identifier of the required document as a string.
157	# * @author John Rowe, DL Consulting Ltd.
158	# */
159	sub get_document_as_xml
160	{
161	my($collection, $oid) = @_;
162
163	# Try to grab our document object
164	$doc_obj = get_document($collection, $oid);
165
166	# If there is an object returned then output it before we leave
167	if(defined $doc_obj)
168	{
169	my $doc_text = &docprint::get_section_xml($doc_obj, $doc_obj->get_top_section());
170	print STDOUT $doc_text;
171	# Create a new document printer processor
172	#my $processor = new docprint();
173	# Finally process it into xml
174	#$processor->process($doc_obj);
175	}
176	}
177
178	# /** This processes the information out of the gdbm database into a document
179	# * object.
180	# *
181	# * @version 1.0 Initial version by John Thompson
182	# * @version 2.0 Modified the gdbm fetch routines to use the perl abstractions
183	# * by John Rowe
184	# *
185	# * @author John Thompson, DL Consulting Ltd.
186	# * @author John Rowe, DL Consulting Ltd.
187	# */
188	sub process_document_section
189	{
190	my ($collection, $oid, $doc_obj, $section, $archivedir, $assocdir, $out) = @_;
191
192	my $hastxt = 0;
193	my $contains = "";
194	my $docnum = 0;
195
196	my $srclink = "";
197	# Grab the information out of the gdbm database
198	my $data = gdbmget($collection, $oid);
199	# Loop through the information and look at each line to add metadata to the document object
200	foreach my $line (split(/\n/, $data))
201	{
202	next unless $line =~ /^<([^>]+)>(.*)$/;
203	my $key = $1;
204	my $value = $2;
205	if ($key eq "hastxt" && $value eq "1") {
206	$hastxt = 1;
207	} elsif ($key eq "archivedir") {
208	$archivedir = $value;
209	} elsif ($key eq "contains") {
210	$contains = $value;
211	} elsif ($key eq "docnum") {
212	$docnum = $value;
213	} elsif ($key !~ /^(doctype\|thistype\|childtype)$/) {
214	if ($section ne "") {
215	# section level metadata
216	$doc_obj->add_utf8_metadata($section, $key, $value);
217
218	} else {
219	if (!defined($metadata->{$oid}->{$key})) {
220	# top level plugin derived metadata (i.e. stuff not in
221	# new metadata.xml file - including Language, Encoding,
222	# srcext, srclink, srcicon, DocExt, ContentType)
223	$doc_obj->add_utf8_metadata($section, $key, $value);
224	if ($key eq "srclink") {
225	$srclink = $value;
226	}
227	}
228	}
229	}
230	}
231
232	#my $adir = &util::filename_cat($assocdir, $archivedir);
233
234	# associate source file if required
235	#if ($srclink ne "") {
236	# my ($srcfile) = $srclink =~ /([^\\\/]?)[\">]$/;
237	# &associate_file($adir, $srcfile, $srcfile, $doc_obj);
238	# }
239
240	if ($section eq "") {
241
242	# top level metadata comes from metadata.xml of update package
243	# (except for plugin derived metadata like "Language",
244	# "Encoding", "srcext" etc. which is set above)
245	foreach my $metaname (keys %{$metadata->{$oid}}) {
246	foreach my $value (@{$metadata->{$oid}->{$metaname}}) {
247	$doc_obj->add_utf8_metadata($section, $metaname, $value);
248	}
249	}
250	}
251
252	# add text
253	# if ($hastxt && $docnum) {
254	# my $text = "";
255	# &get_text($docnum, \$text);
256	# $doc_obj->add_utf8_text($section, $text);
257	#
258	# # sort out any associated files
259	# $text =~ s/(_http(?:doc\|coll)img(?:full)?_\/)([^\">\/]+)/$1 . $2 . &associate_file($adir, $2, $2, $doc_obj, $out)/eg;
260	# }
261
262	# Don't process the subsections of classifiers
263	if ($contains =~ /\w/) {
264	if($oid =~ /^CL/) {
265	$doc_obj->add_utf8_metadata($section, "contains", $contains);
266	}
267	else {
268	# process subsections
269	foreach my $suboid (split(/;/, $contains)) {
270	$suboid =~ s/^\"/$oid/;
271	my $subsection = $doc_obj->insert_section($doc_obj->get_end_child($section));
272	&process_document_section($collection, $suboid, $doc_obj, $subsection, $archivedir, $assocdir, $out);
273	}
274	}
275	}
276	}
277
278	sub associate_file {
279	my ($dir, $realname, $assocname, $doc_obj, $out) = @_;
280
281	my $assocfile = &util::filename_cat($dir, $realname);
282	if (-e $assocfile) {
283	$doc_obj->associate_file($assocfile, $assocname);
284	} else {
285	print $out "WARNING: Associated file $assocfile could not be found\n";
286	}
287
288	return "";
289	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: