1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # incremental_build.pm -- API to assist incremental building
|
---|
4 | #
|
---|
5 | # A component of the Greenstone digital library software
|
---|
6 | # from the New Zealand Digital Library Project at the
|
---|
7 | # University of Waikato, New Zealand.
|
---|
8 | #
|
---|
9 | # Copyright (C) 2006 DL Consulting Ltd and New Zealand Digital Library Project
|
---|
10 | #
|
---|
11 | # This program is free software; you can redistribute it and/or modify
|
---|
12 | # it under the terms of the GNU General Public License as published by
|
---|
13 | # the Free Software Foundation; either version 2 of the License, or
|
---|
14 | # (at your option) any later version.
|
---|
15 | #
|
---|
16 | # This program is distributed in the hope that it will be useful,
|
---|
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | # GNU General Public License for more details.
|
---|
20 | #
|
---|
21 | # You should have received a copy of the GNU General Public License
|
---|
22 | # along with this program; if not, write to the Free Software
|
---|
23 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
24 | #
|
---|
25 | ###########################################################################
|
---|
26 |
|
---|
27 | ###########################################################################
|
---|
28 | # /** Initial versions of these functions by John Thompson, revisions by
|
---|
29 | # * and turning it into a package by John Rowe. Used heavily by
|
---|
30 | # * basebuilder::remove_document() and getdocument.pl
|
---|
31 | # *
|
---|
32 | # * @version 1.0 Initial version by John Thompson
|
---|
33 | # * @version 1.1 Addition of get_document and change of get_document_as_xml by John Rowe
|
---|
34 | # * @version 2.0 Package version including seperation from calling code and modularisation
|
---|
35 | # * by creating gdbmget, gdbmset and get_database_path by John Rowe
|
---|
36 | # *
|
---|
37 | # * @author John Thompson, DL Consulting Ltd.
|
---|
38 | # * @author John Rowe, DL Consulting Ltd.
|
---|
39 | # */
|
---|
40 | ###########################################################################
|
---|
41 |
|
---|
42 | use util;
|
---|
43 |
|
---|
44 | package incremental_build;
|
---|
45 | # Change debugging to 1 if you want verbose debugging output
|
---|
46 | $debug = 0;
|
---|
47 |
|
---|
48 | # Ensure the collection specific binaries are on the search path
|
---|
49 | my $path_separator = ":";
|
---|
50 | $ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}).$path_separator.&util::filename_cat($ENV{'GSDLHOME'}, "bin", "script").$path_separator.$ENV{'PATH'};
|
---|
51 |
|
---|
52 |
|
---|
53 |
|
---|
54 | # /** Use the gdbm get tool to retrieve and populate a doc object with data.
|
---|
55 | # * Then return the doc object if it was found and nothing if not.
|
---|
56 | # *
|
---|
57 | # * @param $database The full path, including the file itself, of the gdbm
|
---|
58 | # * database as a string.
|
---|
59 | # * @param $oid The unique identifier of the required document as a string.
|
---|
60 | # * @author John Thompson, DL Consulting Ltd.
|
---|
61 | # * @author John Rowe, DL Consulting Ltd.
|
---|
62 | # */
|
---|
63 | sub get_document
|
---|
64 | {
|
---|
65 | my($collection, $oid) = @_;
|
---|
66 |
|
---|
67 | # Get the raw document text to create a document object out of
|
---|
68 | $raw_document = gdbmget($collection, $oid);
|
---|
69 |
|
---|
70 | # Check for content and if some are found then we can return the created object
|
---|
71 | if($raw_document =~ /\w+/)
|
---|
72 | {
|
---|
73 | # Create a new document object
|
---|
74 | my $doc_obj = new doc();
|
---|
75 | $doc_obj->set_OID($oid);
|
---|
76 |
|
---|
77 | &process_document_section($collection, $oid, $doc_obj, "", "", 0, 0);
|
---|
78 | return $doc_obj;
|
---|
79 | }
|
---|
80 |
|
---|
81 | # Otherwise we return nothing
|
---|
82 | }
|
---|
83 |
|
---|
84 | # /** This works out the database path and returns it to the calling
|
---|
85 | # * calling function.
|
---|
86 | # *
|
---|
87 | # * @param $collection The current collection name
|
---|
88 | # *
|
---|
89 | # * @author John Rowe, DL Consulting Ltd.
|
---|
90 | # */
|
---|
91 | sub get_database_path
|
---|
92 | {
|
---|
93 | $collection = shift(@_);
|
---|
94 |
|
---|
95 | # Find out the database extension
|
---|
96 | my $ext = ".bdb";
|
---|
97 | $ext = ".ldb" if &util::is_little_endian();
|
---|
98 |
|
---|
99 | # Now return the full filename of the database
|
---|
100 | return &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "index", "text", $collection.$ext);
|
---|
101 | }
|
---|
102 |
|
---|
103 | # /** This wraps John T's gdbmget executable to get the gdbm database entry for
|
---|
104 | # * a particular OID.
|
---|
105 | # *
|
---|
106 | # * @param $collection is the collection name.
|
---|
107 | # * @param $oid is the internal document id.
|
---|
108 | # *
|
---|
109 | # * @author John Rowe, DL Consulting Ltd.
|
---|
110 | # */
|
---|
111 | sub gdbmget
|
---|
112 | {
|
---|
113 | my ($collection, $oid) = @_;
|
---|
114 |
|
---|
115 | # Where's the database?
|
---|
116 | $database = &get_database_path($collection);
|
---|
117 |
|
---|
118 | # Are we in windows? Do we need .exe?
|
---|
119 | $exe = "";
|
---|
120 | $exe = ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
|
---|
121 |
|
---|
122 | # Retrieve the raw document content
|
---|
123 | print STDERR "#Get document\ncmd: gdbmget$exe \"$database\" \"$oid\"\n" if $debug;
|
---|
124 | return `gdbmget$exe "$database" "$oid"`;
|
---|
125 | }
|
---|
126 |
|
---|
127 | # /** This wraps John T's gdbmset executable to set the gdbm database entry for
|
---|
128 | # * a particular OID. This does not yet report errors.
|
---|
129 | # *
|
---|
130 | # * @param $collection is the collection name.
|
---|
131 | # * @param $oid is the internal document id.
|
---|
132 | # * @param $value is the new value to set for the oid.
|
---|
133 | # *
|
---|
134 | # * @author John Rowe, DL Consulting Ltd.
|
---|
135 | # */
|
---|
136 | sub gdbmset
|
---|
137 | {
|
---|
138 | my ($collection, $oid, $value) = @_;
|
---|
139 |
|
---|
140 | # Where's the database?
|
---|
141 | $database = &get_database_path($collection);
|
---|
142 |
|
---|
143 | # Are we in windows? Do we need .exe?
|
---|
144 | my $exe = &util::get_os_exe();
|
---|
145 |
|
---|
146 | # Retrieve the raw document content
|
---|
147 | print STDERR "#Get document\ncmd: gdbmset$exe \"$database\" \"$oid\" \"$value\"\n" if $debug;
|
---|
148 | `gdbmset$exe "$database" "$oid" "$value"`;
|
---|
149 | }
|
---|
150 |
|
---|
151 | # /** This uses get_document to retrieve the document object, it then outputs the
|
---|
152 | # * XML text of the document to STDOUT.
|
---|
153 | # *
|
---|
154 | # * @param $collection The collection the document exists in.
|
---|
155 | # *
|
---|
156 | # * @param $oid The unique identifier of the required document as a string.
|
---|
157 | # * @author John Rowe, DL Consulting Ltd.
|
---|
158 | # */
|
---|
159 | sub get_document_as_xml
|
---|
160 | {
|
---|
161 | my($collection, $oid) = @_;
|
---|
162 |
|
---|
163 | # Try to grab our document object
|
---|
164 | $doc_obj = get_document($collection, $oid);
|
---|
165 |
|
---|
166 | # If there is an object returned then output it before we leave
|
---|
167 | if(defined $doc_obj)
|
---|
168 | {
|
---|
169 | my $doc_text = &docprint::get_section_xml($doc_obj, $doc_obj->get_top_section());
|
---|
170 | print STDOUT $doc_text;
|
---|
171 | # Create a new document printer processor
|
---|
172 | #my $processor = new docprint();
|
---|
173 | # Finally process it into xml
|
---|
174 | #$processor->process($doc_obj);
|
---|
175 | }
|
---|
176 | }
|
---|
177 |
|
---|
178 | # /** This processes the information out of the gdbm database into a document
|
---|
179 | # * object.
|
---|
180 | # *
|
---|
181 | # * @version 1.0 Initial version by John Thompson
|
---|
182 | # * @version 2.0 Modified the gdbm fetch routines to use the perl abstractions
|
---|
183 | # * by John Rowe
|
---|
184 | # *
|
---|
185 | # * @author John Thompson, DL Consulting Ltd.
|
---|
186 | # * @author John Rowe, DL Consulting Ltd.
|
---|
187 | # */
|
---|
188 | sub process_document_section
|
---|
189 | {
|
---|
190 | my ($collection, $oid, $doc_obj, $section, $archivedir, $assocdir, $out) = @_;
|
---|
191 |
|
---|
192 | my $hastxt = 0;
|
---|
193 | my $contains = "";
|
---|
194 | my $docnum = 0;
|
---|
195 |
|
---|
196 | my $srclink = "";
|
---|
197 | # Grab the information out of the gdbm database
|
---|
198 | my $data = gdbmget($collection, $oid);
|
---|
199 | # Loop through the information and look at each line to add metadata to the document object
|
---|
200 | foreach my $line (split(/\n/, $data))
|
---|
201 | {
|
---|
202 | next unless $line =~ /^<([^>]+)>(.*)$/;
|
---|
203 | my $key = $1;
|
---|
204 | my $value = $2;
|
---|
205 | if ($key eq "hastxt" && $value eq "1") {
|
---|
206 | $hastxt = 1;
|
---|
207 | } elsif ($key eq "archivedir") {
|
---|
208 | $archivedir = $value;
|
---|
209 | } elsif ($key eq "contains") {
|
---|
210 | $contains = $value;
|
---|
211 | } elsif ($key eq "docnum") {
|
---|
212 | $docnum = $value;
|
---|
213 | } elsif ($key !~ /^(doctype|thistype|childtype)$/) {
|
---|
214 | if ($section ne "") {
|
---|
215 | # section level metadata
|
---|
216 | $doc_obj->add_utf8_metadata($section, $key, $value);
|
---|
217 |
|
---|
218 | } else {
|
---|
219 | if (!defined($metadata->{$oid}->{$key})) {
|
---|
220 | # top level plugin derived metadata (i.e. stuff not in
|
---|
221 | # new metadata.xml file - including Language, Encoding,
|
---|
222 | # srcext, srclink, srcicon, DocExt, ContentType)
|
---|
223 | $doc_obj->add_utf8_metadata($section, $key, $value);
|
---|
224 | if ($key eq "srclink") {
|
---|
225 | $srclink = $value;
|
---|
226 | }
|
---|
227 | }
|
---|
228 | }
|
---|
229 | }
|
---|
230 | }
|
---|
231 |
|
---|
232 | #my $adir = &util::filename_cat($assocdir, $archivedir);
|
---|
233 |
|
---|
234 | # associate source file if required
|
---|
235 | #if ($srclink ne "") {
|
---|
236 | # my ($srcfile) = $srclink =~ /([^\\\/]*?)[\">]*$/;
|
---|
237 | # &associate_file($adir, $srcfile, $srcfile, $doc_obj);
|
---|
238 | # }
|
---|
239 |
|
---|
240 | if ($section eq "") {
|
---|
241 |
|
---|
242 | # top level metadata comes from metadata.xml of update package
|
---|
243 | # (except for plugin derived metadata like "Language",
|
---|
244 | # "Encoding", "srcext" etc. which is set above)
|
---|
245 | foreach my $metaname (keys %{$metadata->{$oid}}) {
|
---|
246 | foreach my $value (@{$metadata->{$oid}->{$metaname}}) {
|
---|
247 | $doc_obj->add_utf8_metadata($section, $metaname, $value);
|
---|
248 | }
|
---|
249 | }
|
---|
250 | }
|
---|
251 |
|
---|
252 | # add text
|
---|
253 | # if ($hastxt && $docnum) {
|
---|
254 | # my $text = "";
|
---|
255 | # &get_text($docnum, \$text);
|
---|
256 | # $doc_obj->add_utf8_text($section, $text);
|
---|
257 | #
|
---|
258 | # # sort out any associated files
|
---|
259 | # $text =~ s/(_http(?:doc|coll)img(?:full)?_\/)([^\">\/]+)/$1 . $2 . &associate_file($adir, $2, $2, $doc_obj, $out)/eg;
|
---|
260 | # }
|
---|
261 |
|
---|
262 | # Don't process the subsections of classifiers
|
---|
263 | if ($contains =~ /\w/) {
|
---|
264 | if($oid =~ /^CL/) {
|
---|
265 | $doc_obj->add_utf8_metadata($section, "contains", $contains);
|
---|
266 | }
|
---|
267 | else {
|
---|
268 | # process subsections
|
---|
269 | foreach my $suboid (split(/;/, $contains)) {
|
---|
270 | $suboid =~ s/^\"/$oid/;
|
---|
271 | my $subsection = $doc_obj->insert_section($doc_obj->get_end_child($section));
|
---|
272 | &process_document_section($collection, $suboid, $doc_obj, $subsection, $archivedir, $assocdir, $out);
|
---|
273 | }
|
---|
274 | }
|
---|
275 | }
|
---|
276 | }
|
---|
277 |
|
---|
278 | sub associate_file {
|
---|
279 | my ($dir, $realname, $assocname, $doc_obj, $out) = @_;
|
---|
280 |
|
---|
281 | my $assocfile = &util::filename_cat($dir, $realname);
|
---|
282 | if (-e $assocfile) {
|
---|
283 | $doc_obj->associate_file($assocfile, $assocname);
|
---|
284 | } else {
|
---|
285 | print $out "WARNING: Associated file $assocfile could not be found\n";
|
---|
286 | }
|
---|
287 |
|
---|
288 | return "";
|
---|
289 | }
|
---|