source: trunk/gsdl/bin/script/buildcol.pl@ 2785

Last change on this file since 2785 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl -- This program will build a particular collection
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package buildcol;
29
30BEGIN {
31 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
32 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
34 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/classify");
36}
37
38use colcfg;
39use parsargv;
40use util;
41use FileHandle;
42
43&main();
44
# Print the command-line help for buildcol.pl to STDOUT.
#
# Takes no arguments and returns nothing useful.  Every option accepted by
# the parsargv::parse() call in main() is listed here, including -cachedir.
# The final newline is suppressed when $ENV{'GSDLOS'} is "windows"
# (case-insensitive), as in the original code.
sub print_usage {
    print STDOUT "\n";
    print STDOUT "buildcol.pl: Builds the indexes of a Greenstone collection.\n\n";
    print STDOUT " usage: $0 [options] collection-name\n\n";
    print STDOUT " options:\n";
    print STDOUT " -verbosity number 0=none, 3=lots\n";
    print STDOUT " -archivedir directory Where the archives live\n";
    # -cachedir is parsed by main() but was previously undocumented
    print STDOUT " -cachedir directory Cache the archives and build under this\n";
    print STDOUT " directory, then copy the finished build back\n";
    print STDOUT " to the building directory\n";
    print STDOUT " -builddir directory Where to put the built indexes\n";
    print STDOUT " -maxdocs number Maximum number of documents to build\n";
    print STDOUT " -debug Print output to STDOUT\n";
    print STDOUT " -mode all|compress_text|build_index|infodb\n";
    print STDOUT " -index indexname Index to build (will build all in\n";
    print STDOUT " config file if not set)\n";
    print STDOUT " -keepold will not destroy the current contents of the\n";
    print STDOUT " building directory\n";
    print STDOUT " -no_text Don't store compressed text. This option is\n";
    print STDOUT " useful for minimizing the size of the built\n";
    print STDOUT " indexes if you intend always to display the\n";
    print STDOUT " original documents at run time (i.e. you won't\n";
    print STDOUT " be able to retrieve the compressed text version)\n";
    print STDOUT " -allclassifications Don't remove empty classifications\n";
    print STDOUT " -create_images Attempt to create default images for new\n";
    print STDOUT " collection. This relies on the Gimp being\n";
    print STDOUT " installed along with relevant perl modules\n";
    print STDOUT " to allow scripting from perl\n";
    print STDOUT " -collectdir directory Collection directory (defaults to " .
        &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
    print STDOUT " -out Filename or handle to print output status to.\n";
    print STDOUT " The default is STDERR\n";
    print STDOUT " -buildtype mg|mgpp This will override the config file setting.\n";
    print STDOUT " (default is mg)\n";
    print STDOUT " -no_strip_html Do not strip the html tags from the indexed text\n";
    print STDOUT " (only used for mgpp collections).\n";
    print STDOUT " -faillog name Fail log filename. This log receives the filenames\n";
    print STDOUT " of any files which fail to be processed (defaults\n";
    print STDOUT " to " .
        &util::filename_cat("<collectdir>", "colname", "etc", "fail.log") . ")\n\n";
    print STDOUT " [Type \"perl -S buildcol.pl | more\" if this help text scrolled off your screen]";
    # no trailing newline on windows (kept from the original behaviour)
    print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
}
85
86
# Entry point of buildcol.pl.
#
# Parses the command line, fills in any option that was not given from the
# collection's etc/collect.cfg (falling back to built-in defaults), loads
# the builder class and runs the build phases selected by -mode.
#
# Dies (after printing the usage text where appropriate) on bad arguments,
# an unknown collection, a missing collect.cfg, or an output/fail log file
# that cannot be opened.
#
# Several variables assigned here are deliberately package globals rather
# than lexicals ($collection, $configfilename, $collectcfg, $builder, ...):
# create_images() reads $collection and $configfilename.
sub main
{
    my ($verbosity, $archivedir, $cachedir, $builddir, $maxdocs,
        $debug, $mode, $indexname, $keepold, $allclassifications,
        $create_images, $collectdir, $out, $buildtype, $textindex,
        $no_strip_html, $no_text, $faillog);

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'archivedir/.*/', \$archivedir,
                         'cachedir/.*/', \$cachedir,
                         'builddir/.*/', \$builddir,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'debug', \$debug,
                         'mode/^(all|compress_text|build_index|infodb)$/', \$mode,
                         'index/.*/', \$indexname,
                         'no_text', \$no_text,
                         'keepold', \$keepold,
                         'allclassifications', \$allclassifications,
                         'create_images', \$create_images,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'no_strip_html', \$no_strip_html,
                         'buildtype/^(mg|mgpp)$/', \$buildtype,
                         'faillog/.*/', \$faillog)) {
        &print_usage();
        die "\n";
    }

    $textindex = "";
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open: the file name comes from the command line and
        # must never be interpreted as part of the open mode
        open (OUT, '>', $out) || die "Couldn't open output file $out\n";
        # from here on $out holds the *name* of the handle; it is handed to
        # the builder in that form
        $out = "buildcol::OUT";
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    open (FAILLOG, '>>', $faillog) || die "Couldn't open fail log $faillog\n";
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    # read the configuration file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);

        # For each option: a value given on the command line wins, then the
        # value from collect.cfg, then the built-in default.
        if ($verbosity !~ /\d+/) {
            if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                $verbosity = $collectcfg->{'verbosity'};
            } else {
                $verbosity = 2; # the default
            }
        }
        if (defined $collectcfg->{'buildtype'} && $buildtype eq "") {
            $buildtype = $collectcfg->{'buildtype'};
        }
        if ($buildtype eq "") {
            $buildtype = "mg"; # mg is the default
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
            $cachedir = $collectcfg->{'cachedir'};
        }
        if (defined $collectcfg->{'builddir'} && $builddir eq "") {
            $builddir = $collectcfg->{'builddir'};
        }
        if ($maxdocs !~ /\-?\d+/) {
            if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                $maxdocs = $collectcfg->{'maxdocs'};
            } else {
                $maxdocs = -1; # the default
            }
        }
        if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
            $debug = 1;
        }
        if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
            if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
                $mode = $collectcfg->{'mode'};
            } else {
                $mode = "all"; # the default
            }
        }
        if (defined $collectcfg->{'index'} && $indexname eq "") {
            $indexname = $collectcfg->{'index'};
        }
        if (defined $collectcfg->{'no_text'} && $no_text == 0) {
            if ($collectcfg->{'no_text'} =~ /^true$/i) {
                $no_text = 1;
            }
        }
        if (defined $collectcfg->{'allclassifications'} && $allclassifications == 0) {
            if ($collectcfg->{'allclassifications'} =~ /^true$/i) {
                $allclassifications = 1;
            }
        }
        if (defined $collectcfg->{'keepold'} && $collectcfg->{'keepold'} =~ /^true$/i) {
            $keepold = 1;
        }
        if (defined $collectcfg->{'create_images'} && $collectcfg->{'create_images'} =~ /^true$/i) {
            $create_images = 1;
        }
        if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
            $textindex = $collectcfg->{'textcompress'};
        }

    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    #set the text index
    if ($buildtype eq "mgpp") {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # create default images if required
    if ($create_images) {
        my $collection_name = $collection;
        $collection_name = $collectcfg->{'collectionmeta'}->{'collectionname'}
            if defined $collectcfg->{'collectionmeta'}->{'collectionname'};

        # $out is lexical to this sub, so it is passed explicitly to let
        # create_images report its warnings on the selected output handle
        &create_images ($collection_name, $out);
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building") if $builddir eq "";
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        print $out "Updating archive cache\n" if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        # \Q..\E: the collection name is matched literally, not as a pattern
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/\Q$collection\E/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);


    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # Direct class-method call on the class name held in $buildertype.  This
    # replaces a string eval of "new $buildertype(...)" followed by a re-die
    # of $@; an exception from the constructor still propagates to the caller.
    $builder = $buildertype->new($collection,
                                 $realarchivedir, $realbuilddir, $verbosity,
                                 $maxdocs, $debug, $keepold, $allclassifications,
                                 $out, $no_text, $faillog);

    $builder->init();

    if ($buildertype eq "mgppbuilder" && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    } elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    } else {
        die "unknown mode: $mode\n";
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # when building in a cache directory, replace the real building
    # directory with the freshly built copy
    if (($realbuilddir ne $builddir) && !$debug) {
        print $out "Copying back the cached build\n" if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;
}
320
# Generate the default collection icons and point collect.cfg at them.
#
# Arguments:
#   $collection_name - text rendered onto the icons
#   $outhandle       - optional handle (or handle name) that warnings are
#                      printed to; when omitted, the package variable $out is
#                      used if it is set, otherwise STDERR
#
# Reads the package globals $collection and $configfilename, which main()
# assigns before calling this sub.
#
# Runs bin/script/gimp/title_icon.pl twice (large and small icon), then
# rewrites collect.cfg so that the "collectionmeta iconcollection" and
# "collectionmeta iconcollectionsmall" lines reference the new images,
# appending either line if it was not already present.  All failures are
# reported as warnings; the sub never dies.
sub create_images {
    my ($collection_name, $outhandle) = @_;

    # main() keeps its output handle in a lexical $out, which is not visible
    # from here, so the original "print $out ..." statements printed to an
    # undefined handle.  Fall back to something that is always printable.
    if (!defined $outhandle || $outhandle eq "") {
        $outhandle = (defined $out && $out ne "") ? $out : 'STDERR';
    }

    my $image_script = &util::filename_cat ($ENV{'GSDLHOME'}, "bin", "script", "gimp", "title_icon.pl");
    if (!-e $image_script) {
        print {$outhandle} "WARNING: Image making script ($image_script) could not be found\n";
        print {$outhandle} " Default images will not be generated\n\n";
        return;
    }

    my $imagedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "images");

    &util::mk_all_dir ($imagedir);

    # create the images.  The list form of system() passes each argument
    # verbatim, so quotes or shell metacharacters in the collection name or
    # in a path can neither break the command nor be run by a shell.
    my @icon_jobs = (
        ['-size', '1.5', '-image_dir', $imagedir,
         '-filename', "$collection.gif", '-text', $collection_name],
        ['-image_dir', $imagedir,
         '-filename', "${collection}sm.gif", '-text', $collection_name],
    );
    foreach my $job_args (@icon_jobs) {
        if (system ($image_script, @$job_args) != 0) {
            print {$outhandle} "WARNING: Image making script ($image_script) failed (status $?)\n";
        }
    }

    # update the collect.cfg configuration file (this will need
    # to be changed when the config file format changes)
    my $cfg_in;
    if (!open ($cfg_in, '<', $configfilename)) {
        print {$outhandle} "WARNING: Couldn't open config file ($configfilename)\n";
        print {$outhandle} " for updating so collection images may not be linked correctly\n";
        return;
    }

    my $line = ""; my $file = "";
    my $found = 0; my $foundsm = 0;
    while (defined ($line = <$cfg_in>)) {
        if ($line =~ /collectionmeta\s+iconcollection\s+/) {
            $line = "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n";
            $found = 1;
        } elsif ($line =~ /collectionmeta\s+iconcollectionsmall\s+/) {
            $line = "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n";
            $foundsm = 1;
        }
        $file .= $line;
    }
    close $cfg_in;

    # add whichever icon line the existing file did not contain
    $file .= "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n" if !$found;
    $file .= "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n" if !$foundsm;

    my $cfg_out;
    if (!open ($cfg_out, '>', $configfilename)) {
        print {$outhandle} "WARNING: Couldn't open config file ($configfilename)\n";
        print {$outhandle} " for updating so collection images may not be linked correctly\n";
        return;
    }
    # buffered write errors only surface at close, so check both steps
    my $written = print {$cfg_out} $file;
    my $closed = close $cfg_out;
    if (!$written || !$closed) {
        print {$outhandle} "WARNING: Couldn't write config file ($configfilename): $!\n";
    }
}
372
Note: See TracBrowser for help on using the repository browser.