source: trunk/gsdl/bin/script/buildcol.pl@ 2355

Last change on this file since 2355 was 2355, checked in by sjboddie, 23 years ago

All options to import.pl and buildcol.pl may now be specified from
within a collect.cfg file

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl -- This program will build a particular collection
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package buildcol;
29
# Compile-time setup: refuse to run without the Greenstone environment,
# then put the Greenstone library directories at the front of @INC so the
# "use" statements below can find colcfg, parsargv and util.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # a single unshift keeps the list order, so @INC ends up starting with
    # classify, plugins, perllib -- the same order the three separate
    # unshift calls used to produce
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift (@INC, "$perllib/classify", "$perllib/plugins", $perllib);
}
37
38use colcfg;
39use parsargv;
40use util;
41use FileHandle;
42
43&main();
44
# print_usage -- writes the command line help for buildcol.pl to STDERR.
# Takes no arguments.
sub print_usage {
    # the -collectdir help line reports the default collection directory
    my $default_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");

    my @usage_text = (
        "\n",
        "buildcol.pl: Builds the indexes of a Greenstone collection.\n\n",
        " usage: $0 [options] collection-name\n\n",
        " options:\n",
        " -verbosity number 0=none, 3=lots\n",
        " -archivedir directory Where the archives live\n",
        " -builddir directory Where to put the built indexes\n",
        " -maxdocs number Maximum number of documents to build\n",
        " -debug Print output to STDOUT\n",
        " -mode all|compress_text|build_index|infodb\n",
        " -index indexname Index to build (will build all in\n",
        " config file if not set)\n",
        " -keepold will not destroy the current contents of the\n",
        " building directory\n",
        " -no_text Don't store compressed text. This option is\n",
        " useful for minimizing the size of the built\n",
        " indexes if you intend always to display the\n",
        " original documents at run time (i.e. you won't\n",
        " be able to retrieve the compressed text version)\n",
        " -allclassifications Don't remove empty classifications\n",
        " -create_images Attempt to create default images for new\n",
        " collection. This relies on the Gimp being\n",
        " installed along with relevant perl modules\n",
        " to allow scripting from perl\n",
        " -collectdir directory Collection directory (defaults to " .
            $default_collectdir . ")\n",
        " -out Filename or handle to print output status to.\n",
        " The default is STDERR\n",
        " -buildtype mg|mgpp This will override the config file setting.\n",
        " (default is mg)\n",
        " -no_strip_html Do not strip the html tags from the indexed text\n",
        " (only used for mgpp collections).\n\n",
    );

    print STDERR join ("", @usage_text);
}
79
80
# main -- entry point of buildcol.pl.
#
# Parses the command line, fills in any option that was not given from the
# collection's etc/collect.cfg (falling back to built-in defaults), loads
# the builder class for the collection and runs the build phases selected
# by -mode.
#
# The following are deliberately left as package variables (no "my"):
# $collection and $configfilename are read by create_images(); the others
# ($collectcfg, $realarchivedir, $realbuilddir, $builderdir, $buildertype,
# $builder) are kept global as well so nothing outside this sub that may
# look at them changes behaviour.
#
# Dies (after printing the usage text where appropriate) on bad options,
# an unknown collection, a missing collect.cfg or an unknown mode.
sub main
{
    my ($verbosity, $archivedir, $cachedir, $builddir, $maxdocs,
        $debug, $mode, $indexname, $keepold, $allclassifications,
        $create_images, $collectdir, $out, $buildtype, $textindex,
        $no_strip_html, $no_text);

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'archivedir/.*/', \$archivedir,
                         'cachedir/.*/', \$cachedir,
                         'builddir/.*/', \$builddir,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'debug', \$debug,
                         'mode/^(all|compress_text|build_index|infodb)$/', \$mode,
                         'index/.*/', \$indexname,
                         'no_text', \$no_text,
                         'keepold', \$keepold,
                         'allclassifications', \$allclassifications,
                         'create_images', \$create_images,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'no_strip_html', \$no_strip_html,
                         'buildtype/^(mg|mgpp)$/', \$buildtype)) {
        &print_usage();
        die "\n";
    }

    $textindex = "";
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open: the file name is never interpreted as a mode
        open (OUT, '>', $out) || die "Couldn't open output file $out\n";
        $out = "buildcol::OUT";
        $close_out = 1;
    } else {
        # the test above is case-insensitive but filehandle names are not,
        # so "-out stderr" must be normalised before it is used as a handle
        $out = uc $out;
    }
    $out->autoflush(1);

    # get and check the collection
    # (pass the collection name explicitly: flattening @ARGV into the
    # argument list would put $collectdir in the collection slot whenever
    # no collection name was given on the command line)
    my $collection_arg = defined $ARGV[0] ? $ARGV[0] : "";
    if (($collection = &util::use_collection($collection_arg, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # read the configuration file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);

        # for every option below the command line wins; collect.cfg is
        # only consulted when the option was not given
        if ($verbosity !~ /\d+/) {
            if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                $verbosity = $collectcfg->{'verbosity'};
            } else {
                $verbosity = 2; # the default
            }
        }
        if (defined $collectcfg->{'buildtype'} && $buildtype eq "") {
            $buildtype = $collectcfg->{'buildtype'};
        }
        if ($buildtype eq "") {
            $buildtype = "mg"; # mg is the default
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
            $cachedir = $collectcfg->{'cachedir'};
        }
        if (defined $collectcfg->{'builddir'} && $builddir eq "") {
            $builddir = $collectcfg->{'builddir'};
        }
        if ($maxdocs !~ /\-?\d+/) {
            if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                $maxdocs = $collectcfg->{'maxdocs'};
            } else {
                $maxdocs = -1; # the default
            }
        }
        if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
            $debug = 1;
        }
        if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
            if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
                $mode = $collectcfg->{'mode'};
            } else {
                $mode = "all"; # the default
            }
        }
        if (defined $collectcfg->{'index'} && $indexname eq "") {
            $indexname = $collectcfg->{'index'};
        }
        # truthiness test rather than "== 0": an unset flag may be an
        # empty string, which is not numeric and would warn under -w
        if (defined $collectcfg->{'no_text'} && !$no_text) {
            if ($collectcfg->{'no_text'} =~ /^true$/i) {
                $no_text = 1;
            }
        }
        if (defined $collectcfg->{'allclassifications'} && !$allclassifications) {
            if ($collectcfg->{'allclassifications'} =~ /^true$/i) {
                $allclassifications = 1;
            }
        }
        if (defined $collectcfg->{'keepold'} && $collectcfg->{'keepold'} =~ /^true$/i) {
            $keepold = 1;
        }
        if (defined $collectcfg->{'create_images'} && $collectcfg->{'create_images'} =~ /^true$/i) {
            $create_images = 1;
        }
        if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
            $textindex = $collectcfg->{'textcompress'};
        }

    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    #mgpp doesn't work yet on windows
    if ($buildtype eq "mgpp" && $ENV{'GSDLOS'} =~ /^windows$/) {
        die "mgpp doesn't work on windows\n";
    }

    #set the text index
    if ($buildtype eq "mgpp") {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # create default images if required
    if ($create_images) {
        my $collection_name = $collection;
        $collection_name = $collectcfg->{'collectionmeta'}->{'collectionname'}
            if defined $collectcfg->{'collectionmeta'}->{'collectionname'};

        # $out is lexical to this sub, so it is handed over explicitly
        # for the warnings create_images may need to print
        &create_images ($collection_name, $out);
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building") if $builddir eq "";
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        print $out "Updating archive cache\n" if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        # \Q..\E: the collection name is matched literally, not as a pattern
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/\Q$collection\E/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);


    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # the class name is only known at run time; a method call on the
    # string holding it does the job without a string eval, and any
    # error raised by the constructor still propagates
    $builder = $buildertype->new($collection,
                                 $realarchivedir, $realbuilddir, $verbosity,
                                 $maxdocs, $debug, $keepold, $allclassifications,
                                 $out, $no_text);

    $builder->init();

    if ($buildertype eq "mgppbuilder" && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    } elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    } else {
        die "unknown mode: $mode\n";
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # when building in a cache directory, copy the result back into place
    if (($realbuilddir ne $builddir) && !$debug) {
        print $out "Copying back the cached build\n" if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
}
307
# create_images -- runs the gimp title_icon.pl script to generate the two
# default collection icons, then rewrites collect.cfg so that its
# iconcollection and iconcollectionsmall entries point at them.
#
# Arguments:
#   $collection_name  text rendered on the icons
#   $out              (optional) filehandle, or name of a filehandle, that
#                     warnings are printed to; defaults to STDERR
#
# Reads the package variables $collection and $configfilename, which main()
# assigns before calling this routine.
#
# Returns nothing. Every failure is reported as a warning on $out; none of
# them is fatal.
sub create_images {
    my ($collection_name, $out) = @_;

    # main()'s $out is lexical to main and therefore not visible here;
    # without a fallback the warning prints below would die on an
    # undefined filehandle
    $out = "STDERR" unless defined $out && $out ne "";

    my $image_script = &util::filename_cat ($ENV{'GSDLHOME'}, "bin", "script", "gimp", "title_icon.pl");
    if (!-e $image_script) {
        print $out "WARNING: Image making script ($image_script) could not be found\n";
        print $out " Default images will not be generated\n\n";
        return;
    }

    my $imagedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "images");

    &util::mk_all_dir ($imagedir);

    # create the images
    # (list-form system: arguments reach the script verbatim, so quotes or
    # shell metacharacters in the collection name or title cannot break or
    # alter the command)
    my @image_jobs = (
        ['-size', '1.5', '-image_dir', $imagedir,
         '-filename', "$collection.gif", '-text', $collection_name],
        ['-image_dir', $imagedir,
         '-filename', "${collection}sm.gif", '-text', $collection_name],
    );
    foreach my $job_args (@image_jobs) {
        if (system ($image_script, @$job_args) != 0) {
            print $out "WARNING: Image making script ($image_script) failed (status $?)\n";
        }
    }

    # update the collect.cfg configuration file (this will need
    # to be changed when the config file format changes)
    my $cfg_in;
    if (!open ($cfg_in, '<', $configfilename)) {
        print $out "WARNING: Couldn't open config file ($configfilename)\n";
        print $out " for updating so collection images may not be linked correctly\n";
        return;
    }

    my $icon_line = "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n";
    my $iconsm_line = "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n";

    # copy the file, replacing any existing icon entries as we go
    my $file = "";
    my $found = 0; my $foundsm = 0;
    while (defined (my $line = <$cfg_in>)) {
        if ($line =~ /collectionmeta\s+iconcollection\s+/) {
            $line = $icon_line;
            $found = 1;
        } elsif ($line =~ /collectionmeta\s+iconcollectionsmall\s+/) {
            $line = $iconsm_line;
            $foundsm = 1;
        }
        $file .= $line;
    }
    close ($cfg_in);

    # append whichever entries the file did not already contain
    $file .= $icon_line if !$found;
    $file .= $iconsm_line if !$foundsm;

    my $cfg_out;
    if (!open ($cfg_out, '>', $configfilename)) {
        print $out "WARNING: Couldn't open config file ($configfilename)\n";
        print $out " for updating so collection images may not be linked correctly\n";
        return;
    }
    print {$cfg_out} $file;

    # buffered write errors only surface when the handle is closed
    if (!close ($cfg_out)) {
        print $out "WARNING: Couldn't finish writing config file ($configfilename): $!\n";
    }
    return;
}
359
Note: See TracBrowser for help on using the repository browser.