source: main/tags/2.40/gsdl/bin/script/buildcol.pl@ 31150

Last change on this file since 31150 was 4776, checked in by mdewsnip, 21 years ago

Now uses the PrintUsage module to automatically generate usage text from the $options and $arguments structures.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl -- This program will build a particular collection
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# 11/04/03 Added usage datastructure - John Thompson
29
30package buildcol;
31
BEGIN {
    # Refuse to start unless the Greenstone environment has been set up.
    foreach my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Put the Greenstone library directories at the front of the module
    # search path.  A single unshift with the directories listed in reverse
    # gives the same final order as three successive unshifts:
    # classify, plugins, perllib.
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift (@INC, "$perllib/classify", "$perllib/plugins", $perllib);
}
39
40use colcfg;
41use parsargv;
42use util;
43use FileHandle;
44use printusage;
45
# Values accepted by the -mode option; each one selects the build stage(s)
# that main() runs.
my $mode_list =
    [ { 'name' => "all",
        'desc' => "Do everything." },
      { 'name' => "compress_text",
        'desc' => "Just compress the text." },
      { 'name' => "build_index",
        'desc' => "Just index the text." },
      { 'name' => "infodb",
        'desc' => "Just build the metadata database." } ];

# Description of every documented command line option.  The PrintUsage
# module generates the text and XML usage output from this structure:
#   name - option name (given on the command line as -name)
#   desc - human readable description
#   type - "string", "int", "flag" or "enum"
#   list - allowed values, for enum options
#   deft - default value shown in the usage output
#   reqd - whether the option is required
my $arguments =
    [ { 'name' => "archivedir",
        'desc' => "Where the archives live.",
        'type' => "string",
        'reqd' => "no" },
      { 'name' => "verbosity",
        'desc' => "0=none, 3=lots",
        'type' => "int",
        'deft' => "2",
        'reqd' => "no" },
      { 'name' => "builddir",
        'desc' => "Where to put the built indexes.",
        'type' => "string",
        'reqd' => "no" },
      { 'name' => "cachedir",
        'desc' => "Cache directory. The archives are mirrored into, and the indexes built under, cachedir/collect/collection-name before the finished build is copied back to builddir.",
        'type' => "string",
        'reqd' => "no" },
      { 'name' => "maxdocs",
        'desc' => "Maximum number of documents to build.",
        'type' => "int",
        'reqd' => "no" },
      { 'name' => "debug",
        'desc' => "Print output to STDOUT.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "mode",
        'desc' => "The parts of the building process to carry out.",
        'type' => "enum",
        'list' => $mode_list,
        'deft' => "all",
        'reqd' => "no" },
      { 'name' => "index",
        'desc' => "Index to build (will build all in config file if not set).",
        'type' => "string",
        'reqd' => "no" },
      { 'name' => "keepold",
        'desc' => "Will not destroy the current contents of the building directory.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "no_text",
        'desc' => "Don't store compressed text. This option is useful for minimizing the size of the built indexes if you intend always to display the original documents at run time (i.e. you won't be able to retrieve the compressed text version).",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "allclassifications",
        'desc' => "Don't remove empty classifications.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "create_images",
        'desc' => "Attempt to create default images for new collection. This relies on the Gimp being installed along with relevant perl modules to allow scripting from perl.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "collectdir",
        'desc' => "Collection directory.",
        'type' => "string",
        'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        'reqd' => "no" },
      { 'name' => "out",
        'desc' => "Filename or handle to print output status to.",
        'type' => "string",
        'deft' => "STDERR",
        'reqd' => "no" },
      { 'name' => "no_strip_html",
        'desc' => "Do not strip the html tags from the indexed text (only used for mgpp collections).",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "faillog",
        'desc' => "Fail log filename. This log receives the filenames of any files which fail to be processed.",
        'type' => "string",
        'deft' => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
        'reqd' => "no" } ];

# Top level description of this program, as consumed by the usage printers.
my $options = { 'name' => "buildcol.pl",
                'desc' => "PERL script used to build a greenstone collection from GML documents.",
                'args' => $arguments };
131
132
# print_xml_usage -- write an XML description of this program (name,
# description and arguments) to STDERR, built from the $options structure.
sub print_xml_usage
{
    my $program_args = $options->{'args'};

    &PrintUsage::print_xml_header();

    # opening part of the <Info> element
    print STDERR "<Info>\n"
               . " <Name>$options->{'name'}</Name>\n"
               . " <Desc>$options->{'desc'}</Desc>\n"
               . " <Arguments>\n";

    # one entry per option, when there are any
    &PrintUsage::print_options_xml($program_args) if defined($program_args);

    print STDERR " </Arguments>\n"
               . "</Info>\n";
}
147
148
# print_txt_usage -- write the plain text usage message to STDERR, built
# from the $options structure.
#
# The working variables are lexicals ("my"); they were previously created
# with "local", which silently turned them into package globals.
sub print_txt_usage
{
    my $programname = $options->{'name'};
    my $programargs = $options->{'args'};

    # Find the length of the longest option string
    my $descoffset = 0;
    if (defined($programargs)) {
        $descoffset = &PrintUsage::find_longest_option_string($programargs);
    }

    # Produce the usage information using the data structure above
    print STDERR " usage: $programname [options] collection-name\n\n";

    # Nothing more to show when the program has no options
    return unless defined($programargs);

    # Column offset of the option descriptions:
    # 2 spaces between options & descriptions
    my $optiondescoffset = $descoffset + 2;

    print STDERR " options:\n";

    # Display the program options
    &PrintUsage::print_options_txt($programargs, $optiondescoffset);
}
174
175
176# sub print_usage {
177# print STDOUT "\n";
178# print STDOUT "buildcol.pl: Builds the indexes of a Greenstone collection.\n\n";
179# print STDOUT " usage: $0 [options] collection-name\n\n";
180# print STDOUT " options:\n";
181# print STDOUT " -verbosity number 0=none, 3=lots\n";
182# print STDOUT " -archivedir directory Where the archives live\n";
183# print STDOUT " -builddir directory Where to put the built indexes\n";
184# print STDOUT " -maxdocs number Maximum number of documents to build\n";
185# print STDOUT " -debug Print output to STDOUT\n";
186# print STDOUT " -mode all|compress_text|build_index|infodb\n";
187# print STDOUT " -index indexname Index to build (will build all in\n";
188# print STDOUT " config file if not set)\n";
189# print STDOUT " -keepold will not destroy the current contents of the\n";
190# print STDOUT " building directory\n";
191# print STDOUT " -no_text Don't store compressed text. This option is\n";
192# print STDOUT " useful for minimizing the size of the built\n";
193# print STDOUT " indexes if you intend always to display the\n";
194# print STDOUT " original documents at run time (i.e. you won't\n";
195# print STDOUT " be able to retrieve the compressed text version)\n";
196# print STDOUT " -allclassifications Don't remove empty classifications\n";
197# print STDOUT " -create_images Attempt to create default images for new\n";
198# print STDOUT " collection. This relies on the Gimp being\n";
199# print STDOUT " installed along with relevant perl modules\n";
200# print STDOUT " to allow scripting from perl\n";
201# print STDOUT " -collectdir directory Collection directory (defaults to " .
202# &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
203# print STDOUT " -out Filename or handle to print output status to.\n";
204# print STDOUT " The default is STDERR\n";
205# print STDOUT " -no_strip_html Do not strip the html tags from the indexed text\n";
206# print STDOUT " (only used for mgpp collections).\n\n";
207# print STDOUT " -faillog name Fail log filename. This log receives the filenames\n";
208# print STDOUT " of any files which fail to be processed (defaults.\n";
209# print STDOUT " to " .
210# &util::filename_cat("<collectdir>", "colname", "etc", "fail.log") . ")\n";
211# print STDOUT " [Type \"perl -S buildcol.pl | more\" if this help text scrolled off your screen]";
212# print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
213# }
214
215
&main();


# main -- build a Greenstone collection.
#
# Steps, in order:
#   1. parse the command line (usage is printed, and the program dies, on a
#      bad command line or when -xml is given);
#   2. open the status output handle and the fail log;
#   3. read etc/collect.cfg and use it to fill in settings that were not
#      given on the command line;
#   4. optionally create the default collection images;
#   5. work out the archives/building directories (using the cache
#      directory when one is set);
#   6. load the builder class (collection specific, mgpp or mg) and run the
#      build stages selected by -mode;
#   7. copy a cached build back to the building directory.
#
# Takes no parameters; reads @ARGV and %ENV.  Dies on any fatal problem.
#
# Several variables below are deliberately package variables rather than
# lexicals: $collection and $configfilename are read by create_images().
sub main
{
    my ($verbosity, $archivedir, $cachedir, $builddir, $maxdocs,
        $debug, $mode, $indexname, $keepold, $allclassifications,
        $create_images, $collectdir, $out, $buildtype, $textindex,
        $no_strip_html, $no_text, $faillog);

    # ***** 11-04-03 - John Thompson *****
    my $xml = 0;
    # ************************************

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'archivedir/.*/', \$archivedir,
                         'cachedir/.*/', \$cachedir,
                         'builddir/.*/', \$builddir,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'debug', \$debug,
                         'mode/^(all|compress_text|build_index|infodb)$/', \$mode,
                         'index/.*/', \$indexname,
                         'no_text', \$no_text,
                         'keepold', \$keepold,
                         'allclassifications', \$allclassifications,
                         'create_images', \$create_images,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'no_strip_html', \$no_strip_html,
                         'faillog/.*/', \$faillog,
                         q^xml^, \$xml)) {
        &print_txt_usage();
        die "\n";
    }

    if ($xml) {
        &print_xml_usage();
        die "\n";
    }

    $textindex = "";
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # The handle is looked up by name below and only the upper-case
        # names exist, so "-out stderr" has to be normalised first.
        $out = uc ($out);
    } else {
        # three-argument open: the file name can never be taken as a mode
        open (OUT, '>', $out) || die "Couldn't open output file $out\n";
        $out = "buildcol::OUT";
        $close_out = 1;
    }
    $out->autoflush(1);

    # create_images() prints its warnings to the package variable $out,
    # which the lexical $out declared above hides; publish the handle there.
    $buildcol::out = $out;

    # get and check the collection
    # NOTE(review): @ARGV is flattened into the argument list, so with no
    # collection name on the command line $collectdir is passed as the first
    # argument -- confirm against util::use_collection.
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_txt_usage();
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    open (FAILLOG, '>>', $faillog) || die "Couldn't open fail log $faillog\n";
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    # read the configuration file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);

        if ($verbosity !~ /\d+/) {
            if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                $verbosity = $collectcfg->{'verbosity'};
            } else {
                $verbosity = 2; # the default
            }
        }
        # we use searchtype for determining buildtype, but for old versions, use buildtype
        if (defined $collectcfg->{'searchtype'}) {
            $buildtype = "mgpp";
        }
        elsif (defined $collectcfg->{'buildtype'}) {
            $buildtype = $collectcfg->{'buildtype'};
        } else {
            $buildtype = "mg"; #mg is the default
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
            $cachedir = $collectcfg->{'cachedir'};
        }
        if (defined $collectcfg->{'builddir'} && $builddir eq "") {
            $builddir = $collectcfg->{'builddir'};
        }
        if ($maxdocs !~ /\-?\d+/) {
            if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                $maxdocs = $collectcfg->{'maxdocs'};
            } else {
                $maxdocs = -1; # the default
            }
        }
        if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
            $debug = 1;
        }
        if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
            if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
                $mode = $collectcfg->{'mode'};
            } else {
                $mode = "all"; # the default
            }
        }
        if (defined $collectcfg->{'index'} && $indexname eq "") {
            $indexname = $collectcfg->{'index'};
        }
        if (defined $collectcfg->{'no_text'} && $no_text == 0) {
            if ($collectcfg->{'no_text'} =~ /^true$/i) {
                $no_text = 1;
            }
        }
        if (defined $collectcfg->{'allclassifications'} && $allclassifications == 0) {
            if ($collectcfg->{'allclassifications'} =~ /^true$/i) {
                $allclassifications = 1;
            }
        }
        if (defined $collectcfg->{'keepold'} && $collectcfg->{'keepold'} =~ /^true$/i) {
            $keepold = 1;
        }
        if (defined $collectcfg->{'create_images'} && $collectcfg->{'create_images'} =~ /^true$/i) {
            $create_images = 1;
        }
        if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
            $textindex = $collectcfg->{'textcompress'};
        }

    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    #set the text index
    if ($buildtype eq "mgpp") {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # create default images if required
    if ($create_images) {
        my $collection_name = $collection;
        $collection_name = $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'}
            if defined $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'};
        &create_images ($collection_name);
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building") if $builddir eq "";
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        print $out "Updating archive cache\n" if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        # \Q..\E: the collection name is literal text, not a pattern
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/\Q$collection\E/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);


    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # Call the constructor through the class name held in $buildertype.
    # This replaces a string eval that pasted the class name into code;
    # any error raised by the constructor still propagates as a die.
    $builder = $buildertype->new ($collection,
                                  $realarchivedir, $realbuilddir, $verbosity,
                                  $maxdocs, $debug, $keepold, $allclassifications,
                                  $out, $no_text, $faillog);

    $builder->init();

    if ($buildertype eq "mgppbuilder" && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    } elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    } else {
        die "unknown mode: $mode\n";
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # the directories only differ when the build was done in the cache
    if (($realbuilddir ne $builddir) && !$debug) {
        print $out "Copying back the cached build\n" if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;
}
463
# create_images -- generate the default collection icons and make the
# collection's configuration file refer to them.
#
# Parameters:
#   $collection_name - the text drawn onto the icons
#
# Package variables read:
#   $collection     - collection name, used for the icon file names
#   $configfilename - path of the configuration file to update
#   $out            - handle for warnings; STDERR is used when it is unset
#
# Returns nothing.  Every problem is reported as a warning, never as a
# fatal error, because the images are optional.
sub create_images {
    my ($collection_name) = @_;

    # Printing to an undefined handle is a fatal error, so never do that.
    my $status = defined $out ? $out : \*STDERR;

    my $image_script = &util::filename_cat ($ENV{'GSDLHOME'}, "bin", "script", "gimp", "title_icon-1.2.pl");
    if (!-e $image_script) {
        print $status "WARNING: Image making script ($image_script) could not be found\n";
        print $status " Default images will not be generated\n\n";
        return;
    }

    my $imagedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "images");

    &util::mk_all_dir ($imagedir);

    # create the images: the large icon first, then the small one.  The
    # list form of system bypasses the shell, so quotes or other special
    # characters in the collection name cannot break or alter the command.
    my @icon_jobs = (
        [ '-size', '1.5', '-image_dir', $imagedir,
          '-filename', "$collection.gif", '-text', $collection_name ],
        [ '-image_dir', $imagedir,
          '-filename', "${collection}sm.gif", '-text', $collection_name ],
    );
    foreach my $job_args (@icon_jobs) {
        if (system ($image_script, @$job_args) != 0) {
            print $status "WARNING: Image making script ($image_script) failed (status $?)\n";
        }
    }

    # update the collect.cfg configuration file (this will need
    # to be changed when the config file format changes)
    my $icon_line   = "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n";
    my $iconsm_line = "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n";

    my $cfg_in;
    if (!open ($cfg_in, '<', $configfilename)) {
        print $status "WARNING: Couldn't open config file ($configfilename)\n";
        print $status " for updating so collection images may not be linked correctly\n";
        return;
    }

    # copy the file into memory, replacing any existing icon entries
    my $file = "";
    my $found = 0;
    my $foundsm = 0;
    while (defined (my $line = <$cfg_in>)) {
        if ($line =~ /collectionmeta\s+iconcollection\s+/) {
            $line = $icon_line;
            $found = 1;
        } elsif ($line =~ /collectionmeta\s+iconcollectionsmall\s+/) {
            $line = $iconsm_line;
            $foundsm = 1;
        }
        $file .= $line;
    }
    close $cfg_in;

    # append whichever entries the file did not already contain
    $file .= $icon_line if !$found;
    $file .= $iconsm_line if !$foundsm;

    my $cfg_out;
    if (!open ($cfg_out, '>', $configfilename)) {
        print $status "WARNING: Couldn't open config file ($configfilename)\n";
        print $status " for updating so collection images may not be linked correctly\n";
        return;
    }
    print $cfg_out $file;
    # buffered write errors only show up when the handle is closed
    if (!close ($cfg_out)) {
        print $status "WARNING: Couldn't finish writing config file ($configfilename)\n";
    }
    return;
}
515
Note: See TracBrowser for help on using the repository browser.