source: trunk/gsdl/bin/script/build@ 2381

Last change on this file since 2381 was 2381, checked in by sjboddie, 23 years ago

build script now fails with an error message if the import stage doesn't
import any documents (e.g. if all the documents in the import directory
are in a format that no plugin can process).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# build --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2000 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# This perl script may be called directly or by running build.bat on
29# windows (build.bat is in bin\windows)
30
31# Note that this script has grown over time and now has many options for
32# use when called from within the collector. If it appears to
33# over-complicate things a little, that's why.
34
35package build;
36
37use FileHandle;
38use File::Copy;
39
40BEGIN {
41
42 die "GSDLHOME not set - did you remember to source setup.bash (unix) or " .
43 "run setup.bat (windows)?\n" unless defined $ENV{'GSDLHOME'};
44 die "GSDLOS not set - did you remember to source setup.bash (unix) or " .
45 "run setup.bat (windows)?\n" unless defined $ENV{'GSDLOS'};
46 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
47
48 STDOUT->autoflush(1);
49 STDERR->autoflush(1);
50}
51
52use lib qq($ENV{'GSDLHOME'}/perllib/cpan);
53use Mail::Sendmail;
54use parsargv;
55use util;
56use cfgread;
57
58# set up path - this allows for paths not to be supplied to system calls
59# and overcomes problems when GSDLHOME contains spaces (double quoting
60# the call doesn't work on win2k and probably other variants of winnt)
61my $path_separator = ":";
62$path_separator = ";" if $ENV{'GSDLOS'} =~ /^windows$/;
63$ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}) .
64 $path_separator . &util::filename_cat($ENV{'GSDLHOME'}, "bin", "script") .
65 $path_separator . $ENV{'PATH'};
66
67&parse_args (\@ARGV);
68
69my ($collection) = @ARGV;
70
71if (!defined $collection || $collection !~ /\w/) {
72 print STDERR "You must specify a collection to build\n";
73 &print_usage();
74 die "\n";
75}
76
77if ($optionfile =~ /\w/) {
78 open (OPTIONS, $optionfile) || die "Couldn't open $optionfile\n";
79 my $line = [];
80 my $options = [];
81 while (defined ($line = &cfgread::read_cfg_line ('build::OPTIONS'))) {
82 push (@$options, @$line);
83 }
84 close OPTIONS;
85 &parse_args ($options);
86}
87
88if ($maxdocs == -1) {
89 $maxdocs = "";
90} else {
91 $maxdocs = "-maxdocs $maxdocs";
92}
93
94my $cdir = $collectdir;
95$cdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect") unless $collectdir =~ /\w/;
96my $importdir = &util::filename_cat ($cdir, $collection, "import");
97my $archivedir = &util::filename_cat ($cdir, $collection, "archives");
98my $buildingdir = &util::filename_cat ($cdir, $collection, "building");
99my $indexdir = &util::filename_cat ($cdir, $collection, "index");
100my $bindir = &util::filename_cat ($ENV{'GSDLHOME'}, "bin");
101
102my $use_out = 0;
103my $outfile = $out;
104if ($out !~ /^(STDERR|STDOUT)$/i) {
105 open (OUT, ">$out") || die "Couldn't open output file $out\n";
106 $out = "OUT";
107
108 # delete any existing .final file
109 &util::rm ("$outfile.final") if -e "$outfile.final";
110
111 $use_out = 1;
112}
113$out->autoflush(1);
114
115# delete any .kill file left laying around from a previously aborted build
116if (-e &util::filename_cat ($cdir, $collection, ".kill")) {
117 &util::rm (&util::filename_cat ($cdir, $collection, ".kill"));
118}
119
120# get maintainer email address from main.cfg
121my $maintainer = "NULL";
122my $main_cfg = &util::filename_cat ($ENV{'GSDLHOME'}, "etc", "main.cfg");
123my $cfgdata = &cfgread::read_cfg_file ($main_cfg, "maintainer");
124if (defined $cfgdata->{'maintainer'} && $cfgdata->{'maintainer'} =~ /\w/) {
125 $maintainer = $cfgdata->{'maintainer'};
126}
127# if maintainer is "NULL" email_events should be disabled
128if ($maintainer =~ /^NULL$/i) {
129 $email_events = "";
130}
131
132&main();
133
134close OUT if $use_out;
135
136sub print_usage {
137 print STDOUT "\n";
138 print STDOUT "build: Builds a Greenstone collection (i.e. runs import.pl and buildcol.pl\n";
139 print STDOUT " then copies the resulting indexes to the correct place).\n\n";
140 print STDOUT " usage: $0 [options] collection-name\n\n";
141 print STDOUT " options:\n";
142 print STDOUT " -optionfile file Get options from file, useful on systems where\n";
143 print STDOUT " long command lines may cause problems\n";
144 print STDOUT " -append Add new files to existing collection\n";
145 print STDOUT " -remove_archives Remove archives directory after successfully\n";
146 print STDOUT " building the collection.\n";
147 print STDOUT " -remove_import Remove import directory after successfully\n";
148 print STDOUT " importing the collection.\n";
149 print STDOUT " -buildtype build|import If 'build' attempt to build directly\n";
150 print STDOUT " from archives directory (bypassing import\n";
151 print STDOUT " stage). Defaults to 'import'\n";
152 print STDOUT " -maxdocs number Maximum number of documents to build\n";
153 print STDOUT " -download directory Directory (or file) to get import documents from.\n";
154 print STDOUT " There may be multiple download directories and they\n";
155 print STDOUT " may be of type http://, ftp://, or file://\n";
156 print STDOUT " Note that any existing import directory will be\n";
157 print STDOUT " deleted to make way for the downloaded data if\n";
158 print STDOUT " a -download option is supplied\n";
159 print STDOUT " -collectdir directory Collection directory (defaults to " .
160 &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
161 print STDOUT " -dontinstall Only applicable if -collectdir is set to something\n";
162 print STDOUT " other than the default. -dontinstall will suppress\n";
163 print STDOUT " the default behaviour which is to install the\n";
164 print STDOUT " collection to the gsdl/collect directory once it has\n";
165 print STDOUT " been built.\n";
166 print STDOUT " -save_archives Create a copy of the existing archives directory\n";
167 print STDOUT " called archives.org\n";
168 print STDOUT " -out Filename or handle to print output status to.\n";
169 print STDOUT " The default is STDERR\n";
170 print STDOUT " -log_events Log important events (collection built successfully\n";
171 print STDOUT " etc.) to event_log_file\n";
172 print STDOUT " -event_log_file file File to append important events to (defaults to\n";
173 print STDOUT " " . &util::filename_cat ($ENV{'GSDLHOME'}, "etc", "events.txt") . "\n";
174 print STDOUT " -email_events addr Comma separated list of email addresses to mail\n";
175 print STDOUT " details of important collection building events\n";
176 print STDOUT " -mail_server server The outgoing (SMTP) mail server to be used by\n";
177 print STDOUT " email_events. email_events will be disabled if\n";
178 print STDOUT " mail_server isn't set\n";
179 print STDOUT " -event_header file File containing a header to go on any event\n";
180 print STDOUT " messages. If not specified build will create a\n";
181 print STDOUT " generic header\n\n";
182 print STDOUT " [Type \"perl -S buildcol.pl | more\" if this help text scrolled off your screen]";
183 print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
184}
185
186sub main {
187
188 if ($save_archives && -d $archivedir) {
189 print $out "caching original archives to ${archivedir}.org\n";
190 &util::cp_r ($archivedir, "${archivedir}.org");
191 }
192
193 # do the download thing if we have any -download options
194 if (scalar (@download)) {
195 # remove any existing import data
196 if (&has_content ($importdir)) {
197 print $out "build: WARNING: removing contents of $importdir\n";
198 &util::rm_r ($importdir);
199 }
200
201 foreach $download_dir (@download) {
202
203 # remove any leading or trailing whitespace from filenames (just in case)
204 $download_dir =~ s/^\s+//;
205 $download_dir =~ s/\s+$//;
206
207 if ($download_dir =~ /^(http|ftp):\/\//) {
208 # use wget to mirror http or ftp urls
209 # options used are:
210 # -P = the directory to download documents to
211 # -np = don't ascend to parent directories. this means that only documents
212 # that live in the same directory or below on the same server as
213 # the given url will be downloaded
214 # -nv = not too verbose
215 # -r = recursively mirror
216 # -N = use time-stamping to see if an up-to-date local copy of each
217 # file already exists. this may be useful if wget fails and
218 # is restarted
219 # -l inf = infinite recursion depth
220 # -R "*\?*" = don't download cgi based urls
221 # -o = the output file to write download status to (only used if the -out
222 # option was given to build)
223
224 my $download_cmd = "perl -S gsWget.pl -P \"$importdir\" -np -nv";
225 $download_cmd .= " -r -N -l inf -R \"*\\?*\"";
226 $download_cmd .= " -o \"$outfile.download\"" if $use_out;
227 $download_cmd .= " \"$download_dir\"";
228 system ($download_cmd);
229
230 # note that wget obeys the robot rules. this means that it will have
231 # downloaded a robots.txt file if one was present. since it's unlikely
232 # anyone really wants to include it in a collection we'll delete it.
233 # robots.txt shouldn't be more than two directories deep (I think it will
234 # always be exactly two deep but will look for it in the top directory too)
235 # so that's as far as we'll go looking for it.
236 if (opendir (DIR, $importdir)) {
237 my @files = readdir DIR;
238 closedir DIR;
239 foreach my $file (@files) {
240 next if $file =~ /^\.\.?$/;
241 if ($file =~ /^robots.txt$/i) {
242 &util::rm (&util::filename_cat ($importdir, $file));
243 last;
244 } else {
245 $file = &util::filename_cat ($importdir, $file);
246 if (-d $file) {
247 if (opendir (DIR, $file)) {
248 my @files2 = readdir DIR;
249 closedir DIR;
250 foreach my $file2 (@files2) {
251 if ($file2 =~ /^robots.txt$/i) {
252 &util::rm (&util::filename_cat ($file, $file2));
253 last;
254 }
255 }
256 }
257 }
258 }
259 }
260 }
261
262 # if using output directory append the file download output to it
263 &append_file ($out, "$outfile.download");
264
265 } else {
266 # we assume anything not beginning with http:// or ftp://
267 # is a file or directory on the local file system.
268 $download_dir =~ s/^file:(\/\/)?//;
269 $download_dir =~ s/^\s+//; # may be whitespace between "file://" and the rest
270
271 if (-e $download_dir) {
272 # copy download_dir and all it contains to the import directory
273 my $download_cmd = "perl -S filecopy.pl";
274 $download_cmd .= " -collectdir \"$collectdir\"" if $collectdir =~ /\w/;
275 $download_cmd .= " -out \"$outfile.download\"" if $use_out;
276 $download_cmd .= " \"" . $download_dir . "\" " . $collection;
277 system ($download_cmd);
278 # if using output directory append the file download output to it
279 &append_file ($out, "$outfile.download");
280 } else {
281 print $out "WARNING: '$download_dir' does not exist\n";
282 }
283 }
284 }
285 }
286
287 if (-e &util::filename_cat ($archivedir, "archives.inf")) {
288 if (&has_content ($importdir)) {
289 if ($buildtype eq "build") {
290 &gsdl_build();
291 } else {
292 &gsdl_import();
293 &gsdl_build();
294 }
295 } else {
296 # there are archives but no import, build directly from archives
297 print $out "build: no import material was found, building directly\n";
298 print $out " from archives\n";
299 &gsdl_build();
300 }
301 } else {
302 if (&has_content ($importdir)) {
303 if ($buildtype eq "build") {
304 print $out "build: can't build directly from archives as no\n";
305 print $out " imported archives exist (did you forget to\n";
306 print $out " move the contents of $collection/import to\n";
307 print $out " collection/archives?)\n";
308 }
309 &gsdl_import();
310 if (&has_content ($archivedir, "^archives.inf\$")) {
311 &gsdl_build();
312 } else {
313 my $msg = "build: ERROR: The collection could not be built as no\n";
314 $msg .= " valid data was imported. Are at least some of\n";
315 $msg .= " the files you imported in a format that can be\n";
316 $msg .= " processed by the specified Greenstone plugins?\n";
317 print $out $msg;
318 &log_event ($msg);
319 &final_out (6) if $use_out;
320 die "\n";
321 }
322 } else {
323 # no import or archives
324 my $msg = "build: ERROR: The collection could not be built as it contains no data.\n";
325 print $out $msg;
326 &log_event ($msg);
327 &final_out (1) if $use_out;
328 die "\n";
329 }
330 }
331
332 if ($collectdir ne "" && !$dontinstall) {
333 my $install_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");
334 if (!&util::filenames_equal ($collectdir, $install_collectdir)) {
335
336 # install collection to gsdl/collect
337 print $out "installing the $collection collection\n";
338 my $newdir = &util::filename_cat ($install_collectdir, $collection);
339 my $olddir = &util::filename_cat ($collectdir, $collection);
340 if (-d $newdir) {
341 my $msg = "build: Could not install collection as $newdir\n" .
342 " already exists. Collection will remain at\n$olddir\n";
343
344 print $out $msg;
345 &log_event ($msg);
346 &final_out (4) if $use_out;
347 die "\n";
348 }
349 if (!&File::Copy::move ($olddir, $newdir)) {
350 my $msg = "build: Failed to install collection to $newdir\n" .
351 " Collection will remain at $olddir\n";
352 print $out $msg;
353 &log_event ($msg);
354 &final_out (5) if $use_out;
355 die "\n";
356 }
357 }
358 }
359
360 &log_event ("The $collection collection was built successfully\n");
361 &final_out (0) if $use_out;
362}
363
364sub gsdl_import {
365
366 print $out "importing the $collection collection\n\n";
367
368 my $import_cmd = "perl -S import.pl";
369 $import_cmd .= " -out \"$outfile.import\"" if $use_out;
370 $import_cmd .= " -removeold" unless $append;
371 $import_cmd .= " -collectdir \"$collectdir\"" if $collectdir =~ /\w/;
372 $import_cmd .= " $maxdocs $collection";
373 system ($import_cmd);
374 # if using output directory append the import output to it
375 &append_file ($out, "$outfile.import");
376
377 if (-e &util::filename_cat ($archivedir, "archives.inf")) {
378 print $out "$collection collection imported successfully\n\n";
379 if ($remove_import) {
380 print $out "removing import directory ($importdir)\n";
381 &util::rm_r ($importdir);
382 }
383 } else {
384 my $msg = "build: ERROR: import.pl failed\n";
385 print $out "\n$msg";
386 &log_event ($msg);
387 &final_out (2) if $use_out;
388 die "\n";
389 }
390}
391
392sub gsdl_build {
393
394 print $out "building the $collection collection\n\n";
395
396 my $build_cmd = "perl -S buildcol.pl";
397 $build_cmd .= " -out \"$outfile.build\"" if $use_out;
398 $build_cmd .= " -collectdir \"$collectdir\"" if $collectdir =~ /\w/;
399 $build_cmd .= " $maxdocs $collection";
400 system ($build_cmd);
401 # if using output directory append the buildcol output to it
402 &append_file ($out, "$outfile.build");
403
404 if (-e &util::filename_cat ($buildingdir, "text", "$collection.ldb") ||
405 -e &util::filename_cat ($buildingdir, "text", "$collection.bdb")) {
406 print $out "$collection collection built successfully\n\n";
407 if ($remove_archives) {
408 print $out "removing archives directory ($archivedir)\n";
409 &util::rm_r ($archivedir);
410 }
411 } else {
412 my $msg = "build: ERROR: buildcol.pl failed\n";
413 print $out "\n$msg";
414 &log_event ($msg);
415 &final_out (3) if $use_out;
416 die "\n";
417 }
418
419 # replace old indexes with new ones
420 if (&has_content ($indexdir)) {
421 print $out "removing old indexes\n";
422 &util::rm_r ($indexdir);
423 }
424 rmdir ($indexdir) if -d $indexdir;
425 &File::Copy::move ($buildingdir, $indexdir);
426
427 # remove the cached arhives
428 if ($save_archives && -d "${archivedir}.org") {
429 &util::rm_r ("${archivedir}.org");
430 }
431}
432
433# return 1 if $dir directory contains any files or sub-directories (other
434# than those specified in the $ignore regular expression)
435sub has_content {
436 my ($dir, $ignore) = @_;
437
438 if (!-d $dir) {return 0;}
439
440 opendir (DIR, $dir) || return 0;
441 my @files = readdir DIR;
442 close DIR;
443
444 foreach my $file (@files) {
445 if ($file !~ /^\.{1,2}$/) {
446 return 1 unless (defined $ignore && $file =~ /$ignore/);
447 }
448 }
449 return 0;
450}
451
452sub append_file {
453 my ($handle, $file) = @_;
454
455 open (FILE, $file) || return;
456 undef $/;
457 print $handle <FILE>;
458 $/ = "\n";
459 close FILE;
460 &util::rm ($file);
461}
462
463# creates a file called $outfile.final (should only be called if -out option
464# is used and isn't STDERR or STDOUT) and writes an output code to it.
465# An output code of 0 specifies that there was no error
466sub final_out {
467 my ($exit_code) = @_;
468
469 if (open (FINAL, ">$outfile.final")) {
470 print FINAL $exit_code;
471 close FINAL;
472 }
473}
474
475sub log_event {
476 my ($msg) = @_;
477
478 return unless ($log_events || $email_events);
479
480 # get the event header
481 my $eheader = "[Build Event]\n";
482 $eheader .= "Date: " . scalar localtime() . "\n";
483 if ($event_header ne "" && open (HEADER, $event_header)) {
484 undef $/;
485 $eheader .= <HEADER>;
486 $/ = "\n";
487 close HEADER;
488 } else {
489 $eheader .= "Collection: $collection\n";
490 $eheader .= "GSDLHOME: $ENV{'GSDLHOME'}\n";
491 $eheader .= "Build Location: $collectdir\n";
492 }
493
494 if ($log_events) {
495 my $fail = 0;
496 # append the event to the event log file
497 if ($event_log_file eq "" || !open (LOG, ">>$event_log_file")) {
498 # log file defaults to $GSDLHOME/etc/events.txt
499 $event_log_file = &util::filename_cat ($ENV{'GSDLHOME'}, "etc", "events.txt");
500 if (!open (LOG, ">>$event_log_file")) {
501 print $out "build: ERROR: Couldn't open event log file $event_log_file\n";
502 $fail = 1;
503 }
504 }
505 if (!$fail) {
506 print LOG $eheader;
507 print LOG $msg;
508 print LOG "\n";
509 close LOG;
510 }
511 }
512
513 if ($email_events) {
514 # if mail_server isn't set email_events does nothing
515 if ($mail_server eq "") {
516 print $out "build: WARNING: mail_server was not set - email_events option was ignored\n";
517 return;
518 }
519
520 my %mail = ('SMTP' => $mail_server,
521 'To' => $email_events,
522 'From' => $maintainer,
523 'Subject' => 'Greenstone Build Event'
524 );
525 $mail{'Message'} = $eheader . $msg;
526
527 if (!sendmail %mail) {
528 print $out "build: ERROR sending mail to $email_events\n";
529 print $out "'$Mail::Sendmail::error'\n";
530 }
531 }
532}
533
534
535sub parse_args {
536 my ($argref) = @_;
537
538 if (!parsargv::parse($argref,
539 'optionfile/.*/', \$optionfile,
540 'append', \$append,
541 'remove_archives', \$remove_archives,
542 'remove_import', \$remove_import,
543 'buildtype/^(build|import)$/import', \$buildtype,
544 'maxdocs/^\-?\d+/-1', \$maxdocs,
545 'download/.+', \@download,
546 'collectdir/.*/', \$collectdir,
547 'dontinstall', \$dontinstall,
548 'save_archives', \$save_archives,
549 'out/.*/STDERR', \$out,
550 'log_events', \$log_events,
551 'event_log_file/.*/', \$event_log_file,
552 'email_events/.*/', \$email_events,
553 'mail_server/.*/', \$mail_server,
554 'event_header/.*/', \$event_header)) {
555
556 &print_usage();
557 die "\n";
558 }
559}
Note: See TracBrowser for help on using the repository browser.