source: tags/gsdl-2_37-distribution/gsdl/bin/script/build@ 2843

Last change on this file since 2843 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 21.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# build --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2000 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# This perl script may be called directly or by running build.bat on
29# windows (build.bat is in bin\windows)
30
31# Note that this script has grown over time and now has many options for
32# use when called from within the collector. If it appears to
33# over-complicate things a little, that's why.
34
35package build;
36
37use FileHandle;
38use File::Copy;
39
40BEGIN {
41
42 die "GSDLHOME not set - did you remember to source setup.bash (unix) or " .
43 "run setup.bat (windows)?\n" unless defined $ENV{'GSDLHOME'};
44 die "GSDLOS not set - did you remember to source setup.bash (unix) or " .
45 "run setup.bat (windows)?\n" unless defined $ENV{'GSDLOS'};
46 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
47
48 STDOUT->autoflush(1);
49 STDERR->autoflush(1);
50}
51
52use lib qq($ENV{'GSDLHOME'}/perllib/cpan);
53use Mail::Sendmail;
54use parsargv;
55use util;
56use cfgread;
57
58# set up path - this allows for paths not to be supplied to system calls
59# and overcomes problems when GSDLHOME contains spaces (double quoting
60# the call doesn't work on win2k and probably other variants of winnt)
61my $path_separator = ":";
62$path_separator = ";" if $ENV{'GSDLOS'} =~ /^windows$/;
63$ENV{'PATH'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}) .
64 $path_separator . &util::filename_cat($ENV{'GSDLHOME'}, "bin", "script") .
65 $path_separator . $ENV{'PATH'};
66
67&parse_args (\@ARGV);
68
69my ($collection) = @ARGV;
70
71if (!defined $collection || $collection !~ /\w/) {
72 print STDERR "You must specify a collection to build\n";
73 &print_usage();
74 die "\n";
75}
76
77if ($optionfile =~ /\w/) {
78 open (OPTIONS, $optionfile) || die "Couldn't open $optionfile\n";
79 my $line = [];
80 my $options = [];
81 while (defined ($line = &cfgread::read_cfg_line ('build::OPTIONS'))) {
82 push (@$options, @$line);
83 }
84 close OPTIONS;
85 &parse_args ($options);
86}
87
88if ($maxdocs == -1) {
89 $maxdocs = "";
90} else {
91 $maxdocs = "-maxdocs $maxdocs";
92}
93
94my $cdir = $collectdir;
95$cdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect") unless $collectdir =~ /\w/;
96my $importdir = &util::filename_cat ($cdir, $collection, "import");
97my $archivedir = &util::filename_cat ($cdir, $collection, "archives");
98my $buildingdir = &util::filename_cat ($cdir, $collection, "building");
99my $indexdir = &util::filename_cat ($cdir, $collection, "index");
100my $bindir = &util::filename_cat ($ENV{'GSDLHOME'}, "bin");
101
102my $use_out = 0;
103my $outfile = $out;
104if ($out !~ /^(STDERR|STDOUT)$/i) {
105 open (OUT, ">$out") || die "Couldn't open output file $out\n";
106 $out = "OUT";
107
108 # delete any existing .final file
109 &util::rm ("$outfile.final") if -e "$outfile.final";
110
111 $use_out = 1;
112}
113$out->autoflush(1);
114
115# delete any .kill file left laying around from a previously aborted build
116if (-e &util::filename_cat ($cdir, $collection, ".kill")) {
117 &util::rm (&util::filename_cat ($cdir, $collection, ".kill"));
118}
119
120# get maintainer email address from main.cfg
121my $maintainer = "NULL";
122my $main_cfg = &util::filename_cat ($ENV{'GSDLHOME'}, "etc", "main.cfg");
123my $cfgdata = &cfgread::read_cfg_file ($main_cfg, "maintainer");
124if (defined $cfgdata->{'maintainer'} && $cfgdata->{'maintainer'} =~ /\w/) {
125 $maintainer = $cfgdata->{'maintainer'};
126}
127# if maintainer is "NULL" email_events should be disabled
128if ($maintainer =~ /^NULL$/i) {
129 $email_events = "";
130}
131
132&main();
133
134if ($use_out) {
135 close OUT;
136
137 # if we've created a build log we'll copy it to the collection's etc
138 # directory
139 my ($etcdir);
140 if ($dontinstall) {
141 $etcdir = &util::filename_cat($collectdir, "etc", "build.log");
142 } else {
143 $etcdir = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection, "etc", "build.log");
144 }
145
146 &util::cp($outfile, $etcdir);
147}
148
149sub print_usage {
150 print STDOUT "\n";
151 print STDOUT "build: Builds a Greenstone collection (i.e. runs import.pl and buildcol.pl\n";
152 print STDOUT " then copies the resulting indexes to the correct place).\n\n";
153 print STDOUT " usage: $0 [options] collection-name\n\n";
154 print STDOUT " options:\n";
155 print STDOUT " -optionfile file Get options from file, useful on systems where\n";
156 print STDOUT " long command lines may cause problems\n";
157 print STDOUT " -append Add new files to existing collection\n";
158 print STDOUT " -remove_archives Remove archives directory after successfully\n";
159 print STDOUT " building the collection.\n";
160 print STDOUT " -remove_import Remove import directory after successfully\n";
161 print STDOUT " importing the collection.\n";
162 print STDOUT " -buildtype build|import If 'build' attempt to build directly\n";
163 print STDOUT " from archives directory (bypassing import\n";
164 print STDOUT " stage). Defaults to 'import'\n";
165 print STDOUT " -maxdocs number Maximum number of documents to build\n";
166 print STDOUT " -download directory Directory (or file) to get import documents from.\n";
167 print STDOUT " There may be multiple download directories and they\n";
168 print STDOUT " may be of type http://, ftp://, or file://\n";
169 print STDOUT " Note that any existing import directory will be\n";
170 print STDOUT " deleted to make way for the downloaded data if\n";
171 print STDOUT " a -download option is supplied\n";
172 print STDOUT " -collectdir directory Collection directory (defaults to " .
173 &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
174 print STDOUT " -dontinstall Only applicable if -collectdir is set to something\n";
175 print STDOUT " other than the default. -dontinstall will suppress\n";
176 print STDOUT " the default behaviour which is to install the\n";
177 print STDOUT " collection to the gsdl/collect directory once it has\n";
178 print STDOUT " been built.\n";
179 print STDOUT " -save_archives Create a copy of the existing archives directory\n";
180 print STDOUT " called archives.org\n";
181 print STDOUT " -out Filename or handle to print output status to.\n";
182 print STDOUT " The default is STDERR\n";
183 print STDOUT " -statsfile name Filename or handle to print import statistics to.\n";
184 print STDOUT " The default is STDERR\n";
185 print STDOUT " -make_writable If set build will make the collection and any\n";
186 print STDOUT " temporary files it created globally writable after\n";
187 print STDOUT " it finishes\n";
188 print STDOUT " -log_events Log important events (collection built successfully\n";
189 print STDOUT " etc.) to event_log_file\n";
190 print STDOUT " -event_log_file file File to append important events to (defaults to\n";
191 print STDOUT " " . &util::filename_cat ($ENV{'GSDLHOME'}, "etc", "events.txt") . "\n";
192 print STDOUT " -email_events addr Comma separated list of email addresses to mail\n";
193 print STDOUT " details of important collection building events\n";
194 print STDOUT " -mail_server server The outgoing (SMTP) mail server to be used by\n";
195 print STDOUT " email_events. email_events will be disabled if\n";
196 print STDOUT " mail_server isn't set\n";
197 print STDOUT " -event_header file File containing a header to go on any event\n";
198 print STDOUT " messages. If not specified build will create a\n";
199 print STDOUT " generic header\n\n";
200 print STDOUT " [Type \"build | more\" if this help text scrolled off your screen]";
201 print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
202}
203
204sub main {
205
206 if ($save_archives && -d $archivedir) {
207 print $out "caching original archives to ${archivedir}.org\n";
208 &util::cp_r ($archivedir, "${archivedir}.org");
209 }
210
211 # do the download thing if we have any -download options
212 if (scalar (@download)) {
213 # remove any existing import data
214 if (&has_content ($importdir)) {
215 print $out "build: WARNING: removing contents of $importdir\n";
216 &util::rm_r ($importdir);
217 }
218
219 foreach $download_dir (@download) {
220
221 # remove any leading or trailing whitespace from filenames (just in case)
222 $download_dir =~ s/^\s+//;
223 $download_dir =~ s/\s+$//;
224
225 if ($download_dir =~ /^(http|ftp):\/\//) {
226 # use wget to mirror http or ftp urls
227 # options used are:
228 # -P = the directory to download documents to
229 # -np = don't ascend to parent directories. this means that only documents
230 # that live in the same directory or below on the same server as
231 # the given url will be downloaded
232 # -nv = not too verbose
233 # -r = recursively mirror
234 # -N = use time-stamping to see if an up-to-date local copy of each
235 # file already exists. this may be useful if wget fails and
236 # is restarted
237 # -l inf = infinite recursion depth
238 # -R "*\?*" = don't download cgi based urls
239 # -o = the output file to write download status to (only used if the -out
240 # option was given to build)
241
242 my $download_cmd = "perl -S gsWget.pl -P \"$importdir\" -np -nv";
243 $download_cmd .= " -r -N -l inf -R \"*\\?*\"";
244 $download_cmd .= " -o \"$outfile.download\"" if $use_out;
245 $download_cmd .= " \"$download_dir\"";
246 system ($download_cmd);
247
248 # note that wget obeys the robot rules. this means that it will have
249 # downloaded a robots.txt file if one was present. since it's unlikely
250 # anyone really wants to include it in a collection we'll delete it.
251 # robots.txt shouldn't be more than two directories deep (I think it will
252 # always be exactly two deep but will look for it in the top directory too)
253 # so that's as far as we'll go looking for it.
254 if (opendir (DIR, $importdir)) {
255 my @files = readdir DIR;
256 closedir DIR;
257 foreach my $file (@files) {
258 next if $file =~ /^\.\.?$/;
259 if ($file =~ /^robots.txt$/i) {
260 &util::rm (&util::filename_cat ($importdir, $file));
261 last;
262 } else {
263 $file = &util::filename_cat ($importdir, $file);
264 if (-d $file) {
265 if (opendir (DIR, $file)) {
266 my @files2 = readdir DIR;
267 closedir DIR;
268 foreach my $file2 (@files2) {
269 if ($file2 =~ /^robots.txt$/i) {
270 &util::rm (&util::filename_cat ($file, $file2));
271 last;
272 }
273 }
274 }
275 }
276 }
277 }
278 }
279
280 # if using output directory append the file download output to it
281 &append_file ($out, "$outfile.download");
282
283 } else {
284 # we assume anything not beginning with http:// or ftp://
285 # is a file or directory on the local file system.
286 $download_dir =~ s/^file:(\/\/)?//;
287 $download_dir =~ s/^\s+//; # may be whitespace between "file://" and the rest
288
289 if (-e $download_dir) {
290 # copy download_dir and all it contains to the import directory
291 my $download_cmd = "perl -S filecopy.pl";
292 $download_cmd .= " -collectdir \"$collectdir\"" if $collectdir =~ /\w/;
293 $download_cmd .= " -out \"$outfile.download\"" if $use_out;
294 $download_cmd .= " \"" . $download_dir . "\" " . $collection;
295 system ($download_cmd);
296 # if using output directory append the file download output to it
297 &append_file ($out, "$outfile.download");
298 } else {
299 print $out "WARNING: '$download_dir' does not exist\n";
300 }
301 }
302 }
303 }
304
305 if (-e &util::filename_cat ($archivedir, "archives.inf")) {
306 if (&has_content ($importdir)) {
307 if ($buildtype eq "build") {
308 &gsdl_build();
309 } else {
310 &gsdl_import();
311 &gsdl_build();
312 }
313 } else {
314 # there are archives but no import, build directly from archives
315 print $out "build: no import material was found, building directly\n";
316 print $out " from archives\n";
317 &gsdl_build();
318 }
319 } else {
320 if (&has_content ($importdir)) {
321 if ($buildtype eq "build") {
322 print $out "build: can't build directly from archives as no\n";
323 print $out " imported archives exist (did you forget to\n";
324 print $out " move the contents of $collection/import to\n";
325 print $out " collection/archives?)\n";
326 }
327 &gsdl_import();
328 if (&has_content ($archivedir, "^archives.inf\$")) {
329 &gsdl_build();
330 } else {
331 my $msg = "build: ERROR: The collection could not be built as no\n";
332 $msg .= " valid data was imported. Are at least some of\n";
333 $msg .= " the files you imported in a format that can be\n";
334 $msg .= " processed by the specified Greenstone plugins?\n";
335 print $out $msg;
336 &log_event ($msg);
337 &final_out (6) if $use_out;
338 die "\n";
339 }
340 } else {
341 # no import or archives
342 my $msg = "build: ERROR: The collection could not be built as it contains no data.\n";
343 print $out $msg;
344 &log_event ($msg);
345 &final_out (1) if $use_out;
346 die "\n";
347 }
348 }
349
350 if ($collectdir ne "" && !$dontinstall) {
351 my $install_collectdir = &util::filename_cat ($ENV{'GSDLHOME'}, "collect");
352 if (!&util::filenames_equal ($collectdir, $install_collectdir)) {
353
354 # install collection to gsdl/collect
355 print $out "installing the $collection collection\n";
356 my $newdir = &util::filename_cat ($install_collectdir, $collection);
357 my $olddir = &util::filename_cat ($collectdir, $collection);
358 if (-d $newdir) {
359 my $msg = "build: Could not install collection as $newdir\n" .
360 " already exists. Collection will remain at\n$olddir\n";
361
362 print $out $msg;
363 &log_event ($msg);
364 &final_out (4) if $use_out;
365 die "\n";
366 }
367 if (!&File::Copy::move ($olddir, $newdir)) {
368 my $msg = "build: Failed to install collection to $newdir\n" .
369 " Collection will remain at $olddir\n";
370 print $out $msg;
371 &log_event ($msg);
372 &final_out (5) if $use_out;
373 die "\n";
374 }
375 }
376 }
377
378 &log_event ("The $collection collection was built successfully\n");
379 &final_out (0) if $use_out;
380}
381
382sub gsdl_import {
383
384 print $out "importing the $collection collection\n\n";
385
386 my $import_cmd = "perl -S import.pl";
387 $import_cmd .= " -out \"$outfile.import\"" if $use_out;
388 $import_cmd .= " -removeold" unless $append;
389 $import_cmd .= " -collectdir \"$collectdir\"" if $collectdir =~ /\w/;
390 $import_cmd .= " -statsfile \"$statsfile\"" if $statsfile =~ /\w/;
391 $import_cmd .= " $maxdocs $collection";
392
393 print STDERR "\n**import_cmd: $import_cmd\n";
394
395 system ($import_cmd);
396 # if using output directory append the import output to it
397 &append_file ($out, "$outfile.import");
398
399 if (-e &util::filename_cat ($archivedir, "archives.inf")) {
400 print $out "$collection collection imported successfully\n\n";
401 if ($remove_import) {
402 print $out "removing import directory ($importdir)\n";
403 &util::rm_r ($importdir);
404 }
405 } else {
406 my $msg = "build: ERROR: import.pl failed\n";
407 print $out "\n$msg";
408 &log_event ($msg);
409 &final_out (2) if $use_out;
410 die "\n";
411 }
412}
413
414sub gsdl_build {
415
416 print $out "building the $collection collection\n\n";
417
418 my $build_cmd = "perl -S buildcol.pl";
419 $build_cmd .= " -out \"$outfile.build\"" if $use_out;
420 $build_cmd .= " -collectdir \"$collectdir\"" if $collectdir =~ /\w/;
421 $build_cmd .= " $maxdocs $collection";
422 system ($build_cmd);
423 # if using output directory append the buildcol output to it
424 &append_file ($out, "$outfile.build");
425
426 if (-e &util::filename_cat ($buildingdir, "text", "$collection.ldb") ||
427 -e &util::filename_cat ($buildingdir, "text", "$collection.bdb")) {
428 print $out "$collection collection built successfully\n\n";
429 if ($remove_archives) {
430 print $out "removing archives directory ($archivedir)\n";
431 &util::rm_r ($archivedir);
432 }
433 } else {
434 my $msg = "build: ERROR: buildcol.pl failed\n";
435 print $out "\n$msg";
436 &log_event ($msg);
437 &final_out (3) if $use_out;
438 die "\n";
439 }
440
441 # replace old indexes with new ones
442 if (&has_content ($indexdir)) {
443 print $out "removing old indexes\n";
444 &util::rm_r ($indexdir);
445 }
446 rmdir ($indexdir) if -d $indexdir;
447 &File::Copy::move ($buildingdir, $indexdir);
448
449 # remove the cached arhives
450 if ($save_archives && -d "${archivedir}.org") {
451 &util::rm_r ("${archivedir}.org");
452 }
453}
454
455# return 1 if $dir directory contains any files or sub-directories (other
456# than those specified in the $ignore regular expression)
457sub has_content {
458 my ($dir, $ignore) = @_;
459
460 if (!-d $dir) {return 0;}
461
462 opendir (DIR, $dir) || return 0;
463 my @files = readdir DIR;
464 closedir DIR;
465
466 foreach my $file (@files) {
467 if ($file !~ /^\.{1,2}$/) {
468 return 1 unless (defined $ignore && $file =~ /$ignore/);
469 }
470 }
471 return 0;
472}
473
474sub append_file {
475 my ($handle, $file) = @_;
476
477 open (FILE, $file) || return;
478 undef $/;
479 print $handle <FILE>;
480 $/ = "\n";
481 close FILE;
482 &util::rm ($file);
483}
484
485# creates a file called $outfile.final and writes an output code to it.
486# An output code of 0 specifies that there was no error
487sub final_out {
488 my ($exit_code) = @_;
489
490 if ($use_out && (!-e "$outfile.final")) {
491
492 if (open (FINAL, ">$outfile.final")) {
493 print FINAL $exit_code;
494 close FINAL;
495 }
496 }
497}
498
499sub log_event {
500 my ($msg) = @_;
501
502 return unless ($log_events || $email_events);
503
504 # get the event header
505 my $eheader = "[Build Event]\n";
506 $eheader .= "Date: " . scalar localtime() . "\n";
507 if ($event_header ne "" && open (HEADER, $event_header)) {
508 undef $/;
509 $eheader .= <HEADER>;
510 $/ = "\n";
511 close HEADER;
512 } else {
513 $eheader .= "Collection: $collection\n";
514 $eheader .= "GSDLHOME: $ENV{'GSDLHOME'}\n";
515 $eheader .= "Build Location: $collectdir\n";
516 }
517
518 if ($log_events) {
519 my $fail = 0;
520 # append the event to the event log file
521 if ($event_log_file eq "" || !open (LOG, ">>$event_log_file")) {
522 # log file defaults to $GSDLHOME/etc/events.txt
523 $event_log_file = &util::filename_cat ($ENV{'GSDLHOME'}, "etc", "events.txt");
524 if (!open (LOG, ">>$event_log_file")) {
525 print $out "build: ERROR: Couldn't open event log file $event_log_file\n";
526 $fail = 1;
527 }
528 }
529 if (!$fail) {
530 print LOG $eheader;
531 print LOG $msg;
532 print LOG "\n";
533 close LOG;
534 }
535 }
536
537 if ($email_events) {
538 # if mail_server isn't set email_events does nothing
539 if ($mail_server eq "") {
540 print $out "build: WARNING: mail_server was not set - email_events option was ignored\n";
541 return;
542 }
543
544 my %mail = ('SMTP' => $mail_server,
545 'To' => $email_events,
546 'From' => $maintainer,
547 'Subject' => 'Greenstone Build Event'
548 );
549 $mail{'Message'} = $eheader . $msg;
550
551 if (!sendmail %mail) {
552 print $out "build: ERROR sending mail to $email_events\n";
553 print $out "'$Mail::Sendmail::error'\n";
554 }
555 }
556}
557
558
559sub parse_args {
560 my ($argref) = @_;
561
562 if (!parsargv::parse($argref,
563 'optionfile/.*/', \$optionfile,
564 'append', \$append,
565 'remove_archives', \$remove_archives,
566 'remove_import', \$remove_import,
567 'buildtype/^(build|import)$/import', \$buildtype,
568 'maxdocs/^\-?\d+/-1', \$maxdocs,
569 'download/.+', \@download,
570 'collectdir/.*/', \$collectdir,
571 'dontinstall', \$dontinstall,
572 'save_archives', \$save_archives,
573 'out/.*/STDERR', \$out,
574 'make_writable', \$make_writable,
575 'log_events', \$log_events,
576 'event_log_file/.*/', \$event_log_file,
577 'email_events/.*/', \$email_events,
578 'mail_server/.*/', \$mail_server,
579 'statsfile/.*/STDERR', \$statsfile,
580 'event_header/.*/', \$event_header)) {
581
582 &print_usage();
583 die "\n";
584 }
585}
586
587
588END {
589
590 if ($make_writable) {
591 # chmod a+rw new collection
592 my $installed_collection = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $collection);
593 &recursive_chmod($installed_collection);
594
595 # chmod a+rw anything we've left laying about in the tmp directory
596 if (($collectdir ne "") &&
597 (!&util::filenames_equal ($collectdir, &util::filename_cat($ENV{'GSDLHOME'}, "collect")))) {
598 &recursive_chmod($collectdir);
599 }
600 }
601
602 # this will produce a .final file if one doesn't exist yet - that
603 # should only happen if there's been an error somewhere in the perl
604 # code
605 &final_out(7);
606
607 sub recursive_chmod {
608 my ($dir) = @_;
609 return unless -d $dir;
610
611 chmod (0777, $dir);
612
613 opendir (DIR, $dir) || die;
614 my @files = readdir DIR;
615 closedir DIR;
616
617 foreach my $file (@files) {
618 next if $file =~ /^\.\.?$/;
619 $file = &util::filename_cat($dir, $file);
620 if (-d $file) {
621 &recursive_chmod ($file);
622 } else {
623 chmod (0777, $file);
624 }
625 }
626 }
627}
Note: See TracBrowser for help on using the repository browser.