source: trunk/gsdl/bin/script/buildcol.pl@ 2336

Last change on this file since 2336 was 2336, checked in by sjboddie, 23 years ago

added a -no_text option to buildcol.pl to allow collections to be built
without storing compressed text (intended for use in collections where
original documents (PDFs or Word docs maybe) are returned instead of the
compressed text)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl -- This program will build a particular collection
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package buildcol;
29
# Compile-time setup: refuse to run without the Greenstone environment and
# put the Greenstone perl library directories at the front of @INC so the
# 'use' statements that follow can find their modules.
BEGIN {
    foreach my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # A single unshift keeps the listed order, so the search order ends up
    # as classify, plugins, perllib.
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift (@INC, "$perllib/classify", "$perllib/plugins", $perllib);
}
37
38use colcfg;
39use parsargv;
40use util;
41use FileHandle;
42
43&main();
44
# print_usage -- write the command line help for buildcol.pl to STDERR.
#
# Takes no arguments and returns nothing useful. The default shown for
# -collectdir is computed from $ENV{'GSDLHOME'} at call time.
sub print_usage {
    # Kept as a concatenation so filename_cat is called in scalar context.
    my $default_collectdir = "" . &util::filename_cat ($ENV{'GSDLHOME'}, "collect");

    print STDERR
        "\n",
        "buildcol.pl: Builds the indexes of a Greenstone collection.\n\n",
        " usage: $0 [options] collection-name\n\n",
        " options:\n",
        " -verbosity number 0=none, 3=lots\n",
        " -archivedir directory Where the archives live\n",
        # -cachedir is accepted by main() and so is listed here as well.
        " -cachedir directory Directory holding the archive cache; the build\n",
        " is done there and copied back to builddir\n",
        " -builddir directory Where to put the built indexes\n",
        " -maxdocs number Maximum number of documents to build\n",
        " -debug Print output to STDOUT\n",
        " -mode all|compress_text|build_index|infodb\n",
        " -index indexname Index to build (will build all in\n",
        " config file if not set)\n",
        " -keepold will not destroy the current contents of the\n",
        " building directory\n",
        " -no_text Don't store compressed text. This option is\n",
        " useful for minimizing the size of the built\n",
        " indexes if you intend always to display the\n",
        " original documents at run time (i.e. you won't\n",
        " be able to retrieve the compressed text version)\n",
        " -allclassifications Don't remove empty classifications\n",
        " -create_images Attempt to create default images for new\n",
        " collection. This relies on the Gimp being\n",
        " installed along with relevant perl modules\n",
        " to allow scripting from perl\n",
        " -collectdir directory Collection directory (defaults to " .
            $default_collectdir . ")\n",
        " -out Filename or handle to print output status to.\n",
        " The default is STDERR\n",
        " -buildtype mg|mgpp This will override the config file setting.\n",
        " (default is mg)\n",
        " -no_strip_html Do not strip the html tags from the indexed text\n",
        " (only used for mgpp collections).\n\n";
}
79
80
# main -- entry point for buildcol.pl.
#
# Parses the command line with parsargv, reads the collection's
# etc/collect.cfg, fills in any setting that was not given on the command
# line from that file (or from a built-in default), loads the builder class
# and runs the build stage(s) selected by -mode.
#
# Dies (after printing the usage message where appropriate) when the
# arguments do not parse, the collection cannot be selected, collect.cfg is
# missing, the output file cannot be opened, or mgpp is requested on windows.
#
# $collection, $configfilename and $buildcol::out are package variables on
# purpose: create_images() reads them.
sub main
{
    my ($verbosity, $archivedir, $cachedir, $builddir, $maxdocs,
        $debug, $mode, $indexname, $keepold, $allclassifications,
        $create_images, $collectdir, $out, $buildtype, $textindex,
        $no_strip_html, $no_text);
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'archivedir/.*/', \$archivedir,
                         'cachedir/.*/', \$cachedir,
                         'builddir/.*/', \$builddir,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'debug', \$debug,
                         'mode/^(all|compress_text|build_index|infodb)$/all', \$mode,
                         'index/.*/', \$indexname,
                         'no_text', \$no_text,
                         'keepold', \$keepold,
                         'allclassifications', \$allclassifications,
                         'create_images', \$create_images,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'no_strip_html', \$no_strip_html,
                         'buildtype/^(mg|mgpp)$/', \$buildtype)) {
        &print_usage();
        die "\n";
    }

    $textindex = "";
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # The match is case-insensitive but only the upper-case handles
        # exist, so normalise; otherwise "-out stdout" names a handle that
        # was never opened.
        $out = uc ($out);
    } else {
        # Three-argument open: the file name is taken literally, whatever
        # characters it starts with.
        open (OUT, '>', $out) || die "Couldn't open output file $out\n";
        $out = "buildcol::OUT";
        $close_out = 1;
    }
    $out->autoflush(1);

    # create_images() prints its warnings to the package variable $out and
    # cannot see the lexical declared above, so publish the handle name.
    $buildcol::out = $out;

    # get and check the collection
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_usage();
        die "\n";
    }

    # read the configuration file

    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);

        # command line settings take precedence over collect.cfg
        if (defined $collectcfg->{'buildtype'} && $buildtype eq "") {
            $buildtype = $collectcfg->{'buildtype'};
        }
        if ($buildtype eq "") {
            $buildtype = "mg"; # mg is the default
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
            $cachedir = $collectcfg->{'cachedir'};
        }
        if (defined $collectcfg->{'builddir'} && $builddir eq "") {
            $builddir = $collectcfg->{'builddir'};
        }
        if (defined $collectcfg->{'collectdir'} && $collectdir eq "") {
            $collectdir = $collectcfg->{'collectdir'};
        }
        if (defined $collectcfg->{'no_text'} && $no_text == 0) {
            if ($collectcfg->{'no_text'} =~ /^true$/) {
                $no_text = 1;
            }
        }
        if (defined $collectcfg->{'allclassifications'} && $allclassifications == 0) {
            if ($collectcfg->{'allclassifications'} =~ /^true$/) {
                $allclassifications = 1;
            }
        }
        if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
            $textindex = $collectcfg->{'textcompress'};
        }

    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    #mgpp doesn't work yet on windows
    if ($buildtype eq "mgpp" && $ENV{'GSDLOS'} =~ /^windows$/) {
        die "mgpp doesn't work on windows\n";
    }

    #set the text index
    if ($buildtype eq "mgpp") {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # create default images if required
    if ($create_images) {
        my $collection_name = $collection;
        $collection_name = $collectcfg->{'collectionmeta'}->{'collectionname'}
            if defined $collectcfg->{'collectionmeta'}->{'collectionname'};

        &create_images ($collection_name);
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building") if $builddir eq "";
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        print $out "Updating archive cache\n" if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        # \Q...\E: the collection name is literal text, not a pattern
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/\Q$collection\E/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);


    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # A class name held in a scalar can be used as the invocant directly,
    # so no string eval is needed; a failing constructor still dies here.
    $builder = $buildertype->new($collection,
                                 $realarchivedir, $realbuilddir, $verbosity,
                                 $maxdocs, $debug, $keepold, $allclassifications,
                                 $out, $no_text);

    $builder->init();

    if ($buildertype eq "mgppbuilder" && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    } elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    } else {
        die "unknown mode: $mode\n";
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # when building in a cache directory, move the result into place
    if (($realbuilddir ne $builddir) && !$debug) {
        print $out "Copying back the cached build\n" if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
}
275
# create_images -- generate the default collection icons and make
# collect.cfg refer to them.
#
# Arguments:
#   $collection_name  text passed to the image script via -text
#
# Reads the package variables $collection and $configfilename. Warnings go
# to the handle named by the package variable $out, or to STDERR when that
# variable has not been set. Returns early, after a warning, when the image
# script is missing or collect.cfg cannot be opened.
sub create_images {
    my ($collection_name) = @_;

    # Printing to an undefined handle is a fatal error, so fall back to
    # STDERR rather than losing the warning (and the run).
    my $log = defined $buildcol::out ? $buildcol::out : "STDERR";

    my $image_script = &util::filename_cat ($ENV{'GSDLHOME'}, "bin", "script", "gimp", "title_icon.pl");
    if (!-e $image_script) {
        print $log "WARNING: Image making script ($image_script) could not be found\n";
        print $log " Default images will not be generated\n\n";
        return;
    }

    my $imagedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "images");

    &util::mk_all_dir ($imagedir);

    # create the images
    # List form of system: no shell is involved, so quotes, spaces or shell
    # metacharacters in the collection name or paths reach the script intact.
    system ($image_script, "-size", "1.5", "-image_dir", $imagedir,
            "-filename", "$collection.gif", "-text", $collection_name);
    system ($image_script, "-image_dir", $imagedir,
            "-filename", "${collection}sm.gif", "-text", $collection_name);

    # update the collect.cfg configuration file (this will need
    # to be changed when the config file format changes)
    my $cfg_in;
    if (!open ($cfg_in, '<', $configfilename)) {
        print $log "WARNING: Couldn't open config file ($configfilename)\n";
        print $log " for updating so collection images may not be linked correctly\n";
        return;
    }

    my $icon_line = "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n";
    my $iconsm_line = "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n";

    # Copy the file into memory, replacing any existing icon entries.
    my $file = "";
    my $found = 0; my $foundsm = 0;
    while (defined (my $line = <$cfg_in>)) {
        if ($line =~ /collectionmeta\s+iconcollection\s+/) {
            $line = $icon_line;
            $found = 1;
        } elsif ($line =~ /collectionmeta\s+iconcollectionsmall\s+/) {
            $line = $iconsm_line;
            $foundsm = 1;
        }
        $file .= $line;
    }
    close $cfg_in;

    if (!$found || !$foundsm) {
        # Without this, a config file whose last line has no newline would
        # get the new entry glued onto the end of that line.
        $file .= "\n" if $file ne "" && $file !~ /\n\z/;
        $file .= $icon_line if !$found;
        $file .= $iconsm_line if !$foundsm;
    }

    my $cfg_out;
    if (!open ($cfg_out, '>', $configfilename)) {
        print $log "WARNING: Couldn't open config file ($configfilename)\n";
        print $log " for updating so collection images may not be linked correctly\n";
        return;
    }
    print $cfg_out $file;
    close $cfg_out;
}
327
Note: See TracBrowser for help on using the repository browser.