source: trunk/gsdl/perllib/mgbuilder.pm@ 75

Last change on this file since 75 was 69, checked in by sjboddie, 26 years ago

Sub-collection indexes may now be defined within the collect.cfg file as
subcollection blah1 Title/blah/i
subcollection blah2 !Title/blah/i
indexsubcollections blah1 blah2 blah1,blah2
indexes section:text document:text
This example would create section:text and document:text indexes for:

  1. the blah1 subcollection (i.e. those documents whose Title field contains 'blah')
  2. the blah2 subcollection (i.e. those documents whose Title field doesn't contain 'blah')
  3. both subcollections (i.e. all documents)

The field to match the regular expression against (Title in this example) may be
any valid metadata tag or 'filename'.
The regular expression (blah in this example) may be any valid perl regular expression.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.1 KB
Line 
1# MGBuilder object
2#
3
4package mgbuilder;
5
6use cfgread;
7use colcfg;
8use plugin;
9use util;
10
# Value handed to mg_passes through its -b option by the build steps below.
# NOTE(review): comments elsewhere in this file describe it as a 12 meg
# maximum document size -- confirm the unit against mg_passes.
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built;
# build_index removes files whose suffix is not a key of this hash.
%wanted_index_files = map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);
23
24
# Constructor: creates an mgbuilder object for one collection.
#
# Arguments (after the class name):
#   $collection - collection name; its configuration is read from
#                 $GSDLHOME/collect/$collection/collect.cfg
#   $source_dir - stored in the object as 'source_dir'
#   $build_dir  - stored in the object as 'build_dir'
#   $verbosity  - stored in the object as 'verbosity'
#
# Dies if collect.cfg does not exist, if no plugins were loaded, or if the
# build processor module cannot be loaded or constructed.
# Returns the blessed object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity}, $class;

    # read in the collection configuration file
    my $collectdir = "$ENV{'GSDLHOME'}/collect/$collection";
    my $cfgfile = "$collectdir/collect.cfg";
    if (!-e $cfgfile) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($cfgfile);

    # sort out subcollection indexes: every configured index is repeated
    # once per 'indexsubcollections' entry as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        my @expanded = ();
        # lexical loop variables: the originals were undeclared package globals
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@expanded, "$index:$subcollection");
            }
        }
        $self->{'collect_cfg'}->{'indexes'} = \@expanded;
    }

    # get the list of plugins for this collection
    my @plugins = (); # some good choice of plugins .... ????
    if (defined $self->{'collect_cfg'}->{'plugins'}) {
        @plugins = @{$self->{'collect_cfg'}->{'plugins'}};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($collection, \@plugins);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$collectdir/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$collectdir/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the class name held in
    # $buildproctype.  This replaces a string eval of "new $buildproctype(...)":
    # indirect-object syntax is ambiguous in a package that defines its own
    # new(), and any exception thrown by the constructor still propagates
    # to the caller unchanged.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity);

    return $self;
}
89
# Prepares the build directory for a fresh build: the existing build
# directory is removed, then re-created together with its "text"
# subdirectory.
sub init {
    my $self = $_[0];
    my $build_dir = $self->{'build_dir'};

    # throw away whatever an earlier build left behind
    &util::rm_r($build_dir);
    &util::mk_all_dir($build_dir);

    # the text directory lives directly below the build directory
    &util::mk_all_dir("$build_dir/text");
}
101
# Creates the compressed text for the collection under <build_dir>/text.
#
# Three external programs from $GSDLHOME/bin/$GSDLOS are run in order:
#   1. mg_passes -T1           (documents are piped in by the plugins)
#   2. mg_compression_dict
#   3. mg_passes -T2           (documents are piped in a second time)
#
# Dies if an executable is missing or a pipe cannot be opened.  A program
# that starts but finishes unsuccessfully is reported on STDERR; the build
# carries on, as it always has, but the failure is no longer silent.
# Returns the result of closing the final mg_passes pipe.
sub compress_text {
    my $self = shift (@_);
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes = "$exedir/mg_passes$exe";
    my $mg_compression_dict = "$exedir/mg_compression_dict$exe";

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # windows gets backslashes in the prefix, everything else gets "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # set up the document processor; it is given the name of the PIPEOUT
    # handle that each pass below opens
    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ('section:text');

    # runs one mg_passes pass ($passflag is -T1 or -T2), feeding it every
    # document the plugins find in the source directory
    # -b $maxdocsize sets the maximum document size
    my $run_pass = sub {
        my ($passflag) = @_;
        if (!-e $mg_passes || !open (PIPEOUT,
            "| $mg_passes -f $fulltextprefix -b $maxdocsize $passflag $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes\n";
        }
        $self->{'buildproc'}->reset();
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, $self->{'buildproc'});

        # closing a pipe waits for the child: a false result means the close
        # itself failed ($! set) or the child exited non-zero ($? set)
        my $closed = close (PIPEOUT);
        if (!$closed) {
            my $reason = $! ? "close failed: $!" : "exit status " . ($? >> 8);
            print STDERR "mgbuilder::compress_text - warning: " .
                "$mg_passes $passflag did not finish cleanly ($reason)\n";
        }
        return $closed;
    };

    # collect the statistics for the text
    print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    $run_pass->("-T1");

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e $mg_compression_dict) {
        die "mgbuilder::compress_text - couldn't run $mg_compression_dict\n";
    }
    if (system ("$mg_compression_dict -f $fulltextprefix -S -H -2 -k 5120 $osextra") != 0) {
        print STDERR "mgbuilder::compress_text - warning: " .
            "$mg_compression_dict failed (status $?)\n";
    }

    # compress the text
    print STDERR "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    return $run_pass->("-T2");
}
160
# Builds every index listed under 'indexes' in the collection configuration.
# The index-description-to-directory mapping is computed first and stored in
# $self->{'index_mapping'}; build_index is then called once per index.
sub build_indexes {
    my ($self) = @_;
    my $indexes = $self->{'collect_cfg'}->{'indexes'};

    # work out which subdirectory each index description is built in
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # now build them one after another
    for my $index (@$indexes) {
        if ($self->{'verbosity'} >= 1) {
            print STDERR "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        $self->build_index ($index);
    }
}
176
# Derives a short directory name for each index description.
#
# $indexes is a reference to a list of descriptions of the form
# "level:field[,field...][:subcollection[,subcollection...]]".
# Returns a reference to a hash from description to directory name.
# Names are lowercase and unique among themselves; they also never clash
# with the mandatory 'text' and 'extra' directories.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    # shortens a word to its first character plus the next consonant;
    # a word with no such consonant is returned unchanged
    my $abbreviate = sub {
        my ($word) = @_;
        $word =~ s/^(.).*?([bcdfghjklmnpqrstvwxyz]).*$/$1$2/i;
        return $word;
    };

    my %dir_for = ();

    # names already handed out, seeded with the mandatory directory names
    my %taken = ('text'=>'text',
                 'extra'=>'extra');

    foreach my $index (@$indexes) {
        my ($level, $fieldlist, $subcollection) = split (":", $index);

        # first character of the index level ...
        my ($dirname) = $level =~ /^(.)/;

        # ... then the abbreviation of (at most) the first two fields
        my @fields = split (/,/, $fieldlist);
        $#fields = 1 if (scalar (@fields) > 2);
        $dirname .= join ("", map { $abbreviate->($_) } @fields);

        # ... then the subcollection part, when there is one: the first
        # character of each of the first two subcollections if several are
        # named, otherwise the abbreviation of the single subcollection
        if (defined ($subcollection) && $subcollection =~ /\w/) {
            my @parts = split (/,/, $subcollection);
            if (scalar (@parts) >= 2) {
                foreach my $part (@parts[0, 1]) {
                    my $initial = $part;
                    $initial =~ s/^(.).*$/$1/i;
                    $dirname .= $initial;
                }
            } else {
                $dirname .= $abbreviate->($subcollection);
            }
        }

        # directory names are always lowercase
        $dirname = lc ($dirname);

        # on a clash, append the smallest number that makes the name unique
        if (defined $taken{$dirname}) {
            my $num = 1;
            $num++ while (defined $taken{"$dirname$num"});
            $dirname .= $num;
        }

        $dir_for{$index} = $dirname;
        $taken{$dirname} = $index;
    }

    return \%dir_for;
}
237
238
# Builds a single index.
#
# $index is an index description ("level:fields[:subcollections]") that must
# already have an entry in $self->{'index_mapping'}; the index is built in
# that subdirectory of the build directory.
#
# The documents are piped through mg_passes twice (-N1, then -N2), with
# mg_perf_hash_build in between, followed by mg_weights_build, mg_invf_dict
# and three mg_stem_idx runs.  Finally every file in the index directory
# whose suffix is not listed in %wanted_index_files is removed.
#
# Dies if an executable is missing, a pipe cannot be opened, or the index
# directory cannot be read.  A program that starts but finishes
# unsuccessfully is reported on STDERR and the build carries on.
# Returns the result of the final closedir.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    my $fullindexdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    &util::mk_all_dir ($fullindexdir);
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes = "$exedir/mg_passes$exe";
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
        # the text prefix goes on the mg_weights_build command line too, so
        # it gets the same treatment as the index prefix (compress_text
        # already converts it this way)
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expressions if this index belongs to one or more
    # subcollections; a description without a subcollection part yields
    # an empty list
    my $indexexparr = [];
    my $subcollection = (split (":", $index))[2];
    my @subcollections = defined ($subcollection) ? split (/,/, $subcollection) : ();
    foreach my $subcolname (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcolname};
        push (@$indexexparr, $expression) if (defined $expression);
    }

    # set up the document processor; it is given the name of the PIPEOUT
    # handle that each mg_passes run below opens
    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);

    # runs mg_passes with the given extra options, feeding it every
    # document the plugins find in the source directory
    my $run_pass = sub {
        my ($options) = @_;
        if (!-e $mg_passes || !open (PIPEOUT,
            "| $mg_passes -f $fullindexprefix -b $maxdocsize " .
            "-$index_level $options $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes\n";
        }
        $self->{'buildproc'}->reset();
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, $self->{'buildproc'});

        # closing a pipe waits for the child: a false result means the close
        # itself failed ($! set) or the child exited non-zero ($? set)
        if (!close (PIPEOUT)) {
            my $reason = $! ? "close failed: $!" : "exit status " . ($? >> 8);
            print STDERR "mgbuilder::build_index - warning: " .
                "$mg_passes did not finish cleanly ($reason)\n";
        }
    };

    # runs one of the stand-alone mg programs and reports a failure
    my $run_tool = sub {
        my ($tool, $arguments) = @_;
        my $toolpath = "$exedir/$tool$exe";
        if (!-e $toolpath) {
            die "mgbuilder::build_index - couldn't run $toolpath\n";
        }
        if (system ("$toolpath $arguments") != 0) {
            print STDERR "mgbuilder::build_index - warning: " .
                "$toolpath failed (status $?)\n";
        }
    };

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    $run_pass->("-m 32 -s 0 -G -t 10 -N1");

    # create the perfect hash function
    $run_tool->("mg_perf_hash_build", "-f $fullindexprefix $osextra");

    # invert the text
    print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    $run_pass->("-c 3 -G -t 10 -N2");

    # create the weights file
    print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    $run_tool->("mg_weights_build", "-f $fullindexprefix -t $fulltextprefix $osextra");

    # create 'on-disk' stemmed dictionary
    print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    $run_tool->("mg_invf_dict", "-f $fullindexprefix $osextra");

    # creates stem index files for the various stemming methods
    print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    foreach my $stem_method (1, 2, 3) {
        $run_tool->("mg_stem_idx", "-b 4096 -s$stem_method -f $fullindexprefix $osextra");
    }

    # remove unwanted files: anything in the index directory that has a
    # suffix which is not in %wanted_index_files (dot files and files
    # without a suffix are left alone)
    opendir (DIR, $fullindexdir) || die
        "mgbuilder::build_index - couldn't read directory $fullindexdir\n";
    foreach my $file (readdir(DIR)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
            &util::rm (&util::filename_cat ($fullindexdir, $file));
        }
    }
    closedir (DIR);
}
355
# Creates the collection's info database, <build_dir>/text/<collection>.ldb.
#
# The document processor is switched to 'infodb' mode and its output for
# every document found by the plugins is piped into txt2db.
#
# Dies if txt2db is missing or the pipe cannot be opened.  If txt2db starts
# but finishes unsuccessfully this is reported on STDERR.
# Returns the result of closing the pipe.
sub make_infodatabase {
    my $self = shift (@_);
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    &util::mk_all_dir ($textdir);

    # assume little-endian for now :-)
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}.ldb");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db = "$exedir/txt2db$exe";

    print STDERR "\n*** creating the info database\n" if ($self->{'verbosity'} >= 1);

    # set up the document processor; it is given the name of the PIPEOUT
    # handle opened below
    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    $self->{'buildproc'}->set_mode ('infodb');

    # pipe the info database entries into txt2db
    if (!-e $txt2db || !open (PIPEOUT,
        "| $txt2db $fulldbname")) {
        die "mgbuilder::make_infodatabase - couldn't run $txt2db\n";
    }
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'});

    # closing a pipe waits for the child: a false result means the close
    # itself failed ($! set) or the child exited non-zero ($? set)
    my $closed = close (PIPEOUT);
    if (!$closed) {
        my $reason = $! ? "close failed: $!" : "exit status " . ($? >> 8);
        print STDERR "mgbuilder::make_infodatabase - warning: " .
            "$txt2db did not finish cleanly ($reason)\n";
    }
    return $closed;
}
384
# Writes <build_dir>/build.cfg, which records:
#   builddate - the time the file was written (seconds, as returned by time)
#   numdocs   - the build processor's get_num_docs() value
#   numbytes  - the build processor's get_num_bytes() value
#   indexmap  - one "description->directory" entry per configured index
#
# Returns whatever cfgread::write_cfg_file returns.
sub make_auxiliary_files {
    my $self = shift (@_);

    # A fresh hash for every call.  The original declared "my %build_cfg" but
    # then filled in "$build_cfg", an undeclared package-global scalar, so
    # all calls and all builder objects shared one hash.
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        push (@indexmap, $index . "->" . $self->{'index_mapping'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # write out the build information; the two patterns name the keys
    # passed as single values and as lists respectively
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$', '^(indexmap)$');
}
414
# Counterpart to init; there is currently nothing to tear down.
# Hands back the object it was called on.
sub deinit {
    my $self = $_[0];
    return $self;
}
418
419
4201;
421
422
Note: See TracBrowser for help on using the repository browser.