source: trunk/gsdl/perllib/mgbuilder.pm@ 14

Last change on this file since 14 was 4, checked in by sjboddie, 26 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 12.6 KB
Line 
1# MGBuilder object
2#
3
4package mgbuilder;
5
6use cfgread;
7use colcfg;
8use plugin;
9use util;
10
# Maximum document size handed to mg_passes through its -b option
# (the comments at the call sites describe this value as 12 meg).
$maxdocsize = 12000;

# Suffixes of the files that build_index keeps in an index directory;
# build_index deletes files carrying any other suffix once the index is built.
%wanted_index_files = map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);
23
24
# Creates an mgbuilder object for one collection.
#
# Arguments (after the class name):
#   $collection - collection name; collect.cfg and an optional custom
#                 "<collection>buildproc.pm" are looked for under
#                 $GSDLHOME/collect/$collection
#   $source_dir, $build_dir, $verbosity
#               - stored on the object and passed on, together with
#                 $collection, to the document processor's constructor
#
# Returns the new object.  Dies when collect.cfg does not exist, when no
# plugins were loaded, or when the document processor cannot be loaded or
# constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;

    # create an mgbuilder object
    my $self = bless {'collection' => $collection,
                      'source_dir' => $source_dir,
                      'build_dir'  => $build_dir,
                      'verbosity'  => $verbosity}, $class;

    # read in the collection configuration file
    my $colcfgfile = "$ENV{'GSDLHOME'}/collect/$collection/collect.cfg";
    if (!-e $colcfgfile) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgfile);

    # get the list of plugins for this collection
    my @plugins = (); # some good choice of plugins .... ????
    if (defined $self->{'collect_cfg'}->{'plugins'}) {
        @plugins = @{$self->{'collect_cfg'}->{'plugins'}};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($collection, \@plugins);
    if (scalar (@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLHOME'}/collect/$collection/lib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLHOME'}/collect/$collection/lib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/lib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class name held in a string can be the invocant of a method call, so
    # no string eval is needed.  This avoids the heuristic parsing of the
    # indirect-object form ("new Class(...)") and never executes the
    # interpolated collection name as Perl source.  Any error raised by the
    # constructor propagates to the caller.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity);

    return $self;
}
78
# Prepares a fresh build area: any old build at build_dir is removed
# (util::rm_r), the directory is created again, and a "text" directory is
# made inside it.  The result of the final util::mk_all_dir call is the
# return value.
sub init {
    my ($self) = @_;
    my $builddir = $self->{'build_dir'};

    # remove any old builds
    &util::rm_r ($builddir);
    &util::mk_all_dir ($builddir);

    # make the text directory
    &util::mk_all_dir ("$builddir/text");
}
90
# Produces the compressed text of the collection in the "text" subdirectory
# of build_dir.  The documents are pushed through the plugins twice, each
# time into a pipe to mg_passes (-T1 to collect text statistics, -T2 to
# compress the text), and mg_compression_dict is run between the two passes.
#
# Dies if mg_passes or mg_compression_dict does not exist or if a pipe cannot
# be opened.  The exit status of the external programs is not examined.
sub compress_text {
    my ($self) = @_;

    my $bindir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $suffix = &util::get_os_exe ();
    my $passes_exe = "$bindir/mg_passes$suffix";
    my $dict_exe = "$bindir/mg_compression_dict$suffix";
    my $verbose = ($self->{'verbosity'} >= 1);

    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, "text"));
    my $textprefix = &util::filename_cat ($self->{'build_dir'},
                                          "text/$self->{'collection'}");

    # on windows the slashes of the prefix become backslashes;
    # every other OS gets the extra " -d /" arguments instead
    my $osextra = " -d /";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $osextra = "";
        $textprefix =~ tr{/}{\\};
    }

    # Runs one mg_passes pass: opens the pipe, resets the document processor
    # and lets the plugins read the whole source directory into the pipe.
    # -b $maxdocsize sets the maximum document size to be 12 meg
    my $run_pass = sub {
        my ($pass) = @_;
        unless (-e $passes_exe && open (PIPEOUT,
                "| $passes_exe -f $textprefix -b $maxdocsize $pass $osextra")) {
            die "mgbuilder::compress_text - couldn't run $passes_exe\n";
        }
        $self->{'buildproc'}->reset();
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, $self->{'buildproc'});
        close (PIPEOUT);
    };

    print STDERR "\n*** creating the compressed text\n" if $verbose;

    # set up the document processor; it writes to the handle named here,
    # which is the pipe opened by each pass
    my $proc = $self->{'buildproc'};
    $proc->set_output_handle ('mgbuilder::PIPEOUT');
    $proc->set_mode ('text');
    $proc->set_index ('section:text');

    # collect the statistics for the text
    print STDERR "\n collecting text statistics\n" if $verbose;
    &$run_pass ('-T1');

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    print STDERR "\n creating the compression dictionary\n" if $verbose;
    unless (-e $dict_exe) {
        die "mgbuilder::compress_text - couldn't run $dict_exe\n";
    }
    system ("$dict_exe -f $textprefix -S -H -2 -k 5120 $osextra");

    # compress the text
    print STDERR "\n compressing the text\n" if $verbose;
    &$run_pass ('-T2');
}
149
# Builds every index listed under 'indexes' in the collection configuration.
# The description-to-directory mapping is computed first and stored in
# $self->{'index_mapping'}; build_index is then called once per index, in
# the configured order.
sub build_indexes {
    my ($self) = @_;
    my $indexes = $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $spec (@$indexes) {
        if ($self->{'verbosity'} >= 1) {
            print STDERR "\n*** building index $spec in subdirectory " .
                "$self->{'index_mapping'}->{$spec}\n";
        }
        $self->build_index ($spec);
    }
}
165
# Creates a directory name for each of the index descriptions.
#
# $indexes is a reference to a list of descriptions of the form
# "level:field,field,...".  Returns a reference to a hash that maps each
# description to its directory name.  A name is the first character of the
# level followed by a two-character abbreviation of each of the first two
# fields, all in lowercase; a number is appended when the name is already
# taken.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %dir_for = ();

    # %taken is used to check for collisions. It starts off
    # with the mandatory directory names
    my %taken = ('text'  => 'text',
                 'extra' => 'extra');

    foreach my $spec (@$indexes) {
        my ($level, $fieldlist) = split (":", $spec);
        my @parts = split (",", $fieldlist);
        $#parts = 1 if (@parts > 2); # just want first two fields

        # the directory name starts with the first character of the index level
        my ($name) = ($level =~ /^(.)/);

        # next comes a processed version of the first two fields in the
        # index: the first character and the next consonant of each field
        # (a field without a later consonant is kept whole)
        foreach my $part (@parts) {
            $part =~ s/^(.).*?([bcdfghjklmnpqrstvwxyz]).*$/$1$2/i;
        }
        $name .= join ("", @parts);

        # convert the directory name to lowercase
        $name = lc ($name);

        # add a number to make this directory name unique
        if (defined $taken{$name}) {
            my $n = 1;
            $n++ while (defined $taken{"$name$n"});
            $name .= $n;
        }

        $dir_for{$spec} = $name;
        $taken{$name} = $spec;
    }

    return \%dir_for;
}
210
211
# Builds a single index in its own subdirectory of build_dir.
#
# $index is an index description; its subdirectory name is read from
# $self->{'index_mapping'}, so that mapping must already be in place.
# The steps are: index dictionary (mg_passes -N1), perfect hash function,
# text inversion (mg_passes -N2), weights file, 'on-disk' stemmed dictionary,
# stem indexes, and finally removal of every file in the index directory
# whose suffix is not listed in %wanted_index_files.
#
# Dies if one of the executables does not exist, a pipe cannot be opened or
# the index directory cannot be read.  The exit status of the external
# programs is not examined.
sub build_index {
    my ($self, $index) = @_;

    # get the full index directory path and make sure it exists
    my $subdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, $subdir));
    my $idxprefix = &util::filename_cat ($self->{'build_dir'}, $subdir,
                                         $self->{'collection'});
    my $txtprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                         $self->{'collection'});

    # get any os specific stuff
    my $bindir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $suffix = &util::get_os_exe ();
    my $osextra = " -d /";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $osextra = "";
        # NOTE(review): only the index prefix is converted to backslashes;
        # the text prefix given to mg_weights_build is left untouched, unlike
        # in compress_text - confirm that this is intended.
        $idxprefix =~ tr{/}{\\};
    }

    # the index will be level 2 unless we are building a
    # paragraph level index
    my $level = ($index =~ /^paragraph/i) ? 3 : 2;

    my $verbose = ($self->{'verbosity'} >= 1);

    # full path of one of the mg executables
    my $tool = sub { return "$bindir/$_[0]$suffix"; };

    # dies unless the given executable exists
    my $need = sub {
        my ($path) = @_;
        die "mgbuilder::build_index - couldn't run $path\n" unless (-e $path);
    };

    # Runs one mg_passes pass with the given options: opens the pipe, resets
    # the document processor and lets the plugins read the whole source
    # directory into the pipe.
    my $passes_exe = &$tool ('mg_passes');
    my $feed = sub {
        my ($options) = @_;
        unless (-e $passes_exe && open (PIPEOUT,
                "| $passes_exe -f $idxprefix -b $maxdocsize " .
                "-$level $options $osextra")) {
            die "mgbuilder::build_index - couldn't run $passes_exe\n";
        }
        $self->{'buildproc'}->reset();
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, $self->{'buildproc'});
        close (PIPEOUT);
    };

    # set up the document processor; it writes to the handle named here,
    # which is the pipe opened by each pass
    my $proc = $self->{'buildproc'};
    $proc->set_output_handle ('mgbuilder::PIPEOUT');
    $proc->set_mode ('text');
    $proc->set_index ($index);

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if $verbose;
    &$feed ("-m 32 -s 0 -G -t 10 -N1");

    # create the perfect hash function
    my $hash_exe = &$tool ('mg_perf_hash_build');
    &$need ($hash_exe);
    system ("$hash_exe -f $idxprefix $osextra");

    # invert the text
    print STDERR "\n inverting the text\n" if $verbose;
    &$feed ("-c 3 -G -t 10 -N2");

    # create the weights file
    print STDERR "\n create the weights file\n" if $verbose;
    my $weights_exe = &$tool ('mg_weights_build');
    &$need ($weights_exe);
    system ("$weights_exe -f $idxprefix -t $txtprefix $osextra");

    # create 'on-disk' stemmed dictionary
    print STDERR "\n creating 'on-disk' stemmed dictionary\n" if $verbose;
    my $dict_exe = &$tool ('mg_invf_dict');
    &$need ($dict_exe);
    system ("$dict_exe -f $idxprefix $osextra");

    # creates stem index files for the various stemming methods
    print STDERR "\n creating stem indexes\n" if $verbose;
    my $stem_exe = &$tool ('mg_stem_idx');
    &$need ($stem_exe);
    foreach my $method (1, 2, 3) {
        system ("$stem_exe -b 4096 -s$method -f $idxprefix $osextra");
    }

    # remove unwanted files
    my $cleandir = &util::filename_cat ($self->{'build_dir'}, $subdir);
    opendir (DIR, $cleandir) || die
        "mgbuilder::build_index - couldn't read directory $cleandir\n";
    foreach my $entry (readdir (DIR)) {
        next if $entry =~ /^\./;

        # files without a suffix are left alone
        my ($ext) = $entry =~ /\.([^\.]+)$/;
        next unless defined $ext;
        next if defined $wanted_index_files{$ext};

        # delete it!
        &util::rm (&util::filename_cat ($cleandir, $entry));
    }
    closedir (DIR);
}
316
# Creates the info database "<collection>.ldb" in the "text" subdirectory of
# build_dir.  The document processor is switched to 'infodb' mode and the
# plugins read the whole source directory into a pipe to txt2db.
#
# Dies if txt2db does not exist or the pipe cannot be opened.  The exit
# status of txt2db is not examined.
sub make_infodatabase {
    my ($self) = @_;

    my $textdir = &util::filename_cat ($self->{'build_dir'}, "text");
    &util::mk_all_dir ($textdir);

    # assume little-endian for now :-)
    my $dbfile = &util::filename_cat ($textdir, "$self->{'collection'}.ldb");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $dbfile =~ tr{/}{\\};
    }

    my $bindir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $txt2db = "$bindir/txt2db" . &util::get_os_exe ();

    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n*** creating the info database\n";
    }

    # set up the document processor; it writes to the handle named here,
    # which is the pipe opened below
    my $proc = $self->{'buildproc'};
    $proc->set_output_handle ('mgbuilder::PIPEOUT');
    $proc->set_mode ('infodb');

    # pipe the output of the document processor into txt2db
    unless (-e $txt2db && open (PIPEOUT, "| $txt2db $dbfile")) {
        die "mgbuilder::make_infodatabase - couldn't run $txt2db\n";
    }
    $proc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $proc);
    close (PIPEOUT);
}
345
# Writes build.cfg into build_dir.
#
# The file records the build date (time of this call), the number of
# documents and bytes reported by the document processor, and one
# "index->directory" entry for every index of the collection configuration
# (directory names come from $self->{'index_mapping'}).
# Returns whatever cfgread::write_cfg_file returns.
sub make_auxiliary_files {
    my ($self) = @_;

    # A lexical hash reference, created afresh on every call.  The previous
    # code declared an unused lexical %build_cfg and then wrote through the
    # undeclared package variable $build_cfg, so all calls (and all objects)
    # shared one hash.
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'collect_cfg'}->{'indexes'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # write out the build information; the two patterns name the keys that
    # hold a single value and the keys that hold a list
    &cfgread::write_cfg_file ("$self->{'build_dir'}/build.cfg", $build_cfg,
                              '^(builddate|numdocs|numbytes)$', '^(indexmap)$');
}
375
# Counterpart of init; there is nothing to clean up at present.
# The object itself is the value of the last statement and so is returned.
sub deinit {
    my $self = shift;
}
379
380
3811;
382
383
Note: See TracBrowser for help on using the repository browser.