source: gsdl/trunk/trunk/mg/src/scripts/mgbuild_4.sh@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.7 KB
Line 
1#!/bin/csh -f
2###########################################################################
3#
4# mgbuild_4.sh -- Script used to build mg text collection in 4 passes.
5# Copyright (C) 1994 Neil Sharman; Modified by tes@kbs 1995
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21#
22###########################################################################
23#
24# Does an mgbuild in 4 passes instead of 2 passes.
25# This is useful if one does not have as much RAM and a large collection.
26#
27
# Extra argument handed to the get program whenever it is asked for text.
# Empty by default; a non-empty value also enables the -init / -cleanup
# calls made around the build.
set complex = ""

# ************ parse the command line arguments ************
#
#   -s | --source FILE   script sourced later to override the defaults
#   -g | --get PROG      program used to fetch the source text
#   -c | --complex       set $complex to "-text"
#   anything else        collection name; only the first one is kept
#
# Values are stored quoted.  An unquoted `set name = $1' is split on blanks
# and `set' then treats the remaining pieces as further variable names, so
# e.g. -g "prog -opt" used to abort the script.

while ($#argv >= 1)
    if ("$1" == "-s" || "$1" == "--source") then
        shift
        # A trailing option with no value is silently ignored.
        if ($#argv >= 1) then
            set source = "$1"
            shift
        endif
    else if ("$1" == "-g" || "$1" == "--get") then
        shift
        if ($#argv >= 1) then
            set get = "$1"
            shift
        endif
    else if ("$1" == "-c" || "$1" == "--complex") then
        set complex = "-text"
        shift
    else
        # First non-option argument wins; later ones are discarded.
        if (! $?text ) then
            set text = "$1"
        endif
        shift
    endif

end
56
# Without a collection name there is nothing to build: show the usage
# summary (headed by this script's file name) and stop with status 1.
if (! $?text ) then
    set prog = $0
    set prog = $prog:t
    echo "USAGE:"
    echo " $prog [-s config-script] [-g get-program] [-c] source"
    echo ""
    echo " The config-script is only needed if a non-standard build is required."
    echo " The get-program defaults to mg_get if not specified."
    exit 1
endif
66
# Directory containing this script; the mg_* programs are run from there.
set bindir = `which $0`     # find which one we are executing
set bindir = $bindir:h      # use it as the bindir path

# ************ set up shell variables ************
# Override these by providing a script to source using the -s option

# if $pipe == 1 then pipe in the source text using $get and $text otherwise
# read the source text directly from the file names specified in $input_files
set pipe = 1

# Extra options appended to the default get program.
set get_options = ""


# No -g was given: fall back to the mg_get that sits next to this script,
# which also switches on "-text".  $get is stored as a word list so that a
# non-empty $get_options becomes extra arguments of the command; in the plain
# `set get = prog opts' form, `set' would parse the options as further
# variable names.
if (! $?get ) then
    set complex = "-text"
    set get = ($bindir/mg_get $get_options)
endif


# Optional list of input files, taken from $MGDATA/<text>.chunks if it exists.
if (-e $MGDATA/${text}.chunks) then
    set input_files = `cat $MGDATA/${text}.chunks`
endif
89
# ---- default build parameters ----

# Stemming method, as a bit mask:
#   bit 0 = case folding
#   bit 1 = S stemmer
set stem_method = 3


# [RPAP - Jan 97: Stem Index Change]
# do_indexes == 1: build the collection with full indexes to the blocked
#                  file; this overrides stem_method (it is reset to 0 later).
# otherwise:       no indexes are built.
set do_indexes = 0


# Whether to build the weights (1) or not (0).
set do_weights = 1

# Pass switches: 1 runs the pass, anything else skips it.
#   pass 1 = text, first pass
set do_pass1 = 1

#   pass 2 = text, second pass
set do_pass2 = 1

#   pass 3 = inverted file, first pass
set do_pass3 = 1

#   pass 4 = inverted file, second pass
set do_pass4 = 1

# Buffer size used by mg_passes, in kilobytes.
# It should be big enough to hold the largest document.
set buf_size = 3072

# Amount of memory to use for the inversion (handed to mg_passes as -m).
set invf_mem = 5

# Number of interim chunks of inverted file that may be written to disc
# before a merge into the invf file is done.
set num_chunks = 3

# Level of the inverted file that will be generated.
set invf_level = 2


# strip_sgml == 1: SGML tags are stripped during the inversion phase.
# otherwise:       SGML tags are kept.
set strip_sgml = 1


# Interval between trace entries, in Mb.
# If this variable is not set, no trace entries are generated.
set trace = 10


# Number of bits of precision to be given to the approximate weights.
set weight_bits = 6


# Command-line arguments for the mg_compression_dict program.
set mcd = "-C"
159
160
# Let the user's parameter file (-s) override any of the settings made so far.
# (Kept as a then/endif block: csh substitutes variables in a one-line `if'
# before testing the condition, and $source may be unset.)
if ($?source) then
    source ${source}
endif


# [RPAP - Jan 97: Stem Index Change]
# Building the indexes forces stem_method to 0.
if ("$do_indexes" == "1") set stem_method = 0


# Collection name: the text name, with "-p" appended for inverted-file level 3.
if ("$invf_level" == "3") then
    set coll_name = ${text}-p
else
    set coll_name = ${text}
endif

# Create the collection directory under $MGDATA unless something with that
# name is already there.
if (! -e $MGDATA/${coll_name}) mkdir $MGDATA/${coll_name}

# Base name handed to every mg_* program with -f.
set bname = ${coll_name}/${coll_name}
188
# ************ set up pass parameters ************
#
# Each mg_passes run gets its own argument list.  All four lists are created
# unconditionally: the list of a disabled pass is simply never used.  When
# the lists were only created for enabled passes, the option-appending code
# below aborted with "Undefined variable" as soon as any do_passN was
# switched off.

set pass1_text = (-f ${bname} -T1)
set pass2_text = (-f ${bname} -T2)
set pass3_invf = (-f ${bname} -${invf_level} -m ${invf_mem} -s ${stem_method} -N1)
set pass4_invf = (-f ${bname} -${invf_level} -N2)

# Options common to all four passes, collected in the order -G, -t, -n, -b.
# The guards use then/endif because csh substitutes variables in a one-line
# `if' before testing the condition, and $trace / $trace_name may be unset.
set shared_opts = ()

if ($strip_sgml) then
    set shared_opts = (${shared_opts} -G)
endif

if ($?trace) then
    set shared_opts = (${shared_opts} -t ${trace})
endif

if ($?trace_name) then
    set shared_opts = (${shared_opts} -n ${trace_name})
endif

if ($buf_size) then
    set shared_opts = (${shared_opts} -b ${buf_size})
endif

set pass1_text = (${pass1_text} ${shared_opts})
set pass2_text = (${pass2_text} ${shared_opts})
set pass3_invf = (${pass3_invf} ${shared_opts})
set pass4_invf = (${pass4_invf} ${shared_opts})

# -C is only given to the second text pass, and only when $comp_stats is set.
if ($?comp_stats) then
    set pass2_text = (${pass2_text} -C ${comp_stats})
endif
239
# Opening banner: host name, start time and what is being built.
echo "-----------------------------------"
echo "`uname -n`, `date`"
echo "${text} --> ${bname}"
echo "-----------------------------------"

# ************ init get ************

# The get program is only asked to initialise itself when the text is piped
# in and $complex is non-empty; a failing -init ends the build.
if ($pipe && "$complex" != "") then
    echo "$get $text -init"
    $get $text -init
    if ($status != 0) exit 1
    echo "-----------------------------------"
endif
255
# ************************************* T E X T **********************************

# ************ pass 1_text ************
#
# The text comes from $input_files when $pipe is off; otherwise it is piped
# from the get program, through $pass1filter when that variable is set.
# The status tested after a pipeline is the one csh reports for the pipeline.

if ($do_pass1) then
    if (! $pipe) then
        echo mg_passes ${pass1_text} ${input_files}
        $bindir/mg_passes ${pass1_text} ${input_files}
        if ($status != 0) exit 1
    else if ($?pass1filter) then
        echo "$get $text $complex | $pass1filter | mg_passes ${pass1_text}"
        $get $text $complex| $pass1filter | $bindir/mg_passes ${pass1_text}
        if ($status != 0) exit 1
    else
        echo "$get $text $complex | mg_passes ${pass1_text}"
        $get $text $complex| $bindir/mg_passes ${pass1_text}
        if ($status != 0) exit 1
    endif
    echo "-----------------------------------"
endif
278
# ************ compression dict, then pass 2_text ************
#
# Both steps are controlled by do_pass2: first mg_compression_dict is run
# with the $mcd arguments, then mg_passes is run with the pass-2 options.

if ($do_pass2) then
    echo "mg_compression_dict -f ${bname} ${mcd}"
    $bindir/mg_compression_dict -f ${bname} ${mcd}
    if ($status != 0) exit 1
    echo "-----------------------------------"

    if (! $pipe) then
        echo mg_passes ${pass2_text} ${input_files}
        $bindir/mg_passes ${pass2_text} ${input_files}
        if ($status != 0) exit 1
    else if ($?pass2filter) then
        echo "$get $text $complex | $pass2filter | mg_passes ${pass2_text}"
        $get $text $complex| $pass2filter | $bindir/mg_passes ${pass2_text}
        if ($status != 0) exit 1
    else
        echo "$get $text $complex | mg_passes ${pass2_text}"
        $get $text $complex| $bindir/mg_passes ${pass2_text}
        if ($status != 0) exit 1
    endif
    echo "-----------------------------------"
endif
306
# ************************************* I N V F **********************************

# ************ pass 3_invf ************
#
# First inverted-file pass; same three input variants as the text passes
# (file list, filtered pipe, plain pipe).

if ($do_pass3) then
    if (! $pipe) then
        echo mg_passes ${pass3_invf} ${input_files}
        $bindir/mg_passes ${pass3_invf} ${input_files}
        if ($status != 0) exit 1
    else if ($?pass3filter) then
        echo "$get $text $complex | $pass3filter | mg_passes ${pass3_invf}"
        $get $text $complex| $pass3filter | $bindir/mg_passes ${pass3_invf}
        if ($status != 0) exit 1
    else
        echo "$get $text $complex | mg_passes ${pass3_invf}"
        $get $text $complex| $bindir/mg_passes ${pass3_invf}
        if ($status != 0) exit 1
    endif
    echo "-----------------------------------"
endif
328
# ************ perfect hash, pass 4_invf, invf dictionary ************
#
# All three steps are controlled by do_pass4 and run in this order:
#   1. mg_perf_hash_build
#   2. mg_passes with the pass-4 options
#   3. mg_invf_dict
# Any non-zero status ends the build.

if ($do_pass4) then
    echo "mg_perf_hash_build -f ${bname}"
    $bindir/mg_perf_hash_build -f ${bname}
    if ($status != 0) exit 1

    echo "-----------------------------------"

    if (! $pipe) then
        echo mg_passes ${pass4_invf} ${input_files}
        $bindir/mg_passes ${pass4_invf} ${input_files}
        if ($status != 0) exit 1
    else if ($?pass4filter) then
        echo "$get $text $complex | $pass4filter | mg_passes ${pass4_invf}"
        $get $text $complex| $pass4filter | $bindir/mg_passes ${pass4_invf}
        if ($status != 0) exit 1
    else
        echo "$get $text $complex | mg_passes ${pass4_invf}"
        $get $text $complex| $bindir/mg_passes ${pass4_invf}
        if ($status != 0) exit 1
    endif
    echo "-----------------------------------"

    # ************ build invf dictionary ************
    echo "mg_invf_dict -f ${bname} -b 4096"
    $bindir/mg_invf_dict -f ${bname} -b 4096
    if ($status != 0) exit 1
    echo "-----------------------------------"
endif
367
# ******** build indexes to stem dict *********

# [RPAP - Jan 97: Stem Index Change]
# mg_stem_idx is run three times, with -s1, -s2 and -s3.  A blank line
# separates the runs and a rule closes the section.
if ($do_pass4 && "$do_indexes" == "1") then
    foreach stem_kind (1 2 3)
        echo "mg_stem_idx -f ${bname} -b 4096 -s${stem_kind}"
        $bindir/mg_stem_idx -f ${bname} -b 4096 -s${stem_kind}
        if ($status != 0) exit 1

        if ($stem_kind != 3) echo ""
    end

    echo "-----------------------------------"
endif

# ************ build weights ************

# Weights are only built when the second inversion pass ran as well.
if ($do_pass4 && $do_weights) then
    echo "mg_weights_build -f ${bname} -b ${weight_bits} "
    $bindir/mg_weights_build -f ${bname} -b ${weight_bits}
    if ($status != 0) exit 1
    echo "-----------------------------------"
endif
399
# *******************************************************************************


# ************ print out statistics ************
# mgstat is only run when all four passes were enabled.
if ($do_pass1 && $do_pass2 && $do_pass3 && $do_pass4) then
    echo "mgstat -f ${bname} -E"
    $bindir/mgstat -f ${bname} -E
    if ($status != 0) exit 1
endif


# ************ cleanup get ************

# Counterpart of the -init call: made under the same condition (text piped
# in and $complex non-empty).
if ($pipe && "$complex" != "") then
    echo "-----------------------------------"
    echo "$get $text -cleanup"
    $get $text -cleanup
    if ($status != 0) exit 1
endif

# Closing banner: host name and finish time.
echo "-----------------------------------"
echo "`uname -n`, `date`"
echo "-----------------------------------"

# Reminder printed whenever the second text pass ran: the fast-loading
# compression dictionary is not built by this script.
if ($do_pass2) then
    echo "-- The fast-loading compression dictionary has not been built."
    echo "-- If you wish to build it, execute the following command:"
    echo "-- $bindir/mg_fast_comp_dict -f ${bname}"

    echo "-----------------------------------"
endif

echo ""
435
Note: See TracBrowser for help on using the repository browser.