source: gsdl/trunk/trunk/mg/src/scripts/mgmerge.sh@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.4 KB
Line 
1#!/bin/csh -f
2###########################################################################
3#
4# mgmerge.sh -- Script used to merge new documents into an existing mg text collection.
5# Copyright (C) 1994 Neil Sharman, Shane Hudson (mods. for merging)
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21#
22###########################################################################
23
# Defaults for the settings that command line flags can change.
# complex       - extra argument handed to the get-program (set by -c)
# slow_merge    - slow merge flag for mg_invf_merge (set by -S)
# guess_weights - weight guessing flag for mg_invf_merge (cleared by -w)
set complex = ""
set slow_merge = ""
set guess_weights = "-w "
# [RPAP - Feb 97: Level 3 Merge]
# text_pass     - 1 to run the text pass, 0 to skip it (cleared by -T)
set text_pass = 1

# Walk the command line one word at a time.  Every branch consumes the
# word(s) it handled with shift, so the loop always makes progress.
while ($#argv >= 1)
    switch ("$1")
    case -s:
        # -s config-script: file sourced later to override build parameters.
        # A trailing -s with no value is silently ignored.
        shift
        if ($#argv >= 1) then
            set source = $1
            shift
        endif
        breaksw
    case -g:
        # -g get-program: program that produces the source text.
        # A trailing -g with no value is silently ignored.
        shift
        if ($#argv >= 1) then
            set get = $1
            shift
        endif
        breaksw
    case -c:
        set complex = "-text"
        shift
        breaksw
    case -S:
        # set slow merge option in mg_invf_merge
        set slow_merge = "-s "
        shift
        breaksw
    case -w:
        # turn weights guessing off (rebuild weights file from scratch)
        set guess_weights = ""
        shift
        breaksw
    case -T:
        # [RPAP - Feb 97: Level 3 Merge]
        # turn text pass off
        set text_pass = 0
        shift
        breaksw
    default:
        # The first word that is not a known flag names the collection;
        # any later such words are dropped.
        if (! $?text) then
            set text = $1
        endif
        shift
        breaksw
    endsw
end
70
# No collection name was given on the command line: explain the
# invocation and stop with a failure status.
# The text lists every flag the argument parser accepts and names the
# get-program that is actually used as the default (mg_get_merge).
if ($?text == "0") then
    set prog = $0
    echo "USAGE:"
    echo " "$prog:t" [-s config-script] [-g get-program] [-c] [-S] [-w] [-T] source"
    echo ""
    echo " The config-script is only needed if a non-standard build is required."
    echo " The get-program defaults to mg_get_merge if not specified."
    echo " -c  pass -text to the get-program (implied when -g is not given)."
    echo " -S  use the slow merge option of mg_invf_merge."
    echo " -w  turn weights guessing off (rebuild weights file from scratch)."
    echo " -T  turn the text pass off."
    exit 1
endif
80
# The mg programs are run from the directory this script was started from:
# take the script path and keep its head (everything before the last /).
set bindir = $0
set bindir = ${bindir:h}

# pipe == 1: the source text is produced by running $get on $text and is
#            piped into mg_passes.
# pipe == 0: mg_passes reads the files listed in $input_files directly.
set pipe = 1

# Without -g, fall back to the mg_get_merge program that sits next to this
# script and always hand it the -text argument.
if (! $?get) then
    set complex = "-text"
    set get = $bindir/mg_get_merge
endif

# An optional <collection>.chunks file in $MGDATA supplies the list of
# input files.  input_files stays unset when that file does not exist.
if (-e $MGDATA/${text}.chunks) then
    set input_files = `cat $MGDATA/${text}.chunks`
endif
96
# ----------------------------------------------------------------------
# Build parameters.  These are defaults; a config-script given with -s is
# sourced at the end of this section and may override any of them.
# ----------------------------------------------------------------------

# Inversion method, either "I" or "N".
# N is memory-efficient, I is faster for small collections.
set invf_method = "N"

# Stemming method (bit mask):
#   Bit 0 = case folding
#   Bit 1 = S stemmer
set stem_method = 3

# [RPAP - Jan 97: Stem Index Change]
# do_indexes == 1: build the collection with full indexes to the blocked
#                  file.  This overrides stem_method, which is forced to 0.
# do_indexes == 0: do not build the indexes.
set do_indexes = 0

# Amount of memory to use for the inversion (handed to mg_passes as -m).
# This only has an effect if $invf_method is "N".
set invf_mem = 32

# Number of interim chunks of inverted file that may be written to disc
# before a merge into the invf file is done.
# This only has an effect if $invf_method is "N".
set num_chunks = 3

# Level of the inverted file that will be generated.
# Note: The value *MUST* be the same as was used to build the collection
#       that is being merged.
# In the current mg_invf_merge implementation the maximum level is 2.
# Paragraph-level inversion (invf_level = 3) is NOT supported.
set invf_level = 2

# strip_sgml == 1: sgml tags are stripped during the inversion phase.
# Otherwise sgml tags are kept.
set strip_sgml = 1

# Interval between trace entries in Mb.
# If this variable is not set no trace entries will be generated.
set trace = 10

# Number of bits of precision to be given to the approximate weights.
set weight_bits = 6

# Command line arguments for the mg_compression_dict program.
# (No command in this script references $mcd.)
set mcd = "-S"

# $merge is the name of the subdirectory under ${MGDATA} where the merging
# steps will be performed.  There should NOT be an mg collection with this
# name.  The default is "MERGE".
set merge = "MERGE"

# Source the parameter file, if one was given, to modify the parameters
# of the build.
if ($?source) then
    source ${source}
endif

# [RPAP - Jan 97: Stem Index Change]
# Building the indexes requires stem_method to be 0.
if ($do_indexes) then
    set stem_method = 0
endif
167
168
# The collection name is the name given on the command line.
set coll_name = ${text}

# Make sure the directory in which the merge takes place exists.
if (! -e $MGDATA/${merge}) then
    mkdir $MGDATA/${merge}
endif

# Base name of the collection files in $MGDATA.
# [RPAP - Feb 97: Level 3 Merge]
set bname = ${coll_name}

# Names used inside the merge directory:
#   oldname   - the existing collection
#   newname   - the dummy collection built from the new documents
#   mergename - the merged result
set oldname = ${merge}/${coll_name}.old
set newname = ${merge}/${coll_name}.new
set mergename = ${merge}/${coll_name}

# Options that pass 1 and pass 2 of mg_passes have in common.
set shared_args = ()
if ($strip_sgml) then
    set shared_args = (${shared_args} -G)
endif
if ($?trace) then
    set shared_args = (${shared_args} -t ${trace})
endif

# Command lines for the two mg_passes runs that build the dummy
# collection for the new documents.
set pass1 = (-f ${newname} -${invf_level} -m ${invf_mem} -s ${stem_method} ${shared_args})
set pass2 = (-f ${newname} -${invf_level} -c ${num_chunks} ${shared_args})

# Note that a -T1 pass is not done since the old .text.stats file is used
# as the compression model for compressing the new documents.

# [RPAP - Feb 97: Level 3 Merge]
# The -T2 pass is left out when text_pass == 0.
if ($text_pass) then
    set pass2 = (${pass2} -T2)
endif

# Select the inversion pass for the chosen inversion method.
set pass1 = (${pass1} -${invf_method}1)
set pass2 = (${pass2} -${invf_method}2)

# Optional settings that only a config-script can define.
if ($?trace_name) then
    set pass1 = (${pass1} -n ${trace_name})
    set pass2 = (${pass2} -n ${trace_name})
endif

if ($?comp_stats) then
    set pass2 = (${pass2} -C ${comp_stats})
endif
223
########################################################################
# Here is where mgmerge goes to work.
#
# FIRST PHASE: run the new documents through mg_passes to build a
# collection for them under ${newname}.  Any step that fails ends the
# script with status 1.

echo "--------------------------------------------------------------"
echo "`uname -n`, `date`"
echo "MGMERGE collection: ${bname}"
echo "--------------------------------------------------------------"
echo "FIRST PHASE: build new collection from new documents"
echo "--------------------------------------------------------------"

# Move the text dictionary file of the existing collection into the merge
# directory under the name of the new collection.
# [RPAP - Feb 97: Level 3 Merge] - only if performing text pass
if ($text_pass) then
    mv $MGDATA/${bname}.text.dict $MGDATA/${newname}.text.dict
endif

# Pass 1.
if ($pipe) then
    # Let the get-program initialise itself first.
    if ("$complex" != "") then
        echo ">> $get $text -init"
        $get $text -init
        if ($status != 0) exit 1
        echo "-----------------------------------"
    endif

    # Feed the text into mg_passes, through the optional filter.
    if ($?pass1filter) then
        echo ">> $get $text $complex | $pass1filter | mg_passes ${pass1}"
        $get $text $complex | $pass1filter | $bindir/mg_passes ${pass1}
        if ($status != 0) exit 1
    else
        echo ">> $get $text $complex | mg_passes ${pass1}"
        $get $text $complex | $bindir/mg_passes ${pass1}
        if ($status != 0) exit 1
    endif
else
    # mg_passes reads the input files itself.
    echo ">> mg_passes ${pass1} ${input_files}"
    $bindir/mg_passes ${pass1} ${input_files}
    if ($status != 0) exit 1
endif
echo "-----------------------------------"

echo "mg_perf_hash_build -f ${newname}"
$bindir/mg_perf_hash_build -f ${newname}
if ($status != 0) exit 1
echo "-----------------------------------"

# Pass 2, with the same three ways of supplying the text as pass 1.
if ($pipe) then
    if ($?pass2filter) then
        echo ">> $get $text $complex | $pass2filter | mg_passes ${pass2}"
        $get $text $complex | $pass2filter | $bindir/mg_passes ${pass2}
        if ($status != 0) exit 1
    else
        echo ">> $get $text $complex | mg_passes ${pass2}"
        $get $text $complex | $bindir/mg_passes ${pass2}
        if ($status != 0) exit 1
    endif
else
    echo ">> mg_passes ${pass2} ${input_files}"
    $bindir/mg_passes ${pass2} ${input_files}
    if ($status != 0) exit 1
endif
echo "-----------------------------------"

# Let the get-program clean up after itself.
if ($pipe) then
    if ("$complex" != "") then
        echo "-----------------------------------"
        echo ">> $get $text -cleanup"
        $get $text -cleanup
        if ($status != 0) exit 1
    endif
endif
296
# SECOND PHASE: move the existing collection into the merge directory,
# merge it with the collection built from the new documents, and rebuild
# the weights and dictionary files of the merged result.
#
# Every tool invocation is followed by a status check.  This includes
# mg_text_merge, mg_invf_merge and mg_invf_dict: if one of them fails the
# script must stop here, because the code that follows this phase moves
# the merged files over the collection in $MGDATA and then deletes what
# is left in the merge directory, including the old inverted file.
echo "---------------------------------------"
date
echo "--------------------------------------------------------------"
echo "SECOND PHASE: merge the two collections"
echo "--------------------------------------------------------------"

# move the appropriate files from the base directory to the merge dir
echo "(moving files....)"
# [RPAP - Feb 97: Level 3 Merge]
# Move files if doing text pass
if ($text_pass) then
    mv $MGDATA/${bname}.text $MGDATA/${oldname}.text
    mv $MGDATA/${bname}.text.idx $MGDATA/${oldname}.text.idx
else
    # Need this file if building weights
    cp -d $MGDATA/${bname}.text.idx $MGDATA/${mergename}.text.idx
endif
mv $MGDATA/${bname}.invf $MGDATA/${oldname}.invf
mv $MGDATA/${bname}.invf.idx $MGDATA/${oldname}.invf.idx
mv $MGDATA/${bname}.invf.dict $MGDATA/${oldname}.invf.dict
# the old weight file may be used for the new file
mv $MGDATA/${bname}.weight $MGDATA/${mergename}.weight
# EOC: the old invf.paragraph file may be used for the new file for Level 3 Invf
if (-e $MGDATA/${bname}.invf.paragraph) then
    mv $MGDATA/${bname}.invf.paragraph $MGDATA/${mergename}.invf.paragraph
endif
echo "---------------------------------------"

# [RPAP - Feb 97: Level 3 Merge]
# Only merge text if doing text pass
if ($text_pass) then
    echo ">> mg_text_merge -f ${mergename}"
    $bindir/mg_text_merge -f $mergename
    if ("$status" != "0") exit 1
    echo "---------------------------------------"
endif

echo ">> mg_invf_merge ${slow_merge}${guess_weights} -f ${mergename}"
$bindir/mg_invf_merge ${slow_merge}${guess_weights} -f ${mergename}
if ("$status" != "0") exit 1
echo "---------------------------------------"

# remove .weight file, so a new one will be created by mg_weights_build
if ("$guess_weights" == "") then
    echo "rm ${mergename}.weight"
    rm $MGDATA/${mergename}.weight
endif

# mv $MGDATA/${mergename}.weight $MGDATA/${mergename}.w2

echo ">> mg_weights_build -f ${mergename} -b ${weight_bits}"
$bindir/mg_weights_build -f ${mergename} -b ${weight_bits}
if ("$status" != "0") exit 1

echo "---------------------------------------"

echo ">> mg_invf_dict -f ${mergename} -b 4096"
$bindir/mg_invf_dict -f ${mergename} -b 4096
if ("$status" != "0") exit 1

# [RPAP - Jan 97: Stem Index Change]
# Build the three stem indexes (-s1, -s2, -s3) when requested.
if ($do_indexes) then
    echo "-----------------------------------"

    echo "mg_stem_idx -f ${mergename} -b 4096 -s1"
    $bindir/mg_stem_idx -f ${mergename} -b 4096 -s1
    if ("$status" != "0") exit 1

    echo ""

    echo "mg_stem_idx -f ${mergename} -b 4096 -s2"
    $bindir/mg_stem_idx -f ${mergename} -b 4096 -s2
    if ("$status" != "0") exit 1

    echo ""

    echo "mg_stem_idx -f ${mergename} -b 4096 -s3"
    $bindir/mg_stem_idx -f ${mergename} -b 4096 -s3
    if ("$status" != "0") exit 1
endif
374
# FINAL STEP: move the merged files from the merge directory back to the
# base directory under the collection name, empty the merge directory and
# print the collection statistics.
echo "--------------------------------------------------------------"

# move files back to base dir
# [RPAP - Feb 97: Level 3 Merge]
# mv the text files if doing text pass
if ($text_pass) then
    mv $MGDATA/${oldname}.text $MGDATA/${bname}.text
    mv $MGDATA/${mergename}.text.idx $MGDATA/${bname}.text.idx
    mv $MGDATA/${newname}.text.dict $MGDATA/${bname}.text.dict
endif
mv $MGDATA/${mergename}.invf $MGDATA/${bname}.invf
mv $MGDATA/${mergename}.invf.idx $MGDATA/${bname}.invf.idx
mv $MGDATA/${mergename}.invf.dict $MGDATA/${bname}.invf.dict
mv $MGDATA/${mergename}.invf.dict.blocked $MGDATA/${bname}.invf.dict.blocked

# [RPAP - Jan 97: Stem Index Change]
if ($do_indexes) then
    mv $MGDATA/${mergename}.invf.dict.blocked.1 $MGDATA/${bname}.invf.dict.blocked.1
    mv $MGDATA/${mergename}.invf.dict.blocked.2 $MGDATA/${bname}.invf.dict.blocked.2
    mv $MGDATA/${mergename}.invf.dict.blocked.3 $MGDATA/${bname}.invf.dict.blocked.3
endif

mv $MGDATA/${mergename}.weight $MGDATA/${bname}.weight
mv $MGDATA/${mergename}.weight.approx $MGDATA/${bname}.weight.approx
mv $MGDATA/${mergename}.text.idx.wgt $MGDATA/${bname}.text.idx.wgt

# EOC: mv .invf.paragraph file if exists
if (-e $MGDATA/${mergename}.invf.paragraph) then
    mv $MGDATA/${mergename}.invf.paragraph $MGDATA/${bname}.invf.paragraph
endif

# remove old files in the MERGE directory
# Guard: with an empty merge directory name the pattern below would match
# every file directly under $MGDATA, so nothing is removed in that case.
if ("${merge}" != "") then
    rm -f $MGDATA/${merge}/*
else
    echo "mgmerge: merge directory name is empty, not removing any files"
endif

echo "mgstat -f ${bname} -E"
$bindir/mgstat -f ${bname} -E
if ("$status" != "0") exit 1

echo "--------------------------------------------------------------"
echo "`uname -n`, `date`"
echo "--------------------------------------------------------------"

echo ""
Note: See TracBrowser for help on using the repository browser.