source: trunk/indexers/mg/src/scripts/mg_get_merge.sh@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.9 KB
Line 
1#! /bin/csh -f
2###########################################################################
3#
4# mg_get_merge.sh -- script to get text for mgbuild
5# Copyright (C) 1994 Tim Bell, Shane Hudson (mods. for merging)
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21#
22###########################################################################
23#
# "mg_get_merge" formats the text for mgbuild by collecting the appropriate
# documents, and outputting them with control-B after each document.
# The name of the text must be supplied as an argument.
# mg_get_merge is called with -i (init) first, then with -t (text) each time
# the text is needed, then with -c (cleanup).
# e.g. mg_get_merge alice -t
30#
31###########################################################################
32
33
34
35# The directory containing the data files is stored in the environment
36# variable MGSAMPLE
37
# Locate the sample collections: honour MGSAMPLE when it is set in the
# environment, otherwise fall back to ./SampleData.
if (! $?MGSAMPLE) then
    set SampleData = ./SampleData
else
    set SampleData = $MGSAMPLE
endif

# Supply a default image viewer command unless the caller already chose one.
if (! $?MGIMAGEVIEWER) setenv MGIMAGEVIEWER "xv -"

# rm is always invoked through its absolute path.
set rm = /bin/rm
49
# Decide which phase was requested and leave its long name in $flag.
#   <document>                 -> -text (the default)
#   <document> -i | -init      -> -init
#   <document> -t | -text      -> -text
#   <document> -c | -cleanup   -> -cleanup
# Any other second argument, or any other argument count, prints the usage
# line and exits with status 1.
if ($#argv == 1) then
    set flag = '-text'
else if ($#argv == 2) then
    set flag = $2
    switch ($flag)
        case '-init':
        case '-text':
        case '-cleanup':
            # already in long form, nothing to translate
            breaksw
        case '-i':
            set flag = '-init'
            breaksw
        case '-t':
            set flag = '-text'
            breaksw
        case '-c':
            set flag = '-cleanup'
            breaksw
        default:
            echo 'Usage: get <document> [-i | -t | -c]'
            exit(1)
    endsw
else
    echo 'Usage: get <document> [-i | -t | -c]'
    exit(1)
endif
81
# Directory part of the path this script was started with; the davinci
# case runs the MG image tools (mgticbuild, mgfelics, ...) from there.
set bindir = $0
set bindir = $bindir:h
# The :h modifier leaves a word that contains no slash unchanged, so when
# the script is started as a bare file name bindir would still be the
# script name. Use the current directory in that case.
if ("$bindir" == "$0") set bindir = .
84
switch ($1)
    case alice:
        # Tiny built-in test collection made of two documents.
        switch ($flag)
            case '-init':
                breaksw

            case '-text':
                echo "This is a test document 1 2 3 4 5 6 7 8 9 "
                # Document separator. The file header states that documents
                # are followed by control-B; it is written as an octal
                # escape so it does not depend on an invisible literal
                # control character in the script text.
                printf '\002\n'
                echo "A B C"
                breaksw

            case '-cleanup':
                breaksw
        endsw #flag
        breaksw
101
# bible: outputs revelation.txt from the bible sample directory.
    case bible:
        switch ($flag)
            case '-init':
                breaksw

            case '-text':
                # use uncompressed file if it exists
                # revelation.txt already contains control-B characters
                # between verses, so no separator is added here
                if (-e $SampleData/bible/revelation.txt) then
                    cat $SampleData/bible/revelation.txt
                else
                    uncompress <$SampleData/bible/revelation.txt.Z
                endif
                breaksw #-text

            case '-cleanup':
                breaksw #-cleanup

        endsw #flag
        breaksw #bible
121
122
123
# mailfiles: splits mail folders into one document per message.
    case mailfiles:
# Takes a list of files that contain mail, and splits them up
# by putting control-B after each message. Assumes that each message
# begins with a line '^From '.
# If messages are already in individual files, use "get allfiles"
# rather than this one.
# The EXTRA documents added are in files in the directory mailextra
#### The variable 'mailfiles' contains a list of files to be split up
# NOTE(review): 'mailfiles' is set here but not read by any flag below;
# only the files matched by 'mailextra' are output.
        set mailfiles = '~/mbox ~/.sentmail'
        set mailextra = '~/extra/*'

        switch ($flag)
            case '-init':
                breaksw

            case '-text':
                foreach f ($mailextra)
                    # Emit control-B (octal 002) before every From line except
                    # the first, and once more at end of file, so each message
                    # ends with a separator. The escape replaces an invisible
                    # literal control character.
                    awk '/^From /&&NR!=1{print "\002"} {print $0} END{print "\002"}' $f
                end # foreach
                breaksw #-text

            case '-cleanup':
                breaksw #-cleanup

        endsw #flag
        breaksw #mailfiles
150
151
152
# allfiles: every regular file below a directory becomes one document.
    case allfiles:
# Recursively concatenates every file in every subdirectory of the given
# directory.

# The variable 'directory' on the next line contains the directory to be used
        set directory = ~/Mail

# The date of the file ".mgbuildtime" in 'directory' is the last
# build or merge. At the start of the merge ("-init" argument) the
# file ".mgmergetime" is touched. Only files newer than ".mgbuildtime"
# but older than ".mgmergetime" are output by mg_get_merge

        switch ($flag)
            case '-init':
                touch $directory/.mgmergetime
                breaksw

            case '-text':
                # For each selected file: a header line with its name, the
                # file contents, then control-B (octal 002) as the document
                # separator. Files whose names start with a dot are skipped.
                find $directory -type f -newer $directory/.mgbuildtime \! -newer $directory/.mgmergetime \! -name '.*' -exec echo '<' {} '>' \; -exec cat {} \; -exec printf '\002' \;
                breaksw

            case '-cleanup':
                # NOTE(review): both stamps are set to the current time, so a
                # file changed between -init and -cleanup looks older than
                # .mgbuildtime at the next merge - confirm this is intended.
                touch $directory/.mgmergetime
                touch $directory/.mgbuildtime
                breaksw #-cleanup
        endsw #flag
        breaksw #allfiles
180
181
182
183
# davinci: integrated collection of text and images.
    case davinci:
# compress and index the DaVinci collection of text and images
# (this code is suitable for all sorts of integrated collections
# of text and images)
# It assumes that all files are in one directory ($sourceDir).
# Files that are related have the same prefix. For example,
# monaLisa.img.pgm might be a gray-level image, and
# monaLisa.txt.txt would be a textual file describing the image.
# The suffixes recognised are:
# .txt.txt ascii text
# .txt.pbm scanned text stored as a bilevel image
# .img.pbm a black and white image (typically a line drawing)
# .img.pgm a gray-scale image
# In addition, if no corresponding ascii text file is found for
# a .pbm or .pgm file, then one is created with suffix .tmp.txt,
# and it stores the name of the image file (in principle it could
# store the OCR of a .txt.pbm file). At present the .tmp.txt files
# are deleted by the -cleanup option.
# NOTE(review): the code below globs *.ptm, *.pgm, *.pbm, *.txt and *.tmp,
# which does not match the double suffixes listed above - confirm which
# naming scheme the collection really uses.

#### the next two 'set' statements define which directory the files
# come from, and where they will be stored. This is the only
# part of this code that is specific to the 'davinci' collection
        set sourceDir = $SampleData/davinci
        set targetDir = $MGDATA/davinci

        switch ($flag)
            case '-init':
                mkdir $targetDir >&/dev/null        #create the directory to store
                                                    # compressed images in
                $rm -rf $targetDir/* >&/dev/null    #in case it already existed

                # process all pbm (black and white) images of text
                foreach f ($sourceDir/*.ptm)
                    set base = $f:t

                    # build and prune a temporary library named after this
                    # shell process id, compress the image with it, then
                    # remove the library again
                    $bindir/mgticbuild $targetDir/$base:r.ticlib.$$ $f
                    $bindir/mgticprune $targetDir/$base:r.ticlib.$$

                    $bindir/mgtic -L -e $targetDir/$base:r.ticlib.$$ $f >$targetDir/$base:r.tic
                    $rm $targetDir/$base:r.ticlib.$$
                    set r = $f:r    #root name of file
                    if (! (-e $r.txt || -e $r.tmp)) then
                        # creates a file; could do OCR here to get its contents
                        echo "No corresponding txt file for" $f "- creating" $r.tmp
                        echo '#######' $f > $r.tmp
                    endif
                end
                # process all pgm (gray scale) images
                foreach f ($sourceDir/*.pgm)
                    set base = $f:t
                    $bindir/mgfelics -e $f >$targetDir/$base:r.flx
                    set r = $f:r    #root name of file
                    if (! (-e $r.txt || -e $r.tmp)) then
                        # creates a file; could do image recognition here to create one(!)
                        echo "No corresponding txt file for" $f "- creating" $r.tmp
                        echo '#######' $f > $r.tmp
                    endif
                end
                # process all pbm (black and white) images
                foreach f ($sourceDir/*.pbm)
                    set base = $f:t
                    $bindir/mgbilevel -e $f >$targetDir/$base:r.blv
                    set r = $f:r    #root name of file
                    if (! (-e $r.txt || -e $r.tmp)) then
                        # creates a file; could do drawing recognition here to create one(!)
                        echo "No corresponding txt file for" $f "- creating" $r.tmp
                        echo '####### No corresponding text file available for this image' >$r.tmp
                        echo 'Original image file name was' $f >> $r.tmp
                    endif
                end
                breaksw #-init

            case '-text':
                #output each text file and tmp.txt, with names of other associated files
                foreach f ($sourceDir/*.txt $sourceDir/*.tmp)
                    set r = $f:r    #root name of file
                    set base = $r:t
                    # count the source files that share this base name
                    set n = `/bin/ls $sourceDir/$base.* | wc -l`
                    if ($n != 1) then    # more files than just the text one
                        echo '::::::::::'    #separate the image information from the text
                        echo 'Image(s) available:'
                        foreach d ($targetDir/$base.*)
                            switch ($d:e)    # work out decoding method from suffix
                                case 'tic':
                                    echo 'MGDATA (mgtic -d ' $d ' | '$MGIMAGEVIEWER') '
                                    breaksw
                                case 'flx':
                                    echo 'MGDATA (mgfelics -d ' $d ' | '$MGIMAGEVIEWER')'
                                    breaksw
                                case 'blv':
                                    echo 'MGDATA (mgbilevel -d ' $d ' | '$MGIMAGEVIEWER')'
                                    breaksw
                            endsw # $d:e (suffix)
                        end #foreach d (associated file)
                        echo '::::::::::'    #separate the image information from the text
                    endif # if more than one file
                    cat $f    # output the text associated with the images
                    # control-B (octal 002) ends the document; written as an
                    # escape instead of an invisible literal control character
                    printf '\002'
                end #foreach f (text file)
                breaksw #-text

            case '-cleanup':
                #remove temporary text files
                $rm $sourceDir/*.tmp
                breaksw #-cleanup
        endsw #flag
        breaksw #davinci
291
292
#---------------------------
    case mods:
# Splits the modifications file at the mod separator "--------"
        switch ($flag)
            case '-init':
                breaksw

            case '-text':
                # Every line that starts with --- is replaced by a line
                # holding only control-B (octal 002), the document separator
                # named in the file header; all other lines pass through
                # unchanged. awk is used because its octal string escape is
                # portable, so no literal control character is needed here.
                awk '/^---/ {print "\002"; next} {print $0}' < ${SampleData}/MODIFICATIONS
                breaksw

            case '-cleanup':
                breaksw
        endsw #flag
        breaksw #mods
308
# Any document name not handled by a case above is rejected with status 1.
    default:
        echo 'Sorry, I do not know how to get' $1
        exit(1)
endsw

# A recognised document and flag were handled: report success.
exit(0)
Note: See TracBrowser for help on using the repository browser.