source: gsdl/trunk/trunk/mg/src/scripts/mg_get.sh@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.8 KB
Line 
1#! /bin/csh -f
2###########################################################################
3#
4# mg_get.sh -- script to get text for mgbuild
5# Copyright (C) 1994 Tim Bell
6# Changed to allow it to be driven from a file ~/.mg_getrc
7# by Bruce McKenzie, Oct 1994
8#
9# This program is free software; you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation; either version 2 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program; if not, write to the Free Software
21# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22#
23# @(#)mg_get.sh 1.11 25 Mar 1994
24#
25###########################################################################
26#
# "mg_get" formats the text for mgbuild by collecting the appropriate
# documents, and outputting them with control-B after each document.
29# The name of the text must be supplied as an argument.
30# mg_get is called with -i (init) first, then with -t (text) each time the
31# text is needed, then with -c (cleanup).
32# e.g. mg_get alice -t
33#
# The file ~/.mg_getrc (or the file named by $MG_GETRC) contains lines of
# the form:
# name TYPE files and directories
# where name is the name supplied to mg_get, TYPE is one of
# PLAIN, LINESEP, PARA, MAIL, DIR, DIR2, BIB or TXTIMG and says how mg_get
# should process the named files and directories
39#
40###########################################################################
41
# Choose the collection table: the file named by $MG_GETRC when that
# environment variable is set, otherwise ~/.mg_getrc.
if (! $?MG_GETRC) then
    set mg_getrc = ~/.mg_getrc
else
    set mg_getrc = $MG_GETRC
endif
47
# Added by [TS:May/95]
# When the collection table does not exist yet, write a default one that
# points at the sample data ($MGSAMPLE if set, otherwise ./SampleData).
if (! -e $mg_getrc) then
    echo "Can not find a .mg_getrc, so creating a default one: $mg_getrc"

    if ($?MGSAMPLE) then
        set SampleData = $MGSAMPLE
    else
        set SampleData = ./SampleData
    endif

    # Each entry is written as three TAB-separated fields (name, TYPE, file
    # list), the layout tab-delimited tools such as cut(1) expect.  The tabs
    # come from printf so they cannot be lost to whitespace mangling.
    # "~" is written literally, exactly as a here-document would leave it.
    printf '%s\t%s\t%s\n' alice PARA "$SampleData/alice13a.txt.Z" > $mg_getrc
    printf '%s\t%s\t%s\n' davinci TXTIMG "$SampleData/davinci" >> $mg_getrc
    printf '%s\t%s\t%s\n' mailfiles MAIL "~/mbox ~/.sentmail" >> $mg_getrc
    printf '%s\t%s\t%s\n' allfiles DIR "~/Mail" >> $mg_getrc
    printf '%s\t%s\t%s\n' mods LINESEP "$SampleData/MODIFICATIONS" >> $mg_getrc
    printf '%s\t%s\t%s\n' bible PLAIN "$SampleData/bible/genesis.txt.Z $SampleData/bible/revelation.txt.Z" >> $mg_getrc
endif
68
# Default image viewer command; a value already present in the environment
# is left untouched.
if (! $?MGIMAGEVIEWER) setenv MGIMAGEVIEWER "xv -"

# rm is always invoked through this absolute path.
set rm = /bin/rm
74
# Work out the requested phase.
#   mg_get <document>            -> -text
#   mg_get <document> <option>   -> -init, -text or -cleanup
# The short forms -i, -t and -c are accepted as aliases.  Anything else
# prints the usage line and exits with status 1.
switch ($#argv)
case 1:
    set flag = '-text'
    breaksw

case 2:
    switch ($2)
    case '-i':
    case '-init':
        set flag = '-init'
        breaksw
    case '-t':
    case '-text':
        set flag = '-text'
        breaksw
    case '-c':
    case '-cleanup':
        set flag = '-cleanup'
        breaksw
    default:
        echo 'Usage: mg_get <document> [-i | -t | -c]'
        exit(1)
    endsw
    breaksw

default:
    # same wording as above; the program is called mg_get, not get
    echo 'Usage: mg_get <document> [-i | -t | -c]'
    exit(1)
endsw
106
# Look the requested collection up in the table.
# The first field has to equal the name exactly: a plain "grep ^name" would
# also pick up longer names sharing the prefix (asking for "mail" would match
# "mailfiles") and would treat the name as a regular expression.
# Field 2 is the TYPE, everything from field 3 on is the file list.  Fields
# may be separated by tabs or blanks; only the first matching entry is used.
set COLLECTION = $1
set TYPE = `awk '$1 == name { print $2; exit }' name="$1" $mg_getrc`
set FILES = `awk '$1 == name { for (i = 3; i <= NF; i++) print $i; exit }' name="$1" $mg_getrc`

# No entry for this name: report it here instead of switching on nothing.
if ("$TYPE" == "") then
    echo 'Sorry, I do not know how to get' $1
    exit 1
endif

# Directory part of the path this script was invoked with; the mg helper
# programs are run from there.
set bindir = $0
set bindir = $bindir:h
113
switch ($TYPE)
case PLAIN:
# Emits the listed files unchanged (after decompression); the documents are
# assumed to contain their own ^B separators already.
    switch ($flag)
    case '-text':
        foreach f ($FILES)
            # pick the decompressor from the file suffix
            if ($f:e == 'gz') then
                set decoder = gunzip
            else if ($f:e == 'Z') then
                set decoder = zcat
            else
                set decoder = cat
            endif
            $decoder < $f
        end
        breaksw

    case '-init':
    case '-cleanup':
        # nothing to prepare and nothing to tidy up for this type
        breaksw
    endsw
    breaksw #PLAIN
138
case LINESEP:
# Assumes that the documents are separated by a line containing a run of
# seventeen or more asterisks; every such line is replaced by a line holding
# only control-B (octal 002), the document terminator mgbuild expects.
# NOTE(review): the separator byte was empty in the text this was restored
# from; control-B is taken from the description at the top of the script.
    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        foreach f ($FILES)
            if ($f:e == 'Z') then
                set decoder = zcat
            else if ($f:e == 'gz') then
                set decoder = gunzip
            else
                set decoder = cat
            endif
            # The filter reads the decoder output only.  Redirecting the file
            # into it as well is an "Ambiguous input redirect" in csh and
            # would bypass the decompression.
            $decoder < $f | awk '/\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*/ { print "\002"; next } { print }'
        end
        breaksw #-text

    case '-cleanup':
        breaksw
    endsw #flag
    breaksw #LINESEP
162
case PARA:
# Splits the file at blank lines, which corresponds to paragraphs.
# A run of blank lines produces a single control-B (octal 002); one more is
# written at the end of each file so the last paragraph is terminated too.
# NOTE(review): the separator byte was empty in the text this was restored
# from; control-B is taken from the description at the top of the script.
    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        foreach f ($FILES)
            if ($f:e == 'Z') then
                set decoder = zcat
            else if ($f:e == 'gz') then
                set decoder = gunzip
            else
                set decoder = cat
            endif
            # b remembers that the previous line was blank, so consecutive
            # blank lines do not create empty documents.
            $decoder < $f | awk '/^ *$/ { if (b != 1) printf "\002"; b = 1; next } { print; b = 0 } END { printf "\002" }'
        end
        breaksw #-text

    case '-cleanup':
        breaksw
    endsw #flag
    breaksw #PARA
188
case BIB:
# Takes a list of files that contain bibliographies, and splits them up
# by putting ^B after each reference.  Assumes that each reference
# begins with a line '^@'.
# A control-B (octal 002) is written before every '@' line except the first
# line of a file, and a final one (followed by a newline) at end of file.
# NOTE(review): the separator byte was empty in the text this was restored
# from; control-B is taken from the comment above.
    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        foreach f ($FILES)
            if ($f:e == 'Z') then
                set decoder = zcat
            else if ($f:e == 'gz') then
                set decoder = gunzip
            else
                set decoder = cat
            endif
            $decoder < $f | awk '/^@/&&NR!=1{printf "\002"} {print $0} END{print "\002"}'
        end
        breaksw #-text

    case '-cleanup':
        breaksw #-cleanup
    endsw #flag
    breaksw #BIB
215
case MAIL:
# Takes a list of files that contain mail, and splits them up
# by putting ^B after each message.  Assumes that each message
# begins with a line '^From '.
# Lines from "xbtoa Begin" through "xbtoa End" are dropped from the output.
# A control-B (octal 002) is written before every 'From ' line except the
# first line of a file, and a final one (followed by a newline) at the end.
# NOTE(review): the separator byte was empty in the text this was restored
# from; control-B is taken from the comment above.
    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        foreach f ($FILES)
            if ($f:e == 'Z') then
                set decoder = zcat
            else if ($f:e == 'gz') then
                set decoder = gunzip
            else
                set decoder = cat
            endif
            $decoder < $f | awk '/xbtoa Begin/,/xbtoa End/ {next} /^From /&&NR!=1{printf "\002"} {print $0} END{print "\002"}'
        end
        breaksw #-text

    case '-cleanup':
        breaksw #-cleanup
    endsw #flag
    breaksw #MAIL
242
case DIR:
# Recursively concatenates every file in every subdirectory of the given
# directory.  Each file is preceded by a line "< name >" and followed by
# control-B (octal 002).
# Output order: all *.gz files, then all *.Z files, then everything else.
# NOTE(review): the separator byte was empty in the text this was restored
# from; control-B is taken from the description at the top of the script.
    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        # gunzip -c is used for .gz files, matching the gunzip decoder of the
        # other collection types (gzcat is not installed everywhere).
        find $FILES -type f -name '*.gz' -exec echo '<' {} '>' \; \
            -exec gunzip -c {} \; -exec printf '\002' \;
        find $FILES -type f -name '*.Z' -exec echo '<' {} '>' \; \
            -exec zcat {} \; -exec printf '\002' \;
        find $FILES -type f \! \( -name '*.gz' -o -name '*.Z' \) \
            -exec echo '<' {} '>' \; \
            -exec cat {} \; -exec printf '\002' \;
        breaksw #-text

    case '-cleanup':
        breaksw #-cleanup
    endsw #flag
    breaksw #DIR
268
269
270
case DIR2:
# Recursively concatenates every file in every subdirectory of the given
# directory.  Does not include the filename; each file is followed by
# control-B (octal 002).
# Output order: all *.gz files, then all *.Z files, then everything else.
# NOTE(review): the separator byte was empty in the text this was restored
# from; control-B is taken from the description at the top of the script.
    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        # gunzip -c is used for .gz files, matching the gunzip decoder of the
        # other collection types (gzcat is not installed everywhere).
        find $FILES -type f -name '*.gz' \
            -exec gunzip -c {} \; -exec printf '\002' \;
        find $FILES -type f -name '*.Z' \
            -exec zcat {} \; -exec printf '\002' \;
        find $FILES -type f \! \( -name '*.gz' -o -name '*.Z' \) \
            -exec cat {} \; -exec printf '\002' \;
        breaksw #-text

    case '-cleanup':
        breaksw #-cleanup
    endsw #flag
    breaksw #DIR2
294
295
296
297
case TXTIMG:
# Compress and index a collection of text and images (this code is suitable
# for all sorts of integrated collections of text and images).
# Files that are related have the same prefix.  For example, monaLisa.pgm
# might be a gray-level image, and monaLisa.txt would be a textual file
# describing the image.
# The suffixes recognised are:
#   .txt  ascii text
#   .ptm  scanned text stored as a bilevel image
#   .pbm  a black and white image (typically a line drawing)
#   .pgm  a gray-scale image
# In addition, if an image has neither a .txt nor a .tmp file with the same
# root name, -init creates <root>.tmp in the source directory.  It holds the
# name of the image file (in principle it could store OCR output instead).
# The .tmp files are deleted again by the -cleanup option.
#
# -init    compresses every image into $MGDATA/<collection>
# -text    writes each .txt/.tmp file, preceded by the decode commands for
#          its images, and followed by control-B (octal 002)
# -cleanup removes the generated .tmp files
# NOTE(review): the separator byte was empty in the text this was restored
# from; control-B is taken from the description at the top of the script.

# The next two set statements define which directory the files come from,
# and where the compressed images will be stored.
    set sourceDir = $FILES
    set targetDir = $MGDATA/$COLLECTION

    switch ($flag)
    case '-init':
        # Turn nonomatch on before the first glob, so that an empty target
        # directory or a missing image type is not a "No match" error.
        set nonomatch

        # create the directory to store compressed images in, and empty it
        # in case it already existed
        mkdir $targetDir >& /dev/null
        $rm -rf $targetDir/* >& /dev/null

        # process all ptm images (scanned text stored as bilevel images)
        foreach f ($sourceDir/*.ptm)
            if ($f == "$sourceDir/*.ptm") then
                break
            endif
            set base = $f:t

            # build and prune a temporary library (named with this shell
            # pid), compress the image against it, then drop the library
            $bindir/mgticbuild $targetDir/$base:r.ticlib.$$ $f
            $bindir/mgticprune $targetDir/$base:r.ticlib.$$

            $bindir/mgtic -L -e $targetDir/$base:r.ticlib.$$ $f > $targetDir/$base:r.tic
            $rm $targetDir/$base:r.ticlib.$$
            set r = $f:r
            if (! (-e $r.txt || -e $r.tmp)) then
                # creates a file; could do OCR here to get its contents
                echo "No corresponding txt file for" $f "- creating" $r.tmp
                echo '#######' $f > $r.tmp
            endif
        end

        # process all pgm (gray scale) images
        foreach f ($sourceDir/*.pgm)
            if ($f == "$sourceDir/*.pgm") then
                break
            endif
            set base = $f:t
            $bindir/mgfelics -e $f > $targetDir/$base:r.flx
            set r = $f:r
            if (! (-e $r.txt || -e $r.tmp)) then
                # creates a file; could do image recognition here
                echo "No corresponding txt file for" $f "- creating" $r.tmp
                echo '#######' $f > $r.tmp
            endif
        end

        # process all pbm (black and white) images
        foreach f ($sourceDir/*.pbm)
            if ($f == "$sourceDir/*.pbm") then
                break
            endif
            set base = $f:t
            $bindir/mgbilevel -e $f > $targetDir/$base:r.blv
            set r = $f:r
            if (! (-e $r.txt || -e $r.tmp)) then
                # creates a file; could do drawing recognition here
                echo "No corresponding txt file for" $f "- creating" $r.tmp
                echo '####### No corresponding text file available for this image' > $r.tmp
                echo 'Original image file name was' $f >> $r.tmp
            endif
        end
        unset nonomatch
        breaksw #-init

    case '-text':
        # take care if no match to foreach statements
        set nonomatch

        # output each text file and .tmp file, with the names of the other
        # associated files
        foreach f ($sourceDir/*.txt $sourceDir/*.tmp)
            # an unexpanded pattern means there were no files of that kind
            if ($f == "$sourceDir/*.txt") then
                continue
            endif
            if ($f == "$sourceDir/*.tmp") then
                continue
            endif
            set r = $f:r
            set base = $r:t
            # 1 until the image header has been printed for this document
            set first_image = 1

            foreach d ($targetDir/$base.*)
                if ($d == "$targetDir/$base.*") then
                    break
                endif

                if ($first_image) then
                    set first_image = 0
                    # separate the image information from the text
                    echo '::::::::::'
                    echo 'Image(s) available:'
                endif

                # work out the decoding method from the suffix
                switch ($d:e)
                case 'tic':
                    echo 'MGDATA (mgtic -d ' $d ' | '$MGIMAGEVIEWER') '
                    breaksw
                case 'flx':
                    echo 'MGDATA (mgfelics -d ' $d ' | '$MGIMAGEVIEWER')'
                    breaksw
                case 'blv':
                    echo 'MGDATA (mgbilevel -d ' $d ' | '$MGIMAGEVIEWER')'
                    breaksw
                endsw
            end

            if (! $first_image) then
                # close the image information section
                echo '::::::::::'
            endif

            # the text associated with the images, then the terminator
            cat $f
            printf '\002'
        end
        unset nonomatch
        breaksw #-text

    case '-cleanup':
        # Remove the temporary text files.  With nonomatch and -f this is a
        # quiet no-op when -init did not have to create any.
        set nonomatch
        $rm -f $sourceDir/*.tmp
        unset nonomatch
        breaksw #-cleanup
    endsw #flag
    breaksw #TXTIMG
441
442
default:
    # the TYPE found for this collection is not one handled above
    echo 'Sorry, I do not know how to get' $1
    exit(1)
endsw

# every handled TYPE/flag combination ends up here
exit(0)
Note: See TracBrowser for help on using the repository browser.