Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gsdl/trunk/trunk/mg/src/scripts/mg_get.sh@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago
Undoing change commited in r16582
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 11.8 KB

Line
1	#! /bin/csh -f
2	###########################################################################
3	#
4	# mg_get.sh -- script to get text for mgbuild
5	# Copyright (C) 1994 Tim Bell
6	# Changed to allow it to be driven from a file ~/.mg_getrc
7	# by Bruce McKenzie, Oct 1994
8	#
9	# This program is free software; you can redistribute it and/or modify
10	# it under the terms of the GNU General Public License as published by
11	# the Free Software Foundation; either version 2 of the License, or
12	# (at your option) any later version.
13	#
14	# This program is distributed in the hope that it will be useful,
15	# but WITHOUT ANY WARRANTY; without even the implied warranty of
16	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17	# GNU General Public License for more details.
18	#
19	# You should have received a copy of the GNU General Public License
20	# along with this program; if not, write to the Free Software
21	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22	#
23	# @(#)mg_get.sh 1.11 25 Mar 1994
24	#
25	###########################################################################
26	#
27	# "mg_get" formats the text for mgbuild by collecting the appropriate
28	# documents, and outputting them with control-B after each document.
29	# The name of the text must be supplied as an argument.
30	# mg_get is called with -i (init) first, then with -t (text) each time the
31	# text is needed, then with -c (cleanup).
32	# e.g. mg_get alice -t
33	#
34	# The file ~/.mg_getrc (or the file named by $MG_GETRC) contains lines of the form:
35	# name TYPE files and directories
36	# where name is the name supplied to mg_get, TYPE is one of PLAIN, LINESEP,
37	# PARA, BIB, MAIL, DIR, DIR2 or TXTIMG and says how mg_get should process the
38	# named files and directories
39	#
40	###########################################################################
41
# Locate the collection description file: $MG_GETRC when it is set,
# otherwise ~/.mg_getrc.
if ($?MG_GETRC) then
    set mg_getrc = $MG_GETRC
else
    set mg_getrc = ~/.mg_getrc
endif

# Added by [TS:May/95]
# If the description file does not exist yet, create a default one that
# describes the sample collections.
if (! -e $mg_getrc) then
    echo "Can not find a .mg_getrc, so creating a default one: $mg_getrc"

    # Sample data lives in $MGSAMPLE when set, otherwise in ./SampleData.
    if ($?MGSAMPLE) then
        set SampleData = $MGSAMPLE
    else
        set SampleData = ./SampleData
    endif

    # One entry per line: name, TYPE and the file list, separated by TABs.
    # printf reuses the format for each group of three arguments, and the
    # explicit \t keeps the separators from depending on invisible
    # whitespace in this script.  The ~ paths are written literally.
    printf '%s\t%s\t%s\n' \
        alice     PARA    "$SampleData/alice13a.txt.Z" \
        davinci   TXTIMG  "$SampleData/davinci" \
        mailfiles MAIL    '~/mbox ~/.sentmail' \
        allfiles  DIR     '~/Mail' \
        mods      LINESEP "$SampleData/MODIFICATIONS" \
        bible     PLAIN   "$SampleData/bible/genesis.txt.Z $SampleData/bible/revelation.txt.Z" \
        > $mg_getrc
endif

# Command used to display decoded images; "xv -" unless the caller has
# already exported MGIMAGEVIEWER.
if (! $?MGIMAGEVIEWER) then
    setenv MGIMAGEVIEWER "xv -"
endif
72
# rm is invoked through this variable later in the script.
set rm = /bin/rm

# Work out which phase was requested.  With only a collection name the
# text phase is assumed; with two arguments the second one selects the
# phase, and the short forms -i / -t / -c are normalised to
# -init / -text / -cleanup.  Anything else prints the usage line and
# exits with status 1.
set usage = 'Usage: mg_get <document> [-i | -t | -c]'

switch ($#argv)
case 1:
    set flag = '-text'
    breaksw

case 2:
    switch ("$2")
    case '-i':
    case '-init':
        set flag = '-init'
        breaksw
    case '-t':
    case '-text':
        set flag = '-text'
        breaksw
    case '-c':
    case '-cleanup':
        set flag = '-cleanup'
        breaksw
    default:
        echo "$usage"
        exit(1)
    endsw
    breaksw

default:
    echo "$usage"
    exit(1)
endsw
106
set COLLECTION = $1

# Look the collection up in the rc file.  The first field must equal the
# requested name exactly (a prefix such as "mail" must not select
# "mailfiles"), and only the first matching entry is used.  TYPE is the
# second field; FILES is every remaining field, one word each.
set TYPE  = `awk -v name="$1" '$1 == name { print $2; exit }' $mg_getrc`
set FILES = `awk -v name="$1" '$1 == name { for (i = 3; i <= NF; i++) print $i; exit }' $mg_getrc`

# No entry for this name: report it and stop before dispatching on an
# empty TYPE.
if ($#TYPE == 0) then
    echo 'Sorry, I do not know how to get' $1
    exit 1
endif

# Directory part of the path this script was started with; the mg helper
# programs are run from there.
set bindir = $0
set bindir = $bindir:h
113
# Dispatch on the collection TYPE read from the rc file.
switch ($TYPE)

case PLAIN:
    # Emits the listed files unchanged, decompressing where needed; the
    # text is expected to contain its ^B terminators already.
    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        foreach f ($FILES)
            # Pick the reader from the file extension; plain cat otherwise.
            set decoder = cat
            if ($f:e == 'Z')  set decoder = zcat
            if ($f:e == 'gz') set decoder = gunzip
            $decoder < $f
        end
        breaksw #-text

    case '-cleanup':
        breaksw
    endsw #flag
    breaksw #PLAIN
138
case LINESEP:
    # Assumes that the documents are separated by a line of =====
    # Each such separator line is replaced by a line holding only ^B.
    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        foreach f ($FILES)
            if ($f:e == 'Z') then
                set decoder = zcat
            else if ($f:e == 'gz') then
                set decoder = gunzip
            else
                set decoder = cat
            endif
            # The filter reads the decoder's output only; giving it a
            # second "< $f" would conflict with the pipe and bypass the
            # decompression.
            # NOTE(review): a separator is taken to be any line starting
            # with five '=' characters - confirm against the data files.
            $decoder < $f | awk '/^=====/ { printf "\002\n"; next } { print }'
        end
        breaksw #-text

    case '-cleanup':
        breaksw
    endsw #flag
    breaksw #LINESEP
162
case PARA:
    # Splits the file at blank lines, which corresponds to paragraphs.
    # A run of blank lines yields a single ^B, and a final ^B is written
    # at the end of each file.
    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        foreach f ($FILES)
            if ($f:e == 'Z') then
                set decoder = zcat
            else if ($f:e == 'gz') then
                set decoder = gunzip
            else
                set decoder = cat
            endif
            # b records that the previous line was blank, so consecutive
            # blank lines do not produce empty documents.  \002 is ^B.
            $decoder < $f | awk '/^ *$/ { if (b == 0) printf "\002"; b = 1; next } \
                { print; b = 0 } \
                END { printf "\002" }'
        end #foreach
        breaksw #-text

    case '-cleanup':
        breaksw
    endsw #flag
    breaksw #PARA
188
case BIB:
    # Takes a list of files that contain bibliographies, and splits them up
    # by putting ^B after each reference. Assumes that each reference
    # begins with a line '^@'.

    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        foreach f ($FILES)
            if ($f:e == 'Z') then
                set decoder = zcat
            else if ($f:e == 'gz') then
                set decoder = gunzip
            else
                set decoder = cat
            endif
            # ^B (\002) goes in front of every '@' line except one on the
            # first line of input, and once more on a line of its own at
            # the end.
            $decoder < $f | awk '/^@/ && NR > 1 { printf "\002" } \
                { print } \
                END { print "\002" }'
        end # foreach
        breaksw #-text

    case '-cleanup':
        breaksw #-cleanup
    endsw #flag
    breaksw #BIB
215
case MAIL:
    # Takes a list of files that contain mail, and splits them up
    # by putting ^B after each message. Assumes that each message
    # begins with a line '^From '.

    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        foreach f ($FILES)
            if ($f:e == 'Z') then
                set decoder = zcat
            else if ($f:e == 'gz') then
                set decoder = gunzip
            else
                set decoder = cat
            endif
            # Lines from "xbtoa Begin" through "xbtoa End" are dropped.
            # ^B (\002) goes in front of every "From " line except one on
            # the first line of input, and once more on a line of its own
            # at the end.
            $decoder < $f | awk '/xbtoa Begin/,/xbtoa End/ { next } \
                /^From / && NR > 1 { printf "\002" } \
                { print } \
                END { print "\002" }'
        end # foreach
        breaksw #-text

    case '-cleanup':
        breaksw #-cleanup
    endsw #flag
    breaksw #MAIL
242
case DIR:
    # Recursively concatenates every file in every subdirectory of the given
    # directory.  Each file is preceded by a "< name >" line and followed
    # by ^B; .gz and .Z files are decompressed on the way through.

    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        # gzip-compressed files
        find $FILES -type f -name '*.gz' -exec echo '<' {} '>' \; \
            -exec gunzip -c {} \; -exec printf '\002' \;
        # compress(1) files
        find $FILES -type f -name '*.Z' -exec echo '<' {} '>' \; \
            -exec zcat {} \; -exec printf '\002' \;
        # everything else; the patterns need the leading '*' so that the
        # compressed files handled above are not output a second time
        find $FILES -type f \! \( -name '*.gz' -o -name '*.Z' \) \
            -exec echo '<' {} '>' \; \
            -exec cat {} \; -exec printf '\002' \;
        breaksw #-text

    case '-cleanup':
        breaksw #-cleanup
    endsw #flag
    breaksw #DIR
268
269
270
case DIR2:
    # Recursively concatenates every file in every subdirectory of the given
    # directory. Does not include filename.  Each file is followed by ^B;
    # .gz and .Z files are decompressed on the way through.

    switch ($flag)
    case '-init':
        breaksw

    case '-text':
        # gzip-compressed files
        find $FILES -type f -name '*.gz' \
            -exec gunzip -c {} \; -exec printf '\002' \;
        # compress(1) files
        find $FILES -type f -name '*.Z' \
            -exec zcat {} \; -exec printf '\002' \;
        # everything else; the patterns need the leading '*' so that the
        # compressed files handled above are not output a second time
        find $FILES -type f \! \( -name '*.gz' -o -name '*.Z' \) \
            -exec cat {} \; -exec printf '\002' \;
        breaksw #-text

    case '-cleanup':
        breaksw #-cleanup
    endsw #flag
    breaksw #DIR2
294
295
296
297
case TXTIMG:
    # compress and index the collection of text and images
    # (this code is suitable for all sorts of integrated collections
    # of text and images)
    # Files that are related have the same prefix. For example,
    # monaLisa.pgm might be a gray-level image, and
    # monaLisa.txt would be a textual file describing the image.
    # The suffixes recognised are:
    #   .txt  ascii text
    #   .ptm  scanned text stored as a bilevel image
    #   .pbm  a black and white image (typically a line drawing)
    #   .pgm  a gray-scale image
    # In addition, if neither a .txt nor a .tmp file is found for an
    # image file, then one is created with suffix .tmp, and it stores
    # the name of the image file (in principle it could store the OCR
    # of a .ptm file). At present the .tmp files are deleted by the
    # '-cleanup' option.

    #### the next two 'set' statements define which directory the files
    # come from, and where they will be stored. This is the only
    # part of this code that is specific to the 'davinci' collection
    set sourceDir = $FILES
    set targetDir = $MGDATA/$COLLECTION

    switch ($flag)
    case '-init':
        # take care if no match to the glob patterns below; set before
        # the rm so that an empty target directory is not an error
        set nonomatch

        mkdir $targetDir >& /dev/null       # create the directory to store
                                            # compressed images in
        $rm -rf $targetDir/* >& /dev/null   # in case it already existed

        # process all ptm (bilevel) images of scanned text
        foreach f ($sourceDir/*.ptm)
            # with nonomatch an unmatched pattern comes back unchanged
            if ($f == "$sourceDir/*.ptm") then
                break
            endif
            set base = $f:t

            # the library file is temporary ($$ is the shell's process
            # id) and is removed once the .tic file has been written
            $bindir/mgticbuild $targetDir/$base:r.ticlib.$$ $f
            $bindir/mgticprune $targetDir/$base:r.ticlib.$$

            $bindir/mgtic -L -e $targetDir/$base:r.ticlib.$$ $f >$targetDir/$base:r.tic
            $rm $targetDir/$base:r.ticlib.$$
            set r = $f:r  #root name of file
            if (! (-e $r.txt || -e $r.tmp)) then
                # creates a file; could do OCR here to get its contents
                echo "No corresponding txt file for" $f "- creating" $r.tmp
                echo '#######' $f > $r.tmp
            endif
        end

        # process all pgm (gray scale) images
        foreach f ($sourceDir/*.pgm)
            if ($f == "$sourceDir/*.pgm") then
                break
            endif
            set base = $f:t
            $bindir/mgfelics -e $f >$targetDir/$base:r.flx
            set r = $f:r  #root name of file
            if (! (-e $r.txt || -e $r.tmp)) then
                # creates a file; could do image recognition here to create one(!)
                echo "No corresponding txt file for" $f "- creating" $r.tmp
                echo '#######' $f > $r.tmp
            endif
        end

        # process all pbm (black and white) images
        foreach f ($sourceDir/*.pbm)
            if ($f == "$sourceDir/*.pbm") then
                break
            endif
            set base = $f:t
            $bindir/mgbilevel -e $f >$targetDir/$base:r.blv
            set r = $f:r  #root name of file
            if (! (-e $r.txt || -e $r.tmp)) then
                # creates a file; could do drawing recognition here to create one(!)
                echo "No corresponding txt file for" $f "- creating" $r.tmp
                echo '####### No corresponding text file available for this image' >$r.tmp
                echo 'Original image file name was' $f >> $r.tmp
            endif
        end
        unset nonomatch
        breaksw #-init

    case '-text':
        # take care if no match to foreach statements
        set nonomatch

        # output each .txt and .tmp file, with names of other associated files
        foreach f ($sourceDir/*.txt $sourceDir/*.tmp)
            # skip a pattern that matched nothing
            if ($f == "$sourceDir/*.txt") then
                continue
            endif
            if ($f == "$sourceDir/*.tmp") then
                continue
            endif
            set r = $f:r  #root name of file
            set base = $r:t
            set first_image = 1  # set so that I print out image header once
                                 # also signifies if header printed out at all

            foreach d ($targetDir/$base.*)
                if ($d == "$targetDir/$base.*") then
                    break
                endif

                if ($first_image) then
                    set first_image = 0
                    echo '::::::::::'  #separate the image information from the text
                    echo 'Image(s) available:'
                endif

                switch ($d:e)  # work out decoding method from suffix
                case 'tic':
                    echo 'MGDATA (mgtic -d ' $d ' | '$MGIMAGEVIEWER') '
                    breaksw
                case 'flx':
                    echo 'MGDATA (mgfelics -d ' $d ' | '$MGIMAGEVIEWER')'
                    breaksw
                case 'blv':
                    echo 'MGDATA (mgbilevel -d ' $d ' | '$MGIMAGEVIEWER')'
                    breaksw
                endsw # $d:e (suffix)

            end #foreach d (associated file)

            if (! $first_image) then
                echo '::::::::::'  #separate the image information from the text
            endif

            cat $f          # output the text associated with the images
            printf '\002'   # ^B terminates the document
        end #foreach f (text file)
        unset nonomatch
        breaksw #-text

    case '-cleanup':
        # remove temporary text files; nonomatch and -f make it a no-op
        # when there are none
        set nonomatch
        $rm -f $sourceDir/*.tmp
        unset nonomatch
        breaksw #-cleanup
    endsw #flag
    breaksw #TXTIMG
441
442
default:
    # TYPE matched none of the cases above.
    echo 'Sorry, I do not know how to get' $1
    exit(1)
endsw

# The requested phase completed.
exit(0)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: