1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * mgstemidxlist.c -- Text dumper for the stem indexes
|
---|
4 | * Copyright (C) 1997 Ross Peeters
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | **************************************************************************/
|
---|
21 |
|
---|
22 | #include "sysfuncs.h"
|
---|
23 |
|
---|
24 | #include "messages.h"
|
---|
25 | #include "memlib.h"
|
---|
26 | #include "local_strings.h"
|
---|
27 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
28 |
|
---|
29 | #include "mg_files.h"
|
---|
30 | #include "mg.h"
|
---|
31 | #include "invf.h"
|
---|
32 | #include "words.h"
|
---|
33 | #include "backend.h"
|
---|
34 |
|
---|
35 | void
|
---|
36 | read_3_in_4 (FILE * idbi)
|
---|
37 | {
|
---|
38 | unsigned long i;
|
---|
39 | stemmed_idx *si;
|
---|
40 | u_char *buffer;
|
---|
41 | int block = 0;
|
---|
42 |
|
---|
43 | if (!(si = Xmalloc (sizeof (stemmed_idx))))
|
---|
44 | {
|
---|
45 | return;
|
---|
46 | }
|
---|
47 |
|
---|
48 | si->MemForStemIdx = 0;
|
---|
49 |
|
---|
50 | fread (&(si->sih), sizeof (si->sih), 1, idbi);
|
---|
51 |
|
---|
52 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
53 | NTOHUL(si->sih.lookback);
|
---|
54 | NTOHUL(si->sih.block_size);
|
---|
55 | NTOHUL(si->sih.num_blocks);
|
---|
56 | NTOHUL(si->sih.blocks_start);
|
---|
57 | NTOHUL(si->sih.index_chars);
|
---|
58 | NTOHUL(si->sih.num_of_words);
|
---|
59 |
|
---|
60 | if (!(buffer = Xmalloc (si->sih.index_chars)))
|
---|
61 | {
|
---|
62 | Xfree (si);
|
---|
63 | return;
|
---|
64 | };
|
---|
65 | si->MemForStemIdx += si->sih.index_chars;
|
---|
66 |
|
---|
67 | if (!(si->index = Xmalloc (si->sih.num_blocks * sizeof (*si->index))))
|
---|
68 | {
|
---|
69 | Xfree (si);
|
---|
70 | Xfree (buffer);
|
---|
71 | return;
|
---|
72 | };
|
---|
73 | si->MemForStemIdx += si->sih.num_blocks * sizeof (*si->index);
|
---|
74 |
|
---|
75 | if (!(si->pos = Xmalloc (si->sih.num_blocks * sizeof (*si->pos))))
|
---|
76 | {
|
---|
77 | Xfree (si->index);
|
---|
78 | Xfree (si);
|
---|
79 | Xfree (buffer);
|
---|
80 | return;
|
---|
81 | };
|
---|
82 | si->MemForStemIdx += si->sih.num_blocks * sizeof (*si->pos);
|
---|
83 |
|
---|
84 | if (!(si->buffer = Xmalloc (si->sih.block_size * sizeof (*si->buffer))))
|
---|
85 | {
|
---|
86 | Xfree (buffer);
|
---|
87 | Xfree (si->index);
|
---|
88 | Xfree (si->buffer);
|
---|
89 | Xfree (si);
|
---|
90 | return;
|
---|
91 | };
|
---|
92 | si->MemForStemIdx += si->sih.block_size * sizeof (*si->buffer);
|
---|
93 |
|
---|
94 | si->active = -1;
|
---|
95 |
|
---|
96 | for (i = 0; i < si->sih.num_blocks; i++)
|
---|
97 | {
|
---|
98 | register u_char len;
|
---|
99 | si->index[i] = buffer;
|
---|
100 | len = fgetc (idbi);
|
---|
101 | *buffer++ = len;
|
---|
102 | fread (buffer, sizeof (u_char), len, idbi);
|
---|
103 | buffer += len;
|
---|
104 | fread (&si->pos[i], sizeof (*si->pos), 1, idbi);
|
---|
105 | NTOHUL(si->pos[i]); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
106 | }
|
---|
107 |
|
---|
108 | printf ("# lookback = %lu\n", si->sih.lookback);
|
---|
109 | printf ("# block_size = %lu\n", si->sih.block_size);
|
---|
110 | printf ("# num_blocks = %lu\n", si->sih.num_blocks);
|
---|
111 | printf ("# blocks_start = %lu\n", si->sih.blocks_start);
|
---|
112 | printf ("# index_chars = %lu\n", si->sih.index_chars);
|
---|
113 | printf ("# num_of_words = %lu\n", si->sih.num_of_words);
|
---|
114 |
|
---|
115 | block = 0;
|
---|
116 | while (block < si->sih.num_blocks)
|
---|
117 | {
|
---|
118 | unsigned long *first_word;
|
---|
119 | unsigned short *num_words;
|
---|
120 | unsigned short *index;
|
---|
121 | long res;
|
---|
122 | u_char *base;
|
---|
123 | int num_indexes;
|
---|
124 |
|
---|
125 | /* Read in next block */
|
---|
126 | fseek (idbi, si->pos[block] + si->sih.blocks_start, 0);
|
---|
127 | fread (si->buffer, si->sih.block_size, sizeof (u_char), idbi);
|
---|
128 | si->active = si->pos[block];
|
---|
129 |
|
---|
130 | first_word = (unsigned long *) (si->buffer);
|
---|
131 | NTOHUL(*first_word); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
132 | num_words = (unsigned short *) (first_word + 1);
|
---|
133 | NTOHUS(*num_words); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
134 | index = num_words + 1;
|
---|
135 | num_indexes = ((*num_words - 1) / si->sih.lookback) + 1;
|
---|
136 |
|
---|
137 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
138 | for (i = 0; i < num_indexes; i++)
|
---|
139 | NTOHUS(index[i]);
|
---|
140 |
|
---|
141 | base = (u_char *) (index + num_indexes);
|
---|
142 | base += index[0];
|
---|
143 |
|
---|
144 | printf ("\n# block = %d\n", block);
|
---|
145 | printf ("# first_word = %lu\n", *first_word);
|
---|
146 | printf ("# num_words = %u\n", *num_words);
|
---|
147 |
|
---|
148 | res = 0;
|
---|
149 | while (res < *num_words)
|
---|
150 | {
|
---|
151 | unsigned copy, suff;
|
---|
152 | u_char prev[MAXSTEMLEN + 1];
|
---|
153 | unsigned int num_entries, num_cases, blk;
|
---|
154 | unsigned short blk_index, offset;
|
---|
155 |
|
---|
156 | /* Read word entry */
|
---|
157 | copy = *base++;
|
---|
158 | suff = *base++;
|
---|
159 | bcopy ((char *) base, (char *) (prev + copy + 1), suff);
|
---|
160 | *prev = copy + suff;
|
---|
161 | base += suff;
|
---|
162 | bcopy ((char *) base, (char *) &num_entries, sizeof (num_entries));
|
---|
163 | base += sizeof (num_entries);
|
---|
164 | NTOHUI(num_entries); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
165 | printf ("%u \"%s\"\n", num_entries, word2str (prev));
|
---|
166 |
|
---|
167 | /* For all the PosEntries for the word... */
|
---|
168 | for (i = 0; i < num_entries; i++)
|
---|
169 | {
|
---|
170 | bcopy ((char *) base, (char *) &num_cases, sizeof (num_cases));
|
---|
171 | NTOHUI(num_cases); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
172 | base += sizeof (num_cases);
|
---|
173 | bcopy ((char *) base, (char *) &blk, sizeof (blk));
|
---|
174 | NTOHUI(blk); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
175 | base += sizeof (blk);
|
---|
176 | bcopy ((char *) base, (char *) &blk_index, sizeof (blk_index));
|
---|
177 | NTOHUS(blk_index); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
178 | base += sizeof (blk_index);
|
---|
179 | bcopy ((char *) base, (char *) &offset, sizeof (offset));
|
---|
180 | NTOHUS(offset); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
181 | base += sizeof (offset);
|
---|
182 |
|
---|
183 | printf (" -> %4u %4u %4u %4u\n", num_cases, blk, blk_index, offset);
|
---|
184 | }
|
---|
185 | res++;
|
---|
186 | }
|
---|
187 | block++;
|
---|
188 | }
|
---|
189 | fclose (idbi);
|
---|
190 | }
|
---|
191 |
|
---|
192 | int
|
---|
193 | main (int argc, char **argv)
|
---|
194 | {
|
---|
195 | FILE *idbi;
|
---|
196 | char *filename = "";
|
---|
197 | int ch;
|
---|
198 | int stem_method = 0;
|
---|
199 |
|
---|
200 | msg_prefix = argv[0];
|
---|
201 | opterr = 0;
|
---|
202 | while ((ch = getopt (argc, argv, "f:d:hs:")) != -1)
|
---|
203 | switch (ch)
|
---|
204 | {
|
---|
205 | case 'f': /* input file */
|
---|
206 | filename = optarg;
|
---|
207 | break;
|
---|
208 | case 'd':
|
---|
209 | set_basepath (optarg);
|
---|
210 | break;
|
---|
211 | case 's':
|
---|
212 | stem_method = atoi (optarg);
|
---|
213 | break;
|
---|
214 | case 'h':
|
---|
215 | case '?':
|
---|
216 | fprintf (stderr, "usage: %s [-d data directory] [-h] -s 1|2|3 -f name\n", argv[0]);
|
---|
217 | exit (1);
|
---|
218 | }
|
---|
219 |
|
---|
220 | /* Open required files */
|
---|
221 | switch (stem_method)
|
---|
222 | {
|
---|
223 | case (1):
|
---|
224 | idbi = open_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, "rb", MAGIC_STEM_1,
|
---|
225 | MG_ABORT);
|
---|
226 | break;
|
---|
227 | case (2):
|
---|
228 | idbi = open_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, "rb", MAGIC_STEM_2,
|
---|
229 | MG_ABORT);
|
---|
230 | break;
|
---|
231 | case (3):
|
---|
232 | idbi = open_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, "rb", MAGIC_STEM_3,
|
---|
233 | MG_ABORT);
|
---|
234 | break;
|
---|
235 | default:
|
---|
236 | FatalError (1, "Stem method must be 1, 2 or 3\n");
|
---|
237 | }
|
---|
238 |
|
---|
239 | if (!idbi)
|
---|
240 | FatalError (1, "Could NOT open file");
|
---|
241 |
|
---|
242 | read_3_in_4 (idbi);
|
---|
243 |
|
---|
244 | return 0;
|
---|
245 | }
|
---|