1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * term_lists.c -- creation, growth, printing and disposal of query term lists
|
---|
4 | * Copyright (C) 1994 Authors
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | * $Id: term_lists.c 3745 2003-02-20 21:20:24Z mdewsnip $
|
---|
21 | *
|
---|
22 | **************************************************************************/
|
---|
23 |
|
---|
24 | /*
|
---|
25 | $Log$
|
---|
26 | Revision 1.1 2003/02/20 21:18:24 mdewsnip
|
---|
27 | Addition of MG package for search and retrieval
|
---|
28 |
|
---|
29 | Revision 1.1 1999/08/10 21:18:24 sjboddie
|
---|
30 | renamed mg-1.3d directory mg
|
---|
31 |
|
---|
32 | Revision 1.2 1998/11/25 07:55:52 rjmcnab
|
---|
33 |
|
---|
34 | Modified mg so that you can specify the stemmer you want
|
---|
35 | to use via a command line option. You specify it to
|
---|
36 | mg_passes during the build process. The number of the
|
---|
37 | stemmer that you used is stored within the inverted
|
---|
38 | dictionary header and the stemmed dictionary header so
|
---|
39 | the correct stemmer is used in later stages of building
|
---|
40 | and querying.
|
---|
41 |
|
---|
42 | Revision 1.1 1998/11/17 09:35:43 rjmcnab
|
---|
43 | *** empty log message ***
|
---|
44 |
|
---|
45 | * Revision 1.1 1994/10/20 03:57:07 tes
|
---|
46 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
47 | * components of the boolean query.
|
---|
48 | *
|
---|
49 | */
|
---|
50 |
|
---|
/* Version-control identification string embedded in the object file.
 * Both the pointer and the characters are const: the text is a string
 * literal and must never be written through. */
static const char *const RCSID = "$Id: term_lists.c 3745 2003-02-20 21:20:24Z mdewsnip $";
|
---|
52 |
|
---|
53 | #include "sysfuncs.h"
|
---|
54 |
|
---|
55 | #include "memlib.h"
|
---|
56 | #include "local_strings.h"
|
---|
57 | #include "term_lists.h"
|
---|
58 | #include "messages.h"
|
---|
59 | #include "stemmer.h"
|
---|
60 |
|
---|
61 | TermList *query_term_list = NULL;
|
---|
62 |
|
---|
63 | /* =========================================================================
|
---|
64 | * Function: MakeTermList
|
---|
65 | * Description:
|
---|
66 | * Input:
|
---|
67 | * Output:
|
---|
68 | * ========================================================================= */
|
---|
69 | TermList *
|
---|
70 | MakeTermList (int n)
|
---|
71 | {
|
---|
72 | TermList *t;
|
---|
73 | int list_size = (n == 0 ? 1 : n); /* always allocate at least one node */
|
---|
74 |
|
---|
75 | t = Xmalloc (sizeof (TermList) + (list_size - 1) * sizeof (TermEntry));
|
---|
76 | if (!t)
|
---|
77 | FatalError (1, "Unable to allocate term list");
|
---|
78 |
|
---|
79 | t->num = n;
|
---|
80 | t->list_size = list_size;
|
---|
81 |
|
---|
82 | return t;
|
---|
83 | }
|
---|
84 |
|
---|
85 | /* =========================================================================
|
---|
86 | * Function: ResizeTermList
|
---|
87 | * Description:
|
---|
88 | * Input:
|
---|
89 | * Output:
|
---|
90 | * ========================================================================= */
|
---|
91 |
|
---|
92 | #define GROWTH_FACTOR 2
|
---|
93 | #define MIN_SIZE 2
|
---|
94 |
|
---|
95 | static void
|
---|
96 | ResizeTermList (TermList ** term_list)
|
---|
97 | {
|
---|
98 | TermList *tl = *term_list;
|
---|
99 |
|
---|
100 | if (tl->num > tl->list_size)
|
---|
101 | {
|
---|
102 | if (tl->list_size)
|
---|
103 | tl->list_size *= GROWTH_FACTOR;
|
---|
104 | else
|
---|
105 | tl->list_size = MIN_SIZE;
|
---|
106 | }
|
---|
107 | tl = Xrealloc (tl, sizeof (TermList) + (tl->list_size - 1) * sizeof (TermEntry));
|
---|
108 |
|
---|
109 | if (!tl)
|
---|
110 | FatalError (1, "Unable to resize term list");
|
---|
111 |
|
---|
112 | *term_list = tl;
|
---|
113 | }
|
---|
114 |
|
---|
115 | /* =========================================================================
|
---|
116 | * Function: ConvertTermsToString
|
---|
117 | * Description:
|
---|
118 | * Convert term list into null-terminated string
|
---|
119 | * Input:
|
---|
120 | * query_term_list = term list
|
---|
121 | * Output:
|
---|
122 | * str = term string
|
---|
123 | * ========================================================================= */
|
---|
124 |
|
---|
125 | void
|
---|
126 | ConvertTermsToString (TermList * query_term_list, char *str)
|
---|
127 | {
|
---|
128 | int i = 0;
|
---|
129 | int total_len = 0;
|
---|
130 |
|
---|
131 | /* terms_str should be preallocated */
|
---|
132 | if (!str)
|
---|
133 | return;
|
---|
134 |
|
---|
135 | for (i = 0; i < query_term_list->num; i++)
|
---|
136 | {
|
---|
137 | unsigned char *word = query_term_list->TE[i].Word;
|
---|
138 | int len = word[0];
|
---|
139 | total_len += len + 1; /* +1 for space */
|
---|
140 | if (total_len > MAXTERMSTRLEN)
|
---|
141 | break;
|
---|
142 | strncpy (str, (char *) word + 1, len);
|
---|
143 | str += len;
|
---|
144 | if (i != (query_term_list->num) - 1)
|
---|
145 | {
|
---|
146 | *str = ' ';
|
---|
147 | str++; /* add space gap */
|
---|
148 | }
|
---|
149 |
|
---|
150 | }
|
---|
151 | *str = '\0';
|
---|
152 | }
|
---|
153 |
|
---|
154 | /* =========================================================================
|
---|
155 | * Function: ResetTermList
|
---|
156 | * Description:
|
---|
157 | * Input:
|
---|
158 | * Output:
|
---|
159 | * ========================================================================= */
|
---|
160 |
|
---|
161 | void
|
---|
162 | ResetTermList (TermList ** tl)
|
---|
163 | {
|
---|
164 | if (*tl)
|
---|
165 | FreeTermList (tl);
|
---|
166 | *tl = MakeTermList (0);
|
---|
167 | }
|
---|
168 |
|
---|
169 | /* =========================================================================
|
---|
170 | * Function: AddTermEntry
|
---|
171 | * Description:
|
---|
172 | * Input:
|
---|
173 | * Output:
|
---|
174 | * ========================================================================= */
|
---|
175 |
|
---|
176 | int
|
---|
177 | AddTermEntry (TermList ** query_term_list, TermEntry * te)
|
---|
178 | {
|
---|
179 | TermList *tl = *query_term_list;
|
---|
180 |
|
---|
181 | tl->num++;
|
---|
182 | ResizeTermList (query_term_list);
|
---|
183 | tl = *query_term_list;
|
---|
184 |
|
---|
185 | /* copy the structure contents */
|
---|
186 | bcopy ((char *) te, (char *) &(tl->TE[tl->num - 1]), sizeof (TermEntry));
|
---|
187 |
|
---|
188 | return tl->num - 1;
|
---|
189 | }
|
---|
190 |
|
---|
191 |
|
---|
192 |
|
---|
193 | /* =========================================================================
|
---|
194 | * Function: AddTerm
|
---|
195 | * Description: Used in boolean parser - see bool_tree [RPAP - Feb 97: Term Frequency]
|
---|
196 | * Input:
|
---|
197 | * Output:
|
---|
198 | * ========================================================================= */
|
---|
199 |
|
---|
200 | int
|
---|
201 | AddTerm (TermList ** query_term_list, u_char * Word, int Count, int word_num,
|
---|
202 | u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len, int stemmer_num) /* [RPAP - Feb 97: Term Frequency] */
|
---|
203 | {
|
---|
204 | int j;
|
---|
205 | TermList *tl = *query_term_list;
|
---|
206 |
|
---|
207 | /* Look for the word in the already identified terms */
|
---|
208 | for (j = 0; j < tl->num; j++)
|
---|
209 | {
|
---|
210 | TermEntry *te = &(tl->TE[j]);
|
---|
211 | if (compare (te->Word, Word) == 0)
|
---|
212 | {
|
---|
213 | te->Count++;
|
---|
214 | return j;
|
---|
215 | }
|
---|
216 | }
|
---|
217 |
|
---|
218 |
|
---|
219 | {
|
---|
220 | /* Create a new entry in the list for the new word */
|
---|
221 | TermEntry te;
|
---|
222 |
|
---|
223 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
224 | te.WE.word_num = word_num;
|
---|
225 | te.WE.count = count;
|
---|
226 | te.WE.doc_count = doc_count;
|
---|
227 | te.WE.max_doc_count = doc_count; /* [RPAP - Jan 97: Stem Index Change] */
|
---|
228 | te.WE.invf_ptr = invf_ptr;
|
---|
229 | te.WE.invf_len = invf_len;
|
---|
230 | te.Count = Count;
|
---|
231 | te.Word = copy_string (Word);
|
---|
232 | if (!te.Word)
|
---|
233 | FatalError (1, "Could NOT create memory to add term");
|
---|
234 |
|
---|
235 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
236 | te.Stem = copy_string (Word);
|
---|
237 | if (!te.Stem)
|
---|
238 | FatalError (1, "Could NOT create memory to add term");
|
---|
239 | stemmer (2, stemmer_num, te.Stem);
|
---|
240 |
|
---|
241 | te.require_match = 0; /* [RJM - 07/97: Ranked Required Terms] */
|
---|
242 |
|
---|
243 | return AddTermEntry (query_term_list, &te);
|
---|
244 | }
|
---|
245 |
|
---|
246 | }
|
---|
247 |
|
---|
248 | /* =========================================================================
|
---|
249 | * Function: FreeTermList
|
---|
250 | * Description:
|
---|
251 | * Input:
|
---|
252 | * Output:
|
---|
253 | * ========================================================================= */
|
---|
254 |
|
---|
255 | void
|
---|
256 | FreeTermList (TermList ** the_tl)
|
---|
257 | {
|
---|
258 | int j;
|
---|
259 | TermList *tl = *the_tl;
|
---|
260 |
|
---|
261 | for (j = 0; j < tl->num; j++)
|
---|
262 | {
|
---|
263 | if (tl->TE[j].Word)
|
---|
264 | Xfree (tl->TE[j].Word);
|
---|
265 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
266 | if (tl->TE[j].Stem)
|
---|
267 | Xfree (tl->TE[j].Stem);
|
---|
268 | }
|
---|
269 | Xfree (tl);
|
---|
270 |
|
---|
271 | *the_tl = NULL;
|
---|
272 | }
|
---|
273 |
|
---|
274 | /* =========================================================================
|
---|
275 | * Function: PrintWordEntry
|
---|
276 | * Description:
|
---|
277 | * Input:
|
---|
278 | * Output:
|
---|
279 | * ========================================================================= */
|
---|
280 |
|
---|
281 | void
|
---|
282 | PrintWordEntry (WordEntry * we, FILE * file)
|
---|
283 | {
|
---|
284 | fprintf (file, "we->word_num = %d\n", we->word_num);
|
---|
285 | fprintf (file, "we->count = %ld\n", we->count);
|
---|
286 | fprintf (file, "we->doc_count = %ld\n", we->doc_count);
|
---|
287 | fprintf (file, "we->max_doc_count = %ld\n", we->max_doc_count);
|
---|
288 | fprintf (file, "we->invf_ptr = %ld\n", we->invf_ptr);
|
---|
289 | fprintf (file, "we->invf_len = %ld\n", we->invf_len);
|
---|
290 | }
|
---|
291 |
|
---|
292 | /* =========================================================================
|
---|
293 | * Function: PrintTermEntry
|
---|
294 | * Description:
|
---|
295 | * Input:
|
---|
296 | * Output:
|
---|
297 | * ========================================================================= */
|
---|
298 |
|
---|
299 | void
|
---|
300 | PrintTermEntry (TermEntry * te, FILE * file)
|
---|
301 | {
|
---|
302 |
|
---|
303 | fprintf (file, "Term Entry\n");
|
---|
304 | fprintf (file, "te->Count = %d\n", te->Count);
|
---|
305 | fprintf (file, "te->Word = %s\n", str255_to_string (te->Word, NULL));
|
---|
306 | if (te->Stem != NULL)
|
---|
307 | fprintf (file, "te->Stem = %s\n", str255_to_string (te->Stem, NULL)); /* [RPAP - Jan 97: Stem Index Change] */
|
---|
308 | fprintf (file, "te->require_match = %i\n", te->require_match); /* [RJM 07/97: Ranked Required Terms] */
|
---|
309 | PrintWordEntry (&(te->WE), file);
|
---|
310 |
|
---|
311 | }
|
---|
312 |
|
---|
313 | /* =========================================================================
|
---|
314 | * Function: PrintTermList
|
---|
315 | * Description:
|
---|
316 | * Input:
|
---|
317 | * Output:
|
---|
318 | * ========================================================================= */
|
---|
319 |
|
---|
320 | void
|
---|
321 | PrintTermList (TermList * tl, FILE * file)
|
---|
322 | {
|
---|
323 | int i;
|
---|
324 |
|
---|
325 | fprintf (file, "Term List\n");
|
---|
326 | fprintf (file, "tl->list_size = %d\n", tl->list_size);
|
---|
327 | fprintf (file, "tl->num = %d\n", tl->num);
|
---|
328 |
|
---|
329 | for (i = 0; i < tl->num; i++)
|
---|
330 | {
|
---|
331 | fprintf (file, "[%d]\n", i);
|
---|
332 | PrintTermEntry (&(tl->TE[i]), file);
|
---|
333 | }
|
---|
334 | }
|
---|