source: trunk/indexers/mg/src/text/term_lists.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.7 KB
Line 
1/**************************************************************************
2 *
3 * term_lists.c -- creation, growth, freeing and printing of query term lists
4 * Copyright (C) 1994 Authors
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: term_lists.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24/*
25 $Log$
26 Revision 1.1 2003/02/20 21:18:24 mdewsnip
27 Addition of MG package for search and retrieval
28
29 Revision 1.1 1999/08/10 21:18:24 sjboddie
30 renamed mg-1.3d directory mg
31
32 Revision 1.2 1998/11/25 07:55:52 rjmcnab
33
34 Modified mg so that you can specify the stemmer you want
35 to use via a command line option. You specify it to
36 mg_passes during the build process. The number of the
37 stemmer that you used is stored within the inverted
38 dictionary header and the stemmed dictionary header so
39 the correct stemmer is used in later stages of building
40 and querying.
41
42 Revision 1.1 1998/11/17 09:35:43 rjmcnab
43 *** empty log message ***
44
45 * Revision 1.1 1994/10/20 03:57:07 tes
46 * I have rewritten the boolean query optimiser and abstracted out the
47 * components of the boolean query.
48 *
49 */
50
/* Revision-control identification string for this file ($Id$ keyword). */
51static char *RCSID = "$Id: term_lists.c 3745 2003-02-20 21:20:24Z mdewsnip $";
52
53#include "sysfuncs.h"
54
55#include "memlib.h"
56#include "local_strings.h"
57#include "term_lists.h"
58#include "messages.h"
59#include "stemmer.h"
60
/* File-scope term list pointer with external linkage; it starts out NULL.
 * Note that several functions below take a parameter of the same name,
 * which shadows this global inside those functions. */
61TermList *query_term_list = NULL;
62
63/* =========================================================================
64 * Function: MakeTermList
65 * Description:
66 * Input:
67 * Output:
68 * ========================================================================= */
69TermList *
70MakeTermList (int n)
71{
72 TermList *t;
73 int list_size = (n == 0 ? 1 : n); /* always allocate at least one node */
74
75 t = Xmalloc (sizeof (TermList) + (list_size - 1) * sizeof (TermEntry));
76 if (!t)
77 FatalError (1, "Unable to allocate term list");
78
79 t->num = n;
80 t->list_size = list_size;
81
82 return t;
83}
84
85/* =========================================================================
86 * Function: ResizeTermList
87 * Description:
88 * Input:
89 * Output:
90 * ========================================================================= */
91
92#define GROWTH_FACTOR 2
93#define MIN_SIZE 2
94
95static void
96ResizeTermList (TermList ** term_list)
97{
98 TermList *tl = *term_list;
99
100 if (tl->num > tl->list_size)
101 {
102 if (tl->list_size)
103 tl->list_size *= GROWTH_FACTOR;
104 else
105 tl->list_size = MIN_SIZE;
106 }
107 tl = Xrealloc (tl, sizeof (TermList) + (tl->list_size - 1) * sizeof (TermEntry));
108
109 if (!tl)
110 FatalError (1, "Unable to resize term list");
111
112 *term_list = tl;
113}
114
115/* =========================================================================
116 * Function: ConvertTermsToString
117 * Description:
118 * Convert term list into null-terminated string
119 * Input:
120 * query_term_list = term list
121 * Output:
122 * str = term string
123 * ========================================================================= */
124
125void
126ConvertTermsToString (TermList * query_term_list, char *str)
127{
128 int i = 0;
129 int total_len = 0;
130
131 /* terms_str should be preallocated */
132 if (!str)
133 return;
134
135 for (i = 0; i < query_term_list->num; i++)
136 {
137 unsigned char *word = query_term_list->TE[i].Word;
138 int len = word[0];
139 total_len += len + 1; /* +1 for space */
140 if (total_len > MAXTERMSTRLEN)
141 break;
142 strncpy (str, (char *) word + 1, len);
143 str += len;
144 if (i != (query_term_list->num) - 1)
145 {
146 *str = ' ';
147 str++; /* add space gap */
148 }
149
150 }
151 *str = '\0';
152}
153
154/* =========================================================================
155 * Function: ResetTermList
156 * Description:
157 * Input:
158 * Output:
159 * ========================================================================= */
160
161void
162ResetTermList (TermList ** tl)
163{
164 if (*tl)
165 FreeTermList (tl);
166 *tl = MakeTermList (0);
167}
168
169/* =========================================================================
170 * Function: AddTermEntry
171 * Description:
172 * Input:
173 * Output:
174 * ========================================================================= */
175
176int
177AddTermEntry (TermList ** query_term_list, TermEntry * te)
178{
179 TermList *tl = *query_term_list;
180
181 tl->num++;
182 ResizeTermList (query_term_list);
183 tl = *query_term_list;
184
185 /* copy the structure contents */
186 bcopy ((char *) te, (char *) &(tl->TE[tl->num - 1]), sizeof (TermEntry));
187
188 return tl->num - 1;
189}
190
191
192
193/* =========================================================================
194 * Function: AddTerm
195 * Description: Used in boolean parser - see bool_tree [RPAP - Feb 97: Term Frequency]
196 * Input:
197 * Output:
198 * ========================================================================= */
199
200int
201AddTerm (TermList ** query_term_list, u_char * Word, int Count, int word_num,
202 u_long count, u_long doc_count, u_long invf_ptr, u_long invf_len, int stemmer_num) /* [RPAP - Feb 97: Term Frequency] */
203{
204 int j;
205 TermList *tl = *query_term_list;
206
207 /* Look for the word in the already identified terms */
208 for (j = 0; j < tl->num; j++)
209 {
210 TermEntry *te = &(tl->TE[j]);
211 if (compare (te->Word, Word) == 0)
212 {
213 te->Count++;
214 return j;
215 }
216 }
217
218
219 {
220 /* Create a new entry in the list for the new word */
221 TermEntry te;
222
223 /* [RPAP - Feb 97: Term Frequency] */
224 te.WE.word_num = word_num;
225 te.WE.count = count;
226 te.WE.doc_count = doc_count;
227 te.WE.max_doc_count = doc_count; /* [RPAP - Jan 97: Stem Index Change] */
228 te.WE.invf_ptr = invf_ptr;
229 te.WE.invf_len = invf_len;
230 te.Count = Count;
231 te.Word = copy_string (Word);
232 if (!te.Word)
233 FatalError (1, "Could NOT create memory to add term");
234
235 /* [RPAP - Jan 97: Stem Index Change] */
236 te.Stem = copy_string (Word);
237 if (!te.Stem)
238 FatalError (1, "Could NOT create memory to add term");
239 stemmer (2, stemmer_num, te.Stem);
240
241 te.require_match = 0; /* [RJM - 07/97: Ranked Required Terms] */
242
243 return AddTermEntry (query_term_list, &te);
244 }
245
246}
247
248/* =========================================================================
249 * Function: FreeTermList
250 * Description:
251 * Input:
252 * Output:
253 * ========================================================================= */
254
255void
256FreeTermList (TermList ** the_tl)
257{
258 int j;
259 TermList *tl = *the_tl;
260
261 for (j = 0; j < tl->num; j++)
262 {
263 if (tl->TE[j].Word)
264 Xfree (tl->TE[j].Word);
265 /* [RPAP - Jan 97: Stem Index Change] */
266 if (tl->TE[j].Stem)
267 Xfree (tl->TE[j].Stem);
268 }
269 Xfree (tl);
270
271 *the_tl = NULL;
272}
273
274/* =========================================================================
275 * Function: PrintWordEntry
276 * Description:
277 * Input:
278 * Output:
279 * ========================================================================= */
280
281void
282PrintWordEntry (WordEntry * we, FILE * file)
283{
284 fprintf (file, "we->word_num = %d\n", we->word_num);
285 fprintf (file, "we->count = %ld\n", we->count);
286 fprintf (file, "we->doc_count = %ld\n", we->doc_count);
287 fprintf (file, "we->max_doc_count = %ld\n", we->max_doc_count);
288 fprintf (file, "we->invf_ptr = %ld\n", we->invf_ptr);
289 fprintf (file, "we->invf_len = %ld\n", we->invf_len);
290}
291
292/* =========================================================================
293 * Function: PrintTermEntry
294 * Description:
295 * Input:
296 * Output:
297 * ========================================================================= */
298
299void
300PrintTermEntry (TermEntry * te, FILE * file)
301{
302
303 fprintf (file, "Term Entry\n");
304 fprintf (file, "te->Count = %d\n", te->Count);
305 fprintf (file, "te->Word = %s\n", str255_to_string (te->Word, NULL));
306 if (te->Stem != NULL)
307 fprintf (file, "te->Stem = %s\n", str255_to_string (te->Stem, NULL)); /* [RPAP - Jan 97: Stem Index Change] */
308 fprintf (file, "te->require_match = %i\n", te->require_match); /* [RJM 07/97: Ranked Required Terms] */
309 PrintWordEntry (&(te->WE), file);
310
311}
312
313/* =========================================================================
314 * Function: PrintTermList
315 * Description:
316 * Input:
317 * Output:
318 * ========================================================================= */
319
320void
321PrintTermList (TermList * tl, FILE * file)
322{
323 int i;
324
325 fprintf (file, "Term List\n");
326 fprintf (file, "tl->list_size = %d\n", tl->list_size);
327 fprintf (file, "tl->num = %d\n", tl->num);
328
329 for (i = 0; i < tl->num; i++)
330 {
331 fprintf (file, "[%d]\n", i);
332 PrintTermEntry (&(tl->TE[i]), file);
333 }
334}
Note: See TracBrowser for help on using the repository browser.