source: trunk/gsdl/src/mgpp/text/invf.cpp@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.4 KB
Line 
1/**************************************************************************
2 *
3 * invf.cpp -- Data structures for inverted files
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: invf.cpp 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "invf.h"
25#include "UCArray.h"
26
27
28invf_dict_header::invf_dict_header () {
29 Clear();
30}
31
32invf_dict_header::~invf_dict_header () {
33}
34
35void invf_dict_header::Clear() {
36 lookback = 0;
37 word_dict_start = 0;
38 word_dict_size = 0;
39 tag_dict_start = 0;
40 tag_dict_size = 0;
41 num_docs = 0;
42 num_frags = 0;
43 num_words = 0;
44 total_bytes = 0;
45 index_string_bytes = 0;
46 num_levels = 0;
47}
48
49bool invf_dict_header::Read (FILE *f) {
50 return (ReadUL (f, lookback) &&
51 ReadUL (f, word_dict_start) &&
52 ReadUL (f, word_dict_size) &&
53 ReadUL (f, tag_dict_start) &&
54 ReadUL (f, tag_dict_size) &&
55 ReadUL (f, num_docs) &&
56 ReadUL (f, num_frags) &&
57 ReadUL (f, num_words) &&
58 ReadUL (f, total_bytes) &&
59 ReadUL (f, index_string_bytes) &&
60 ReadUL (f, num_levels));
61}
62
63bool invf_dict_header::Write (FILE *f) const {
64 return (WriteUL (f, lookback) &&
65 WriteUL (f, word_dict_start) &&
66 WriteUL (f, word_dict_size) &&
67 WriteUL (f, tag_dict_start) &&
68 WriteUL (f, tag_dict_size) &&
69 WriteUL (f, num_docs) &&
70 WriteUL (f, num_frags) &&
71 WriteUL (f, num_words) &&
72 WriteUL (f, total_bytes) &&
73 WriteUL (f, index_string_bytes) &&
74 WriteUL (f, num_levels));
75}
76
77
78
79void dict_el::Clear () {
80 el.erase (el.begin(), el.end());
81 frag_occur = 0;
82 freq = 0;
83}
84
85bool dict_el::Read (FILE *f) {
86 return (ReadPreSufStr (f, el) &&
87 ReadUL (f, frag_occur) &&
88 ReadUL (f, freq));
89}
90
91bool dict_el::Write (FILE *f, const UCArray *lastEl) const {
92 return (WritePreSufStr (f, lastEl, el) &&
93 WriteUL (f, frag_occur) &&
94 WriteUL (f, freq));
95}
96
97
98void word_dict_el::Clear () {
99 dict_el::Clear();
100 if (levelFreqs != NULL) delete [] levelFreqs;
101 levelFreqs = NULL;
102}
103
104word_dict_el::~word_dict_el () {
105 if (levelFreqs != NULL) delete [] levelFreqs;
106}
107
108void word_dict_el::SetNumLevels (unsigned long numLevels) {
109 if (levelFreqs != NULL) delete [] levelFreqs;
110 levelFreqs = new unsigned long [numLevels];
111}
112
113bool word_dict_el::Read (FILE *f, unsigned long numLevels) {
114 if (!dict_el::Read (f)) return false;
115
116 if (levelFreqs == NULL) return false;
117
118 unsigned long i;
119 for (i=0; i<numLevels; i++) {
120 if (!ReadUL (f, levelFreqs[i])) return false;
121 }
122
123 return true;
124}
125
126bool word_dict_el::Write (FILE *f, const UCArray *lastEl,
127 unsigned long numLevels) const {
128 if (!dict_el::Write (f, lastEl)) return false;
129
130 if (levelFreqs == NULL) return false;
131
132 unsigned long i;
133 for (i=0; i<numLevels; i++) {
134 if (!WriteUL (f, levelFreqs[i])) return false;
135 }
136
137 return true;
138}
139
140
141
142block_dict_header::block_dict_header () {
143 Clear();
144}
145
146void block_dict_header::Clear () {
147 invf_dict_header::Clear();
148
149 entries_per_wblk = 0;
150 num_wblks = 0;
151 max_wblk_size = 0;
152 wblk_start = 0;
153 wblk_idx_start = 0;
154
155 entries_per_tblk = 0;
156 num_tblks = 0;
157 max_tblk_size = 0;
158 tblk_start = 0;
159 tblk_idx_start = 0;
160}
161
162bool block_dict_header::Read (FILE *f) {
163 return (invf_dict_header::Read (f) &&
164
165 ReadUL (f, entries_per_wblk) &&
166 ReadUL (f, num_wblks) &&
167 ReadUL (f, max_wblk_size) &&
168 ReadUL (f, wblk_start) &&
169 ReadUL (f, wblk_idx_start) &&
170
171 ReadUL (f, entries_per_tblk) &&
172 ReadUL (f, num_tblks) &&
173 ReadUL (f, max_tblk_size) &&
174 ReadUL (f, tblk_start) &&
175 ReadUL (f, tblk_idx_start));
176}
177
178bool block_dict_header::Write (FILE *f) const {
179 return (invf_dict_header::Write (f) &&
180
181 WriteUL (f, entries_per_wblk) &&
182 WriteUL (f, num_wblks) &&
183 WriteUL (f, max_wblk_size) &&
184 WriteUL (f, wblk_start) &&
185 WriteUL (f, wblk_idx_start) &&
186
187 WriteUL (f, entries_per_tblk) &&
188 WriteUL (f, num_tblks) &&
189 WriteUL (f, max_tblk_size) &&
190 WriteUL (f, tblk_start) &&
191 WriteUL (f, tblk_idx_start));
192}
193
194
195
196
197
198void block_dict_el::Clear () {
199 el.erase (el.begin(), el.end());
200 frag_occur = 0;
201 freq = 0;
202 invf_ptr = 0;
203}
204
205bool block_dict_el::Read (FILE *f) {
206 return (ReadPreSufStr (f, el) &&
207 ReadUL (f, frag_occur) &&
208 ReadUL (f, freq) &&
209 ReadUL (f, invf_ptr));
210}
211
212bool block_dict_el::Write (FILE *f, const UCArray *lastEl) const {
213 return (WritePreSufStr (f, lastEl, el) &&
214 WriteUL (f, frag_occur) &&
215 WriteUL (f, freq) &&
216 WriteUL (f, invf_ptr));
217}
218
219
220
221
222
223void word_block_dict_el::Clear () {
224 block_dict_el::Clear();
225 if (levelFreqs != NULL) delete [] levelFreqs;
226 levelFreqs = NULL;
227}
228
229word_block_dict_el::~word_block_dict_el () {
230 if (levelFreqs != NULL) delete [] levelFreqs;
231}
232
233void word_block_dict_el::SetNumLevels (unsigned long numLevels) {
234 if (levelFreqs != NULL) delete [] levelFreqs;
235 levelFreqs = new unsigned long [numLevels];
236}
237
238bool word_block_dict_el::Read (FILE *f, unsigned long numLevels) {
239 if (!block_dict_el::Read (f)) return false;
240
241 if (levelFreqs == NULL) return false;
242
243 unsigned long i;
244 for (i=0; i<numLevels; i++) {
245 if (!ReadUL (f, levelFreqs[i])) return false;
246 }
247
248 return true;
249}
250
251bool word_block_dict_el::Write (FILE *f, const UCArray *lastEl,
252 unsigned long numLevels) const {
253 if (!block_dict_el::Write (f, lastEl)) return false;
254
255 if (levelFreqs == NULL) return false;
256
257 unsigned long i;
258 for (i=0; i<numLevels; i++) {
259 if (!WriteUL (f, levelFreqs[i])) return false;
260 }
261
262 return true;
263}
264
265
266
267
268block_idx_info::block_idx_info () {
269 Clear ();
270}
271
272void block_idx_info::Clear () {
273 el.erase (el.begin(), el.end());
274 block_ptr = 0;
275}
276
277bool block_idx_info::Read (FILE *f) {
278 return (ReadUCArray (f, el) &&
279 ReadUL (f, block_ptr));
280}
281
282bool block_idx_info::Write (FILE *f) const {
283 return (WriteUCArray (f, el) &&
284 WriteUL (f, block_ptr));
285}
286
287
288bool ReadBlockIdx (FILE *f, block_idx &blockIdx) {
289 blockIdx.erase (blockIdx.begin(), blockIdx.end());
290
291 // read in the array size
292 unsigned long arraySize = 0;
293 if (!ReadVarLenUL (f, arraySize)) return false;
294
295 // read in the array
296 block_idx_info bi;
297 while (arraySize > 0) {
298 if (!bi.Read (f)) return false;
299 blockIdx.push_back (bi);
300
301 arraySize--;
302 }
303
304 return true;
305}
306
307bool WriteBlockIdx (FILE *f, const block_idx &blockIdx) {
308 // write out the array size
309 if (!WriteVarLenUL (f, blockIdx.size())) return false;
310
311 block_idx::const_iterator here = blockIdx.begin();
312 block_idx::const_iterator end = blockIdx.end();
313 while (here != end) {
314 if (!(*here).Write (f)) return false;
315 here++;
316 }
317
318 return true;
319}
320
321
322
323
324stem_idx_header::stem_idx_header () {
325 Clear ();
326}
327
328void stem_idx_header::Clear () {
329 lookback = 0;
330 dict_size = 0;
331
332 entries_per_block = 0;
333 num_blocks = 0;
334 max_block_size = 0;
335 blocks_start = 0;
336 block_idx_start = 0;
337
338 stemmer_num = 0;
339 stem_method = 0;
340}
341
342bool stem_idx_header::Read (FILE *f) {
343 return (ReadUL (f, lookback) &&
344 ReadUL (f, dict_size) &&
345
346 ReadUL (f, entries_per_block) &&
347 ReadUL (f, num_blocks) &&
348 ReadUL (f, max_block_size) &&
349 ReadUL (f, blocks_start) &&
350 ReadUL (f, block_idx_start) &&
351
352 ReadUL (f, stemmer_num) &&
353 ReadUL (f, stem_method));
354}
355
356bool stem_idx_header::Write (FILE *f) const {
357 return (WriteUL (f, lookback) &&
358 WriteUL (f, dict_size) &&
359
360 WriteUL (f, entries_per_block) &&
361 WriteUL (f, num_blocks) &&
362 WriteUL (f, max_block_size) &&
363 WriteUL (f, blocks_start) &&
364 WriteUL (f, block_idx_start) &&
365
366 WriteUL (f, stemmer_num) &&
367 WriteUL (f, stem_method));
368}
369
370
371
372stem_block_dict_el::stem_block_dict_el () {
373 Clear ();
374}
375
376void stem_block_dict_el::Clear () {
377 el.erase (el.begin(), el.end());
378 equivWords.erase (equivWords.begin(), equivWords.end());
379}
380
381bool stem_block_dict_el::Read (FILE *f) {
382 equivWords.erase (equivWords.begin(), equivWords.end());
383
384 if (!ReadPreSufStr (f, el)) return false;
385
386 // read in the array size
387 unsigned long arraySize = 0;
388 if (!ReadVarLenUL (f, arraySize)) return false;
389
390 // read in the array
391 unsigned long wordNum;
392 while (arraySize > 0) {
393 if (!ReadUL (f, wordNum)) return false;
394 equivWords.push_back (wordNum);
395
396 arraySize--;
397 }
398
399 return true;
400}
401
402bool stem_block_dict_el::Write (FILE *f, const UCArray *lastEl) const {
403 if (!WritePreSufStr (f, lastEl, el)) return false;
404
405 // write out the array size
406 if (!WriteVarLenUL (f, equivWords.size())) return false;
407
408 vector<unsigned long>::const_iterator here = equivWords.begin();
409 vector<unsigned long>::const_iterator end = equivWords.end();
410 while (here != end) {
411 if (!WriteUL (f, (*here))) return false;
412 here++;
413 }
414
415 return true;
416}
417
418
419
420
421invf_file_header::invf_file_header () {
422 Clear ();
423}
424
425void invf_file_header::Clear () {
426 no_of_words = 0;
427 no_of_tags = 0;
428 skip_mode = SKIP_MODE_NO_SKIPS;
429 word_level_index = 0;
430
431 int i;
432 for (i=0; i<16; i++) params[i] = 0;
433}
434
435bool invf_file_header::Read (FILE *f) {
436 if (!ReadUL (f, no_of_words) ||
437 !ReadUL (f, no_of_tags) ||
438 !ReadUL (f, skip_mode) ||
439 !ReadUL (f, word_level_index)) return false;
440
441 int i;
442 for (i=0; i<16; i++) {
443 if (!ReadUL (f, params[i])) return false;
444 }
445
446 return true;
447}
448
449bool invf_file_header::Write (FILE *f) const {
450 if (!WriteUL (f, no_of_words) ||
451 !WriteUL (f, no_of_tags) ||
452 !WriteUL (f, skip_mode) ||
453 !WriteUL (f, word_level_index)) return false;
454
455 int i;
456 for (i=0; i<16; i++) {
457 if (!WriteUL (f, params[i])) return false;
458 }
459
460 return true;
461}
462
463
464
465
466
467bool SearchElNum (const block_idx &bIdx,
468 unsigned long entriesPerBlock,
469 unsigned long elNum,
470 unsigned long &blockIdxNum,
471 unsigned long &blockStartElNum) {
472 blockIdxNum = 0;
473 blockStartElNum = 0;
474
475 // make sure the element number isn't out of range
476 if (elNum > bIdx.size()*entriesPerBlock) return false;
477
478 blockIdxNum = elNum / entriesPerBlock;
479 blockStartElNum = blockIdxNum * entriesPerBlock;
480
481 return true;
482}
483
484bool SearchEl (const block_idx &bIdx,
485 unsigned long entriesPerBlock,
486 const UCArray &el,
487 unsigned long &blockIdxNum,
488 unsigned long &blockStartElNum) {
489 blockIdxNum = 0;
490 blockStartElNum = 0;
491
492 unsigned long begin = 0;
493 unsigned long bIdxEnd = bIdx.size();
494 unsigned long end = bIdxEnd;
495 unsigned long mid;
496 while (begin < end) {
497 mid = (begin+end)/2;
498 if (DictCompare (el, bIdx[mid].el) < 0) {
499 end = mid;
500 } else if (mid+1 < bIdxEnd &&
501 DictCompare (el, bIdx[mid+1].el) >= 0) {
502 begin = mid+1;
503 } else {
504 blockIdxNum = mid;
505 blockStartElNum = blockIdxNum * entriesPerBlock;
506 return true;
507 }
508 }
509
510 return false;
511}
512
513
514
515
516bool SearchBlockDictElNum (FILE *dictFile,
517 const block_idx &bIdx,
518 unsigned long entriesPerBlock,
519 unsigned long dictSize,
520 unsigned long elNum,
521 block_dict_el &dictEl) {
522 UCArrayClear (dictEl.el);
523 if (elNum >= dictSize) return false;
524
525 // find the block that contains the element
526 unsigned long blockIdxNum, curElNum;
527 if (!SearchElNum (bIdx, entriesPerBlock, elNum,
528 blockIdxNum, curElNum))
529 return false;
530
531 // look for the block
532 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
533 do {
534 dictEl.Read (dictFile);
535 } while (curElNum++ < elNum);
536
537 return true;
538}
539
540bool SearchBlockDictEl (FILE *dictFile,
541 const block_idx &bIdx,
542 unsigned long entriesPerBlock,
543 unsigned long dictSize,
544 const UCArray &el,
545 block_dict_el &dictEl,
546 unsigned long &elNum) {
547 UCArrayClear (dictEl.el);
548
549 // find the block that contains the element
550 unsigned long blockIdxNum;
551 if (!SearchEl (bIdx, entriesPerBlock, el,
552 blockIdxNum, elNum))
553 return false;
554
555 unsigned long blockEndElNum = elNum + entriesPerBlock;
556 if (blockEndElNum > dictSize) blockEndElNum = dictSize;
557
558 // look for the block
559 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
560 while (elNum < blockEndElNum) {
561 dictEl.Read (dictFile);
562 int res = DictCompare (dictEl.el, el);
563 if (res == 0) return true; // found it
564 else if (res > 0) break; // not here
565
566 elNum++;
567 }
568
569 return false;
570}
571
572
573
574
575bool SearchWordBlockDictElNum (FILE *dictFile,
576 const block_idx &bIdx,
577 unsigned long entriesPerBlock,
578 unsigned long dictSize,
579 unsigned long numLevels,
580 unsigned long elNum,
581 word_block_dict_el &dictEl) {
582 UCArrayClear (dictEl.el);
583 if (elNum >= dictSize) return false;
584
585 // find the block that contains the element
586 unsigned long blockIdxNum, curElNum;
587 if (!SearchElNum (bIdx, entriesPerBlock, elNum,
588 blockIdxNum, curElNum))
589 return false;
590
591 // look for the block
592 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
593 do {
594 dictEl.Read (dictFile, numLevels);
595 } while (curElNum++ < elNum);
596
597 return true;
598}
599
600bool SearchWordBlockDictEl (FILE *dictFile,
601 const block_idx &bIdx,
602 unsigned long entriesPerBlock,
603 unsigned long dictSize,
604 unsigned long numLevels,
605 const UCArray &el,
606 word_block_dict_el &dictEl,
607 unsigned long &elNum) {
608 UCArrayClear (dictEl.el);
609
610 // find the block that contains the element
611 unsigned long blockIdxNum;
612 if (!SearchEl (bIdx, entriesPerBlock, el,
613 blockIdxNum, elNum))
614 return false;
615
616 unsigned long blockEndElNum = elNum + entriesPerBlock;
617 if (blockEndElNum > dictSize) blockEndElNum = dictSize;
618
619 // look for the block
620 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
621 while (elNum < blockEndElNum) {
622 dictEl.Read (dictFile, numLevels);
623 int res = DictCompare (dictEl.el, el);
624 if (res == 0) return true; // found it
625 else if (res > 0) break; // not here
626
627 elNum++;
628 }
629
630 return false;
631}
632
633
634
635
636bool SearchStemBlockDictElNum (FILE *dictFile,
637 const block_idx &bIdx,
638 unsigned long entriesPerBlock,
639 unsigned long dictSize,
640 unsigned long elNum,
641 stem_block_dict_el &dictEl) {
642 UCArrayClear (dictEl.el);
643 if (elNum >= dictSize) return false;
644
645 // find the block that contains the element
646 unsigned long blockIdxNum, curElNum;
647 if (!SearchElNum (bIdx, entriesPerBlock, elNum,
648 blockIdxNum, curElNum))
649 return false;
650
651 // look for the block
652 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
653 do {
654 dictEl.Read (dictFile);
655 } while (curElNum++ < elNum);
656
657 return true;
658}
659
660bool SearchStemBlockDictEl (FILE *dictFile,
661 const block_idx &bIdx,
662 unsigned long entriesPerBlock,
663 unsigned long dictSize,
664 const UCArray &el,
665 stem_block_dict_el &dictEl,
666 unsigned long &elNum) {
667 UCArrayClear (dictEl.el);
668
669 // find the block that contains the element
670 unsigned long blockIdxNum;
671 if (!SearchEl (bIdx, entriesPerBlock, el,
672 blockIdxNum, elNum))
673 return false;
674
675 unsigned long blockEndElNum = elNum + entriesPerBlock;
676 if (blockEndElNum > dictSize) blockEndElNum = dictSize;
677
678 // look for the block
679 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
680 while (elNum < blockEndElNum) {
681 dictEl.Read (dictFile);
682 int res = DictCompare (dictEl.el, el);
683 if (res == 0) return true; // found it
684 else if (res > 0) break; // not here
685
686 elNum++;
687 }
688
689 return false;
690}
Note: See TracBrowser for help on using the repository browser.