source: main/trunk/greenstone2/common-src/indexers/mgpp/text/invf.cpp@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago

merged 64_bit_Greenstone branch into trunk, rev 25139

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 21.1 KB
Line 
1/**************************************************************************
2 *
3 * invf.cpp -- Data structures for inverted files
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "invf.h"
23#include "UCArray.h"
24
25
26invf_dict_header::invf_dict_header () {
27 Clear();
28}
29
30invf_dict_header::~invf_dict_header () {
31}
32
33void invf_dict_header::Clear() {
34 lookback = 0;
35 word_dict_start = 0;
36 word_dict_size = 0;
37 tag_dict_start = 0;
38 tag_dict_size = 0;
39 num_docs = 0;
40 num_frags = 0;
41 num_words = 0;
42 total_bytes = 0;
43 index_string_bytes = 0;
44 num_levels = 0;
45}
46
47bool invf_dict_header::Read (FILE *f) {
48 return (ReadUL (f, lookback) &&
49 ReadUL (f, word_dict_start) &&
50 ReadUL (f, word_dict_size) &&
51 ReadUL (f, tag_dict_start) &&
52 ReadUL (f, tag_dict_size) &&
53 ReadUL (f, num_docs) &&
54 ReadUL (f, num_frags) &&
55 ReadUL (f, num_words) &&
56 ReadUL (f, total_bytes) &&
57 ReadUL (f, index_string_bytes) &&
58 ReadUL (f, num_levels));
59}
60
61bool invf_dict_header::Write (FILE *f) const {
62 return (WriteUL (f, lookback) &&
63 WriteUL (f, word_dict_start) &&
64 WriteUL (f, word_dict_size) &&
65 WriteUL (f, tag_dict_start) &&
66 WriteUL (f, tag_dict_size) &&
67 WriteUL (f, num_docs) &&
68 WriteUL (f, num_frags) &&
69 WriteUL (f, num_words) &&
70 WriteUL (f, total_bytes) &&
71 WriteUL (f, index_string_bytes) &&
72 WriteUL (f, num_levels));
73}
74
75
76
77void dict_el::Clear () {
78 el.erase (el.begin(), el.end());
79 frag_occur = 0;
80 freq = 0;
81}
82
83bool dict_el::Read (FILE *f) {
84 return (ReadPreSufStr (f, el) &&
85 ReadUL (f, frag_occur) &&
86 ReadUL (f, freq));
87}
88
89bool dict_el::Write (FILE *f, const UCArray *lastEl) const {
90 return (WritePreSufStr (f, lastEl, el) &&
91 WriteUL (f, frag_occur) &&
92 WriteUL (f, freq));
93}
94
95
96void word_dict_el::Clear () {
97 dict_el::Clear();
98 if (levelFreqs != NULL) delete [] levelFreqs;
99 levelFreqs = NULL;
100}
101
102word_dict_el::~word_dict_el () {
103 if (levelFreqs != NULL) delete [] levelFreqs;
104}
105
106void word_dict_el::SetNumLevels (mg_u_long numLevels) {
107 if (levelFreqs != NULL) delete [] levelFreqs;
108 levelFreqs = new mg_u_long [numLevels];
109}
110
111bool word_dict_el::Read (FILE *f, mg_u_long numLevels) {
112 if (!dict_el::Read (f)) return false;
113
114 if (levelFreqs == NULL) return false;
115
116 mg_u_long i;
117 for (i=0; i<numLevels; ++i) {
118 if (!ReadUL (f, levelFreqs[i])) return false;
119 }
120
121 return true;
122}
123
124bool word_dict_el::Write (FILE *f, const UCArray *lastEl,
125 mg_u_long numLevels) const {
126 if (!dict_el::Write (f, lastEl)) return false;
127
128 if (levelFreqs == NULL) return false;
129
130 mg_u_long i;
131 for (i=0; i<numLevels; ++i) {
132 if (!WriteUL (f, levelFreqs[i])) return false;
133 }
134
135 return true;
136}
137
138
139
140block_dict_header::block_dict_header () {
141 Clear();
142}
143
144void block_dict_header::Clear () {
145 invf_dict_header::Clear();
146
147 entries_per_wblk = 0;
148 num_wblks = 0;
149 max_wblk_size = 0;
150 wblk_start = 0;
151 wblk_idx_start = 0;
152
153 entries_per_tblk = 0;
154 num_tblks = 0;
155 max_tblk_size = 0;
156 tblk_start = 0;
157 tblk_idx_start = 0;
158}
159
160bool block_dict_header::Read (FILE *f) {
161 return (invf_dict_header::Read (f) &&
162
163 ReadUL (f, entries_per_wblk) &&
164 ReadUL (f, num_wblks) &&
165 ReadUL (f, max_wblk_size) &&
166 ReadUL (f, wblk_start) &&
167 ReadUL (f, wblk_idx_start) &&
168
169 ReadUL (f, entries_per_tblk) &&
170 ReadUL (f, num_tblks) &&
171 ReadUL (f, max_tblk_size) &&
172 ReadUL (f, tblk_start) &&
173 ReadUL (f, tblk_idx_start));
174}
175
176bool block_dict_header::Write (FILE *f) const {
177 return (invf_dict_header::Write (f) &&
178
179 WriteUL (f, entries_per_wblk) &&
180 WriteUL (f, num_wblks) &&
181 WriteUL (f, max_wblk_size) &&
182 WriteUL (f, wblk_start) &&
183 WriteUL (f, wblk_idx_start) &&
184
185 WriteUL (f, entries_per_tblk) &&
186 WriteUL (f, num_tblks) &&
187 WriteUL (f, max_tblk_size) &&
188 WriteUL (f, tblk_start) &&
189 WriteUL (f, tblk_idx_start));
190}
191
192
193
194
195
196void block_dict_el::Clear () {
197 el.erase (el.begin(), el.end());
198 frag_occur = 0;
199 freq = 0;
200 invf_ptr = 0;
201}
202
203bool block_dict_el::Read (FILE *f) {
204 return (ReadPreSufStr (f, el) &&
205 ReadUL (f, frag_occur) &&
206 ReadUL (f, freq) &&
207 ReadUL (f, invf_ptr));
208}
209
210bool block_dict_el::Write (FILE *f, const UCArray *lastEl) const {
211 return (WritePreSufStr (f, lastEl, el) &&
212 WriteUL (f, frag_occur) &&
213 WriteUL (f, freq) &&
214 WriteUL (f, invf_ptr));
215}
216
217
218
219
220
221void word_block_dict_el::Clear () {
222 block_dict_el::Clear();
223 if (levelFreqs != NULL) delete [] levelFreqs;
224 levelFreqs = NULL;
225}
226
227word_block_dict_el::~word_block_dict_el () {
228 if (levelFreqs != NULL) delete [] levelFreqs;
229}
230
231void word_block_dict_el::SetNumLevels (mg_u_long numLevels) {
232 if (levelFreqs != NULL) delete [] levelFreqs;
233 levelFreqs = new mg_u_long [numLevels];
234}
235
236bool word_block_dict_el::Read (FILE *f, mg_u_long numLevels) {
237 if (!block_dict_el::Read (f)) return false;
238
239 if (levelFreqs == NULL) return false;
240
241 mg_u_long i;
242 for (i=0; i<numLevels; ++i) {
243 if (!ReadUL (f, levelFreqs[i])) return false;
244 }
245
246 return true;
247}
248
249bool word_block_dict_el::Write (FILE *f, const UCArray *lastEl,
250 mg_u_long numLevels) const {
251 if (!block_dict_el::Write (f, lastEl)) return false;
252
253 if (levelFreqs == NULL) return false;
254
255 mg_u_long i;
256 for (i=0; i<numLevels; ++i) {
257 if (!WriteUL (f, levelFreqs[i])) return false;
258 }
259
260 return true;
261}
262
263
264
265
266block_idx_info::block_idx_info () {
267 Clear ();
268}
269
270void block_idx_info::Clear () {
271 el.erase (el.begin(), el.end());
272 block_ptr = 0;
273}
274
275bool block_idx_info::Read (FILE *f) {
276 return (ReadUCArray (f, el) &&
277 ReadUL (f, block_ptr));
278}
279
280bool block_idx_info::Write (FILE *f) const {
281 return (WriteUCArray (f, el) &&
282 WriteUL (f, block_ptr));
283}
284
285
286bool ReadBlockIdx (FILE *f, block_idx &blockIdx) {
287 blockIdx.erase (blockIdx.begin(), blockIdx.end());
288
289 // read in the array size
290 mg_u_long arraySize = 0;
291 if (!ReadVarLenUL (f, arraySize)) return false;
292
293 // read in the array
294 block_idx_info bi;
295 while (arraySize > 0) {
296 if (!bi.Read (f)) return false;
297 blockIdx.push_back (bi);
298
299 --arraySize;
300 }
301
302 return true;
303}
304
305bool WriteBlockIdx (FILE *f, const block_idx &blockIdx) {
306 // write out the array size
307 if (!WriteVarLenUL (f, blockIdx.size())) return false;
308
309 block_idx::const_iterator here = blockIdx.begin();
310 block_idx::const_iterator end = blockIdx.end();
311 while (here != end) {
312 if (!(*here).Write (f)) return false;
313 ++here;
314 }
315
316 return true;
317}
318
319
320
321
322stem_idx_header::stem_idx_header () {
323 Clear ();
324}
325
326void stem_idx_header::Clear () {
327 lookback = 0;
328 dict_size = 0;
329
330 entries_per_block = 0;
331 num_blocks = 0;
332 max_block_size = 0;
333 blocks_start = 0;
334 block_idx_start = 0;
335
336 stemmer_num = 0;
337 stem_method = 0;
338}
339
340bool stem_idx_header::Read (FILE *f) {
341 return (ReadUL (f, lookback) &&
342 ReadUL (f, dict_size) &&
343
344 ReadUL (f, entries_per_block) &&
345 ReadUL (f, num_blocks) &&
346 ReadUL (f, max_block_size) &&
347 ReadUL (f, blocks_start) &&
348 ReadUL (f, block_idx_start) &&
349
350 ReadUL (f, stemmer_num) &&
351 ReadUL (f, stem_method));
352}
353
354bool stem_idx_header::Write (FILE *f) const {
355 return (WriteUL (f, lookback) &&
356 WriteUL (f, dict_size) &&
357
358 WriteUL (f, entries_per_block) &&
359 WriteUL (f, num_blocks) &&
360 WriteUL (f, max_block_size) &&
361 WriteUL (f, blocks_start) &&
362 WriteUL (f, block_idx_start) &&
363
364 WriteUL (f, stemmer_num) &&
365 WriteUL (f, stem_method));
366}
367
368
369
370stem_block_dict_el::stem_block_dict_el () {
371 Clear ();
372}
373
374void stem_block_dict_el::Clear () {
375 el.erase (el.begin(), el.end());
376 equivWords.erase (equivWords.begin(), equivWords.end());
377}
378
379bool stem_block_dict_el::Read (FILE *f) {
380 equivWords.erase (equivWords.begin(), equivWords.end());
381
382 if (!ReadPreSufStr (f, el)) return false;
383
384 // read in the array size
385 mg_u_long arraySize = 0;
386 if (!ReadVarLenUL (f, arraySize)) return false;
387
388 // read in the array
389 mg_u_long wordNum;
390 while (arraySize > 0) {
391 if (!ReadUL (f, wordNum)) return false;
392 equivWords.push_back (wordNum);
393
394 --arraySize;
395 }
396
397 return true;
398}
399
400bool stem_block_dict_el::Write (FILE *f, const UCArray *lastEl) const {
401 if (!WritePreSufStr (f, lastEl, el)) return false;
402
403 // write out the array size
404 if (!WriteVarLenUL (f, equivWords.size())) return false;
405
406 vector<mg_u_long>::const_iterator here = equivWords.begin();
407 vector<mg_u_long>::const_iterator end = equivWords.end();
408 while (here != end) {
409 if (!WriteUL (f, (*here))) return false;
410 ++here;
411 }
412
413 return true;
414}
415
416
417
418
419invf_file_header::invf_file_header () {
420 Clear ();
421}
422
423void invf_file_header::Clear () {
424 no_of_words = 0;
425 no_of_tags = 0;
426 skip_mode = SKIP_MODE_NO_SKIPS;
427 word_level_index = 0;
428
429 int i;
430 for (i=0; i<16; ++i) params[i] = 0;
431}
432
433bool invf_file_header::Read (FILE *f) {
434 if (!ReadUL (f, no_of_words) ||
435 !ReadUL (f, no_of_tags) ||
436 !ReadUL (f, skip_mode) ||
437 !ReadUL (f, word_level_index)) return false;
438
439 int i;
440 for (i=0; i<16; ++i) {
441 if (!ReadUL (f, params[i])) return false;
442 }
443
444 return true;
445}
446
447bool invf_file_header::Write (FILE *f) const {
448 if (!WriteUL (f, no_of_words) ||
449 !WriteUL (f, no_of_tags) ||
450 !WriteUL (f, skip_mode) ||
451 !WriteUL (f, word_level_index)) return false;
452
453 int i;
454 for (i=0; i<16; ++i) {
455 if (!WriteUL (f, params[i])) return false;
456 }
457
458 return true;
459}
460
461
462
463
464
465bool SearchElNum (const block_idx &bIdx,
466 mg_u_long entriesPerBlock,
467 mg_u_long elNum,
468 mg_u_long &blockIdxNum,
469 mg_u_long &blockStartElNum) {
470 blockIdxNum = 0;
471 blockStartElNum = 0;
472
473 // make sure the element number isn't out of range
474 if (elNum > bIdx.size()*entriesPerBlock) return false;
475
476 blockIdxNum = elNum / entriesPerBlock;
477 blockStartElNum = blockIdxNum * entriesPerBlock;
478
479 return true;
480}
481
482bool SearchEl (const block_idx &bIdx,
483 mg_u_long entriesPerBlock,
484 const UCArray &el,
485 mg_u_long &blockIdxNum,
486 mg_u_long &blockStartElNum) {
487 blockIdxNum = 0;
488 blockStartElNum = 0;
489
490 mg_u_long begin = 0;
491 mg_u_long bIdxEnd = bIdx.size();
492 mg_u_long end = bIdxEnd;
493 mg_u_long mid;
494 while (begin < end) {
495 mid = (begin+end)/2;
496 if (DictCompare (el, bIdx[mid].el) < 0) {
497 end = mid;
498 } else if (mid+1 < bIdxEnd &&
499 DictCompare (el, bIdx[mid+1].el) >= 0) {
500 begin = mid+1;
501 } else {
502 blockIdxNum = mid;
503 blockStartElNum = blockIdxNum * entriesPerBlock;
504 return true;
505 }
506 }
507
508 return false;
509}
510
511
512
513// use the block dictionary functions for tag entries, and word block dict
514// functions for word entries.
515
516
517bool SearchBlockDictElNum (FILE *dictFile,
518 const block_idx &bIdx,
519 mg_u_long entriesPerBlock,
520 mg_u_long dictSize,
521 mg_u_long elNum,
522 block_dict_el &dictEl) {
523 UCArrayClear (dictEl.el);
524 if (elNum >= dictSize) return false;
525
526 // find the block that contains the element
527 mg_u_long blockIdxNum, curElNum;
528 if (!SearchElNum (bIdx, entriesPerBlock, elNum,
529 blockIdxNum, curElNum))
530 return false;
531
532 // look for the block
533 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
534 do {
535 dictEl.Read (dictFile);
536 } while (curElNum++ < elNum);
537
538 return true;
539}
540
541bool SearchBlockDictEl (FILE *dictFile,
542 const block_idx &bIdx,
543 mg_u_long entriesPerBlock,
544 mg_u_long dictSize,
545 const UCArray &el,
546 block_dict_el &dictEl,
547 mg_u_long &elNum) {
548 UCArrayClear (dictEl.el);
549
550 // find the block that contains the element
551 mg_u_long blockIdxNum;
552 if (!SearchEl (bIdx, entriesPerBlock, el,
553 blockIdxNum, elNum))
554 return false;
555
556 mg_u_long blockEndElNum = elNum + entriesPerBlock;
557 if (blockEndElNum > dictSize) blockEndElNum = dictSize;
558
559 // look for the block
560 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
561 while (elNum < blockEndElNum) {
562 dictEl.Read (dictFile);
563 int res = DictCompare (dictEl.el, el);
564 if (res == 0) return true; // found it
565 else if (res > 0) break; // not here
566
567 ++elNum;
568 }
569
570 return false;
571}
572
573
574
575
576bool SearchWordBlockDictElNum (FILE *dictFile,
577 const block_idx &bIdx,
578 mg_u_long entriesPerBlock,
579 mg_u_long dictSize,
580 mg_u_long numLevels,
581 mg_u_long elNum,
582 word_block_dict_el &dictEl) {
583 UCArrayClear (dictEl.el);
584 if (elNum >= dictSize) return false;
585
586 // find the block that contains the element
587 mg_u_long blockIdxNum, curElNum;
588 if (!SearchElNum (bIdx, entriesPerBlock, elNum,
589 blockIdxNum, curElNum))
590 return false;
591
592 // look for the block
593 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
594 do {
595 dictEl.Read (dictFile, numLevels);
596 } while (curElNum++ < elNum);
597
598 return true;
599}
600
601bool SearchWordBlockDictEl (FILE *dictFile,
602 const block_idx &bIdx,
603 mg_u_long entriesPerBlock,
604 mg_u_long dictSize,
605 mg_u_long numLevels,
606 const UCArray &el,
607 word_block_dict_el &dictEl,
608 mg_u_long &elNum) {
609 UCArrayClear (dictEl.el);
610
611 // find the block that contains the element
612 mg_u_long blockIdxNum;
613 if (!SearchEl (bIdx, entriesPerBlock, el,
614 blockIdxNum, elNum))
615 return false;
616
617 mg_u_long blockEndElNum = elNum + entriesPerBlock;
618 if (blockEndElNum > dictSize) blockEndElNum = dictSize;
619
620 // look for the block
621 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
622 while (elNum < blockEndElNum) {
623 dictEl.Read (dictFile, numLevels);
624 int res = DictCompare (dictEl.el, el);
625 if (res == 0) return true; // found it
626 else if (res > 0) break; // not here
627
628 ++elNum;
629 }
630
631 return false;
632}
633
634
635
636
637bool SearchStemBlockDictElNum (FILE *dictFile,
638 const block_idx &bIdx,
639 mg_u_long entriesPerBlock,
640 mg_u_long dictSize,
641 mg_u_long elNum,
642 stem_block_dict_el &dictEl) {
643 UCArrayClear (dictEl.el);
644 if (elNum >= dictSize) return false;
645
646 // find the block that contains the element
647 mg_u_long blockIdxNum, curElNum;
648 if (!SearchElNum (bIdx, entriesPerBlock, elNum,
649 blockIdxNum, curElNum))
650 return false;
651
652 // look for the block
653 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
654 do {
655 dictEl.Read (dictFile);
656 } while (curElNum++ < elNum);
657
658 return true;
659}
660
661bool SearchStemBlockDictEl (FILE *dictFile,
662 const block_idx &bIdx,
663 mg_u_long entriesPerBlock,
664 mg_u_long dictSize,
665 const UCArray &el,
666 stem_block_dict_el &dictEl,
667 mg_u_long &elNum) {
668 UCArrayClear (dictEl.el);
669
670 // find the block that contains the element
671 mg_u_long blockIdxNum;
672 if (!SearchEl (bIdx, entriesPerBlock, el,
673 blockIdxNum, elNum))
674 return false;
675
676 mg_u_long blockEndElNum = elNum + entriesPerBlock;
677 if (blockEndElNum > dictSize) blockEndElNum = dictSize;
678
679 // look for the block
680 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
681 while (elNum < blockEndElNum) {
682 dictEl.Read (dictFile);
683 int res = DictCompare (dictEl.el, el);
684 if (res == 0) return true; // found it
685 else if (res > 0) break; // not here
686
687 ++elNum;
688 }
689
690 return false;
691}
692
693// ------------------------------------------
694// functions for partial term matching
695// ie find all words that start with xxx
696bool PartialMatchSearchWordBlockDictEl (FILE *dictFile,
697 const block_idx &bIdx,
698 mg_u_long entriesPerBlock,
699 mg_u_long dictSize,
700 mg_u_long numLevels,
701 const UCArray &el,
702 word_block_dict_el &dictEl,
703 vector<mg_u_long> &elNumList,
704 bool casefold) {
705
706 UCArrayClear (dictEl.el);
707 elNumList.erase (elNumList.begin(), elNumList.end());
708 mg_u_long elNum;
709 // find the block that contains the element
710 mg_u_long blockIdxNum;
711 // will this work??
712 if (!SearchEl (bIdx, entriesPerBlock, el,
713 blockIdxNum, elNum)) {
714 return false;
715 }
716 mg_u_long blockEndElNum = elNum + entriesPerBlock;
717 if (blockEndElNum > dictSize) blockEndElNum = dictSize;
718
719 bool still_looking = true;
720 // look for the block
721 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
722 // test each element
723 while (elNum < blockEndElNum) {
724 dictEl.Read (dictFile, numLevels);
725 if (StartsWithCasefold(dictEl.el, el)) {
726 if (casefold || StartsWith(dictEl.el, el)) {
727 elNumList.push_back(elNum);
728 }
729 still_looking=false; // we have found one now, so stop the next time we don't have a match
730 } else if (!still_looking) {
731 // we have found a match previously, and now this doesn't match
732 return true;
733 } // else keep looking
734
735 ++elNum;
736 }
737 // if we get here, we are either still searching for the first
738 // case, or we are collecting terms.
739 if (still_looking) {
740 //we haven't found a match yet, just check the next element,
741 dictEl.Read (dictFile, numLevels);
742 if (!StartsWithCasefold(dictEl.el, el)) {
743 // the first element of the next block is not a match, so there are no matches
744 return false;
745 } else {
746 if (casefold || StartsWith(dictEl.el, el)) {
747 elNumList.push_back(elNum);
748 ++elNum;
749 }
750
751 }
752 }
753 // just keep accumulating matches until there are no more
754 dictEl.Read (dictFile, numLevels);
755 while (StartsWithCasefold(dictEl.el, el)) {
756 if (casefold || StartsWith(dictEl.el, el)) {
757 elNumList.push_back(elNum);
758 }
759 ++elNum;
760 dictEl.Read (dictFile, numLevels);
761 }
762 return true;
763
764}
765
766
767//----------------------------------------------------------------
768// functions for full text browse
769
770bool NearestSearchWordBlockDictEl (FILE *dictFile,
771 const block_idx &bIdx,
772 mg_u_long entriesPerBlock,
773 mg_u_long dictSize,
774 mg_u_long numLevels,
775 const UCArray &el,
776 word_block_dict_el &dictEl,
777 mg_u_long &elNum) {
778
779 UCArrayClear (dictEl.el);
780
781 // find the block that contains the element
782 mg_u_long blockIdxNum;
783 if (!SearchEl (bIdx, entriesPerBlock, el,
784 blockIdxNum, elNum))
785 return false;
786
787 mg_u_long blockEndElNum = elNum + entriesPerBlock;
788 if (blockEndElNum > dictSize) blockEndElNum = dictSize;
789
790 // look for the block
791 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
792 while (elNum < blockEndElNum) {
793 dictEl.Read (dictFile, numLevels);
794 int res = DictCompare (el, dictEl.el); // look for the first word that is
795 // greater or equal to the el
796 if (res <= 0) {
797 return true; // found one
798 }
799
800 ++elNum;
801 }
802 // it must be the last term
803 return true;
804
805
806}
807
808
809bool SearchWordBlockDictElNumRange (FILE *dictFile,
810 const block_idx &bIdx,
811 mg_u_long entriesPerBlock,
812 mg_u_long dictSize,
813 mg_u_long numLevels,
814 mg_u_long elNum,
815 mg_u_long numWords,
816 UCArrayVector &terms) {
817
818 word_block_dict_el dictEl;
819 dictEl.SetNumLevels (numLevels);
820 UCArrayClear(dictEl.el);
821
822 terms.erase(terms.begin(), terms.end());
823
824 if (elNum >= dictSize) return false;
825
826 // find the block that contains the element
827 mg_u_long blockIdxNum, curElNum;
828 if (!SearchElNum (bIdx, entriesPerBlock, elNum,
829 blockIdxNum, curElNum))
830 return false;
831
832 mg_u_long lastElNum = elNum + numWords - 1;
833 if (lastElNum > dictSize) lastElNum = dictSize;
834
835 // look for the block
836 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
837
838 // get the first term
839 do {
840 dictEl.Read (dictFile, numLevels);
841 } while (curElNum++ < elNum);
842
843 terms.push_back(dictEl.el);
844 while (curElNum <= lastElNum ) {
845 dictEl.Read(dictFile, numLevels);
846 terms.push_back(dictEl.el);
847 ++curElNum;
848 }
849
850
851 return true;
852}
853
854// NOte: before each addition of dictEl to the array, the level freqs array
855// is deleted, as this was causing problems - generating a seg fault, I think if
856// the vector had to be reallocated or something.
857// setNumLevels has to be called each time before a read, now, to set up the level
858//freqs array. this is necessary.
859bool SearchWordBlockDictElNumRange (FILE *dictFile,
860 const block_idx &bIdx,
861 mg_u_long entriesPerBlock,
862 mg_u_long dictSize,
863 mg_u_long numLevels,
864 mg_u_long elNum,
865 mg_u_long numWords,
866 word_block_dict_el_array &terms) {
867
868 word_block_dict_el dictEl;
869 dictEl.SetNumLevels (numLevels);
870 UCArrayClear(dictEl.el);
871
872 block_dict_el elem;
873 terms.erase(terms.begin(), terms.end());
874
875 if (elNum >= dictSize) return false;
876
877 // find the block that contains the element
878 mg_u_long blockIdxNum, curElNum;
879 if (!SearchElNum (bIdx, entriesPerBlock, elNum,
880 blockIdxNum, curElNum))
881 return false;
882
883 mg_u_long lastElNum = elNum + numWords - 1;
884 if (lastElNum > dictSize) lastElNum = dictSize;
885
886 // look for the block
887 fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
888 // get the first term
889 do {
890 dictEl.Read (dictFile, numLevels);
891 } while (curElNum++ < elNum);
892
893 dictEl.levelFreqs = NULL;
894 terms.push_back(dictEl);
895
896 while (curElNum <= lastElNum ) {
897 dictEl.SetNumLevels(numLevels);
898 dictEl.Read(dictFile, numLevels);
899 dictEl.levelFreqs = NULL;
900 terms.push_back(dictEl);
901 ++curElNum;
902 }
903
904 return true;
905}
906
907
908
909
910
911
912
913
914
Note: See TracBrowser for help on using the repository browser.