1 | // check.h -- manipulate the suffixCheck array
|
---|
2 | //
|
---|
3 | // The suffixCheck array is used to prevent a phrase being added to
|
---|
4 | // the results of getExpansions. It works by declaring an array that
|
---|
5 | // has one bit for each entry in the suffix array, which is cleared at
|
---|
6 | // the start of getExpansions. As getExpansions adds each phrase to
|
---|
7 | // the result set, we set the bits in the suffixCheck array that
|
---|
8 | // correspond to the positions of of the phrase in the suffixArray.
|
---|
9 |
|
---|
10 | // (The point is that getExpansions can later check whether a phrase
|
---|
11 | // or its prefix has been seen before by looking up the bit
|
---|
12 | // corresponding to its first occurence in the suffixCheck array: if
|
---|
13 | // the bit is set, a subphrase of this phrase is already in the
|
---|
14 | // results already, and the current phrase can be ignored.)
|
---|
15 |
|
---|
16 | #include "suffix.h"
|
---|
17 |
|
---|
18 | // The suffixCheck array
|
---|
19 | typedef unsigned char check;
|
---|
20 | check *suffixCheck;
|
---|
21 |
|
---|
22 | // The length of the suffixCheck array
|
---|
23 | cellcount checkLength;
|
---|
24 |
|
---|
25 | // Some useful bitmask constants
|
---|
26 | const check ALLBITS = 255;
|
---|
27 | const check LEFTMOSTBIT = (ALLBITS << 7);
|
---|
28 | const check LEFTMOSTBITS[] = {0,
|
---|
29 | (ALLBITS << 7), (ALLBITS << 6), (ALLBITS << 5), (ALLBITS << 4),
|
---|
30 | (ALLBITS << 3), (ALLBITS << 2), (ALLBITS << 1), ALLBITS};
|
---|
31 | //const check LEFTMOSTBIT = 128;
|
---|
32 | //const check LEFTMOSTBITS[] = {0, 128, 192, 224, 240, 248, 252, 254, 255};
|
---|
33 |
|
---|
34 |
|
---|
35 | // Allocate the initial memory for suffixCheck
|
---|
36 | inline void allocateSuffixCheck(cellcount number_of_symbols)
|
---|
37 | {
|
---|
38 | checkLength = number_of_symbols / (sizeof(check) * 8) + 1;
|
---|
39 | suffixCheck = new check[checkLength];
|
---|
40 | if (suffixCheck == NULL) {
|
---|
41 | cerr << "Suffix error: out of memory allocating suffixCheck array" << endl;
|
---|
42 | exit(2);
|
---|
43 | }
|
---|
44 | memset(suffixCheck, 0, sizeof(check) * checkLength);
|
---|
45 | }
|
---|
46 |
|
---|
47 |
|
---|
48 | // Set all bits in suffixCheck to 0
|
---|
49 | inline void clearSuffixCheck()
|
---|
50 | {
|
---|
51 | memset(suffixCheck, 0, sizeof(check) * checkLength);
|
---|
52 | }
|
---|
53 |
|
---|
54 |
|
---|
55 | // Get the value of a particular bit in suffixCheck
|
---|
56 | inline int getSuffixCheck(cellindex suff)
|
---|
57 | {
|
---|
58 | cellindex cell = suff >> 3;
|
---|
59 | check remainder = suff & 0x07; // the last 3 bits
|
---|
60 | if (suffixCheck[cell] & (LEFTMOSTBIT >> remainder)) {
|
---|
61 | return 1;
|
---|
62 | }
|
---|
63 | return 0;
|
---|
64 | }
|
---|
65 |
|
---|
66 |
|
---|
67 | // Set the value of a bit in suffixCheck to 1
|
---|
68 | inline void setSuffixCheck(cellindex suff)
|
---|
69 | {
|
---|
70 | cellindex cell = suff >> 3;
|
---|
71 | check remainder = suff & 0x07; // the last 3 bits
|
---|
72 | suffixCheck[cell] |= (LEFTMOSTBIT >> remainder);
|
---|
73 | }
|
---|
74 |
|
---|
75 |
|
---|
76 | // Set the value of a range of bits in suffixCheck to 1
|
---|
77 | void setSuffixCheck(cellindex first, cellindex last)
|
---|
78 | {
|
---|
79 | for (cellcount i = first; i <= last; ++i)
|
---|
80 | setSuffixCheck(i);
|
---|
81 | }
|
---|
82 |
|
---|
83 |
|
---|
84 | // Set the bits in a cell in the suffixArray.
|
---|
85 | // Bits are indexed 0-7.
|
---|
86 | inline void setCellBits(cellindex cell_offset, unsigned first_bit, unsigned last_bit)
|
---|
87 | {
|
---|
88 | unsigned number_of_bits = last_bit - first_bit + 1;
|
---|
89 | check mask = (LEFTMOSTBITS[number_of_bits] >> first_bit);
|
---|
90 | suffixCheck[cell_offset] |= mask;
|
---|
91 | }
|
---|
92 |
|
---|
93 |
|
---|
94 | inline void setSuffixCheck_new(cellindex first, cellindex last)
|
---|
95 | {
|
---|
96 | // If only one bit is set, use simpler function
|
---|
97 | // We should be able to remove this case.
|
---|
98 | if (first == last) {
|
---|
99 | setSuffixCheck(first);
|
---|
100 | return;
|
---|
101 | }
|
---|
102 |
|
---|
103 | // Find the first and last cells in which bits are set
|
---|
104 | cellindex first_cell = first >> 3;
|
---|
105 | cellindex last_cell = last >> 3;
|
---|
106 |
|
---|
107 | // If all the values are in the same cell, set them
|
---|
108 | if (first_cell == last_cell) {
|
---|
109 | setCellBits(first_cell, (first & 0x07), (last & 0x07));
|
---|
110 | return;
|
---|
111 | }
|
---|
112 |
|
---|
113 | // Set the bits in the first and last cells
|
---|
114 | setCellBits(first_cell, (first & 0x07), 7);
|
---|
115 | setCellBits(last_cell, 0, (last & 0x07));
|
---|
116 |
|
---|
117 | // Set the bits in the intermediate cells
|
---|
118 | ++first_cell;
|
---|
119 | if(last_cell > first_cell)
|
---|
120 | memset(suffixCheck + first_cell, ALLBITS,
|
---|
121 | (last_cell - first_cell) * sizeof(check));
|
---|
122 | }
|
---|
123 |
|
---|
124 |
|
---|
125 | // Print the suffixCheck array (for debugging)
|
---|
126 | void printSuffixCheck()
|
---|
127 | {
|
---|
128 | for (cellcount i = 0; i < checkLength; ++i) {
|
---|
129 | cout << "cell " << i << " \t";
|
---|
130 | cout << "(" << (i * 8) << "-" << (i * 8 + 7) << ") \t";
|
---|
131 | for (cellindex j = 0; j < 8; ++j) {
|
---|
132 | cout << getSuffixCheck(i * 8 + j);
|
---|
133 | // cout << " (" << (i * 8 + j) << ") ";
|
---|
134 | }
|
---|
135 | cout << " (" << (unsigned int) suffixCheck[i] << ")\n";
|
---|
136 | }
|
---|
137 | }
|
---|
138 |
|
---|
139 |
|
---|
140 | // To test, put this at the start of main
|
---|
141 |
|
---|
142 | // allocateSuffixCheck(300);
|
---|
143 | // clearSuffixCheck();
|
---|
144 | // setSuffixCheck(20);
|
---|
145 | // setSuffixCheck(35,50);
|
---|
146 | // setSuffixCheck(68,150);
|
---|
147 | // setSuffixCheck(170,174);
|
---|
148 | // setSuffixCheck(220,230);
|
---|
149 | // setSuffixCheck(247);
|
---|
150 | // setSuffixCheck(256);
|
---|
151 | // printSuffixCheck();
|
---|
152 | // exit(0);
|
---|
153 |
|
---|