1 | // check.h -- manipulate the suffixCheck array
|
---|
2 | //
|
---|
3 | // The suffixCheck array is used to prevent a phrase being added to
|
---|
4 | // the results of getExpansions. It works by declaring an array that
|
---|
5 | // has one bit for each entry in the suffix array, which is cleared at
|
---|
6 | // the start of getExpansions. As getExpansions adds each phrase to
|
---|
7 | // the result set, we set the bits in the suffixCheck array that
|
---|
8 | // correspond to the positions of of the phrase in the suffixArray.
|
---|
9 |
|
---|
10 | // (The point is that getExpansions can later check whether a phrase
|
---|
11 | // or its prefix has been seen before by looking up the bit
|
---|
12 | // corresponding to its first occurence in the suffixCheck array: if
|
---|
13 | // the bit is set, a subphrase of this phrase is already in the
|
---|
14 | // results already, and the current phrase can be ignored.)
|
---|
15 |
|
---|
16 | #include "suffix.h"
|
---|
17 |
|
---|
18 | // The suffixCheck array
|
---|
19 | #if defined (__WIN32__)
|
---|
20 | typedef unsigned char check;
|
---|
21 | #else
|
---|
22 | #include <inttypes.h>
|
---|
23 | typedef uint8_t check;
|
---|
24 | #endif
|
---|
25 |
|
---|
26 | check *suffixCheck;
|
---|
27 |
|
---|
28 | // The length of the suffixCheck array
|
---|
29 | cellcount checkLength;
|
---|
30 |
|
---|
31 | // Some useful bitmask constants
|
---|
32 | const check ALLBITS = 255;
|
---|
33 | const check LEFTMOSTBIT = static_cast<check>(ALLBITS << 7);
|
---|
34 | const check LEFTMOSTBITS[] = {static_cast<check>(ALLBITS << 8),
|
---|
35 | static_cast<check>(ALLBITS << 7),
|
---|
36 | static_cast<check>(ALLBITS << 6),
|
---|
37 | static_cast<check>(ALLBITS << 5),
|
---|
38 | static_cast<check>(ALLBITS << 4),
|
---|
39 | static_cast<check>(ALLBITS << 3),
|
---|
40 | static_cast<check>(ALLBITS << 2),
|
---|
41 | static_cast<check>(ALLBITS << 1),
|
---|
42 | static_cast<check>(ALLBITS << 0)};
|
---|
43 |
|
---|
44 |
|
---|
45 | // Allocate the initial memory for suffixCheck
|
---|
46 | inline void allocateSuffixCheck(cellcount number_of_symbols)
|
---|
47 | {
|
---|
48 | checkLength = number_of_symbols / (sizeof(check) * 8) + 1;
|
---|
49 | suffixCheck = new check[checkLength];
|
---|
50 | if (suffixCheck == NULL) {
|
---|
51 | cerr << "Suffix error: out of memory allocating suffixCheck array" << endl;
|
---|
52 | exit(2);
|
---|
53 | }
|
---|
54 | memset(suffixCheck, 0, sizeof(check) * checkLength);
|
---|
55 | }
|
---|
56 |
|
---|
57 |
|
---|
58 | // Set all bits in suffixCheck to 0
|
---|
59 | inline void clearSuffixCheck()
|
---|
60 | {
|
---|
61 | memset(suffixCheck, 0, sizeof(check) * checkLength);
|
---|
62 | }
|
---|
63 |
|
---|
64 |
|
---|
65 | // Get the value of a particular bit in suffixCheck
|
---|
66 | inline int getSuffixCheck(cellindex position)
|
---|
67 | {
|
---|
68 | cellindex cell = position >> 3;
|
---|
69 | check remainder = position & 0x07; // the last 3 bits
|
---|
70 | if (suffixCheck[cell] & (LEFTMOSTBIT >> remainder)) {
|
---|
71 | return 1;
|
---|
72 | }
|
---|
73 | return 0;
|
---|
74 | }
|
---|
75 |
|
---|
76 | // Set the value of a bit in suffixCheck to 1
|
---|
77 | inline void setSuffixCheck(cellindex suff)
|
---|
78 | {
|
---|
79 | cellindex cell = suff >> 3;
|
---|
80 | check remainder = suff & 0x07u; // the last 3 bits
|
---|
81 | suffixCheck[cell] |= (LEFTMOSTBIT >> remainder);
|
---|
82 | }
|
---|
83 |
|
---|
84 |
|
---|
85 | // Set the bits in a cell in the suffixArray.
|
---|
86 | // Bits are indexed 0-7.
|
---|
87 | inline void setCellBits(cellindex cell_offset, unsigned first_bit, unsigned last_bit)
|
---|
88 | {
|
---|
89 | unsigned number_of_bits = last_bit - first_bit + 1;
|
---|
90 | check mask = (LEFTMOSTBITS[number_of_bits] >> first_bit);
|
---|
91 | suffixCheck[cell_offset] |= mask;
|
---|
92 | }
|
---|
93 |
|
---|
94 |
|
---|
95 | inline void setSuffixCheck(cellindex first, cellindex last)
|
---|
96 | {
|
---|
97 | // Find the first and last cells in which bits are set
|
---|
98 | cellindex first_cell = first >> 3;
|
---|
99 | cellindex last_cell = last >> 3;
|
---|
100 |
|
---|
101 | // If all the values are in the same cell, set them
|
---|
102 | if (first_cell == last_cell) {
|
---|
103 | setCellBits(first_cell, (first & 0x07u), (last & 0x07u));
|
---|
104 | return;
|
---|
105 | }
|
---|
106 |
|
---|
107 | // Set the bits in the first and last cells
|
---|
108 | setCellBits(first_cell, (first & 0x07u), 7);
|
---|
109 | setCellBits(last_cell, 0, (last & 0x07u));
|
---|
110 |
|
---|
111 | // Set the bits in the intermediate cells
|
---|
112 | ++first_cell;
|
---|
113 | if(last_cell > first_cell)
|
---|
114 | memset(suffixCheck + first_cell, ALLBITS,
|
---|
115 | (last_cell - first_cell) * sizeof(check));
|
---|
116 | }
|
---|
117 |
|
---|
118 |
|
---|
119 | // Print the suffixCheck array (for debugging)
|
---|
120 | void printSuffixCheck()
|
---|
121 | {
|
---|
122 | for (cellcount i = 0; i < checkLength; ++i) {
|
---|
123 | cout << "cell " << i << " \t";
|
---|
124 | cout << "(" << (i * 8) << "-" << (i * 8 + 7) << ") \t";
|
---|
125 | for (cellindex j = 0; j < 8; ++j)
|
---|
126 | cout << getSuffixCheck(i * 8 + j);
|
---|
127 | cout << " (" << (unsigned int) suffixCheck[i] << ")\n";
|
---|
128 | }
|
---|
129 | }
|
---|
130 |
|
---|