root/main/trunk/greenstone2/build-src/src/phind/generate/suffix.cpp @ 26158

Revision 26158, 31.7 KB (checked in by ak19, 8 years ago)

Bugfix. Many thanks to Anthony Blake. The phind browse classifier used to fail on certain combinations of documents on 64-bit Linux. The cause was a bug in suffix.exe that produced a seg fault: on 64-bit machines all pointers are 64 bits wide, but the suffixCompare() and prefixCompare() functions converted their void* arguments (which are really pointers to pointers) by first casting them to symbol*, then dereferencing to obtain a symbol value (ultimately a 32-bit unsigned int), and finally casting that value back to a pointer. The values being read are in fact addresses, so in this chain of conversions a 64-bit pointer was truncated to 32 bits before being given 64 bits of storage again. On 32-bit machines, where pointers are 32 bits, the code happened to work; on 64-bit machines it crashed. The solution is a double pointer cast: since the data behind the void* arguments are actually pointers to pointers, cast them directly to the specific pointer-to-pointer type needed. Many thanks to Anthony Blake, who initially helped work out why gdb wasn't loading the symbols even though suffix.cpp had been compiled with the -g flag (while attempting to debug by printing the values of the dereferenced pointers where things appeared to go wrong, or at least stepping through the code). He went on to find the actual bug and that the conversion to double pointers was the change needed for the code to work on 64-bit machines while still working on 32-bit machines. The phind classifier has now been tested on both 64-bit and 32-bit machines, and Anthony confirmed the issue is resolved: although the bug struck seemingly at random, since only certain document combinations in collections exposed the underlying error via a seg fault, he inspected the assembly instructions in gdb and verified that the move operations are now correct, which they had not been before.
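In outline, the wrong and the corrected casts look like this (a minimal sketch rather than the exact Greenstone code; the symbol typedef below is an assumption standing in for the real definition in suffix.h):

    typedef unsigned int symbol;   // assumed here: symbols are 32 bit unsigned ints

    // qsort() passes the comparison function pointers TO the array elements.
    // suffixArray's elements are themselves pointers (symbol *), so the void*
    // arguments are really symbol** and must be cast as double pointers.
    int suffixCompare(const void *cpa, const void *cpb) {
      // WRONG (truncates on 64 bit): cast to symbol*, dereference a 32 bit
      // symbol value, then reinterpret that 32 bit value as a 64 bit pointer.
      //   symbol *pa = (symbol *) cpa;
      //   pa = (symbol *) *pa;

      // RIGHT on both 32 bit and 64 bit: one double-pointer cast recovers the
      // stored address intact.
      symbol *pa = *((symbol **) cpa);
      symbol *pb = *((symbol **) cpb);

      while (*pa == *pb) { ++pa; ++pb; }   // skip the symbols the suffixes share
      return (*pa < *pb) ? -1 : +1;
    }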

  • Property svn:keywords set to Author Date Id Revision
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 *               and prefix arrays (cgn & gwp's simpler algorithm,
5 *               and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software from the
11 * New Zealand Digital Library Project at the University of Waikato,
12 * New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License as
16 * published by the Free Software Foundation; either version 2 of
17 * the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37#  include <fstream.h>
38#  include <iostream.h>
39#else
40#  include <fstream>
41#  include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45#  if defined(GSDL_USE_ALGO_H)
46#    include <algo.h>
47#  else
48#    include <algorithm.h>
49#  endif
50#  include <vector.h>
51#else
52#  include <algorithm>
53#  include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58#include "check.h"
59#include "mglong.h"
60
61// Global variables declared in suffix.h
62cellcount inputLength;
63
64symbol   *symbols;
65symbol  **suffixArray;
66symbol  **prefixArray;
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol  **documentArray;
71
72
73// Do we accept any phrase, or do we eliminate those ending with stopwords ?
74int phraseMode = ANYPHRASE; //STOPWORDS;
75// What is the minimum phrase frequency for a phrase to be included in the hierarchy
76int minOccurs = 2;
77
78// The filestem of the collection's phindex directory
79char collection[FILENAME_MAX];
80
81
82// The ranges of the stopword and content-word symbols for the collection
83symbol firstStopSymbol = 0;
84symbol lastStopSymbol = 0;
85symbol firstContentSymbol = 0;
86symbol lastContentSymbol = 0;
87
88
89// Some useful comparison functions, defined below.
90int suffixCompare(const void *, const void *);
91int prefixCompare(const void *, const void *);
92int pointerCompare(const void *, const void *);
93
94                                           
95// Functions for implementing "phrase memory".  These let us "remember"
96// each phrase that we've expanded without using too much memory.
97void initialisePhraseMemory();
98void rememberThisPhrase(cellindex index, cellcount length);
99bool isPhraseStored(cellindex index, cellcount length);
100void deletePhraseMemory();
101
102
103// how much output do we want?
104int verbosity = 1;
105
106
107// Get a phrase's expansions
108//
109// Get the set of "minimal", "maximal", non-unique expansions of a
110// phrase p, using the simpler algorithm that Craig and Gordon came up
111// with at Google.
112//
113// Returns a vector of Expansions.
114
115void getExpansions(Phrase &p, vector<Phrase> &results) {
116
117  // 1. Get the initial candidates
118  vector<Phrase> candidates;
119  p.initialSuffixCandidates(candidates);
120  int suffcands = candidates.size();
121  p.initialPrefixCandidates(candidates);
122
123  if (candidates.size() == 0)
124    return;
125
126  vector<Phrase>::iterator i;
127  for (i = candidates.begin(); i != candidates.end() && suffcands>0; ++i, --suffcands) {
128    i->expandWhileUniquePrefixExtension();
129    i->ensureSuffixFound();
130  }
131  for ( ; i != candidates.end(); ++i) {
132    i->expandWhileUniqueSuffixExtension();
133  }
134
135  // 3. Ensure minimality: ignore phrases whose subphrases are also found
136  results.clear();
137
138  // Initialise the candidates, check array, and various variables.
139  sort(candidates.begin(), candidates.end(), isShorter);
140  unsigned minimum_length = candidates.begin()->length;
141  clearSuffixCheck();
142 
143  // Try to add each candidate to the results set, ignoring the non-minimal
144  for (vector<Phrase>::iterator candidate = candidates.begin();
145       candidate != candidates.end(); ++candidate) {
146
147    // Make a copy of candidate to mutilate while performing sub-phrase checks
148    Phrase temp_phrase(*candidate);
149    bool shorter_found = false;
150   
151    // Check for shorter and shorter versions of the temporary phrase
152    while (temp_phrase.length >= minimum_length && !shorter_found) {
153      temp_phrase.ensureSuffixFound();
154      if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
155    temp_phrase.shortenByOneAtPrefix();
156      else
157    shorter_found = true;
158
159      // Possible efficiency here: we can finish if the prefix of candidate
160      // and temp_phrase are the same for candidate->length symbols.
161    }
162     
163    // If no shorter phrase is found, use this one
164    if (!shorter_found) {
165      results.push_back(*candidate);
166      candidate->ensureSuffixFound();
167      setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
168    }
169  }
170}
171
172
173// suffixCompare
174//
175// Compare two pointers into a suffix array.  We use this in the
176// qsort function, so the inputs are pointers to pointers.
177//
178// Return -1 if (a < b); otherwise (a > b), so return +1.
179
180int suffixCompare(const void *cpa, const void *cpb) {
181
182  // The following is a bug that at times results in a segmentation fault on 64 bit linux machines.
183  // The bug is in the 3rd and 4th lines, which perform identical operations. Considering the 3rd line:
184  // the *pa dereference reads only a 32 bit symbol (unsigned int) value, so the statement truncates pa,
185  // the 64 bit pointer (symbol*), to 32 bits before converting that value back to a pointer, which is
186  // always 64 bit on 64 bit machines. The 4th line has the same problem, and so does prefixCompare().
187
188  // WRONG: Cast then dereference pointers to suffix array elements
189  //symbol *pa = (symbol *) cpa;
190  //symbol *pb = (symbol *) cpb;
191  //pa = (symbol *) *pa;
192  //pb = (symbol *) *pb;
193
194  // This is the correct way, and it works on both 64 bit and 32 bit machines:
195  // the two void* arguments to this function, cpa and cpb, are "pointers to elements" as per
196  // http://www.cplusplus.com/reference/clibrary/cstdlib/qsort/. The array elements are themselves
197  // *pointers* to symbol values. As the comment above this function says, cpa and cpb are
198  // pointers to pointers, i.e. double pointers. When we cast the void* arguments cpa and cpb to
199  // the types we need to work with here, they must therefore be cast as double pointers to symbol.
200  // pa and pb are declared as pointers (symbol*) to the actual values being compared, so each is
201  // initialised with the address stored in the element that cpa or cpb points to.
202  symbol *pa = *((symbol **) cpa);
203  symbol *pb = *((symbol **) cpb);
204 
205
206  // If the two elements are the same, examine the next one
207  while (*pa == *pb) {
208    ++pa;
209    ++pb;
210  }
211
212  // Make the comparison and return
213  if ( *pa < *pb) {
214    return -1;
215  } else {
216    return +1;
217  }
218}
219
220
221// prefixCompare
222//
223// Compare two pointers into a prefix array.  We use this in the
224// qsort function, so the inputs are pointers to pointers.
225//
226// Return -1 if (a > b); otherwise (a < b), so return +1.
227
228int prefixCompare(const void *cpa, const void *cpb) {
229
230  // Cast and dereference double pointers (pointers to prefix array elements)
231  // See comments in suffixCompare()
232  symbol *pa = *((symbol **) cpa);
233  symbol *pb = *((symbol **) cpb);
234
235  // If the two elements are the same, examine the next one
236  while (*pa == *pb) {
237    --pa;
238    --pb;
239  }
240
241  // Make the comparison and return
242  if ( *pa > *pb) {
243    return -1;
244  } else {
245    return +1;
246  }
247}
248
249// pointerCompare
250//
251// Compare two pointers based on the memory location they point to.
252
253int pointerCompare( const void *pa, const void *pb ) {
254
255  symbol **a = (symbol **) pa;
256  symbol **b = (symbol **) pb;
257
258  if (*a < *b) {
259    return  -1;
260  } else if (*a > *b) {
261    return 1;
262  } else {
263    return 0;
264  }
265}
266
267
268// Read the clauses.numbers file into the "symbols" array.
269//
270// Each number in the file is a symbol number; it is essential that
271// the first symbol (and no others) be COLLECTIONSTART and the last
272// symbol (and no others) be COLLECTIONEND.
273//
274// Return the number of numbers in the array.
275
276int readNumbers() {
277
278  char filename[FILENAME_MAX];
279  sprintf(filename, "%s/clauses.numbers", collection);
280  if (verbosity) {
281    cout << "Reading numbers file: " << filename << endl;
282  }
283
284  // Open the numbers file
285  ifstream inFile1(filename, ios::in);
286  if (!inFile1) {
287    cerr << "File " << filename << " could not be opened\n";
288    exit(1);
289  }
290
291  // Count the number of symbols
292  inputLength = 0;
293  symbol word;
294  while (inFile1 >> word) {
295    ++inputLength;
296  }
297  inFile1.close();
298
299  // Allocate the symbols array
300  if (verbosity > 1) {
301    cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
302  }
303  symbols = new symbol[inputLength];
304  if (symbols == NULL) {
305    cerr << "Suffix error: not enough memory to hold " << inputLength
306     << " symbols." << endl;
307    exit(2);
308  }
309
310  // Read the numbers file into the numbers array
311  if (verbosity > 2) {
312    cout << "Reading the numbers" << endl;
313  }
314
315  ifstream inFile2(filename, ios::in);
316  if (!inFile2) {
317    cerr << "File " << filename << " could not be opened\n";
318    exit(1);
319  }
320  cellcount next = 0;
321  numberOfDocuments = 0;
322  while (inFile2 >> word) {
323    symbols[next++] = word;
324    if (word == DOCUMENTSTART) {
325      ++numberOfDocuments;
326    }
327  }
328  inFile2.close();
329 
330  // Make sure the numbers file is intact
331  assert(symbols[0] == COLLECTIONSTART);
332  assert(symbols[next-1] == COLLECTIONEND);
333
334  return inputLength;
335}
336
337
338
339// Get Document Occurrence statistics
340//
341// Given a phrase, what documents does it occur in?
342
343cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
344
345  // cout << "searching for \""<< p << "\" in documents "
346  //      << 0 << "-" << numberOfDocuments - 1 << endl;
347
348  // The number of documents in which this phrase occurs
349  cellcount df = 0;
350
351  // Initialise the document frequency array
352  //  for (cellindex i = 0; i < numberOfDocuments; ++i) {
353  // frequency[i] = 0;
354  //}
355  memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
356
357  // variables used to facilitate the search
358  cellindex begin;
359  cellindex end;
360  cellindex d;
361  symbol *target;
362  bool found;
363
364  // search for the document in which each occurrence of the phrase is found
365  for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; ++j) {
366   
367    // cout << "looking for phrase at suffixArray[" << j << "]\n";
368   
369    target = suffixArray[j];
370    begin = 0;
371    end = numberOfDocuments - 1;
372    found = false;
373
374    // Search for the occurrence of a document delimiter that target
375    // occurs immediately after. 
376    // We do this by performing a binary chop search on documentArray.
377    while (!found) {
378
379      // cout << "searching for " << (cellindex) target << " in "
380      //      << begin << " - " << end << endl;
381
382      assert (begin <= end);
383
384      // If the beginning and end of the interval are the same,
385      // then we've found the correct document
386      if (begin == end) {
387    if (frequency[begin] == 0) {
388      ++df;
389    }
390    ++frequency[begin];
391    found = true;
392      }
393
394      // Otherwise, examine a new document midway through the begin-end
395      // interval and see if it is the one.
396      else {
397    d = (begin + end) / 2;
398    if (target > documentArray[d]) {
399      // If the target address is greater than this, but this is the last document,
400      // then this must be the one we want.  Or, if target is greater than
401      // this one but less than the next, this must be the one we want.
402      if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
403        if (frequency[d] == 0) {
404          ++df;
405        }
406        ++frequency[d];
407        found = true;
408      } else { 
409        // otherwise we know to search later in the document set
410        begin = d + 1;
411      }
412    } else {
413      // search earlier in the document set
414      end = d - 1;
415    }
416      }
417    }
418  }
419  return df;
420}
421
422
423
424
425
426
427// phraseExpansionMemory : Which phrases have we expanded?
428//
429// A set of utilities for keeping track of which phrases we have expanded.
430// We don't want to expand a phrase more than once, after all.
431//
432// This REALLY ought to be in its own class, but it works so that's okay.
433//
434// Phrases are identified by their firstSuffixPosition and length.
435//
436// Functions provided are:
437//       void initialisePhraseMemory()
438//       void rememberThisPhrase(index, length)
439//       bool isPhraseStored(index, length)
440//       void deletePhraseMemory()
441//
442// Internally, we will have two separate cases:
443//
444// Phrases of length 1-8:
445//       unsigned char phraseMemory[inputLength]
446// is an array where each cell "remembers" the corresponding index in the
447// suffixArray, and each of the 8 bits of the cell correspond to the phrases
448// of length 1, 2... 8. 
449// Eventually, we will make this disk-based (i.e. store the array in a file).
450//
451// Phrases of length 9+:
452//       file hashTableFile
453//       file listOfEntries
454// The first file is a hash table; each phrase maps to one of its cells, which
455// contains either 0 (empty, no occurrence) or a number which is an entry number
456// in the second file.  This file contains a "list" of entries.  Each consists of
457// three numbers: the suffixArray index of the phrase, the length of the phrase,
458// and the entry number of the next phrase with the same hash.
459//
460
461
462unsigned char *phraseMemory;
463
464void initialiseLongPhraseMemory();
465void rememberThisLongPhrase(cellindex index, cellcount length);
466bool isLongPhraseStored(cellindex index, cellcount length);
467void deleteLongPhraseMemory();
468
469
470void initialisePhraseMemory() {
471
472  phraseMemory = new unsigned char[inputLength];
473
474  // to begin with, everything is empty
475  //  for (cellcount i = 0; i < inputLength; ++i) {
476  // phraseMemory[i] = 0;
477  //}
478  memset(phraseMemory, 0, sizeof(char)*inputLength);
479 
480  // initialise the hashTable of long phrases
481  initialiseLongPhraseMemory();
482
483}
484
485void rememberThisPhrase(cellindex index, cellcount length) {
486
487  // if the phrase is very long, use the file-based system
488  if (length > 8) {
489    rememberThisLongPhrase(index, length);
490    return;
491  }
492
493  // create a char with just the bit corresponding to length set
494  unsigned char newbit = 1;
495  for (cellcount i = 1; i < length; ++i) {
496    newbit <<= 1;
497  }
498
499  // set that bit in the memory array at position index
500  phraseMemory[index] |= newbit;
501}
502
503
504bool isPhraseStored(cellindex index, cellcount length) {
505
506  // if the phrase is very long, use the file-based system
507  if (length > 8) {
508    return isLongPhraseStored(index, length);
509  }
510
511  // create a char with just the bit corresponding to length set
512  unsigned char newbit = 1;
513  for (cellcount i = 1; i < length; ++i) {
514    newbit <<= 1;
515  }
516
517  // return true if that bit is set in the memory array at position index
518  return (phraseMemory[index] & newbit);
519}
520
521void deletePhraseMemory() {
522  delete []phraseMemory;
523  deleteLongPhraseMemory();
524}
525
526
527
528// Files etc used to store "long" equivalents of the above
529
530fstream hashTableFile;
531char    hashTableFileName[FILENAME_MAX];
532fstream listOfEntries;
533char    listOfEntriesName[FILENAME_MAX];
534cellindex nextEntryNumber;
535
536const cellcount bigPrime = 7919;
537
538
539void initialiseLongPhraseMemory() {
540
541  cellindex example = 0;
542
543  sprintf(hashTableFileName, "%s/hashTable", collection);
544  sprintf(listOfEntriesName, "%s/hashLists", collection);
545
546
547  // create the new hashtable
548  if (verbosity > 1) {
549    cout << "Initialising hashTable: " << hashTableFileName << endl;
550  }
551  hashTableFile.open(hashTableFileName, ios::in | ios::out);
552  for (cellcount i = 0; i < bigPrime; ++i) {
553    hashTableFile.write((char *) &example, sizeof(example));
554  }
555
556  // create the list of phrases
557  if (verbosity > 1) {
558    cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
559  }
560  listOfEntries.open(listOfEntriesName, ios::in | ios::out);
561  listOfEntries.write((char *) &example, sizeof(example));
562  listOfEntries.write((char *) &example, sizeof(example));
563  listOfEntries.write((char *) &example, sizeof(example));
564  nextEntryNumber = 1;
565}
566
567
568void rememberThisLongPhrase(cellindex index, cellcount length) {
569
570  // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
571
572  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
573  cellindex pointer;
574  cellindex zero = 0;
575  cellindex readp = 0;
576  cellindex readi = 0;
577  cellindex readl = 0;
578
579  hashTableFile.seekg(hashOffset);
580  hashTableFile.read((char *) &pointer, sizeof(cellindex));
581
582  if (pointer == 0) {
583    // There is no entry at all in the hash table for this entry
584    // so create one
585
586    pointer = nextEntryNumber++;
587    hashTableFile.seekg(hashOffset);
588    hashTableFile.write((char *) &pointer, sizeof(cellindex));
589       
590    listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
591    listOfEntries.write((char *) &zero, sizeof(cellindex));
592    listOfEntries.write((char *) &index, sizeof(cellindex));
593    listOfEntries.write((char *) &length, sizeof(cellindex));
594
595  } else {
596    // There is a list starting at this hash value, so the phrase may
597    // be already remembered, or it might need to be appended
598   
599    while (pointer != 0) {
600      // Read the entry pointed to by pointer
601      listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
602      listOfEntries.read((char *) &readp, sizeof(cellindex));
603      listOfEntries.read((char *) &readi, sizeof(cellindex));
604      listOfEntries.read((char *) &readl, sizeof(cellindex));
605
606      // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
607
608      if ((readi == index) && (readl == length)) {
609    // we've found that we've already stored it
610    return;
611      } else if (readp == 0) {
612    // we've reached the end of the list.  Add a new entry.
613    listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
614    listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
615    pointer = nextEntryNumber++;
616
617    listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
618    listOfEntries.write((char *) &zero, sizeof(cellindex));
619    listOfEntries.write((char *) &index, sizeof(cellindex));
620    listOfEntries.write((char *) &length, sizeof(cellindex));
621    return;
622      } else {
623    // go on to the next node
624    pointer = readp;
625      }
626    }
627  }
628
629
630}
631
632
633bool isLongPhraseStored(cellindex index, cellcount length) {
634
635  // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
636
637  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
638  cellindex pointer;
639  cellindex readp = 0;
640  cellindex readi = 0;
641  cellindex readl = 0;
642
643  // Find the phrase in the hashFile
644  hashTableFile.seekg(hashOffset);
645  hashTableFile.read((char *) &pointer, sizeof(cellindex));
646
647  if (pointer == 0) {
648    // There is no entry at all in the hash table for this entry
649    // so nothing is stored
650    return false;
651
652  } else {
653    // There is a list starting at this hash value, so the phrase may
654    // be already remembered in that list
655    while (pointer != 0) {
656      // Read the entry pointed to by pointer
657      listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
658      listOfEntries.read((char *) &readp, sizeof(cellindex));
659      listOfEntries.read((char *) &readi, sizeof(cellindex));
660      listOfEntries.read((char *) &readl, sizeof(cellindex));
661
662      if ((readi == index) && (readl == length)) {
663    // we've found the phrase stored here
664    return true;
665      } else {
666    // go on to the next node
667    pointer = readp;
668      }
669    }
670  }
671  return false;
672}
673
674
675void deleteLongPhraseMemory() {
676  // remove the hash & other files
677
678  hashTableFile.close();
679  listOfEntries.close();
680  remove(hashTableFileName);
681  remove(listOfEntriesName);
682
683}
684
685
686// Read the collection statistics file
687//
688void readStatistics() {
689
690  // open the statistics file
691  char filename[FILENAME_MAX];
692  sprintf(filename, "%s/clauses.stats", collection);
693
694  // Open the file
695  ifstream inFile(filename, ios::in);
696  if (!inFile) {
697    cerr << "File " << filename << " could not be opened\n";
698    exit(1);
699  }
700
701  // Read the key/value pairs from the statistics file
702  char key[1000];
703  symbol value;
704  while (inFile >> key >> value){
705    if (strcmp(key, "first_stopword") == 0) {
706      firstStopSymbol = value;
707    } else if (strcmp(key, "last_stopword") == 0) {
708      lastStopSymbol = value;
709    } else if (strcmp(key, "first_contentword") == 0) {
710      firstContentSymbol = value;
711    } else if (strcmp(key, "last_contentword") == 0) {
712      lastContentSymbol = value;
713    }
714  }
715  inFile.close();
716
717  // Make sure we have the information we need
718  if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
719    cerr << "Statistics file incomplete" << endl;
720    exit(1);
721  }
722}
723
724cellcount getContentCount(symbol firstContent) {
725
726  cellcount content=0;
727  for (cellcount i=0; i<inputLength; ++i) {
728    if (symbols[i]>=firstContent) ++content;
729  }
730
731  return content;
732}
733
734
735int main (int argc, char * argv[]) {
736
737  // Command-line arguments
738  // argv[1] is the phindex directory
739  // argv[2] is the mode, where 1 is stopword mode
740  // argv[3] is min_occurs, the minimum occurrence frequency for a phrase to be included in the hierarchy
741  // argv[4] is the optional verbosity
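  // For example (an illustrative invocation; the directory path is hypothetical):
  //   suffix collect/mycol/phindex 1 2 1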
742  if (argc < 4) {
743    cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
744    exit(1);
745  }
746
747  // collection directory
748  strcpy(collection, argv[1]);
749
750  // mode parameter
751  phraseMode = atoi(argv[2]);
752  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
753
754  minOccurs = atoi(argv[3]);
755  assert((minOccurs > 0));
756
757  // optional verbosity parameter
758  if (argc == 5) {
759    verbosity = atoi(argv[4]);
760    assert (verbosity >= 0);
761  }
762
763  if (verbosity) {
764    cout << "suffix: the phrase extraction program" << endl;
765  }
766  if (verbosity > 1) {
767    if (phraseMode == STOPWORDS) {
768      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
769    } else {
770      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
771    }
772  }
773
774  // Read the statistics file
775  readStatistics();
776  // Read the numbers file
777  readNumbers();
778 
779  if (numberOfDocuments == 0) {
780    cerr << "There are no documents in this collection!" << endl;
781    exit(0); // not necessarily an error...
782  }
783
784  symbol firstContent;
785  if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
786  else firstContent = firstStopSymbol;
787
788  // Allocate memory for the suffix & prefix arrays
789  cellcount contentLength = 0;
790  contentLength = getContentCount(firstContent);
791  suffixArray = new symbol *[contentLength];
792  prefixArray = new symbol *[contentLength];
793  if (prefixArray == NULL) {
794    cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl;
795    exit(2);
796  }
797  allocateSuffixCheck(contentLength);
798  // Initialise prefix and suffix arrays, only use the needed suffixes
799  for (cellcount j = 0, here = 0; j < inputLength; ++j) {
800    if (symbols[j]>=firstContent) {
801      suffixArray[here] = &symbols[j];
802      prefixArray[here] = &symbols[j];
803      ++here;
804    }   
805  }
806  qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
807  qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
808  // Create the document arrays
809  if (verbosity > 1) {
810    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
811  }
812
813  // The document frequency array is used to count the number of times
814  // each phrase occurs in each document.  The number of documents in
815  // which a phrase occurs is stored in df.
816  frequency *documentFrequency = new frequency[numberOfDocuments];
817  frequency df;
818
819  // documentArray will be searched in order to discover which document
820  // each phrase occurs in.
821  documentArray = new symbol *[numberOfDocuments]; 
822  if (documentArray == NULL) {
823    cerr << "Suffix: out of memory allocating document arrays." << endl;
824    exit(2);
825  } 
826
827  // just scan through the input text to find the doc starts
828  cellindex d = 0;
829  for (cellindex i=0; i<inputLength; ++i) {
830    if (symbols[i] == DOCUMENTSTART) {
831      documentArray[d] = &symbols[i];
832      ++d;
833    }
834  }
835
836  // the phrases stuff is expecting inputLength to be the length of the
837  // suffix array, so change it.
838  inputLength = contentLength;
839
840  // Extract phrases
841  //
842  // We will make several passes over the data, in each case considering
843  // a set of input phrases and generating a set of output phrases, which
844  // we will expand in later passes.
845  //
846  // The input phrases in the first pass will be the vocabulary.
847  // In later passes, the input phrases will be the output phrases of the
848  // previous pass.
849  //
850  // In each pass we will consider each input phrase in turn.  If we
851  // have seen it before, we will ignore it.  Otherwise, we will expand
852  // it and add its expansions to the set of output phrases.
853
854  cout <<"\ngenerating the phrase hierarchy\n\n";
855 
856  // Store the phrase data in the phrases file
857  char phraseDataName[FILENAME_MAX];
858  sprintf(phraseDataName, "%s/phrases", collection);
859  ofstream phraseData(phraseDataName, ios::out);
860  if (!phraseData) {
861    cerr << "File " << phraseDataName << " could not be opened\n";
862    exit(1);
863  }
864
865  // Count the number of phrases output
866  mg_u_long phraseCounter = 0; //unsigned long int phraseCounter = 0;
867
868  // Set up the phrase expansion memory.
869  // We need this so that we don't expand a phrase more than once
870  initialisePhraseMemory();
871
872  // The current pass number
873  int phrasePass = 1;
874
875
876  // PASS NUMBER 1
877  if (verbosity > 1) {
878    cout << "Starting pass " << phrasePass << endl;
879  }
880
881  ofstream outPhrase;
882  char     outPhraseName[FILENAME_MAX];
883  mg_u_long outPhraseCounter = 0; // unsigned long int outPhraseCounter = 0;
884
885  // On the first pass, simply work through the vocabulary
886  sprintf(outPhraseName, "%s/outPhrase.1", collection);
887  outPhrase.open(outPhraseName, ios::out);
888  if (!outPhrase) {
889    cerr << "File " << outPhraseName << " could not be opened\n";
890    exit(1);
891  }
892
893  // Iterate over the different symbols by working through the suffix array
894  vector<Phrase> result;
895  cellindex ij = 0;
896  char *tmpString;
897
898  Phrase p;
899  while (ij < inputLength) {
900
901    // make a new phrase of length 1
902    p = Phrase(suffixArray[ij], 1, SUFFIX);
903    p.findFirstAndLastSuffix(ij, inputLength-1);
904
905    // We ignore this symbol if it occurs only once, if it is a delimiter,
906    // or if we are in stopwords mode and it is a stopword
907    // - in this new version, only need to check freq
908    // We could imagine a new mode/command-line option, which is like
909    // STOPWORDS but without this restriction.  This would let you browse
910    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
911    // "the AGRIS" (where the is a stopword and AGRIS a content word).
912    // The system used to work like this; it is easy to implement, but
913    // it explodes the size of the indexes.  So: would it be useful? 
914    if (p.suffixFrequency >= minOccurs) {
915      // Get minimal expansions of the phrase
916      getExpansions(p, result);
917     
918      if (!result.empty()) {
919   
920    // Remember that we have expanded this phrase
921    rememberThisPhrase(ij, 1);
922
923    // write the phrase text
924    phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
925           << result.size() << ":";
926
927    // write the results
928    for (cellcount k = 0; k < result.size(); ++k) {
929      if (k) {
930        phraseData << ",";
931      }
932      phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
933      outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
934      ++outPhraseCounter;
935    }
936    result.clear();
937   
938    // Write the documents in which this phrase occurs
939    df = getDocumentOccurrances(p, documentFrequency);
940    phraseData << ":" << df << ":";
941
942    // write the documents
943    for (cellcount m = 0, first = 1; m < numberOfDocuments; ++m) {
944      if (documentFrequency[m]) {
945        if (first) {
946          first = 0;
947        } else {
948          phraseData << ";";
949        }
950        // Output the document number.  Note that here we've numbered the
951        // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
952        // add 1 to the document id when we output it.
953        phraseData << "d" << (m+1);
954        // Next, output the frequency with which the document occurs, but
955        // only if it is > 1.
956        if (documentFrequency[m] > 1) {
957          phraseData << "," << documentFrequency[m];
958        }
959      }
960    }
961
962    phraseData << endl;
963    ++phraseCounter;
964
965    // feedback
966    if (verbosity) {
967      if (phraseCounter % 1000 == 0) {
968        cout << "phrase " << phraseCounter << ": "
969         << "cell " << p.firstSuffixIndex << " - " << p << endl;
970      }
971    }
972      }
973    }
974   ij = p.lastSuffixIndex + 1;
975  }
976  outPhrase.close();
977
978  // REMAINING PASSES
979  // The previous outPhrase file forms the input to each new pass
980  cellcount start, length;
981  while (outPhraseCounter > 0) {
982
983    // Start a new pass
984    ++phrasePass;
985    if (verbosity) {
986      cout << "Starting pass " << phrasePass << endl;
987    }
988
989    // Open the input file
990    char inPhraseName[FILENAME_MAX];
991    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
992    ifstream inPhrase (inPhraseName, ios::in);
993    if (!inPhrase) {
994      cerr << "File " << inPhraseName << " could not be opened\n";
995      exit(1);
996    }
997
998    // Open the output file
999    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
1000    outPhrase.open(outPhraseName, ios::out);
1001    if (!outPhrase) {
1002      cerr << "File " << outPhraseName << " could not be opened\n";
1003      exit(1);
1004    }
1005    outPhraseCounter = 0;
1006
1007    // Process each phrase
1008    while(inPhrase >> start >> length) {
1009
1010      // Ignore the phrase if we have expanded it before
1011      if (isPhraseStored(start, length))
1012    continue;
1013
1014      // Remember that we have examined this phrase
1015      rememberThisPhrase(start, length);
1016
1017      // Find the phrase in the suffix array
1018      p = Phrase(suffixArray[start], length, SUFFIX);
1019      p.findFirstAndLastSuffix(start, inputLength-1);
1020
1021      // Ignore the phrase if it only occurs once
1022      if (p.suffixFrequency < minOccurs)
1023    continue;
1024
1025      // Write the phrase text;
1026      phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1027   
1028      // Expand the phrase, if it is no more than 8 words long
1029      if (length <= 8) {
1030
1031    // Get the minimal expansions for this phrase
1032    getExpansions(p, result);
1033     
1034    // write the results
1035    phraseData << result.size() << ":";
1036
1037    for (cellcount i = 0; i < result.size(); ++i) {
1038      if (i) {
1039        phraseData << ",";
1040      }
1041      phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1042      outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1043      ++outPhraseCounter;
1044    }
1045    result.clear();
1046   
1047      } else {
1048    // phrase is too long to expand further
1049    phraseData << "0:";
1050      }
1051
1052      // Write the documents in which this phrase occurs
1053      df = getDocumentOccurrances(p, documentFrequency);
1054      phraseData << ":" << df << ":";
1055
1056      // write the documents
1057      for (cellcount i = 0, first = 1; i < numberOfDocuments; ++i) {
1058    if (documentFrequency[i]) {
1059      if (first) {
1060        first = 0;
1061      } else {
1062        phraseData << ";";
1063      }
1064      // Output the document number.  Note that here we've numbered the
1065      // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
1066      // add 1 to the document id when we output it.
1067      phraseData << "d" << (i+1);
1068      // Next, output the frequency with which the document occurs, but
1069      // only if it is > 1.
1070      if (documentFrequency[i] > 1) {
1071        phraseData << "," << documentFrequency[i];
1072      }
1073    }
1074      }
1075     
1076      phraseData << endl;
1077      ++phraseCounter;
1078
1079      // feedback
1080      if (verbosity) {
1081    if (phraseCounter % 1000 == 0) {
1082      cout << "phrase " << phraseCounter << ": "<< "start " << start
1083           << ", length " << length << " - " << p << endl;
1084    }
1085      }
1086
1087    }
1088
1089    inPhrase.close();
1090    outPhrase.close();
1091  }
1092   
1093  phraseData.close();
1094  deletePhraseMemory();
1095
1096  delete [] documentFrequency;
1097  delete [] symbols;
1098  delete [] suffixArray;
1099  delete [] prefixArray;
1100  delete [] suffixCheck;
1101  delete [] documentArray;
1102
1103
1104 
1105  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1106  return 0;
1107}
1108
1109
1110
1111
1112