root/main/trunk/greenstone2/build-src/src/phind/generate/suffix.cpp @ 26135

Revision 26135, 30.3 KB (checked in by ak19, 8 years ago)

Changing unsigned long to mg_u_long in Phind's suffix.exe program. This did not help to get suffix to work on 64 bit linux, but the changes have not broken it on Windows either (where it already worked). Therefore, since the unsigned long to mg_u_long changes bring this part of the code up to speed with the same changes made elsewhere in the C code, am still committing this.

  • Property svn:keywords set to Author Date Id Revision
/**********************************************************************
 *
 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
 *               and prefix arrays (cgn & gwp's simpler algorithm,
 *               and kjm's improvements).
 *
 * Copyright 2000 Gordon W. Paynter
 * Copyright 2000 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software from the
 * New Zealand Digital Library Project at the University of Waikato,
 * New Zealand.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(GSDL_USE_IOS_H)
#  include <fstream.h>
#  include <iostream.h>
#else
#  include <fstream>
#  include <iostream>
#endif

#if defined(GSDL_USE_STL_H)
#  if defined(GSDL_USE_ALGO_H)
#    include <algo.h>
#  else
#    include <algorithm.h>
#  endif
#  include <vector.h>
#else
#  include <algorithm>
#  include <vector>
#endif

#include "suffix.h"
#include "phrase.h"
#include "check.h"
#include "mglong.h"
// Global variables declared in suffix.h
cellcount inputLength;

symbol   *symbols;
symbol  **suffixArray;
symbol  **prefixArray;

// How many documents are in this collection?
cellcount numberOfDocuments;
symbol  **documentArray;


// Do we accept any phrase, or do we eliminate those ending with stopwords?
int phraseMode = ANYPHRASE; //STOPWORDS;
// The minimum frequency a phrase must have to be included in the hierarchy
int minOccurs = 2;

// The filestem of the collection's phindex directory
char collection[FILENAME_MAX];


// The ranges of the stopword and content-word symbols for the collection
symbol firstStopSymbol = 0;
symbol lastStopSymbol = 0;
symbol firstContentSymbol = 0;
symbol lastContentSymbol = 0;


// Some useful comparison functions, defined below.
int suffixCompare(const void *, const void *);
int prefixCompare(const void *, const void *);
int pointerCompare(const void *, const void *);


// Functions for implementing "phrase memory".  These let us "remember"
// each phrase that we've expanded without using too much memory.
void initialisePhraseMemory();
void rememberThisPhrase(cellindex index, cellcount length);
bool isPhraseStored(cellindex index, cellcount length);
void deletePhraseMemory();


// How much output do we want?
int verbosity = 1;


// Get a phrase's expansions
//
// Get the set of "minimal", "maximal", non-unique expansions of a
// phrase p, using the simpler algorithm that Craig and Gordon came up
// with at Google.
//
// Returns a vector of Expansions.

void getExpansions(Phrase &p, vector<Phrase> &results) {

  // 1. Get the initial candidates
  vector<Phrase> candidates;
  p.initialSuffixCandidates(candidates);
  int suffcands = candidates.size();
  p.initialPrefixCandidates(candidates);

  if (candidates.size() == 0)
    return;

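  // 2. Expand the candidates.  The first suffcands entries are the suffix
  // candidates, which are grown at the prefix end while the extension is
  // unique; the remaining (prefix) candidates are grown at the suffix end.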
  vector<Phrase>::iterator i;
  for (i = candidates.begin(); i != candidates.end() && suffcands > 0; ++i, --suffcands) {
    i->expandWhileUniquePrefixExtension();
    i->ensureSuffixFound();
  }
  for (; i != candidates.end(); ++i) {
    i->expandWhileUniqueSuffixExtension();
  }

  // 3. Ensure minimality: ignore phrases whose subphrases are also found
  results.clear();

  // Initialise the candidates, check array, and various variables.
  sort(candidates.begin(), candidates.end(), isShorter);
  unsigned minimum_length = candidates.begin()->length;
  clearSuffixCheck();

  // Try to add each candidate to the results set, ignoring the non-minimal
  for (vector<Phrase>::iterator candidate = candidates.begin();
       candidate != candidates.end(); ++candidate) {

    // Make a copy of candidate to mutilate while performing sub-phrase checks
    Phrase temp_phrase(*candidate);
    bool shorter_found = false;

    // Check for shorter and shorter versions of the temporary phrase
    while (temp_phrase.length >= minimum_length && !shorter_found) {
      temp_phrase.ensureSuffixFound();
      if (getSuffixCheck(temp_phrase.firstSuffixIndex) == 0)
        temp_phrase.shortenByOneAtPrefix();
      else
        shorter_found = true;

      // Possible efficiency here: we can finish if the prefix of candidate
      // and temp_phrase are the same for candidate->length symbols.
    }

    // If no shorter phrase is found, use this one
    if (!shorter_found) {
      results.push_back(*candidate);
      candidate->ensureSuffixFound();
      setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
    }
  }
}


// suffixCompare
//
// Compare two pointers into a suffix array.  We use this in the
// qsort function, so each argument is really a pointer to a pointer
// (symbol **) into the suffix array.
//
// Return -1 if (a < b), otherwise (a > b) so return +1.

int suffixCompare(const void *cpa, const void *cpb) {

  // Cast then dereference pointers to suffix array elements
  symbol *pa = *(symbol **) cpa;
  symbol *pb = *(symbol **) cpb;

  // If the two elements are the same, examine the next one
  while (*pa == *pb) {
    ++pa;
    ++pb;
  }

  // Make the comparison and return
  if (*pa < *pb) {
    return -1;
  } else {
    return +1;
  }
}


// prefixCompare
//
// Compare two pointers into a prefix array.  We use this in the
// qsort function, so each argument is really a pointer to a pointer
// (symbol **) into the prefix array.
//
// Return -1 if (a > b), otherwise (a < b) so return +1.

int prefixCompare(const void *cpa, const void *cpb) {

  // Cast then dereference pointers to prefix array elements
  symbol *pa = *(symbol **) cpa;
  symbol *pb = *(symbol **) cpb;

  // If the two elements are the same, examine the previous one
  while (*pa == *pb) {
    --pa;
    --pb;
  }

  // Make the comparison and return
  if (*pa > *pb) {
    return -1;
  } else {
    return +1;
  }
}

// pointerCompare
//
// Compare two pointers based on the memory location they point to.

int pointerCompare(const void *pa, const void *pb) {

  symbol **a = (symbol **) pa;
  symbol **b = (symbol **) pb;

  if (*a < *b) {
    return -1;
  } else if (*a > *b) {
    return 1;
  } else {
    return 0;
  }
}


// Read the clauses.numbers file into the "symbols" array.
//
// Each number in the file is a symbol number; it is essential that
// the first symbol (and no others) be COLLECTIONSTART and the last
// symbol (and no others) be COLLECTIONEND.
//
// Return the number of numbers in the array.

int readNumbers() {

  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.numbers", collection);
  if (verbosity) {
    cout << "Reading numbers file: " << filename << endl;
  }

  // Open the numbers file
  ifstream inFile1(filename, ios::in);
  if (!inFile1) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // Count the number of symbols
  inputLength = 0;
  symbol word;
  while (inFile1 >> word) {
    ++inputLength;
  }
  inFile1.close();

  // Allocate the symbols array
  if (verbosity > 1) {
    cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
  }
  symbols = new symbol[inputLength];
  if (symbols == NULL) {
    cerr << "Suffix error: not enough memory to hold " << inputLength
         << " symbols." << endl;
    exit(2);
  }

  // Read the numbers file into the symbols array
  if (verbosity > 2) {
    cout << "Reading the numbers" << endl;
  }

  ifstream inFile2(filename, ios::in);
  if (!inFile2) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }
  cellcount next = 0;
  numberOfDocuments = 0;
  while (inFile2 >> word) {
    symbols[next++] = word;
    if (word == DOCUMENTSTART) {
      ++numberOfDocuments;
    }
  }
  inFile2.close();

  // Make sure the numbers file is intact
  assert(symbols[0] == COLLECTIONSTART);
  assert(symbols[next-1] == COLLECTIONEND);

  return inputLength;
}


// Get Document Occurrence statistics
//
// Given a phrase, what documents does it occur in?

cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {

  // cout << "searching for \"" << p << "\" in documents "
  //      << 0 << "-" << numberOfDocuments - 1 << endl;

  // The number of documents in which this phrase occurs
  cellcount df = 0;

  // Initialise the document frequency array
  //  for (cellindex i = 0; i < numberOfDocuments; ++i) {
  //    frequency[i] = 0;
  //  }
  memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);

  // variables used to facilitate the search
  cellindex begin;
  cellindex end;
  cellindex d;
  symbol *target;
  bool found;

  // search for the document in which each occurrence of the phrase is found
  for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; ++j) {

    // cout << "looking for phrase at suffixArray[" << j << "]\n";

    target = suffixArray[j];
    begin = 0;
    end = numberOfDocuments - 1;
    found = false;

    // Search for the occurrence of a document delimiter that target
    // occurs immediately after.
    // We do this by performing a binary chop search on documentArray.
    while (!found) {

      // cout << "searching for " << (cellindex) target << " in "
      //      << begin << " - " << end << endl;

      assert(begin <= end);

      // If the beginning and end of the interval are the same,
      // then we've found the correct document
      if (begin == end) {
        if (frequency[begin] == 0) {
          ++df;
        }
        ++frequency[begin];
        found = true;
      }

      // Otherwise, examine a new document midway through the begin-end
      // interval and see if it is the one.
      else {
        d = (begin + end) / 2;
        if (target > documentArray[d]) {
          // If the target address is greater than this document's start, but
          // this is the last document, then this must be the one we want.
          // Likewise, if target is greater than this document's start but
          // less than the next one's, this must be the one we want.
          if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
            if (frequency[d] == 0) {
              ++df;
            }
            ++frequency[d];
            found = true;
          } else {
            // otherwise we know to search later in the document set
            begin = d + 1;
          }
        } else {
          // search earlier in the document set
          end = d - 1;
        }
      }
    }
  }
  return df;
}


// phraseExpansionMemory : Which phrases have we expanded?
//
// A set of utilities for keeping track of which phrases we have expanded.
// We don't want to expand a phrase more than once, after all.
//
// This REALLY ought to be in its own class, but it works so that's okay.
//
// Phrases are identified by their firstSuffixPosition and length.
//
// Functions provided are:
//       void initialisePhraseMemory()
//       void rememberThisPhrase(index, length)
//       bool isPhraseStored(index, length)
//       void deletePhraseMemory()
//
// Internally, we will have two separate cases:
//
// Phrases of length 1-8:
//       unsigned char phraseMemory[inputLength]
// is an array where each cell "remembers" the corresponding index in the
// suffixArray, and each of the 8 bits of the cell correspond to the phrases
// of length 1, 2... 8.
// Eventually, we will make this disk-based (i.e. store the array in a file).
//
// Phrases of length 9+:
//       file hashTableFile
//       file listOfEntries
// The first file is a hash table; each phrase maps to one of its cells, which
// contains either 0 (empty, no occurrence) or a number which is an entry number
// in the second file.  This file contains a "list" of entries.  Each consists of
// three numbers: the suffixArray index of the phrase, the length of the phrase,
// and the entry number of the next phrase with the same hash.
//
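// For example (values illustrative), remembering the phrase at
// suffixArray[1234] with length 10 appends one three-cell record to
// listOfEntries laid out as
//       [ next-entry-in-chain | 1234 | 10 ]
// where next-entry-in-chain is 0 when this is the last entry for its hash.
//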


unsigned char *phraseMemory;

void initialiseLongPhraseMemory();
void rememberThisLongPhrase(cellindex index, cellcount length);
bool isLongPhraseStored(cellindex index, cellcount length);
void deleteLongPhraseMemory();


void initialisePhraseMemory() {

  phraseMemory = new unsigned char[inputLength];

  // to begin with, everything is empty
  //  for (cellcount i = 0; i < inputLength; ++i) {
  //    phraseMemory[i] = 0;
  //  }
  memset(phraseMemory, 0, sizeof(char)*inputLength);

  // initialise the hashTable of long phrases
  initialiseLongPhraseMemory();

}

void rememberThisPhrase(cellindex index, cellcount length) {

  // if the phrase is very long, use the file-based system
  if (length > 8) {
    rememberThisLongPhrase(index, length);
    return;
  }

  // create a char with just the bit corresponding to length set
  unsigned char newbit = 1;
  for (cellcount i = 1; i < length; ++i) {
    newbit <<= 1;
  }
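  // (e.g. for a phrase of length 3, newbit ends up as binary 100, i.e. 0x04)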

  // set that bit in the memory array at position index
  phraseMemory[index] |= newbit;
}


bool isPhraseStored(cellindex index, cellcount length) {

  // if the phrase is very long, use the file-based system
  if (length > 8) {
    return isLongPhraseStored(index, length);
  }

  // create a char with just the bit corresponding to length set
  unsigned char newbit = 1;
  for (cellcount i = 1; i < length; ++i) {
    newbit <<= 1;
  }

  // return true if that bit is set in the memory array at position index
  return (phraseMemory[index] & newbit);
}

void deletePhraseMemory() {
  delete []phraseMemory;
  deleteLongPhraseMemory();
}


// Files etc used to store "long" equivalents of the above

fstream hashTableFile;
char    hashTableFileName[FILENAME_MAX];
fstream listOfEntries;
char    listOfEntriesName[FILENAME_MAX];
cellindex nextEntryNumber;

const cellcount bigPrime = 7919;
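// (bigPrime is the number of cells in the on-disk hash table; a phrase
// with suffixArray index i and length l hashes to cell (i + l) % bigPrime.)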


void initialiseLongPhraseMemory() {

  cellindex example = 0;

  sprintf(hashTableFileName, "%s/hashTable", collection);
  sprintf(listOfEntriesName, "%s/hashLists", collection);


  // create the new hashtable
  if (verbosity > 1) {
    cout << "Initialising hashTable: " << hashTableFileName << endl;
  }
  hashTableFile.open(hashTableFileName, ios::in | ios::out);
  for (cellcount i = 0; i < bigPrime; ++i) {
    hashTableFile.write((char *) &example, sizeof(example));
  }

  // create the list of phrases
  if (verbosity > 1) {
    cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
  }
  listOfEntries.open(listOfEntriesName, ios::in | ios::out);
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
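  // (These three zero cells form a dummy entry 0, so that a hash cell
  // containing 0 can safely mean "empty".)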
  nextEntryNumber = 1;
}


void rememberThisLongPhrase(cellindex index, cellcount length) {

  // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";

  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
  cellindex pointer;
  cellindex zero = 0;
  cellindex readp = 0;
  cellindex readi = 0;
  cellindex readl = 0;

  hashTableFile.seekg(hashOffset);
  hashTableFile.read((char *) &pointer, sizeof(cellindex));

  if (pointer == 0) {
    // There is no entry at all in the hash table for this phrase,
    // so create one

    pointer = nextEntryNumber++;
    hashTableFile.seekg(hashOffset);
    hashTableFile.write((char *) &pointer, sizeof(cellindex));

    listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
    listOfEntries.write((char *) &zero, sizeof(cellindex));
    listOfEntries.write((char *) &index, sizeof(cellindex));
    listOfEntries.write((char *) &length, sizeof(cellindex));

  } else {
    // There is a list starting at this hash value, so the phrase may
    // be already remembered, or it might need to be appended

    while (pointer != 0) {
      // Read the entry pointed to by pointer
      listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
      listOfEntries.read((char *) &readp, sizeof(cellindex));
      listOfEntries.read((char *) &readi, sizeof(cellindex));
      listOfEntries.read((char *) &readl, sizeof(cellindex));

      // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;

      if ((readi == index) && (readl == length)) {
        // we've found that we've already stored it
        return;
      } else if (readp == 0) {
        // we've reached the end of the list.  Add a new entry.
        listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
        listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
        pointer = nextEntryNumber++;

        listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
        listOfEntries.write((char *) &zero, sizeof(cellindex));
        listOfEntries.write((char *) &index, sizeof(cellindex));
        listOfEntries.write((char *) &length, sizeof(cellindex));
        return;
      } else {
        // go on to the next node
        pointer = readp;
      }
    }
  }

}


bool isLongPhraseStored(cellindex index, cellcount length) {

  // cout << "isLongPhraseStored(" << index << ", " << length << ")\n";

  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
  cellindex pointer;
  cellindex readp = 0;
  cellindex readi = 0;
  cellindex readl = 0;

  // Find the phrase in the hashFile
  hashTableFile.seekg(hashOffset);
  hashTableFile.read((char *) &pointer, sizeof(cellindex));

  if (pointer == 0) {
    // There is no entry at all in the hash table for this phrase,
    // so nothing is stored
    return false;

  } else {
    // There is a list starting at this hash value, so the phrase may
    // be already remembered in that list
    while (pointer != 0) {
      // Read the entry pointed to by pointer
      listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
      listOfEntries.read((char *) &readp, sizeof(cellindex));
      listOfEntries.read((char *) &readi, sizeof(cellindex));
      listOfEntries.read((char *) &readl, sizeof(cellindex));

      if ((readi == index) && (readl == length)) {
        // we've found the phrase stored here
        return true;
      } else {
        // go on to the next node
        pointer = readp;
      }
    }
  }
  return false;
}


void deleteLongPhraseMemory() {
  // remove the hash & other files

  hashTableFile.close();
  listOfEntries.close();
  remove(hashTableFileName);
  remove(listOfEntriesName);

}


// Read the collection statistics file
//
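// The clauses.stats file holds whitespace-separated key/value pairs; the
// keys read below are first_stopword, last_stopword, first_contentword and
// last_contentword.  For example (values illustrative):
//       first_stopword 2
//       last_stopword 121
//       first_contentword 122
//       last_contentword 5374
//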
void readStatistics() {

  // Open the statistics file
  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.stats", collection);
  ifstream inFile(filename, ios::in);
  if (!inFile) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // Read the key/value pairs into the symbol-range globals
  char key[1000];
  symbol value;
  while (inFile >> key >> value) {
    if (strcmp(key, "first_stopword") == 0) {
      firstStopSymbol = value;
    } else if (strcmp(key, "last_stopword") == 0) {
      lastStopSymbol = value;
    } else if (strcmp(key, "first_contentword") == 0) {
      firstContentSymbol = value;
    } else if (strcmp(key, "last_contentword") == 0) {
      lastContentSymbol = value;
    }
  }
  inFile.close();

  // Make sure we have the information we need
  if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
    cerr << "Statistics file incomplete" << endl;
    exit(1);
  }
}

cellcount getContentCount(symbol firstContent) {

  cellcount content = 0;
  for (cellcount i = 0; i < inputLength; ++i) {
    if (symbols[i] >= firstContent) ++content;
  }

  return content;
}


int main (int argc, char * argv[]) {

  // Command-line arguments
  // argv[1] is the phindex directory
  // argv[2] is the mode, where 1 is stopword mode
  // argv[3] is min_occurs, the minimum occurrence frequency for a phrase to be included in the hierarchy
  // argv[4] is optional verbosity
  if (argc < 4) {
    cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
    exit(1);
  }
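  // A typical invocation therefore looks like (the path is illustrative):
  //   suffix /path/to/collection/phindex 1 2 2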

  // collection directory
  strcpy(collection, argv[1]);

  // mode parameter
  phraseMode = atoi(argv[2]);
  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));

  minOccurs = atoi(argv[3]);
  assert((minOccurs > 0));

  // optional verbosity parameter
  if (argc == 5) {
    verbosity = atoi(argv[4]);
    assert(verbosity >= 0);
  }

  if (verbosity) {
    cout << "suffix: the phrase extraction program" << endl;
  }
  if (verbosity > 1) {
    if (phraseMode == STOPWORDS) {
      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
    } else {
      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
    }
  }

  // Read the statistics file
  readStatistics();
  // Read the numbers file
  readNumbers();

  if (numberOfDocuments == 0) {
    cerr << "There are no documents in this collection!" << endl;
    exit(0); // not necessarily an error...
  }

  symbol firstContent;
  if (phraseMode == STOPWORDS) firstContent = firstContentSymbol;
  else firstContent = firstStopSymbol;

  // Allocate memory for the suffix & prefix arrays
  cellcount contentLength = 0;
  contentLength = getContentCount(firstContent);
  suffixArray = new symbol *[contentLength];
  prefixArray = new symbol *[contentLength];
  if ((suffixArray == NULL) || (prefixArray == NULL)) {
    cerr << "Suffix: not enough memory to hold " << contentLength << " symbols." << endl;
    exit(2);
  }
  allocateSuffixCheck(contentLength);
  // Initialise prefix and suffix arrays, only use the needed suffixes
  for (cellcount j = 0, here = 0; j < inputLength; ++j) {
    if (symbols[j] >= firstContent) {
      suffixArray[here] = &symbols[j];
      prefixArray[here] = &symbols[j];
      ++here;
    }
  }
  qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
  qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
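  // After sorting, all occurrences of a given phrase sit in a contiguous run
  // of suffixArray; the Phrase code appears to rely on this when it records
  // firstSuffixIndex and lastSuffixIndex ranges.
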
  // Create the document arrays
  if (verbosity > 1) {
    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
  }

  // The document frequency array is used to count the number of times
  // each phrase occurs in each document.  The number of documents in
  // which a phrase occurs is stored in df.
  frequency *documentFrequency = new frequency[numberOfDocuments];
  frequency df;

  // documentArray will be searched in order to discover which document
  // each phrase occurs in.
  documentArray = new symbol *[numberOfDocuments];
  if (documentArray == NULL) {
    cerr << "Suffix: out of memory allocating document arrays." << endl;
    exit(2);
  }

  // just scan through the input text to find the doc starts
  cellindex d = 0;
  for (cellindex i = 0; i < inputLength; ++i) {
    if (symbols[i] == DOCUMENTSTART) {
      documentArray[d] = &symbols[i];
      ++d;
    }
  }

  // the phrases stuff is expecting inputLength to be the length of the
  // suffix array, so change it.
  inputLength = contentLength;

  // Extract phrases
  //
  // We will make several passes over the data, in each case considering
  // a set of input phrases and generating a set of output phrases, which
  // we will expand in later passes.
  //
  // The input phrases in the first pass will be the vocabulary.
  // In later passes, the input phrases will be the output phrases of the
  // previous pass.
  //
  // In each pass we will consider each input phrase in turn.  If we
  // have seen it before, we will ignore it.  Otherwise, we will expand
  // it and add its expansions to the set of output phrases.
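  //
  // In outline:
  //   pass 1: expand each single-symbol phrase found in the suffix array
  //   pass n: expand each phrase listed in outPhrase.(n-1), skipping any
  //           phrase already recorded in the phrase memory
  //   stop when a pass writes no new phrases (outPhraseCounter == 0)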

  cout << "\ngenerating the phrase hierarchy\n\n";

  // Store the phrase data in the phrases file
  char phraseDataName[FILENAME_MAX];
  sprintf(phraseDataName, "%s/phrases", collection);
  ofstream phraseData(phraseDataName, ios::out);
  if (!phraseData) {
    cerr << "File " << phraseDataName << " could not be opened\n";
    exit(1);
  }

  // Count the number of phrases output
  mg_u_long phraseCounter = 0; //unsigned long int phraseCounter = 0;

  // Set up the phrase expansion memory.
  // We need this so that we don't expand a phrase more than once
  initialisePhraseMemory();

  // The current pass number
  int phrasePass = 1;


  // PASS NUMBER 1
  if (verbosity > 1) {
    cout << "Starting pass " << phrasePass << endl;
  }

  ofstream outPhrase;
  char     outPhraseName[FILENAME_MAX];
  mg_u_long outPhraseCounter = 0; // unsigned long int outPhraseCounter = 0;

  // On the first pass, simply work through the vocabulary
  sprintf(outPhraseName, "%s/outPhrase.1", collection);
  outPhrase.open(outPhraseName, ios::out);
  if (!outPhrase) {
    cerr << "File " << outPhraseName << " could not be opened\n";
    exit(1);
  }

  // Iterate over the different symbols by working through the suffix array
  vector<Phrase> result;
  cellindex ij = 0;
  char *tmpString;

  Phrase p;
  while (ij < inputLength) {

    // make a new phrase of length 1
    p = Phrase(suffixArray[ij], 1, SUFFIX);
    p.findFirstAndLastSuffix(ij, inputLength-1);

    // We ignore this symbol if it occurs only once, if it is a delimiter,
    // or if we are in stopwords mode and it is a stopword
    // - in this new version, only need to check freq
    // We could imagine a new mode/command-line option, which is like
    // STOPWORDS but without this restriction.  This would let you browse
    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
    // "the AGRIS" (where "the" is a stopword and AGRIS a content word).
    // The system used to work like this; it is easy to implement, but
    // it explodes the size of the indexes.  So: would it be useful?
    if (p.suffixFrequency >= minOccurs) {
      // Get minimal expansions of the phrase
      getExpansions(p, result);

      if (!result.empty()) {

        // Remember that we have expanded this phrase
        rememberThisPhrase(ij, 1);

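        // Each record written to the phrases file has the form
        //   index-length:text:frequency:#expansions:expansion-list:df:document-list
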
        // write the phrase text
        phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
                   << result.size() << ":";

        // write the results
        for (cellcount k = 0; k < result.size(); ++k) {
          if (k) {
            phraseData << ",";
          }
          phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
          outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
          ++outPhraseCounter;
        }
        result.clear();

        // Write the documents in which this phrase occurs
        df = getDocumentOccurrances(p, documentFrequency);
        phraseData << ":" << df << ":";

        // write the documents
        for (cellcount m = 0, first = 1; m < numberOfDocuments; ++m) {
          if (documentFrequency[m]) {
            if (first) {
              first = 0;
            } else {
              phraseData << ";";
            }
            // Output the document number.  Note that here we've numbered the
            // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
            // add 1 to the document id when we output it.
            phraseData << "d" << (m+1);
            // Next, output the phrase's frequency within the document, but
            // only if it is > 1.
            if (documentFrequency[m] > 1) {
              phraseData << "," << documentFrequency[m];
            }
          }
        }

        phraseData << endl;
        ++phraseCounter;

        // feedback
        if (verbosity) {
          if (phraseCounter % 1000 == 0) {
            cout << "phrase " << phraseCounter << ": "
                 << "cell " << p.firstSuffixIndex << " - " << p << endl;
          }
        }
      }
    }
    ij = p.lastSuffixIndex + 1;
  }
  outPhrase.close();

  // REMAINING PASSES
  // The previous outPhrase file forms the input to each new pass
  cellcount start, length;
  while (outPhraseCounter > 0) {

    // Start a new pass
    ++phrasePass;
    if (verbosity) {
      cout << "Starting pass " << phrasePass << endl;
    }

    // Open the input file
    char inPhraseName[FILENAME_MAX];
    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
    ifstream inPhrase (inPhraseName, ios::in);
    if (!inPhrase) {
      cerr << "File " << inPhraseName << " could not be opened\n";
      exit(1);
    }

    // Open the output file
    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
    outPhrase.open(outPhraseName, ios::out);
    if (!outPhrase) {
      cerr << "File " << outPhraseName << " could not be opened\n";
      exit(1);
    }
    outPhraseCounter = 0;

    // Process each phrase
    while (inPhrase >> start >> length) {

      // Ignore the phrase if we have expanded it before
      if (isPhraseStored(start, length))
        continue;

      // Remember that we have examined this phrase
      rememberThisPhrase(start, length);

      // Find the phrase in the suffix array
      p = Phrase(suffixArray[start], length, SUFFIX);
      p.findFirstAndLastSuffix(start, inputLength-1);

      // Ignore the phrase if it occurs fewer than minOccurs times
      if (p.suffixFrequency < minOccurs)
        continue;

      // Write the phrase text
      phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";

      // Expand the phrase, if it is 8 or fewer words long
      if (length <= 8) {

        // Get the minimal expansions for this phrase
        getExpansions(p, result);

        // write the results
        phraseData << result.size() << ":";

        for (cellcount i = 0; i < result.size(); ++i) {
          if (i) {
            phraseData << ",";
          }
          phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
          outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
          ++outPhraseCounter;
        }
        result.clear();

      } else {
        // phrase is too long to expand further
        phraseData << "0:";
      }

      // Write the documents in which this phrase occurs
      df = getDocumentOccurrances(p, documentFrequency);
      phraseData << ":" << df << ":";

      // write the documents
      for (cellcount i = 0, first = 1; i < numberOfDocuments; ++i) {
        if (documentFrequency[i]) {
          if (first) {
            first = 0;
          } else {
            phraseData << ";";
          }
          // Output the document number.  Note that here we've numbered the
          // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
          // add 1 to the document id when we output it.
          phraseData << "d" << (i+1);
          // Next, output the phrase's frequency within the document, but
          // only if it is > 1.
          if (documentFrequency[i] > 1) {
            phraseData << "," << documentFrequency[i];
          }
        }
      }

      phraseData << endl;
      ++phraseCounter;

      // feedback
      if (verbosity) {
        if (phraseCounter % 1000 == 0) {
          cout << "phrase " << phraseCounter << ": " << "start " << start
               << ", length " << length << " - " << p << endl;
        }
      }

    }

    inPhrase.close();
    outPhrase.close();
  }

  phraseData.close();
  deletePhraseMemory();

  delete [] documentFrequency;
  delete [] symbols;
  delete [] suffixArray;
  delete [] prefixArray;
  delete [] suffixCheck;
  delete [] documentArray;

  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
  return 0;
}