Ignore:
Timestamp:
2004-10-18T15:30:40+13:00 (20 years ago)
Author:
kjdon
Message:

Added a new option to the phind classifier: min_occurs. This is the minimum frequency a phrase must have to be included in the hierarchy.

Location:
trunk/gsdl/src/phind/generate
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/phrase.cpp

    r2839 r8362  
    46 46
    47 47 // Phrase constructor functions
    48 
    49 48
    50 49 Phrase::Phrase(symbol *words, cellcount size, int direction) {
     
    659 658
    660 659    // If the expansion occurs more than once and is not delimited, expand it
    661     if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {
     660    if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= minOccurs)) {
    662 661      next.expandWhileUniqueSuffixExtension();
    663 662      results.push_back(next);
     
    683 682
    684 683    // If the expansion occurs more than once and is not delimited, expand it
    685     if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {
     684    if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= minOccurs)) {
    686 685      next.expandWhileUniquePrefixExtension();
    687 686      results.push_back(next);
     
    751 750
    752 751  // if the phrase occurs only once, do nothing
    753   if (suffixFrequency < 2)
     752  if (suffixFrequency < minOccurs)
    754 753    return 0;
    755 754
     
    794 793
    795 794  // if the phrase occurs only once, do nothing
    796   if (prefixFrequency < 2)
     795  if (prefixFrequency < minOccurs)
    797 796    return 0;
    798 797
  • trunk/gsdl/src/phind/generate/suffix.cpp

    r3245 r8362  
    72 72 // Do we accept any phrase, or do we eliminate those ending with stopwords ?
    73 73 int phraseMode = ANYPHRASE; //STOPWORDS;
    74 
     74// What is the minimum phrase frequency for a phrase to be included in the hierarchy
     75int minOccurs = 2;
    75 76
    76 77 // The filestem of the collection's phindex directory
     
    718 719  // Command-line arguments
    719 720  // argv[1] is the phindex directory
    720   // argv[2] is the maximum array symbol length (optional)
    721   // argv[3] is the mode, where 1 is stopword mode (optional)
    722   if (argc < 2) {
    723     cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
     721  // argv[2] is the mode, where 1 is stopword mode
     722  // argv[3] is min_occurs - the minimum occurrence frequency for a phrase to be included in the hierarchy
     723  // argv[4] is optional verbosity
     724  if (argc < 4) {
     725    cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
    724 726    exit(1);
    725 727  }
     
    732 734  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
    733 735
     736  minOccurs = atoi(argv[3]);
     737  assert((minOccurs > 0));
     738
    734 739  // optional verbosity parameter
    735   if (argc == 4) {
    736     verbosity = atoi(argv[3]);
     740  if (argc == 5) {
     741    verbosity = atoi(argv[4]);
    737 742    assert (verbosity >= 0);
    738 743  }
     
    892 897    // The system used to work like this; it is easy to implement, but
    893 898    // it explodes the size of the indexes.  So: would it be useful? 
    894     if (p.suffixFrequency > 1) {
     899    if (p.suffixFrequency >= minOccurs) {
    895 900      // Get minimal expansions of the phrase
    896 901      getExpansions(p, result);
     
    1000 1005
    1001 1006      // Ignore the phrase if it only occurs once
    1002       if (p.suffixFrequency < 2)
     1007      if (p.suffixFrequency < minOccurs)
    1003 1008    continue;
    1004 1009
  • trunk/gsdl/src/phind/generate/suffix.h

    r2867 r8362  
    68 68 // Are we allowed to terminate a phrase on a stopword?
    69 69 extern int phraseMode;
     70extern int minOccurs;
    70 71
    71 72 #define ANYPHRASE 0
Note: See TracChangeset for help on using the changeset viewer.