Changeset 8362


Ignore:
Timestamp:
2004-10-18T15:30:40+13:00 (20 years ago)
Author:
kjdon
Message:

added a new option to the phind classifier: min_occurs. this is the minimum phrase frequency needed to be included in the hierarchy

Location:
trunk/gsdl
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/Phind.pm

    r6989 r8362  
    113113    'type' => "int",
    114114    'deft' => "1",
     115    'range' => "0,1",
     116    'reqd' => "no" },
     117      { 'name' => "min_occurs",
     118    'desc' => "{Phind.min_occurs}",
     119    'type' => "int",
     120    'deft' => "2",
     121    'range' => "1,",
    115122    'reqd' => "no" },
    116123      { 'name' => "thesaurus",
     
    176183             q^savephrases/.*/^, \$self->{'savephrases'},
    177184             q^suffixmode/\d/1^, \$self->{'suffixmode'},
     185             q^min_occurs/\d/2^, \$self->{'min_occurs'},
    178186             q^thesaurus/.*/^, \$self->{'thesaurus'},
    179187             q^untidy^, \$self->{'untidy'},
     
    288296    print $txthandle "$doclimit\n";
    289297
    290     # iterarate over the required indexes and store their text
     298    # iterate over the required indexes and store their text
    291299    my $indexes = $self->{'indexes'};
    292300    my $text = "";
     
    379387    # Construct phind indexes
    380388    my $suffixmode = $self->{'suffixmode'};
     389    my $min_occurs = $self->{'min_occurs'};
    381390    my ($command, $status);
    382391   
     
    390399    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    391400    print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
    392     &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
     401    &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
    393402
    394403    # check that we generated some files. It's not necessarily an error if
     
    13991408#
    14001409# Read phrases.3 and restore vocabulary information. Then write
    1401 # this data to the MGPP input files (pwrod.txt and pdata.txt) and
     1410# this data to the MGPP input files (pword.txt and pdata.txt) and
    14021411# (if requested) to the saved phrases file.
    14031412
  • trunk/gsdl/perllib/strings.rb

    r8361 r8362  
    368368Phind.language:Language or languages to use building hierarchy. Languages are identified by two-letter country codes like en (English), es (Spanish), and fr (French). Language is a regular expression, so 'en|fr' (English or French) and '..' (match any language) are valid.
    369369
     370Phind.min_occurs:The minimum number of times a phrase must appear in the text to be included in the phrase hierarchy.
     371
    370372Phind.savephrases:If set, the phrase information will be stored in the given file as text. It is probably a good idea to use an absolute path.
    371373
  • trunk/gsdl/src/phind/generate/phrase.cpp

    r2839 r8362  
    4646
    4747// Phrase constructor functions
    48 
    4948
    5049Phrase::Phrase(symbol *words, cellcount size, int direction) {
     
    659658
    660659    // If the expansion occurs more than once and is not delimited, expand it
    661     if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {
     660    if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= minOccurs)) {
    662661      next.expandWhileUniqueSuffixExtension();
    663662      results.push_back(next);
     
    683682
    684683    // If the expansion occurs more than once and is not delimited, expand it
    685     if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {
     684    if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= minOccurs)) {
    686685      next.expandWhileUniquePrefixExtension();
    687686      results.push_back(next);
     
    751750
    752751  // if the phrase occurs only once, do nothing
    753   if (suffixFrequency < 2)
     752  if (suffixFrequency < minOccurs)
    754753    return 0;
    755754
     
    794793
    795794  // if the phrase occurs only once, do nothing
    796   if (prefixFrequency < 2)
     795  if (prefixFrequency < minOccurs)
    797796    return 0;
    798797
  • trunk/gsdl/src/phind/generate/suffix.cpp

    r3245 r8362  
    7272// Do we accept any phrase, or do we eliminate those ending with stopwords ?
    7373int phraseMode = ANYPHRASE; //STOPWORDS;
    74 
     74// What is the minimum phrase frequency for a phrase to be included in the hierarchy
     75int minOccurs = 2;
    7576
    7677// The filestem of the collection's phindex directory
     
    718719  // Command-line arguments
    719720  // argv[1] is the phindex directory
    720   // argv[2] is the maximum array symbol length (optional)
    721   // argv[3] is the mode, where 1 is stopword mode (optional)
    722   if (argc < 2) {
    723     cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
     721  // argv[2] is the mode, where 1 is stopword mode
     722  // argv[3] is min_occurs, the minimum occurrence frequency for a phrase to be included in the hierarchy
     723  // argv[4] is optional verbosity
     724  if (argc < 4) {
     725    cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
    724726    exit(1);
    725727  }
     
    732734  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
    733735
     736  minOccurs = atoi(argv[3]);
     737  assert((minOccurs > 0));
     738
    734739  // optional verbosity parameter
    735   if (argc == 4) {
    736     verbosity = atoi(argv[3]);
     740  if (argc == 5) {
     741    verbosity = atoi(argv[4]);
    737742    assert (verbosity >= 0);
    738743  }
     
    892897    // The system used to work like this; it is easy to implement, but
    893898    // it explodes the size of the indexes.  So: would it be useful? 
    894     if (p.suffixFrequency > 1) {
     899    if (p.suffixFrequency >= minOccurs) {
    895900      // Get minimal expansions of the phrase
    896901      getExpansions(p, result);
     
    10001005
    10011006      // Ignore the phrase if it only occurs once
    1002       if (p.suffixFrequency < 2)
     1007      if (p.suffixFrequency < minOccurs)
    10031008    continue;
    10041009
  • trunk/gsdl/src/phind/generate/suffix.h

    r2867 r8362  
    6868// Are we allowed to terminate a phrase on a stopword?
    6969extern int phraseMode;
     70extern int minOccurs;
    7071
    7172#define ANYPHRASE 0
Note: See TracChangeset for help on using the changeset viewer.