Changeset 8362 for trunk/gsdl/src/phind/generate
- Timestamp:
- 2004-10-18T15:30:40+13:00 (20 years ago)
- Location:
- trunk/gsdl/src/phind/generate
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/phind/generate/phrase.cpp
r2839 r8362 46 46 47 47 // Phrase constructor functions 48 49 48 50 49 Phrase::Phrase(symbol *words, cellcount size, int direction) { … … 659 658 660 659 // If the expansion occurs more than once and is not delimited, expand it 661 if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {660 if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= minOccurs)) { 662 661 next.expandWhileUniqueSuffixExtension(); 663 662 results.push_back(next); … … 683 682 684 683 // If the expansion occurs more than once and is not delimited, expand it 685 if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {684 if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= minOccurs)) { 686 685 next.expandWhileUniquePrefixExtension(); 687 686 results.push_back(next); … … 751 750 752 751 // if the phrase occurs only once, do nothing 753 if (suffixFrequency < 2)752 if (suffixFrequency < minOccurs) 754 753 return 0; 755 754 … … 794 793 795 794 // if the phrase occurs only once, do nothing 796 if (prefixFrequency < 2)795 if (prefixFrequency < minOccurs) 797 796 return 0; 798 797 -
trunk/gsdl/src/phind/generate/suffix.cpp
r3245 r8362 72 72 // Do we accept any phrase, or do we eliminate those ending with stopwords ? 73 73 int phraseMode = ANYPHRASE; //STOPWORDS; 74 74 // What is the minimum phrase frequency for a phrase to be included in the hierarchy 75 int minOccurs = 2; 75 76 76 77 // The filestem of the collection's phindex directory … … 718 719 // Command-line arguments 719 720 // argv[1] is the phindex directory 720 // argv[2] is the maximum array symbol length (optional) 721 // argv[3] is the mode, where 1 is stopword mode (optional) 722 if (argc < 2) { 723 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl; 721 // argv[2] is the mode, where 1 is stopword mode 722 // argv[3] is the min_occurs, - minimum occurrence frequency for a phrase to be included in the hierarchy 723 // argv[4] is optional verbosity 724 if (argc < 4) { 725 cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl; 724 726 exit(1); 725 727 } … … 732 734 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE)); 733 735 736 minOccurs = atoi(argv[3]); 737 assert((minOccurs > 0)); 738 734 739 // optional verbosity parameter 735 if (argc == 4) {736 verbosity = atoi(argv[ 3]);740 if (argc == 5) { 741 verbosity = atoi(argv[4]); 737 742 assert (verbosity >= 0); 738 743 } … … 892 897 // The system used to work like this; it is easy to implement, but 893 898 // it explodes the size of the indexes. So: would it be useful? 894 if (p.suffixFrequency > 1) {899 if (p.suffixFrequency >= minOccurs) { 895 900 // Get minimal expansions of the phrase 896 901 getExpansions(p, result); … … 1000 1005 1001 1006 // Ignore the phrase if it only occurs once 1002 if (p.suffixFrequency < 2)1007 if (p.suffixFrequency < minOccurs) 1003 1008 continue; 1004 1009 -
trunk/gsdl/src/phind/generate/suffix.h
r2867 r8362 68 68 // Are we allowed to terminate a phrase on a stopword? 69 69 extern int phraseMode; 70 extern int minOccurs; 70 71 71 72 #define ANYPHRASE 0
Note:
See TracChangeset
for help on using the changeset viewer.