Changeset 8362
- Timestamp:
- 2004-10-18T15:30:40+13:00 (20 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/Phind.pm
r6989 r8362 113 113 'type' => "int", 114 114 'deft' => "1", 115 'range' => "0,1", 116 'reqd' => "no" }, 117 { 'name' => "min_occurs", 118 'desc' => "{Phind.min_occurs}", 119 'type' => "int", 120 'deft' => "2", 121 'range' => "1,", 115 122 'reqd' => "no" }, 116 123 { 'name' => "thesaurus", … … 176 183 q^savephrases/.*/^, \$self->{'savephrases'}, 177 184 q^suffixmode/\d/1^, \$self->{'suffixmode'}, 185 q^min_occurs/\d/2^, \$self->{'min_occurs'}, 178 186 q^thesaurus/.*/^, \$self->{'thesaurus'}, 179 187 q^untidy^, \$self->{'untidy'}, … … 288 296 print $txthandle "$doclimit\n"; 289 297 290 # itera rate over the required indexes and store their text298 # iterate over the required indexes and store their text 291 299 my $indexes = $self->{'indexes'}; 292 300 my $text = ""; … … 379 387 # Construct phind indexes 380 388 my $suffixmode = $self->{'suffixmode'}; 389 my $min_occurs = $self->{'min_occurs'}; 381 390 my ($command, $status); 382 391 … … 390 399 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity; 391 400 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli; 392 &execute("suffix \"$phinddir\" $suffixmode $ verbosity", $verbosity, $out);401 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out); 393 402 394 403 # check that we generated some files. It's not necessarily an error if … … 1399 1408 # 1400 1409 # Read phrases.3 and restore vocabulary information. Then write 1401 # this data to the MGPP input files (pw rod.txt and pdata.txt) and1410 # this data to the MGPP input files (pword.txt and pdata.txt) and 1402 1411 # (if requested) to the saved phrases file. 1403 1412 -
trunk/gsdl/perllib/strings.rb
r8361 r8362 368 368 Phind.language:Language or languages to use building hierarchy. Languages are identified by two-letter country codes like en (English), es (Spanish), and fr (French). Language is a regular expression, so 'en|fr' (English or French) and '..' (match any language) are valid. 369 369 370 Phind.min_occurs:The minimum number of times a phrase must appear in the text to be included in the phrase hierarchy. 371 370 372 Phind.savephrases:If set, the phrase information will be stored in the given file as text. It is probably a good idea to use an absolute path. 371 373 -
trunk/gsdl/src/phind/generate/phrase.cpp
r2839 r8362 46 46 47 47 // Phrase constructor functions 48 49 48 50 49 Phrase::Phrase(symbol *words, cellcount size, int direction) { … … 659 658 660 659 // If the expansion occurs more than once and is not delimited, expand it 661 if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {660 if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= minOccurs)) { 662 661 next.expandWhileUniqueSuffixExtension(); 663 662 results.push_back(next); … … 683 682 684 683 // If the expansion occurs more than once and is not delimited, expand it 685 if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {684 if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= minOccurs)) { 686 685 next.expandWhileUniquePrefixExtension(); 687 686 results.push_back(next); … … 751 750 752 751 // if the phrase occurs only once, do nothing 753 if (suffixFrequency < 2)752 if (suffixFrequency < minOccurs) 754 753 return 0; 755 754 … … 794 793 795 794 // if the phrase occurs only once, do nothing 796 if (prefixFrequency < 2)795 if (prefixFrequency < minOccurs) 797 796 return 0; 798 797 -
trunk/gsdl/src/phind/generate/suffix.cpp
r3245 r8362 72 72 // Do we accept any phrase, or do we eliminate those ending with stopwords ? 73 73 int phraseMode = ANYPHRASE; //STOPWORDS; 74 74 // What is the minimum phrase frequency for a phrase to be included in the hierarchy 75 int minOccurs = 2; 75 76 76 77 // The filestem of the collection's phindex directory … … 718 719 // Command-line arguments 719 720 // argv[1] is the phindex directory 720 // argv[2] is the maximum array symbol length (optional) 721 // argv[3] is the mode, where 1 is stopword mode (optional) 722 if (argc < 2) { 723 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl; 721 // argv[2] is the mode, where 1 is stopword mode 722 // argv[3] is the min_occurs, - minimum occurrence frequency for a phrase to be included in the hierarchy 723 // argv[4] is optional verbosity 724 if (argc < 4) { 725 cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl; 724 726 exit(1); 725 727 } … … 732 734 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE)); 733 735 736 minOccurs = atoi(argv[3]); 737 assert((minOccurs > 0)); 738 734 739 // optional verbosity parameter 735 if (argc == 4) {736 verbosity = atoi(argv[ 3]);740 if (argc == 5) { 741 verbosity = atoi(argv[4]); 737 742 assert (verbosity >= 0); 738 743 } … … 892 897 // The system used to work like this; it is easy to implement, but 893 898 // it explodes the size of the indexes. So: would it be useful? 894 if (p.suffixFrequency > 1) {899 if (p.suffixFrequency >= minOccurs) { 895 900 // Get minimal expansions of the phrase 896 901 getExpansions(p, result); … … 1000 1005 1001 1006 // Ignore the phrase if it only occurs once 1002 if (p.suffixFrequency < 2)1007 if (p.suffixFrequency < minOccurs) 1003 1008 continue; 1004 1009 -
trunk/gsdl/src/phind/generate/suffix.h
r2867 r8362 68 68 // Are we allowed to terminate a phrase on a stopword? 69 69 extern int phraseMode; 70 extern int minOccurs; 70 71 71 72 #define ANYPHRASE 0
Note:
See TracChangeset
for help on using the changeset viewer.