- Timestamp:
- 2001-07-23T16:12:40+12:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r2658 r2666 291 291 292 292 # Extract a metadata field from a document 293 # (If ther eis more than one element of the given type, get them all.)293 # (If there is more than one element of the given type, get them all.) 294 294 elsif ($level eq "document") { 295 295 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field); … … 422 422 my ($language_exp, $text) = @_; 423 423 424 # escape any magic words... - jrm21 425 foreach my $delim (@delimiters) { 426 my $replacement=lc($delim); 427 my $num= $text=~ s/$delim/$replacement/g; 428 if (!$num) {$num=0;} 429 } 430 424 431 if ($language_exp =~ /en/) { 425 432 return &convert_gml_to_tokens_EN($text); … … 689 696 # Read words in the text and count occurences 690 697 open(TXT, "<$phinddir/clauses"); 698 691 699 my @words; 692 693 700 while(<TXT>) { 694 701 $line = $_; … … 726 733 } 727 734 } 728 729 735 undef %freq; 730 736 undef %bestfreq; … … 740 746 foreach $word (@delimiters) { 741 747 742 $word = lc($word); 743 $bestform{$word} = uc($word); 748 # $word = lc($word); # jrm21 749 $word = uc($word); 750 $bestform{$word} = $word; 744 751 $vocab[$nextsymbol] = $word; 745 752 $symbol{$word} = $nextsymbol; … … 747 754 } 748 755 $last_delimiter = $nextsymbol - 1; 749 750 756 # Stopwords 751 757 $first_stopword = $nextsymbol; 752 758 753 759 foreach my $word (sort keys %stopwords) { 754 755 # don't incluse stopword unless it occurs in the text 760 # don't include stopword unless it occurs in the text 756 761 $word = lc($word); 757 762 next unless ($totalfreq{$word}); … … 797 802 $last_contentword = $nextsymbol - 1; 798 803 799 800 804 # Outut the words 801 805 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1); … … 847 851 open(NUM, ">$phinddir/clauses.numbers"); 848 852 849 $phrasedelimiter = $symbol{lc($senlimit)}; 850 print NUM "$symbol{lc($colstart)}\n"; 853 ## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21 854 ## print NUM "$symbol{lc($colstart)}\n"; # jrm21 855 
$phrasedelimiter = $symbol{$senlimit}; 856 print NUM "$symbol{$colstart}\n"; 851 857 852 858 # set up the special symbols that delimit documents and sentences … … 860 866 # output one token at a time 861 867 foreach $word (@words) { 862 $word = lc($word); 868 # don't lower-case special delimiters - jrm21 869 if (!map {if ($word eq $_) {1} else {()}} @delimiters) { 870 $word = lc($word); 871 } 863 872 print NUM "$symbol{$word}\n"; 864 873 } … … 869 878 870 879 close TXT; 871 print NUM "$symbol{lc($colend)}\n"; 880 # print NUM "$symbol{lc($colend)}\n";# jrm21 881 print NUM "$symbol{$colend}\n"; 872 882 close NUM; 873 883
Note: See TracChangeset for help on using the changeset viewer.