Changeset 2666 for trunk


Ignore:
Timestamp:
2001-07-23T16:12:40+12:00 (23 years ago)
Author:
jrm21
Message:

Modified the phind classifier so that its special delimiters are always upper-case,
and if they appear in the text they are changed to lower-case.
(E.g. COLLECTIONSTART etc. appearing in a source file is probably fairly unlikely,
but it could happen - for example, if someone were to build a collection that
consisted of the Greenstone source code?? :)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r2658 r2666  
    291291   
    292292    # Extract a metadata field from a document
    293     # (If ther eis more than one element of the given type, get them all.)
     293    # (If there is more than one element of the given type, get them all.)
    294294    elsif ($level eq "document") {
    295295        $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
     
    422422    my ($language_exp, $text) = @_;
    423423
     424    # escape any magic words... - jrm21
     425    foreach my $delim (@delimiters) {
     426    my $replacement=lc($delim);
     427    my $num= $text=~ s/$delim/$replacement/g;
     428    if (!$num) {$num=0;}
     429    }
     430
    424431    if ($language_exp =~ /en/) {
    425432    return &convert_gml_to_tokens_EN($text);
     
    689696    # Read words in the text and count occurrences
    690697    open(TXT, "<$phinddir/clauses");
     698
    691699    my @words;
    692    
    693700    while(<TXT>) {
    694701    $line = $_;
     
    726733    }
    727734    }
    728    
    729735    undef %freq;
    730736    undef %bestfreq;
     
    740746    foreach $word (@delimiters) {
    741747
    742     $word = lc($word);
    743     $bestform{$word} = uc($word);
     748#   $word = lc($word); # jrm21
     749    $word = uc($word);
     750    $bestform{$word} = $word;
    744751    $vocab[$nextsymbol] = $word;
    745752    $symbol{$word} = $nextsymbol;
     
    747754    }
    748755    $last_delimiter = $nextsymbol - 1;
    749    
    750756    # Stopwords
    751757    $first_stopword = $nextsymbol;
    752758   
    753759    foreach my $word (sort keys %stopwords) {
    754    
    755     # don't incluse stopword unless it occurs in the text
     760    # don't include stopword unless it occurs in the text
    756761    $word = lc($word);
    757762    next unless ($totalfreq{$word});
     
    797802    $last_contentword = $nextsymbol - 1;
    798803   
    799 
    800804    # Output the words
    801805    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
     
    847851    open(NUM, ">$phinddir/clauses.numbers");
    848852   
    849     $phrasedelimiter = $symbol{lc($senlimit)};
    850     print NUM "$symbol{lc($colstart)}\n";
     853##    $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
     854##    print NUM "$symbol{lc($colstart)}\n"; # jrm21
     855    $phrasedelimiter = $symbol{$senlimit};
     856    print NUM "$symbol{$colstart}\n";
    851857   
    852858    # set up the special symbols that delimit documents and sentences
     
    860866    # output one token at a time
    861867    foreach $word (@words) {
    862         $word = lc($word);
     868# don't lower-case special delimiters - jrm21
     869        if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
     870        $word = lc($word);
     871        }
    863872        print NUM "$symbol{$word}\n";
    864873    }
     
    869878   
    870879    close TXT;
    871     print NUM "$symbol{lc($colend)}\n";
     880#    print NUM "$symbol{lc($colend)}\n";# jrm21
     881    print NUM "$symbol{$colend}\n";
    872882    close NUM;
    873883
Note: See TracChangeset for help on using the changeset viewer.