Context Navigation

← Previous Change
Next Change →

Changeset 2666 for trunk

Timestamp:

2001-07-23T16:12:40+12:00 (23 years ago)

Author:

jrm21

Message:

Modified phind classifier so that special delimiters are always upper-case,
and if they appear in the text they are changed to lower-case.
(e.g. COLLECTIONSTART etc. appearing in a source file is probably fairly unlikely,
but it could happen - for example, if someone were to build a collection that
consisted of the greenstone source code?? :)

File:

: 1 edited

trunk/gsdl/perllib/classify/phind.pm (modified) (10 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/classify/phind.pm

-              r2658
+              r2666
     # Extract a metadata field from a document
     # (If ther eis more than one element of the given type, get them all.)
+    # (If there is more than one element of the given type, get them all.)
     elsif ($level eq "document") {
         $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
 …
     my ($language_exp, $text) = @_;
+    # escape any magic words... - jrm21
+    foreach my $delim (@delimiters) {
+    my $replacement=lc($delim);
+    my $num= $text=~ s/$delim/$replacement/g;
+    if (!$num) {$num=0;}
+    }
     if ($language_exp =~ /en/) {
     return &convert_gml_to_tokens_EN($text);
 …
     # Read words in the text and count occurrences
     open(TXT, "<$phinddir/clauses");
     my @words;
     while(<TXT>) {
     $line = $_;
 …
+    }
+    }
     undef %freq;
     undef %bestfreq;
 …
     foreach $word (@delimiters) {
+    $word = lc($word);
+    $bestform{$word} = uc($word);
+#   $word = lc($word); # jrm21
+    $word = uc($word);
+    $bestform{$word} = $word;
     $vocab[$nextsymbol] = $word;
     $symbol{$word} = $nextsymbol;
 …
+    }
     $last_delimiter = $nextsymbol - 1;
     # Stopwords
     $first_stopword = $nextsymbol;
     foreach my $word (sort keys %stopwords) {
+    # don't incluse stopword unless it occurs in the text
+    # don't include stopword unless it occurs in the text
     $word = lc($word);
     next unless ($totalfreq{$word});
 …
     $last_contentword = $nextsymbol - 1;
     # Output the words
     print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
 …
     open(NUM, ">$phinddir/clauses.numbers");
+    $phrasedelimiter = $symbol{lc($senlimit)};
+    print NUM "$symbol{lc($colstart)}\n";
+##    $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
+##    print NUM "$symbol{lc($colstart)}\n"; # jrm21
+    $phrasedelimiter = $symbol{$senlimit};
+    print NUM "$symbol{$colstart}\n";
     # set up the special symbols that delimit documents and sentences
 …
     # output one token at a time
     foreach $word (@words) {
+        $word = lc($word);
+# don't lower-case special delimiters - jrm21
+        if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
+        $word = lc($word);
+        }
         print NUM "$symbol{$word}\n";
+    }
 …
     close TXT;
+    print NUM "$symbol{lc($colend)}\n";
+#    print NUM "$symbol{lc($colend)}\n";# jrm21
+    print NUM "$symbol{$colend}\n";
     close NUM;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 2666 for trunk

Legend:

trunk/gsdl/perllib/classify/phind.pm

Download in other formats: