Changeset 2064


Ignore:
Timestamp:
2001-02-24T14:03:24+13:00 (23 years ago)
Author:
paynter
Message:

Sort thesaurus phrases by frequency then type.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r2025 r2064  
    1138 1138    # 2.
    1139 1139    # Read phrases file to find thesaurus entries that already
    1140     # have a phindid.  Store their phind-ids for later translation.
     1140    # have a phindid.  Store their phind-ids for later translation,
     1141    # and store their frequency for later sorting.
    1141 1142    my %thesaurustophindid;
    1142     my ($phindid);
     1143    my %phindidtofrequency;
     1144    my ($phindid, $freq);
    1143 1145
    1144 1146    open(IN, "<$infile");
     
    1152 1154    $phindid = shift @fields;
    1153 1155    $symbols = shift @fields;
     1156    $freq = shift @fields;
    1154 1157
    1155 1158    # do we have a thesaurus id corresponding to this phrase?
     
    1157 1160        $thesid = $symbolstothesid{$symbols};
    1158 1161        $thesaurustophindid{$thesid} = $phindid;
     1162        $phindidtofrequency{$phindid} = $freq;
    1159 1163    }
    1160 1164    }
     
    1164 1168
    1165 1169    # 3.
    1166     # Create phind-id numbers for remaining thesaurus entries
     1170    # Create phind-id numbers for remaining thesaurus entries,
     1171    # and note that their frequency is 0 for later sorting.
    1167 1172    my $nextphindid = $phindid + 1;
    1168 1173
     
    1177 1182    if (!defined($thesaurustophindid{$thesid})) {
    1178 1183        $thesaurustophindid{$thesid} = $nextphindid;
     1184        $phindidtofrequency{$nextphindid} = 0;
    1179 1185        $nextphindid++;
    1180 1186    }
     
    1186 1192    # phind-id numbers.
    1187 1193    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
    1188     my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
     1194    my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
     1195    my (@links, $linkid, %linkidtotype, $newrelation);
    1189 1196
    1190 1197    open(TH, "<$thesaurusfile");
     
    1203 1210
    1204 1211    # convert each part of the relation string to use phind-id numbers
    1205     $newrelation = "";
    1206     $linkcounter = 0;
     1212    # at the same time, we want to sort the list by frequency.
     1213    undef %linkidtotype;
     1214   
    1207 1215    foreach $linktext (split(/;/, $relations)) {
    1208 1216        @linkdata = split(/,/, $linktext);
    1209 
     1217       
    1210 1218        # remember the linktype (e.g. BT, NT)
    1211 1219        $linktype = shift @linkdata;
    1212         $newrelation .= "$linktype,";
    1213 
    1214         # convert the link target identfiers
    1215         foreach $link (@linkdata) {
    1216         die unless (defined($thesaurustophindid{$link}));
    1217         $newrelation .= "$thesaurustophindid{$link},";
    1218         $linkcounter++;
     1220       
     1221        # store the type of each link
     1222        foreach $thesid (@linkdata) {
     1223        die unless (defined($thesaurustophindid{$thesid}));
     1224        $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
    1219 1225        }
    1220         $newrelation =~ s/\,$//;
    1221         $newrelation .= ";";
    1222     }
    1223     $newrelation .= ":";
    1224 
    1225     print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
     1226    }
     1227
     1228    # sort the list of links, first by frequency, then by type.
     1229    @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
     1230                        or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
     1231    $linkcounter = (scalar @links);
     1232
     1233    # create a string describing the link information
     1234    $linktype = $linkidtotype{$links[0]};
     1235    $newrelation = $linktype;
     1236    foreach $linkid (@links) {
     1237        if ($linkidtotype{$linkid} ne $linktype) {
     1238        $linktype = $linkidtotype{$linkid};
     1239        $newrelation .= ";" . $linktype;
     1240        }
     1241        $newrelation .= "," . $linkid;
     1242    }
     1243    $newrelation .= ";";
     1244   
     1245
     1246    # output the new line
     1247    print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
    1226 1248    }
    1227 1249    close TH;
     
    1229 1251
    1230 1252    undef %thesaurustophindid;
     1253    undef %linkidtotype;
     1254    undef %phindidtofrequency;
    1231 1255
    1232 1256    # 5.
Note: See TracChangeset for help on using the changeset viewer.