Changeset 2064
- Timestamp: 2001-02-24T14:03:24+13:00 (23 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r2025 r2064 1138 1138 # 2. 1139 1139 # Read phrases file to find thesaurus entries that already 1140 # have a phindid. Store their phind-ids for later translation. 1140 # have a phindid. Store their phind-ids for later translation, 1141 # and store their frequency for later sorting. 1141 1142 my %thesaurustophindid; 1142 my ($phindid); 1143 my %phindidtofrequency; 1144 my ($phindid, $freq); 1143 1145 1144 1146 open(IN, "<$infile"); … … 1152 1154 $phindid = shift @fields; 1153 1155 $symbols = shift @fields; 1156 $freq = shift @fields; 1154 1157 1155 1158 # do we have a thesaurus id corresponding to this phrase? … … 1157 1160 $thesid = $symbolstothesid{$symbols}; 1158 1161 $thesaurustophindid{$thesid} = $phindid; 1162 $phindidtofrequency{$phindid} = $freq; 1159 1163 } 1160 1164 } … … 1164 1168 1165 1169 # 3. 1166 # Create phind-id numbers for remaining thesaurus entries 1170 # Create phind-id numbers for remaining thesaurus entries, 1171 # and note that their frequency is 0 for later sorting. 1167 1172 my $nextphindid = $phindid + 1; 1168 1173 … … 1177 1182 if (!defined($thesaurustophindid{$thesid})) { 1178 1183 $thesaurustophindid{$thesid} = $nextphindid; 1184 $phindidtofrequency{$nextphindid} = 0; 1179 1185 $nextphindid++; 1180 1186 } … … 1186 1192 # phind-id numbers. 1187 1193 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid"); 1188 my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link); 1194 my ($relations, $linkcounter, $linktext, $linktype, @linkdata); 1195 my (@links, $linkid, %linkidtotype, $newrelation); 1189 1196 1190 1197 open(TH, "<$thesaurusfile"); … … 1203 1210 1204 1211 # convert each part of the relation string to use phind-id numbers 1205 $newrelation = ""; 1206 $linkcounter = 0; 1212 # at the same time, we want to sort the list by frequency. 
1213 undef %linkidtotype; 1214 1207 1215 foreach $linktext (split(/;/, $relations)) { 1208 1216 @linkdata = split(/,/, $linktext); 1209 1217 1210 1218 # remember the linktype (e.g. BT, NT) 1211 1219 $linktype = shift @linkdata; 1212 $newrelation .= "$linktype,"; 1213 1214 # convert the link target identfiers 1215 foreach $link (@linkdata) { 1216 die unless (defined($thesaurustophindid{$link})); 1217 $newrelation .= "$thesaurustophindid{$link},"; 1218 $linkcounter++; 1220 1221 # store the type of each link 1222 foreach $thesid (@linkdata) { 1223 die unless (defined($thesaurustophindid{$thesid})); 1224 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype; 1219 1225 } 1220 $newrelation =~ s/\,$//; 1221 $newrelation .= ";"; 1222 } 1223 $newrelation .= ":"; 1224 1225 print TO "$phindid:$symbols:$linkcounter:$newrelation\n"; 1226 } 1227 1228 # sort the list of links, first by frequency, then by type. 1229 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a}) 1230 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype); 1231 $linkcounter = (scalar @links); 1232 1233 # create a string describing the link information 1234 $linktype = $linkidtotype{$links[0]}; 1235 $newrelation = $linktype; 1236 foreach $linkid (@links) { 1237 if ($linkidtotype{$linkid} ne $linktype) { 1238 $linktype = $linkidtotype{$linkid}; 1239 $newrelation .= ";" . $linktype; 1240 } 1241 $newrelation .= "," . $linkid; 1242 } 1243 $newrelation .= ";"; 1244 1245 1246 # output the new line 1247 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n"; 1226 1248 } 1227 1249 close TH; … … 1229 1251 1230 1252 undef %thesaurustophindid; 1253 undef %linkidtotype; 1254 undef %phindidtofrequency; 1231 1255 1232 1256 # 5.
Note: See TracChangeset for help on using the changeset viewer.