Changeset 1829
- Timestamp:
- 2001-01-11T10:12:20+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r1808 r1829 43 43 # savephrases=filename If set, phrase infomation will be stored in filename 44 44 # as text. (By defualt, it is not set.) 45 # thesaurus=name Name of a thesaurus stred in phind format in etc dir. 45 46 46 47 # How a classifier works. … … 156 157 my $suffixmode = 1; 157 158 my $suffixsize = 40000000; 158 my $savephrases = "";159 my $savephrases = 0; 159 160 160 161 my $verbosity = 2; 161 162 my $untidy = 0; 163 164 my $thesaurus = ""; 162 165 163 166 # parse the options … … 182 185 } elsif ($option =~ /^suffixmode=(.*)$/i) { 183 186 $suffixmode = $1; 187 } elsif ($option =~ /^thesaurus=(.*)$/i) { 188 $thesaurus = $1; 184 189 } elsif ($option =~ /^untidy/i) { 185 190 $untidy = 1; … … 198 203 $self->{'suffixmode'} = $suffixmode; 199 204 $self->{'suffixsize'} = $suffixsize; 200 $self->{'savephrases'} = $savephrases if ($savephrases); 205 $self->{'savephrases'} = $savephrases; 206 $self->{'thesaurus'} = $thesaurus; 201 207 202 208 # limit languages … … 205 211 $self->{'delimiter'} = $delimiter; 206 212 213 # collection directory 214 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; 215 207 216 # build directory 208 217 if (!$builddir) { … … 220 229 $self->{'verbosity'} = $verbosity; 221 230 $self->{'untidy'} = $untidy; 231 $self->{'out'} = $out; 222 232 223 233 return bless $self, $class; … … 377 387 # from the clauses file 378 388 print "\nExtracting vocabulary and statistics\n" if $verbosity; 379 &extract_vocabulary($ phinddir, $language, $verbosity);389 &extract_vocabulary($self); 380 390 381 391 # Use the suffix program to generate the phind/phrases file … … 568 578 569 579 sub extract_vocabulary { 570 my ($phind_dir, $language, $verbosity) = @_; 580 my ($self) = @_; 581 582 my $verbosity = $self->{'verbosity'}; 583 my $out = $self->{'out'}; 584 585 my $language = "english"; # $self->{'language'}; 586 587 my $collectiondir = $self->{'collectiondir'}; 588 589 my $phinddir = $self->{'phinddir'}; 571 590 572 591 my ($w, $l, $line, $word); 573 592 574 593 my ($first_delimiter, $last_delimiter, 575 594 $first_stopword, $last_stopword, … … 577 596 $first_contentword, $last_contentword, 578 597 $phrasedelimiter); 579 580 my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword); 581 598 599 my $thesaurus = $self->{'thesaurus'}; 600 my ($thesaurus_links, $thesaurus_terms, 601 %thesaurus, $first_thesaurusword, $last_thesaurusword); 582 602 583 603 my %symbol; 584 604 my (%freq); 585 605 586 print "Calculating vocabulary\n" if ($verbosity > 1);606 print $out "Calculating vocabulary\n" if ($verbosity > 1); 587 607 588 608 # Read and store the stopwords 589 609 my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`; 590 610 my %stopwords; 591 foreach my$w (split(/\s+/, $words)) {611 foreach $w (split(/\s+/, $words)) { 592 612 $l = lc($w); 593 613 $stopwords{$l} = $w; 594 614 } 595 615 596 # Read and store the thesaurus terms 597 $use_thesaurus = 0; 598 my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind", 599 "$language", "agrovoc.lex"); 600 if (-e "$lex_file") { 601 open(TH, "<$lex_file"); 616 # Read thesaurus information 617 if ($thesaurus) { 618 619 # Ensure both link and term files exist 620 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk"); 621 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links"); 622 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN"); 623 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms"); 624 625 # Read the thesaurus terms 626 open(TH, "<$thesaurus_terms"); 602 627 while(<TH>) { 603 628 s/^\d+ //; 604 629 s/\(.*\)//; 605 foreach my$w (split(/\s+/, $_)) {630 foreach $w (split(/\s+/, $_)) { 606 631 $thesaurus{lc($w)} = $w; 607 632 } 608 633 } 609 634 close TH; 610 $use_thesaurus = 1;611 635 } 612 636 613 637 # Read words in the text and count occurences 614 open(TXT, "<$phind _dir/clauses");638 open(TXT, "<$phinddir/clauses"); 615 639 my @words; 616 640 … … 688 712 689 713 # Thesaurus terms 690 if ($ use_thesaurus) {714 if ($thesaurus) { 691 715 $first_thesaurusword = $nextsymbol; 692 716 … … 721 745 722 746 # Outut the words 723 print "Saving vocabulary in $phind_dir/clauses.vocab\n" if ($verbosity > 1);724 open(VOC, ">$phind _dir/clauses.vocab");747 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1); 748 open(VOC, ">$phinddir/clauses.vocab"); 725 749 726 750 for (my $i = 1; $i < $nextsymbol; $i++) { … … 733 757 734 758 759 # Create statistics file 735 760 # Output statistics about the vocablary 736 print "Saving statistics in $phind_dir/clauses.stats\n" if ($verbosity > 1); 737 &util::rm("$phind_dir/clauses.stats") if (-e "$phind_dir/clauses.stats"); 738 open(STAT, ">$phind_dir/clauses.stats") 739 || die "Cannot open $phind_dir/clauses.stats: $!"; 761 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1); 762 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats"); 763 764 open(STAT, ">$phinddir/clauses.stats") 765 || die "Cannot open $phinddir/clauses.stats: $!"; 740 766 741 767 print STAT "first_delimiter $first_delimiter\n"; … … 743 769 print STAT "first_stopword $first_stopword\n"; 744 770 print STAT "last_stopword $last_stopword\n"; 745 if ($ use_thesaurus) {771 if ($thesaurus) { 746 772 print STAT "first_thesaurusword $first_thesaurusword\n"; 747 773 print STAT "last_thesaurusword $last_thesaurusword\n"; … … 760 786 761 787 788 # Create numbers file 762 789 # Save text as symbol numbers 763 print "Saving text as numbers in $phind_dir/clauses.numbers\n" if ($verbosity > 1);764 765 open(TXT, "<$phind _dir/clauses");766 open(NUM, ">$phind _dir/clauses.numbers");790 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1); 791 792 open(TXT, "<$phinddir/clauses"); 793 open(NUM, ">$phinddir/clauses.numbers"); 767 794 768 795 $phrasedelimiter = $symbol{lc($senlimit)}; … … 788 815 789 816 print NUM "$symbol{lc($colend)}\n"; 817 close NUM; 818 819 # Save thesaurus data in one convienient file 820 if ($thesaurus) { 821 822 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers"); 823 824 825 print $out "Saving thesaurus as numbers in $thesaurusfile\n" 826 if ($verbosity > 1); 827 828 # Read the thesaurus terms 829 my ($num, $text, %thes_symbols); 830 831 open(TH, "<$thesaurus_terms"); 832 while(<TH>) { 833 chomp; 834 @words = split(/\s+/, $_); 835 $num = shift @words; 836 $text = ""; 837 838 # translate words into symbol numbers 839 foreach $word (@words) { 840 $word = lc($word); 841 if ($symbol{$word}) { 842 $text .= "s$symbol{$word} "; 843 } elsif ($verbosity) { 844 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n"; 845 } 846 } 847 $text =~ s/ $//; 848 $thes_symbols{$num} = $text; 849 } 850 close TH; 851 852 # Read the thesaurus links and write the corresponding data 853 open(TH, "<$thesaurus_links"); 854 open(THOUT, ">$thesaurusfile"); 855 856 while(<TH>) { 857 chomp; 858 ($num, $text) = split(/:/, $_); 859 860 if (defined($thes_symbols{$num})) { 861 print THOUT "$num:$thes_symbols{$num}:$text\n"; 862 } else { 863 print THOUT "$num:untranslated:$text\n"; 864 } 865 } 866 close TH; 867 close THOUT; 868 } 869 870 871 790 872 791 873 } 792 874 793 875 794 # Prepare the phrases file to be input to mgpp. 795 # This means renumbering the phrases in order of decreasing frequency. 796 797 798 # This is legacy code, and a little ugly, and may be unix-specific 799 # (particularly the sort command). 876 # renumber_phrases 877 # 878 # Prepare the phrases file to be input to mgpp. The biggest problem is 879 # reconciling the phrase identifiers used by the suffix program (which 880 # we'll call suffix-id numbers) with the numbers used in the thesaurus 881 # (theesaurus-id) to create a ciommon set of phind id numbers (phind-id). 882 # Phind-id numbers must be sorted by frequency of occurance. 883 # 884 # Start creating a set of phind-id numbers from the sorted suffix-id 885 # numbers and (if required) the thesaurus-id numbers. Then add any other 886 # phrases occuring in the thesaurus. 887 # 888 # The last thing we have to do is restore the vocabulary information to the 889 # phrase file so that the phrases are stored as words, not as symbol 890 # numbers. 891 892 # The original phrases file looks something like this: 893 # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254 894 # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380 895 # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374 896 # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361 897 800 898 801 899 sub renumber_phrases { 802 my $self = shift (@_); 803 900 my ($self) = @_; 901 902 renumber_suffix_data($self); 903 renumber_thesaurus_data($self); 904 restore_vocabulary_data($self); 905 906 } 907 908 909 910 # renumber_suffix_data 911 # 912 # Translate phrases file to phrases.2 using phind keys instead 913 # of suffix keys and sorting the expansion data. 914 915 sub renumber_suffix_data { 916 my ($self) = @_; 917 804 918 my $verbosity = $self->{'verbosity'}; 805 my $phind_dir = $self->{'phinddir'}; 806 807 my $savephrases = 0; 808 $savephrases = $self->{'savephrases'} if (defined($self->{'savephrases'})); 809 810 811 812 # Sort the phrases into order of increasing frequency 813 # This means the expansions will be sorted correctly later on. 814 print "Sorting phrases into freq order\n" if ($verbosity); 815 system("sort -rnt ':' +2 -o $phind_dir/phrases $phind_dir/phrases"); 816 817 # Read the vocabulary 919 my $out = $self->{'out'}; 920 print $out "Translate phrases: suffix-ids become phind-id's\n" 921 if ($verbosity); 922 923 my $phinddir = $self->{'phinddir'}; 924 my $infile = &util::filename_cat($phinddir, 'phrases'); 925 my $outfile = &util::filename_cat($phinddir, 'phrases.2'); 926 927 # Read the phrase file. Calculate initial set of phind-id 928 # numbers and store (suffixid -> frequency) relation. 929 930 my %suffixtophind; 931 my @phindfrequency; 932 my (@fields, $suffixid); 933 my $nextphind = 1; 934 935 open(IN, "<$infile"); 936 while(<IN>) { 937 938 chomp; 939 @fields = split(/:/, $_); 940 941 # get next suffixid and phindid 942 $suffixid = shift @fields; 943 $suffixtophind{$suffixid} = $nextphind; 944 945 # store total frequency 946 shift @fields; 947 $totalfrequency[$nextphind] = shift @fields; 948 949 $nextphind++; 950 } 951 close IN; 952 953 954 # Translate phrases file to phrases.2. Use phind keys (not suffix 955 # keys), sort expansion and document occurance data in order of 956 # descending frequency.. 957 open(IN, "<$infile"); 958 open(OUT, ">$outfile"); 959 960 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents); 961 my (@documwents, @newexp, $k, $n); 962 my $linenumber = 0; 963 964 while(<IN>) { 965 966 # read the line 967 chomp; 968 @fields = split(/:/, $_); 969 970 # get a phrase number for this line 971 $suffixid = shift @fields; 972 die unless (defined($suffixtophind{$suffixid})); 973 $phindid = $suffixtophind{$suffixid}; 974 975 # get the symbols in the phrase 976 $text = shift @fields; 977 978 # output status information 979 $linenumber++; 980 if ($verbosity > 2) { 981 if ($linenumber % 1000 == 0) { 982 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n"; 983 } 984 print $out "$num: $key\t($text)\n" if ($verbosity > 3); 985 } 986 987 # get the phrase frequency 988 $tf = shift @fields; 989 990 # get the number of expansions 991 $countexp = shift @fields; 992 993 # get the expansions, convert them into phind-id numbers, and sort them 994 $expansions = shift @fields; 995 @newexp = (); 996 foreach $k (split(/,/, $expansions)) { 997 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k})); 998 $n = $suffixtophind{$k}; 999 push @newexp, $n; 1000 } 1001 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp; 1002 1003 # get the number of documents 1004 $countdocs = shift @fields; 1005 1006 # get the documents and sort them 1007 $documents = shift @fields; 1008 $documents =~ s/d//g; 1009 @documents = split(/;/, $documents); 1010 @documents = sort by_doc_frequency @documents; 1011 1012 # output the phrase data 1013 print OUT "$phindid:$text:$tf:$countexp:$countdocs:"; 1014 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n"; 1015 1016 } 1017 1018 close IN; 1019 close OUT; 1020 } 1021 1022 1023 # renumber_thesaurus_data 1024 # 1025 # Translate phrases.2 to phrases.3, adding thesaurus data if available. 1026 1027 sub renumber_thesaurus_data { 1028 my ($self) = @_; 1029 1030 my $out = $self->{'out'}; 1031 my $verbosity = $self->{'verbosity'}; 1032 my $thesaurus = $self->{'thesaurus'}; 1033 1034 my $phinddir = $self->{'phinddir'}; 1035 my $infile = &util::filename_cat($phinddir, "phrases.2"); 1036 my $outfile = &util::filename_cat($phinddir, "phrases.3"); 1037 1038 1039 # If no thesaurus is defined, simply move the phrases file. 1040 if (!$thesaurus) { 1041 print $out "Translate phrases.2: no thesaurus data\n" 1042 if ($verbosity); 1043 &util::mv($infile, $outfile); 1044 return; 1045 } 1046 1047 print $out "Translate phrases.2: add thesaurus data\n" 1048 if ($verbosity); 1049 1050 # 1. 1051 # Read thesaurus file and store (symbols->thesaurusid) mapping 1052 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers"); 1053 my %symbolstothesid; 1054 my (@fields, $thesid, $symbols); 1055 1056 open(TH, "<$thesaurusfile"); 1057 1058 while (<TH>) { 1059 1060 chomp; 1061 @fields = split(/:/, $_); 1062 1063 # get id and text 1064 $thesid = shift @fields; 1065 $symbols = shift @fields; 1066 $symbolstothesid{$symbols} = $thesid; 1067 } 1068 close TH; 1069 1070 # 2. 1071 # Read phrases file and note all thesaurus entries that already 1072 # have a phindid 1073 my %thesaurustophindid; 1074 my ($phindid); 1075 1076 open(IN, "<$infile"); 1077 1078 while(<IN>) { 1079 1080 chomp; 1081 @fields = split(/:/, $_); 1082 1083 # phindid and symbols for this line 1084 $phindid = shift @fields; 1085 $symbols = shift @fields; 1086 1087 # do we have a thesaurus id corresponding to this phrase? 1088 if (defined($symbolstothesid{$symbols})) { 1089 $thesid = $symbolstothesid{$symbols}; 1090 $thesaurustophindid{$thesid} = $phindid; 1091 } 1092 } 1093 close IN; 1094 1095 undef %symbolstothesid; 1096 1097 # 3. 1098 # Create phind-id numbers for remaining thesaurus entries 1099 my $nextphindid = $phindid + 1; 1100 1101 open(TH, "<$thesaurusfile"); 1102 while(<TH>) { 1103 1104 chomp; 1105 @fields = split(/:/, $_); 1106 1107 # read thesaurus-id and ensure it has a corresponding phind-id 1108 $thesid = shift @fields; 1109 if (!defined($thesaurustophindid{$thesid})) { 1110 $thesaurustophindid{$thesid} = $nextphindid; 1111 $nextphindid++; 1112 } 1113 } 1114 close TH; 1115 1116 # 4. 1117 # Translate thesaurus file, replacing thesaurus-id numbers with 1118 # phind-id numbers. 1119 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid"); 1120 my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link); 1121 1122 open(TH, "<$thesaurusfile"); 1123 open(TO, ">$newthesaurusfile"); 1124 while(<TH>) { 1125 1126 chomp; 1127 @fields = split(/:/, $_); 1128 1129 # phindid and symbols for this line 1130 ($thesid, $symbols, $relations) = @fields; 1131 1132 die unless ($thesid && $symbols); 1133 die unless $thesaurustophindid{$thesid}; 1134 $phindid = $thesaurustophindid{$thesid}; 1135 1136 # convert each part of the relation string to use phind-id numbers 1137 $newrelation = ""; 1138 $linkcounter = 0; 1139 foreach $linktext (split(/;/, $relations)) { 1140 @linkdata = split(/,/, $linktext); 1141 1142 # remember the linktype (e.g. BT, NT) 1143 $linktype = shift @linkdata; 1144 $newrelation .= "$linktype,"; 1145 1146 # convert the link target identfiers 1147 foreach $link (@linkdata) { 1148 die unless (defined($thesaurustophindid{$link})); 1149 $newrelation .= "$thesaurustophindid{$link},"; 1150 $linkcounter++; 1151 } 1152 $newrelation =~ s/\,$//; 1153 $newrelation .= ";"; 1154 } 1155 $newrelation .= ":"; 1156 1157 print TO "$phindid:$symbols:$linkcounter:$newrelation\n"; 1158 } 1159 close TH; 1160 close TO; 1161 1162 undef %thesaurustophindid; 1163 1164 # 5. 1165 # Read thesaurus data (in phind-id format) into memory 1166 my %thesaurusdata; 1167 1168 open(TH, "<$newthesaurusfile"); 1169 while(<TH>) { 1170 chomp; 1171 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_); 1172 die unless ($phindid && $symbols); 1173 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations"; 1174 } 1175 1176 # 6. 1177 # Add thesaurus data to phrases file 1178 my ($text, $tf, $countexp, $expansions, $countdocs, $documents); 1179 my (@documwents, @newexp, $k, $n); 1180 my $linenumber = 0; 1181 1182 open(IN, "<$infile"); 1183 open(OUT, ">$outfile"); 1184 1185 # Update existing phrases 1186 while(<IN>) { 1187 1188 chomp; 1189 @fields = split(/:/, $_); 1190 1191 # get data for this line 1192 $phindid = shift @fields; 1193 1194 # output the phrase data, with thesaurus information 1195 print OUT "$phindid:", join(":", @fields); 1196 1197 # add thesaurus data 1198 if (defined($thesaurusdata{$phindid})) { 1199 @fields = split(/:/, $thesaurusdata{$phindid}); 1200 shift @fields; 1201 $linkcounter = shift @fields; 1202 $relations = shift @fields; 1203 1204 print OUT ":$linkcounter:$relations"; 1205 $thesaurusdata{$phindid} = ""; 1206 } 1207 print OUT "\n"; 1208 } 1209 close IN; 1210 1211 # Add phrases that aren't already in the file 1212 foreach $phindid (sort numerically keys %thesaurusdata) { 1213 next unless ($thesaurusdata{$phindid}); 1214 1215 @fields = split(/:/, $thesaurusdata{$phindid}); 1216 $symbols = shift @fields; 1217 $linkcounter = shift @fields; 1218 $relations = shift @fields; 1219 1220 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n"; 1221 } 1222 close OUT; 1223 1224 } 1225 1226 # restore_vocabulary_data 1227 # 1228 # Read phrases.3 and restore vocabulary information. Then write 1229 # this data to the MGPP input files (pwrod.txt and pdata.txt) and 1230 # (if requested) to the saved phrases file. 1231 1232 sub restore_vocabulary_data { 1233 my ($self) = @_; 1234 1235 my $out = $self->{'out'}; 1236 my $verbosity = $self->{'verbosity'}; 1237 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity); 1238 1239 my $phinddir = $self->{'phinddir'}; 1240 my $infile = &util::filename_cat($phinddir, 'phrases.3'); 1241 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab'); 1242 my $datafile = &util::filename_cat($phinddir, 'pdata.txt'); 1243 my $wordfile = &util::filename_cat($phinddir, 'pword.txt'); 1244 1245 my $savephrases = $self->{'savephrases'}; 1246 1247 # 1. 1248 # Read the vocabulary file 1249 open(V, "<$vocabfile") 1250 || die "Cannot open $vocabfile: $!"; 818 1251 my @symbol; 819 print "Reading the vocabulary\n" if ($verbosity);820 open(V, "<$phind_dir/clauses.vocab")821 || die "Cannot open $phind_dir/clauses.vocab: $!";822 823 1252 my $i = 1; 824 1253 while(<V>) { … … 827 1256 } 828 1257 829 # Create file for phrase data 830 # 831 # The phrases file looks something like this 832 # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254 833 # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380 834 # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374 835 # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361 836 837 # The first field on each line is a unique phrase identifier. 838 # We need to calculate phrase numbers for each phrase 839 print "Calculate phrase numbers\n" if ($verbosity); 840 841 my %phrasenumber; 842 my $nextphrase = 1; 843 my ($line); 844 845 open(IN, "<$phind_dir/phrases"); 846 while(<IN>) { 847 848 # read the line 849 chomp; 850 $line = $_; 851 852 # we're only interested in the first field 853 $line =~ s/:.*//; 854 855 # get a phrase number for this line 856 $phrasenumber{$line} = $nextphrase; 857 $nextphrase++; 858 } 859 860 861 # Now we create a new phrase file using phrase numbers, not the old IDs. 862 print "Format phrase data for MGPP\n" if ($verbosity); 863 864 # Open the basic files 865 open(IN, "<$phind_dir/phrases"); 866 open(DATA, ">$phind_dir/pdata.txt"); 867 open(IDX, ">$phind_dir/pword.txt"); 868 869 # We may want to save the phrases in a separate text file 1258 1259 # 2. 1260 # Translate phrases.3 to MGPP input files 1261 my ($key, $text, $word); 1262 my @fields; 1263 my $linenumber = 0; 1264 1265 open(IN, "<$infile"); 1266 open(DATA, ">$datafile"); 1267 open(WORD, ">$wordfile"); 1268 1269 # Save the phrases in a separate text file 870 1270 if ($savephrases) { 871 print "Saving phrases in $savephrases\n" if ($verbosity);1271 print $out "Saving phrases in $savephrases\n" if ($verbosity); 872 1272 open(SAVE, ">$savephrases"); 873 1273 } 874 875 my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);876 my @fields;877 my @documents;878 my (@newexp, $k, $n);879 880 my $linenumber = 0;881 1274 882 1275 while(<IN>) { … … 889 1282 # get a phrase number for this line 890 1283 $key = shift @fields; 891 die unless (defined($phrasenumber{$key})); 892 $num = $phrasenumber{$key}; 893 894 # get the text of the phrase 1284 1285 # restore the text of the phrase 895 1286 $text = shift @fields; 896 1287 $text =~ s/s(\d+)/$symbol[$1]/g; 897 1288 if ($text =~ / /) { 898 1289 $word = ""; 899 } els e{1290 } elsif ($text ne 'untranslated') { 900 1291 $word = $text; 901 1292 } 902 903 $linenumber++;904 if ($linenumber % 1000 == 0) {905 print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);906 }907 print "$num: $key\t($text)\n" if ($verbosity > 3);908 909 # get the phrase frequency910 $tf = shift @fields;911 912 # get the number of expansions913 $countexp = shift @fields;914 915 # get the expansions and convert them into phrase numbers916 $expansions = shift @fields;917 @newexp = ();918 foreach $k (split(/,/, $expansions)) {919 die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));920 $n = $phrasenumber{$k};921 push @newexp, $n;922 }923 @newexp = sort numerically @newexp;924 925 # get the number of documents926 $countdocs = shift @fields;927 928 # get the documents929 $documents = shift @fields;930 $documents =~ s/d//g;931 @documents = split(/;/, $documents);932 @documents = sort by_frequency @documents;933 1293 934 1294 # output the phrase data 935 1295 print DATA "<Document>"; 936 print DATA "$num:$text:$tf:$countexp:$countdocs:"; 937 print DATA join(",", @newexp), ":", join(";", @documents), "\n"; 1296 print DATA "$key:$text:", join(":", @fields), ":\n"; 938 1297 939 1298 # output the word index search data 940 print IDX"<Document>$word\n";1299 print WORD "<Document>$word\n"; 941 1300 942 1301 # output the phrases to a text file … … 944 1303 print SAVE "$tf\t$countdocs\t$text\n"; 945 1304 } 946 947 } 948 1305 } 1306 close IN; 1307 close WORD; 1308 close DATA; 949 1309 close SAVE if ($savephrases); 1310 950 1311 } 951 1312 1313 1314 952 1315 # sort routines used to renumber phrases 953 1316 954 1317 sub numerically { $a <=> $b } 955 1318 956 sub by_ frequency {1319 sub by_doc_frequency { 957 1320 my $fa = 1; 958 1321 if ($a =~ /,/) { … … 969 1332 } 970 1333 971 972 1334 1;
Note:
See TracChangeset
for help on using the changeset viewer.