Changeset 1829


Ignore:
Timestamp:
2001-01-11T10:12:20+13:00 (23 years ago)
Author:
paynter
Message:

Accept a "thesaurus=name" option that identifies a thesaurus in a
collections etc directory. In corporate the thesaurus data into the MGPP
output, including phrase entries for every thesaurus term, and thesaurus
link information encoding the thesaurus structure. At the same time I've
changed the way phrases are numbered - they are no longer sorted in order
of decreasing frequency - which has allowed me to get rid of the call to
Unix sort, which was the main impediment to a Windows version.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r1808 r1829  
    4343#   savephrases=filename  If set, phrase infomation will be stored in filename
    4444#                         as text. (By defualt, it is not set.)
     45#   thesaurus=name        Name of a thesaurus stred in phind format in etc dir.
    4546
    4647# How a classifier works. 
     
    156157    my $suffixmode = 1;
    157158    my $suffixsize = 40000000;
    158     my $savephrases = "";
     159    my $savephrases = 0;
    159160
    160161    my $verbosity = 2;
    161162    my $untidy = 0;
     163
     164    my $thesaurus = "";
    162165
    163166    # parse the options
     
    182185    } elsif ($option =~ /^suffixmode=(.*)$/i) {
    183186        $suffixmode = $1;
     187    } elsif ($option =~ /^thesaurus=(.*)$/i) {
     188        $thesaurus = $1;
    184189    } elsif ($option =~ /^untidy/i) {
    185190        $untidy = 1;
     
    198203    $self->{'suffixmode'} = $suffixmode;
    199204    $self->{'suffixsize'} = $suffixsize;
    200     $self->{'savephrases'} = $savephrases if ($savephrases);
     205    $self->{'savephrases'} = $savephrases;
     206    $self->{'thesaurus'} = $thesaurus;
    201207
    202208    # limit languages
     
    205211    $self->{'delimiter'} = $delimiter;
    206212
     213    # collection directory
     214    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
     215
    207216    # build directory
    208217    if (!$builddir) {
     
    220229    $self->{'verbosity'} = $verbosity;
    221230    $self->{'untidy'} = $untidy;
     231    $self->{'out'} = $out;
    222232
    223233    return bless $self, $class;
     
    377387    # from the clauses file
    378388    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    379     &extract_vocabulary($phinddir, $language, $verbosity);
     389    &extract_vocabulary($self);
    380390
    381391    # Use the suffix program to generate the phind/phrases file
     
    568578
    569579sub extract_vocabulary {
    570     my ($phind_dir, $language, $verbosity) = @_;
     580    my ($self) = @_;
     581   
     582    my $verbosity = $self->{'verbosity'};
     583    my $out = $self->{'out'};
     584
     585    my $language = "english"; # $self->{'language'};
     586
     587    my $collectiondir = $self->{'collectiondir'};
     588
     589    my $phinddir = $self->{'phinddir'};
    571590
    572591    my ($w, $l, $line, $word);
    573 
     592   
    574593    my ($first_delimiter, $last_delimiter,
    575594    $first_stopword, $last_stopword,
     
    577596    $first_contentword, $last_contentword,
    578597    $phrasedelimiter);
    579 
    580     my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
    581 
     598   
     599    my $thesaurus = $self->{'thesaurus'};
     600    my ($thesaurus_links, $thesaurus_terms,
     601    %thesaurus, $first_thesaurusword, $last_thesaurusword);
    582602
    583603    my %symbol;
    584604    my (%freq);
    585605
    586     print "Calculating vocabulary\n" if ($verbosity > 1);
     606    print $out "Calculating vocabulary\n" if ($verbosity > 1);
    587607
    588608    # Read and store the stopwords
    589609    my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
    590610    my %stopwords;
    591     foreach my $w (split(/\s+/, $words)) {
     611    foreach $w (split(/\s+/, $words)) {
    592612    $l = lc($w);
    593613    $stopwords{$l} = $w;
    594614    }
    595615   
    596     # Read and store the thesaurus terms
    597     $use_thesaurus = 0;
    598     my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
    599                        "$language", "agrovoc.lex");
    600     if (-e "$lex_file") {
    601     open(TH, "<$lex_file");
     616    # Read thesaurus information
     617    if ($thesaurus) {
     618
     619    # Ensure both link and term files exist
     620    $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
     621    die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
     622    $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
     623    die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
     624
     625    # Read the thesaurus terms
     626    open(TH, "<$thesaurus_terms");
    602627    while(<TH>) {
    603628        s/^\d+ //;
    604629        s/\(.*\)//;
    605         foreach my $w (split(/\s+/, $_)) {
     630        foreach $w (split(/\s+/, $_)) {
    606631        $thesaurus{lc($w)} = $w;
    607632        }
    608633    }
    609634    close TH;
    610     $use_thesaurus = 1;
    611635    }
    612636
    613637    # Read words in the text and count occurences
    614     open(TXT, "<$phind_dir/clauses");
     638    open(TXT, "<$phinddir/clauses");
    615639    my @words;
    616640   
     
    688712   
    689713    # Thesaurus terms
    690     if ($use_thesaurus) {
     714    if ($thesaurus) {
    691715    $first_thesaurusword = $nextsymbol;
    692716   
     
    721745
    722746    # Outut the words
    723     print "Saving vocabulary in $phind_dir/clauses.vocab\n" if ($verbosity > 1);
    724     open(VOC, ">$phind_dir/clauses.vocab");
     747    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
     748    open(VOC, ">$phinddir/clauses.vocab");
    725749
    726750    for (my $i = 1; $i < $nextsymbol; $i++) {
     
    733757
    734758
     759    # Create statistics file
    735760    # Output statistics about the vocablary
    736     print "Saving statistics in $phind_dir/clauses.stats\n" if ($verbosity > 1);
    737     &util::rm("$phind_dir/clauses.stats") if (-e "$phind_dir/clauses.stats");
    738     open(STAT, ">$phind_dir/clauses.stats")
    739     || die "Cannot open $phind_dir/clauses.stats: $!";
     761    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
     762    &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
     763
     764    open(STAT, ">$phinddir/clauses.stats")
     765    || die "Cannot open $phinddir/clauses.stats: $!";
    740766
    741767    print STAT "first_delimiter $first_delimiter\n";
     
    743769    print STAT "first_stopword $first_stopword\n";
    744770    print STAT "last_stopword $last_stopword\n";
    745     if ($use_thesaurus) {
     771    if ($thesaurus) {
    746772    print STAT "first_thesaurusword $first_thesaurusword\n";
    747773    print STAT "last_thesaurusword $last_thesaurusword\n";
     
    760786
    761787
     788    # Create numbers file
    762789    # Save text as symbol numbers
    763     print "Saving text as numbers in $phind_dir/clauses.numbers\n" if ($verbosity > 1);
    764    
    765     open(TXT, "<$phind_dir/clauses");
    766     open(NUM, ">$phind_dir/clauses.numbers");
     790    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
     791   
     792    open(TXT, "<$phinddir/clauses");
     793    open(NUM, ">$phinddir/clauses.numbers");
    767794   
    768795    $phrasedelimiter = $symbol{lc($senlimit)};
     
    788815   
    789816    print NUM "$symbol{lc($colend)}\n";
     817    close NUM;
     818
     819    # Save thesaurus  data in one convienient file
     820    if ($thesaurus) {
     821
     822    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
     823
     824
     825    print $out "Saving thesaurus as numbers in $thesaurusfile\n"
     826        if ($verbosity > 1);
     827
     828    # Read the thesaurus terms
     829    my ($num, $text, %thes_symbols);
     830   
     831    open(TH, "<$thesaurus_terms");
     832    while(<TH>) {
     833        chomp;
     834        @words = split(/\s+/, $_);
     835        $num = shift @words;
     836        $text = "";
     837
     838        # translate words into symbol numbers
     839        foreach $word (@words) {
     840        $word = lc($word);
     841        if ($symbol{$word}) {
     842            $text .= "s$symbol{$word} ";
     843        } elsif ($verbosity) {
     844            print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
     845        }
     846        }
     847        $text =~ s/ $//;
     848        $thes_symbols{$num} = $text;
     849    }
     850    close TH;
     851
     852    # Read the thesaurus links and write the corresponding data
     853    open(TH, "<$thesaurus_links");
     854    open(THOUT, ">$thesaurusfile");
     855
     856    while(<TH>) {
     857        chomp;
     858        ($num, $text) = split(/:/, $_);
     859
     860        if (defined($thes_symbols{$num})) {
     861        print THOUT "$num:$thes_symbols{$num}:$text\n";
     862        } else {
     863        print THOUT "$num:untranslated:$text\n";
     864        }
     865    }
     866    close TH;   
     867    close THOUT;
     868    }
     869
     870
     871
    790872
    791873}
    792874
    793875
    794 # Prepare the phrases file to be input to mgpp.
    795 # This means renumbering the phrases in order of decreasing frequency.
    796 
    797 
    798 # This is legacy code, and a little ugly, and may be unix-specific
    799 # (particularly the sort command).
     876# renumber_phrases
     877#
     878# Prepare the phrases file to be input to mgpp.  The biggest problem is
     879# reconciling the phrase identifiers used by the suffix program (which
     880# we'll call suffix-id numbers) with the numbers used in the thesaurus
     881# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
     882# Phind-id numbers must be sorted by frequency of occurance.
     883#
     884# Start creating a set of phind-id numbers from the sorted suffix-id
     885# numbers and (if required) the thesaurus-id numbers.  Then add any other
     886# phrases occuring in the thesaurus.
     887#
     888# The last thing we have to do is restore the vocabulary information to the
     889# phrase file so that the phrases are stored as words, not as symbol
     890# numbers.
     891
     892# The original phrases file looks something like this:
     893#   159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
     894#   159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
     895#   159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
     896#   159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
     897
    800898
    801899sub renumber_phrases {
    802     my $self = shift (@_);
    803 
     900    my ($self) = @_;
     901
     902    renumber_suffix_data($self);
     903    renumber_thesaurus_data($self);
     904    restore_vocabulary_data($self);
     905
     906}
     907
     908
     909
     910# renumber_suffix_data
     911#
     912# Translate phrases file to phrases.2 using phind keys instead
     913# of suffix keys and sorting the expansion data.
     914
     915sub renumber_suffix_data {
     916    my ($self) = @_;
     917   
    804918    my $verbosity = $self->{'verbosity'};
    805     my $phind_dir = $self->{'phinddir'};
    806 
    807     my $savephrases = 0;
    808     $savephrases = $self->{'savephrases'} if (defined($self->{'savephrases'}));
    809 
    810                          
    811 
    812     # Sort the phrases into order of increasing frequency
    813     # This means the expansions will be sorted correctly later on.
    814     print "Sorting phrases into freq order\n" if ($verbosity);
    815     system("sort -rnt ':' +2 -o $phind_dir/phrases $phind_dir/phrases");
    816 
    817     # Read the vocabulary
     919    my $out = $self->{'out'};
     920    print $out "Translate phrases: suffix-ids become phind-id's\n"
     921    if ($verbosity);
     922   
     923    my $phinddir = $self->{'phinddir'};
     924    my $infile = &util::filename_cat($phinddir, 'phrases');
     925    my $outfile = &util::filename_cat($phinddir, 'phrases.2');
     926
     927    # Read the phrase file.  Calculate initial set of phind-id
     928    # numbers and store (suffixid -> frequency) relation.
     929
     930    my %suffixtophind;
     931    my @phindfrequency;
     932    my (@fields, $suffixid);
     933    my $nextphind = 1;
     934   
     935    open(IN, "<$infile");
     936    while(<IN>) {
     937
     938    chomp;
     939    @fields = split(/:/, $_);
     940   
     941    # get next suffixid and phindid
     942    $suffixid = shift @fields;
     943    $suffixtophind{$suffixid} = $nextphind;
     944
     945    # store total frequency
     946    shift @fields;
     947    $totalfrequency[$nextphind] = shift @fields;
     948
     949    $nextphind++;
     950    }
     951    close IN;
     952
     953
     954    # Translate phrases file to phrases.2.  Use phind keys (not suffix
     955    # keys), sort expansion and document occurance data in order of
     956    # descending frequency..
     957    open(IN, "<$infile");
     958    open(OUT, ">$outfile");
     959   
     960    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
     961    my (@documwents, @newexp, $k, $n);
     962    my $linenumber = 0;
     963
     964    while(<IN>) {
     965   
     966    # read the line
     967    chomp;
     968    @fields = split(/:/, $_);
     969   
     970    # get a phrase number for this line
     971    $suffixid = shift @fields;
     972    die unless (defined($suffixtophind{$suffixid}));
     973    $phindid = $suffixtophind{$suffixid};
     974   
     975    # get the symbols in the phrase
     976    $text = shift @fields;
     977
     978    # output status information
     979    $linenumber++;
     980    if ($verbosity > 2) {
     981        if ($linenumber % 1000 == 0) {
     982        print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
     983        }
     984        print $out "$num: $key\t($text)\n" if ($verbosity > 3);
     985    }
     986
     987    # get the phrase frequency
     988    $tf = shift @fields;
     989   
     990    # get the number of expansions
     991    $countexp = shift @fields;
     992   
     993    # get the expansions, convert them into phind-id numbers, and sort them
     994    $expansions = shift @fields;
     995    @newexp = ();
     996    foreach $k (split(/,/, $expansions)) {
     997        die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
     998        $n = $suffixtophind{$k};
     999        push @newexp, $n;
     1000    }
     1001    @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
     1002
     1003    # get the number of documents
     1004    $countdocs = shift @fields;
     1005   
     1006    # get the documents and sort them
     1007    $documents = shift @fields;
     1008    $documents =~ s/d//g;
     1009    @documents = split(/;/, $documents);
     1010    @documents = sort by_doc_frequency @documents;
     1011
     1012    # output the phrase data
     1013    print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
     1014    print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
     1015   
     1016    }
     1017
     1018    close IN;
     1019    close OUT;
     1020}
     1021
     1022
     1023# renumber_thesaurus_data
     1024#
     1025# Translate phrases.2 to phrases.3, adding thesaurus data if available.
     1026
     1027sub renumber_thesaurus_data {
     1028    my ($self) = @_;
     1029 
     1030    my $out = $self->{'out'};
     1031    my $verbosity = $self->{'verbosity'};
     1032    my $thesaurus = $self->{'thesaurus'};
     1033
     1034    my $phinddir = $self->{'phinddir'};
     1035    my $infile = &util::filename_cat($phinddir, "phrases.2");
     1036    my $outfile = &util::filename_cat($phinddir, "phrases.3");
     1037
     1038
     1039    # If no thesaurus is defined, simply move the phrases file.
     1040    if (!$thesaurus) {
     1041    print $out "Translate phrases.2: no thesaurus data\n"
     1042        if ($verbosity);
     1043    &util::mv($infile, $outfile);
     1044    return;
     1045    }
     1046
     1047    print $out "Translate phrases.2: add thesaurus data\n"
     1048    if ($verbosity);
     1049
     1050    # 1.
     1051    # Read thesaurus file and store (symbols->thesaurusid) mapping
     1052    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
     1053    my %symbolstothesid;
     1054    my (@fields, $thesid, $symbols);
     1055   
     1056    open(TH, "<$thesaurusfile");
     1057
     1058    while (<TH>) {
     1059   
     1060    chomp;
     1061    @fields = split(/:/, $_);
     1062   
     1063    # get id and text
     1064    $thesid = shift @fields;
     1065    $symbols = shift @fields;
     1066    $symbolstothesid{$symbols} = $thesid;
     1067    }
     1068    close TH;   
     1069
     1070    # 2.
     1071    # Read phrases file and note all thesaurus entries that already
     1072    # have a phindid
     1073    my %thesaurustophindid;
     1074    my ($phindid);
     1075
     1076    open(IN, "<$infile");
     1077   
     1078    while(<IN>) {
     1079   
     1080    chomp;
     1081    @fields = split(/:/, $_);
     1082   
     1083    # phindid and symbols for this line
     1084    $phindid = shift @fields;
     1085    $symbols = shift @fields;
     1086
     1087    # do we have a thesaurus id corresponding to this phrase?
     1088    if (defined($symbolstothesid{$symbols})) {
     1089        $thesid = $symbolstothesid{$symbols};
     1090        $thesaurustophindid{$thesid} = $phindid;
     1091    }
     1092    }
     1093    close IN;
     1094 
     1095    undef %symbolstothesid;
     1096
     1097    # 3.
     1098    # Create phind-id numbers for remaining thesaurus entries
     1099    my $nextphindid = $phindid + 1;
     1100
     1101    open(TH, "<$thesaurusfile");
     1102    while(<TH>) {
     1103   
     1104    chomp;
     1105    @fields = split(/:/, $_);
     1106   
     1107    # read thesaurus-id and ensure it has a corresponding phind-id
     1108    $thesid = shift @fields;
     1109    if (!defined($thesaurustophindid{$thesid})) {
     1110        $thesaurustophindid{$thesid} = $nextphindid;
     1111        $nextphindid++;
     1112    }
     1113    }
     1114    close TH;
     1115
     1116    # 4.
     1117    # Translate thesaurus file, replacing thesaurus-id numbers with
     1118    # phind-id numbers.
     1119    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
     1120    my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
     1121
     1122    open(TH, "<$thesaurusfile");
     1123    open(TO, ">$newthesaurusfile");
     1124    while(<TH>) {
     1125   
     1126    chomp;
     1127    @fields = split(/:/, $_);
     1128   
     1129    # phindid and symbols for this line
     1130    ($thesid, $symbols, $relations) = @fields;
     1131   
     1132    die unless ($thesid && $symbols);
     1133    die unless $thesaurustophindid{$thesid};
     1134    $phindid = $thesaurustophindid{$thesid};
     1135
     1136    # convert each part of the relation string to use phind-id numbers
     1137    $newrelation = "";
     1138    $linkcounter = 0;
     1139    foreach $linktext (split(/;/, $relations)) {
     1140        @linkdata = split(/,/, $linktext);
     1141
     1142        # remember the linktype (e.g. BT, NT)
     1143        $linktype = shift @linkdata;
     1144        $newrelation .= "$linktype,";
     1145
     1146        # convert the link target identfiers
     1147        foreach $link (@linkdata) {
     1148        die unless (defined($thesaurustophindid{$link}));
     1149        $newrelation .= "$thesaurustophindid{$link},";
     1150        $linkcounter++;
     1151        }
     1152        $newrelation =~ s/\,$//;
     1153        $newrelation .= ";";
     1154    }
     1155    $newrelation .= ":";
     1156
     1157    print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
     1158    }
     1159    close TH;
     1160    close TO;
     1161
     1162    undef %thesaurustophindid;
     1163
     1164    # 5.
     1165    # Read thesaurus data (in phind-id format) into memory
     1166    my %thesaurusdata;
     1167
     1168    open(TH, "<$newthesaurusfile");
     1169    while(<TH>) {
     1170    chomp;
     1171    ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
     1172    die unless ($phindid && $symbols);
     1173    $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
     1174    }
     1175
     1176    # 6.
     1177    # Add thesaurus data to phrases file
     1178    my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
     1179    my (@documwents, @newexp, $k, $n);
     1180    my $linenumber = 0;
     1181
     1182    open(IN, "<$infile");
     1183    open(OUT, ">$outfile");
     1184
     1185    # Update existing phrases
     1186    while(<IN>) {
     1187   
     1188    chomp;
     1189    @fields = split(/:/, $_);
     1190       
     1191    # get data for this line
     1192    $phindid = shift @fields;
     1193   
     1194    # output the phrase data, with thesaurus information
     1195    print OUT "$phindid:", join(":", @fields);
     1196
     1197    # add thesaurus data
     1198    if (defined($thesaurusdata{$phindid})) {
     1199        @fields = split(/:/, $thesaurusdata{$phindid});
     1200        shift @fields;
     1201        $linkcounter = shift @fields;
     1202        $relations = shift @fields;
     1203
     1204        print OUT ":$linkcounter:$relations";
     1205        $thesaurusdata{$phindid} = "";
     1206    }
     1207    print OUT "\n";
     1208    }
     1209    close IN;
     1210
     1211    # Add phrases that aren't already in the file
     1212    foreach $phindid (sort numerically keys %thesaurusdata) {
     1213    next unless ($thesaurusdata{$phindid});
     1214
     1215    @fields = split(/:/, $thesaurusdata{$phindid});
     1216    $symbols = shift @fields;
     1217    $linkcounter = shift @fields;
     1218    $relations = shift @fields;
     1219
     1220    print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
     1221    }
     1222    close OUT;
     1223
     1224}
     1225
     1226# restore_vocabulary_data
     1227#
     1228# Read phrases.3 and restore vocabulary information. Then write
     1229# this data to the MGPP input files (pwrod.txt and pdata.txt) and
     1230# (if requested) to the saved phrases file.
     1231
     1232sub restore_vocabulary_data {
     1233    my ($self) = @_;
     1234 
     1235    my $out = $self->{'out'};
     1236    my $verbosity = $self->{'verbosity'};
     1237    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
     1238
     1239    my $phinddir = $self->{'phinddir'};
     1240    my $infile = &util::filename_cat($phinddir, 'phrases.3');
     1241    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
     1242    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
     1243    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
     1244
     1245    my $savephrases = $self->{'savephrases'};
     1246
     1247    # 1.
     1248    # Read the vocabulary file
     1249    open(V, "<$vocabfile")
     1250    || die "Cannot open $vocabfile: $!";
    8181251    my @symbol;
    819     print "Reading the vocabulary\n" if ($verbosity);
    820     open(V, "<$phind_dir/clauses.vocab")
    821     || die "Cannot open $phind_dir/clauses.vocab: $!";
    822    
    8231252    my $i = 1;
    8241253    while(<V>) {
     
    8271256    }
    8281257
    829     # Create file for phrase data
    830     #
    831     # The phrases file looks something like this
    832     #  159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
    833     #  159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
    834     #  159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
    835     #  159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
    836 
    837     # The first field on each line is a unique phrase identifier. 
    838     # We need to calculate phrase numbers for each phrase
    839     print "Calculate phrase numbers\n" if ($verbosity);
    840    
    841     my %phrasenumber;
    842     my $nextphrase = 1;
    843     my ($line);
    844    
    845     open(IN, "<$phind_dir/phrases");
    846     while(<IN>) {
    847    
    848     # read the line
    849     chomp;
    850     $line = $_;
    851    
    852     # we're only interested in the first field
    853     $line =~ s/:.*//;
    854    
    855     # get a phrase number for this line
    856     $phrasenumber{$line} = $nextphrase;
    857     $nextphrase++;
    858     }
    859    
    860    
    861     # Now we create a new phrase file using phrase numbers, not the old IDs.
    862     print "Format phrase data for MGPP\n" if ($verbosity);
    863    
    864     # Open the basic files
    865     open(IN, "<$phind_dir/phrases");
    866     open(DATA, ">$phind_dir/pdata.txt");
    867     open(IDX, ">$phind_dir/pword.txt");
    868    
    869     # We may want to save the phrases in a separate text file
     1258
     1259    # 2.
     1260    # Translate phrases.3 to MGPP input files
     1261    my ($key, $text, $word);
     1262    my @fields;
     1263    my $linenumber = 0;
     1264
     1265    open(IN, "<$infile");
     1266    open(DATA, ">$datafile");
     1267    open(WORD, ">$wordfile");
     1268
     1269    # Save the phrases in a separate text file
    8701270    if ($savephrases) {
    871     print "Saving phrases in $savephrases\n" if ($verbosity);
     1271    print $out "Saving phrases in $savephrases\n" if ($verbosity);
    8721272    open(SAVE, ">$savephrases");
    8731273    }
    874 
    875     my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    876     my @fields;
    877     my @documents;
    878     my (@newexp, $k, $n);
    879    
    880     my $linenumber = 0;
    8811274
    8821275    while(<IN>) {
     
    8891282    # get a phrase number for this line
    8901283    $key = shift @fields;
    891     die unless (defined($phrasenumber{$key}));
    892     $num = $phrasenumber{$key};
    893    
    894     # get the text of the phrase
     1284   
     1285    # restore the text of the phrase
    8951286    $text = shift @fields;
    8961287    $text =~ s/s(\d+)/$symbol[$1]/g;
    8971288    if ($text =~ / /) {
    8981289        $word = "";
    899     } else {
     1290    } elsif ($text ne 'untranslated') {
    9001291        $word = $text;
    9011292    }
    902 
    903     $linenumber++;
    904     if ($linenumber % 1000 == 0) {
    905         print "line $linenumber:\t$num\t$key\t($text)\n"  if ($verbosity > 2);
    906     }
    907     print "$num: $key\t($text)\n" if ($verbosity > 3);
    908    
    909     # get the phrase frequency
    910     $tf = shift @fields;
    911    
    912     # get the number of expansions
    913     $countexp = shift @fields;
    914    
    915     # get the expansions and convert them into phrase numbers
    916     $expansions = shift @fields;
    917     @newexp = ();
    918     foreach $k (split(/,/, $expansions)) {
    919         die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
    920         $n = $phrasenumber{$k};
    921         push @newexp, $n;
    922     }
    923     @newexp = sort numerically @newexp;
    924 
    925     # get the number of documents
    926     $countdocs = shift @fields;
    927    
    928     # get the documents
    929     $documents = shift @fields;
    930     $documents =~ s/d//g;
    931     @documents = split(/;/, $documents);
    932     @documents = sort by_frequency @documents;
    9331293
    9341294    # output the phrase data
    9351295    print DATA "<Document>";
    936     print DATA "$num:$text:$tf:$countexp:$countdocs:";
    937     print DATA join(",", @newexp), ":", join(";", @documents), "\n";
     1296    print DATA "$key:$text:", join(":", @fields), ":\n";
    9381297   
    9391298    # output the word index search data
    940     print IDX "<Document>$word\n";
     1299    print WORD "<Document>$word\n";
    9411300
    9421301    # output the phrases to a text file
     
    9441303        print SAVE "$tf\t$countdocs\t$text\n";
    9451304    }
    946 
    947     }
    948 
     1305    }
     1306    close IN;
     1307    close WORD;
     1308    close DATA;
    9491309    close SAVE if ($savephrases);
     1310
    9501311}
    9511312
     1313
     1314
    9521315# sort routines used to renumber phrases
    9531316
    9541317sub numerically { $a <=> $b }
    9551318
    956 sub by_frequency {
     1319sub by_doc_frequency {
    9571320    my $fa = 1;
    9581321    if ($a =~ /,/) {
     
    9691332}
    9701333
    971 
    97213341;
Note: See TracChangeset for help on using the changeset viewer.