root/main/trunk/greenstone2/perllib/classify/Phind.pm @ 23116

Revision 23116, 40.9 KB (checked in by kjdon, 9 years ago)

For incremental builds, classifiers are not really done incrementally. Previously, we reconstructed all the docs from the database and classified them, then processed any new/edited/deleted docs, updating the classifier as necessary. Now, we process all new/updated docs first, then reconstruct the docs from the database, but only classify those that have not been changed or deleted. This means we are only ever adding docs to a classifier, never updating or deleting them. I have removed edit_mode and all the code that handled deleting entries from the classifier.

  • Property svn:keywords set to Author Date Id Revision
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use util;
36use ghtml;
37use unicode;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and vice versa
41
42my @removedirs = ();
43
44my %wanted_index_files = ('td'=>1,
45              't'=>1,
46              'ti'=>1,
47              'tl'=>1,
48              'tsd'=>1,
49              'idb'=>1,
50              'ib1'=>1,
51              'ib2'=>1,
52              'ib3'=>1,
53              'i'=>1,
54              'il'=>1,
55              'w'=>1,
56              'wa'=>1);
57
58sub BEGIN {
59    @Phind::ISA = ('BaseClassifier');
60}
61
62sub END {
63   
64    # Tidy up stray files - we do this here as there's some weird problem
65    # preventing us from doing it in the get_classify_info() function (on
66    # windows at least) where the close() appears to fail on txthandle and
67    # dochandle, thus preventing us from deleting those files
68
69    foreach my $dir (@removedirs) {
70    if (-d $dir && opendir (DIR, $dir)) {
71        my @files = readdir DIR;
72        closedir DIR;
73   
74        foreach my $file (@files) {
75        next if $file =~ /^\.\.?$/;
76        my ($suffix) = $file =~ /\.([^\.]+)$/;
77        if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
78            # delete it!
79            &util::rm (&util::filename_cat ($dir, $file));
80        }
81        }
82    }
83    }
84}
85
86my $arguments =
87    [ { 'name' => "text",
88    'desc' => "{Phind.text}",
89    'type' => "string",
90    'deft' => "section:Title,section:text",
91    'reqd' => "no" },
92      { 'name' => "title",
93    'desc' => "{Phind.title}",
94    'type' => "metadata",
95    'deft' => "Title",
96    'reqd' => "no" },
97      { 'name' => "buttonname",
98    'desc' => "{BasClas.buttonname}",
99    'type' => "string",
100    'deft' => "Phrase",
101    'reqd' => "no" },
102      { 'name' => "language",
103    'desc' => "{Phind.language}",
104    'type' => "string",
105    'deft' => "en",
106    'reqd' => "no" },
107      { 'name' => "savephrases",
108    'desc' => "{Phind.savephrases}",
109    'type' => "string",
110    'deft' => "",
111    'reqd' => "no" },
112      { 'name' => "suffixmode",
113    'desc' => "{Phind.suffixmode}",
114    'type' => "int",
115    'deft' => "1",
116    'range' => "0,1",
117    'reqd' => "no" },
118      { 'name' => "min_occurs",
119    'desc' => "{Phind.min_occurs}",
120    'type' => "int",
121    'deft' => "2",
122    'range' => "1,",
123    'reqd' => "no" },
124      { 'name' => "thesaurus",
125    'desc' => "{Phind.thesaurus}",
126    'type' => "string",
127    'deft' => "",
128    'reqd' => "no" },
129      { 'name' => "untidy",
130    'desc' => "{Phind.untidy}",
131    'type' => "flag",
132    'reqd' => "no" } ];
133
134my $options = { 'name'     => "Phind",
135        'desc'     => "{Phind.desc}",
136        'abstract' => "no",
137        'inherits' => "yes",
138        'args'     => $arguments };
139
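# A typical collect.cfg entry using these options might look like the
# following (an assumed, illustrative example, not taken from this file):
#   classify Phind -text section:Title,section:text -title Title -buttonname Phrase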
140
141# Phrase delimiter symbols - these should be abstracted out someplace
142
143my $colstart = "COLLECTIONSTART";
144my $colend   = "COLLECTIONEND";
145my $doclimit = "DOCUMENTLIMIT";
146my $senlimit = "SENTENCELIMIT";
147my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
148
149
150# Create a new Phind browser based on collect.cfg
151
152sub new {
153    my ($class) = shift (@_);
154    my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
155    push(@$classifierslist, $class);
156
157    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
158    push(@{$hashArgOptLists->{"OptList"}},$options);
159
160    my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
161
162    if ($self->{'info_only'}) {
163    # don't worry about any options etc
164    return bless $self, $class;
165    }
166
167    # Ensure the Phind generate scripts are in place
168    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
169    $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
170    my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
171    if (!(-e $file1)) {
172    print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
173    exit(1);
174    }
175   
176    # these option values may contain metadata names prefixed with "ex." that needs to be stripped off
177    $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
178    $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
179
180    # Transfer the value from the automatically parsed option to the variable name used in previous versions of Greenstone.
181   
182    $self->{"indexes"} = $self->{"text"};
183
184    # Further setup
185    $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
186    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
187    if (! defined $self->{'builddir'}) {
188    $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
189    }
190    $self->{'total'} = 0;
191   
192    # we set phind to be rtl if language is arabic
193    if ($self->{'language'} eq "ar") {
194    $self->{'textorientation'} = "rtl";
195    }
196    # Clean out the unused keys
197    delete $self->{"text"};
198
199    return bless $self, $class;
200}
201
202
203# Initialise the Phind classifier
204
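# Each classifier instance gets its own numbered working directory under the
# build directory, e.g. <builddir>/phind1/ ("phind1" is just an illustration),
# holding the "clauses" and "docs.txt" files opened below.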
205sub init {
206    my $self = shift (@_);
207
208    # ensure we have a build directory
209    my $builddir = $self->{'builddir'};
210    die unless (-e "$builddir");
211
212    # create Phind directory
213    my $phnumber = 1;
214    my $phinddir = &util::filename_cat($builddir, "phind1");
215    while (-e "$phinddir") {
216    $phnumber++;
217    $phinddir = &util::filename_cat($builddir, "phind$phnumber");
218    }
219    &util::mk_dir("$phinddir");
220    $self->{'phinddir'} = $phinddir;
221    $self->{'phindnumber'} = $phnumber;
222
223    push(@removedirs, $phinddir) unless $self->{'untidy'};
224
225    # open filehandles for documents and text
226    my $clausefile =  &util::filename_cat("$phinddir", "clauses");
227    &util::rm($clausefile) if (-e $clausefile);
228
229    my $txthandle = 'TEXT' . $phnumber;
230    open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
231    $self->{'txthandle'} = $txthandle;
232
233    my $docfile = &util::filename_cat("$phinddir", "docs.txt");
234    &util::rm($docfile) if (-e $docfile);
235
236    my $dochandle = 'DOC' . $phnumber;
237    open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
238    $self->{'dochandle'} = $dochandle;
239   
240}
241
242
243# Classify each document.
244#
245# Each document is passed here in turn.  The classifier extracts the
246# text of each and stores it in the clauses file.  Document details are
247# stored in the docs.txt file.
248
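# For each document classified, docs.txt gains one tab-separated line, e.g.
# (OID and title illustrative only): "<Document>\tHASH0123abcd\tSome Document Title",
# and the clauses file gains a DOCUMENTLIMIT marker followed by the
# document's clause text.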
249sub classify {
250    my $self = shift (@_);
251    my ($doc_obj) = @_;
252
253    my $verbosity = $self->{'verbosity'};
254    my $top_section = $doc_obj->get_top_section();
255
256    my $titlefield = $self->{'title'};
257   
258    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
259    if (!defined($title)) {
260    $title = "";
261    print STDERR "Phind: document has no title\n";
262    }
263    print "process: $title\n" if ($verbosity > 2);
264
265    # Only consider the file if it is in the correct language
266    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
267    my $phrlanguage = $self->{'language'};
268    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
269
270    # record this file
271    $self->{'total'} ++;
272    # what is $file ???
273    # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
274
275    # Store document details
276    my $OID = $doc_obj->get_OID();
277    $OID = "NULL" unless defined $OID;
278    my $dochandle = $self->{'dochandle'};
279    print $dochandle "<Document>\t$OID\t$title\n";
280   
281    # Store the text occurring in this object
282
283    # output the document delimiter
284    my $txthandle = $self->{'txthandle'};
285    print $txthandle "$doclimit\n";
286
287    # iterate over the required indexes and store their text
288    my $indexes = $self->{'indexes'};
289    my $text = "";
290    my ($part, $level, $field, $section, $data, $dataref);
291
292    foreach $part (split(/,/, $indexes)) {
293
294    # Each field has a level and a data element (e.g. document:Title)
295    ($level, $field) = split(/:/, $part);
296    die unless ($level && $field);
297   
298    # Extract the text from every section
299    # (In phind, document:text and section:text are equivalent)
300    if ($field eq "text") {
301        $data = "";
302        $section = $doc_obj->get_top_section();
303        while (defined($section)) {
304        $data .= $doc_obj->get_text($section) . "\n";
305        $section = $doc_obj->get_next_section($section);
306        }
307        $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
308    }
309   
310    # Extract a metadata field from a document
311    # (If there is more than one element of the given type, get them all.)
312    elsif ($level eq "document") {
313        $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
314        foreach $data (@$dataref) {
315        $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
316        }
317    }
318
319    # Extract metadata from every section in a document
320    elsif ($level eq "section") {
321        $data = "";
322        $section = $doc_obj->get_top_section();
323        while (defined($section)) {
324        $dataref = $doc_obj->get_metadata($section, $field);
325        $data .= join("\n", @$dataref) . "\n";
326        $section = $doc_obj->get_next_section($section);
327        }
328        $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
329    }
330   
331    # Any other level specification is unrecognised
332    else {
333        die "Unknown level ($level) in Phind index ($part)\n";
334    }
335   
336    }
337
338    # output the text
339    $text =~ tr/\n//s;
340    print $txthandle "$text";
341}
342
343
344# Construct the classifier from the information already gathered
345#
346# When get_classify_info is called, the clauses and docs.txt files have
347# already been constructed in the Phind directory.  This function will
348# translate them into compressed, indexed MGPP files that can be read by
349# the phindcgi script.  It will also register our classifier so that it
350# shows up in the navigation bar.
351
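# The steps below leave (among others) these files in the phind directory:
# clauses.vocab, clauses.stats and clauses.numbers (from extract_vocabulary),
# phrases (from the suffix program), and the mgpp-built pdata.*, pword.* and
# docs.* index files.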
352sub get_classify_info {
353    my $self = shift (@_);
354    my ($gli) = @_;   
355
356    close $self->{'dochandle'};
357    close $self->{'txthandle'};
358    my $verbosity = $self->{'verbosity'};
359    my $out = $self->{'outhandle'};
360    my $phinddir = $self->{'phinddir'};
361
362    my $osextra = "";
363    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
364    $osextra = " -d /";
365    }
366
367    print STDERR "</Stage>\n" if $gli;
368
369    if ($verbosity) {
370    print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
371    print $out "***          in ", $self->{'phinddir'}, "\n";
372    }
373
374    print STDERR "<Stage name='Phind'>\n" if $gli;
375
376    # Construct phind indexes
377    my $suffixmode = $self->{'suffixmode'};
378    my $min_occurs = $self->{'min_occurs'};
379    my ($command, $status);
380   
381    # Generate the vocabulary, symbol statistics, and numbers file
382    # from the clauses file
383    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
384    print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
385    &extract_vocabulary($self);
386 
387    # Use the suffix program to generate the phind/phrases file
388    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
389    print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
390    &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
391
392    # check that we generated some files. It's not necessarily an error if
393    # we didn't (execute() would have quit on error), but we can't go on.
394    my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
395    if (! -r $phrasesfile) {
396    print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
397    print $out "\nNo phrases found for Phind classifier!\n";
398    return;
399    }   
400
401    # Create the phrase file and put phrase numbers in phind/phrases
402    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
403    print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
404    &renumber_phrases($self);
405
406    print $out "\nCreating phrase databases\n";
407    print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
408    my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
409    my $mg_stem = &util::filename_cat($phinddir, "pdata");
410
411    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
412    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
413    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
414
415    # create the mg index of words
416    print $out "\nCreating word-level search indexes\n";
417    print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
418    $mg_input = &util::filename_cat($phinddir, "pword.txt");
419    $mg_stem = &util::filename_cat($phinddir, "pword");
420
421    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
422    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
423    &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
424    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
425    &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
426    &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
427
428    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
429    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
430    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
431
432    # create the mg document information database
433    print $out "\nCreating document information databases\n";
434    print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
435    $mg_input = &util::filename_cat($phinddir, "docs.txt");
436    $mg_stem = &util::filename_cat($phinddir, "docs");
437
438    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
439    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
440    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
441
442    my $parameters = "phindnumber=$self->{'phindnumber'}";
443    if (defined ($self->{'textorientation'})) {
444    $parameters .= ";textorientation=$self->{'textorientation'}";
445    }
446    # Return the information about the classifier that we'll later want to
447    # use to create macros when the Phind classifier document is displayed.
448    my %classifyinfo = ('thistype'=>'Invisible',
449                        'childtype'=>'Phind',
450                        'Title'=>$self->{'buttonname'},
451                        'parameters'=>$parameters,
452            'contains'=>[]);
453   
454    my $collection = $self->{'collection'};
455    my $url = "library?a=p&p=phind&c=$collection";
456    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
457   
458    return \%classifyinfo;
459}
460
461
462
463sub convert_gml_to_tokens {
464   
465    my ($language_exp, $text) = @_;
466
467    # escape any magic words... - jrm21
468    foreach my $delim (@delimiters) {
469    my $replacement=lc($delim);
470    my $num= $text=~ s/$delim/$replacement/g;
471    if (!$num) {$num=0;}
472    }
473
474    if ($language_exp =~ /^en$/) {
475    return &convert_gml_to_tokens_EN($text);
476    }
477
478    if ($language_exp =~ /zh/) {
479    return &convert_gml_to_tokens_ZH($text);
480    } 
481   
482    $_ = $text;
483
484    # 1. remove GML tags
485
486    # Remove everything that is in a tag
487    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
488    s/\s*<br>\s*/ LINEBREAK /isgo;
489    s/<[^>]*>/ /sgo;
490
491    # Now we have the text, but it may contain HTML
492    # elements coded as &gt; etc.  Remove these tags.
493    s/&amp;/&/sgo;
494    s/&lt;/</sgo;
495    s/&gt;/>/sgo;
496    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
497    s/\s*<br>\s*/ LINEBREAK /isgo;
498    s/<[^>]*>/ /sgo;
499
500    # replace <p> and <br> placeholders with the clause break symbol (\n)
501    s/\s+/ /gso;
502    s/PARAGRAPHBREAK/\n/sgo;
503    s/LINEBREAK/\n/sgo;
504
505   
506    # 2. Split the remaining text into space-delimited tokens
507
508    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
509    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
510
511    # Split text at word boundaries
512    s/\b/ /go;
513   
514    # 3. Convert the remaining text to "clause format"
515
516    # Insert newline if the end of a sentence is detected
517    # (delimiter is:  "[\.\?\!]\s")
518    # s/\s*[\.\?\!]\s+/\n/go;
519
520    # remove unnecessary punctuation and replace with clause break symbol (\n)
521    # the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
522    #s/[^\w ]/\n/go;
523    # replace punct with new lines - is this what we want??
524    s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
525    # then remove other punct with space
526    s/[\'\`\\\_]/ /go;
527
528    # remove extraneous whitespace
529    s/ +/ /sgo;
530    s/^\s+//mgo;
531    s/\s*$/\n/mgo;
532
533    # remove lines that contain one word or less
534    s/^\S*$//mgo;
535    s/^\s*$//mgo;
536    tr/\n//s;
537
538    return $_;
539}
540
541# A Chinese version of convert_gml_to_tokens
542sub convert_gml_to_tokens_ZH {
543
544    $_ = shift @_;
545
546    # Replace all whitespace with a simple space
547    s/\s+/ /gs;
548    # Remove everything that is in a tag
549    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
550    s/\s*<br>\s*/ LINEBREAK /isg;
551    s/<[^>]*>/ /sg;
552
553    # Now we have the text, but it may contain HTML
554    # elements coded as &gt; etc.  Remove these tags.
555    s/&lt;/</sg;
556    s/&gt;/>/sg;
557
558    s/\s+/ /sg;
559    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
560    s/\s*<br>\s*/ LINEBREAK /isg;
561    s/<[^>]*>/ /sg;
562
563    # decode &amp; and other remaining character entities
564    s/&amp;/&/sg;
565    s/&lt;/</sg;
566    s/&gt;/>/sg;
567    s/&amp;/&/sg;
568
569    # replace <p> and <br> placeholders with newlines
570    s/PARAGRAPHBREAK/\n/sg;
571    s/LINEBREAK/\n/sg;
572
573   
574#    print STDERR "text:$_\n";
575    return $_;
576}
577
578# A version of convert_gml_to_tokens that is fine-tuned to the English language.
579
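# Illustrative (untested) example of the clause conversion below: an input
# clause such as
#   "The cat sat on the mat, and the dog barked."
# should come out as two clause lines:
#   The cat sat on the mat
#   and the dog barked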
580sub convert_gml_to_tokens_EN {
581    $_ = shift @_;
582
583    # FIRST, remove GML tags
584
585    # Replace all whitespace with a simple space
586    s/\s+/ /gs;
587
588    # Remove everything that is in a tag
589    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
590    s/\s*<br>\s*/ LINEBREAK /isg;
591    s/<[^>]*>/ /sg;
592
593    # Now we have the text, but it may contain HTML
594    # elements coded as &gt; etc.  Remove these tags.
595    s/&lt;/</sg;
596    s/&gt;/>/sg;
597
598    s/\s+/ /sg;
599    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
600    s/\s*<br>\s*/ LINEBREAK /isg;
601    s/<[^>]*>/ /sg;
602
603    # decode &amp; and other remaining character entities
604    s/&amp;/&/sg;
605    s/&lt;/</sg;
606    s/&gt;/>/sg;
607    s/&amp;/&/sg;
608
609    # replace <p> and <br> placeholders with newlines
610    s/PARAGRAPHBREAK/\n/sg;
611    s/LINEBREAK/\n/sg;
612
613
614    # Exceptional punctuation
615    #
616    # We make special cases of some punctuation
617
618    # remove any apostrophe that indicates omitted letters
619    s/(\w+)\'(\w*\s)/ $1$2 /g;
620
622    # remove a period that appears in a person's initials
622    s/\s([A-Z])\./ $1 /g;
623
624    # replace hyphens in hyphenated words and names with a space
625    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
626
627    # Convert the remaining text to "clause format".
628    # This means removing all excess punctuation and garbage text,
629    # normalising valid punctuation to full stops and commas,
630    # then putting one clause on each line.
631
632    # Insert newline when the end of a sentence is detected
633    # (delimiter is:  "[\.\?\!]\s")
634    s/\s*[\.\?\!]\s+/\n/g;
635
636    # split numbers after four digits
637    s/(\d\d\d\d)/$1 /g;
638
639    # split words after 32 characters
640
641    # squash repeated punctuation
642    tr/A-Za-z0-9 //cs;
643
644    # save email addresses
645    # s/\w+@\w+\.[\w\.]+/EMAIL/g;
646
647    # normalise clause breaks (mostly punctuation symbols) to commas
648    s/[^A-Za-z0-9 \n]+/ , /g;
649
650    # Remove repeated commas, and replace with newline
651    s/\s*,[, ]+/\n/g;
652
653    # remove extra whitespace
654    s/ +/ /sg;
655    s/^\s+//mg;
656    s/\s*$/\n/mg;
657
658    # remove lines that contain one word or less
659    s/^\w*$//mg;
660    s/^\s*$//mg;
661    tr/\n//s;
662
663    return $_;
664
665}
666
667
668
669# Execute a system command
670
671sub execute {
672    my ($command, $verbosity, $outhandle) = @_;
673    print $outhandle "Executing: $command\n"  if ($verbosity > 2);
674    $! = 0;
675    my $status = system($command);
676    if ($status != 0) {
677    print STDERR "Phind - Error executing '$command': $!\n";
678    exit($status);  # this causes the build to fail...
679    }
680}
681
682
683# Generate the vocabulary, symbol statistics, and numbers file from the
684# clauses file.  This is legacy code, so it is a bit messy and probably won't
685# run under Windows.
686
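# Files written here (formats inferred from the code below):
#   clauses.vocab   - one token per line; the line number is the token's symbol number
#   clauses.numbers - the clauses file re-expressed as one symbol number per line
#   clauses.stats   - "name value" lines giving the symbol-number ranges for
#                     delimiters, stopwords, thesaurus terms and content words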
687sub extract_vocabulary {
688    my ($self) = @_;
689   
690    my $verbosity = $self->{'verbosity'};
691    my $out = $self->{'outhandle'};
692
693    my $collectiondir = $self->{'collectiondir'};
694    my $phinddir = $self->{'phinddir'};
695
696    my $language_exp = $self->{'language'};
697
698    my ($w, $l, $line, $word);
699   
700    my ($first_delimiter, $last_delimiter,
701    $first_stopword, $last_stopword,
702    $first_extractword, $last_extractword,
703    $first_contentword, $last_contentword,
704    $phrasedelimiter);
705   
706    my $thesaurus = $self->{'thesaurus'};
707    my ($thesaurus_links, $thesaurus_terms,
708    %thesaurus, $first_thesaurusword, $last_thesaurusword);
709
710    my %symbol;
711    my (%freq);
712
713    print $out "Calculating vocabulary\n" if ($verbosity > 1);
714
715    # Read and store the stopwords
716    my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
717    my $stopword_files = ();
718    my ($language, $language_dir, $file, $file_name);
719    my %stopwords;
720
721    # Examine each directory in the stopword directory
722    opendir(STOPDIR, $stopdir);
723    foreach $language (readdir STOPDIR) {
724
725    # Ignore entries that do not match the classifier's language
726    next unless ($language =~ /$language_exp/);
727    $language_dir = &util::filename_cat($stopdir, $language);
728    next unless (-d "$language_dir");
729
730    opendir(LANGDIR, $language_dir);
731    foreach $file (readdir LANGDIR) {
732
733        # Ignore entries that are not stopword files
734        next unless ($file =~ /sw$/);
735        $file_name = &util::filename_cat($language_dir, $file);
736        next unless (-f "$file_name");
737
738        # Read the stopwords
739        open(STOPFILE, "<$file_name");
740        while (<STOPFILE>) {
741        s/^\s+//;
742        s/\s.*//;
743        $word = $_;
744        $l = lc($word);
745        $stopwords{$l} = $word;
746        }
747        close STOPFILE;
748
749    }
750    closedir LANGDIR;
751    }
752    closedir STOPDIR;
753
754    # Read thesaurus information
755    if ($thesaurus) {
756
757    # ensure the thesaurus link file exists
758    $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
759    die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
760
761    # ensure term file exists in the correct language
762    if ($language_exp =~ /^([a-z][a-z])/) {
763        $language = $1;
764    } else {
765        $language = 'en';
766    }
767    $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
768    die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
769   
770
771    # Read the thesaurus terms
772    open(TH, "<$thesaurus_terms");
773    while(<TH>) {
774        s/^\d+ //;
775        s/\(.*\)//;
776        foreach $w (split(/\s+/, $_)) {
777        $thesaurus{lc($w)} = $w;
778        }
779    }
780    close TH;
781    }
782
783    # Read words in the text and count occurrences
784    open(TXT, "<$phinddir/clauses");
785
786    my @words;
787    while(<TXT>) {
788    $line = $_;
789    next unless ($line =~ /./);
790   
791    @words = split(/\s+/, $line);
792    foreach $w (@words) {
793        $l = lc($w);
794        $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
795        $freq{$w}++;
796    }
797    $freq{$senlimit}++;
798    }
799   
800    close TXT;
801
802    # Calculate the "best" form of each word
803    my (%bestform, %totalfreq, %bestfreq);
804
805    foreach $w (sort (keys %freq)) {
806    $l = lc($w);
807   
808    # totalfreq is the number of times a term appears in any form
809    $totalfreq{$l} += $freq{$w};
810   
811    if (defined $stopwords{$l}) {
812        $bestform{$l} = $stopwords{$l};
813       
814    } elsif (defined $thesaurus{$l}) {
815        $bestform{$l} = $thesaurus{$l};
816       
817    } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
818        $bestfreq{$l} = $freq{$w};
819        $bestform{$l} = $w;
820    }
821    }
822    undef %freq;
823    undef %bestfreq;
824   
825
826    # Assign symbol numbers to tokens
827    my $nextsymbol = 1;
828    my (@vocab);
829   
830    # Delimiters
831    $first_delimiter = 1;
832   
833    foreach $word (@delimiters) {
834
835#   $word = lc($word); # jrm21
836    $word = uc($word);
837    $bestform{$word} = $word;
838    $vocab[$nextsymbol] = $word;
839    $symbol{$word} = $nextsymbol;
840    $nextsymbol++;
841    }
842    $last_delimiter = $nextsymbol - 1;
843    # Stopwords
844    $first_stopword = $nextsymbol;
845   
846    foreach my $word (sort keys %stopwords) {
847    # don't include stopword unless it occurs in the text
848    $word = lc($word);
849    next unless ($totalfreq{$word});
850    next if ($symbol{$word});
851   
852    $vocab[$nextsymbol] = $word;
853    $symbol{$word} = $nextsymbol;
854    $nextsymbol++;
855    }
856    $last_stopword = $nextsymbol - 1;
857    $first_contentword = $nextsymbol;
858   
859    # Thesaurus terms
860    if ($thesaurus) {
861    $first_thesaurusword = $nextsymbol;
862   
863    foreach my $word (sort keys %thesaurus) {
864       
865        $word = lc($word);
866        next if ($symbol{$word});
867        $bestform{$word} = $thesaurus{$word};
868       
869        $vocab[$nextsymbol] = $word;
870        $symbol{$word} = $nextsymbol;
871        $nextsymbol++;
872       
873    }
874    $last_thesaurusword = $nextsymbol - 1;
875    }
876
877    # Other content words
878    $first_extractword = $nextsymbol;
879   
880    foreach my $word (sort (keys %bestform)) {
881   
882    next if ($symbol{$word});
883   
884    $vocab[$nextsymbol] = $word;
885    $symbol{$word} = $nextsymbol;
886    $nextsymbol++;
887    }
888    $last_extractword = $nextsymbol - 1;
889    $last_contentword = $nextsymbol - 1;
890   
891    # Output the words
892    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
893    open(VOC, ">$phinddir/clauses.vocab");
894
895    for (my $i = 1; $i < $nextsymbol; $i++) {
896    $w = $vocab[$i];
897
898    print VOC "$bestform{$w}\n";
899    $totalfreq{$w} = 0 unless ($totalfreq{$w});
900    }
901    close VOC;
902
903
904    # Create statistics file
905    # Output statistics about the vocabulary
906    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
907    &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
908
909    open(STAT, ">$phinddir/clauses.stats")
910    || die "Cannot open $phinddir/clauses.stats: $!";
911
912    print STAT "first_delimiter $first_delimiter\n";
913    print STAT "last_delimiter $last_delimiter\n";
914    print STAT "first_stopword $first_stopword\n";
915    print STAT "last_stopword $last_stopword\n";
916    if ($thesaurus) {
917    print STAT "first_thesaurusword $first_thesaurusword\n";
918    print STAT "last_thesaurusword $last_thesaurusword\n";
919    }
920    print STAT "first_extractword $first_extractword\n";
921    print STAT "last_extractword $last_extractword\n";
922    print STAT "first_contentword $first_contentword\n";
923    print STAT "last_contentword $last_contentword\n";
924    print STAT "first_symbol $first_delimiter\n";
925    print STAT "last_symbol $last_contentword\n";
926    print STAT "first_word $first_stopword\n";
927    print STAT "last_word $last_contentword\n";
928    close STAT;
929
930    undef @vocab;
931
932
933    # Create numbers file
934    # Save text as symbol numbers
935    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
936   
937    open(TXT, "<$phinddir/clauses");
938    open(NUM, ">$phinddir/clauses.numbers");
939   
940##    $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
941##    print NUM "$symbol{lc($colstart)}\n"; # jrm21
942    $phrasedelimiter = $symbol{$senlimit};
943    print NUM "$symbol{$colstart}\n";
944   
945    # set up the special symbols that delimit documents and sentences
946    while(<TXT>) {
947   
948    # split sentence into a list of tokens
949    $line = $_;
950    next unless ($line =~ /./);
951    @words = split(/\s+/, $line);
952   
953    # output one token at a time
954    foreach $word (@words) {
955# don't lower-case special delimiters - jrm21
956        if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
957        $word = lc($word);
958        }
959        print NUM "$symbol{$word}\n";
960    }
961   
962    # output phrase delimiter
963    print NUM "$phrasedelimiter\n";
964    }
965   
966    close TXT;
967#    print NUM "$symbol{lc($colend)}\n";# jrm21
968    print NUM "$symbol{$colend}\n";
969    close NUM;
970
971    # Save thesaurus data in one convenient file
972    if ($thesaurus) {
973
974    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
975
976
977    print $out "Saving thesaurus as numbers in $thesaurusfile\n"
978        if ($verbosity > 1);
979
980    # Read the thesaurus terms
981    my ($num, $text, %thes_symbols);
982   
983    open(TH, "<$thesaurus_terms");
984    while(<TH>) {
985        chomp;
986        @words = split(/\s+/, $_);
987        $num = shift @words;
988        $text = "";
989
990        # translate words into symbol numbers
991        foreach $word (@words) {
992        $word = lc($word);
993        if ($symbol{$word}) {
994            $text .= "s$symbol{$word} ";
995        } elsif ($verbosity) {
996            print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
997        }
998        }
999        $text =~ s/ $//;
1000        $thes_symbols{$num} = $text;
1001    }
1002    close TH;
1003
1004    # Read the thesaurus links and write the corresponding data
1005    open(TH, "<$thesaurus_links");
1006    open(THOUT, ">$thesaurusfile");
1007
1008    while(<TH>) {
1009        chomp;
1010        ($num, $text) = split(/:/, $_);
1011
1012        if (defined($thes_symbols{$num})) {
1013        print THOUT "$num:$thes_symbols{$num}:$text\n";
1014        } else {
1015        print THOUT "$num:untranslated:$text\n";
1016        }
1017    }
1018    close TH;   
1019    close THOUT;
1020    }
1021
1022
1023
1024
1025}
1026
1027
1028# renumber_phrases
1029#
1030# Prepare the phrases file to be input to mgpp.  The biggest problem is
1031# reconciling the phrase identifiers used by the suffix program (which
1032# we'll call suffix-id numbers) with the numbers used in the thesaurus
1033# (thesaurus-id) to create a common set of phind id numbers (phind-id).
1034# Phind-id numbers must be sorted by frequency of occurrence.
1035#
1036# Start creating a set of phind-id numbers from the sorted suffix-id
1037# numbers and (if required) the thesaurus-id numbers.  Then add any other
1038# phrases occurring in the thesaurus.
1039#
1040# The last thing we have to do is restore the vocabulary information to the
1041# phrase file so that the phrases are stored as words, not as symbol
1042# numbers.
1043
1044# The original phrases file looks something like this:
1045#   159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1046#   159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1047#   159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1048#   159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1049
1050
1051sub renumber_phrases {
1052    my ($self) = @_;
1053
1054    renumber_suffix_data($self);
1055    renumber_thesaurus_data($self);
1056    restore_vocabulary_data($self);
1057
1058}
1059
1060
1061
1062# renumber_suffix_data
1063#
1064# Translate phrases file to phrases.2 using phind keys instead
1065# of suffix keys and sorting the expansion data.
1066
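# Each line of phrases.2 has the form (taken from the print statements below):
#   phind-id:symbols:frequency:expansion-count:document-count:exp,exp,...,:doc;doc;...;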
1067sub renumber_suffix_data {
1068    my ($self) = @_;
1069   
1070    my $verbosity = $self->{'verbosity'};
1071    my $out = $self->{'outhandle'};
1072    print $out "Translate phrases: suffix-ids become phind-ids\n"
1073    if ($verbosity);
1074   
1075    my $phinddir = $self->{'phinddir'};
1076    my $infile = &util::filename_cat($phinddir, 'phrases');
1077    my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1078
1079    # Read the phrase file.  Calculate initial set of phind-id
1080    # numbers and store (suffixid -> frequency) relation.
1081
1082    my %suffixtophind;
1083    my @totalfrequency;
1084    my (@fields, $suffixid);
1085    my $nextphind = 1;
1086   
1087    open(IN, "<$infile");
1088    while(<IN>) {
1089
1090    chomp;
1091    @fields = split(/:/, $_);
1092   
1093    # get next suffixid and phindid
1094    $suffixid = shift @fields;
1095    $suffixtophind{$suffixid} = $nextphind;
1096
1097    # store total frequency
1098    shift @fields;
1099    $totalfrequency[$nextphind] = shift @fields;
1100
1101    $nextphind++;
1102    }
1103    close IN;
1104
1105
1106    # Translate phrases file to phrases.2.  Use phind keys (not suffix
1107    # keys), sort expansion and document occurrence data in order of
1108    # descending frequency.
1109    open(IN, "<$infile");
1110    open(OUT, ">$outfile");
1111   
1112    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1113    my (@documwents, @newexp, $k, $n);
1114    my $linenumber = 0;
1115
1116    while(<IN>) {
1117   
1118    # read the line
1119    chomp;
1120    @fields = split(/:/, $_);
1121   
1122    # get a phrase number for this line
1123    $suffixid = shift @fields;
1124    die unless (defined($suffixtophind{$suffixid}));
1125    $phindid = $suffixtophind{$suffixid};
1126   
1127    # get the symbols in the phrase
1128    $text = shift @fields;
1129
1130    # output status information
1131    $linenumber++;
1132    if ($verbosity > 2) {
1133        if ($linenumber % 1000 == 0) {
1134        print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1135        }
1136        # what are $num and $key??
1137        #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1138    }
1139
1140    # get the phrase frequency
1141    $tf = shift @fields;
1142   
1143    # get the number of expansions
1144    $countexp = shift @fields;
1145   
1146    # get the expansions, convert them into phind-id numbers, and sort them
1147    $expansions = shift @fields;
1148    @newexp = ();
1149    foreach $k (split(/,/, $expansions)) {
1150        die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1151        $n = $suffixtophind{$k};
1152        push @newexp, $n;
1153    }
1154    @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1155
1156    # get the number of documents
1157    $countdocs = shift @fields;
1158   
1159    # get the documents and sort them
1160    $documents = shift @fields;
1161    $documents =~ s/d//g;
1162    my @documents = split(/;/, $documents);
1163    @documents = sort by_doc_frequency @documents;
1164
1165    # output the phrase data
1166    print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1167    print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1168   
1169    }
1170
1171    close IN;
1172    close OUT;
1173}
1174
1175
1176# renumber_thesaurus_data
1177#
1178# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1179
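# The translation happens in six passes (see the numbered comments below):
#   1. read the thesaurus numbers file (symbols -> thesaurus-id),
#   2. find thesaurus phrases that already have a phind-id,
#   3. allocate phind-ids for the remaining thesaurus entries,
#   4. rewrite the thesaurus links using phind-ids,
#   5. read the rewritten thesaurus back into memory, and
#   6. merge the thesaurus data into the phrases file.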
1180sub renumber_thesaurus_data {
1181    my ($self) = @_;
1182 
1183    my $out = $self->{'outhandle'};
1184    my $verbosity = $self->{'verbosity'};
1185    my $thesaurus = $self->{'thesaurus'};
1186
1187    my $phinddir = $self->{'phinddir'};
1188    my $infile = &util::filename_cat($phinddir, "phrases.2");
1189    my $outfile = &util::filename_cat($phinddir, "phrases.3");
1190
1191
1192    # If no thesaurus is defined, simply move the phrases file.
1193    if (!$thesaurus) {
1194    print $out "Translate phrases.2: no thesaurus data\n"
1195        if ($verbosity);
1196    &util::mv($infile, $outfile);
1197    return;
1198    }
1199
1200    print $out "Translate phrases.2: add thesaurus data\n"
1201    if ($verbosity);
1202
1203    # 1.
1204    # Read thesaurus file and store (symbols->thesaurusid) mapping
1205    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1206    my %symbolstothesid;
1207    my (@fields, $thesid, $symbols);
1208   
1209    open(TH, "<$thesaurusfile");
1210
1211    while (<TH>) {
1212   
1213    chomp;
1214    @fields = split(/:/, $_);
1215   
1216    # get id and text
1217    $thesid = shift @fields;
1218    $symbols = shift @fields;
1219    $symbolstothesid{$symbols} = $thesid;
1220    }
1221    close TH;   
1222
1223    # 2.
1224    # Read phrases file to find thesaurus entries that already
1225    # have a phindid.  Store their phind-ids for later translation,
1226    # and store their frequency for later sorting.
1227    my %thesaurustophindid;
1228    my %phindidtofrequency;
1229    my ($phindid, $freq);
1230
1231    open(IN, "<$infile");
1232   
1233    while(<IN>) {
1234   
1235    chomp;
1236    @fields = split(/:/, $_);
1237   
1238    # phindid and symbols for this line
1239    $phindid = shift @fields;
1240    $symbols = shift @fields;
1241    $freq = shift @fields;
1242
1243    # do we have a thesaurus id corresponding to this phrase?
1244    if (defined($symbolstothesid{$symbols})) {
1245        $thesid = $symbolstothesid{$symbols};
1246        $thesaurustophindid{$thesid} = $phindid;
1247        $phindidtofrequency{$phindid} = $freq;
1248    }
1249    }
1250    close IN;
1251 
1252    undef %symbolstothesid;
1253
1254    # 3.
1255    # Create phind-id numbers for remaining thesaurus entries,
1256    # and note that their frequency is 0 for later sorting.
1257    my $nextphindid = $phindid + 1;
1258
1259    open(TH, "<$thesaurusfile");
1260    while(<TH>) {
1261   
1262    chomp;
1263    @fields = split(/:/, $_);
1264   
1265    # read thesaurus-id and ensure it has a corresponding phind-id
1266    $thesid = shift @fields;
1267    if (!defined($thesaurustophindid{$thesid})) {
1268        $thesaurustophindid{$thesid} = $nextphindid;
1269        $phindidtofrequency{$nextphindid} = 0;
1270        $nextphindid++;
1271    }
1272    }
1273    close TH;
1274
1275    # 4.
1276    # Translate thesaurus file, replacing thesaurus-id numbers with
1277    # phind-id numbers.
1278    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1279    my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1280    my (@links, $linkid, %linkidtotype, $newrelation);
1281
1282    open(TH, "<$thesaurusfile");
1283    open(TO, ">$newthesaurusfile");
1284    while(<TH>) {
1285   
1286    chomp;
1287    @fields = split(/:/, $_);
1288   
1289    # phindid and symbols for this line
1290    ($thesid, $symbols, $relations) = @fields;
1291   
1292    die unless ($thesid && $symbols);
1293    die unless $thesaurustophindid{$thesid};
1294    $phindid = $thesaurustophindid{$thesid};
1295
1296    # convert each part of the relation string to use phind-id numbers
1297    # at the same time, we want to sort the list by frequency.
1298    undef %linkidtotype;
1299   
1300    foreach $linktext (split(/;/, $relations)) {
1301        @linkdata = split(/,/, $linktext);
1302       
1303        # remember the linktype (e.g. BT, NT)
1304        $linktype = shift @linkdata;
1305       
1306        # store the type of each link
1307        foreach $thesid (@linkdata) {
1308        die unless (defined($thesaurustophindid{$thesid}));
1309        $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1310        }
1311    }
1312
1313    # sort the list of links, first by frequency, then by type.
1314    @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1315                        or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1316    $linkcounter = (scalar @links);
1317
1318    # create a string describing the link information
1319    $linktype = $linkidtotype{$links[0]};
1320    $newrelation = $linktype;
1321    foreach $linkid (@links) {
1322        if ($linkidtotype{$linkid} ne $linktype) {
1323        $linktype = $linkidtotype{$linkid};
1324        $newrelation .= ";" . $linktype;
1325        }
1326        $newrelation .= "," . $linkid;
1327    }
1328    $newrelation .= ";";
1329   
1330
1331    # output the new line
1332    print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1333    }
1334    close TH;
1335    close TO;
1336
1337    undef %thesaurustophindid;
1338    undef %linkidtotype;
1339    undef %phindidtofrequency;
1340
1341    # 5.
1342    # Read thesaurus data (in phind-id format) into memory
1343    my %thesaurusdata;
1344
1345    open(TH, "<$newthesaurusfile");
1346    while(<TH>) {
1347    chomp;
1348    ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1349    die unless ($phindid && $symbols);
1350    $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1351    }
1352    close TH;
1353
1354    # 6.
1355    # Add thesaurus data to phrases file
1356    my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1357    my (@documwents, @newexp, $k, $n);
1358    my $linenumber = 0;
1359
1360    open(IN, "<$infile");
1361    open(OUT, ">$outfile");
1362
1363    # Update existing phrases
1364    while(<IN>) {
1365   
1366    chomp;
1367    @fields = split(/:/, $_);
1368       
1369    # get data for this line
1370    $phindid = shift @fields;
1371   
1372    # output the phrase data, with thesaurus information
1373    print OUT "$phindid:", join(":", @fields);
1374
1375    # add thesaurus data
1376    if (defined($thesaurusdata{$phindid})) {
1377        @fields = split(/:/, $thesaurusdata{$phindid});
1378        shift @fields;
1379        $linkcounter = shift @fields;
1380        $relations = shift @fields;
1381
1382        print OUT ":$linkcounter:$relations";
1383        $thesaurusdata{$phindid} = "";
1384    }
1385    print OUT "\n";
1386    }
1387    close IN;
1388
1389    # Add phrases that aren't already in the file
1390    foreach $phindid (sort numerically keys %thesaurusdata) {
1391    next unless ($thesaurusdata{$phindid});
1392
1393    @fields = split(/:/, $thesaurusdata{$phindid});
1394    $symbols = shift @fields;
1395    $linkcounter = shift @fields;
1396    $relations = shift @fields;
1397
1398    print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1399    }
1400    close OUT;
1401
1402}
1403
1404# restore_vocabulary_data
1405#
1406# Read phrases.3 and restore vocabulary information. Then write
1407# this data to the MGPP input files (pword.txt and pdata.txt) and
1408# (if requested) to the saved phrases file.
1409
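# Output line formats (taken from the print statements below):
#   pdata.txt - "<Document>phind-id:phrase text:remaining phrase fields:"
#   pword.txt - "<Document>word" (the word is empty for multi-word phrases)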
1410sub restore_vocabulary_data {
1411    my ($self) = @_;
1412 
1413    my $out = $self->{'outhandle'};
1414    my $verbosity = $self->{'verbosity'};
1415    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1416
1417    my $phinddir = $self->{'phinddir'};
1418    my $infile = &util::filename_cat($phinddir, 'phrases.3');
1419    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1420    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1421    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1422
1423    my $savephrases = $self->{'savephrases'};
1424
1425    # 1.
1426    # Read the vocabulary file
1427    open(V, "<$vocabfile")
1428    || die "Cannot open $vocabfile: $!";
1429    my @symbol;
1430    my $i = 1;
1431    while(<V>) {
1432    chomp;
1433    $symbol[$i++] = $_;
1434    }
1435    close V;
1436
1437    # 2.
1438    # Translate phrases.3 to MGPP input files
1439    my ($key, $text, $word, $isThesaurus, $line);
1440    my @fields;
1441    my $linenumber = 0;
1442
1443    open(IN, "<$infile");
1444    open(DATA, ">$datafile");
1445    open(WORD, ">$wordfile");
1446
1447    # Save the phrases in a separate text file
1448    if ($savephrases) {
1449    print $out "Saving phrases in $savephrases\n" if ($verbosity);
1450    open(SAVE, ">$savephrases");
1451    }
1452
1453    while(<IN>) {
1454   
1455    # read the line
1456    chomp;
1457    $line = $_;
1458    @fields = split(/:/, $line);
1459   
1460    # get a phrase number for this line
1461    $key = shift @fields;
1462   
1463    # restore the text of the phrase
1464    $text = shift @fields;
1465    $text =~ s/s(\d+)/$symbol[$1]/g;
1466    if ($text =~ / /) {
1467        $word = "";
1468    } elsif ($text ne 'untranslated') {
1469        $word = $text;
1470    }
1471
1472    # output the phrase data
1473    print DATA "<Document>";
1474    print DATA "$key:$text:", join(":", @fields), ":\n";
1475   
1476    # output the word index search data
1477    print WORD "<Document>$word\n";
1478
1479    # output the phrases to a text file
1480    if ($savephrases) {
1481        if ((scalar @fields) == 7) {
1482        $isThesaurus = 1;
1483        } else {
1484        $isThesaurus = 0;
1485        }
1486        print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1487    }
1488    }
1489    close IN;
1490    close WORD;
1491    close DATA;
1492    close SAVE if ($savephrases);
1493
1494}
1495
1496
1497
1498# sort routines used to renumber phrases
1499
1500sub numerically { $a <=> $b }
1501
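# Document entries look like "2240,2" (document id, occurrence count) once the
# leading "d" has been stripped; a missing count is treated as 1.  Sort by
# descending count.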
1502sub by_doc_frequency {
1503    my $fa = 1;
1504    if ($a =~ /,/) {
1505    $fa = $a;
1506    $fa =~ s/\d+,//;
1507    }
1508    my $fb = 1;
1509    if ($b =~ /,/) {
1510    $fb = $b;
1511    $fb =~ s/\d+,//;
1512    }
1513
1514    return ($fb <=> $fa);
1515}
1516
15171;